Commit
Release/2.1: Port a few commits about WOQ and SmoothQuant from cpu-device (#2275)

* WOQ: blockwise quantization of activation (#2136)

* WOQ: Add blockwise quantization of activation

* Update docstring for woq qconfig

* Modify API name: M->BATCH, K->IC

* Improve doc string for get_weight_only_quant_qconfig_mapping

* Fix concat-linear bug and improve lowp_mode docstring

* Use PER_IC_BLOCK by default
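
Note: per-IC-block quantization of the activation splits each row of the input along the input-channel (IC) dimension into fixed-size blocks and gives every block its own scale. The snippet below is a minimal PyTorch sketch of that idea only; the function name, block size, and int8 range are illustrative assumptions, not the extension's actual kernel or API.

```python
import torch

def quantize_act_per_ic_block(x: torch.Tensor, block_size: int = 64):
    """Illustrative per-IC-block dynamic quantization of an activation.

    x: [BATCH, IC] float tensor. Each row is split into IC/block_size blocks,
    and every block gets its own symmetric int8 scale.
    """
    batch, ic = x.shape
    assert ic % block_size == 0, "IC must be divisible by the block size in this sketch"
    blocks = x.reshape(batch, ic // block_size, block_size)
    # One scale per (row, block): symmetric quantization to int8
    scales = blocks.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    q = torch.clamp(torch.round(blocks / scales), -128, 127).to(torch.int8)
    return q.reshape(batch, ic), scales.squeeze(-1)

x = torch.randn(4, 256)
q, scales = quantize_act_per_ic_block(x)
print(q.shape, scales.shape)  # torch.Size([4, 256]) torch.Size([4, 4])
```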

* WOQ: fix bf16 correctness bug when lowp_mode=NONE (#2166)

* WOQ: fix bf16 correctness bug when lowp_mode=NONE

* improve ut

* fix clang-format issue
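
Note: the lowp_mode=NONE bf16 fix concerns the non-optimized reference path. Judging from the fallback branches in the WoqLinearKrnl.cpp hunks further down, that path dequantizes the weight and runs the linear in fp32 before casting back to the activation's dtype. A rough PyTorch sketch of that computation order (tensor names and shapes are illustrative assumptions):

```python
import torch

def woq_linear_reference(x_bf16, w_dequant, bias=None):
    # Compute in fp32 and cast the result back, mirroring the fallback
    # path in WoqLinearKrnl.cpp (dequantize -> fp32 linear -> cast back).
    x = x_bf16.to(torch.float32)
    w = w_dequant.to(torch.float32)
    out = torch.nn.functional.linear(x, w)
    if bias is not None:
        out = out + bias.to(torch.float32)
    return out.to(x_bf16.dtype)

x = torch.randn(8, 64, dtype=torch.bfloat16)
w = torch.randn(32, 64)          # stand-in for a dequantized weight
b = torch.randn(32)
y = woq_linear_reference(x, w, b)
print(y.dtype, y.shape)          # torch.bfloat16 torch.Size([8, 32])
```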

* Woq blockwise quant of weight (#2238)

* Add block-wise quantization of weight

* Use self-defined quantize/dequantize functions instead of those from PyTorch

* Remove the old fallback and use the new one; Patch N and K when necessary

* Fix bug for concat linear

* Fix bugs about uncompressing int4 zero points and concat linear

* Fix bug about fallback path

* Fix clang-format issue

* Update llm example script and readme

* Fix deepspeed UT

* Block_k can be less than group_size; Update group_size docstring
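
Note: block-wise (group-wise) quantization of the weight follows the same pattern along the weight's IC dimension: each output channel is split into groups of group_size, and every group gets its own scale and zero point. A minimal sketch of asymmetric per-group quantization (names and ranges are illustrative; the real kernel packs int4 data and may use a smaller compute block, block_k, than the quantization group):

```python
import torch

def quantize_weight_per_group(w: torch.Tensor, group_size: int = 128, n_bits: int = 4):
    """Illustrative group-wise asymmetric quantization of a weight.

    w: [OC, IC] float tensor. Each output channel is split into IC/group_size
    groups; every group gets its own scale and zero point.
    """
    oc, ic = w.shape
    assert ic % group_size == 0
    qmax = 2 ** n_bits - 1                     # e.g. 15 for int4
    groups = w.reshape(oc, ic // group_size, group_size)
    w_min = groups.amin(dim=-1, keepdim=True)
    w_max = groups.amax(dim=-1, keepdim=True)
    scales = (w_max - w_min).clamp(min=1e-8) / qmax
    zps = torch.round(-w_min / scales)
    q = torch.clamp(torch.round(groups / scales) + zps, 0, qmax).to(torch.uint8)
    return q.reshape(oc, ic), scales.squeeze(-1), zps.squeeze(-1)

q, scales, zps = quantize_weight_per_group(torch.randn(16, 256))
print(q.shape, scales.shape)  # torch.Size([16, 256]) torch.Size([16, 2])
```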

* WOQ Bug fix: fuse linear-gelu instead of linear-new_gelu (#2265)

* WOQ Bug fix: fuse linear-gelu instead of linear-new_gelu

* Use _convert_woq instead of PyTorch's prepare/convert for woq in optimize_transformers

* Fix int8 concat linear accuracy issue by adding scales/zero points to IpexWoqLinear.from_float

* fix flake8 issue

* Move _convert_woq to quantization.convert

* Revert changes to quantization.convert to fix deepspeed UT failure
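
Note: the linear-gelu fix matters because "gelu" and "new_gelu" are different activations: the former is the exact erf-based GELU, the latter the tanh approximation used by some transformer implementations, so fusing the wrong one silently changes model output. For reference (standard formulas, not code from this commit):

```python
import math
import torch

def gelu_exact(x):
    # Exact GELU: x * Phi(x), with the Gaussian CDF written via erf
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
    # "new_gelu": the tanh approximation (GPT-style NewGELUActivation)
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))

x = torch.linspace(-3, 3, 7)
print((gelu_exact(x) - gelu_tanh(x)).abs().max())  # small but nonzero difference
```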

* SmoothQuant: make share_weight_observer configurable for layers like QKV (#2106)

* SmoothQuant: make share_weight_observer configurable for layers like QKV

* Add share_weight_observers to qconf_summary
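
Note: SmoothQuant migrates quantization difficulty from activations to weights by rescaling per input channel; for layers like fused QKV projections, share_weight_observers controls whether the concatenated weights share one observer (a single common scale per input channel) or each get their own. A hedged sketch of the per-channel smoothing itself (alpha and names follow the generic SmoothQuant convention, not this repo's API):

```python
import torch

def smoothquant_scales(act_absmax: torch.Tensor, w: torch.Tensor, alpha: float = 0.5):
    """Per-input-channel smoothing scales: s_j = max|X_j|^alpha / max|W_j|^(1-alpha)."""
    w_absmax = w.abs().amax(dim=0)                      # per input channel, over output channels
    s = act_absmax.pow(alpha) / w_absmax.pow(1.0 - alpha).clamp(min=1e-8)
    return s.clamp(min=1e-8)

ic = 64
act_absmax = torch.rand(ic) * 10                        # calibrated per-channel activation max
w_q = torch.randn(128, ic)                              # e.g. the Q projection of a QKV block
s = smoothquant_scales(act_absmax, w_q)
# The activation is divided by s and the weight multiplied by s, so X @ W.T is unchanged:
x = torch.randn(4, ic)
y_ref = x @ w_q.t()
y_smoothed = (x / s) @ (w_q * s).t()
print(torch.allclose(y_ref, y_smoothed, atol=1e-4))     # True
```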

* SmoothQuant: Do not insert mul if user cancels quantization of linear by qconf (#2254)

* SmoothQuant: Do not insert mul if user cancels quantization of linear by qconf

* Add UT

* Fix flake8 issue

* Bug fix: SmoothQuant custom sub-observers are shared by mistake (#2219)

* Bug fix: SmoothQuant custom sub-observers are shared by mistake

* Fix flake8 issue

* Update doc string for the change

* Bug fix: fail to detect SmoothQuant observer has run (#2269)

* Add an int8 linear op for Bitsandbytes (#2266)

* Add an int8 linear op for Bitsandbytes

* Rename the op to matmul_i8i8i32 since it's not specific for bnb

* Add meaningful error messages for TORCH_CHECK

* Fix clang-format issue

* Run flake8
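
Note: the new op, renamed matmul_i8i8i32, multiplies two int8 matrices and accumulates into int32 — the primitive int8 GEMM that libraries such as bitsandbytes build on. A reference emulation of those semantics (the actual registered kernel and its signature live in C++; this sketch only mirrors what such an op computes):

```python
import torch

def matmul_i8i8i32_reference(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Reference semantics: int8 x int8 matmul with int32 accumulation."""
    assert a.dtype == torch.int8 and b.dtype == torch.int8
    # Accumulate in int64 on the PyTorch side, then narrow to int32,
    # which matches an int32 accumulator as long as the sums fit.
    return (a.to(torch.int64) @ b.to(torch.int64)).to(torch.int32)

a = torch.randint(-128, 128, (4, 64), dtype=torch.int8)
b = torch.randint(-128, 128, (64, 8), dtype=torch.int8)
c = matmul_i8i8i32_reference(a, b)
print(c.dtype, c.shape)  # torch.int32 torch.Size([4, 8])
```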
Xia-Weiwen authored Nov 17, 2023
1 parent 11484c3 commit ed5957e
Showing 30 changed files with 2,774 additions and 1,071 deletions.
419 changes: 167 additions & 252 deletions csrc/cpu/aten/Linear.cpp

Large diffs are not rendered by default.

57 changes: 18 additions & 39 deletions csrc/cpu/aten/Linear.h
@@ -81,45 +81,27 @@ at::Tensor ipex_linear_eltwise(
// WOQ linear ops
at::Tensor woq_linear_pack_weight(
const at::Tensor& weight,
const at::Tensor& scale,
const at::Tensor& zero_points,
std::vector<int64_t>& weight_shape,
bool is_4bit,
int64_t group_size,
int64_t lowp_mode);

at::Tensor woq_linear_unpack_weight(
const at::Tensor& weight,
bool is_int4,
int64_t lowp_mode);

void woq_linear_kernel_output(
const at::Tensor& self,
const at::Tensor& weight,
const at::Tensor& scales_float,
const at::Tensor& zero_points_float,
const at::Tensor& bias,
int64_t lowp_mode,
at::Tensor& output);

at::Tensor woq_linear_kernel(
const at::Tensor& self,
const at::Tensor& weight,
const std::vector<at::Tensor>& scales_list,
const std::vector<at::Tensor>& zps_list,
const std::vector<at::Tensor>& bias_list,
bool is_int4,
int64_t group_size,
int64_t lowp_mode,
int64_t num_concats);

void woq_linear_eltwise_kernel_output(
const at::Tensor& self,
const at::Tensor& weight,
const at::Tensor& scales_float,
const at::Tensor& zero_points_float,
const at::Tensor& bias,
const c10::string_view& post_op,
const torch::List<c10::optional<at::Scalar>>& scalars,
const c10::optional<c10::string_view>& algorithm,
int64_t lowp_mode,
at::Tensor& output);
int64_t num_concats,
int64_t act_quant_mode);

at::Tensor woq_linear_eltwise_kernel(
const at::Tensor& self,
@@ -131,20 +113,10 @@ at::Tensor woq_linear_eltwise_kernel(
const torch::List<c10::optional<at::Scalar>>& scalars,
const c10::optional<c10::string_view>& algorithm,
bool is_int4,
int64_t lowp_mode,
int64_t num_concats);

at::Tensor woq_linear_add_kernel(
const at::Tensor& self,
const at::Tensor& weight,
const std::vector<at::Tensor>& scales_list,
const std::vector<at::Tensor>& zps_list,
const std::vector<at::Tensor>& bias_list,
bool is_int4,
int64_t group_size,
int64_t lowp_mode,
int64_t num_concats,
at::Tensor& accumu,
const c10::optional<at::Scalar>& alpha);
int64_t act_quant_mode);

at::Tensor woq_linear_add_kernel(
const at::Tensor& self,
@@ -153,9 +125,11 @@ at::Tensor woq_linear_add_kernel(
const std::vector<at::Tensor>& zps_list,
const std::vector<at::Tensor>& bias_list,
bool is_int4,
int64_t group_size,
int64_t lowp_mode,
int64_t num_concats,
const std::vector<at::Tensor>& others);
const std::vector<at::Tensor>& others,
int64_t act_quant_mode);

at::Tensor woq_linear_add_add_kernel(
const at::Tensor& self,
@@ -164,9 +138,11 @@ at::Tensor woq_linear_add_add_kernel(
const std::vector<at::Tensor>& zps_list,
const std::vector<at::Tensor>& bias_list,
bool is_int4,
int64_t group_size,
int64_t lowp_mode,
int64_t num_concats,
const std::vector<at::Tensor>& others);
const std::vector<at::Tensor>& others,
int64_t act_quant_mode);

namespace {
void woq_gemm_kernel_impl(
@@ -240,7 +216,10 @@ using woq_tpp_gemm_kernel_fn = at::Tensor (*)(
int64_t,
int64_t,
int64_t,
const std::vector<at::Tensor>&);
const std::vector<at::Tensor>&,
int64_t,
int64_t,
int64_t);

using woq_tpp_gemm_packB_fn =
at::Tensor (*)(const at::Tensor&, bool, size_t, size_t, int64_t);
21 changes: 20 additions & 1 deletion csrc/cpu/aten/kernels/WoqLinearKrnl.cpp
@@ -2658,6 +2658,16 @@ void woq_gemm_kernel_impl(
zero_points_float_ptr);
}
}
} else {
auto qw = woq_linear_unpackB_impl(weight);
auto w = qw.dequantize().to(self_.scalar_type()).to(c10::kFloat);
auto x = self.to(c10::ScalarType::Float);
auto out = at::linear(x, w);
if (bias.defined()) {
auto b = bias.to(self_.scalar_type()).to(c10::kFloat);
out = at::add(out, b);
}
output = out.to(self.scalar_type());
}
} else { // kPerChannelAffineFloatQParams

@@ -2805,7 +2815,7 @@ void woq_gemm_kernel_impl(
} else {
at::linear_out(output, self, w);
}
} else {
} else if (self_.scalar_type() == at::kBFloat16) {
auto w = weight.dequantize();
auto x = self.to(c10::ScalarType::Float);
// This is to align with the AVX512 kernel
@@ -2818,6 +2828,15 @@ void woq_gemm_kernel_impl(
out = at::add(out, bias);
}
output = out.to(self.scalar_type());
} else {
auto w = weight.dequantize().to(self_.scalar_type()).to(c10::kFloat);
auto x = self.to(c10::ScalarType::Float);
auto out = at::linear(x, w);
if (bias.defined()) {
auto b = bias.to(self_.scalar_type()).to(c10::kFloat);
out = at::add(out, b);
}
output = out.to(self.scalar_type());
}

#endif
