Commit
Release/2.1: Port a few commits about WOQ and SmoothQuant from cpu-device (#2275)

* WOQ: blockwise quantization of activation (#2136)

* WOQ: Add blockwise quantization of activation

* Update docstring for woq qconfig

* Modify API name: M->BATCH, K->IC

* Improve doc string for get_weight_only_quant_qconfig_mapping

* Fix concat-linear bug and improve lowp_mode docstring

* Use PER_IC_BLOCK by default
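
Note: per-IC-block quantization of the activation splits each row of the input along the input-channel (IC) dimension into fixed-size blocks and gives every block its own scale. The snippet below is a minimal PyTorch sketch of that idea only; the function name, block size, and int8 range are illustrative assumptions, not the extension's actual kernel or API.

```python
import torch

def quantize_act_per_ic_block(x: torch.Tensor, block_size: int = 64):
    """Illustrative per-IC-block dynamic quantization of an activation.

    x: [BATCH, IC] float tensor. Each row is split into IC/block_size blocks,
    and every block gets its own symmetric int8 scale.
    """
    batch, ic = x.shape
    assert ic % block_size == 0, "IC must be divisible by the block size in this sketch"
    blocks = x.reshape(batch, ic // block_size, block_size)
    # One scale per (row, block): symmetric quantization to int8
    scales = blocks.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    q = torch.clamp(torch.round(blocks / scales), -128, 127).to(torch.int8)
    return q.reshape(batch, ic), scales.squeeze(-1)

x = torch.randn(4, 256)
q, scales = quantize_act_per_ic_block(x)
print(q.shape, scales.shape)  # torch.Size([4, 256]) torch.Size([4, 4])
```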

* WOQ: fix bf16 correctness bug when lowp_mode=NONE (#2166)

* WOQ: fix bf16 correctness bug when lowp_mode=NONE

* improve ut

* fix clang-format issue
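
Note: the lowp_mode=NONE bf16 fix concerns the non-optimized reference path. Judging from the fallback branches in the WoqLinearKrnl.cpp hunks further down, that path dequantizes the weight and runs the linear in fp32 before casting back to the activation's dtype. A rough PyTorch sketch of that computation order (tensor names and shapes are illustrative assumptions):

```python
import torch

def woq_linear_reference(x_bf16, w_dequant, bias=None):
    # Compute in fp32 and cast the result back, mirroring the fallback
    # path in WoqLinearKrnl.cpp (dequantize -> fp32 linear -> cast back).
    x = x_bf16.to(torch.float32)
    w = w_dequant.to(torch.float32)
    out = torch.nn.functional.linear(x, w)
    if bias is not None:
        out = out + bias.to(torch.float32)
    return out.to(x_bf16.dtype)

x = torch.randn(8, 64, dtype=torch.bfloat16)
w = torch.randn(32, 64)          # stand-in for a dequantized weight
b = torch.randn(32)
y = woq_linear_reference(x, w, b)
print(y.dtype, y.shape)          # torch.bfloat16 torch.Size([8, 32])
```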

* Woq blockwise quant of weight (#2238)

* Add block-wise quantization of weight

* Use self-defined quantize/dequantize functions instead of those from PyTorch

* Remove the old fallback and use the new one; Patch N and K when necessary

* Fix bug for concat linear

* Fix bugs about uncompressing int4 zero points and concat linear

* Fix bug about fallback path

* Fix clang-format issue

* Update llm example script and readme

* Fix deepspeed UT

* Block_k can be less than group_size; Update group_size docstring
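
Note: block-wise (group-wise) quantization of the weight follows the same pattern along the weight's IC dimension: each output channel is split into groups of group_size, and every group gets its own scale and zero point. A minimal sketch of asymmetric per-group quantization (names and ranges are illustrative; the real kernel packs int4 data and may use a smaller compute block, block_k, than the quantization group):

```python
import torch

def quantize_weight_per_group(w: torch.Tensor, group_size: int = 128, n_bits: int = 4):
    """Illustrative group-wise asymmetric quantization of a weight.

    w: [OC, IC] float tensor. Each output channel is split into IC/group_size
    groups; every group gets its own scale and zero point.
    """
    oc, ic = w.shape
    assert ic % group_size == 0
    qmax = 2 ** n_bits - 1                     # e.g. 15 for int4
    groups = w.reshape(oc, ic // group_size, group_size)
    w_min = groups.amin(dim=-1, keepdim=True)
    w_max = groups.amax(dim=-1, keepdim=True)
    scales = (w_max - w_min).clamp(min=1e-8) / qmax
    zps = torch.round(-w_min / scales)
    q = torch.clamp(torch.round(groups / scales) + zps, 0, qmax).to(torch.uint8)
    return q.reshape(oc, ic), scales.squeeze(-1), zps.squeeze(-1)

q, scales, zps = quantize_weight_per_group(torch.randn(16, 256))
print(q.shape, scales.shape)  # torch.Size([16, 256]) torch.Size([16, 2])
```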

* WOQ Bug fix: fuse linear-gelu instead of linear-new_gelu (#2265)

* WOQ Bug fix: fuse linear-gelu instead of linear-new_gelu

* Use _convert_woq instead of PyTorch's prepare/convert for woq in optimize_transformers

* Fix int8 concat linear accuracy issue by adding scales/zero points to IpexWoqLinear.from_float

* fix flake8 issue

* Move _convert_woq to quantization.convert

* Revert changes to quantization.convert to fix deepspeed UT failure
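
Note: the linear-gelu fix matters because "gelu" and "new_gelu" are different activations: the former is the exact erf-based GELU, the latter the tanh approximation used by some transformer implementations, so fusing the wrong one silently changes model output. For reference (standard formulas, not code from this commit):

```python
import math
import torch

def gelu_exact(x):
    # Exact GELU: x * Phi(x), with the Gaussian CDF written via erf
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
    # "new_gelu": the tanh approximation (GPT-style NewGELUActivation)
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))

x = torch.linspace(-3, 3, 7)
print((gelu_exact(x) - gelu_tanh(x)).abs().max())  # small but nonzero difference
```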

* SmoothQuant: make share_weight_observer configurable for layers like QKV (#2106)

* SmoothQuant: make share_weight_observer configurable for layers like QKV

* Add share_weight_observers to qconf_summary
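
Note: SmoothQuant migrates quantization difficulty from activations to weights by rescaling per input channel; for layers like fused QKV projections, share_weight_observers controls whether the concatenated weights share one observer (a single common scale per input channel) or each get their own. A hedged sketch of the per-channel smoothing itself (alpha and names follow the generic SmoothQuant convention, not this repo's API):

```python
import torch

def smoothquant_scales(act_absmax: torch.Tensor, w: torch.Tensor, alpha: float = 0.5):
    """Per-input-channel smoothing scales: s_j = max|X_j|^alpha / max|W_j|^(1-alpha)."""
    w_absmax = w.abs().amax(dim=0)                      # per input channel, over output channels
    s = act_absmax.pow(alpha) / w_absmax.pow(1.0 - alpha).clamp(min=1e-8)
    return s.clamp(min=1e-8)

ic = 64
act_absmax = torch.rand(ic) * 10                        # calibrated per-channel activation max
w_q = torch.randn(128, ic)                              # e.g. the Q projection of a QKV block
s = smoothquant_scales(act_absmax, w_q)
# The activation is divided by s and the weight multiplied by s, so X @ W.T is unchanged:
x = torch.randn(4, ic)
y_ref = x @ w_q.t()
y_smoothed = (x / s) @ (w_q * s).t()
print(torch.allclose(y_ref, y_smoothed, atol=1e-4))     # True
```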

* SmoothQuant: Do not insert mul if user cancels quantization of linear by qconf (#2254)

* SmoothQuant: Do not insert mul if user cancels quantization of linear by qconf

* Add UT

* Fix flake8 issue

* Bug fix: SmoothQuant custom sub-observers are shared by mistake (#2219)

* Bug fix: SmoothQuant custom sub-observers are shared by mistake

* Fix flake8 issue

* Update doc string for the change

* Bug fix: fail to detect SmoothQuant observer has run (#2269)

* Add an int8 linear op for Bitsandbytes (#2266)

* Add an int8 linear op for Bitsandbytes

* Rename the op to matmul_i8i8i32 since it's not specific for bnb

* Add meaningful error messages for TORCH_CHECK

* Fix clang-format issue

* Run flake8
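
Note: the new op, renamed matmul_i8i8i32, multiplies two int8 matrices and accumulates into int32 — the primitive int8 GEMM that libraries such as bitsandbytes build on. A reference emulation of those semantics (the actual registered kernel and its signature live in C++; this sketch only mirrors what such an op computes):

```python
import torch

def matmul_i8i8i32_reference(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Reference semantics: int8 x int8 matmul with int32 accumulation."""
    assert a.dtype == torch.int8 and b.dtype == torch.int8
    # Accumulate in int64 on the PyTorch side, then narrow to int32,
    # which matches an int32 accumulator as long as the sums fit.
    return (a.to(torch.int64) @ b.to(torch.int64)).to(torch.int32)

a = torch.randint(-128, 128, (4, 64), dtype=torch.int8)
b = torch.randint(-128, 128, (64, 8), dtype=torch.int8)
c = matmul_i8i8i32_reference(a, b)
print(c.dtype, c.shape)  # torch.int32 torch.Size([4, 8])
```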
Xia-Weiwen authored Nov 17, 2023
1 parent 11484c3 commit ed5957e
Showing 30 changed files with 2,774 additions and 1,071 deletions.
419 changes: 167 additions & 252 deletions csrc/cpu/aten/Linear.cpp

Large diffs are not rendered by default.

57 changes: 18 additions & 39 deletions csrc/cpu/aten/Linear.h
@@ -81,45 +81,27 @@ at::Tensor ipex_linear_eltwise(
// WOQ linear ops
at::Tensor woq_linear_pack_weight(
const at::Tensor& weight,
const at::Tensor& scale,
const at::Tensor& zero_points,
std::vector<int64_t>& weight_shape,
bool is_4bit,
int64_t group_size,
int64_t lowp_mode);

at::Tensor woq_linear_unpack_weight(
const at::Tensor& weight,
bool is_int4,
int64_t lowp_mode);

void woq_linear_kernel_output(
const at::Tensor& self,
const at::Tensor& weight,
const at::Tensor& scales_float,
const at::Tensor& zero_points_float,
const at::Tensor& bias,
int64_t lowp_mode,
at::Tensor& output);

at::Tensor woq_linear_kernel(
const at::Tensor& self,
const at::Tensor& weight,
const std::vector<at::Tensor>& scales_list,
const std::vector<at::Tensor>& zps_list,
const std::vector<at::Tensor>& bias_list,
bool is_int4,
int64_t group_size,
int64_t lowp_mode,
int64_t num_concats);

void woq_linear_eltwise_kernel_output(
const at::Tensor& self,
const at::Tensor& weight,
const at::Tensor& scales_float,
const at::Tensor& zero_points_float,
const at::Tensor& bias,
const c10::string_view& post_op,
const torch::List<c10::optional<at::Scalar>>& scalars,
const c10::optional<c10::string_view>& algorithm,
int64_t lowp_mode,
at::Tensor& output);
int64_t num_concats,
int64_t act_quant_mode);

at::Tensor woq_linear_eltwise_kernel(
const at::Tensor& self,
@@ -131,20 +113,10 @@ at::Tensor woq_linear_eltwise_kernel(
const torch::List<c10::optional<at::Scalar>>& scalars,
const c10::optional<c10::string_view>& algorithm,
bool is_int4,
int64_t lowp_mode,
int64_t num_concats);

at::Tensor woq_linear_add_kernel(
const at::Tensor& self,
const at::Tensor& weight,
const std::vector<at::Tensor>& scales_list,
const std::vector<at::Tensor>& zps_list,
const std::vector<at::Tensor>& bias_list,
bool is_int4,
int64_t group_size,
int64_t lowp_mode,
int64_t num_concats,
at::Tensor& accumu,
const c10::optional<at::Scalar>& alpha);
int64_t act_quant_mode);

at::Tensor woq_linear_add_kernel(
const at::Tensor& self,
@@ -153,9 +125,11 @@ at::Tensor woq_linear_add_kernel(
const std::vector<at::Tensor>& zps_list,
const std::vector<at::Tensor>& bias_list,
bool is_int4,
int64_t group_size,
int64_t lowp_mode,
int64_t num_concats,
const std::vector<at::Tensor>& others);
const std::vector<at::Tensor>& others,
int64_t act_quant_mode);

at::Tensor woq_linear_add_add_kernel(
const at::Tensor& self,
@@ -164,9 +138,11 @@ at::Tensor woq_linear_add_add_kernel(
const std::vector<at::Tensor>& zps_list,
const std::vector<at::Tensor>& bias_list,
bool is_int4,
int64_t group_size,
int64_t lowp_mode,
int64_t num_concats,
const std::vector<at::Tensor>& others);
const std::vector<at::Tensor>& others,
int64_t act_quant_mode);

namespace {
void woq_gemm_kernel_impl(
@@ -240,7 +216,10 @@ using woq_tpp_gemm_kernel_fn = at::Tensor (*)(
int64_t,
int64_t,
int64_t,
const std::vector<at::Tensor>&);
const std::vector<at::Tensor>&,
int64_t,
int64_t,
int64_t);

using woq_tpp_gemm_packB_fn =
at::Tensor (*)(const at::Tensor&, bool, size_t, size_t, int64_t);
21 changes: 20 additions & 1 deletion csrc/cpu/aten/kernels/WoqLinearKrnl.cpp
@@ -2658,6 +2658,16 @@ void woq_gemm_kernel_impl(
zero_points_float_ptr);
}
}
} else {
auto qw = woq_linear_unpackB_impl(weight);
auto w = qw.dequantize().to(self_.scalar_type()).to(c10::kFloat);
auto x = self.to(c10::ScalarType::Float);
auto out = at::linear(x, w);
if (bias.defined()) {
auto b = bias.to(self_.scalar_type()).to(c10::kFloat);
out = at::add(out, b);
}
output = out.to(self.scalar_type());
}
} else { // kPerChannelAffineFloatQParams

@@ -2805,7 +2815,7 @@ void woq_gemm_kernel_impl(
} else {
at::linear_out(output, self, w);
}
} else {
} else if (self_.scalar_type() == at::kBFloat16) {
auto w = weight.dequantize();
auto x = self.to(c10::ScalarType::Float);
// This is to align with the AVX512 kernel
@@ -2818,6 +2828,15 @@ void woq_gemm_kernel_impl(
out = at::add(out, bias);
}
output = out.to(self.scalar_type());
} else {
auto w = weight.dequantize().to(self_.scalar_type()).to(c10::kFloat);
auto x = self.to(c10::ScalarType::Float);
auto out = at::linear(x, w);
if (bias.defined()) {
auto b = bias.to(self_.scalar_type()).to(c10::kFloat);
out = at::add(out, b);
}
output = out.to(self.scalar_type());
}

#endif
