Release/2.2: add weight attr to IpexWoqLinear module (#2488)

* Release/2.2: add weight attr to IpexWoqLinear module for dtype/shape queries
* Let self.weight be a reference to the packed weight
* Add checks in UT
* Fix typo
intel · Jan 19, 2024 · 7a7ba23 · 7a7ba23
1 parent b5e67ea
commit 7a7ba23
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 2 deletions.
diff --git a/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py b/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py
@@ -17,15 +17,18 @@ class IpexWoqLinear(nn.Module):
     A weight-only quantized (WOQ) linear module with floating point tensor as inputs and outputs.
     Weight is dequantized at runtime for computation.
     """
-    # version used in this class is different from the parent class nnq.Linear
-    _version = 4
 
     def __init__(self, in_features, out_features, bias_=True, dtype=torch.qint8):
         super().__init__()
         self.in_features = in_features
         self.out_features = out_features
         self.bias = bias_
         self.dtype = dtype
+        # This weight attribute exists so callers can query dtype, shape, etc.
+        # It is a reference to the packed weight in self._op_context.
+        # Because of packing, its shape is not necessarily [out_features, in_features].
+        # Its dtype is torch.int8 for INT8 and torch.uint8 for INT4.
+        self.weight = None
         self._op_context = None
         self._lowp_mode = 0
         self._num_concats = 1
@@ -199,6 +202,7 @@ def from_float_and_int4_weight(
             num_concats,
             act_quant_mode,
         )
+        qlinear.weight = qlinear._op_context.get_weight()
         qlinear._lowp_mode = lowp_mode
         qlinear._num_concats = num_concats
         qlinear._act_quant_mode = act_quant_mode
@@ -236,6 +240,7 @@ def _init_cls(
             num_concats,
             act_quant_mode,
         )
+        qlinear.weight = qlinear._op_context.get_weight()
         qlinear._lowp_mode = lowp_mode
         qlinear._num_concats = num_concats
         qlinear._act_quant_mode = act_quant_mode
@@ -298,6 +303,7 @@ def _init_cls(
             num_concats,
             act_quant_mode,
         )
+        qlinear.weight = qlinear._op_context.get_weight()
 
         return qlinear
 

diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py
@@ -811,6 +811,10 @@ def test(feature, has_bias):
                     ipex.nn.modules.weight_only_quantization.IpexWoqLinear
                 )
                 assert isinstance(woq_model.linear, woq_linear_class)
+                assert (
+                    woq_model.linear.weight is not None
+                    and woq_model.linear.weight.dtype == torch.int8
+                )
 
                 output2 = woq_model(data)
                 torch.testing.assert_close(output1, output2)
@@ -1046,6 +1050,10 @@ def test(feature, has_bias):
                     ipex.nn.modules.weight_only_quantization.IpexWoqLinear
                 )
                 assert isinstance(woq_model.linear, woq_linear_class)
+                assert (
+                    woq_model.linear.weight is not None
+                    and woq_model.linear.weight.dtype == torch.uint8
+                )
 
                 output2 = woq_model(data)
                 torch.testing.assert_close(output1, output2)