intel · chensuyue · Feb 28, 2024 · Feb 20, 2024 · Feb 22, 2024 · Feb 22, 2024
diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py
@@ -668,7 +668,8 @@ def tmp(_, inp, out):
                 gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale}
                 if not weight_config_this_layer["sym"]:
                     gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp
-                if weight_config_this_layer["act_order"]:  # save perm for restoring the weights
+                if weight_config_this_layer["act_order"] and not weight_config_this_layer["static_groups"]:
+                    # save perm for restoring the weights, but only when static_groups is not enabled.
                     gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[
                         layer_name
                     ].perm
@@ -828,6 +829,11 @@ def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=F
                             zero.append(self.quantizer.zero)
                     else:
                         idx = i1 + i
+                        if (i1 + i) % groupsize == 0:
+                            # at the first column of each group, record that group's pre-calculated scale and zero point
+                            static_quantizer = groups[(i1 + i) // groupsize]
+                            scale.append(static_quantizer.scale)
+                            zero.append(static_quantizer.zero)
                         if act_order:
                             idx = perm[idx]
                         self.quantizer = groups[idx // groupsize]