support peft model quantization with SmoothQuant (#1282)
A PEFT model nests Linear modules inside a Linear module, as in the architecture below; a minimal sketch of how such a module is produced with peft follows the snippet. This pull request adds SmoothQuant support for that architecture.
```
(v): Linear(                                                                                                                          
  in_features=32, out_features=32, bias=False                                                                                         
  (lora_dropout): ModuleDict(                                                                                                         
    (default): Dropout(p=0.1, inplace=False)                                                                                          
  )                                                                                                                                   
  (lora_A): ModuleDict(                                                                                                               
    (default): Linear(in_features=32, out_features=8, bias=False)                                                                     
  )                                                                                                                                   
  (lora_B): ModuleDict(                                                                                                               
    (default): Linear(in_features=8, out_features=32, bias=False)                                                                     
  )                                                                                                                                   
  (lora_embedding_A): ParameterDict()                                                                                                 
  (lora_embedding_B): ParameterDict()  
```
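For reference, a minimal sketch (not part of this PR) of how such a nested-Linear structure arises with the peft library; the module name `v` and the LoRA hyperparameters are illustrative only, and the exact printed layout depends on the peft version:
```python
import torch
from peft import LoraConfig, get_peft_model


class TinyAttention(torch.nn.Module):
    """Toy module with a single value projection, standing in for a transformer block."""

    def __init__(self):
        super().__init__()
        self.v = torch.nn.Linear(32, 32, bias=False)

    def forward(self, x):
        return self.v(x)


# Target the `v` projection with a rank-8 LoRA adapter.
config = LoraConfig(r=8, lora_alpha=16, target_modules=["v"], lora_dropout=0.1)
peft_model = get_peft_model(TinyAttention(), config)

# `v` is now a LoRA Linear that holds lora_A/lora_B Linears inside it,
# i.e. the "Linears in Linear" structure the SmoothQuant code has to traverse.
print(peft_model)
```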
Note that with IPEX version <= 1.13, HistogramObserver does not support the asym scheme (the zero_point stays 0 for asym uint8), while MinMaxObserver works as expected.
Also, the IPEX SmoothQuant observer can only go through save_qconf_summary/load_qconf_summary once: save_qconf_summary freezes the scales used in the model so further calibration has no effect, and load_qconf_summary overwrites the scales in the model but only takes effect on the first call. Here we fall back to normal observers to work around this issue.
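As a rough sketch of the resulting flow (condensed from the qdq_quantize changes below; `fp32_model`, `static_qconfig`, `example_inputs`, `calib_func`, and `ipex_config_path` stand in for objects the adaptor already holds):
```python
import intel_extension_for_pytorch as ipex

# Prepare with the SmoothQuant qconfig mapping (MinMax-initialized observers).
prepared = ipex.quantization.prepare(
    fp32_model, static_qconfig, example_inputs=example_inputs, inplace=True
)

# 1. Dump the op-level qconfig (with smooth_quant=True) into ipex_config.json.
# 2. Patch the JSON so SmoothQuant-absorbed Linears carry INC's scales/alpha with
#    plain observers instead of IPEX's SQObserver.
# 3. Load the summary exactly once; a second load would not take effect.
prepared.load_qconf_summary(qconf_summary=ipex_config_path)

# 4. Run a short calibration pass so the remaining ops (add, bmm, ...) still get
#    observed ranges, then convert as usual.
calib_func(prepared)
int8_model = ipex.quantization.convert(prepared)
```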
---------

Signed-off-by: changwangss <chang1.wang@intel.com>
Signed-off-by: Xin He <xin3.he@intel.com>
Signed-off-by: y <xin3.he@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: changwangss <chang1.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
4 people authored Nov 6, 2023
1 parent 21668df commit 5e21b70
Showing 8 changed files with 15,163 additions and 165 deletions.
.azure-pipelines/scripts/ut/env_setup.sh (2 changes: 1 addition & 1 deletion)
@@ -92,7 +92,7 @@ pip install horovod
pip install transformers

if [[ $(echo "${test_case}" | grep -c "others") != 0 ]];then
pip install tf_slim xgboost accelerate==0.21.0
pip install tf_slim xgboost accelerate==0.21.0 peft
elif [[ $(echo "${test_case}" | grep -c "nas") != 0 ]]; then
pip install dynast==1.6.0rc1
elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then
neural_compressor/adaptor/pytorch.py (39 changes: 22 additions & 17 deletions)
@@ -1833,7 +1833,7 @@ def _apply_pre_optimization(self, model, tune_cfg, recover=False):
absorb_layer = op_name
absorbed_layer = info["absorbed_layer"]
input_minmax = info["input_minmax"]
weight_max = info["weight_max"]
weight_max = info["weight_max"].clamp(min=1e-5)
abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
input_power = torch.pow(abs_input_max, alpha)
weight_power = torch.pow(weight_max, 1 - alpha)
@@ -1858,11 +1858,12 @@ def qdq_quantize(self, model, tune_cfg):
"""
q_model = model._model
from .torch_utils.model_wrapper import QDQLinear, SQLinearWrapper
from .torch_utils.util import fetch_module, set_module
from .torch_utils.smooth_quant import get_module, set_module

smoothquant_scale_info = {}
fallback_op_name_list = []
stats_result = {}
stats_result["Linear(failed when SQ)"] = {"INT8(QDQ)": 0, "BF16": 0, "FP32": 0}
for (op_name, op_type), qconfig in tune_cfg["op"].items():
if op_type == "Linear" and qconfig["weight"]["dtype"] != "int8":
fallback_op_name_list.append(op_name)
@@ -1876,13 +1877,16 @@ def qdq_quantize(self, model, tune_cfg):
alpha = info["alpha"]
absorbed_layer = info["absorbed_layer"]
input_minmax = info["input_minmax"]
weight_max = info["weight_max"]
weight_max = info["weight_max"].clamp(min=1e-5)
abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
input_power = torch.pow(abs_input_max, alpha)
weight_power = torch.pow(weight_max, 1 - alpha)
scale = torch.clip(input_power / weight_power, min=1e-5)
if torch.isnan(scale).any() or torch.isinf(scale).any():
stats_result["Linear(failed when SQ)"]["FP32"] += 1
continue # for peft model,lora_B weights is 0.
for op_name in absorbed_layer:
module = fetch_module(q_model, op_name)
module = get_module(q_model, op_name)
new_module = SQLinearWrapper(module, 1.0 / scale, input_minmax, alpha)
set_module(q_model, op_name, new_module)
logger.debug(f"Current SmoothQuant alpha of {op_name} is {alpha}")
@@ -2858,7 +2862,7 @@ def _dump_model_op_stats(self, tune_cfg):
output_data, header="Mixed Precision Statistics", field_names=["Op Type", "Total", "INT8", "BF16", "FP32"]
).print_stat()

def _cfg_to_qconfig(self, tune_cfg):
def _cfg_to_qconfig(self, tune_cfg, smooth_quant=False):
"""Convert tune configure to quantization config for each op.
Args:
@@ -2949,7 +2953,7 @@ def _cfg_to_qconfig(self, tune_cfg):
else:
op_infos = copy.deepcopy(self.op_infos_from_cfgs)
self.cfgs = torch_utils.util.check_cfg_and_qconfig(
tune_cfg["op"], self.cfgs, op_infos, self.output_tensor_id_op_name
tune_cfg["op"], self.cfgs, op_infos, self.output_tensor_id_op_name, smooth_quant
)

with open(self.ipex_config_path, "w") as write_f:
@@ -3112,7 +3116,7 @@ def _get_quantizable_ops_recursively(self, model, prefix, quantizable_ops):
smooth_quant_args = self.recipes.get("smooth_quant_args", {})
folding = smooth_quant_args.get("folding", False)
if not folding:
if self.sq_minmax_init:
if self.sq_minmax_init or self.version.release >= Version("2.2").release:
from torch.ao.quantization.observer import MinMaxObserver

static_qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(
@@ -3268,19 +3272,20 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
if sq_max_info:
smoothquant_scale_info = {}
from .torch_utils.model_wrapper import SQLinearWrapper
from .torch_utils.util import fetch_module
from .torch_utils.smooth_quant import get_module

for _, info in sq_max_info.items():
alpha = info["alpha"]
absorbed_layer = info["absorbed_layer"]
input_minmax = info["input_minmax"]
weight_max = info["weight_max"]
# for peft model,lora_B weights is 0.
weight_max = info["weight_max"].clamp(min=1e-5)
abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
input_power = torch.pow(abs_input_max, alpha)
weight_power = torch.pow(weight_max, 1 - alpha)
scale = torch.clip(input_power / weight_power, min=1e-5)
for op_name in absorbed_layer:
module = copy.deepcopy(fetch_module(q_model._model, op_name))
module = copy.deepcopy(get_module(q_model._model, op_name))
new_module = SQLinearWrapper(module, 1.0 / scale, input_minmax, alpha)
weight_scale = new_module._get_weight_scale()
smoothquant_scale_info[op_name] = {
@@ -3296,7 +3301,7 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
# Check save_qconf_summary part is a workaround for IPEX bug.
# Sometimes the prepared model from get_op_capablitiy loss this attribute
if not hasattr(model._model, "save_qconf_summary") or not hasattr(model._model, "load_qconf_summary"):
if self.sq_minmax_init:
if self.sq_minmax_init or self.version.release >= Version("2.2").release:
from torch.ao.quantization.observer import MinMaxObserver

static_qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(
@@ -3313,10 +3318,14 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
model._model, static_qconfig, example_inputs=self.example_inputs, inplace=inplace
)

# TODO: update_sq_scale is used to update observer, should fuse in _cfg_to_qconfig
# The IPEX SmoothQuant observer can only use save/load_qconf_summary once.
# The save_qconf_summary API will freeze the scale used in model and calibration won't work anymore.
# The load_qconf_summary will overwrite the scales used in model but only work in the first call.
# Here, we use INC collected scale for Linear and set normal observer instead of SQObserver \
# to make sure calibration works for other ops, like add, bmm.
from .torch_utils.util import update_sq_scale

self._cfg_to_qconfig(tune_cfg)
self._cfg_to_qconfig(tune_cfg, smooth_quant=True)
update_sq_scale(self.ipex_config_path, smoothquant_scale_info)
model._model.load_qconf_summary(qconf_summary=self.ipex_config_path)

@@ -3337,10 +3346,6 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
+ "using scale info from SmoothQuant for Linear and "
+ "one iter calibration for other ops."
)
# update ipex_config.json with smoothquant_scale_info
model._model.save_qconf_summary(qconf_summary=self.ipex_config_path)
update_sq_scale(self.ipex_config_path, smoothquant_scale_info)
model._model.load_qconf_summary(qconf_summary=self.ipex_config_path)

self._ipex_post_quant_process(model, q_model, dataloader, inplace=inplace)

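The `clamp(min=1e-5)` added above matters because a freshly initialized LoRA adapter has lora_B set to all zeros, so its per-channel weight max is 0 and the SmoothQuant scale |X|_max^alpha / W_max^(1-alpha) divides by zero. A small standalone illustration (values are made up):
```python
import torch

alpha = 0.5
abs_input_max = torch.tensor([4.0, 2.0, 8.0])  # per-channel |X| max from calibration
weight_max = torch.tensor([0.0, 0.5, 1.0])     # lora_B starts at zero, so its max is 0

# Without clamping, 0 ** (1 - alpha) == 0 and the division produces inf,
# which is what the isnan/isinf fallback added earlier in this file guards against.
scale = torch.clip(abs_input_max.pow(alpha) / weight_max.pow(1 - alpha), min=1e-5)
print(scale)  # tensor([   inf, 2.0000, 2.8284])

# With the clamp introduced in this PR, the scale stays finite.
scale = torch.clip(abs_input_max.pow(alpha) / weight_max.clamp(min=1e-5).pow(1 - alpha), min=1e-5)
print(scale)  # tensor([632.4555,   2.0000,   2.8284])
```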
neural_compressor/adaptor/pytorch_ipex.yaml (4 changes: 2 additions & 2 deletions)
@@ -48,9 +48,9 @@
},
'activation': {
'dtype': ['uint8'],
'scheme': ['asym'],
'scheme': ['asym', 'sym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
'algorithm': ['minmax', 'kl']
}
},
},
neural_compressor/adaptor/torch_utils/smooth_quant.py (111 changes: 40 additions & 71 deletions)
@@ -182,6 +182,12 @@ def get_module(model, key):
for name in name_list:
if hasattr(module, name):
module = getattr(module, name)
elif hasattr(module, "sq_linear"): # for peft models
module = getattr(module, "sq_linear")
module = getattr(module, name)
elif hasattr(module, "orig_layer"): # for peft models and auto alpha
module = getattr(module, "orig_layer")
module = getattr(module, name)
else:
module = module
return module
@@ -200,8 +206,19 @@ def set_module(model, key, new_module):
for name in name_list[:-1]:
if hasattr(module, name):
module = getattr(module, name)
elif hasattr(module, ("sq_linear")): # for peft models that Linears are contained in Linear
module = getattr(module, "sq_linear")
module = getattr(module, name)
elif hasattr(module, ("orig_layer")): # for peft models and auto alpha
module = getattr(module, "orig_layer")
module = getattr(module, name)
else:
module = module

if hasattr(module, "sq_linear") and name_list[-1] != "sq_linear": # for peft models
module = getattr(module, "sq_linear")
if hasattr(module, "orig_layer") and name_list[-1] != "orig_layer": # for peft models and auto alpha
module = getattr(module, "orig_layer")
setattr(module, name_list[-1], new_module)


@@ -222,7 +239,7 @@ def cal_scale(input_max, weights, alpha, scale_type="orig"):
class WrapperLayer(torch.nn.Module):
def __init__(self, layer, input_min, input_max, save_q_input=False):
super(WrapperLayer, self).__init__()
self.orig_layer = layer
self.add_module("orig_layer", layer) # set orig_layer in get/set_module
self.quant = False
self.q_input = None
self.fp32_output = None
@@ -281,7 +298,7 @@ class TorchSmoothQuant:
to recover the weights if needed
"""

def __init__(self, model, dataloader, example_inputs=None, q_func=None, traced_model=None):
def __init__(self, model, dataloader=None, example_inputs=None, q_func=None, traced_model=None):
"""
:param model: Torch model :param dataloader: Calibration dataloader :param traced_model: A specific model
shares the same architecture as the model and could be traced by torch.jit. If not supplied, we use model
@@ -372,7 +389,7 @@ def _calibrate(self, absorb_to_layer, calib_iter, percentile):
##hook all the module
hook_modules = {}
for n, module in self.model.named_modules():
if module.__class__.__name__.split(".")[-1] in self.op_types:
if isinstance(module, tuple(self.op_types)):
hook_modules[n] = module

self._add_min_max_observer(hook_modules, percentile)
@@ -547,6 +564,8 @@ def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5, tuning=False):
alpha_tmp = alpha
elif isinstance(alpha, dict):
alpha_tmp = alpha[key]
else:
alpha_tmp = alpha
if alpha_tmp < 0:
scale = torch.ones((1), device=self.device)
else:
@@ -670,7 +689,7 @@ def _get_sq_layer_names(self):
def _get_all_hook_module_names(self):
module_names = []
for n, module in self.model.named_modules():
if module.__class__.__name__.split(".")[-1] in self.op_types:
if isinstance(module, tuple(self.op_types)):
module_names.append(n)
return module_names

@@ -680,25 +699,27 @@ def _qdq_model_wrapper_for_auto(self, save_q_input=False):
module_names = self._get_all_hook_module_names()
self.to_unwrap_module_names = module_names
for name in module_names:
if name not in self.input_mins: # skip module if it's not used in calibration
continue
module = get_module(self.model, name)
set_module(
self.model,
name,
WrapperLayer(module, self.input_mins[name], self.input_maxes[name], save_q_input=save_q_input),
)
new_module = WrapperLayer(module, self.input_mins[name], self.input_maxes[name], save_q_input=save_q_input)
set_module(self.model, name, new_module)

def _qdq_model_unwrapper_for_auto(self):
module_names = self.to_unwrap_module_names
for name in module_names:
module = get_module(self.model, name)
# print(name, flush=True)
if not hasattr(module, "orig_layer"): # skip module if it's not used in calibration
continue
set_module(self.model, name, module.orig_layer)

def _change_qdq_for_auto(self, enable=True):
module_names = self._get_all_hook_module_names()
for name in module_names:
name = name.split(".orig_layer")[0]
module = get_module(self.model, name)
if not hasattr(module, "orig_layer"): # skip module if it's not used in calibration
continue
if enable:
module.enable_quant()
else:
@@ -921,7 +942,7 @@ def transform(
alpha=0.5,
folding=False,
percentile=100,
op_types=["Linear", "Conv2d"],
op_types=[torch.nn.Linear, torch.nn.Conv2d],
scales_per_op=False,
calib_iter=100,
auto_alpha_args={"alpha_min": 0.0, "alpha_max": 1.0, "alpha_step": 0.1, "shared_criterion": "mean"},
@@ -953,12 +974,13 @@ def transform(
self.recover()
need_calibration = self._check_need_calibration(alpha, percentile, op_types, scales_per_op, calib_iter)
with torch.no_grad():
str_op_types = [i.__name__ for i in op_types]
input_maxes_abs = self.input_maxes_abs
if need_calibration: ##avoid multiple calibaration during tuning if the only difference is alpha
if self.insert_mul:
self.self_absorb_layers = self._get_all_layer_names() # TODO: only support linear now.
self.self_absorb_layers = self._get_all_layer_names(op_types) # TODO: only support linear now.
# fetch modules with the same input
group_modules = self._trace(op_types, skip_unsupported_layers=False)
group_modules = self._trace(str_op_types, skip_unsupported_layers=False)
if group_modules is not None:
# use one input for qkv
for k, v in group_modules.items():
@@ -969,7 +991,7 @@ def transform(
logger.debug(f"self_absorb_layers:{self.self_absorb_layers}")
if self.allow_absorb:
self.absorb_to_layer, no_absorb_layers = self._trace(
op_types
str_op_types
) ##TODO we need to insert mul layer for no_absorb_layers later
if self.absorb_to_layer is None and no_absorb_layers is None:
return self.model
@@ -1061,28 +1083,18 @@ def recover(self):
self.weight_scale_info = {} ##clear the data
self.absorb_scales_info = {}

def _get_all_layer_names(self, op_types=["Linear"]):
def _get_all_layer_names(self, op_types=[torch.nn.Linear]):
"""Try the model to find the layers which can be smooth quantized.
:param op_types: The op types to be smooth quantized
:return:
self_absorb_layer: A dict, absorb layer name (itself): layers to be smooth quantized
"""
self_absorb_layer = {}
op_types = [torch.nn.Linear] # TODO: only support SQLinearWrapper
for name, module in self.model.named_modules():
for op_type in op_types:
if op_type == str(module.__class__.__name__):
self_absorb_layer[name] = [name]
# remove duplicate Linear if Linear is wrapped by Linear
key_list = list(self_absorb_layer.keys())
key_list.sort()
duplicate_list = []
for i, k1 in enumerate(key_list):
for k2 in key_list[i + 1 :]:
if k1 in k2:
duplicate_list.append(k1)
for i in duplicate_list:
self_absorb_layer.pop(i)
if isinstance(module, tuple(op_types)):
self_absorb_layer[name] = [name]
return self_absorb_layer

def _get_example_input(self):
@@ -1334,46 +1346,3 @@ def remove_unsupported_layers(self, model, absorb_to_layer, no_absorb_layers):
if supported:
res[key] = absorb_to_layer[key]
return res


def update_sq_scale(ipex_config_path, smoothquant_scale_info):
"""Update ipex_config.json with smoothquant scale info generated by our algorithm.
Args:
ipex_config_path (str): a path to temporary ipex_config.json file.
smoothquant_scale_info (dict): a dict contains smoothquant scale info.
"""
with open(ipex_config_path, "r") as f:
ipex_config = json.load(f)
for module_name, v in ipex_config.items():
if "q_op_infos" in v and v["q_op_infos"]:
for op_num, v1 in v["q_op_infos"].items():
# update alpha data instead of updating weight scale
op_name = v1["fqn"] # fqn always exists even it's empty.
if op_name in smoothquant_scale_info:
# observers were overridden by the fallback step, setting it back.
v1["activation_observer"] = {
"name": "SmoothQuantActivationObserver",
"smooth_quant_enabled": False,
"dtype": "torch.quint8",
"qscheme": "torch.per_tensor_affine",
"reduce_range": False,
"quant_min": 0,
"quant_max": 255,
"alpha": smoothquant_scale_info[op_name]["alpha"],
}
v1["weight_observer"] = {
"name": "SmoothQuantWeightObserver",
"smooth_quant_enabled": False,
"dtype": "torch.qint8",
"qscheme": "torch.per_channel_symmetric",
"reduce_range": False,
"quant_min": -128,
"quant_max": 127,
"alpha": smoothquant_scale_info[op_name]["alpha"], # only update alpha
}
f.close()
# overwrite ipex_config_path
with open(ipex_config_path, "w") as f1:
json.dump(ipex_config, f1, indent=4)
f1.close()