support peft model quantization with SmoothQuant (#1282)
A PEFT model nests Linear modules inside a Linear module, as in the architecture below; a minimal sketch of how such a module is produced with peft follows the snippet. This pull request adds SmoothQuant support for that architecture.
```
(v): Linear(                                                                                                                          
  in_features=32, out_features=32, bias=False                                                                                         
  (lora_dropout): ModuleDict(                                                                                                         
    (default): Dropout(p=0.1, inplace=False)                                                                                          
  )                                                                                                                                   
  (lora_A): ModuleDict(                                                                                                               
    (default): Linear(in_features=32, out_features=8, bias=False)                                                                     
  )                                                                                                                                   
  (lora_B): ModuleDict(                                                                                                               
    (default): Linear(in_features=8, out_features=32, bias=False)                                                                     
  )                                                                                                                                   
  (lora_embedding_A): ParameterDict()                                                                                                 
  (lora_embedding_B): ParameterDict()  
```
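For reference, a minimal sketch (not part of this PR) of how such a nested-Linear structure arises with the peft library; the module name `v` and the LoRA hyperparameters are illustrative only, and the exact printed layout depends on the peft version:
```python
import torch
from peft import LoraConfig, get_peft_model


class TinyAttention(torch.nn.Module):
    """Toy module with a single value projection, standing in for a transformer block."""

    def __init__(self):
        super().__init__()
        self.v = torch.nn.Linear(32, 32, bias=False)

    def forward(self, x):
        return self.v(x)


# Target the `v` projection with a rank-8 LoRA adapter.
config = LoraConfig(r=8, lora_alpha=16, target_modules=["v"], lora_dropout=0.1)
peft_model = get_peft_model(TinyAttention(), config)

# `v` is now a LoRA Linear that holds lora_A/lora_B Linears inside it,
# i.e. the "Linears in Linear" structure the SmoothQuant code has to traverse.
print(peft_model)
```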
Note that with IPEX version <= 1.13, HistogramObserver does not support the asym scheme (the zero_point stays 0 for asym uint8), while MinMaxObserver works as expected.
Also, the IPEX SmoothQuant observer can only go through save_qconf_summary/load_qconf_summary once: save_qconf_summary freezes the scales used in the model so further calibration has no effect, and load_qconf_summary overwrites the scales in the model but only takes effect on the first call. Here we fall back to normal observers to work around this issue.
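As a rough sketch of the resulting flow (condensed from the qdq_quantize changes below; `fp32_model`, `static_qconfig`, `example_inputs`, `calib_func`, and `ipex_config_path` stand in for objects the adaptor already holds):
```python
import intel_extension_for_pytorch as ipex

# Prepare with the SmoothQuant qconfig mapping (MinMax-initialized observers).
prepared = ipex.quantization.prepare(
    fp32_model, static_qconfig, example_inputs=example_inputs, inplace=True
)

# 1. Dump the op-level qconfig (with smooth_quant=True) into ipex_config.json.
# 2. Patch the JSON so SmoothQuant-absorbed Linears carry INC's scales/alpha with
#    plain observers instead of IPEX's SQObserver.
# 3. Load the summary exactly once; a second load would not take effect.
prepared.load_qconf_summary(qconf_summary=ipex_config_path)

# 4. Run a short calibration pass so the remaining ops (add, bmm, ...) still get
#    observed ranges, then convert as usual.
calib_func(prepared)
int8_model = ipex.quantization.convert(prepared)
```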
---------

Signed-off-by: changwangss <chang1.wang@intel.com>
Signed-off-by: Xin He <xin3.he@intel.com>
Signed-off-by: y <xin3.he@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: changwangss <chang1.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
4 people authored Nov 6, 2023
1 parent 21668df commit 5e21b70
Showing 8 changed files with 15,163 additions and 165 deletions.
.azure-pipelines/scripts/ut/env_setup.sh (2 changes: 1 addition & 1 deletion)
@@ -92,7 +92,7 @@ pip install horovod
pip install transformers

if [[ $(echo "${test_case}" | grep -c "others") != 0 ]];then
pip install tf_slim xgboost accelerate==0.21.0
pip install tf_slim xgboost accelerate==0.21.0 peft
elif [[ $(echo "${test_case}" | grep -c "nas") != 0 ]]; then
pip install dynast==1.6.0rc1
elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then
neural_compressor/adaptor/pytorch.py (39 changes: 22 additions & 17 deletions)
@@ -1833,7 +1833,7 @@ def _apply_pre_optimization(self, model, tune_cfg, recover=False):
absorb_layer = op_name
absorbed_layer = info["absorbed_layer"]
input_minmax = info["input_minmax"]
weight_max = info["weight_max"]
weight_max = info["weight_max"].clamp(min=1e-5)
abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
input_power = torch.pow(abs_input_max, alpha)
weight_power = torch.pow(weight_max, 1 - alpha)
@@ -1858,11 +1858,12 @@ def qdq_quantize(self, model, tune_cfg):
"""
q_model = model._model
from .torch_utils.model_wrapper import QDQLinear, SQLinearWrapper
from .torch_utils.util import fetch_module, set_module
from .torch_utils.smooth_quant import get_module, set_module

smoothquant_scale_info = {}
fallback_op_name_list = []
stats_result = {}
stats_result["Linear(failed when SQ)"] = {"INT8(QDQ)": 0, "BF16": 0, "FP32": 0}
for (op_name, op_type), qconfig in tune_cfg["op"].items():
if op_type == "Linear" and qconfig["weight"]["dtype"] != "int8":
fallback_op_name_list.append(op_name)
@@ -1876,13 +1877,16 @@ def qdq_quantize(self, model, tune_cfg):
alpha = info["alpha"]
absorbed_layer = info["absorbed_layer"]
input_minmax = info["input_minmax"]
weight_max = info["weight_max"]
weight_max = info["weight_max"].clamp(min=1e-5)
abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
input_power = torch.pow(abs_input_max, alpha)
weight_power = torch.pow(weight_max, 1 - alpha)
scale = torch.clip(input_power / weight_power, min=1e-5)
if torch.isnan(scale).any() or torch.isinf(scale).any():
stats_result["Linear(failed when SQ)"]["FP32"] += 1
continue # for peft model,lora_B weights is 0.
for op_name in absorbed_layer:
module = fetch_module(q_model, op_name)
module = get_module(q_model, op_name)
new_module = SQLinearWrapper(module, 1.0 / scale, input_minmax, alpha)
set_module(q_model, op_name, new_module)
logger.debug(f"Current SmoothQuant alpha of {op_name} is {alpha}")
@@ -2858,7 +2862,7 @@ def _dump_model_op_stats(self, tune_cfg):
output_data, header="Mixed Precision Statistics", field_names=["Op Type", "Total", "INT8", "BF16", "FP32"]
).print_stat()

def _cfg_to_qconfig(self, tune_cfg):
def _cfg_to_qconfig(self, tune_cfg, smooth_quant=False):
"""Convert tune configure to quantization config for each op.
Args:
@@ -2949,7 +2953,7 @@ def _cfg_to_qconfig(self, tune_cfg):
else:
op_infos = copy.deepcopy(self.op_infos_from_cfgs)
self.cfgs = torch_utils.util.check_cfg_and_qconfig(
tune_cfg["op"], self.cfgs, op_infos, self.output_tensor_id_op_name
tune_cfg["op"], self.cfgs, op_infos, self.output_tensor_id_op_name, smooth_quant
)

with open(self.ipex_config_path, "w") as write_f:
@@ -3112,7 +3116,7 @@ def _get_quantizable_ops_recursively(self, model, prefix, quantizable_ops):
smooth_quant_args = self.recipes.get("smooth_quant_args", {})
folding = smooth_quant_args.get("folding", False)
if not folding:
if self.sq_minmax_init:
if self.sq_minmax_init or self.version.release >= Version("2.2").release:
from torch.ao.quantization.observer import MinMaxObserver

static_qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(
@@ -3268,19 +3272,20 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
if sq_max_info:
smoothquant_scale_info = {}
from .torch_utils.model_wrapper import SQLinearWrapper
from .torch_utils.util import fetch_module
from .torch_utils.smooth_quant import get_module

for _, info in sq_max_info.items():
alpha = info["alpha"]
absorbed_layer = info["absorbed_layer"]
input_minmax = info["input_minmax"]
weight_max = info["weight_max"]
# for peft model,lora_B weights is 0.
weight_max = info["weight_max"].clamp(min=1e-5)
abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
input_power = torch.pow(abs_input_max, alpha)
weight_power = torch.pow(weight_max, 1 - alpha)
scale = torch.clip(input_power / weight_power, min=1e-5)
for op_name in absorbed_layer:
module = copy.deepcopy(fetch_module(q_model._model, op_name))
module = copy.deepcopy(get_module(q_model._model, op_name))
new_module = SQLinearWrapper(module, 1.0 / scale, input_minmax, alpha)
weight_scale = new_module._get_weight_scale()
smoothquant_scale_info[op_name] = {
@@ -3296,7 +3301,7 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
# Check save_qconf_summary part is a workaround for IPEX bug.
# Sometimes the prepared model from get_op_capablitiy loss this attribute
if not hasattr(model._model, "save_qconf_summary") or not hasattr(model._model, "load_qconf_summary"):
if self.sq_minmax_init:
if self.sq_minmax_init or self.version.release >= Version("2.2").release:
from torch.ao.quantization.observer import MinMaxObserver

static_qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(
@@ -3313,10 +3318,14 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
model._model, static_qconfig, example_inputs=self.example_inputs, inplace=inplace
)

# TODO: update_sq_scale is used to update observer, should fuse in _cfg_to_qconfig
# The IPEX SmoothQuant observer can only use save/load_qconf_summary once.
# The save_qconf_summary API will freeze the scale used in model and calibration won't work anymore.
# The load_qconf_summary will overwrite the scales used in model but only work in the first call.
# Here, we use INC collected scale for Linear and set normal observer instead of SQObserver \
# to make sure calibration works for other ops, like add, bmm.
from .torch_utils.util import update_sq_scale

self._cfg_to_qconfig(tune_cfg)
self._cfg_to_qconfig(tune_cfg, smooth_quant=True)
update_sq_scale(self.ipex_config_path, smoothquant_scale_info)
model._model.load_qconf_summary(qconf_summary=self.ipex_config_path)

@@ -3337,10 +3346,6 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
+ "using scale info from SmoothQuant for Linear and "
+ "one iter calibration for other ops."
)
# update ipex_config.json with smoothquant_scale_info
model._model.save_qconf_summary(qconf_summary=self.ipex_config_path)
update_sq_scale(self.ipex_config_path, smoothquant_scale_info)
model._model.load_qconf_summary(qconf_summary=self.ipex_config_path)

self._ipex_post_quant_process(model, q_model, dataloader, inplace=inplace)

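The `clamp(min=1e-5)` added above matters because a freshly initialized LoRA adapter has lora_B set to all zeros, so its per-channel weight max is 0 and the SmoothQuant scale |X|_max^alpha / W_max^(1-alpha) divides by zero. A small standalone illustration (values are made up):
```python
import torch

alpha = 0.5
abs_input_max = torch.tensor([4.0, 2.0, 8.0])  # per-channel |X| max from calibration
weight_max = torch.tensor([0.0, 0.5, 1.0])     # lora_B starts at zero, so its max is 0

# Without clamping, 0 ** (1 - alpha) == 0 and the division produces inf,
# which is what the isnan/isinf fallback added earlier in this file guards against.
scale = torch.clip(abs_input_max.pow(alpha) / weight_max.pow(1 - alpha), min=1e-5)
print(scale)  # tensor([   inf, 2.0000, 2.8284])

# With the clamp introduced in this PR, the scale stays finite.
scale = torch.clip(abs_input_max.pow(alpha) / weight_max.clamp(min=1e-5).pow(1 - alpha), min=1e-5)
print(scale)  # tensor([632.4555,   2.0000,   2.8284])
```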
neural_compressor/adaptor/pytorch_ipex.yaml (4 changes: 2 additions & 2 deletions)
@@ -48,9 +48,9 @@
},
'activation': {
'dtype': ['uint8'],
'scheme': ['asym'],
'scheme': ['asym', 'sym'],
'granularity': ['per_tensor'],
'algorithm': ['minmax']
'algorithm': ['minmax', 'kl']
}
},
},
neural_compressor/adaptor/torch_utils/smooth_quant.py (111 changes: 40 additions & 71 deletions)
@@ -182,6 +182,12 @@ def get_module(model, key):
for name in name_list:
if hasattr(module, name):
module = getattr(module, name)
elif hasattr(module, "sq_linear"): # for peft models
module = getattr(module, "sq_linear")
module = getattr(module, name)
elif hasattr(module, "orig_layer"): # for peft models and auto alpha
module = getattr(module, "orig_layer")
module = getattr(module, name)
else:
module = module
return module
@@ -200,8 +206,19 @@ def set_module(model, key, new_module):
for name in name_list[:-1]:
if hasattr(module, name):
module = getattr(module, name)
elif hasattr(module, ("sq_linear")): # for peft models that Linears are contained in Linear
module = getattr(module, "sq_linear")
module = getattr(module, name)
elif hasattr(module, ("orig_layer")): # for peft models and auto alpha
module = getattr(module, "orig_layer")
module = getattr(module, name)
else:
module = module

if hasattr(module, "sq_linear") and name_list[-1] != "sq_linear": # for peft models
module = getattr(module, "sq_linear")
if hasattr(module, "orig_layer") and name_list[-1] != "orig_layer": # for peft models and auto alpha
module = getattr(module, "orig_layer")
setattr(module, name_list[-1], new_module)


@@ -222,7 +239,7 @@ def cal_scale(input_max, weights, alpha, scale_type="orig"):
class WrapperLayer(torch.nn.Module):
def __init__(self, layer, input_min, input_max, save_q_input=False):
super(WrapperLayer, self).__init__()
self.orig_layer = layer
self.add_module("orig_layer", layer) # set orig_layer in get/set_module
self.quant = False
self.q_input = None
self.fp32_output = None
@@ -281,7 +298,7 @@ class TorchSmoothQuant:
to recover the weights if needed
"""

def __init__(self, model, dataloader, example_inputs=None, q_func=None, traced_model=None):
def __init__(self, model, dataloader=None, example_inputs=None, q_func=None, traced_model=None):
"""
:param model: Torch model :param dataloader: Calibration dataloader :param traced_model: A specific model
shares the same architecture as the model and could be traced by torch.jit. If not supplied, we use model
@@ -372,7 +389,7 @@ def _calibrate(self, absorb_to_layer, calib_iter, percentile):
##hook all the module
hook_modules = {}
for n, module in self.model.named_modules():
if module.__class__.__name__.split(".")[-1] in self.op_types:
if isinstance(module, tuple(self.op_types)):
hook_modules[n] = module

self._add_min_max_observer(hook_modules, percentile)
@@ -547,6 +564,8 @@ def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5, tuning=False):
alpha_tmp = alpha
elif isinstance(alpha, dict):
alpha_tmp = alpha[key]
else:
alpha_tmp = alpha
if alpha_tmp < 0:
scale = torch.ones((1), device=self.device)
else:
@@ -670,7 +689,7 @@ def _get_sq_layer_names(self):
def _get_all_hook_module_names(self):
module_names = []
for n, module in self.model.named_modules():
if module.__class__.__name__.split(".")[-1] in self.op_types:
if isinstance(module, tuple(self.op_types)):
module_names.append(n)
return module_names

@@ -680,25 +699,27 @@ def _qdq_model_wrapper_for_auto(self, save_q_input=False):
module_names = self._get_all_hook_module_names()
self.to_unwrap_module_names = module_names
for name in module_names:
if name not in self.input_mins: # skip module if it's not used in calibration
continue
module = get_module(self.model, name)
set_module(
self.model,
name,
WrapperLayer(module, self.input_mins[name], self.input_maxes[name], save_q_input=save_q_input),
)
new_module = WrapperLayer(module, self.input_mins[name], self.input_maxes[name], save_q_input=save_q_input)
set_module(self.model, name, new_module)

def _qdq_model_unwrapper_for_auto(self):
module_names = self.to_unwrap_module_names
for name in module_names:
module = get_module(self.model, name)
# print(name, flush=True)
if not hasattr(module, "orig_layer"): # skip module if it's not used in calibration
continue
set_module(self.model, name, module.orig_layer)

def _change_qdq_for_auto(self, enable=True):
module_names = self._get_all_hook_module_names()
for name in module_names:
name = name.split(".orig_layer")[0]
module = get_module(self.model, name)
if not hasattr(module, "orig_layer"): # skip module if it's not used in calibration
continue
if enable:
module.enable_quant()
else:
@@ -921,7 +942,7 @@ def transform(
alpha=0.5,
folding=False,
percentile=100,
op_types=["Linear", "Conv2d"],
op_types=[torch.nn.Linear, torch.nn.Conv2d],
scales_per_op=False,
calib_iter=100,
auto_alpha_args={"alpha_min": 0.0, "alpha_max": 1.0, "alpha_step": 0.1, "shared_criterion": "mean"},
@@ -953,12 +974,13 @@ def transform(
self.recover()
need_calibration = self._check_need_calibration(alpha, percentile, op_types, scales_per_op, calib_iter)
with torch.no_grad():
str_op_types = [i.__name__ for i in op_types]
input_maxes_abs = self.input_maxes_abs
if need_calibration: ##avoid multiple calibaration during tuning if the only difference is alpha
if self.insert_mul:
self.self_absorb_layers = self._get_all_layer_names() # TODO: only support linear now.
self.self_absorb_layers = self._get_all_layer_names(op_types) # TODO: only support linear now.
# fetch modules with the same input
group_modules = self._trace(op_types, skip_unsupported_layers=False)
group_modules = self._trace(str_op_types, skip_unsupported_layers=False)
if group_modules is not None:
# use one input for qkv
for k, v in group_modules.items():
@@ -969,7 +991,7 @@ def transform(
logger.debug(f"self_absorb_layers:{self.self_absorb_layers}")
if self.allow_absorb:
self.absorb_to_layer, no_absorb_layers = self._trace(
op_types
str_op_types
) ##TODO we need to insert mul layer for no_absorb_layers later
if self.absorb_to_layer is None and no_absorb_layers is None:
return self.model
@@ -1061,28 +1083,18 @@ def recover(self):
self.weight_scale_info = {} ##clear the data
self.absorb_scales_info = {}

def _get_all_layer_names(self, op_types=["Linear"]):
def _get_all_layer_names(self, op_types=[torch.nn.Linear]):
"""Try the model to find the layers which can be smooth quantized.
:param op_types: The op types to be smooth quantized
:return:
self_absorb_layer: A dict, absorb layer name (itself): layers to be smooth quantized
"""
self_absorb_layer = {}
op_types = [torch.nn.Linear] # TODO: only support SQLinearWrapper
for name, module in self.model.named_modules():
for op_type in op_types:
if op_type == str(module.__class__.__name__):
self_absorb_layer[name] = [name]
# remove duplicate Linear if Linear is wrapped by Linear
key_list = list(self_absorb_layer.keys())
key_list.sort()
duplicate_list = []
for i, k1 in enumerate(key_list):
for k2 in key_list[i + 1 :]:
if k1 in k2:
duplicate_list.append(k1)
for i in duplicate_list:
self_absorb_layer.pop(i)
if isinstance(module, tuple(op_types)):
self_absorb_layer[name] = [name]
return self_absorb_layer

def _get_example_input(self):
@@ -1334,46 +1346,3 @@ def remove_unsupported_layers(self, model, absorb_to_layer, no_absorb_layers):
if supported:
res[key] = absorb_to_layer[key]
return res


def update_sq_scale(ipex_config_path, smoothquant_scale_info):
"""Update ipex_config.json with smoothquant scale info generated by our algorithm.
Args:
ipex_config_path (str): a path to temporary ipex_config.json file.
smoothquant_scale_info (dict): a dict contains smoothquant scale info.
"""
with open(ipex_config_path, "r") as f:
ipex_config = json.load(f)
for module_name, v in ipex_config.items():
if "q_op_infos" in v and v["q_op_infos"]:
for op_num, v1 in v["q_op_infos"].items():
# update alpha data instead of updating weight scale
op_name = v1["fqn"] # fqn always exists even it's empty.
if op_name in smoothquant_scale_info:
# observers were overridden by the fallback step, setting it back.
v1["activation_observer"] = {
"name": "SmoothQuantActivationObserver",
"smooth_quant_enabled": False,
"dtype": "torch.quint8",
"qscheme": "torch.per_tensor_affine",
"reduce_range": False,
"quant_min": 0,
"quant_max": 255,
"alpha": smoothquant_scale_info[op_name]["alpha"],
}
v1["weight_observer"] = {
"name": "SmoothQuantWeightObserver",
"smooth_quant_enabled": False,
"dtype": "torch.qint8",
"qscheme": "torch.per_channel_symmetric",
"reduce_range": False,
"quant_min": -128,
"quant_max": 127,
"alpha": smoothquant_scale_info[op_name]["alpha"], # only update alpha
}
f.close()
# overwrite ipex_config_path
with open(ipex_config_path, "w") as f1:
json.dump(ipex_config, f1, indent=4)
f1.close()