diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index 0ea8e47029b..19f01b8a63c 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -92,7 +92,7 @@ pip install horovod pip install transformers if [[ $(echo "${test_case}" | grep -c "others") != 0 ]];then - pip install tf_slim xgboost accelerate==0.21.0 + pip install tf_slim xgboost accelerate==0.21.0 peft elif [[ $(echo "${test_case}" | grep -c "nas") != 0 ]]; then pip install dynast==1.6.0rc1 elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 6548eb36b87..b6c4f944da6 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -1833,7 +1833,7 @@ def _apply_pre_optimization(self, model, tune_cfg, recover=False): absorb_layer = op_name absorbed_layer = info["absorbed_layer"] input_minmax = info["input_minmax"] - weight_max = info["weight_max"] + weight_max = info["weight_max"].clamp(min=1e-5) abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1])) input_power = torch.pow(abs_input_max, alpha) weight_power = torch.pow(weight_max, 1 - alpha) @@ -1858,11 +1858,12 @@ def qdq_quantize(self, model, tune_cfg): """ q_model = model._model from .torch_utils.model_wrapper import QDQLinear, SQLinearWrapper - from .torch_utils.util import fetch_module, set_module + from .torch_utils.smooth_quant import get_module, set_module smoothquant_scale_info = {} fallback_op_name_list = [] stats_result = {} + stats_result["Linear(failed when SQ)"] = {"INT8(QDQ)": 0, "BF16": 0, "FP32": 0} for (op_name, op_type), qconfig in tune_cfg["op"].items(): if op_type == "Linear" and qconfig["weight"]["dtype"] != "int8": fallback_op_name_list.append(op_name) @@ -1876,13 +1877,16 @@ def qdq_quantize(self, model, tune_cfg): alpha = info["alpha"] absorbed_layer = info["absorbed_layer"] 
input_minmax = info["input_minmax"] - weight_max = info["weight_max"] + weight_max = info["weight_max"].clamp(min=1e-5) abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1])) input_power = torch.pow(abs_input_max, alpha) weight_power = torch.pow(weight_max, 1 - alpha) scale = torch.clip(input_power / weight_power, min=1e-5) + if torch.isnan(scale).any() or torch.isinf(scale).any(): + stats_result["Linear(failed when SQ)"]["FP32"] += 1 + continue # for peft model,lora_B weights is 0. for op_name in absorbed_layer: - module = fetch_module(q_model, op_name) + module = get_module(q_model, op_name) new_module = SQLinearWrapper(module, 1.0 / scale, input_minmax, alpha) set_module(q_model, op_name, new_module) logger.debug(f"Current SmoothQuant alpha of {op_name} is {alpha}") @@ -2858,7 +2862,7 @@ def _dump_model_op_stats(self, tune_cfg): output_data, header="Mixed Precision Statistics", field_names=["Op Type", "Total", "INT8", "BF16", "FP32"] ).print_stat() - def _cfg_to_qconfig(self, tune_cfg): + def _cfg_to_qconfig(self, tune_cfg, smooth_quant=False): """Convert tune configure to quantization config for each op. 
Args: @@ -2949,7 +2953,7 @@ def _cfg_to_qconfig(self, tune_cfg): else: op_infos = copy.deepcopy(self.op_infos_from_cfgs) self.cfgs = torch_utils.util.check_cfg_and_qconfig( - tune_cfg["op"], self.cfgs, op_infos, self.output_tensor_id_op_name + tune_cfg["op"], self.cfgs, op_infos, self.output_tensor_id_op_name, smooth_quant ) with open(self.ipex_config_path, "w") as write_f: @@ -3112,7 +3116,7 @@ def _get_quantizable_ops_recursively(self, model, prefix, quantizable_ops): smooth_quant_args = self.recipes.get("smooth_quant_args", {}) folding = smooth_quant_args.get("folding", False) if not folding: - if self.sq_minmax_init: + if self.sq_minmax_init or self.version.release >= Version("2.2").release: from torch.ao.quantization.observer import MinMaxObserver static_qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping( @@ -3268,19 +3272,20 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func): if sq_max_info: smoothquant_scale_info = {} from .torch_utils.model_wrapper import SQLinearWrapper - from .torch_utils.util import fetch_module + from .torch_utils.smooth_quant import get_module for _, info in sq_max_info.items(): alpha = info["alpha"] absorbed_layer = info["absorbed_layer"] input_minmax = info["input_minmax"] - weight_max = info["weight_max"] + # for peft model,lora_B weights is 0. 
+ weight_max = info["weight_max"].clamp(min=1e-5) abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1])) input_power = torch.pow(abs_input_max, alpha) weight_power = torch.pow(weight_max, 1 - alpha) scale = torch.clip(input_power / weight_power, min=1e-5) for op_name in absorbed_layer: - module = copy.deepcopy(fetch_module(q_model._model, op_name)) + module = copy.deepcopy(get_module(q_model._model, op_name)) new_module = SQLinearWrapper(module, 1.0 / scale, input_minmax, alpha) weight_scale = new_module._get_weight_scale() smoothquant_scale_info[op_name] = { @@ -3296,7 +3301,7 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func): # Check save_qconf_summary part is a workaround for IPEX bug. # Sometimes the prepared model from get_op_capablitiy loss this attribute if not hasattr(model._model, "save_qconf_summary") or not hasattr(model._model, "load_qconf_summary"): - if self.sq_minmax_init: + if self.sq_minmax_init or self.version.release >= Version("2.2").release: from torch.ao.quantization.observer import MinMaxObserver static_qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping( @@ -3313,10 +3318,14 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func): model._model, static_qconfig, example_inputs=self.example_inputs, inplace=inplace ) - # TODO: update_sq_scale is used to update observer, should fuse in _cfg_to_qconfig + # The IPEX SmoothQuant observer can only use save/load_qconf_summary once. + # The save_qconf_summary API will freeze the scale used in model and calibration won't work anymore. + # The load_qconf_summary will overwrite the scales used in model but only work in the first call. + # Here, we use INC collected scale for Linear and set normal observer instead of SQObserver \ + # to make sure calibration works for other ops, like add, bmm. 
from .torch_utils.util import update_sq_scale - self._cfg_to_qconfig(tune_cfg) + self._cfg_to_qconfig(tune_cfg, smooth_quant=True) update_sq_scale(self.ipex_config_path, smoothquant_scale_info) model._model.load_qconf_summary(qconf_summary=self.ipex_config_path) @@ -3337,10 +3346,6 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func): + "using scale info from SmoothQuant for Linear and " + "one iter calibration for other ops." ) - # update ipex_config.json with smoothquant_scale_info - model._model.save_qconf_summary(qconf_summary=self.ipex_config_path) - update_sq_scale(self.ipex_config_path, smoothquant_scale_info) - model._model.load_qconf_summary(qconf_summary=self.ipex_config_path) self._ipex_post_quant_process(model, q_model, dataloader, inplace=inplace) diff --git a/neural_compressor/adaptor/pytorch_ipex.yaml b/neural_compressor/adaptor/pytorch_ipex.yaml index c6777bfe7b5..bdeb96a181c 100644 --- a/neural_compressor/adaptor/pytorch_ipex.yaml +++ b/neural_compressor/adaptor/pytorch_ipex.yaml @@ -48,9 +48,9 @@ }, 'activation': { 'dtype': ['uint8'], - 'scheme': ['asym'], + 'scheme': ['asym', 'sym'], 'granularity': ['per_tensor'], - 'algorithm': ['minmax'] + 'algorithm': ['minmax', 'kl'] } }, }, diff --git a/neural_compressor/adaptor/torch_utils/smooth_quant.py b/neural_compressor/adaptor/torch_utils/smooth_quant.py index 7b59b3ce3e5..b91b95e8563 100644 --- a/neural_compressor/adaptor/torch_utils/smooth_quant.py +++ b/neural_compressor/adaptor/torch_utils/smooth_quant.py @@ -182,6 +182,12 @@ def get_module(model, key): for name in name_list: if hasattr(module, name): module = getattr(module, name) + elif hasattr(module, "sq_linear"): # for peft models + module = getattr(module, "sq_linear") + module = getattr(module, name) + elif hasattr(module, "orig_layer"): # for peft models and auto alpha + module = getattr(module, "orig_layer") + module = getattr(module, name) else: module = module return module @@ -200,8 +206,19 @@ def set_module(model, 
key, new_module): for name in name_list[:-1]: if hasattr(module, name): module = getattr(module, name) + elif hasattr(module, ("sq_linear")): # for peft models that Linears are contained in Linear + module = getattr(module, "sq_linear") + module = getattr(module, name) + elif hasattr(module, ("orig_layer")): # for peft models and auto alpha + module = getattr(module, "orig_layer") + module = getattr(module, name) else: module = module + + if hasattr(module, "sq_linear") and name_list[-1] != "sq_linear": # for peft models + module = getattr(module, "sq_linear") + if hasattr(module, "orig_layer") and name_list[-1] != "orig_layer": # for peft models and auto alpha + module = getattr(module, "orig_layer") setattr(module, name_list[-1], new_module) @@ -222,7 +239,7 @@ def cal_scale(input_max, weights, alpha, scale_type="orig"): class WrapperLayer(torch.nn.Module): def __init__(self, layer, input_min, input_max, save_q_input=False): super(WrapperLayer, self).__init__() - self.orig_layer = layer + self.add_module("orig_layer", layer) # set orig_layer in get/set_module self.quant = False self.q_input = None self.fp32_output = None @@ -281,7 +298,7 @@ class TorchSmoothQuant: to recover the weights if needed """ - def __init__(self, model, dataloader, example_inputs=None, q_func=None, traced_model=None): + def __init__(self, model, dataloader=None, example_inputs=None, q_func=None, traced_model=None): """ :param model: Torch model :param dataloader: Calibration dataloader :param traced_model: A specific model shares the same architecture as the model and could be traced by torch.jit. 
If not supplied, we use model @@ -372,7 +389,7 @@ def _calibrate(self, absorb_to_layer, calib_iter, percentile): ##hook all the module hook_modules = {} for n, module in self.model.named_modules(): - if module.__class__.__name__.split(".")[-1] in self.op_types: + if isinstance(module, tuple(self.op_types)): hook_modules[n] = module self._add_min_max_observer(hook_modules, percentile) @@ -547,6 +564,8 @@ def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5, tuning=False): alpha_tmp = alpha elif isinstance(alpha, dict): alpha_tmp = alpha[key] + else: + alpha_tmp = alpha if alpha_tmp < 0: scale = torch.ones((1), device=self.device) else: @@ -670,7 +689,7 @@ def _get_sq_layer_names(self): def _get_all_hook_module_names(self): module_names = [] for n, module in self.model.named_modules(): - if module.__class__.__name__.split(".")[-1] in self.op_types: + if isinstance(module, tuple(self.op_types)): module_names.append(n) return module_names @@ -680,18 +699,18 @@ def _qdq_model_wrapper_for_auto(self, save_q_input=False): module_names = self._get_all_hook_module_names() self.to_unwrap_module_names = module_names for name in module_names: + if name not in self.input_mins: # skip module if it's not used in calibration + continue module = get_module(self.model, name) - set_module( - self.model, - name, - WrapperLayer(module, self.input_mins[name], self.input_maxes[name], save_q_input=save_q_input), - ) + new_module = WrapperLayer(module, self.input_mins[name], self.input_maxes[name], save_q_input=save_q_input) + set_module(self.model, name, new_module) def _qdq_model_unwrapper_for_auto(self): module_names = self.to_unwrap_module_names for name in module_names: module = get_module(self.model, name) - # print(name, flush=True) + if not hasattr(module, "orig_layer"): # skip module if it's not used in calibration + continue set_module(self.model, name, module.orig_layer) def _change_qdq_for_auto(self, enable=True): @@ -699,6 +718,8 @@ def _change_qdq_for_auto(self, 
enable=True): for name in module_names: name = name.split(".orig_layer")[0] module = get_module(self.model, name) + if not hasattr(module, "orig_layer"): # skip module if it's not used in calibration + continue if enable: module.enable_quant() else: @@ -921,7 +942,7 @@ def transform( alpha=0.5, folding=False, percentile=100, - op_types=["Linear", "Conv2d"], + op_types=[torch.nn.Linear, torch.nn.Conv2d], scales_per_op=False, calib_iter=100, auto_alpha_args={"alpha_min": 0.0, "alpha_max": 1.0, "alpha_step": 0.1, "shared_criterion": "mean"}, @@ -953,12 +974,13 @@ def transform( self.recover() need_calibration = self._check_need_calibration(alpha, percentile, op_types, scales_per_op, calib_iter) with torch.no_grad(): + str_op_types = [i.__name__ for i in op_types] input_maxes_abs = self.input_maxes_abs if need_calibration: ##avoid multiple calibaration during tuning if the only difference is alpha if self.insert_mul: - self.self_absorb_layers = self._get_all_layer_names() # TODO: only support linear now. + self.self_absorb_layers = self._get_all_layer_names(op_types) # TODO: only support linear now. 
# fetch modules with the same input - group_modules = self._trace(op_types, skip_unsupported_layers=False) + group_modules = self._trace(str_op_types, skip_unsupported_layers=False) if group_modules is not None: # use one input for qkv for k, v in group_modules.items(): @@ -969,7 +991,7 @@ def transform( logger.debug(f"self_absorb_layers:{self.self_absorb_layers}") if self.allow_absorb: self.absorb_to_layer, no_absorb_layers = self._trace( - op_types + str_op_types ) ##TODO we need to insert mul layer for no_absorb_layers later if self.absorb_to_layer is None and no_absorb_layers is None: return self.model @@ -1061,7 +1083,7 @@ def recover(self): self.weight_scale_info = {} ##clear the data self.absorb_scales_info = {} - def _get_all_layer_names(self, op_types=["Linear"]): + def _get_all_layer_names(self, op_types=[torch.nn.Linear]): """Try the model to find the layers which can be smooth quantized. :param op_types: The op types to be smooth quantized @@ -1069,20 +1091,10 @@ def _get_all_layer_names(self, op_types=["Linear"]): self_absorb_layer: A dict, absorb layer name (itself): layers to be smooth quantized """ self_absorb_layer = {} + op_types = [torch.nn.Linear] # TODO: only support SQLinearWrapper for name, module in self.model.named_modules(): - for op_type in op_types: - if op_type == str(module.__class__.__name__): - self_absorb_layer[name] = [name] - # remove duplicate Linear if Linear is wrapped by Linear - key_list = list(self_absorb_layer.keys()) - key_list.sort() - duplicate_list = [] - for i, k1 in enumerate(key_list): - for k2 in key_list[i + 1 :]: - if k1 in k2: - duplicate_list.append(k1) - for i in duplicate_list: - self_absorb_layer.pop(i) + if isinstance(module, tuple(op_types)): + self_absorb_layer[name] = [name] return self_absorb_layer def _get_example_input(self): @@ -1334,46 +1346,3 @@ def remove_unsupported_layers(self, model, absorb_to_layer, no_absorb_layers): if supported: res[key] = absorb_to_layer[key] return res - - -def 
update_sq_scale(ipex_config_path, smoothquant_scale_info): - """Update ipex_config.json with smoothquant scale info generated by our algorithm. - - Args: - ipex_config_path (str): a path to temporary ipex_config.json file. - smoothquant_scale_info (dict): a dict contains smoothquant scale info. - """ - with open(ipex_config_path, "r") as f: - ipex_config = json.load(f) - for module_name, v in ipex_config.items(): - if "q_op_infos" in v and v["q_op_infos"]: - for op_num, v1 in v["q_op_infos"].items(): - # update alpha data instead of updating weight scale - op_name = v1["fqn"] # fqn always exists even it's empty. - if op_name in smoothquant_scale_info: - # observers were overridden by the fallback step, setting it back. - v1["activation_observer"] = { - "name": "SmoothQuantActivationObserver", - "smooth_quant_enabled": False, - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - "alpha": smoothquant_scale_info[op_name]["alpha"], - } - v1["weight_observer"] = { - "name": "SmoothQuantWeightObserver", - "smooth_quant_enabled": False, - "dtype": "torch.qint8", - "qscheme": "torch.per_channel_symmetric", - "reduce_range": False, - "quant_min": -128, - "quant_max": 127, - "alpha": smoothquant_scale_info[op_name]["alpha"], # only update alpha - } - f.close() - # overwrite ipex_config_path - with open(ipex_config_path, "w") as f1: - json.dump(ipex_config, f1, indent=4) - f1.close() diff --git a/neural_compressor/adaptor/torch_utils/util.py b/neural_compressor/adaptor/torch_utils/util.py index 9b63e51f03b..907111b00f6 100644 --- a/neural_compressor/adaptor/torch_utils/util.py +++ b/neural_compressor/adaptor/torch_utils/util.py @@ -151,7 +151,7 @@ def append_attr(fx_model, model, fx_white_list=[]): return fx_model -def generate_activation_observer(scheme, algorithm): # pragma: no cover +def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_quant_enable=False): # pragma: no 
cover """This is a helper method to generate an activation observer. Args: @@ -179,6 +179,46 @@ def generate_activation_observer(scheme, algorithm): # pragma: no cover "quant_min": 0, "quant_max": 255, } + smoothquant_kl_activation_observer = { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": smooth_quant_enable, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": kl_activation_observer, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + }, + } + smoothquant_minmax_activation_observer = { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": smooth_quant_enable, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": minmax_activation_observer, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": False, + "quant_min": 0, + "quant_max": 255, + }, + } REDUCE_RANGE = False if CpuInfo().vnni else True if REDUCE_RANGE: minmax_activation_observer["reduce_range"] = REDUCE_RANGE @@ -192,13 +232,21 @@ def generate_activation_observer(scheme, algorithm): # pragma: no cover kl_activation_observer["dtype"] = "torch.qint8" kl_activation_observer["quant_min"] = -128 kl_activation_observer["quant_max"] = 127 - if algorithm == "kl": - return kl_activation_observer - if algorithm == "minmax": - return minmax_activation_observer + if smooth_quant and smooth_quant_enable: + if algorithm == "kl": + return smoothquant_kl_activation_observer + if algorithm == "minmax": + return smoothquant_minmax_activation_observer + else: + if algorithm == "kl": + return 
kl_activation_observer + if algorithm == "minmax": + return minmax_activation_observer -def check_cfg_and_qconfig(tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name): # pragma: no cover +def check_cfg_and_qconfig( + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, smooth_quant=False +): # pragma: no cover """Check configs and quantization configs. Args: @@ -228,11 +276,21 @@ def check_cfg_and_qconfig(tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_ inc_scheme = inc_op_cfg["activation"]["scheme"] inc_algorithm = inc_op_cfg["activation"]["algorithm"] ipex_op_cfg["input_tensor_infos"] = input_tensor_infos - activation_observer = generate_activation_observer(inc_scheme, inc_algorithm) - if inc_scheme == "sym": - input_tensor_infos[index]["force_dtype"] = "torch.qint8" - if inc_scheme == "asym": - input_tensor_infos[index]["force_dtype"] = "torch.quint8" + if ( + "op_type" in ipex_op_cfg + and ipex_op_cfg["op_type"] == "" + ): + smooth_quant_enable = True + else: + smooth_quant_enable = False + activation_observer = generate_activation_observer( + inc_scheme, inc_algorithm, smooth_quant, smooth_quant_enable + ) + if not smooth_quant: + if inc_scheme == "sym": + input_tensor_infos[index]["force_dtype"] = "torch.qint8" + if inc_scheme == "asym": + input_tensor_infos[index]["force_dtype"] = "torch.quint8" ipex_op_cfg["activation_observer"] = activation_observer # int8 -> fp32 else: @@ -397,7 +455,7 @@ def update_sq_scale(ipex_config_path, smoothquant_scale_info): for op_num, v1 in v["q_op_infos"].items(): # update alpha data instead of updating weight scale op_name = v1["fqn"] # fqn always exists even it's empty. 
- if op_name in smoothquant_scale_info: + if op_name in smoothquant_scale_info and v1["op_type_is_module"]: input_scale_for_mul = smoothquant_scale_info[op_name]["input_scale_for_mul"].tolist() input_scale_after_mul = smoothquant_scale_info[op_name]["input_scale_after_mul"].tolist() input_zero_point_after_mul = smoothquant_scale_info[op_name][ @@ -405,74 +463,19 @@ def update_sq_scale(ipex_config_path, smoothquant_scale_info): ].tolist() weight_scale_for_mul = (1 / smoothquant_scale_info[op_name]["input_scale_for_mul"]).tolist() weight_scale_after_mul = smoothquant_scale_info[op_name]["weight_scale_after_mul"].tolist() - v1["input_tensor_infos"][0]["smooth_quant_scaling_factor"] = input_scale_for_mul v1["input_tensor_infos"][0]["scale"] = input_scale_after_mul v1["input_tensor_infos"][0]["zero_point"] = input_zero_point_after_mul + v1["input_tensor_infos"][0]["smooth_quant_scaling_factor"] = input_scale_for_mul v1["weight_tensor_infos"][0]["smooth_quant_scaling_factor"] = weight_scale_for_mul v1["weight_tensor_infos"][0]["scale"] = weight_scale_after_mul # # observers were overridden by the fallback step, setting it back. 
- v1["activation_observer"] = { - "name": "SmoothQuantActivationObserver", - "smooth_quant_enabled": True, - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - "alpha": smoothquant_scale_info[op_name]["alpha"], - "act_observer": { - "name": "HistogramObserver", - "bins": 2048, - "upsample_rate": 128, - "dtype": "torch.quint8", - "qscheme": "torch.per_tensor_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - }, - "act_ic_observer": { - "name": "PerChannelMinMaxObserver", - "ch_axis": -1, - "dtype": "torch.quint8", - "qscheme": "torch.per_channel_affine", - "reduce_range": False, - "quant_min": 0, - "quant_max": 255, - }, - } - v1["weight_observer"] = { - "name": "SmoothQuantWeightObserver", - "smooth_quant_enabled": True, - "dtype": "torch.qint8", - "qscheme": "torch.per_channel_symmetric", - "reduce_range": False, - "quant_min": -128, - "quant_max": 127, - "alpha": smoothquant_scale_info[op_name]["alpha"], - "wei_observer": { - "name": "PerChannelMinMaxObserver", - "ch_axis": 0, - "dtype": "torch.qint8", - "qscheme": "torch.per_channel_symmetric", - "reduce_range": False, - "quant_min": -128, - "quant_max": 127, - }, - "wei_ic_observer": { - "name": "PerChannelMinMaxObserver", - "ch_axis": 1, - "dtype": "torch.qint8", - "qscheme": "torch.per_channel_affine", - "reduce_range": False, - "quant_min": -128, - "quant_max": 127, - }, - } f.close() # overwrite ipex_config_path with open(ipex_config_path, "w") as f1: json.dump(ipex_config, f1, indent=4) f1.close() + print(ipex_config_path) + # exit(0) def auto_copy(module): # pragma: no cover diff --git a/test/algorithm/ipex_config_tmp.json b/test/algorithm/ipex_config_tmp.json new file mode 100644 index 00000000000..12927ad8b0d --- /dev/null +++ b/test/algorithm/ipex_config_tmp.json @@ -0,0 +1,14881 @@ +{ + " ": { + "q_op_infos": {}, + "nonq_op_infos": {}, + "layer_output_infos": [ + { + "id": 299, + "orig_dtype": 
"torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model": { + "q_op_infos": {}, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model", + "input_tensor_infos": [ + { + "id": 0, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 293, + "orig_dtype": "torch.bool", + "inf_dtype": "torch.bool" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model", + "input_tensor_infos": [ + { + "id": 293, + "orig_dtype": "torch.bool", + "inf_dtype": "torch.bool" + } + ], + "output_tensor_infos": [ + { + "id": 294, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model", + "input_tensor_infos": [ + { + "id": 294, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 295, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model", + "input_tensor_infos": [ + { + "id": 295, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 296, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model", + "input_tensor_infos": [ + { + "id": 296, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 297, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model", + "input_tensor_infos": [ + { + "id": 292, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 298, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + }, + { + "id": 297, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + } + ], + "output_tensor_infos": [ + { + "id": 299, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + 
"layer_output_infos": [ + { + "id": 299, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model": { + "q_op_infos": {}, + "nonq_op_infos": {}, + "layer_output_infos": [] + }, + "base_model:model:model": { + "q_op_infos": {}, + "nonq_op_infos": {}, + "layer_output_infos": [ + { + "id": 291, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "base_model:model:model:decoder": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder", + "input_tensor_infos": [ + { + "id": 2, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 1.1920928955078125e-07 + ], + "zero_point": [ + 0 + ] + }, + { + "id": 3, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.00036990849184803665 + ], + "zero_point": [ + 128 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 4, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + 
"quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder", + "input_tensor_infos": [ + { + "id": 0, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + } + ], + "output_tensor_infos": [] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder", + "input_tensor_infos": [ + { + "id": 0, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 1, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.embed_tokens", + "input_tensor_infos": [ + { + "id": 1, + "orig_dtype": "torch.int64", + "inf_dtype": "torch.int64" + } + ], + "output_tensor_infos": [ + { + "id": 2, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.final_layer_norm", + "input_tensor_infos": [ + { + "id": 290, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 291, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 291, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "base_model:model:model:decoder:embed_positions": { + "q_op_infos": {}, + "nonq_op_infos": {}, + "layer_output_infos": [ + { + "id": 3, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] 
+ }, + "base_model:model:model:decoder:layers": { + "q_op_infos": {}, + "nonq_op_infos": {}, + "layer_output_infos": [] + }, + "base_model:model:model:decoder:layers:0": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.0", + "input_tensor_infos": [ + { + "id": 4, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.00036990849184803665 + ], + "zero_point": [ + 128 + ] + }, + { + "id": 53, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 8.216560672735795e-05 + ], + "zero_point": [ + 123 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 54, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 
+ }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.0.fc1", + "input_tensor_infos": [ + { + "id": 56, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.002199406735599041 + ], + "zero_point": [ + 125 + ], + "smooth_quant_scaling_factor": [ + 0.10962644964456558, + 0.11907318979501724, + 0.09595140069723129, + 0.13423262536525726, + 0.14291484653949738, + 0.13098719716072083, + 0.12959600985050201, + 0.13702590763568878, + 0.1146077886223793, + 0.14385050535202026, + 0.12177281826734543, + 0.1325545758008957, + 0.10560321807861328, + 0.14362241327762604, + 0.14758096635341644, + 0.1234637051820755 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0018767404835671186, + 0.0021353804040700197, + 0.002240665489807725, + 0.0021580320317298174 + ], + "zero_point": [ + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 9.121886253356934, + 8.39819622039795, + 10.421942710876465, + 7.449753761291504, + 6.99717378616333, + 7.634334087371826, + 7.716286659240723, + 7.2978901863098145, + 8.725410461425781, + 6.951661586761475, + 8.212013244628906, + 7.54406213760376, + 9.469408988952637, + 6.962701320648193, + 6.775941848754883, + 8.099546432495117 + ] + } + ], + "output_tensor_infos": [ + { + "id": 57, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "scale": [ + 0.0014130824711173773 + ], + "zero_point": [ + 119 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + 
"quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.0.activation_fn", + "input_tensor_infos": [ + { + "id": 57, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0014130824711173773 + ], + "zero_point": [ + 119 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 58, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0004062627849634737 + ], + "zero_point": [ + 0 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + 
"upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.0.fc2", + "input_tensor_infos": [ + { + "id": 58, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0004062627849634737 + ], + "zero_point": [ + 0 + ], + "smooth_quant_scaling_factor": [ + 0.549189031124115, + 0.4980548024177551, + 0.539968729019165, + 0.5409433245658875 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0008125152671709657, + 0.0004290587385185063, + 0.00045011902693659067, + 0.0003873987589031458, + 0.0004886860842816532, + 0.00013036445307079703, + 0.0004251394420862198, + 0.0006477311835624278, + 0.0001262984733330086, + 0.0002167609054595232, + 0.0003500462626107037, + 0.00043075167923234403, + 0.000206843062187545, + 0.0005493324715644121, + 0.0005107946344651282, + 
0.0006017627310939133 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1.820866584777832, + 2.0078110694885254, + 1.8519591093063354, + 1.848622441291809 + ] + } + ], + "output_tensor_infos": [ + { + "id": 59, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.0", + "input_tensor_infos": [ + { + "id": 55, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": 
"torch.float32", + "scale": [ + 0.0003670741862151772 + ], + "zero_point": [ + 126 + ] + }, + { + "id": 60, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 7.231033669086173e-05 + ], + "zero_point": [ + 107 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 61, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn_layer_norm", + "input_tensor_infos": [ + { + "id": 4, + "orig_dtype": 
"torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 5, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0", + "input_tensor_infos": [ + { + "id": 52, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 53, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0", + "input_tensor_infos": [ + { + "id": 54, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0", + "input_tensor_infos": [ + { + "id": 54, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 55, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.final_layer_norm", + "input_tensor_infos": [ + { + "id": 55, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 56, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0", + "input_tensor_infos": [ + { + "id": 59, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 60, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0", + "input_tensor_infos": [ + { + "id": 61, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 62, + "orig_dtype": "torch.float32", + "inf_dtype": 
"torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 62, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:0:self_attn": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.0.self_attn.k_proj", + "input_tensor_infos": [ + { + "id": 9, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0025989431887865067 + ], + "zero_point": [ + 125 + ], + "smooth_quant_scaling_factor": [ + 0.13708241283893585, + 0.16369596123695374, + 0.12329072505235672, + 0.16909663379192352, + 0.1828479766845703, + 0.162990003824234, + 0.14222995936870575, + 0.1607711762189865, + 0.12252063304185867, + 0.14310497045516968, + 0.12626583874225616, + 0.18967536091804504, + 0.15503790974617004, + 0.1411832571029663, + 0.1677650362253189, + 0.12387727946043015 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.001300352392718196, + 0.001692790538072586, + 0.0017874225741252303, + 0.0020049407612532377, + 0.00210148305632174, + 0.0023782390635460615, + 0.002406417392194271, + 0.0021967601496726274, + 0.0015098107978701591, + 0.0026584486477077007, + 0.0022751193027943373, + 0.002421419369056821, + 0.0025393515825271606, + 0.0016408554511144757, + 0.0018360354006290436, + 0.0018435503588989377 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 7.294881820678711, + 6.108886241912842, + 8.110910415649414, + 5.913778305053711, + 5.469024181365967, + 6.135345935821533, + 7.030867576599121, + 6.220020294189453, + 8.161890983581543, + 6.987877368927002, + 7.919798851013184, + 5.2721662521362305, + 6.450035095214844, + 7.0829925537109375, + 5.960717678070068, + 8.072505950927734 + ] + } + ], + "output_tensor_infos": [ + { + "id": 
16, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 35, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0008092291536740959 + ], + "zero_point": [ + 117 + ] + }, + { + "id": 38, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0016102794324979186 + ], + "zero_point": [ + 142 + ] + } + ], + 
"weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 39, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 40, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0001863027282524854 + ], + "zero_point": [ + 119 + ] + }, + { + "id": 41, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 1.3344405750530544e+36 + ], + "zero_point": [ + 255 + ] + } + 
], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 42, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 47, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.003919653594493866 + ], + "zero_point": [ + 0 + ] + }, + { + "id": 37, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 
0.0015389698091894388 + ], + "zero_point": [ + 132 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 48, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.0.self_attn.out_proj", + "input_tensor_infos": [ + { + "id": 51, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0006978802848607302 + ], + "zero_point": [ + 119 + ], + "smooth_quant_scaling_factor": [ + 0.4900006949901581, + 0.6840704083442688, + 0.7935751080513, + 0.6235007047653198, + 
0.9865836501121521, + 0.5714253783226013, + 0.8067981004714966, + 0.7140294909477234, + 0.5045593976974487, + 0.5837582349777222, + 0.5605868101119995, + 0.5316323041915894, + 1.5545496940612793, + 0.7744913101196289, + 0.8083890676498413, + 0.619972288608551 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0005827043205499649, + 0.00046368170296773314, + 0.0003405930183362216, + 0.0007429186953231692, + 0.00036982446908950806, + 0.00032228915370069444, + 0.00030829786555841565, + 0.0005591728840954602, + 0.0005513495416380465, + 0.0006021953886374831, + 0.0006528276135213673, + 0.0005915488582104445, + 0.0005453170160762966, + 0.000527423806488514, + 0.0005745518137700856, + 0.0005068411701358855 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 2.040813446044922, + 1.4618377685546875, + 1.260120153427124, + 1.6038473844528198, + 1.0135987997055054, + 1.7500097751617432, + 1.2394675016403198, + 1.400502324104309, + 1.9819272756576538, + 1.7130379676818848, + 1.7838449478149414, + 1.8809993267059326, + 0.643273115158081, + 1.2911701202392578, + 1.2370281219482422, + 1.6129753589630127 + ] + } + ], + "output_tensor_infos": [ + { + "id": 52, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": 
"torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 5, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ], + "output_tensor_infos": [] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 14, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 15, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 16, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 17, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 17, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 18, + "orig_dtype": 
"torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 18, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 19, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 28, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 29, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 29, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 30, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "7": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 30, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 31, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "8": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 15, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 32, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "9": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 32, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 33, + 
"orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "10": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 33, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 34, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "11": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 34, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 35, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "12": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 19, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 36, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "13": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 31, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 37, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "14": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 36, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "15": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 36, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 38, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + 
}, + "16": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 39, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "17": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 39, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 40, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "18": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 42, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 43, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 44, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "19": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 44, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 45, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "20": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 45, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 46, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "21": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 46, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 47, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "22": { + 
"op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 48, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "23": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 48, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 49, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "24": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 49, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 50, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "25": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn", + "input_tensor_infos": [ + { + "id": 50, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 51, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 52, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:0:self_attn:v_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.0.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 9, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.017885278910398483 + ], + "zero_point": [ + 124 + ] + }, + { + "id": 20, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.000445978861534968, + 0.0003330075996927917, + 0.00025916812592186034, + 
0.00038120243698358536, + 0.00024034043599385768, + 0.00027592855622060597, + 0.0003112396807409823, + 0.0003448774223215878, + 0.0004228033940307796, + 0.00035102569381706417, + 0.0003638050111476332, + 0.00040324265137314796, + 0.00014413250028155744, + 0.00036997394636273384, + 0.00037873946712352335, + 0.0003710437740664929 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 21, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": 
"base_model.model.model.decoder.layers.0.self_attn.v_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 23, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.005615917034447193 + ], + "zero_point": [ + 123 + ], + "smooth_quant_scaling_factor": [ + 0.28679710626602173, + 0.3436911106109619, + 0.3373546898365021, + 0.38693225383758545, + 0.40539291501045227, + 0.3380838930606842, + 0.3377487063407898, + 0.35613951086997986, + 0.3338879346847534, + 0.3551289439201355, + 0.3355533480644226, + 0.3332350552082062, + 0.31778484582901, + 0.33874425292015076, + 0.3934788107872009, + 0.31613245606422424 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.005829033441841602, + 0.005098548252135515, + 0.005407379940152168, + 0.005123750306665897, + 0.005402757786214352, + 0.005312266293913126, + 0.0052151489071547985, + 0.005331622902303934 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.486785650253296, + 2.9095892906188965, + 2.9642391204833984, + 2.5844316482543945, + 2.466742753982544, + 2.95784592628479, + 2.9607810974121094, + 2.8078885078430176, + 2.9950170516967773, + 2.8158786296844482, + 2.980152130126953, + 3.000884771347046, + 3.1467833518981934, + 2.9520797729492188, + 2.541433095932007, + 3.1632308959960938 + ] + } + ], + "output_tensor_infos": [ + { + "id": 24, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 8.888718184607569e-06 + ], + "zero_point": [ + 102 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + 
"qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.0.self_attn.v_proj.lora_B.default", + "input_tensor_infos": [ + { + "id": 24, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 8.888718184607569e-06 + ], + "zero_point": [ + 102 + ], + "smooth_quant_scaling_factor": [ + 0.0011025747517123818, + 0.000735479814466089, + 0.0008849164005368948, + 0.0009793724166229367, + 0.001015718444250524, + 0.001038661110214889, + 0.0011344861704856157, + 0.0011412216117605567 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 
1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 906.9679565429688, + 1359.6566162109375, + 1130.05029296875, + 1021.0620727539062, + 984.5247802734375, + 962.7780151367188, + 881.456298828125, + 876.2540283203125 + ] + } + ], + "output_tensor_infos": [ + { + "id": 25, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + 
"fqn": "base_model.model.model.decoder.layers.0.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 9, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 22, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn.v_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 22, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 23, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 25, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 26, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 21, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 26, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 27, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 27, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 28, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 28, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:0:self_attn:q_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": 
false, + "fqn": "base_model.model.model.decoder.layers.0.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 5, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.017885278910398483 + ], + "zero_point": [ + 124 + ] + }, + { + "id": 6, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0003197852347511798, + 0.00030044614686630666, + 0.00020690658129751682, + 0.0002973057562485337, + 0.0004652647185139358, + 0.0003272708272561431, + 0.00024269577988889068, + 0.0005545270978473127, + 0.00040261104004457593, + 0.00031847142963670194, + 0.00036206343793310225, + 0.00035386800300329924, + 0.00032511813333258033, + 0.0002959877601824701, + 0.00024614552967250347, + 0.0003854252281598747 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 7, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + 
"name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 9, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.005623312667012215 + ], + "zero_point": [ + 126 + ], + "smooth_quant_scaling_factor": [ + 0.29099902510643005, + 0.3386078178882599, + 0.32760125398635864, + 0.34182459115982056, + 0.4200611710548401, + 0.3903130888938904, + 0.3466292917728424, + 0.35106194019317627, + 0.2913013994693756, + 0.3590526580810547, + 0.35574501752853394, + 0.36627793312072754, + 0.30216914415359497, + 0.33425426483154297, + 0.40649864077568054, + 0.3092302978038788 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.004347391426563263, + 0.0051337298937141895, + 0.005251043010503054, + 0.005544815678149462, + 0.005272769834846258, + 0.004840066656470299, + 0.005468212533742189, + 0.005701767280697823 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.4364378452301025, + 2.9532690048217773, + 3.0524914264678955, + 2.9254770278930664, + 2.380605697631836, + 2.5620458126068115, + 2.8849265575408936, + 2.8485000133514404, + 3.432870388031006, + 2.785106658935547, + 2.811002254486084, + 2.7301673889160156, + 3.3094048500061035, + 2.991734266281128, + 2.4600329399108887, + 3.2338356971740723 + ] + } + ], + "output_tensor_infos": [ + { + "id": 10, + "orig_dtype": 
"torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 8.660711500851903e-06 + ], + "zero_point": [ + 122 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_B.default", + "input_tensor_infos": [ + { + "id": 10, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 8.660711500851903e-06 + ], + "zero_point": [ + 122 + ], + "smooth_quant_scaling_factor": [ + 0.001275573275052011, + 0.0010749376378953457, + 0.0008703423663973808, + 0.0012038754066452384, + 
0.0011535761877894402, + 0.0009325446444563568, + 0.0009438325650990009, + 0.0009104721248149872 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 783.961181640625, + 930.2864990234375, + 1148.97314453125, + 830.6506958007812, + 866.8694458007812, + 1072.334716796875, + 1059.5098876953125, + 1098.331298828125 + ] + } + ], + "output_tensor_infos": [ + { + "id": 11, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + 
"alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 5, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 8, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 8, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 9, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 11, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 12, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 7, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 12, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 13, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.0.self_attn.q_proj", 
+ "input_tensor_infos": [ + { + "id": 13, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 14, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 14, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:1": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.1", + "input_tensor_infos": [ + { + "id": 62, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0003642447991296649 + ], + "zero_point": [ + 124 + ] + }, + { + "id": 110, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.00011278505553491414 + ], + "zero_point": [ + 131 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 111, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + 
"quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.1.fc1", + "input_tensor_infos": [ + { + "id": 113, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.002465195721015334 + ], + "zero_point": [ + 116 + ], + "smooth_quant_scaling_factor": [ + 0.06853945553302765, + 0.10416597872972488, + 0.12038654834032059, + 0.1323762983083725, + 0.12097253650426865, + 0.10555252432823181, + 0.10595230013132095, + 0.18730825185775757, + 0.14067092537879944, + 0.16711099445819855, + 0.11074528843164444, + 0.12620683014392853, + 0.11356709152460098, + 0.15941360592842102, + 0.14840248227119446, + 0.16470491886138916 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0017666078638285398, + 0.002689927350729704, + 0.0025342977605760098, + 0.0018005971796810627 + ], + "zero_point": [ + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 14.590136528015137, + 9.60006332397461, + 8.306575775146484, + 7.554222583770752, + 8.266339302062988, + 9.473956108093262, + 9.438209533691406, + 5.3387932777404785, + 7.108789443969727, + 5.984046459197998, + 9.029729843139648, + 7.923501491546631, + 8.805367469787598, + 6.2729902267456055, + 6.738431453704834, + 6.0714640617370605 + ] + } + ], + "output_tensor_infos": [ + { + "id": 114, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "scale": [ + 0.0012313323095440865 + ], + 
"zero_point": [ + 133 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.1.activation_fn", + "input_tensor_infos": [ + { + "id": 114, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0012313323095440865 + ], + "zero_point": [ + 133 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 115, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.00031851575477048755 + ], + "zero_point": [ + 0 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + 
"smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.1.fc2", + "input_tensor_infos": [ + { + "id": 115, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.00031851575477048755 + ], + "zero_point": [ + 0 + ], + "smooth_quant_scaling_factor": [ + 0.565930187702179, + 0.5698147416114807, + 0.5411486029624939, + 0.5345359444618225 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0006370212067849934, + 0.0002463286218699068, + 0.00031396516715176404, + 0.000519612047355622, + 0.0003065533528570086, + 
0.0004140080709476024, + 0.00011627166531980038, + 0.00031617359491065145, + 0.0004251366190146655, + 0.0005555427633225918, + 0.00031467361259274185, + 0.00010011934500653297, + 0.0005672333645634353, + 0.00037621770752593875, + 0.00032224206370301545, + 0.00017085621948353946 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1.7670023441314697, + 1.7549563646316528, + 1.8479212522506714, + 1.870781660079956 + ] + } + ], + "output_tensor_infos": [ + { + "id": 116, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 
127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.1", + "input_tensor_infos": [ + { + "id": 112, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0003685714618768543 + ], + "zero_point": [ + 125 + ] + }, + { + "id": 117, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 6.203278462635353e-05 + ], + "zero_point": [ + 98 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 118, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": 
false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn_layer_norm", + "input_tensor_infos": [ + { + "id": 62, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 63, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1", + "input_tensor_infos": [ + { + "id": 109, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 110, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1", + "input_tensor_infos": [ + { + "id": 111, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1", + "input_tensor_infos": [ + { + "id": 111, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 112, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.final_layer_norm", + "input_tensor_infos": [ + { + "id": 112, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 113, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1", + "input_tensor_infos": [ + { + "id": 116, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 117, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + "fqn": 
"base_model.model.model.decoder.layers.1", + "input_tensor_infos": [ + { + "id": 118, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 119, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 119, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:1:self_attn": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.1.self_attn.k_proj", + "input_tensor_infos": [ + { + "id": 67, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0026635867543518543 + ], + "zero_point": [ + 124 + ], + "smooth_quant_scaling_factor": [ + 0.13751402497291565, + 0.16165295243263245, + 0.13631494343280792, + 0.1561332792043686, + 0.18651795387268066, + 0.1500948667526245, + 0.1976248025894165, + 0.18973785638809204, + 0.14113600552082062, + 0.1779075264930725, + 0.15286338329315186, + 0.1429177224636078, + 0.13644066452980042, + 0.14966687560081482, + 0.16786669194698334, + 0.14095285534858704 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0027345705311745405, + 0.0014046697178855538, + 0.0017388592241331935, + 0.0025925124064087868, + 0.00205766293220222, + 0.0017343783983960748, + 0.00249765207991004, + 0.0024264284875243902, + 0.0024790402967482805, + 0.0011135643580928445, + 0.0009298875811509788, + 0.0015201057540252805, + 0.0019686089362949133, + 0.002474588342010975, + 0.001645392389036715, + 0.002263008849695325 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 7.2719855308532715, + 6.186091899871826, + 7.3359527587890625, + 6.404784202575684, + 5.361413955688477, + 
6.6624531745910645, + 5.060093402862549, + 5.270429611206055, + 7.08536434173584, + 5.6208977699279785, + 6.5417890548706055, + 6.997032642364502, + 7.329193115234375, + 6.68150520324707, + 5.957108020782471, + 7.094571113586426 + ] + } + ], + "output_tensor_infos": [ + { + "id": 74, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 93, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + 
"scale": [ + 0.0008104040753096342 + ], + "zero_point": [ + 139 + ] + }, + { + "id": 96, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0018258652416989207 + ], + "zero_point": [ + 140 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 97, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 98, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", 
+ "scale": [ + 0.0001867298415163532 + ], + "zero_point": [ + 124 + ] + }, + { + "id": 41, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 1.3344405750530544e+36 + ], + "zero_point": [ + 255 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 99, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 104, + "orig_dtype": "torch.float32", + "inf_dtype": 
"torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.003919653594493866 + ], + "zero_point": [ + 0 + ] + }, + { + "id": 95, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0014409187715500593 + ], + "zero_point": [ + 141 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 105, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.1.self_attn.out_proj", + "input_tensor_infos": [ + { + "id": 108, + "orig_dtype": "torch.float32", + 
"inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0007428489625453949 + ], + "zero_point": [ + 107 + ], + "smooth_quant_scaling_factor": [ + 0.757666289806366, + 0.5746235251426697, + 0.8885927200317383, + 0.7807335257530212, + 1.1450066566467285, + 0.41837745904922485, + 0.880587637424469, + 0.6206071376800537, + 0.6669229865074158, + 0.9862504601478577, + 0.4604479670524597, + 1.3671139478683472, + 1.458922266960144, + 1.123700737953186, + 0.8432158827781677, + 0.9627863764762878 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.00038053718162700534, + 0.0003275577910244465, + 0.00034619285725057125, + 0.0004894750891253352, + 0.0003191542054992169, + 0.0008626226335763931, + 0.00034072514972649515, + 0.0003294590278528631, + 0.00027665012748911977, + 0.0004014151345472783, + 0.00030781672103330493, + 0.0004544431285466999, + 0.00062305957544595, + 0.00045046041486784816, + 0.00035027528065256774, + 0.0005793329910375178 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1.3198422193527222, + 1.7402698993682861, + 1.1253750324249268, + 1.2808468341827393, + 0.8733574151992798, + 2.390186071395874, + 1.1356053352355957, + 1.6113252639770508, + 1.4994235038757324, + 1.0139412879943848, + 2.17179799079895, + 0.7314679026603699, + 0.6854374408721924, + 0.8899165987968445, + 1.1859359741210938, + 1.0386520624160767 + ] + } + ], + "output_tensor_infos": [ + { + "id": 109, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + 
"dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 63, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ], + "output_tensor_infos": [] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 72, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 73, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 74, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 75, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + 
"fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 75, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 76, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 76, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 77, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 86, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 87, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 87, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 88, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "7": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 88, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 89, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "8": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 73, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 90, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "9": { + 
"op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 90, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 91, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "10": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 91, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 92, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "11": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 92, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 93, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "12": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 77, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 94, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "13": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 89, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 95, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "14": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 94, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "15": { + "op_type": "", + "fqn": 
"base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 94, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 96, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "16": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 97, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "17": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 97, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 98, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "18": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 99, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 100, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 101, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "19": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 101, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 102, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "20": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 102, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 103, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "21": { + "op_type": "", + 
"fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 103, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 104, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "22": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 105, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "23": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 105, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 106, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "24": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 106, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 107, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "25": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn", + "input_tensor_infos": [ + { + "id": 107, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 108, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 109, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:1:self_attn:v_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.1.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 67, + "orig_dtype": "torch.float32", + "inf_dtype": 
"torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.017186878249049187 + ], + "zero_point": [ + 135 + ] + }, + { + "id": 78, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.00031020285678096116, + 0.0004165052669122815, + 0.00037859598523937166, + 0.00035231083165854216, + 0.00037992530269548297, + 0.00043818255653604865, + 0.00021832078346051276, + 0.00030567412613891065, + 0.00029989739414304495, + 0.000358005752786994, + 0.0003350492916069925, + 0.00027488538762554526, + 0.00041011988651007414, + 0.00020044640405103564, + 0.0002536431129556149, + 0.00024024673621170223 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 79, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": 
-128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.1.self_attn.v_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 81, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.005631454288959503 + ], + "zero_point": [ + 129 + ], + "smooth_quant_scaling_factor": [ + 0.31396907567977905, + 0.3419976234436035, + 0.32946449518203735, + 0.3809604346752167, + 0.4369880259037018, + 0.37603235244750977, + 0.38991811871528625, + 0.3634071350097656, + 0.3446651101112366, + 0.35563480854034424, + 0.3299185335636139, + 0.36610135436058044, + 0.33558520674705505, + 0.3683687448501587, + 0.39420658349990845, + 0.3463117778301239 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.005256906151771545, + 0.005702690687030554, + 0.00408707931637764, + 0.0055601755157113075, + 0.0051085250452160835, + 0.005457798019051552, + 0.005484873428940773, + 0.00546643789857626 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.1850271224975586, + 2.923996925354004, + 3.0352284908294678, + 2.6249446868896484, + 2.2883923053741455, + 2.6593456268310547, + 2.564640998840332, + 2.751734495162964, + 2.9013671875, + 2.811873197555542, + 3.0310513973236084, + 2.7314839363098145, + 2.9798693656921387, + 2.7146711349487305, + 2.536741018295288, + 2.8875715732574463 + ] + } + ], + "output_tensor_infos": [ + { + "id": 82, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 9.595456504030153e-06 + ], + "zero_point": [ + 122 + ] + } + ], + "activation_observer": { + "name": 
"SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.1.self_attn.v_proj.lora_B.default", + "input_tensor_infos": [ + { + "id": 82, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 9.595456504030153e-06 + ], + "zero_point": [ + 122 + ], + "smooth_quant_scaling_factor": [ + 0.0012220613425597548, + 0.0009459510329179466, + 0.0010422103805467486, + 0.0007835418218746781, + 0.0009632350993342698, + 0.0009702403913252056, + 0.0007845996296964586, + 0.0009382423013448715 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", 
+ "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 818.28955078125, + 1057.13720703125, + 959.4991455078125, + 1276.2559814453125, + 1038.1680908203125, + 1030.67236328125, + 1274.535400390625, + 1065.82275390625 + ] + } + ], + "output_tensor_infos": [ + { + "id": 83, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": 
false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 67, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 80, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.v_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 80, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 81, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 83, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 84, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 79, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 84, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 85, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 85, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 86, + 
"orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 86, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:1:self_attn:q_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.1.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 63, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.017186878249049187 + ], + "zero_point": [ + 135 + ] + }, + { + "id": 64, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.00033155985875055194, + 0.00036077588447369635, + 0.00045065759331919253, + 0.00041911558946594596, + 0.0003609144187066704, + 0.00024745348491705954, + 0.000459514296380803, + 0.00030642913770861924, + 0.00029080515378154814, + 0.0002998409909196198, + 0.00018764213018584996, + 0.0003062607138417661, + 0.00033330157748423517, + 0.00037817415432073176, + 0.0002834920596797019, + 0.00019271073688287288 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 65, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + 
"reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.1.self_attn.q_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 67, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.00550608616322279 + ], + "zero_point": [ + 129 + ], + "smooth_quant_scaling_factor": [ + 0.2970189154148102, + 0.3477560579776764, + 0.33492574095726013, + 0.3764011263847351, + 0.37040868401527405, + 0.38529539108276367, + 0.38419976830482483, + 0.3488285541534424, + 0.3366585075855255, + 0.33536985516548157, + 0.31619206070899963, + 0.3598238527774811, + 0.3590928614139557, + 0.29943913221359253, + 0.3960738480091095, + 0.31984657049179077 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.004763101227581501, + 0.00539481732994318, + 0.005577226169407368, + 0.005154943559318781, + 0.0042894682846963406, + 0.005435059778392315, + 0.005302493926137686, + 0.00512793380767107 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.366788864135742, + 2.8755788803100586, + 2.985736608505249, + 2.656740188598633, + 
2.699720621109009, + 2.5954113006591797, + 2.6028125286102295, + 2.8667376041412354, + 2.9703688621520996, + 2.9817826747894287, + 3.1626346111297607, + 2.77913761138916, + 2.784795045852661, + 3.3395769596099854, + 2.5247817039489746, + 3.1264989376068115 + ] + } + ], + "output_tensor_infos": [ + { + "id": 68, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 9.067955943464767e-06 + ], + "zero_point": [ + 130 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.1.self_attn.q_proj.lora_B.default", + "input_tensor_infos": [ + 
{ + "id": 68, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 9.067955943464767e-06 + ], + "zero_point": [ + 130 + ], + "smooth_quant_scaling_factor": [ + 0.0009543295600451529, + 0.000984062673524022, + 0.001070799888111651, + 0.0009884184692054987, + 0.0010192039189860225, + 0.0009609365952201188, + 0.0008825542754493654, + 0.0008479927200824022 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1047.8560791015625, + 1016.1954956054688, + 933.88134765625, + 1011.71728515625, + 981.1578979492188, + 1040.6513671875, + 1133.0748291015625, + 1179.25537109375 + ] + } + ], + "output_tensor_infos": [ + { + "id": 69, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": 
"torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 63, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 66, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.q_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 66, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 67, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 69, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 70, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 65, + "orig_dtype": "torch.float32", + 
"inf_dtype": "torch.float32" + }, + { + "id": 70, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 71, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.1.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 71, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 72, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 72, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:2": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.2", + "input_tensor_infos": [ + { + "id": 119, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0003836948308162391 + ], + "zero_point": [ + 119 + ] + }, + { + "id": 167, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.00010558274516370147 + ], + "zero_point": [ + 150 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 168, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": 
"torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.2.fc1", + "input_tensor_infos": [ + { + "id": 170, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0025296418461948633 + ], + "zero_point": [ + 117 + ], + "smooth_quant_scaling_factor": [ + 0.09582631289958954, + 0.1445355862379074, + 0.082494355738163, + 0.14328081905841827, + 0.11226029694080353, + 0.08192360401153564, + 0.11447090655565262, + 0.1828862428665161, + 0.1200239434838295, + 0.14438803493976593, + 0.0708846002817154, + 0.1267455518245697, + 0.12479699403047562, + 0.14254459738731384, + 0.12599679827690125, + 0.17459845542907715 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.002273815916851163, + 0.002167364116758108, + 0.0027308000717312098, + 0.002338568912819028 + ], + "zero_point": [ + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 10.435546875, + 6.9187116622924805, + 12.122041702270508, + 6.979301452636719, + 8.907868385314941, + 12.206494331359863, + 8.735844612121582, + 5.467879772186279, + 
8.331670761108398, + 6.925781726837158, + 14.107436180114746, + 7.8898234367370605, + 8.013012886047363, + 7.015348434448242, + 7.936709403991699, + 5.7274274826049805 + ] + } + ], + "output_tensor_infos": [ + { + "id": 171, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "scale": [ + 0.0012088003568351269 + ], + "zero_point": [ + 144 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.2.activation_fn", + "input_tensor_infos": [ + { + "id": 171, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0012088003568351269 + 
], + "zero_point": [ + 144 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 172, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.00028806543559767306 + ], + "zero_point": [ + 0 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.2.fc2", + "input_tensor_infos": [ + { + "id": 172, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.00028806543559767306 + ], + "zero_point": [ + 0 + ], + "smooth_quant_scaling_factor": [ + 
0.6897866725921631, + 0.5229077339172363, + 0.5212884545326233, + 0.5749416947364807 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0005493504577316344, + 0.0002844150294549763, + 0.0003237436758354306, + 0.0003664802643470466, + 0.00019999578944407403, + 0.0005761217325925827, + 0.0002001346874749288, + 0.0002735615416895598, + 0.0005581346922554076, + 0.0003144122601952404, + 0.0005320287891663611, + 0.00024963842588476837, + 0.0002453761699143797, + 0.00031408341601490974, + 0.0005700319889001548, + 0.00040268656448461115 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1.4497236013412476, + 1.9123833179473877, + 1.9183236360549927, + 1.7393068075180054 + ] + } + ], + "output_tensor_infos": [ + { + "id": 173, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": 
"torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.2", + "input_tensor_infos": [ + { + "id": 169, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.00036479582195170224 + ], + "zero_point": [ + 129 + ] + }, + { + "id": 174, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 4.296861152397469e-05 + ], + "zero_point": [ + 132 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 175, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": 
"PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn_layer_norm", + "input_tensor_infos": [ + { + "id": 119, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 120, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2", + "input_tensor_infos": [ + { + "id": 166, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 167, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2", + "input_tensor_infos": [ + { + "id": 168, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2", + "input_tensor_infos": [ + { + "id": 168, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 169, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.final_layer_norm", + "input_tensor_infos": [ + { + "id": 169, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 170, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "5": { + "op_type": "", + 
"fqn": "base_model.model.model.decoder.layers.2", + "input_tensor_infos": [ + { + "id": 173, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 174, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2", + "input_tensor_infos": [ + { + "id": 175, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 176, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 176, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:2:self_attn": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.2.self_attn.k_proj", + "input_tensor_infos": [ + { + "id": 124, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.002619683276861906 + ], + "zero_point": [ + 124 + ], + "smooth_quant_scaling_factor": [ + 0.09771496802568436, + 0.12890088558197021, + 0.14113005995750427, + 0.14548565447330475, + 0.21194416284561157, + 0.16805870831012726, + 0.16498667001724243, + 0.19031284749507904, + 0.15200883150100708, + 0.13688746094703674, + 0.16504071652889252, + 0.15595300495624542, + 0.16503247618675232, + 0.15158624947071075, + 0.1464781016111374, + 0.11796882748603821 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.001982156652957201, + 0.0024204144719988108, + 0.0021105222404003143, + 0.0025380882434546947, + 0.0020559560507535934, + 0.0019766828045248985, + 0.0020437673665583134, + 0.002261366695165634, + 0.0019714212976396084, + 0.0024409634061157703, + 0.0017740943003445864, + 0.0015419897390529513, + 0.0021045394241809845, + 
0.0018249802524223924, + 0.0027011858765035868, + 0.0017819146160036325 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 10.233846664428711, + 7.757898807525635, + 7.085662841796875, + 6.87352991104126, + 4.718224048614502, + 5.950301647186279, + 6.061095714569092, + 5.2545061111450195, + 6.5785651206970215, + 7.305271148681641, + 6.05911111831665, + 6.4121880531311035, + 6.059413433074951, + 6.596904754638672, + 6.826959133148193, + 8.476815223693848 + ] + } + ], + "output_tensor_infos": [ + { + "id": 131, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": 
false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 150, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0008205320918932557 + ], + "zero_point": [ + 131 + ] + }, + { + "id": 153, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0017587682232260704 + ], + "zero_point": [ + 123 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 154, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + 
"reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 155, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.00016366671479772776 + ], + "zero_point": [ + 134 + ] + }, + { + "id": 41, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 1.3344405750530544e+36 + ], + "zero_point": [ + 255 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 156, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": 
"torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 161, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.003919653594493866 + ], + "zero_point": [ + 0 + ] + }, + { + "id": 152, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0014430396258831024 + ], + "zero_point": [ + 128 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 162, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": 
"torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.2.self_attn.out_proj", + "input_tensor_infos": [ + { + "id": 165, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0006827609031461179 + ], + "zero_point": [ + 148 + ], + "smooth_quant_scaling_factor": [ + 0.8441921472549438, + 0.6734631061553955, + 0.7215516567230225, + 1.250807523727417, + 1.2269479036331177, + 0.7070020437240601, + 0.7414212822914124, + 1.0293961763381958, + 1.3132224082946777, + 0.6262384057044983, + 1.1616765260696411, + 1.2901126146316528, + 0.5255112051963806, + 0.8619773983955383, + 1.1792985200881958, + 0.7246308326721191 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.00037930699181742966, + 0.00014124519657343626, + 0.0004108152643311769, + 0.0003173276490997523, + 0.0003424036840442568, + 0.0003925739147234708, + 0.0004336285637691617, + 0.0007942286320030689, + 0.0003078484733123332, + 0.00036415716749615967, + 0.0002718089963309467, + 0.0005587885971181095, + 0.0005712821148335934, + 0.00039936500252224505, + 0.0004900978528894484, + 0.0006824227748438716 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1.184564471244812, + 1.4848623275756836, + 1.3859021663665771, + 0.7994834780693054, + 0.8150305151939392, + 1.4144231081008911, + 1.3487608432769775, + 0.971443235874176, + 0.7614856362342834, + 1.596835970878601, + 0.8608248233795166, + 0.7751261591911316, + 1.9029089212417603, + 1.1601232290267944, + 0.8479617238044739, + 1.380013108253479 + ] + } + ], + "output_tensor_infos": [ + { + "id": 166, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + 
"activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 120, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ], + "output_tensor_infos": [] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 129, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 130, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + 
"fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 131, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 132, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 132, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 133, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 133, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 134, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 143, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 144, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 144, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 145, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "7": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 145, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 146, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + 
"8": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 130, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 147, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "9": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 147, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 148, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "10": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 148, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 149, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "11": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 149, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 150, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "12": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 134, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 151, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "13": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 146, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 152, + "orig_dtype": "torch.float32", + 
"inf_dtype": "torch.quint8" + } + ] + }, + "14": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 151, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "15": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 151, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 153, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "16": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 154, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "17": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 154, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 155, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "18": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 156, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 157, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 158, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "19": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 158, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 159, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "20": { + "op_type": "", + 
"fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 159, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 160, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "21": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 160, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 161, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "22": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 162, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "23": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 162, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 163, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "24": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 163, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 164, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "25": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn", + "input_tensor_infos": [ + { + "id": 164, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 165, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 166, + "orig_dtype": "torch.float32", + "inf_dtype": 
"torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:2:self_attn:v_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.2.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 124, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.01775776967406273 + ], + "zero_point": [ + 131 + ] + }, + { + "id": 135, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.00030197048909030855, + 0.0004389838723000139, + 0.00032464342075400054, + 0.0003311963810119778, + 0.00030156190041452646, + 0.0001560249220347032, + 0.00030205087387003005, + 0.00030933573725633323, + 0.0002607844944577664, + 0.00043516099685803056, + 0.0003899929579347372, + 0.00030722690280526876, + 0.0004186415462754667, + 0.0003425968752708286, + 0.00033600343158468604, + 0.00032240492873825133 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 136, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": 
"torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.2.self_attn.v_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 138, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.005629746709018946 + ], + "zero_point": [ + 131 + ], + "smooth_quant_scaling_factor": [ + 0.3171026110649109, + 0.33609408140182495, + 0.30538907647132874, + 0.3807145655155182, + 0.44476813077926636, + 0.38373881578445435, + 0.3624895513057709, + 0.4135953187942505, + 0.32436317205429077, + 0.3506500720977783, + 0.3330007791519165, + 0.36063969135284424, + 0.3561692535877228, + 0.38044002652168274, + 0.3623844385147095, + 0.3346044421195984 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.005229481495916843, + 0.005087756551802158, + 0.005477722734212875, + 0.005216663237661123, + 0.004781876225024462, + 0.005361112765967846, + 0.005782869178801775, + 0.005476581398397684 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.1535534858703613, + 2.9753575325012207, + 3.2745113372802734, + 2.6266398429870605, + 2.2483625411987305, + 2.6059391498565674, + 2.758700132369995, + 2.4178223609924316, + 3.0829641819000244, + 2.8518459796905518, + 3.0029959678649902, + 2.772850751876831, + 
2.8076539039611816, + 2.628535270690918, + 2.75950026512146, + 2.988603353500366 + ] + } + ], + "output_tensor_infos": [ + { + "id": 139, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 8.981955943454523e-06 + ], + "zero_point": [ + 130 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.2.self_attn.v_proj.lora_B.default", + "input_tensor_infos": [ + { + "id": 139, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 8.981955943454523e-06 + ], + "zero_point": [ + 
130 + ], + "smooth_quant_scaling_factor": [ + 0.0008957073441706598, + 0.0009056724957190454, + 0.001112349214963615, + 0.0009625894017517567, + 0.0008586375624872744, + 0.0008882852271199226, + 0.0011550098424777389, + 0.001077544060535729 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1116.43603515625, + 1104.15185546875, + 898.998291015625, + 1038.8646240234375, + 1164.6357421875, + 1125.7645263671875, + 865.79345703125, + 928.0363159179688 + ] + } + ], + "output_tensor_infos": [ + { + "id": 140, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, 
+ "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 124, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 137, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.v_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 137, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 138, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 140, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 141, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 136, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 141, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 142, + "orig_dtype": 
"torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 142, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 143, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 143, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:2:self_attn:q_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.2.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 120, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.01775776967406273 + ], + "zero_point": [ + 131 + ] + }, + { + "id": 121, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0003854333481285721, + 0.0002002624241868034, + 0.0002693389542400837, + 0.000398365780711174, + 0.00018276108312420547, + 0.0001984609116334468, + 0.00047398984315805137, + 0.00019450885883998126, + 0.0002813297032844275, + 0.00027399969985708594, + 0.000275527621852234, + 0.0002677437150850892, + 0.0002750060230027884, + 0.0003327021258883178, + 0.00046579609625041485, + 0.000409616157412529 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 122, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { 
+ "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.2.self_attn.q_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 124, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.005674399435520172 + ], + "zero_point": [ + 130 + ], + "smooth_quant_scaling_factor": [ + 0.31798771023750305, + 0.34436145424842834, + 0.3208657205104828, + 0.34916648268699646, + 0.4356110095977783, + 0.3874937891960144, + 0.37093812227249146, + 0.38457533717155457, + 0.31679439544677734, + 0.3490495979785919, + 0.33288922905921936, + 0.36058124899864197, + 0.34836524724960327, + 0.38592469692230225, + 0.35887610912323, + 0.34950464963912964 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.005451584700495005, + 0.005285411607474089, + 0.005357700865715742, + 
0.005486763082444668, + 0.005549744237214327, + 0.0057990108616650105, + 0.005492669530212879, + 0.0054483977146446705 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.144775629043579, + 2.9039254188537598, + 3.1165685653686523, + 2.8639633655548096, + 2.295626163482666, + 2.580686569213867, + 2.6958673000335693, + 2.6002707481384277, + 3.1566214561462402, + 2.864922285079956, + 3.004002332687378, + 2.7732999324798584, + 2.8705503940582275, + 2.591179132461548, + 2.7864768505096436, + 2.861192226409912 + ] + } + ], + "output_tensor_infos": [ + { + "id": 125, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 9.729664270707872e-06 + ], + "zero_point": [ + 144 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + 
"dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.2.self_attn.q_proj.lora_B.default", + "input_tensor_infos": [ + { + "id": 125, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 9.729664270707872e-06 + ], + "zero_point": [ + 144 + ], + "smooth_quant_scaling_factor": [ + 0.0008135975222103298, + 0.0010519020725041628, + 0.0010809052037075162, + 0.0010481273056939244, + 0.0010143211111426353, + 0.0009273464675061405, + 0.0009341556578874588, + 0.0007129005971364677 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1229.1090087890625, + 950.6588745117188, + 925.1505126953125, + 954.0825805664062, + 985.8810424804688, + 1078.3455810546875, + 1070.4854736328125, + 1402.7200927734375 + ] + } + ], + "output_tensor_infos": [ + { + "id": 126, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": 
"HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 120, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 123, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.q_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 123, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 124, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 126, + "orig_dtype": "torch.float32", + "inf_dtype": 
"torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 127, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 122, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 127, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 128, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.2.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 128, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 129, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 129, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:3": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.3", + "input_tensor_infos": [ + { + "id": 176, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.00037621482624672353 + ], + "zero_point": [ + 122 + ] + }, + { + "id": 224, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 7.752325473120436e-05 + ], + "zero_point": [ + 125 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 225, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + 
"alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.3.fc1", + "input_tensor_infos": [ + { + "id": 227, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0023776839952915907 + ], + "zero_point": [ + 133 + ], + "smooth_quant_scaling_factor": [ + 0.07985354959964752, + 0.16083626449108124, + 0.08333823829889297, + 0.14618489146232605, + 0.1407022327184677, + 0.16764311492443085, + 0.1410415768623352, + 0.14249786734580994, + 0.16607335209846497, + 0.13353237509727478, + 0.08218377828598022, + 0.13758495450019836, + 0.10985197126865387, + 0.15455026924610138, + 0.164423406124115, + 0.11410682648420334 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 
0.0024586771614849567, + 0.0020242577884346247, + 0.001968280179426074, + 0.00247385841794312 + ], + "zero_point": [ + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 12.522924423217773, + 6.217503070831299, + 11.99929428100586, + 6.840651988983154, + 7.107207775115967, + 5.965052604675293, + 7.0901079177856445, + 7.017648696899414, + 6.021435737609863, + 7.488821029663086, + 12.167851448059082, + 7.2682366371154785, + 9.10315990447998, + 6.470386505126953, + 6.081859111785889, + 8.763717651367188 + ] + } + ], + "output_tensor_infos": [ + { + "id": 228, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "scale": [ + 0.0015002207364887 + ], + "zero_point": [ + 138 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + 
"quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.3.activation_fn", + "input_tensor_infos": [ + { + "id": 228, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0015002207364887 + ], + "zero_point": [ + 138 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 229, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.00033306205295957625 + ], + "zero_point": [ + 0 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": 
"", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.3.fc2", + "input_tensor_infos": [ + { + "id": 229, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.00033306205295957625 + ], + "zero_point": [ + 0 + ], + "smooth_quant_scaling_factor": [ + 0.4834460914134979, + 0.7404611110687256, + 0.5941412448883057, + 0.4437545835971832 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0004820904287043959, + 0.0004664862062782049, + 0.00027692882576957345, + 0.00027292605955153704, + 0.00012331640755292028, + 0.0004861857451032847, + 0.0005612521199509501, + 0.0002444250858388841, + 0.0005028269370086491, + 0.00029398142942227423, + 0.0004208429018035531, + 0.00038634889642708004, + 0.000549447606317699, + 0.0006100540049374104, + 0.000666111649479717, + 0.000535536149982363 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 2.0684828758239746, + 1.3505098819732666, + 1.6831014156341553, + 2.253497838973999 + ] + } + ], + "output_tensor_infos": [ + { + "id": 230, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + 
"weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.3", + "input_tensor_infos": [ + { + "id": 226, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.00034788335324265063 + ], + "zero_point": [ + 132 + ] + }, + { + "id": 231, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 5.832596798427403e-05 + ], + "zero_point": [ + 118 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 232, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + 
"quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn_layer_norm", + "input_tensor_infos": [ + { + "id": 176, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 177, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3", + "input_tensor_infos": [ + { + "id": 223, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 224, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3", + "input_tensor_infos": [ + { + "id": 225, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3", + "input_tensor_infos": [ + { + "id": 225, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 226, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": 
"base_model.model.model.decoder.layers.3.final_layer_norm", + "input_tensor_infos": [ + { + "id": 226, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 227, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3", + "input_tensor_infos": [ + { + "id": 230, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 231, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3", + "input_tensor_infos": [ + { + "id": 232, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 233, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 233, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:3:self_attn": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.3.self_attn.k_proj", + "input_tensor_infos": [ + { + "id": 181, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0027480784337967634 + ], + "zero_point": [ + 132 + ], + "smooth_quant_scaling_factor": [ + 0.15305233001708984, + 0.12225167453289032, + 0.14288797974586487, + 0.13328514993190765, + 0.1680242121219635, + 0.14797469973564148, + 0.1489410549402237, + 0.1615593284368515, + 0.1862698644399643, + 0.11918998509645462, + 0.14913791418075562, + 0.1354290097951889, + 0.14345955848693848, + 0.15483605861663818, + 0.15613141655921936, + 0.11631940305233002 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 
0.001720090745948255, + 0.0024139657616615295, + 0.0019263529684394598, + 0.0019102180376648903, + 0.001767429057508707, + 0.0020019139628857374, + 0.0013569797156378627, + 0.0020858272910118103, + 0.001683894544839859, + 0.0018227207474410534, + 0.0013690086780115962, + 0.0025352758821099997, + 0.0028395988047122955, + 0.0017747465753927827, + 0.002007675589993596, + 0.0024633959401398897 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 6.533712863922119, + 8.17984676361084, + 6.998489856719971, + 7.502711296081543, + 5.951523303985596, + 6.757911682128906, + 6.7140655517578125, + 6.189676761627197, + 5.368554592132568, + 8.38996696472168, + 6.705203056335449, + 7.383942604064941, + 6.970605850219727, + 6.458444118499756, + 6.404860973358154, + 8.597018241882324 + ] + } + ], + "output_tensor_infos": [ + { + "id": 188, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + 
"ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 207, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0007488401024602354 + ], + "zero_point": [ + 120 + ] + }, + { + "id": 210, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.001735710073262453 + ], + "zero_point": [ + 115 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 211, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", 
+ "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 212, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0001701119472272694 + ], + "zero_point": [ + 123 + ] + }, + { + "id": 41, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 1.3344405750530544e+36 + ], + "zero_point": [ + 255 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 213, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + 
"wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 218, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.003919653594493866 + ], + "zero_point": [ + 0 + ] + }, + { + "id": 209, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0015115730930119753 + ], + "zero_point": [ + 113 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 219, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + 
"wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.3.self_attn.out_proj", + "input_tensor_infos": [ + { + "id": 222, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0006200977368280292 + ], + "zero_point": [ + 113 + ], + "smooth_quant_scaling_factor": [ + 0.6995439529418945, + 0.6177797913551331, + 0.7721221446990967, + 0.6627070903778076, + 1.6795364618301392, + 0.7636286020278931, + 0.4094974994659424, + 0.8874781727790833, + 0.9102892875671387, + 1.1451750993728638, + 1.4108965396881104, + 0.7052625417709351, + 0.9994568228721619, + 1.100576400756836, + 1.8541733026504517, + 1.0649280548095703 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.00044207024620845914, + 0.0003401314897928387, + 0.0002646299544721842, + 0.0005266364896669984, + 0.0006902840686962008, + 0.0004130130400881171, + 0.0005498980171978474, + 0.0004560376692097634, + 0.0004986900603398681, + 0.0003417898842599243, + 0.0004254740779288113, + 0.00041787157533690333, + 0.00043513832497410476, + 0.00033653207356110215, + 0.0003935080021619797, + 0.0005104956217110157 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1.4295027256011963, + 1.6186997890472412, + 1.2951319217681885, + 1.5089622735977173, + 0.5954023599624634, + 1.3095371723175049, + 2.4420173168182373, + 1.1267882585525513, + 
1.098551869392395, + 0.873228907585144, + 0.7087692022323608, + 1.4179116487503052, + 1.000543475151062, + 0.9086148142814636, + 0.5393239259719849, + 0.9390305876731873 + ] + } + ], + "output_tensor_infos": [ + { + "id": 223, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 177, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ], + "output_tensor_infos": [] + }, + "1": { + "op_type": "", + "fqn": 
"base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 186, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 187, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 188, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 189, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 189, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 190, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 190, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 191, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 200, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 201, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 201, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 202, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "7": 
{ + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 202, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 203, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "8": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 187, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 204, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "9": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 204, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 205, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "10": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 205, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 206, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "11": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 206, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 207, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "12": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 191, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 208, + "orig_dtype": "torch.float32", + "inf_dtype": 
"torch.float32" + } + ] + }, + "13": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 203, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 209, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "14": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 208, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "15": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 208, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 210, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "16": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 211, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "17": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 211, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 212, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "18": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 213, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 214, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 215, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "19": { + "op_type": "", + "fqn": 
"base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 215, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 216, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "20": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 216, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 217, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "21": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 217, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 218, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "22": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 219, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "23": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 219, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 220, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "24": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ + { + "id": 220, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 221, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "25": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn", + "input_tensor_infos": [ 
+ { + "id": 221, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 222, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 223, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:3:self_attn:v_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.3.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 181, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.01776798442006111 + ], + "zero_point": [ + 133 + ] + }, + { + "id": 192, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0003324671706650406, + 0.00022315105888992548, + 0.0002932818024419248, + 0.00030679896008223295, + 0.0002626084315124899, + 0.00043297093361616135, + 0.00032347848173230886, + 0.0003563085338100791, + 0.0002481463016010821, + 0.0003380977432243526, + 0.0003385223390068859, + 0.0002827795979101211, + 0.0001963942195288837, + 0.00028943148208782077, + 0.0003743217675946653, + 0.00036169850500300527 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 193, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + 
"act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.3.self_attn.v_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 195, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.005664766300469637 + ], + "zero_point": [ + 134 + ], + "smooth_quant_scaling_factor": [ + 0.32095256447792053, + 0.330251008272171, + 0.20045620203018188, + 0.36967453360557556, + 0.3964439630508423, + 0.36228129267692566, + 0.39195728302001953, + 0.38848578929901123, + 0.3446809649467468, + 0.34981322288513184, + 0.2928933799266815, + 0.361460417509079, + 0.348776251077652, + 0.38512811064720154, + 0.39673128724098206, + 0.27381011843681335 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.004655179567635059, + 0.005229951348155737, + 0.005511950235813856, + 0.004998452961444855, + 0.00595475547015667, + 0.0053747366182506084, + 0.005163250025361776, + 0.00517900800332427 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 
0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.11572527885437, + 3.0279998779296875, + 4.988620758056641, + 2.7050821781158447, + 2.5224244594573975, + 2.7602858543395996, + 2.5512983798980713, + 2.5740966796875, + 2.901233673095703, + 2.858668565750122, + 3.4142115116119385, + 2.766554594039917, + 2.8671674728393555, + 2.596538543701172, + 2.5205979347229004, + 3.6521661281585693 + ] + } + ], + "output_tensor_infos": [ + { + "id": 196, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 9.392422725795768e-06 + ], + "zero_point": [ + 128 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + 
"op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.3.self_attn.v_proj.lora_B.default", + "input_tensor_infos": [ + { + "id": 196, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 9.392422725795768e-06 + ], + "zero_point": [ + 128 + ], + "smooth_quant_scaling_factor": [ + 0.0008344786474481225, + 0.0009075241396203637, + 0.001010999782010913, + 0.001130994874984026, + 0.0009487126371823251, + 0.0008356202160939574, + 0.0011216377606615424, + 0.001240230049006641 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1198.35302734375, + 1101.8990478515625, + 989.119873046875, + 884.1773071289062, + 1054.0599365234375, + 1196.7159423828125, + 891.553466796875, + 806.302001953125 + ] + } + ], + "output_tensor_infos": [ + { + "id": 197, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + 
}, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 181, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 194, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn.v_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 194, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 195, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 197, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 198, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": 
"base_model.model.model.decoder.layers.3.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 193, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 198, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 199, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 199, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 200, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 200, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:3:self_attn:q_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.3.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 177, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.01776798442006111 + ], + "zero_point": [ + 133 + ] + }, + { + "id": 178, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.00032423349330201745, + 0.000316593621391803, + 0.000278401275863871, + 0.0004554924671538174, + 0.0003137265157420188, + 0.0002551494399085641, + 0.00033229144173674285, + 0.00037298008101060987, + 0.00018664839444682002, + 0.00038735457928851247, + 0.00029549546889029443, + 0.0003679211949929595, + 0.0003933409752789885, + 0.000295131525490433, + 0.00048211432294920087, + 0.00038136306102387607 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 179, + "orig_dtype": 
"torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.3.self_attn.q_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 181, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.005755296908318996 + ], + "zero_point": [ + 130 + ], + "smooth_quant_scaling_factor": [ + 0.31676962971687317, + 0.34011387825012207, + 0.33628129959106445, + 0.3565284311771393, + 0.4056679308414459, + 0.3763771653175354, + 0.3095501661300659, + 0.3858562707901001, + 0.3497072458267212, + 0.337135910987854, + 
0.3139989972114563, + 0.3360980153083801, + 0.3577878177165985, + 0.3620110750198364, + 0.397707998752594, + 0.31274789571762085 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0053594219498336315, + 0.005633404012769461, + 0.005202163010835648, + 0.004862114321440458, + 0.004991317167878151, + 0.0058771464973688126, + 0.0048659383319318295, + 0.0053062173537909985 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.1568682193756104, + 2.9401917457580566, + 2.973700761795044, + 2.8048255443573, + 2.4650704860687256, + 2.656909465789795, + 3.2304940223693848, + 2.5916385650634766, + 2.859534740447998, + 2.966162919998169, + 3.184723377227783, + 2.9753224849700928, + 2.794952630996704, + 2.7623465061187744, + 2.5144073963165283, + 3.1974635124206543 + ] + } + ], + "output_tensor_infos": [ + { + "id": 182, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 8.640432497486472e-06 + ], + "zero_point": [ + 121 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + 
"quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.3.self_attn.q_proj.lora_B.default", + "input_tensor_infos": [ + { + "id": 182, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 8.640432497486472e-06 + ], + "zero_point": [ + 121 + ], + "smooth_quant_scaling_factor": [ + 0.001176572171971202, + 0.0009582844795659184, + 0.0010004665236920118, + 0.0011581374565139413, + 0.0009539962629787624, + 0.0009674145840108395, + 0.0011028836015611887, + 0.0008622322347946465 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 849.9266357421875, + 1043.531494140625, + 999.53369140625, + 863.4553833007812, + 1048.22216796875, + 1033.6829833984375, + 906.7140502929688, + 1159.7802734375 + ] + } + ], + "output_tensor_infos": [ + { + "id": 183, + "orig_dtype": "torch.float32", + "inf_dtype": 
"torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 177, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 180, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn.q_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 180, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + 
"output_tensor_infos": [ + { + "id": 181, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 183, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 184, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 179, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 184, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 185, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.3.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 185, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 186, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 186, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:4": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.4", + "input_tensor_infos": [ + { + "id": 233, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0003514998243190348 + ], + "zero_point": [ + 130 + ] + }, + { + "id": 281, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 8.57003906276077e-05 + ], + "zero_point": [ + 128 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 282, + 
"orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.4.fc1", + "input_tensor_infos": [ + { + "id": 284, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.002228383906185627 + ], + "zero_point": [ + 119 + ], + "smooth_quant_scaling_factor": [ + 0.07490289211273193, + 0.08660875260829926, + 0.11810106784105301, + 0.14189991354942322, + 0.21989880502223969, + 0.11465814709663391, + 0.16941188275814056, + 0.12009169906377792, + 
0.10226233303546906, + 0.142814502120018, + 0.10813327878713608, + 0.11895965784788132, + 0.121878482401371, + 0.11287350207567215, + 0.13685068488121033, + 0.09771235287189484 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0020820628851652145, + 0.0020485082641243935, + 0.002014799742028117, + 0.0023746262304484844 + ], + "zero_point": [ + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 13.350618362426758, + 11.54617691040039, + 8.467324256896973, + 7.047220706939697, + 4.54754638671875, + 8.721578598022461, + 5.902773857116699, + 8.326970100402832, + 9.77877140045166, + 7.002089977264404, + 9.247847557067871, + 8.406210899353027, + 8.204894065856934, + 8.859475135803223, + 7.307234287261963, + 10.23412036895752 + ] + } + ], + "output_tensor_infos": [ + { + "id": 285, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "scale": [ + 0.001055107219144702 + ], + "zero_point": [ + 145 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": 
"torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.4.activation_fn", + "input_tensor_infos": [ + { + "id": 285, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.001055107219144702 + ], + "zero_point": [ + 145 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 286, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.000284639245364815 + ], + "zero_point": [ + 0 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + 
"reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.4.fc2", + "input_tensor_infos": [ + { + "id": 286, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.000284639245364815 + ], + "zero_point": [ + 0 + ], + "smooth_quant_scaling_factor": [ + 0.6736321449279785, + 0.7133809328079224, + 0.6224557757377625, + 0.6026663780212402 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0004203274438623339, + 0.0005461736582219601, + 0.00029452473972924054, + 0.000159776012878865, + 0.00043560945778153837, + 0.00014329873374663293, + 0.00041623887955211103, + 0.00016978861822281033, + 0.0005588481435552239, + 0.0005692706909030676, + 0.0005020407843403518, + 0.00035534577909857035, + 0.00012209215492475778, + 0.00042365113040432334, + 0.00031242144177667797, + 0.00043097708839923143 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1.4844897985458374, + 1.401775598526001, + 1.6065398454666138, + 1.6592928171157837 + ] + } + ], + "output_tensor_infos": [ + { + "id": 287, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": 
"torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.4", + "input_tensor_infos": [ + { + "id": 283, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.0003883987374138087 + ], + "zero_point": [ + 119 + ] + }, + { + "id": 288, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 5.001319004804827e-05 + ], + "zero_point": [ + 98 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 289, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 
128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn_layer_norm", + "input_tensor_infos": [ + { + "id": 233, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 234, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4", + "input_tensor_infos": [ + { + "id": 280, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 281, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4", + "input_tensor_infos": [ + { + "id": 282, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "3": { + "op_type": "", + "fqn": 
"base_model.model.model.decoder.layers.4", + "input_tensor_infos": [ + { + "id": 282, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 283, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.final_layer_norm", + "input_tensor_infos": [ + { + "id": 283, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 284, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4", + "input_tensor_infos": [ + { + "id": 287, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 288, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4", + "input_tensor_infos": [ + { + "id": 289, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 290, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 290, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:4:self_attn": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.4.self_attn.k_proj", + "input_tensor_infos": [ + { + "id": 238, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0027999659068882465 + ], + "zero_point": [ + 126 + ], + "smooth_quant_scaling_factor": [ + 0.1276211142539978, + 0.150820791721344, + 0.1714450716972351, + 0.16795159876346588, + 0.16124901175498962, + 0.18224044144153595, + 0.18983706831932068, + 
0.1788574606180191, + 0.15811805427074432, + 0.1341456025838852, + 0.12334004789590836, + 0.1529984027147293, + 0.1920592486858368, + 0.18413232266902924, + 0.14898242056369781, + 0.14343814551830292 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.00161055289208889, + 0.002761982847005129, + 0.002112831687554717, + 0.001920697744935751, + 0.0022592521272599697, + 0.002328770002350211, + 0.0025834105908870697, + 0.001746615394949913, + 0.001962139271199703, + 0.002837864914909005, + 0.0014426681445911527, + 0.0011523402063176036, + 0.0017300972249358892, + 0.0019138624193146825, + 0.0012878895504400134, + 0.002227139426395297 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 7.835694313049316, + 6.630385875701904, + 5.832772254943848, + 5.954096794128418, + 6.2015886306762695, + 5.487256050109863, + 5.267674922943115, + 5.5910444259643555, + 6.32438850402832, + 7.454586982727051, + 8.107666969299316, + 6.53601598739624, + 5.206726551055908, + 5.430877208709717, + 6.7122015953063965, + 6.971645832061768 + ] + } + ], + "output_tensor_infos": [ + { + "id": 245, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 
255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 264, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0007117515779100358 + ], + "zero_point": [ + 140 + ] + }, + { + "id": 267, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0019945132080465555 + ], + "zero_point": [ + 119 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 268, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + 
"quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 269, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 0.00019521928334143013 + ], + "zero_point": [ + 116 + ] + }, + { + "id": 41, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32", + "force_dtype": "torch.float32", + "scale": [ + 1.3344405750530544e+36 + ], + "zero_point": [ + 255 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 270, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": 
"torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "3": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 275, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.003919653594493866 + ], + "zero_point": [ + 0 + ] + }, + { + "id": 266, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0016270156484097242 + ], + "zero_point": [ + 146 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 276, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": 
"torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "4": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.4.self_attn.out_proj", + "input_tensor_infos": [ + { + "id": 279, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.0007375198183581233 + ], + "zero_point": [ + 128 + ], + "smooth_quant_scaling_factor": [ + 0.7722166180610657, + 0.5633851885795593, + 1.1684167385101318, + 0.3943912386894226, + 1.159185767173767, + 0.5480626225471497, + 0.6345553994178772, + 0.5357393026351929, + 0.4990648031234741, + 0.5183709263801575, + 0.999289870262146, + 1.1737982034683228, + 0.900857150554657, + 0.5510081648826599, + 0.6094800233840942, + 0.8488926291465759 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.0007356680580414832, + 0.000624767504632473, + 0.00041961250826716423, + 0.0003189104900229722, + 0.0006824223673902452, + 0.00028400454903021455, + 0.0003987895615864545, + 0.0007355051347985864, + 0.000533037877175957, + 0.0007393524865619838, + 0.0004391110851429403, + 0.0005651583196595311, + 0.0006558214081451297, + 0.00030508654890581965, + 0.0003671708982437849, + 
0.00036240171175450087 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1.294973373413086, + 1.77498459815979, + 0.8558589816093445, + 2.535553216934204, + 0.8626744747161865, + 1.8246090412139893, + 1.57590651512146, + 1.866579532623291, + 2.0037479400634766, + 1.9291205406188965, + 1.0007106065750122, + 0.8519351482391357, + 1.1100538969039917, + 1.8148550987243652, + 1.6407428979873657, + 1.178005337715149 + ] + } + ], + "output_tensor_infos": [ + { + "id": 280, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 
127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 234, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ], + "output_tensor_infos": [] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 243, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 244, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 245, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 246, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 246, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 247, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 247, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 248, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "5": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 257, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 258, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "6": { + "op_type": "", + 
"fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 258, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 259, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "7": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 259, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 260, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "8": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 244, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 261, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "9": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 261, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 262, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "10": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 262, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 263, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "11": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 263, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 264, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "12": { 
+ "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 248, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 265, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "13": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 260, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 266, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "14": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 265, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [] + }, + "15": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 265, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 267, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "16": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 268, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "17": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 268, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 269, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "18": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 270, + "orig_dtype": 
"torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 271, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 272, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "19": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 272, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 273, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "20": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 273, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 274, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "21": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 274, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 275, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + } + ] + }, + "22": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 276, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [] + }, + "23": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 276, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 277, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "24": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 277, + "orig_dtype": 
"torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 278, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "25": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn", + "input_tensor_infos": [ + { + "id": 278, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {}, + {}, + {} + ], + "output_tensor_infos": [ + { + "id": 279, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 280, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:4:self_attn:v_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.4.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 238, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.01734682358801365 + ], + "zero_point": [ + 133 + ] + }, + { + "id": 249, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.00031136625329963863, + 0.00025668463786132634, + 0.00022895813162904233, + 0.00041868723928928375, + 0.00046801360440440476, + 0.000544839771464467, + 0.0002859718806575984, + 0.0003757727099582553, + 0.00036266579991206527, + 0.0003396574466023594, + 0.00031236352515406907, + 0.0002318342449143529, + 0.00039146305061876774, + 0.00027779644005931914, + 0.00029864496900700033, + 0.00038601880078203976 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 250, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": 
"torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.4.self_attn.v_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 252, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 0.005635845009237528 + ], + "zero_point": [ + 131 + ], + "smooth_quant_scaling_factor": [ + 0.31827637553215027, + 0.3578762412071228, + 0.33076614141464233, + 0.38460636138916016, + 0.40154778957366943, + 0.36107155680656433, + 0.40281784534454346, + 0.35139355063438416, + 0.33984655141830444, + 0.3377038836479187, + 0.3539571166038513, + 0.3015575408935547, + 0.36707204580307007, + 0.3956300914287567, + 0.3773953318595886, + 0.35525140166282654 + ] + } + ], + "weight_tensor_infos": 
[ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.005779150407761335, + 0.0048353704623878, + 0.005278879776597023, + 0.005413893144577742, + 0.005464703775942326, + 0.005715600214898586, + 0.005475110374391079, + 0.00549249816685915 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.141923427581787, + 2.794262170791626, + 3.0232841968536377, + 2.6000609397888184, + 2.490363597869873, + 2.769534111022949, + 2.4825117588043213, + 2.8458120822906494, + 2.942504644393921, + 2.9611740112304688, + 2.8252010345458984, + 3.3161168098449707, + 2.7242608070373535, + 2.527613639831543, + 2.6497411727905273, + 2.8149077892303467 + ] + } + ], + "output_tensor_infos": [ + { + "id": 253, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 9.803168723010458e-06 + ], + "zero_point": [ + 123 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": 
"torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.4.self_attn.v_proj.lora_B.default", + "input_tensor_infos": [ + { + "id": 253, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 9.803168723010458e-06 + ], + "zero_point": [ + 123 + ], + "smooth_quant_scaling_factor": [ + 0.0008598009590059519, + 0.0009210532298311591, + 0.0007724169990979135, + 0.0008119846461340785, + 0.0008297572494484484, + 0.0009486194467172027, + 0.0009887740015983582, + 0.0009047275525517762 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 1163.0599365234375, + 1085.713623046875, + 1294.637451171875, + 1231.5504150390625, + 1205.1717529296875, + 1054.16357421875, + 1011.3534545898438, + 1105.30517578125 + ] + } + ], + "output_tensor_infos": [ + { + "id": 254, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": 
"torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 238, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 251, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.v_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 251, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 252, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": 
"", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 254, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 255, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 250, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 255, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 256, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.v_proj", + "input_tensor_infos": [ + { + "id": 256, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 257, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 257, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:model:decoder:layers:4:self_attn:q_proj": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": false, + "fqn": "base_model.model.model.decoder.layers.4.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 234, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.01734682358801365 + ], + "zero_point": [ + 133 + ] + }, + { + "id": 235, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "force_dtype": "torch.quint8", + "scale": [ + 0.0002663393097463995, + 0.0001794756535673514, + 0.00038752422551624477, + 0.00040039693703874946, + 0.0003284724662080407, + 0.0003813150688074529, + 0.00030346630956046283, + 0.0003574831353034824, + 0.00039425952127203345, + 0.00027267029508948326, + 
0.0005633418913930655, + 0.0002701474877540022, + 0.00029551019542850554, + 0.00038041899097152054, + 0.00028876157011836767, + 0.0003570202679838985 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + } + ], + "weight_tensor_infos": [], + "output_tensor_infos": [ + { + "id": 236, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": false, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "MinMaxObserver", + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": false, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "1": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.4.self_attn.q_proj.lora_A.default", + "input_tensor_infos": [ + { + "id": 238, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + 
"force_dtype": "torch.qint8", + "scale": [ + 0.005501109641045332 + ], + "zero_point": [ + 129 + ], + "smooth_quant_scaling_factor": [ + 0.30547964572906494, + 0.35993775725364685, + 0.31513577699661255, + 0.34536346793174744, + 0.44191431999206543, + 0.35816141963005066, + 0.3525455892086029, + 0.32803699374198914, + 0.34046927094459534, + 0.3439047634601593, + 0.26700085401535034, + 0.35134607553482056, + 0.35045164823532104, + 0.3664116859436035, + 0.3242016136646271, + 0.31810393929481506 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.005165941547602415, + 0.005216381512582302, + 0.005039858631789684, + 0.0055467900820076466, + 0.005347891245037317, + 0.005455384962260723, + 0.005077300127595663, + 0.0038714618422091007 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 3.273540496826172, + 2.7782580852508545, + 3.1732351779937744, + 2.8955001831054688, + 2.2628822326660156, + 2.79203724861145, + 2.836512565612793, + 3.0484366416931152, + 2.9371225833892822, + 2.9077816009521484, + 3.7453062534332275, + 2.8461966514587402, + 2.8534607887268066, + 2.729170560836792, + 3.084500312805176, + 3.1436264514923096 + ] + } + ], + "output_tensor_infos": [ + { + "id": 239, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 9.286790373153053e-06 + ], + "zero_point": [ + 122 + ] + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + 
"ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + }, + "2": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.model.decoder.layers.4.self_attn.q_proj.lora_B.default", + "input_tensor_infos": [ + { + "id": 239, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", + "scale": [ + 9.286790373153053e-06 + ], + "zero_point": [ + 122 + ], + "smooth_quant_scaling_factor": [ + 0.0010128314606845379, + 0.0009598570759408176, + 0.0008829228463582695, + 0.0010532870655879378, + 0.0010632226476445794, + 0.00080936832819134, + 0.0009245102410204709, + 0.0015908911591395736 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07, + 1.1920928955078125e-07 + ], + "zero_point": [ + 0, + 0, + 0, + 0, + 0, 
+ 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 987.3311157226562, + 1041.82177734375, + 1132.601806640625, + 949.4088134765625, + 940.5367431640625, + 1235.5313720703125, + 1081.65380859375, + 628.5784912109375 + ] + } + ], + "output_tensor_infos": [ + { + "id": 240, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": { + "0": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 234, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8" + }, + 
{} + ], + "output_tensor_infos": [ + { + "id": 237, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "1": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.q_proj.lora_dropout.default", + "input_tensor_infos": [ + { + "id": 237, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 238, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8" + } + ] + }, + "2": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 240, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 241, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "3": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 236, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + { + "id": 241, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "output_tensor_infos": [ + { + "id": 242, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "4": { + "op_type": "", + "fqn": "base_model.model.model.decoder.layers.4.self_attn.q_proj", + "input_tensor_infos": [ + { + "id": 242, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + }, + {} + ], + "output_tensor_infos": [ + { + "id": 243, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } + }, + "layer_output_infos": [ + { + "id": 243, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + }, + "base_model:model:score": { + "q_op_infos": { + "0": { + "op_type": "", + "op_type_is_module": true, + "fqn": "base_model.model.score.modules_to_save.default", + "input_tensor_infos": [ + { + "id": 291, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.quint8", + "force_dtype": "torch.qint8", 
+ "scale": [ + 0.002082008868455887 + ], + "zero_point": [ + 121 + ], + "smooth_quant_scaling_factor": [ + 0.09728745371103287, + 0.1161823719739914, + 0.11396356672048569, + 0.11526038497686386, + 0.1379472017288208, + 0.07854942232370377, + 0.11264046281576157, + 0.11592700332403183, + 0.06852123886346817, + 0.12406256049871445, + 0.13584022223949432, + 0.11282354593276978, + 0.04839572310447693, + 0.16329161822795868, + 0.15682533383369446, + 0.13011035323143005 + ] + } + ], + "weight_tensor_infos": [ + { + "orig_dtype": "torch.float32", + "inf_dtype": "torch.qint8", + "scale": [ + 0.002189199673011899, + 0.001974712824448943 + ], + "zero_point": [ + 0, + 0 + ], + "smooth_quant_scaling_factor": [ + 10.278817176818848, + 8.607157707214355, + 8.774734497070312, + 8.676008224487305, + 7.249150276184082, + 12.730838775634766, + 8.87780475616455, + 8.626117706298828, + 14.594016075134277, + 8.060449600219727, + 7.3615899085998535, + 8.863398551940918, + 20.662982940673828, + 6.1240129470825195, + 6.37652063369751, + 7.685783386230469 + ] + } + ], + "output_tensor_infos": [ + { + "id": 292, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ], + "activation_observer": { + "name": "SmoothQuantActivationObserver", + "smooth_quant_enabled": true, + "dtype": "torch.quint8", + "qscheme": "torch.per_tensor_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255, + "alpha": 0.5, + "act_observer": { + "name": "HistogramObserver", + "bins": 2048, + "upsample_rate": 128, + "dtype": "torch.qint8", + "qscheme": "torch.per_tensor_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "act_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": -1, + "dtype": "torch.quint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": 0, + "quant_max": 255 + } + }, + "weight_observer": { + "name": "SmoothQuantWeightObserver", + "smooth_quant_enabled": true, + "dtype": "torch.qint8", + 
"qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127, + "alpha": 0.5, + "wei_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 0, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_symmetric", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + }, + "wei_ic_observer": { + "name": "PerChannelMinMaxObserver", + "ch_axis": 1, + "dtype": "torch.qint8", + "qscheme": "torch.per_channel_affine", + "reduce_range": false, + "quant_min": -128, + "quant_max": 127 + } + } + } + }, + "nonq_op_infos": {}, + "layer_output_infos": [ + { + "id": 292, + "orig_dtype": "torch.float32", + "inf_dtype": "torch.float32" + } + ] + } +} \ No newline at end of file diff --git a/test/algorithm/test_smooth_quant.py b/test/algorithm/test_smooth_quant.py index 4b4201edcc3..3f0cb63a4d9 100644 --- a/test/algorithm/test_smooth_quant.py +++ b/test/algorithm/test_smooth_quant.py @@ -27,6 +27,7 @@ import intel_extension_for_pytorch as ipex TEST_IPEX = True + IPEX_VERSION = Version(ipex.__version__) except: TEST_IPEX = False @@ -891,8 +892,8 @@ def calib_func(model): ) self.assertTrue(torch.allclose(inc_sq_weight_scale, ipex_sq_weight_scale)) # set a big atol to avoid random issue - self.assertTrue(torch.allclose(ipex_out, inc_out, atol=1e-02)) - self.assertTrue(torch.allclose(output1, inc_out, atol=1e-02)) + self.assertTrue(torch.allclose(ipex_out, inc_out, atol=2e-02)) + self.assertTrue(torch.allclose(output1, inc_out, atol=2e-02)) class CalibDataloader: def __init__(self): @@ -915,7 +916,7 @@ def __iter__(self): ) output2 = q_model.model(input_ids) # set a big atol to avoid random issue - self.assertTrue(torch.allclose(output1, output2, atol=1e-02)) + self.assertTrue(torch.allclose(output1, output2, atol=2e-02)) conf = PostTrainingQuantConfig( backend="ipex", @@ -931,7 +932,7 @@ def __iter__(self): ) output2 = q_model.model(input_ids) # set a big atol to avoid random issue - 
self.assertTrue(torch.allclose(output1, output2, atol=1e-02)) + self.assertTrue(torch.allclose(output1, output2, atol=2e-02)) class TestSqSkipOp(unittest.TestCase): @@ -1288,5 +1289,143 @@ def test_sq_auto_mem_usage(self): assert (mem_use1 - mem_use0) <= 2.0 +class TestPeftModel(unittest.TestCase): + def test_peft_model_fixed_alpha(self): + import peft + + model_id = "peft-internal-testing/tiny_OPTForSequenceClassification-lora" + model = peft.AutoPeftModelForSequenceClassification.from_pretrained(model_id) + example_input = torch.ones(1, 12, dtype=torch.long) + out1 = model(example_input) + + def calib_func(model): + model(example_input) + + sq = TorchSmoothQuant(model, example_inputs=example_input, q_func=calib_func) + sq.transform(alpha=0.5, folding=False) + self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, SQLinearWrapper)) + self.assertTrue( + isinstance( + model.base_model.model.model.decoder.layers[0].self_attn.v_proj.sq_linear.lora_A.default, + SQLinearWrapper, + ) + ) # Linear in Linear + self.assertTrue( + isinstance(model.base_model.model.score.original_module, torch.nn.Linear) + ) # Linear that is not called in calibration + + def test_peft_model_auto_alpha(self): + import peft + + model_id = "peft-internal-testing/tiny_OPTForSequenceClassification-lora" + model = peft.AutoPeftModelForSequenceClassification.from_pretrained(model_id, torchscript=True) + example_input = torch.ones(1, 12, dtype=torch.long) + out1 = model(example_input) + + def calib_func(model): + model(example_input) + + # folding=False + sq = TorchSmoothQuant(model, example_inputs=example_input, q_func=calib_func) + sq.transform(alpha="auto", folding=False) + self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, SQLinearWrapper)) + self.assertTrue( + isinstance( + model.base_model.model.model.decoder.layers[0].self_attn.v_proj.sq_linear.lora_A.default, + SQLinearWrapper, + ) + ) # Linear in Linear + 
self.assertTrue( + isinstance(model.base_model.model.score.original_module, torch.nn.Linear) + ) # Linear that is not called in calibration + + # folding=True + model = peft.AutoPeftModelForSequenceClassification.from_pretrained(model_id, torchscript=True) + example_input = torch.ones(1, 12, dtype=torch.long) + out1 = model(example_input) + + def calib_func(model): + model(example_input) + + sq = TorchSmoothQuant(model, example_inputs=example_input, q_func=calib_func) + sq.transform(alpha="auto", folding=True) + self.assertTrue(isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, torch.nn.Linear)) + self.assertTrue( + isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj.lora_A.default, torch.nn.Linear) + ) # Linear in Linear + + def test_peft_model_quantization(self): + import peft + + model_id = "peft-internal-testing/tiny_OPTForSequenceClassification-lora" + model = peft.AutoPeftModelForSequenceClassification.from_pretrained(model_id) + # model.base_model.model.model.decoder.layers[0].self_attn.v_proj.lora_B.default.weight is Zero + # peft model is needed to be trained first. 
+ example_input = torch.ones(1, 12, dtype=torch.long) + out1 = model(example_input) + + def calib_func(model): + model(example_input) + + from neural_compressor import PostTrainingQuantConfig, quantization + + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": 0.5}} + conf = PostTrainingQuantConfig( + excluded_precisions=["bf16"], + recipes=recipes, + example_inputs=example_input, + ) + q_model = quantization.fit( + model, + conf, + calib_func=calib_func, + ) + decoder = q_model.model.base_model.model.model.decoder + self.assertTrue(isinstance(decoder.layers[0].self_attn.v_proj, SQLinearWrapper)) + self.assertTrue( + isinstance( + decoder.layers[0].self_attn.v_proj.sq_linear.module.lora_A.default, + SQLinearWrapper, + ) + ) # Linear in Linear + self.assertTrue( + isinstance(q_model.model.base_model.model.score.original_module, torch.nn.Linear) + ) # Linear that is not called in calibration + + @unittest.skipIf( + IPEX_VERSION.release <= Version("2.1.0").release and ipex.__version__ != "2.1.0+cpu", + "Please use Intel extension for Pytorch version higher or equal to 2.1.0", + ) + def test_peft_model_quantization_ipex(self): + import peft + + model_id = "peft-internal-testing/tiny_OPTForSequenceClassification-lora" + model = peft.AutoPeftModelForSequenceClassification.from_pretrained(model_id, torchscript=True) + # model.base_model.model.model.decoder.layers[0].self_attn.v_proj.lora_B.default.weight is Zero + # peft model is needed to be trained first. + example_input = torch.ones(1, 12, dtype=torch.long) + out1 = model(example_input)[0] + + def calib_func(model): + model(example_input) + + from neural_compressor import PostTrainingQuantConfig, quantization + + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": 0.5}} + conf = PostTrainingQuantConfig( + backend="ipex", # IPEX will got error now, will enhance it. 
+ excluded_precisions=["bf16"], + op_name_dict={".*": {"activation": {"algorithm": "minmax"}}}, + recipes=recipes, + example_inputs=example_input, + ) + q_model = quantization.fit( + model, + conf, + calib_func=calib_func, + ) + out2 = q_model.model(example_input)[0] + + if __name__ == "__main__": unittest.main() diff --git a/test/requirements.txt b/test/requirements.txt index 7e4499f31f4..616dbe385dc 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -10,6 +10,7 @@ onnx onnxruntime onnxruntime-extensions; python_version < '3.11' optimum +peft tensorflow-addons tf2onnx tf_slim