diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md
index 50f5913893e..13200d536c8 100644
--- a/docs/source/quantization_weight_only.md
+++ b/docs/source/quantization_weight_only.md
@@ -87,6 +87,8 @@ Notes:
 | use_max_length | False | Whether to align all calibration data to fixed length, which equals to pad_max_length. |
 | block_size | 128 | Execute GPTQ quantization per block, block shape = [$C_{out}$, block_size] |
 | static_groups | False | Whether to calculate group wise quantization parameters in advance. This option mitigate actorder's extra computational requirements |
+| true_sequential | False | Whether to quantize layers within a transformer block in their original execution order. This can lead to higher accuracy at the cost of a slower overall quantization process. |
+| lm_head | False | Whether to quantize the lm_head (the output linear layer at the end of the language model). |

 **Note:** Neural compressor provides `Unsigned integer for asymmetric quantization` and `Signed integer for symmetric quantization`. Please follow the below section to compress the low bit data type for saving.
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index ce4b7f9ab4f..477ad75b7e1 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -77,6 +77,8 @@ this should align with your model config, \
                     and your dataset builder args: args.pad_max_length')
 parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization')
+parser.add_argument('--gptq_true_sequential', action='store_true', help="Whether to run GPTQ in true_sequential mode.")
+parser.add_argument('--gptq_lm_head', action='store_true', help="Whether to use GPTQ to quantize the output layer (lm_head) of the LLM.")
 # ==============code generation args===========
 parser.add_argument("--code_generation", action="store_true")
 parser.add_argument("--n_samples", default=200, type=int)
@@ -278,7 +280,8 @@ def calib_func(prepared_model):
             'use_max_length': args.gptq_use_max_length,
             'pad_max_length': args.gptq_pad_max_length,
             'static_groups': args.gptq_static_groups,
-            "enable_mse_search": args.woq_enable_mse_search,
+            "true_sequential": args.gptq_true_sequential,
+            "lm_head": args.gptq_lm_head,
         }
         # GPTQ: use assistive functions to modify calib_dataloader and calib_func
         # TEQ: set calib_func=None, use default training func as calib_func
diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py
index db79fe05c7f..2a368665739 100644
--- a/neural_compressor/adaptor/pytorch.py
+++ b/neural_compressor/adaptor/pytorch.py
@@ -4722,6 +4722,8 @@ def gptq_quantize(self, model, tune_cfg, dataloader):
             "act_order": self.recipes["gptq_args"].get("act_order", False),
             "block_size": self.recipes["gptq_args"].get("block_size", True),
             "static_groups": self.recipes["gptq_args"].get("static_groups", False),
+            "true_sequential": self.recipes["gptq_args"].get("true_sequential", False),
+            "lm_head": self.recipes["gptq_args"].get("lm_head", False),
         }
         nsamples = self.recipes["gptq_args"].get("nsamples", 128)
         use_max_length = self.recipes["gptq_args"].get("use_max_length", False)
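The two new keys are consumed through `recipes["gptq_args"]`, exactly like the existing GPTQ recipe options and like the updated unit test further below. A minimal sketch of the user-facing flow (`build_model()` and `calib_dataloader` are placeholders, not part of this patch):

```python
# Minimal sketch: enabling the new GPTQ options through recipes.
# `build_model()` and `calib_dataloader` are user-supplied placeholders.
from neural_compressor import PostTrainingQuantConfig, quantization

conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # match all supported ops
            "weight": {"bits": 4, "group_size": 128, "scheme": "sym", "algorithm": "GPTQ"},
        },
    },
    recipes={
        "gptq_args": {
            "percdamp": 0.01,
            "use_max_length": True,
            "pad_max_length": 512,
            "true_sequential": True,  # quantize layers of each block in execution order
            "lm_head": True,  # also quantize the output projection after the last block
        },
    },
)
q_model = quantization.fit(build_model(), conf, calib_dataloader=calib_dataloader)
```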
diff --git a/neural_compressor/adaptor/torch_utils/gptq.py b/neural_compressor/adaptor/torch_utils/gptq.py
index 49ac8695598..2f1c6cc0582 100644
--- a/neural_compressor/adaptor/torch_utils/gptq.py
+++ b/neural_compressor/adaptor/torch_utils/gptq.py
@@ -89,6 +89,7 @@ def trace_gptq_target_blocks(module, module_types=[torch.nn.ModuleList, torch.nn
             "transformers": {}, Dict# TODO
         }
     """
+    find_transformers = False
     if type(module).__name__ == "MixFormerSequentialForCausalLM":  # pragma: no cover
         gptq_related_blocks = {
             "embeddings": {},
@@ -118,12 +119,19 @@ def trace_gptq_target_blocks(module, module_types=[torch.nn.ModuleList, torch.nn
         }
         for n, m in module.named_modules():
             if type(m) in module_types:
+                # the transformer block list is found
                 gptq_related_blocks["transformers_name"] = n
                 gptq_related_blocks["transformers"] = m
-                return gptq_related_blocks
+                find_transformers = True
+                # return gptq_related_blocks
+            elif is_leaf(m) and not find_transformers:
+                gptq_related_blocks["embeddings"][n] = m
+            elif n.find(gptq_related_blocks["transformers_name"]) == -1 and find_transformers:
+                # this leaf module no longer belongs to the transformer blocks
+                gptq_related_blocks["transformers_post"]["name"] = n
+                gptq_related_blocks["transformers_post"]["layer"] = m
             else:
-                if is_leaf(m):
-                    gptq_related_blocks["embeddings"][n] = m
+                continue
     return gptq_related_blocks
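With this change the tracer no longer returns at the first `ModuleList` hit; it keeps walking the module tree and records the last leaf module found after the block stack (typically the lm_head) under `transformers_post`. An illustrative sketch of the resulting dict for a GPT-J-style model (the module names are examples, not taken from this patch):

```python
# Illustrative result of trace_gptq_target_blocks for a GPT-J-style model (names are examples only).
gptq_related_blocks = {
    "embeddings": {"transformer.wte": ...},  # leaf modules encountered before the block list
    "transformers_name": "transformer.h",  # name of the torch.nn.ModuleList holding the decoder blocks
    "transformers": ...,  # the ModuleList itself
    "transformers_post": {"name": "lm_head", "layer": ...},  # last leaf module outside the blocks
}
```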
@@ -234,6 +242,7 @@ def __init__(
         self.sym_default = False
         self.act_order_default = False
         self.static_groups_default = False
+        self.true_sequential_default = None
         self.perchannel_default = True
         self.mse_default = False
         self.check_layer_config()
@@ -411,6 +420,9 @@ def check_layer_config(self):
                 tmp_weight_config[name]["static_groups"] = self.weight_config.get(
                     "static_groups", self.static_groups_default
                 )
+                tmp_weight_config[name]["true_sequential"] = self.weight_config.get(
+                    "true_sequential", self.true_sequential_default
+                )
                 tmp_weight_config[name]["perchannel"] = self.weight_config.get("perchannel", self.perchannel_default)
                 tmp_weight_config[name]["mse"] = self.weight_config.get("mse", self.mse_default)
         self.weight_config = tmp_weight_config
@@ -425,6 +437,9 @@ def check_layer_config(self):
                 self.weight_config[layer_name]["static_groups"] = config.get(
                     "static_groups", self.static_groups_default
                 )
+                self.weight_config[layer_name]["true_sequential"] = config.get(
+                    "true_sequential", self.true_sequential_default
+                )
                 self.weight_config[layer_name]["perchannel"] = config.get("perchannel", self.perchannel_default)
                 self.weight_config[layer_name]["mse"] = config.get("mse", self.mse_default)
@@ -544,6 +559,38 @@ def update_blockwise_hidden_states(self, outs):
         else:
             self.cache_positional_arguments[0] = outs[:]

+    def find_true_sequential_config(self):
+        for layer_name in self.weight_config:
+            if self.weight_config[layer_name].get("true_sequential", None) is not None:
+                return self.weight_config[layer_name]["true_sequential"]
+        return False
+
+    def find_lm_head_config(self):
+        for layer_name in self.weight_config:
+            if self.weight_config[layer_name].get("lm_head", None) is not None:
+                return self.weight_config[layer_name]["lm_head"]
+        return False
+
+    def analyze_true_sequential(self, module, inputs=None):
+        # obtain the depth of each linear layer in this block
+        # obtain all linear layers' names
+        layers = find_layers(module)
+        layers = list(layers)
+        # group layers into sequentials
+        # case 1: query, key and value are computed by one fused matrix (e.g. BLOOM's query_key_value)
+        if "q" in layers[0].lower() and "k" in layers[0].lower():
+            qkv_layers = [layers[0]]
+            post_qkv_layers = layers[1:]
+        else:
+            # case 2: query, key and value are computed by separate matrices
+            qkv_layers = layers[0:3]
+            post_qkv_layers = layers[3:]
+        layers.clear()
+        layers.append(qkv_layers)
+        for layer in post_qkv_layers:
+            layers.append([layer])
+        return layers
+
     @torch.no_grad()
     def execute_quantization(self, means=None, stds=None, model_path=None):
         """Run quantization."""
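As a quick illustration of what `analyze_true_sequential` produces (hypothetical GPT-J-style layer names; only the grouping logic is reproduced here):

```python
# Hypothetical layer names inside one decoder block; the grouping mirrors analyze_true_sequential.
layers = ["attn.q_proj", "attn.k_proj", "attn.v_proj", "attn.out_proj", "mlp.fc_in", "mlp.fc_out"]
# q/k/v are separate projections here (case 2), so the first three form one group
# and every later layer becomes its own group:
groups = [layers[0:3]] + [[name] for name in layers[3:]]
print(groups)
# [['attn.q_proj', 'attn.k_proj', 'attn.v_proj'], ['attn.out_proj'], ['mlp.fc_in'], ['mlp.fc_out']]
# For a fused projection such as BLOOM's "query_key_value" (case 1), the first group is that single layer.
```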
@@ -554,6 +601,11 @@ def execute_quantization(self, means=None, stds=None, model_path=None):
         # Step2: run gptq quantization in a transformer block-wise manner.
         gptq_config = {}
+
+        self.true_sequential = self.find_true_sequential_config()
+        # automatically derive the true_sequential grouping from the first block
+        true_sequential_map = self.analyze_true_sequential(self.gptq_related_blocks["transformers"][0])
+        logger.info(f"Sequential Name: {true_sequential_map}")
         tblock_length = len(self.gptq_related_blocks["transformers"])
         for block_idx in range(tblock_length):
             logger.info(f"Quantizing layer {block_idx + 1} / {tblock_length}..")
@@ -565,75 +617,209 @@ def execute_quantization(self, means=None, stds=None, model_path=None):
             # Step2.1: obtain all layers (Linear, Conv2d, etc) in the block which can be quantized.
             sub_layers = find_layers(transformer_block)
             sub_layers_to_quant = {}
+            # add true sequential options
+            if self.true_sequential is not None and self.true_sequential:
+                sequentials = true_sequential_map
+            else:
+                sequentials = [list(sub_layers.keys())]
+            # start to process every layer in each sequential group
+            for sequential in sequentials:
+                logger.info(f"Current quantization sequential: {sequential}")
+                sub_layers_to_quant = {}
+                sequential_layers = {n: sub_layers[n] for n in sequential}
+                for layer_name, layer_obj in sequential_layers.items():
+                    # filter sub_layers with included layer_names in self.weight_config
+                    full_layer_name = self.get_full_layer_name(layer_name, block_idx)
+                    # if self.weight_config.get(full_layer_name, None) == None:
+                    if self.get_layer_config(full_layer_name) is None:
+                        logger.warning(
+                            f"{full_layer_name} can be quantized " + "but excluded from quantization configs."
+                        )
+                    else:
+                        sub_layers_to_quant[layer_name] = layer_obj
+                del sequential_layers
+                sequential_layers = sub_layers_to_quant
+                # Step 2.2: Initialize GPTQ quantizers for collected layers.
+                gptq_for_this_block = {}
+                # initialize gptq quantizer for every layer in a transformer block
+                for layer_name in sequential_layers:
+                    # weight_config_this_layer = self.weight_config.get(
+                    #     self.get_full_layer_name(layer_name, block_idx), None
+                    # )
+                    full_layer_name = self.get_full_layer_name(layer_name, block_idx)
+                    weight_config_this_layer = self.get_layer_config(full_layer_name)
+                    if self.layer_wise:
+                        from ..torch_utils.layer_wise_quant.utils import load_value
+
+                        W = load_value(self.model, full_layer_name + ".weight", model_path)
+                    else:
+                        W = sequential_layers[layer_name].weight.data.clone()
+
+                    gptq_for_this_block[layer_name] = GPTQ(sequential_layers[layer_name], W, self.device)
+                    # gptq_for_this_block[layer_name].quantizer = Quantizer()
+                    gptq_for_this_block[layer_name].quantizer.configure(
+                        weight_config_this_layer["wbits"],
+                        weight_config_this_layer["perchannel"],
+                        weight_config_this_layer["sym"],
+                        weight_config_this_layer["mse"],
+                    )
+
+                # Step 2.3: modify forward functions to hook inputs data (used in gptq execution)
+                def add_batch(_name):
+                    def tmp(_, inp, out):
+                        gptq_for_this_block[_name].add_batch(inp[0].data, out.data)  # noqa: F821
+
+                    return tmp
+
+                handles = []  # register handles which add inputs and outputs to gptq object
+                for layer_name in sequential_layers:
+                    handles.append(sequential_layers[layer_name].register_forward_hook(add_batch(layer_name)))
+                idx = self.cache_key_arguments.pop("i")
+                for j in range(len(self.dataloader)):
+                    cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
+                    cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
+                    out = transformer_block(*cache_positional_batch, **cache_keyword_batch)
+                    out = self.track_hidden_states(out)
+                self.cache_key_arguments["i"] = idx
+                for h in handles:
+                    h.remove()
+                # Step 2.4: everything is prepared, so start quantization!
+                for layer_name in sequential_layers:
+                    # weight_config_this_layer = self.weight_config.get(
+                    #     self.get_full_layer_name(layer_name, block_idx), None
+                    # )
+                    weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx))
+                    logger.info(f"Quantizing layer {layer_name}")
+                    if self.layer_wise:
+                        from ..torch_utils.layer_wise_quant.utils import load_value
+
+                        full_layer_name = self.get_full_layer_name(layer_name, block_idx)
+                        W = load_value(self.model, full_layer_name + ".weight", model_path)
+                    else:
+                        W = sequential_layers[layer_name].weight.data.clone()
+                    scale, zp, Q = gptq_for_this_block[layer_name].fasterquant(
+                        W,
+                        blocksize=weight_config_this_layer["block_size"],
+                        percdamp=weight_config_this_layer["percdamp"],
+                        groupsize=weight_config_this_layer["group_size"],
+                        act_order=weight_config_this_layer["act_order"],
+                        static_groups=weight_config_this_layer["static_groups"],
+                    )
+                    if self.layer_wise:
+                        from ..torch_utils.layer_wise_quant.utils import (
+                            LWQ_WORKSPACE,
+                            clean_module_weight,
+                            load_value,
+                            set_module_tensor_to_device,
+                        )
+
+                        sub_layer = sequential_layers[layer_name]
+                        full_layer_name = self.get_full_layer_name(layer_name, block_idx)
+                        for n, p in sub_layer.named_parameters():
+                            param_name = full_layer_name + "." + n
+                            if n == "weight":
+                                set_module_tensor_to_device(self.model, param_name, self.device, Q)
+                            else:
+                                value = load_value(self.model, param_name, model_path)
+                                set_module_tensor_to_device(self.model, param_name, self.device, value)
+                        # sub_layer.weight.data = Q
+                        torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt")
+                        clean_module_weight(sub_layer)
+                        del Q
+                        gc.collect()
+                    else:
+                        sequential_layers[layer_name].weight.data = Q
+                    gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale}
+                    if not weight_config_this_layer["sym"]:
+                        gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp
+                    if weight_config_this_layer["act_order"] and not weight_config_this_layer["static_groups"]:
+                        # save perm for restoring the weights, but only when static_groups is not enabled.
+                        gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[
+                            layer_name
+                        ].perm
+                    gptq_for_this_block[layer_name].free()
+
+            # Step 2.5: replace output data with quantized weights
+            outs = []
+            idx = self.cache_key_arguments.pop("i")
+            for j in range(len(self.dataloader)):
+                cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
+                cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
+                out = transformer_block(*cache_positional_batch, **cache_keyword_batch)
+                out = self.track_hidden_states(out)
+                outs.append(out)
+            self.cache_key_arguments["i"] = idx
+            if self.layer_wise:
+                self.gptq_related_blocks["transformers"][block_idx] = transformer_block
+            else:
+                self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu()
+            del gptq_for_this_block
+            torch.cuda.empty_cache()
+            # iteratively replace the input with output, thus layerwise quantization can continue.
+            self.update_blockwise_hidden_states(outs)
+            logger.info("------------------------------")
+
+        # do the post transformer blocks quantization (e.g. lm_head)
+        do_post_transformer_quant = self.find_lm_head_config()
+        if do_post_transformer_quant:
+            logger.info("Quantizing post transformer layers")
+            # the input should be self.cache_key_arguments and self.cache_positional_arguments
+            sub_layers = find_layers(self.gptq_related_blocks["transformers_post"]["layer"])
+            sub_layers_to_quant = {}
             for layer_name, layer_obj in sub_layers.items():
                 # filter sub_layers with included layer_names in self.weight_config
-                full_layer_name = self.get_full_layer_name(layer_name, block_idx)
+                full_layer_name = self.gptq_related_blocks["transformers_post"]["name"]
                 # if self.weight_config.get(full_layer_name, None) == None:
                 if self.get_layer_config(full_layer_name) is None:
                     logger.warning(f"{full_layer_name} can be quantized " + "but excluded from quantization configs.")
                 else:
-                    sub_layers_to_quant[layer_name] = layer_obj
+                    sub_layers_to_quant[full_layer_name] = layer_obj
             del sub_layers
             sub_layers = sub_layers_to_quant
-            # Step 2.2: Initialize GPTQ quantizers for collected layers.
-            gptq_for_this_block = {}
-            # initialize gptq quantizer for every layer in a transformer block
+            gptq_post_block = {}
+
+            def add_batch_post(_name):
+                def tmp(_, inp, out):
+                    gptq_post_block[_name].add_batch(inp[0].data, out.data)
+
+                return tmp
+
             for layer_name in sub_layers:
-                # weight_config_this_layer = self.weight_config.get(
-                #     self.get_full_layer_name(layer_name, block_idx), None
-                # )
-                full_layer_name = self.get_full_layer_name(layer_name, block_idx)
+                full_layer_name = self.gptq_related_blocks["transformers_post"]["name"]
                 weight_config_this_layer = self.get_layer_config(full_layer_name)
-                if self.layer_wise:
-                    from ..torch_utils.layer_wise_quant.utils import load_value
+                W = sub_layers[layer_name].weight.data.clone()
-                    W = load_value(self.model, full_layer_name + ".weight", model_path)
-                else:
-                    W = sub_layers[layer_name].weight.data.clone()
-
-                gptq_for_this_block[layer_name] = GPTQ(sub_layers[layer_name], W, self.device)
+                gptq_post_block[layer_name] = GPTQ(sub_layers[layer_name], W, self.device)
                 # gptq_for_this_block[layer_name].quantizer = Quantizer()
-                gptq_for_this_block[layer_name].quantizer.configure(
+                gptq_post_block[layer_name].quantizer.configure(
                     weight_config_this_layer["wbits"],
                     weight_config_this_layer["perchannel"],
                     weight_config_this_layer["sym"],
                     weight_config_this_layer["mse"],
                 )
-
-            # Step 2.3: modify forward functions to hook inputs data (used in gptq execution)
-            def add_batch(_name):
-                def tmp(_, inp, out):
-                    gptq_for_this_block[_name].add_batch(inp[0].data, out.data)  # noqa: F821
-
-                return tmp
-
+            # register hooks that feed calibration data to the gptq quantizer
             handles = []  # register handles which add inputs and outputs to gptq object
             for layer_name in sub_layers:
-                handles.append(sub_layers[layer_name].register_forward_hook(add_batch(layer_name)))
-            idx = self.cache_key_arguments.pop("i")
+                handles.append(sub_layers[layer_name].register_forward_hook(add_batch_post(layer_name)))
             for j in range(len(self.dataloader)):
-                cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
-                cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
-                out = transformer_block(*cache_positional_batch, **cache_keyword_batch)
-                out = self.track_hidden_states(out)
-            self.cache_key_arguments["i"] = idx
+                if "hidden_states" in self.cache_key_arguments:
+                    out = sub_layers[layer_name](self.cache_key_arguments["hidden_states"][j])
+                else:
+                    out = sub_layers[layer_name](self.cache_positional_arguments[0][j])
+
+            # if "hidden_states" in self.cache_key_arguments:
+            #     self.cache_key_arguments["hidden_states"] = outs[:]
+            # else:
+            #     self.cache_positional_arguments[0] = outs[:]
+            # perform the inference process
+
             for h in handles:
                 h.remove()
-            # Step 2.4: everything is prepared, so start quantization!
-            for layer_name in sub_layers:
-                # weight_config_this_layer = self.weight_config.get(
-                #     self.get_full_layer_name(layer_name, block_idx), None
-                # )
-                weight_config_this_layer = self.get_layer_config(self.get_full_layer_name(layer_name, block_idx))
-                logger.info(f"Quantizing layer {layer_name}")
-                if self.layer_wise:
-                    from ..torch_utils.layer_wise_quant.utils import load_value
-
-                    full_layer_name = self.get_full_layer_name(layer_name, block_idx)
-                    W = load_value(self.model, full_layer_name + ".weight", model_path)
-                else:
-                    W = sub_layers[layer_name].weight.data.clone()
-                scale, zp, Q = gptq_for_this_block[layer_name].fasterquant(
+            for layer_name in sub_layers:
+                full_layer_name = self.gptq_related_blocks["transformers_post"]["name"]
+                weight_config_this_layer = self.get_layer_config(full_layer_name)
+                scale, zp, Q = gptq_post_block[layer_name].fasterquant(
                     W,
                     blocksize=weight_config_this_layer["block_size"],
                     percdamp=weight_config_this_layer["percdamp"],
@@ -641,59 +827,15 @@ def tmp(_, inp, out):
                     act_order=weight_config_this_layer["act_order"],
                     static_groups=weight_config_this_layer["static_groups"],
                 )
-                if self.layer_wise:
-                    from ..torch_utils.layer_wise_quant.utils import (
-                        LWQ_WORKSPACE,
-                        clean_module_weight,
-                        load_value,
-                        set_module_tensor_to_device,
-                    )
-
-                    sub_layer = sub_layers[layer_name]
-                    full_layer_name = self.get_full_layer_name(layer_name, block_idx)
-                    for n, p in sub_layer.named_parameters():
-                        param_name = full_layer_name + "." + n
-                        if n == "weight":
-                            set_module_tensor_to_device(self.model, param_name, self.device, Q)
-                        else:
-                            value = load_value(self.model, param_name, model_path)
-                            set_module_tensor_to_device(self.model, param_name, self.device, value)
-                    # sub_layer.weight.data = Q
-                    torch.save(sub_layer.state_dict(), LWQ_WORKSPACE + f"/{full_layer_name}.pt")
-                    clean_module_weight(sub_layer)
-                    del Q
-                    gc.collect()
-                else:
-                    sub_layers[layer_name].weight.data = Q
-                gptq_config[self.get_full_layer_name(layer_name, block_idx)] = {"scale": scale}
+                sub_layers[layer_name].weight.data = Q
+                # save the quantization results
+                gptq_config[full_layer_name] = {"scale": scale}
                 if not weight_config_this_layer["sym"]:
-                    gptq_config[self.get_full_layer_name(layer_name, block_idx)]["zero"] = zp
+                    gptq_config[full_layer_name]["zero"] = zp
                 if weight_config_this_layer["act_order"] and not weight_config_this_layer["static_groups"]:
                     # save perm for restoring the weights, but only when static_groups is not enabled.
-                    gptq_config[self.get_full_layer_name(layer_name, block_idx)]["perm"] = gptq_for_this_block[
-                        layer_name
-                    ].perm
-                gptq_for_this_block[layer_name].free()
-
-            # Step 2.5: replace output data with quantized weights
-            outs = []
-            idx = self.cache_key_arguments.pop("i")
-            for j in range(len(self.dataloader)):
-                cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
-                cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
-                out = transformer_block(*cache_positional_batch, **cache_keyword_batch)
-                out = self.track_hidden_states(out)
-                outs.append(out)
-            self.cache_key_arguments["i"] = idx
-            if self.layer_wise:
-                self.gptq_related_blocks["transformers"][block_idx] = transformer_block
-            else:
-                self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu()
-            del gptq_for_this_block
-            torch.cuda.empty_cache()
-            # iteratively replace the input with output, thus layerwise quantization can continue.
-            self.update_blockwise_hidden_states(outs)
-            logger.info("------------------------------")
+                    gptq_config[full_layer_name]["perm"] = gptq_post_block[full_layer_name].perm
+                gptq_post_block[layer_name].free()

         logger.info("Quantization done")
         # self.model.config.use_cache = self.use_cache
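For orientation, the mapping collected in `gptq_config` after this change looks roughly like the sketch below; the layer names are illustrative (GPT-J-style, matching the test below), and the optional keys follow the conditions in the code above:

```python
# Rough shape of gptq_config after quantization (tensor values omitted).
gptq_config = {
    "transformer.h.0.attn.k_proj": {"scale": ...},  # "zero" is added when sym=False,
    "transformer.h.0.mlp.fc_in": {"scale": ...},    # "perm" when act_order=True and static_groups=False
    "lm_head": {"scale": ...},  # present only when the lm_head option is enabled
}
```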
diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py
index 8bcacd65cff..ca4a011907a 100644
--- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py
+++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py
@@ -522,7 +522,15 @@ def __iter__(self):
                 },
             },
             recipes={
-                "gptq_args": {"percdamp": 0.01, "act_order": False, "use_max_length": True, "pad_max_length": 512},
+                "gptq_args": {
+                    "percdamp": 0.01,
+                    "act_order": False,
+                    "use_max_length": True,
+                    "pad_max_length": 512,
+                    "static_groups": True,
+                    "true_sequential": True,
+                    "lm_head": True,
+                },
             },
         )
@@ -537,7 +545,7 @@ def __iter__(self):
         )
         q_model.save("saved")
         out1 = q_model.model(input)
-        self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-02))
+        self.assertTrue(torch.allclose(out1[0], out0[0], atol=1e-01))
         compressed_model = q_model.export_compressed_model(use_optimum_format=False)
         out2 = compressed_model(input)
         torch.save(compressed_model.state_dict(), "saved/compressed_model.pt")
diff --git a/test/quantization/test_weight_only_quantization.py b/test/quantization/test_weight_only_quantization.py
index b990f9aee9f..f5577eea7c4 100644
--- a/test/quantization/test_weight_only_quantization.py
+++ b/test/quantization/test_weight_only_quantization.py
@@ -154,6 +154,7 @@ def __iter__(self):
                 "sym": True,
                 "percdamp": 0.01,
                 "perchannel": False,
+                "lm_head": True,
             },
             "transformer.h.1.attn.k_proj": {
                 "wbits": 3,