From 7c598e42e5b7899f284616c05c6896bf9d8bd2b8 Mon Sep 17 00:00:00 2001 From: blzheng Date: Fri, 10 Nov 2023 13:04:18 +0800 Subject: [PATCH] enable optimized codegen (#2257) --- examples/cpu/inference/python/llm/README.md | 9 +- .../run_accuracy_with_deepspeed.py | 3 +- .../run_generation_with_deepspeed.py | 1 + examples/cpu/inference/python/llm/run.py | 2 + .../llm/single_instance/run_accuracy.py | 3 +- .../llm/single_instance/run_generation.py | 1 + .../transformers/generation/beam_search.py | 5 +- .../transformers/generation/greedy_search.py | 5 +- .../models/cpu/modules/attentions.py | 5 + .../models/cpu/modules/decoder.py | 9 ++ .../models/reference/fusions/mha_fusion.py | 23 ++++- .../transformers/models/reference/models.py | 73 ++++++++++++++ .../models/reference/modules/attentions.py | 96 +++++++++++++++++++ .../models/reference/modules/decoder.py | 8 +- .../transformers/optimize.py | 18 +++- tests/cpu/hf_configs/codegen/config.json | 42 ++++++++ tests/cpu/test_ipex_optimize_transformers.py | 22 ++++- 17 files changed, 312 insertions(+), 13 deletions(-) create mode 100644 tests/cpu/hf_configs/codegen/config.json diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index 9b8d043ea..477448262 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -1,6 +1,6 @@ # Text Generation We provide the inference benchmarking scripts for large language models text generation.
-Support large language model families, including GPT-J, LLaMA, GPT-Neox, OPT, Falcon.
+We support large language model families, including GPT-J, LLaMA, GPT-Neox, OPT, Falcon, and CodeGen.
The scripts include both single instance and distributed (DeepSpeed) use cases.
The scripts cover model generation inference with low precision cases for different models with the best performance and accuracy (bf16 AMP, static quantization, and weight-only quantization).
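
For orientation, below is a minimal, hedged sketch of the bf16 flow these scripts wrap for a CodeGen checkpoint. The checkpoint name comes from the verified-model table later in this README; the `dtype` keyword of `ipex.optimize_transformers`, the prompt, and the `generate()` settings are illustrative assumptions, not the exact code of the provided scripts.

```python
# Hedged sketch: bf16 inference on a CodeGen model via ipex.optimize_transformers.
# Assumptions: dtype keyword matches other IPEX LLM examples; prompt/generation
# settings are illustrative only.
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Salesforce/codegen-2B-multi"  # verified CodeGen model from the table below
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).eval()

# Apply the transformers-specific CPU optimizations (assumed dtype keyword).
model = ipex.optimize_transformers(model, dtype=torch.bfloat16)

prompt = "def fibonacci(n):"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad(), torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```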
@@ -84,10 +84,11 @@ wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prom |GPT-NEOX| "EleutherAI/gpt-neox-20b" | ✅ | ✅ | ✅ | ❎ ** | |FALCON*|"tiiuae/falcon-40b" | ✅ | ✅ | ✅ | ❎ **| |OPT|"facebook/opt-30b", "facebook/opt-1.3b"| ✅ | ✅ | ✅ | ❎ **| +|CodeGen|"Salesforce/codegen-2B-multi"| ✅ | ✅ | ✅ | ❎ **| *For Falcon models from remote hub, we need to modify the config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, we need to pass an extra configuration file like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but needed for quantizations. -** For GPT-NEOX/FALCON/OPT models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage. +** For GPT-NEOX/FALCON/OPT/CodeGen models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage. *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, we are working in progress to cover those optimizations, which will expand the model list above. @@ -180,6 +181,8 @@ python run_gpt-neox_quantization.py --ipex-weight-only-quantization --output-dir python run_falcon_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m --config-file ## OPT quantization python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m +## CodeGen quantization +python run_codegen_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m ## (2) Run quantization performance test (note that GPT-NEOX uses --int8 instead of --int8-bf16-mixed) OMP_NUM_THREADS= numactl -m -C python run__quantization.py -m --quantized-model-path "./saved_results/best_model.pt" --benchmark --int8-bf16-mixed @@ -277,7 +280,7 @@ export WORK_DIR=./ cd distributed mv PATH/TO/prompt.json ./ -# Run GPTJ/LLAMA/OPT/Falcon with bfloat16 DeepSpeed +# Run GPTJ/LLAMA/OPT/Falcon/CodeGen with bfloat16 DeepSpeed deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m --dtype bfloat16 --ipex --deployment-mode # Run GPT-NeoX with ipex weight only quantization diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index ad0c605f8..b5435aff0 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -25,6 +25,7 @@ "opt": (AutoModelForCausalLM, AutoTokenizer), "llama": (AutoModelForCausalLM, LlamaTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), + "codegen": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -336,7 +337,7 @@ def _model_call( for text in inputs: input_ids = text.to(self._device) input_bs = inputs.shape[0] * self.num_beams - if re.search("GPTJ", self.base_model.config.architectures[0]): + if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE): beam_idx_tmp = torch.zeros( (2048, int(input_bs)), dtype=torch.long ).contiguous() diff --git 
a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 53e3cb935..37a414da0 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -32,6 +32,7 @@ "opt": (AutoModelForCausalLM, AutoTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), "chatglm": (AutoModelForCausalLM, AutoTokenizer), + "codegen": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 293d5ccbd..098ad893b 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -181,6 +181,8 @@ def main(args_in: Optional[List[str]] = None) -> None: ) elif re.search("OPT", config.architectures[0], re.IGNORECASE): qpath = Path(parent_path, "single_instance/run_opt_quantization.py") + elif re.search("codegen", config.architectures[0], re.IGNORECASE): + qpath = Path(parent_path, "single_instance/run_codegen_quantization.py") infer_cmd = ["python", qpath] # 1) quantization diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index adb06d970..0659a1e9a 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -17,6 +17,7 @@ "opt": (AutoModelForCausalLM, AutoTokenizer), "llama": (AutoModelForCausalLM, LlamaTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), + "codegen": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -172,7 +173,7 @@ def _model_call( for text in inputs: input_ids = text.to(self._device) input_bs = inputs.shape[0] * self.num_beams - if re.search("GPTJ", self.base_model.config.architectures[0]): + if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE): beam_idx_tmp = torch.zeros( (2048, int(input_bs)), dtype=torch.long ).contiguous() diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index d86b09df0..76dacc489 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -19,6 +19,7 @@ "llama": (AutoModelForCausalLM, LlamaTokenizer), "opt": (AutoModelForCausalLM, AutoTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), + "codegen": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/intel_extension_for_pytorch/transformers/generation/beam_search.py b/intel_extension_for_pytorch/transformers/generation/beam_search.py index a4e55f82c..7643999be 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_search.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_search.py @@ -176,6 +176,7 @@ def _beam_search( or re.search("OPT", self.config.architectures[0], re.IGNORECASE) or re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search("rw", self.config.architectures[0], re.IGNORECASE) + or re.search("codegen", self.config.architectures[0], re.IGNORECASE) ): first_token = False input_bs = 
input_ids.size()[0] @@ -183,7 +184,9 @@ def _beam_search( if model_inputs["past_key_values"] is None: first_token = True if first_token: - if re.search("GPTJ", self.config.architectures[0]): + if re.search("GPTJ", self.config.architectures[0]) or re.search( + "codegen", self.config.architectures[0], re.IGNORECASE + ): beam_idx_tmp = torch.zeros( (2048, int(batch_size * num_beams)), dtype=torch.long ).contiguous() diff --git a/intel_extension_for_pytorch/transformers/generation/greedy_search.py b/intel_extension_for_pytorch/transformers/generation/greedy_search.py index 7e8f08d1e..fa2fc778c 100644 --- a/intel_extension_for_pytorch/transformers/generation/greedy_search.py +++ b/intel_extension_for_pytorch/transformers/generation/greedy_search.py @@ -157,13 +157,16 @@ def _greedy_search( or re.search("OPT", self.config.architectures[0], re.IGNORECASE) or re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search("rw", self.config.architectures[0], re.IGNORECASE) + or re.search("codegen", self.config.architectures[0], re.IGNORECASE) ): first_token = False input_bs = input_ids.size()[0] if model_inputs["past_key_values"] is None: first_token = True if first_token: - if re.search("GPTJ", self.config.architectures[0]): + if re.search("GPTJ", self.config.architectures[0]) or re.search( + "codegen", self.config.architectures[0], re.IGNORECASE + ): beam_idx_tmp = torch.zeros( (2048, int(input_bs)), dtype=torch.long ).contiguous() diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py index 94c65b368..0c5241333 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py @@ -26,6 +26,11 @@ def __init__(self, module, config, tpp=False, woq=False): self.rope_base, self.model_backbone, ) + + if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( + "codegen", self.model_backbone, re.IGNORECASE + ): + self._IPEXROPE.embed_positions.sin_cos = self.embed_positions if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( "LLAMA", self.model_backbone, re.IGNORECASE ): diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py index 543dddcf8..64cbfb32b 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py @@ -67,5 +67,14 @@ def __init__(self, module, config, tpp=False, woq=False): self.linear_add = _IPEXlinearAddCPU( module.linear_add.linear, tpp=tpp, woq=woq ) + elif re.search("codegen", self.model_backbone, re.IGNORECASE): + if not self.distributed: + self.linear_add_add = _IPEXlinearAddAddCPU( + module.linear_add_add.linear, tpp=tpp, woq=woq + ) + # woq_linear_gelu has accuracy issues on codegen, disable it + self.linear_gelu = _IPEXlinearNewGeluCPU( + module.linear_gelu.linear, tpp=tpp and not woq, woq=False + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py index 6be6dd206..65b78df1c 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py +++ 
b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py @@ -161,6 +161,17 @@ def forward( _cos = _cos.type(x.dtype)[:, 0:seq_len] _sin = _sin.type(x.dtype)[:, 0:seq_len] x = (x * _cos) + (self.rotate_half(x) * _sin) + elif re.search("codegen", self.model_backbone, re.IGNORECASE): + sincos = _sin_cos[position_ids] + sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) + if rotary_ndims is not None: + x_rot = x[:, :, :, :rotary_ndims] + x_pass = x[:, :, :, rotary_ndims:] + + x_rot = self.apply_rotary_pos_emb_gptj(x_rot, sin, cos) + x = torch.cat([x_rot, x_pass], dim=-1) + else: + x = self.apply_rotary_pos_emb_gptj(x, sin, cos) else: AssertionError(False, "Do not support the optimization of your model yet") return x @@ -201,6 +212,12 @@ def __init__(self, module, config): if hasattr(module, "new_decoder_architecture") else None ) + elif re.search("codegen", self.model_backbone, re.IGNORECASE): + self.num_heads = module.num_attention_heads + self.head_dim = module.head_dim + self.scale_attn = module.scale_attn + self.attn_dropout = module.attn_dropout + self.causal_mask = module.causal_mask for k, v in module.__class__.__dict__.items(): if k.startswith("__") or k.startswith("forward"): @@ -247,12 +264,14 @@ def forward( key.permute(0, 2, 1, 3) if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search("OPT", self.model_backbone, re.IGNORECASE) + or re.search("codegen", self.model_backbone, re.IGNORECASE) else key ) query = ( query.permute(0, 2, 1, 3) if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search("OPT", self.model_backbone, re.IGNORECASE) + or re.search("codegen", self.model_backbone, re.IGNORECASE) else query ) value = value.permute(0, 2, 1, 3) @@ -263,7 +282,9 @@ def forward( value = torch.cat((past_value, value), dim=-2) present = (key, value) - if re.search("GPTJ", self.model_backbone, re.IGNORECASE): + if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( + "codegen", self.model_backbone, re.IGNORECASE + ): attn_output, attn_weights = self._attn( query, key, value, attention_mask, head_mask ) diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index c7f40b255..0a4f9ec61 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -301,6 +301,79 @@ def OPTForCausalLM_forward( ) +def CodeGenForCausalLM_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + token_type_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # make sure sampling in fp16 works correctly and + # compute loss in fp32 to match with mesh-tf version + # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179 + lm_logits = self.lm_head(hidden_states).to(torch.float32) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) + ) + + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def prepare_inputs_for_generation( self, input_ids: torch.LongTensor, diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 14668b42b..4852d1300 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -530,6 +530,91 @@ def _FalconAttention_forward( return output_tensor, present +def _CodeGenAttention_forward( + self, + hidden_states: Optional[torch.FloatTensor], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, +) -> Union[ + Tuple[torch.Tensor, Tuple[torch.Tensor]], + Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], +]: + qkv = self.qkv_proj(hidden_states) + # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic + mp_num = 4 + qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1)) + + local_dim = self.head_dim * self.num_attention_heads // mp_num + query, value, key = torch.split(qkv_split, local_dim, dim=-1) + query = self._split_heads( + query, self.num_attention_heads, self.head_dim, mp_num=mp_num + ).contiguous() + key = self._split_heads( + key, self.num_attention_heads, self.head_dim, mp_num=mp_num + ).contiguous() + value = self._split_heads( + value, self.num_attention_heads, self.head_dim, mp_num=mp_num + ).contiguous() + + 
key = self._IPEXROPE( + key, + position_ids.contiguous(), + self.num_attention_heads, + self.head_dim, + 1, # neighbor elements + 64, + ) + query = self._IPEXROPE( + query, + position_ids.contiguous(), + self.num_attention_heads, + self.head_dim, + 1, + 64, + ) + + if use_cache: + ( + attn_output, + attn_weights, + present, + ) = self._IPEXScaleDotProduct( + query, + key, + value, + self.scale_attn, + layer_past, + head_mask, + attention_mask, + ) + else: + key = key.permute(0, 2, 1, 3) + query = query.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + present = None + + # compute self-attention: V x Softmax(QK^T) + attn_output, attn_weights = self._attn( + query, key, value, attention_mask, head_mask + ) + + attn_output = self._merge_heads( + attn_output, self.num_attention_heads, self.head_dim + ) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + class _IPEXAttentionRef(nn.Module): def __init__(self, module, config, sdp_module_ref, distributed=False): super().__init__() @@ -733,6 +818,17 @@ def forward( use_cache, output_attentions, ) + elif re.search("codegen", self.model_backbone, re.IGNORECASE): + return _CodeGenAttention_forward( + self, + hidden_states, + layer_past, + attention_mask, + position_ids, + head_mask, + use_cache, + output_attentions, + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index 486321e28..a7f4951a0 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -259,7 +259,9 @@ def __init__(self, module, config, distributed=False): self.distributed = distributed self.model_backbone = config.architectures[0] - if re.search("GPTJ", self.model_backbone, re.IGNORECASE): + if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( + "CodeGen", self.model_backbone, re.IGNORECASE + ): if not self.distributed: self.linear_add_add = _IPEXlinearAddAddRef(module.mlp.fc_out) del self.__dict__["_modules"]["mlp"].fc_out @@ -316,7 +318,9 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, alibi: Optional[torch.Tensor] = None, ): - if re.search("GPTJ", self.model_backbone, re.IGNORECASE): + if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( + "CodeGen", self.model_backbone, re.IGNORECASE + ): return GPTJBlock_forward( self, hidden_states, diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index cbda882b8..956aadbd2 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -108,6 +108,7 @@ def model_convert_reference(_model): LlamaForCausalLM_forward, GPTNeoXForCausalLM_forward, OPTForCausalLM_forward, + CodeGenForCausalLM_forward, prepare_inputs_for_generation, ) @@ -172,6 +173,16 @@ def model_convert_reference(_model): "forward", OPTForCausalLM_forward, ) + elif ( + hasattr(_model, "__class__") + and _model.__class__ + == transformers.models.codegen.modeling_codegen.CodeGenForCausalLM + ): + convert_function( + _model, + "forward", + CodeGenForCausalLM_forward, + ) # checking if model has been 
wrapped by deepspeed (distributed or not) try: @@ -189,6 +200,7 @@ def model_convert_reference(_model): transformers.models.llama.modeling_llama.LlamaAttention, transformers.models.gptj.modeling_gptj.GPTJAttention, transformers.models.opt.modeling_opt.OPTAttention, + transformers.models.codegen.modeling_codegen.CodeGenAttention, ]: convert_class( _model, @@ -201,6 +213,7 @@ def model_convert_reference(_model): for supported_decoder_class in [ transformers.models.llama.modeling_llama.LlamaDecoderLayer, transformers.models.gptj.modeling_gptj.GPTJBlock, + transformers.models.codegen.modeling_codegen.CodeGenBlock, transformers.models.opt.modeling_opt.OPTDecoderLayer, ]: convert_class( @@ -452,7 +465,7 @@ def optimize_transformers( r""" Apply optimizations at Python frontend to the given transformers model (nn.Module). This API focus on transformers models, especially for generation tasks inference. - Well supported model family: Llama, GPT-J, GPT-Neox, OPT, Falcon. + Well supported model family: Llama, GPT-J, GPT-Neox, OPT, Falcon, CodeGen. Args: model (torch.nn.Module): User model to apply optimizations. @@ -547,10 +560,11 @@ def optimize_transformers( or re.search("OPT", model.config.architectures[0], re.IGNORECASE) or re.search("falcon", model.config.architectures[0], re.IGNORECASE) or re.search("rw", model.config.architectures[0], re.IGNORECASE) + or re.search("codegen", model.config.architectures[0], re.IGNORECASE) ) if not well_supported_model: warnings.warn( - "optimize_transformers supports Llama, GPT-J, GPT-Neox, Falcon, and OPT, fallback to origin model" + "optimize_transformers supports Llama, GPT-J, GPT-Neox, Falcon, OPT, and CodeGen, fallback to origin model" ) return model diff --git a/tests/cpu/hf_configs/codegen/config.json b/tests/cpu/hf_configs/codegen/config.json new file mode 100644 index 000000000..cff0daa9a --- /dev/null +++ b/tests/cpu/hf_configs/codegen/config.json @@ -0,0 +1,42 @@ +{ + "_name_or_path": "codegen-2B-multi", + "activation_function": "gelu_new", + "architectures": [ + "CodeGenForCausalLM" + ], + "attn_pdrop": 0.0, + "bos_token_id": 1, + "embd_pdrop": 0.0, + "eos_token_id": 50256, + "gradient_checkpointing": false, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "codegen", + "n_ctx": 2048, + "n_embd": 2560, + "n_head": 32, + "n_inner": null, + "n_layer": 1, + "n_positions": 2048, + "resid_pdrop": 0.0, + "rotary_dim": 64, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50, + "temperature": 1.0 + } + }, + "tie_word_embeddings": false, + "tokenizer_class": "GPT2Tokenizer", + "torch_dtype": "float16", + "transformers_version": "4.21.0.dev0", + "use_cache": true, + "vocab_size": 51200 +} \ No newline at end of file diff --git a/tests/cpu/test_ipex_optimize_transformers.py b/tests/cpu/test_ipex_optimize_transformers.py index a8a2dd057..d0c311862 100644 --- a/tests/cpu/test_ipex_optimize_transformers.py +++ b/tests/cpu/test_ipex_optimize_transformers.py @@ -98,7 +98,9 @@ def model_replacement_check(self, model, has_position_id, torchcompile=False): ) self.assertEqual(key_hf[0], key_ipex[0], prec=0.1) - if re.search("GPTJ", model.config.architectures[0]): + if re.search("GPTJ", model.config.architectures[0]) or re.search( + "codegen", model.config.architectures[0] + ): assert ( ipex_m.transformer.h[0].attn.__class__ 
is ipex.transformers.models.cpu.modules.attentions._IPEXAttentionCPU @@ -228,6 +230,24 @@ def test_model_replacement_falcon_torchcompile(self): ipex.nn.utils._model_convert.replace_customized_linear_with_linear(m.eval()) self.model_replacement_check(m, False, torchcompile=True) + def test_model_replacement_codegen(self): + config = AutoConfig.from_pretrained( + f"{curpath}/hf_configs/codegen", return_dict=False + ) + m = transformers.models.codegen.modeling_codegen.CodeGenForCausalLM( + config + ).eval() + self.model_replacement_check(m, True) + + def test_model_replacement_codegen_torchcompile(self): + config = AutoConfig.from_pretrained( + f"{curpath}/hf_configs/codegen", return_dict=False + ) + m = transformers.models.codegen.modeling_codegen.CodeGenForCausalLM( + config + ).eval() + self.model_replacement_check(m, True, torchcompile=True) + def _model_replacement_check_woq(self, model): qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping() model = ipex.optimize_transformers(