From 7c598e42e5b7899f284616c05c6896bf9d8bd2b8 Mon Sep 17 00:00:00 2001 From: blzheng Date: Fri, 10 Nov 2023 13:04:18 +0800 Subject: [PATCH] enable optimized codegen (#2257) --- examples/cpu/inference/python/llm/README.md | 9 +- .../run_accuracy_with_deepspeed.py | 3 +- .../run_generation_with_deepspeed.py | 1 + examples/cpu/inference/python/llm/run.py | 2 + .../llm/single_instance/run_accuracy.py | 3 +- .../llm/single_instance/run_generation.py | 1 + .../transformers/generation/beam_search.py | 5 +- .../transformers/generation/greedy_search.py | 5 +- .../models/cpu/modules/attentions.py | 5 + .../models/cpu/modules/decoder.py | 9 ++ .../models/reference/fusions/mha_fusion.py | 23 ++++- .../transformers/models/reference/models.py | 73 ++++++++++++++ .../models/reference/modules/attentions.py | 96 +++++++++++++++++++ .../models/reference/modules/decoder.py | 8 +- .../transformers/optimize.py | 18 +++- tests/cpu/hf_configs/codegen/config.json | 42 ++++++++ tests/cpu/test_ipex_optimize_transformers.py | 22 ++++- 17 files changed, 312 insertions(+), 13 deletions(-) create mode 100644 tests/cpu/hf_configs/codegen/config.json diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index 9b8d043ea..477448262 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -1,6 +1,6 @@ # Text Generation We provide the inference benchmarking scripts for large language models text generation.
-Support large language model families, including GPT-J, LLaMA, GPT-Neox, OPT, Falcon.
+We support large language model families, including GPT-J, LLaMA, GPT-Neox, OPT, Falcon, and CodeGen.
The scripts include both single instance and distributed (DeepSpeed) use cases.
The scripts cover model generation inference with low precision cases for different models with the best performance and accuracy (bf16 AMP, static quantization, and weight-only quantization).
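
For orientation, below is a minimal, hedged sketch of the bf16 flow these scripts wrap for a CodeGen checkpoint. The checkpoint name comes from the verified-model table later in this README; the `dtype` keyword of `ipex.optimize_transformers`, the prompt, and the `generate()` settings are illustrative assumptions, not the exact code of the provided scripts.

```python
# Hedged sketch: bf16 inference on a CodeGen model via ipex.optimize_transformers.
# Assumptions: dtype keyword matches other IPEX LLM examples; prompt/generation
# settings are illustrative only.
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Salesforce/codegen-2B-multi"  # verified CodeGen model from the table below
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).eval()

# Apply the transformers-specific CPU optimizations (assumed dtype keyword).
model = ipex.optimize_transformers(model, dtype=torch.bfloat16)

prompt = "def fibonacci(n):"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad(), torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```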
@@ -84,10 +84,11 @@ wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prom |GPT-NEOX| "EleutherAI/gpt-neox-20b" | ✅ | ✅ | ✅ | ❎ ** | |FALCON*|"tiiuae/falcon-40b" | ✅ | ✅ | ✅ | ❎ **| |OPT|"facebook/opt-30b", "facebook/opt-1.3b"| ✅ | ✅ | ✅ | ❎ **| +|CodeGen|"Salesforce/codegen-2B-multi"| ✅ | ✅ | ✅ | ❎ **| *For Falcon models from remote hub, we need to modify the config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, we need to pass an extra configuration file like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but needed for quantizations. -** For GPT-NEOX/FALCON/OPT models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage. +** For GPT-NEOX/FALCON/OPT/CodeGen models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage. *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, we are working in progress to cover those optimizations, which will expand the model list above. @@ -180,6 +181,8 @@ python run_gpt-neox_quantization.py --ipex-weight-only-quantization --output-dir python run_falcon_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m --config-file ## OPT quantization python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m +## CodeGen quantization +python run_codegen_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m ## (2) Run quantization performance test (note that GPT-NEOX uses --int8 instead of --int8-bf16-mixed) OMP_NUM_THREADS= numactl -m -C python run__quantization.py -m --quantized-model-path "./saved_results/best_model.pt" --benchmark --int8-bf16-mixed @@ -277,7 +280,7 @@ export WORK_DIR=./ cd distributed mv PATH/TO/prompt.json ./ -# Run GPTJ/LLAMA/OPT/Falcon with bfloat16 DeepSpeed +# Run GPTJ/LLAMA/OPT/Falcon/CodeGen with bfloat16 DeepSpeed deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m --dtype bfloat16 --ipex --deployment-mode # Run GPT-NeoX with ipex weight only quantization diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index ad0c605f8..b5435aff0 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -25,6 +25,7 @@ "opt": (AutoModelForCausalLM, AutoTokenizer), "llama": (AutoModelForCausalLM, LlamaTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), + "codegen": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -336,7 +337,7 @@ def _model_call( for text in inputs: input_ids = text.to(self._device) input_bs = inputs.shape[0] * self.num_beams - if re.search("GPTJ", self.base_model.config.architectures[0]): + if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE): beam_idx_tmp = torch.zeros( (2048, int(input_bs)), dtype=torch.long ).contiguous() diff --git 
a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 53e3cb935..37a414da0 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -32,6 +32,7 @@ "opt": (AutoModelForCausalLM, AutoTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), "chatglm": (AutoModelForCausalLM, AutoTokenizer), + "codegen": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 293d5ccbd..098ad893b 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -181,6 +181,8 @@ def main(args_in: Optional[List[str]] = None) -> None: ) elif re.search("OPT", config.architectures[0], re.IGNORECASE): qpath = Path(parent_path, "single_instance/run_opt_quantization.py") + elif re.search("codegen", config.architectures[0], re.IGNORECASE): + qpath = Path(parent_path, "single_instance/run_codegen_quantization.py") infer_cmd = ["python", qpath] # 1) quantization diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index adb06d970..0659a1e9a 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -17,6 +17,7 @@ "opt": (AutoModelForCausalLM, AutoTokenizer), "llama": (AutoModelForCausalLM, LlamaTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), + "codegen": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -172,7 +173,7 @@ def _model_call( for text in inputs: input_ids = text.to(self._device) input_bs = inputs.shape[0] * self.num_beams - if re.search("GPTJ", self.base_model.config.architectures[0]): + if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE): beam_idx_tmp = torch.zeros( (2048, int(input_bs)), dtype=torch.long ).contiguous() diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index d86b09df0..76dacc489 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -19,6 +19,7 @@ "llama": (AutoModelForCausalLM, LlamaTokenizer), "opt": (AutoModelForCausalLM, AutoTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), + "codegen": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/intel_extension_for_pytorch/transformers/generation/beam_search.py b/intel_extension_for_pytorch/transformers/generation/beam_search.py index a4e55f82c..7643999be 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_search.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_search.py @@ -176,6 +176,7 @@ def _beam_search( or re.search("OPT", self.config.architectures[0], re.IGNORECASE) or re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search("rw", self.config.architectures[0], re.IGNORECASE) + or re.search("codegen", self.config.architectures[0], re.IGNORECASE) ): first_token = False input_bs = 
input_ids.size()[0] @@ -183,7 +184,9 @@ def _beam_search( if model_inputs["past_key_values"] is None: first_token = True if first_token: - if re.search("GPTJ", self.config.architectures[0]): + if re.search("GPTJ", self.config.architectures[0]) or re.search( + "codegen", self.config.architectures[0], re.IGNORECASE + ): beam_idx_tmp = torch.zeros( (2048, int(batch_size * num_beams)), dtype=torch.long ).contiguous() diff --git a/intel_extension_for_pytorch/transformers/generation/greedy_search.py b/intel_extension_for_pytorch/transformers/generation/greedy_search.py index 7e8f08d1e..fa2fc778c 100644 --- a/intel_extension_for_pytorch/transformers/generation/greedy_search.py +++ b/intel_extension_for_pytorch/transformers/generation/greedy_search.py @@ -157,13 +157,16 @@ def _greedy_search( or re.search("OPT", self.config.architectures[0], re.IGNORECASE) or re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search("rw", self.config.architectures[0], re.IGNORECASE) + or re.search("codegen", self.config.architectures[0], re.IGNORECASE) ): first_token = False input_bs = input_ids.size()[0] if model_inputs["past_key_values"] is None: first_token = True if first_token: - if re.search("GPTJ", self.config.architectures[0]): + if re.search("GPTJ", self.config.architectures[0]) or re.search( + "codegen", self.config.architectures[0], re.IGNORECASE + ): beam_idx_tmp = torch.zeros( (2048, int(input_bs)), dtype=torch.long ).contiguous() diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py index 94c65b368..0c5241333 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py @@ -26,6 +26,11 @@ def __init__(self, module, config, tpp=False, woq=False): self.rope_base, self.model_backbone, ) + + if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( + "codegen", self.model_backbone, re.IGNORECASE + ): + self._IPEXROPE.embed_positions.sin_cos = self.embed_positions if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( "LLAMA", self.model_backbone, re.IGNORECASE ): diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py index 543dddcf8..64cbfb32b 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py @@ -67,5 +67,14 @@ def __init__(self, module, config, tpp=False, woq=False): self.linear_add = _IPEXlinearAddCPU( module.linear_add.linear, tpp=tpp, woq=woq ) + elif re.search("codegen", self.model_backbone, re.IGNORECASE): + if not self.distributed: + self.linear_add_add = _IPEXlinearAddAddCPU( + module.linear_add_add.linear, tpp=tpp, woq=woq + ) + # woq_linear_gelu has accuracy issues on codegen, disable it + self.linear_gelu = _IPEXlinearNewGeluCPU( + module.linear_gelu.linear, tpp=tpp and not woq, woq=False + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py index 6be6dd206..65b78df1c 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py +++ 
b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py @@ -161,6 +161,17 @@ def forward( _cos = _cos.type(x.dtype)[:, 0:seq_len] _sin = _sin.type(x.dtype)[:, 0:seq_len] x = (x * _cos) + (self.rotate_half(x) * _sin) + elif re.search("codegen", self.model_backbone, re.IGNORECASE): + sincos = _sin_cos[position_ids] + sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) + if rotary_ndims is not None: + x_rot = x[:, :, :, :rotary_ndims] + x_pass = x[:, :, :, rotary_ndims:] + + x_rot = self.apply_rotary_pos_emb_gptj(x_rot, sin, cos) + x = torch.cat([x_rot, x_pass], dim=-1) + else: + x = self.apply_rotary_pos_emb_gptj(x, sin, cos) else: AssertionError(False, "Do not support the optimization of your model yet") return x @@ -201,6 +212,12 @@ def __init__(self, module, config): if hasattr(module, "new_decoder_architecture") else None ) + elif re.search("codegen", self.model_backbone, re.IGNORECASE): + self.num_heads = module.num_attention_heads + self.head_dim = module.head_dim + self.scale_attn = module.scale_attn + self.attn_dropout = module.attn_dropout + self.causal_mask = module.causal_mask for k, v in module.__class__.__dict__.items(): if k.startswith("__") or k.startswith("forward"): @@ -247,12 +264,14 @@ def forward( key.permute(0, 2, 1, 3) if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search("OPT", self.model_backbone, re.IGNORECASE) + or re.search("codegen", self.model_backbone, re.IGNORECASE) else key ) query = ( query.permute(0, 2, 1, 3) if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search("OPT", self.model_backbone, re.IGNORECASE) + or re.search("codegen", self.model_backbone, re.IGNORECASE) else query ) value = value.permute(0, 2, 1, 3) @@ -263,7 +282,9 @@ def forward( value = torch.cat((past_value, value), dim=-2) present = (key, value) - if re.search("GPTJ", self.model_backbone, re.IGNORECASE): + if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( + "codegen", self.model_backbone, re.IGNORECASE + ): attn_output, attn_weights = self._attn( query, key, value, attention_mask, head_mask ) diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index c7f40b255..0a4f9ec61 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -301,6 +301,79 @@ def OPTForCausalLM_forward( ) +def CodeGenForCausalLM_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + token_type_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # make sure sampling in fp16 works correctly and + # compute loss in fp32 to match with mesh-tf version + # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179 + lm_logits = self.lm_head(hidden_states).to(torch.float32) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) + ) + + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def prepare_inputs_for_generation( self, input_ids: torch.LongTensor, diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 14668b42b..4852d1300 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -530,6 +530,91 @@ def _FalconAttention_forward( return output_tensor, present +def _CodeGenAttention_forward( + self, + hidden_states: Optional[torch.FloatTensor], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, +) -> Union[ + Tuple[torch.Tensor, Tuple[torch.Tensor]], + Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], +]: + qkv = self.qkv_proj(hidden_states) + # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic + mp_num = 4 + qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1)) + + local_dim = self.head_dim * self.num_attention_heads // mp_num + query, value, key = torch.split(qkv_split, local_dim, dim=-1) + query = self._split_heads( + query, self.num_attention_heads, self.head_dim, mp_num=mp_num + ).contiguous() + key = self._split_heads( + key, self.num_attention_heads, self.head_dim, mp_num=mp_num + ).contiguous() + value = self._split_heads( + value, self.num_attention_heads, self.head_dim, mp_num=mp_num + ).contiguous() + + 
key = self._IPEXROPE( + key, + position_ids.contiguous(), + self.num_attention_heads, + self.head_dim, + 1, # neighbor elements + 64, + ) + query = self._IPEXROPE( + query, + position_ids.contiguous(), + self.num_attention_heads, + self.head_dim, + 1, + 64, + ) + + if use_cache: + ( + attn_output, + attn_weights, + present, + ) = self._IPEXScaleDotProduct( + query, + key, + value, + self.scale_attn, + layer_past, + head_mask, + attention_mask, + ) + else: + key = key.permute(0, 2, 1, 3) + query = query.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + present = None + + # compute self-attention: V x Softmax(QK^T) + attn_output, attn_weights = self._attn( + query, key, value, attention_mask, head_mask + ) + + attn_output = self._merge_heads( + attn_output, self.num_attention_heads, self.head_dim + ) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + class _IPEXAttentionRef(nn.Module): def __init__(self, module, config, sdp_module_ref, distributed=False): super().__init__() @@ -733,6 +818,17 @@ def forward( use_cache, output_attentions, ) + elif re.search("codegen", self.model_backbone, re.IGNORECASE): + return _CodeGenAttention_forward( + self, + hidden_states, + layer_past, + attention_mask, + position_ids, + head_mask, + use_cache, + output_attentions, + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index 486321e28..a7f4951a0 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -259,7 +259,9 @@ def __init__(self, module, config, distributed=False): self.distributed = distributed self.model_backbone = config.architectures[0] - if re.search("GPTJ", self.model_backbone, re.IGNORECASE): + if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( + "CodeGen", self.model_backbone, re.IGNORECASE + ): if not self.distributed: self.linear_add_add = _IPEXlinearAddAddRef(module.mlp.fc_out) del self.__dict__["_modules"]["mlp"].fc_out @@ -316,7 +318,9 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, alibi: Optional[torch.Tensor] = None, ): - if re.search("GPTJ", self.model_backbone, re.IGNORECASE): + if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search( + "CodeGen", self.model_backbone, re.IGNORECASE + ): return GPTJBlock_forward( self, hidden_states, diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index cbda882b8..956aadbd2 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -108,6 +108,7 @@ def model_convert_reference(_model): LlamaForCausalLM_forward, GPTNeoXForCausalLM_forward, OPTForCausalLM_forward, + CodeGenForCausalLM_forward, prepare_inputs_for_generation, ) @@ -172,6 +173,16 @@ def model_convert_reference(_model): "forward", OPTForCausalLM_forward, ) + elif ( + hasattr(_model, "__class__") + and _model.__class__ + == transformers.models.codegen.modeling_codegen.CodeGenForCausalLM + ): + convert_function( + _model, + "forward", + CodeGenForCausalLM_forward, + ) # checking if model has been 
wrapped by deepspeed (distributed or not) try: @@ -189,6 +200,7 @@ def model_convert_reference(_model): transformers.models.llama.modeling_llama.LlamaAttention, transformers.models.gptj.modeling_gptj.GPTJAttention, transformers.models.opt.modeling_opt.OPTAttention, + transformers.models.codegen.modeling_codegen.CodeGenAttention, ]: convert_class( _model, @@ -201,6 +213,7 @@ def model_convert_reference(_model): for supported_decoder_class in [ transformers.models.llama.modeling_llama.LlamaDecoderLayer, transformers.models.gptj.modeling_gptj.GPTJBlock, + transformers.models.codegen.modeling_codegen.CodeGenBlock, transformers.models.opt.modeling_opt.OPTDecoderLayer, ]: convert_class( @@ -452,7 +465,7 @@ def optimize_transformers( r""" Apply optimizations at Python frontend to the given transformers model (nn.Module). This API focus on transformers models, especially for generation tasks inference. - Well supported model family: Llama, GPT-J, GPT-Neox, OPT, Falcon. + Well supported model family: Llama, GPT-J, GPT-Neox, OPT, Falcon, CodeGen. Args: model (torch.nn.Module): User model to apply optimizations. @@ -547,10 +560,11 @@ def optimize_transformers( or re.search("OPT", model.config.architectures[0], re.IGNORECASE) or re.search("falcon", model.config.architectures[0], re.IGNORECASE) or re.search("rw", model.config.architectures[0], re.IGNORECASE) + or re.search("codegen", model.config.architectures[0], re.IGNORECASE) ) if not well_supported_model: warnings.warn( - "optimize_transformers supports Llama, GPT-J, GPT-Neox, Falcon, and OPT, fallback to origin model" + "optimize_transformers supports Llama, GPT-J, GPT-Neox, Falcon, OPT, and CodeGen, fallback to origin model" ) return model diff --git a/tests/cpu/hf_configs/codegen/config.json b/tests/cpu/hf_configs/codegen/config.json new file mode 100644 index 000000000..cff0daa9a --- /dev/null +++ b/tests/cpu/hf_configs/codegen/config.json @@ -0,0 +1,42 @@ +{ + "_name_or_path": "codegen-2B-multi", + "activation_function": "gelu_new", + "architectures": [ + "CodeGenForCausalLM" + ], + "attn_pdrop": 0.0, + "bos_token_id": 1, + "embd_pdrop": 0.0, + "eos_token_id": 50256, + "gradient_checkpointing": false, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "codegen", + "n_ctx": 2048, + "n_embd": 2560, + "n_head": 32, + "n_inner": null, + "n_layer": 1, + "n_positions": 2048, + "resid_pdrop": 0.0, + "rotary_dim": 64, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50, + "temperature": 1.0 + } + }, + "tie_word_embeddings": false, + "tokenizer_class": "GPT2Tokenizer", + "torch_dtype": "float16", + "transformers_version": "4.21.0.dev0", + "use_cache": true, + "vocab_size": 51200 +} \ No newline at end of file diff --git a/tests/cpu/test_ipex_optimize_transformers.py b/tests/cpu/test_ipex_optimize_transformers.py index a8a2dd057..d0c311862 100644 --- a/tests/cpu/test_ipex_optimize_transformers.py +++ b/tests/cpu/test_ipex_optimize_transformers.py @@ -98,7 +98,9 @@ def model_replacement_check(self, model, has_position_id, torchcompile=False): ) self.assertEqual(key_hf[0], key_ipex[0], prec=0.1) - if re.search("GPTJ", model.config.architectures[0]): + if re.search("GPTJ", model.config.architectures[0]) or re.search( + "codegen", model.config.architectures[0] + ): assert ( ipex_m.transformer.h[0].attn.__class__ 
is ipex.transformers.models.cpu.modules.attentions._IPEXAttentionCPU @@ -228,6 +230,24 @@ def test_model_replacement_falcon_torchcompile(self): ipex.nn.utils._model_convert.replace_customized_linear_with_linear(m.eval()) self.model_replacement_check(m, False, torchcompile=True) + def test_model_replacement_codegen(self): + config = AutoConfig.from_pretrained( + f"{curpath}/hf_configs/codegen", return_dict=False + ) + m = transformers.models.codegen.modeling_codegen.CodeGenForCausalLM( + config + ).eval() + self.model_replacement_check(m, True) + + def test_model_replacement_codegen_torchcompile(self): + config = AutoConfig.from_pretrained( + f"{curpath}/hf_configs/codegen", return_dict=False + ) + m = transformers.models.codegen.modeling_codegen.CodeGenForCausalLM( + config + ).eval() + self.model_replacement_check(m, True, torchcompile=True) + def _model_replacement_check_woq(self, model): qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping() model = ipex.optimize_transformers(