enable optimized codegen (#2257)
blzheng authored Nov 10, 2023
1 parent 99aa54f commit 7c598e4
Showing 17 changed files with 312 additions and 13 deletions.
9 changes: 6 additions & 3 deletions examples/cpu/inference/python/llm/README.md
@@ -1,6 +1,6 @@
# Text Generation
We provide inference benchmarking scripts for text generation with large language models.<br/>
Supported large language model families include GPT-J, LLaMA, GPT-NeoX, OPT, and Falcon.<br/>
Supported large language model families include GPT-J, LLaMA, GPT-NeoX, OPT, Falcon, and CodeGen.<br/>
The scripts include both single-instance and distributed (DeepSpeed) use cases.<br/>
The scripts cover model generation inference with low precision for different models, with the best performance and accuracy (bf16 AMP, static quantization, and weight-only quantization).<br/>

@@ -84,10 +84,11 @@ wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prom
|GPT-NEOX| "EleutherAI/gpt-neox-20b" ||||** |
|FALCON*|"tiiuae/falcon-40b" ||||**|
|OPT|"facebook/opt-30b", "facebook/opt-1.3b"||||**|
|CodeGen|"Salesforce/codegen-2B-multi"||||**|

*For Falcon models from the remote hub, the config.json needs to be modified so that transformers uses modeling_falcon.py. Therefore, the following scripts pass an extra configuration file like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but required for quantization.

** For GPT-NEOX/FALCON/OPT models, the INT8 static quantization accuracy recipes are not ready yet, so these cases are skipped in our coverage.
** For GPT-NEOX/FALCON/OPT/CodeGen models, the INT8 static quantization accuracy recipes are not ready yet, so these cases are skipped in our coverage.

*Note*: The verified models above (including other models in the same families, like "codellama/CodeLlama-7b-hf" from the LLaMA family) are well supported with all optimizations, such as indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, work is in progress to cover these optimizations, which will expand the model list above.

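To make the optimization path concrete, below is a minimal Python sketch of running a CodeGen checkpoint in bf16 through the IPEX transformers frontend. The `ipex.optimize_transformers` entry point and its arguments are assumed from this IPEX release and may differ from the exact flow used by the run scripts in this folder.

import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Salesforce/codegen-2B-multi"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).eval()

# Apply the LLM-specific optimizations described above (indirect access KV cache,
# fused ROPE, prepacked TPP Linear); the API name here is an assumption.
model = ipex.optimize_transformers(model, dtype=torch.bfloat16, inplace=True)

prompt = "def fibonacci(n):"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad(), torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))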
@@ -180,6 +181,8 @@ python run_gpt-neox_quantization.py --ipex-weight-only-quantization --output-dir
python run_falcon_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <FALCON MODEL_ID> --config-file <CONFIG_FILE>
## OPT quantization
python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <OPT MODEL_ID>
## CodeGen quantization
python run_codegen_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <CODEGEN MODEL_ID>

## (2) Run quantization performance test (note that GPT-NEOX uses --int8 instead of --int8-bf16-mixed)
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_<MODEL>_quantization.py -m <MODEL_ID> --quantized-model-path "./saved_results/best_model.pt" --benchmark --int8-bf16-mixed
@@ -277,7 +280,7 @@ export WORK_DIR=./
cd distributed
mv PATH/TO/prompt.json ./
# Run GPTJ/LLAMA/OPT/Falcon with bfloat16 DeepSpeed
# Run GPTJ/LLAMA/OPT/Falcon/CodeGen with bfloat16 DeepSpeed
deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m <MODEL_ID> --dtype bfloat16 --ipex --deployment-mode
# Run GPT-NeoX with ipex weight only quantization
@@ -25,6 +25,7 @@
"opt": (AutoModelForCausalLM, AutoTokenizer),
"llama": (AutoModelForCausalLM, LlamaTokenizer),
"falcon": (AutoModelForCausalLM, AutoTokenizer),
"codegen": (AutoModelForCausalLM, AutoTokenizer),
"auto": (AutoModelForCausalLM, AutoTokenizer),
}

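For context, the MODEL_CLASSES mapping above is typically consumed by matching a family key against the model id; a hedged sketch of that lookup (the helper below is illustrative, not the script's exact code):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Subset of the mapping above, for illustration.
MODEL_CLASSES = {
    "codegen": (AutoModelForCausalLM, AutoTokenizer),
    "auto": (AutoModelForCausalLM, AutoTokenizer),
}

def resolve_model_classes(model_id: str):
    # Pick the first family key contained in the lower-cased model id,
    # falling back to the generic "auto" entry.
    key = next((k for k in MODEL_CLASSES if k != "auto" and k in model_id.lower()), "auto")
    return MODEL_CLASSES[key]

model_class, tokenizer_class = resolve_model_classes("Salesforce/codegen-2B-multi")
tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-2B-multi")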
@@ -336,7 +337,7 @@ def _model_call(
for text in inputs:
input_ids = text.to(self._device)
input_bs = inputs.shape[0] * self.num_beams
if re.search("GPTJ", self.base_model.config.architectures[0]):
if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE):
beam_idx_tmp = torch.zeros(
(2048, int(input_bs)), dtype=torch.long
).contiguous()
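# (context) beam_idx_tmp pre-allocates the beam-index buffer used by IPEX's
# indirect-access KV cache to reorder cached keys/values during beam search;
# 2048 matches the model's maximum number of positions.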
@@ -32,6 +32,7 @@
"opt": (AutoModelForCausalLM, AutoTokenizer),
"falcon": (AutoModelForCausalLM, AutoTokenizer),
"chatglm": (AutoModelForCausalLM, AutoTokenizer),
"codegen": (AutoModelForCausalLM, AutoTokenizer),
"auto": (AutoModelForCausalLM, AutoTokenizer),
}

2 changes: 2 additions & 0 deletions examples/cpu/inference/python/llm/run.py
@@ -181,6 +181,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
)
elif re.search("OPT", config.architectures[0], re.IGNORECASE):
qpath = Path(parent_path, "single_instance/run_opt_quantization.py")
elif re.search("codegen", config.architectures[0], re.IGNORECASE):
qpath = Path(parent_path, "single_instance/run_codegen_quantization.py")

infer_cmd = ["python", qpath]
# 1) quantization
@@ -17,6 +17,7 @@
"opt": (AutoModelForCausalLM, AutoTokenizer),
"llama": (AutoModelForCausalLM, LlamaTokenizer),
"falcon": (AutoModelForCausalLM, AutoTokenizer),
"codegen": (AutoModelForCausalLM, AutoTokenizer),
"auto": (AutoModelForCausalLM, AutoTokenizer),
}

@@ -172,7 +173,7 @@ def _model_call(
for text in inputs:
input_ids = text.to(self._device)
input_bs = inputs.shape[0] * self.num_beams
if re.search("GPTJ", self.base_model.config.architectures[0]):
if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE):
beam_idx_tmp = torch.zeros(
(2048, int(input_bs)), dtype=torch.long
).contiguous()
@@ -19,6 +19,7 @@
"llama": (AutoModelForCausalLM, LlamaTokenizer),
"opt": (AutoModelForCausalLM, AutoTokenizer),
"falcon": (AutoModelForCausalLM, AutoTokenizer),
"codegen": (AutoModelForCausalLM, AutoTokenizer),
"auto": (AutoModelForCausalLM, AutoTokenizer),
}

@@ -176,14 +176,17 @@ def _beam_search(
or re.search("OPT", self.config.architectures[0], re.IGNORECASE)
or re.search("falcon", self.config.architectures[0], re.IGNORECASE)
or re.search("rw", self.config.architectures[0], re.IGNORECASE)
or re.search("codegen", self.config.architectures[0], re.IGNORECASE)
):
first_token = False
input_bs = input_ids.size()[0]
has_position_id = True
if model_inputs["past_key_values"] is None:
first_token = True
if first_token:
if re.search("GPTJ", self.config.architectures[0]):
if re.search("GPTJ", self.config.architectures[0]) or re.search(
"codegen", self.config.architectures[0], re.IGNORECASE
):
beam_idx_tmp = torch.zeros(
(2048, int(batch_size * num_beams)), dtype=torch.long
).contiguous()
@@ -157,13 +157,16 @@ def _greedy_search(
or re.search("OPT", self.config.architectures[0], re.IGNORECASE)
or re.search("falcon", self.config.architectures[0], re.IGNORECASE)
or re.search("rw", self.config.architectures[0], re.IGNORECASE)
or re.search("codegen", self.config.architectures[0], re.IGNORECASE)
):
first_token = False
input_bs = input_ids.size()[0]
if model_inputs["past_key_values"] is None:
first_token = True
if first_token:
if re.search("GPTJ", self.config.architectures[0]):
if re.search("GPTJ", self.config.architectures[0]) or re.search(
"codegen", self.config.architectures[0], re.IGNORECASE
):
beam_idx_tmp = torch.zeros(
(2048, int(input_bs)), dtype=torch.long
).contiguous()
@@ -26,6 +26,11 @@ def __init__(self, module, config, tpp=False, woq=False):
self.rope_base,
self.model_backbone,
)

if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search(
"codegen", self.model_backbone, re.IGNORECASE
):
self._IPEXROPE.embed_positions.sin_cos = self.embed_positions
if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search(
"LLAMA", self.model_backbone, re.IGNORECASE
):
@@ -67,5 +67,14 @@ def __init__(self, module, config, tpp=False, woq=False):
self.linear_add = _IPEXlinearAddCPU(
module.linear_add.linear, tpp=tpp, woq=woq
)
elif re.search("codegen", self.model_backbone, re.IGNORECASE):
if not self.distributed:
self.linear_add_add = _IPEXlinearAddAddCPU(
module.linear_add_add.linear, tpp=tpp, woq=woq
)
# woq_linear_gelu has accuracy issues on codegen, disable it
self.linear_gelu = _IPEXlinearNewGeluCPU(
module.linear_gelu.linear, tpp=tpp and not woq, woq=False
)
else:
AssertionError(False, "Do not support the optimization of your model yet")
@@ -161,6 +161,17 @@ def forward(
_cos = _cos.type(x.dtype)[:, 0:seq_len]
_sin = _sin.type(x.dtype)[:, 0:seq_len]
x = (x * _cos) + (self.rotate_half(x) * _sin)
elif re.search("codegen", self.model_backbone, re.IGNORECASE):
sincos = _sin_cos[position_ids]
sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
if rotary_ndims is not None:
x_rot = x[:, :, :, :rotary_ndims]
x_pass = x[:, :, :, rotary_ndims:]

x_rot = self.apply_rotary_pos_emb_gptj(x_rot, sin, cos)
x = torch.cat([x_rot, x_pass], dim=-1)
else:
x = self.apply_rotary_pos_emb_gptj(x, sin, cos)
else:
AssertionError(False, "Do not support the optimization of your model yet")
return x
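For reference, the `apply_rotary_pos_emb_gptj` helper used in the CodeGen branch above follows the GPT-J "rotate every two" rotary scheme; a hedged sketch mirroring the Hugging Face reference implementation (the broadcasting in the IPEX kernel may differ):

import torch

def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    # Interleave (-x2, x1) pairs along the last dimension.
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)

def apply_rotary_pos_emb_gptj(x: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    # Repeat sin/cos for the interleaved pairs, then rotate.
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (x * cos) + (rotate_every_two(x) * sin)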
@@ -201,6 +212,12 @@ def __init__(self, module, config):
if hasattr(module, "new_decoder_architecture")
else None
)
elif re.search("codegen", self.model_backbone, re.IGNORECASE):
self.num_heads = module.num_attention_heads
self.head_dim = module.head_dim
self.scale_attn = module.scale_attn
self.attn_dropout = module.attn_dropout
self.causal_mask = module.causal_mask
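# (context) These attributes mirror Hugging Face's CodeGenAttention module
# (head count, head size, attention scale, dropout, and the causal-mask buffer),
# so the wrapper can reuse them in the GPT-J-style attention path.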

for k, v in module.__class__.__dict__.items():
if k.startswith("__") or k.startswith("forward"):
@@ -247,12 +264,14 @@ def forward(
key.permute(0, 2, 1, 3)
if re.search("GPTJ", self.model_backbone, re.IGNORECASE)
or re.search("OPT", self.model_backbone, re.IGNORECASE)
or re.search("codegen", self.model_backbone, re.IGNORECASE)
else key
)
query = (
query.permute(0, 2, 1, 3)
if re.search("GPTJ", self.model_backbone, re.IGNORECASE)
or re.search("OPT", self.model_backbone, re.IGNORECASE)
or re.search("codegen", self.model_backbone, re.IGNORECASE)
else query
)
value = value.permute(0, 2, 1, 3)
@@ -263,7 +282,9 @@ def forward(
value = torch.cat((past_value, value), dim=-2)
present = (key, value)

if re.search("GPTJ", self.model_backbone, re.IGNORECASE):
if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search(
"codegen", self.model_backbone, re.IGNORECASE
):
attn_output, attn_weights = self._attn(
query, key, value, attention_mask, head_mask
)
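For context, the `_attn` fallback used for GPT-J/CodeGen above is standard causal scaled-dot-product attention; a simplified, hedged sketch (the Hugging Face implementation additionally upcasts to fp32 and applies the head mask and dropout):

import torch

def _attn_sketch(query, key, value, causal_mask, scale_attn, attention_mask=None):
    # Scaled dot-product scores, masked so each token only attends to the past.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) / scale_attn
    mask_value = torch.tensor(torch.finfo(attn_weights.dtype).min, dtype=attn_weights.dtype)
    attn_weights = torch.where(causal_mask, attn_weights, mask_value)
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask
    attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
    return torch.matmul(attn_weights, value), attn_weights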
@@ -301,6 +301,79 @@ def OPTForCausalLM_forward(
)


def CodeGenForCausalLM_forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
token_type_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
"""
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)

transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
position_ids=position_ids,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]

# make sure sampling in fp16 works correctly and
# compute loss in fp32 to match with mesh-tf version
# https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
lm_logits = self.lm_head(hidden_states).to(torch.float32)

loss = None
if labels is not None:
# move labels to correct device to enable model parallelism
labels = labels.to(lm_logits.device)
# Shift so that tokens < n predict n
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(
shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)

loss = loss.to(hidden_states.dtype)

if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output

return CausalLMOutputWithPast(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)

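The override above closely mirrors the upstream CodeGen forward. How it gets bound onto the Hugging Face class is not shown in this excerpt; a generic monkey-patch would look roughly like this (illustrative only, IPEX uses its own conversion helpers):

from transformers.models.codegen.modeling_codegen import CodeGenForCausalLM

# Illustrative binding only; not the mechanism used in this commit.
CodeGenForCausalLM.forward = CodeGenForCausalLM_forward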

def prepare_inputs_for_generation(
self,
input_ids: torch.LongTensor,