enable optimized codegen (#2257)
blzheng authored Nov 10, 2023
1 parent 99aa54f commit 7c598e4
Showing 17 changed files with 312 additions and 13 deletions.
9 changes: 6 additions & 3 deletions examples/cpu/inference/python/llm/README.md
@@ -1,6 +1,6 @@
# Text Generation
We provide inference benchmarking scripts for text generation with large language models.<br/>
Supported large language model families include GPT-J, LLaMA, GPT-NeoX, OPT, and Falcon.<br/>
Supported large language model families include GPT-J, LLaMA, GPT-NeoX, OPT, Falcon, and CodeGen.<br/>
The scripts include both single-instance and distributed (DeepSpeed) use cases.<br/>
The scripts cover model generation inference with low precision for different models, with the best performance and accuracy (bf16 AMP, static quantization, and weight-only quantization).<br/>

@@ -84,10 +84,11 @@ wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prom
|GPT-NEOX| "EleutherAI/gpt-neox-20b" ||||** |
|FALCON*|"tiiuae/falcon-40b" ||||**|
|OPT|"facebook/opt-30b", "facebook/opt-1.3b"||||**|
|CodeGen|"Salesforce/codegen-2B-multi"||||**|

*For Falcon models from the remote hub, the config.json needs to be modified so that transformers uses modeling_falcon.py. Therefore, the following scripts pass an extra configuration file like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but required for quantization.

** For GPT-NEOX/FALCON/OPT models, the INT8 static quantization accuracy recipes are not ready yet, so these cases are skipped in our coverage.
** For GPT-NEOX/FALCON/OPT/CodeGen models, the INT8 static quantization accuracy recipes are not ready yet, so these cases are skipped in our coverage.

*Note*: The verified models above (including other models in the same families, like "codellama/CodeLlama-7b-hf" from the LLaMA family) are well supported with all optimizations, such as indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, work is in progress to cover these optimizations, which will expand the model list above.

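To make the optimization path concrete, below is a minimal Python sketch of running a CodeGen checkpoint in bf16 through the IPEX transformers frontend. The `ipex.optimize_transformers` entry point and its arguments are assumed from this IPEX release and may differ from the exact flow used by the run scripts in this folder.

import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Salesforce/codegen-2B-multi"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).eval()

# Apply the LLM-specific optimizations described above (indirect access KV cache,
# fused ROPE, prepacked TPP Linear); the API name here is an assumption.
model = ipex.optimize_transformers(model, dtype=torch.bfloat16, inplace=True)

prompt = "def fibonacci(n):"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad(), torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))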
@@ -180,6 +181,8 @@ python run_gpt-neox_quantization.py --ipex-weight-only-quantization --output-dir
python run_falcon_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <FALCON MODEL_ID> --config-file <CONFIG_FILE>
## OPT quantization
python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <OPT MODEL_ID>
## CodeGen quantization
python run_codegen_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <CODEGEN MODEL_ID>

## (2) Run quantization performance test (note that GPT-NEOX uses --int8 instead of --int8-bf16-mixed)
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_<MODEL>_quantization.py -m <MODEL_ID> --quantized-model-path "./saved_results/best_model.pt" --benchmark --int8-bf16-mixed
@@ -277,7 +280,7 @@ export WORK_DIR=./
cd distributed
mv PATH/TO/prompt.json ./
# Run GPTJ/LLAMA/OPT/Falcon with bfloat16 DeepSpeed
# Run GPTJ/LLAMA/OPT/Falcon/CodeGen with bfloat16 DeepSpeed
deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m <MODEL_ID> --dtype bfloat16 --ipex --deployment-mode
# Run GPT-NeoX with ipex weight only quantization
@@ -25,6 +25,7 @@
"opt": (AutoModelForCausalLM, AutoTokenizer),
"llama": (AutoModelForCausalLM, LlamaTokenizer),
"falcon": (AutoModelForCausalLM, AutoTokenizer),
"codegen": (AutoModelForCausalLM, AutoTokenizer),
"auto": (AutoModelForCausalLM, AutoTokenizer),
}

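For context, the MODEL_CLASSES mapping above is typically consumed by matching a family key against the model id; a hedged sketch of that lookup (the helper below is illustrative, not the script's exact code):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Subset of the mapping above, for illustration.
MODEL_CLASSES = {
    "codegen": (AutoModelForCausalLM, AutoTokenizer),
    "auto": (AutoModelForCausalLM, AutoTokenizer),
}

def resolve_model_classes(model_id: str):
    # Pick the first family key contained in the lower-cased model id,
    # falling back to the generic "auto" entry.
    key = next((k for k in MODEL_CLASSES if k != "auto" and k in model_id.lower()), "auto")
    return MODEL_CLASSES[key]

model_class, tokenizer_class = resolve_model_classes("Salesforce/codegen-2B-multi")
tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-2B-multi")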
@@ -336,7 +337,7 @@ def _model_call(
for text in inputs:
input_ids = text.to(self._device)
input_bs = inputs.shape[0] * self.num_beams
if re.search("GPTJ", self.base_model.config.architectures[0]):
if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE):
beam_idx_tmp = torch.zeros(
(2048, int(input_bs)), dtype=torch.long
).contiguous()
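# (context) beam_idx_tmp pre-allocates the beam-index buffer used by IPEX's
# indirect-access KV cache to reorder cached keys/values during beam search;
# 2048 matches the model's maximum number of positions.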
@@ -32,6 +32,7 @@
"opt": (AutoModelForCausalLM, AutoTokenizer),
"falcon": (AutoModelForCausalLM, AutoTokenizer),
"chatglm": (AutoModelForCausalLM, AutoTokenizer),
"codegen": (AutoModelForCausalLM, AutoTokenizer),
"auto": (AutoModelForCausalLM, AutoTokenizer),
}

2 changes: 2 additions & 0 deletions examples/cpu/inference/python/llm/run.py
@@ -181,6 +181,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
)
elif re.search("OPT", config.architectures[0], re.IGNORECASE):
qpath = Path(parent_path, "single_instance/run_opt_quantization.py")
elif re.search("codegen", config.architectures[0], re.IGNORECASE):
qpath = Path(parent_path, "single_instance/run_codegen_quantization.py")

infer_cmd = ["python", qpath]
# 1) quantization
@@ -17,6 +17,7 @@
"opt": (AutoModelForCausalLM, AutoTokenizer),
"llama": (AutoModelForCausalLM, LlamaTokenizer),
"falcon": (AutoModelForCausalLM, AutoTokenizer),
"codegen": (AutoModelForCausalLM, AutoTokenizer),
"auto": (AutoModelForCausalLM, AutoTokenizer),
}

@@ -172,7 +173,7 @@ def _model_call(
for text in inputs:
input_ids = text.to(self._device)
input_bs = inputs.shape[0] * self.num_beams
if re.search("GPTJ", self.base_model.config.architectures[0]):
if re.search("GPTJ", self.base_model.config.architectures[0]) or re.search("codegen", self.base_model.config.architectures[0], re.IGNORECASE):
beam_idx_tmp = torch.zeros(
(2048, int(input_bs)), dtype=torch.long
).contiguous()
@@ -19,6 +19,7 @@
"llama": (AutoModelForCausalLM, LlamaTokenizer),
"opt": (AutoModelForCausalLM, AutoTokenizer),
"falcon": (AutoModelForCausalLM, AutoTokenizer),
"codegen": (AutoModelForCausalLM, AutoTokenizer),
"auto": (AutoModelForCausalLM, AutoTokenizer),
}

@@ -176,14 +176,17 @@ def _beam_search(
or re.search("OPT", self.config.architectures[0], re.IGNORECASE)
or re.search("falcon", self.config.architectures[0], re.IGNORECASE)
or re.search("rw", self.config.architectures[0], re.IGNORECASE)
or re.search("codegen", self.config.architectures[0], re.IGNORECASE)
):
first_token = False
input_bs = input_ids.size()[0]
has_position_id = True
if model_inputs["past_key_values"] is None:
first_token = True
if first_token:
if re.search("GPTJ", self.config.architectures[0]):
if re.search("GPTJ", self.config.architectures[0]) or re.search(
"codegen", self.config.architectures[0], re.IGNORECASE
):
beam_idx_tmp = torch.zeros(
(2048, int(batch_size * num_beams)), dtype=torch.long
).contiguous()
@@ -157,13 +157,16 @@ def _greedy_search(
or re.search("OPT", self.config.architectures[0], re.IGNORECASE)
or re.search("falcon", self.config.architectures[0], re.IGNORECASE)
or re.search("rw", self.config.architectures[0], re.IGNORECASE)
or re.search("codegen", self.config.architectures[0], re.IGNORECASE)
):
first_token = False
input_bs = input_ids.size()[0]
if model_inputs["past_key_values"] is None:
first_token = True
if first_token:
if re.search("GPTJ", self.config.architectures[0]):
if re.search("GPTJ", self.config.architectures[0]) or re.search(
"codegen", self.config.architectures[0], re.IGNORECASE
):
beam_idx_tmp = torch.zeros(
(2048, int(input_bs)), dtype=torch.long
).contiguous()
@@ -26,6 +26,11 @@ def __init__(self, module, config, tpp=False, woq=False):
self.rope_base,
self.model_backbone,
)

if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search(
"codegen", self.model_backbone, re.IGNORECASE
):
self._IPEXROPE.embed_positions.sin_cos = self.embed_positions
if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search(
"LLAMA", self.model_backbone, re.IGNORECASE
):
@@ -67,5 +67,14 @@ def __init__(self, module, config, tpp=False, woq=False):
self.linear_add = _IPEXlinearAddCPU(
module.linear_add.linear, tpp=tpp, woq=woq
)
elif re.search("codegen", self.model_backbone, re.IGNORECASE):
if not self.distributed:
self.linear_add_add = _IPEXlinearAddAddCPU(
module.linear_add_add.linear, tpp=tpp, woq=woq
)
# woq_linear_gelu has accuracy issues on codegen, disable it
self.linear_gelu = _IPEXlinearNewGeluCPU(
module.linear_gelu.linear, tpp=tpp and not woq, woq=False
)
else:
AssertionError(False, "Do not support the optimization of your model yet")
@@ -161,6 +161,17 @@ def forward(
_cos = _cos.type(x.dtype)[:, 0:seq_len]
_sin = _sin.type(x.dtype)[:, 0:seq_len]
x = (x * _cos) + (self.rotate_half(x) * _sin)
elif re.search("codegen", self.model_backbone, re.IGNORECASE):
sincos = _sin_cos[position_ids]
sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
if rotary_ndims is not None:
x_rot = x[:, :, :, :rotary_ndims]
x_pass = x[:, :, :, rotary_ndims:]

x_rot = self.apply_rotary_pos_emb_gptj(x_rot, sin, cos)
x = torch.cat([x_rot, x_pass], dim=-1)
else:
x = self.apply_rotary_pos_emb_gptj(x, sin, cos)
else:
AssertionError(False, "Do not support the optimization of your model yet")
return x
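For reference, the `apply_rotary_pos_emb_gptj` helper used in the CodeGen branch above follows the GPT-J "rotate every two" rotary scheme; a hedged sketch mirroring the Hugging Face reference implementation (the broadcasting in the IPEX kernel may differ):

import torch

def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    # Interleave (-x2, x1) pairs along the last dimension.
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)

def apply_rotary_pos_emb_gptj(x: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    # Repeat sin/cos for the interleaved pairs, then rotate.
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (x * cos) + (rotate_every_two(x) * sin)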
@@ -201,6 +212,12 @@ def __init__(self, module, config):
if hasattr(module, "new_decoder_architecture")
else None
)
elif re.search("codegen", self.model_backbone, re.IGNORECASE):
self.num_heads = module.num_attention_heads
self.head_dim = module.head_dim
self.scale_attn = module.scale_attn
self.attn_dropout = module.attn_dropout
self.causal_mask = module.causal_mask
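# (context) These attributes mirror Hugging Face's CodeGenAttention module
# (head count, head size, attention scale, dropout, and the causal-mask buffer),
# so the wrapper can reuse them in the GPT-J-style attention path.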

for k, v in module.__class__.__dict__.items():
if k.startswith("__") or k.startswith("forward"):
@@ -247,12 +264,14 @@ def forward(
key.permute(0, 2, 1, 3)
if re.search("GPTJ", self.model_backbone, re.IGNORECASE)
or re.search("OPT", self.model_backbone, re.IGNORECASE)
or re.search("codegen", self.model_backbone, re.IGNORECASE)
else key
)
query = (
query.permute(0, 2, 1, 3)
if re.search("GPTJ", self.model_backbone, re.IGNORECASE)
or re.search("OPT", self.model_backbone, re.IGNORECASE)
or re.search("codegen", self.model_backbone, re.IGNORECASE)
else query
)
value = value.permute(0, 2, 1, 3)
@@ -263,7 +282,9 @@ def forward(
value = torch.cat((past_value, value), dim=-2)
present = (key, value)

if re.search("GPTJ", self.model_backbone, re.IGNORECASE):
if re.search("GPTJ", self.model_backbone, re.IGNORECASE) or re.search(
"codegen", self.model_backbone, re.IGNORECASE
):
attn_output, attn_weights = self._attn(
query, key, value, attention_mask, head_mask
)
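For context, the `_attn` fallback used for GPT-J/CodeGen above is standard causal scaled-dot-product attention; a simplified, hedged sketch (the Hugging Face implementation additionally upcasts to fp32 and applies the head mask and dropout):

import torch

def _attn_sketch(query, key, value, causal_mask, scale_attn, attention_mask=None):
    # Scaled dot-product scores, masked so each token only attends to the past.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) / scale_attn
    mask_value = torch.tensor(torch.finfo(attn_weights.dtype).min, dtype=attn_weights.dtype)
    attn_weights = torch.where(causal_mask, attn_weights, mask_value)
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask
    attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
    return torch.matmul(attn_weights, value), attn_weights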
@@ -301,6 +301,79 @@ def OPTForCausalLM_forward(
)


def CodeGenForCausalLM_forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
token_type_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
"""
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)

transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
position_ids=position_ids,
token_type_ids=token_type_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]

# make sure sampling in fp16 works correctly and
# compute loss in fp32 to match with mesh-tf version
# https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
lm_logits = self.lm_head(hidden_states).to(torch.float32)

loss = None
if labels is not None:
# move labels to correct device to enable model parallelism
labels = labels.to(lm_logits.device)
# Shift so that tokens < n predict n
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(
shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)

loss = loss.to(hidden_states.dtype)

if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output

return CausalLMOutputWithPast(
loss=loss,
logits=lm_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)

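The override above closely mirrors the upstream CodeGen forward. How it gets bound onto the Hugging Face class is not shown in this excerpt; a generic monkey-patch would look roughly like this (illustrative only, IPEX uses its own conversion helpers):

from transformers.models.codegen.modeling_codegen import CodeGenForCausalLM

# Illustrative binding only; not the mechanism used in this commit.
CodeGenForCausalLM.forward = CodeGenForCausalLM_forward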

def prepare_inputs_for_generation(
self,
input_ids: torch.LongTensor,