This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

add peft model support in deepspeed sharded mode #884

Merged: 3 commits, Dec 9, 2023
65 changes: 58 additions & 7 deletions intel_extension_for_transformers/neural_chat/models/model_utils.py
@@ -46,6 +46,7 @@
WeightOnlyQuantConfig,
BitsAndBytesConfig
)
import shutil

if is_deepspeed_available():
import deepspeed # pylint: disable=E0401
@@ -232,11 +233,19 @@ def import_deepspeed():
print("DeepSpeed is enabled.")


def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta, token=None):
def init_deepspeed_inference(model, model_name_or_path, peft_path, use_hpu_graphs, is_meta, token=None):
# Initialize the model
from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu # pylint: disable=E0401

world_size, rank, local_rank = initialize_distributed_hpu()
merged_model_dir = None
if peft_path and is_meta:
merged_model_dir = "/tmp/text_generation_merged_peft_model"
if local_rank == 0:
if Path(merged_model_dir).is_dir():
shutil.rmtree(merged_model_dir)
peft_model(model_name_or_path, peft_path, torch.bfloat16, token).save_pretrained(merged_model_dir)
torch.distributed.barrier()

model = model.eval()
ds_inference_kwargs = {"dtype": torch.bfloat16}
@@ -245,7 +254,8 @@ def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta,
# Make sure all devices/nodes have access to the model checkpoints
if is_meta:
checkpoints_json = "checkpoints.json"
write_checkpoints_json(model_name_or_path, local_rank, checkpoints_json, token)
write_checkpoints_json(merged_model_dir if merged_model_dir is not None else model_name_or_path, local_rank,
checkpoints_json, token)

torch.distributed.barrier()

@@ -256,6 +266,50 @@ def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta,
model = deepspeed.init_inference(model, **ds_inference_kwargs)
return model.module


def peft_model(model_name, peft_model, model_dtype, hf_access_token=None):
import importlib.util

if importlib.util.find_spec("peft") is None:
raise ImportError("The `peft` package is not installed, please run: `pip install peft`.")
from peft import AutoPeftModelForCausalLM
from peft.config import PeftConfigMixin

base_model_name = PeftConfigMixin.from_pretrained(
peft_model,
use_auth_token=hf_access_token,
).base_model_name_or_path

base_model_is_local = Path(base_model_name).is_dir()
if not base_model_is_local:
# Check if the base model path to a remote repository on the HF Hub exists
from huggingface_hub import list_repo_files

try:
list_repo_files(base_model_name)
base_model_is_remote = True
except Exception:
base_model_is_remote = False

if base_model_is_local or base_model_is_remote:
model = AutoPeftModelForCausalLM.from_pretrained(peft_model, torch_dtype=model_dtype, low_cpu_mem_usage=True,
use_auth_token=hf_access_token)
else:
# Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model
print(
f"The base model `{base_model_name}` of the LoRA configuration associated"
f" to `{peft_model}` does not exist locally or remotely. Using "
f"`--model_name_or_path {model_name}` as a fall back for the base model."
)
from peft import PeftModel

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, low_cpu_mem_usage=True,
use_auth_token=hf_access_token)
model = PeftModel.from_pretrained(model, peft_model, torch_dtype=model_dtype, low_cpu_mem_usage=True,
use_auth_token=hf_access_token)

return model.merge_and_unload()

def load_model(
model_name,
tokenizer_name,
@@ -345,10 +399,6 @@ def load_model(
MODELS[model_name]["tokenizer"] = tokenizer
print("Optimized Model loaded.")
return

if peft_path and device == "hpu" and use_deepspeed and load_to_meta:
print("PEFT could not work in deepspeed sharded checkpt loading mode, set load_to_meta to False")
load_to_meta = False

if device == "hpu" and use_deepspeed and load_to_meta:
with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"):
Expand Down Expand Up @@ -479,7 +529,7 @@ def load_model(
print("Optimized Model loaded.")
return
if device == "hpu":
if peft_path:
if peft_path and not (use_deepspeed and load_to_meta):
from peft import PeftModel
model = PeftModel.from_pretrained(model, peft_path)
model = model.to(torch.bfloat16)
@@ -495,6 +545,7 @@ def load_model(
model = init_deepspeed_inference(
model=model,
model_name_or_path=model_name,
peft_path=peft_path,
use_hpu_graphs=use_hpu_graphs,
is_meta=load_to_meta,
token=hf_access_token,
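For reference, the heart of this change is the new `peft_model` helper above: in DeepSpeed sharded (meta) mode, local rank 0 merges the LoRA adapter into its base model and writes the merged weights to a temporary directory, which `write_checkpoints_json` then hands to DeepSpeed. A simplified, standalone sketch of that merge step (the adapter path below is a placeholder):

```python
import torch
from peft import AutoPeftModelForCausalLM

adapter_path = "<peft_model_output_folder>"            # placeholder: your PEFT/LoRA output
merged_dir = "/tmp/text_generation_merged_peft_model"  # same temp dir used in this PR

# Load the base model with the adapter attached, fold the adapter weights into
# the base model, and save a plain checkpoint that DeepSpeed can shard later.
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
)
model = model.merge_and_unload()
model.save_pretrained(merged_dir)
```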
12 changes: 12 additions & 0 deletions workflows/chatbot/inference/README.md
@@ -107,6 +107,18 @@ python ../utils/gaudi_spawn.py --use_deepspeed --world_size 8 generate.py \

Habana supports HPU graph mode for inference speedup, which is available for bloom, gpt2, opt, gptj, gpt_neox, mpt, llama. You can use the parameter `use_hpu_graphs` to speed up the inference.
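For example, a single-card run with HPU graphs enabled could look like the following (an illustrative sketch; the model path is a placeholder, and the flags mirror the surrounding DeepSpeed examples):

```bash
python generate.py \
--base_model_path "meta-llama/Llama-2-7b-chat-hf" \
--habana \
--use_hpu_graphs \
--use_kv_cache \
--task chat \
--instructions "Transform the following sentence into one that shows contrast. The tree is rotten."
```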

You can use `--peft_model_path` to apply your PEFT fine-tuned model during generation.

```bash
python ../utils/gaudi_spawn.py --use_deepspeed --world_size 8 generate.py \
--base_model_path "meta-llama/Llama-2-70b-chat-hf" \
--peft_model_path <peft_model_output_folder> \
--habana \
--use_hpu_graphs \
--use_kv_cache \
--task chat \
--instructions "Transform the following sentence into one that shows contrast. The tree is rotten."
```

# Additional Notes
