
add peft model support in deepspeed sharded mode (#884)
sywangyi authored Dec 9, 2023
1 parent 33defb7 commit 370ca35
Showing 2 changed files with 72 additions and 6 deletions.
66 changes: 60 additions & 6 deletions intel_extension_for_transformers/neural_chat/models/model_utils.py
@@ -55,6 +55,9 @@
WeightOnlyQuantConfig,
BitsAndBytesConfig
)

import shutil

if is_deepspeed_available():
import deepspeed # pylint: disable=E0401

@@ -240,11 +243,19 @@ def import_deepspeed():
logging.info("DeepSpeed is enabled.")


def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta, token=None):
def init_deepspeed_inference(model, model_name_or_path, peft_path, use_hpu_graphs, is_meta, token=None):
# Initialize the model
from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu # pylint: disable=E0401

world_size, rank, local_rank = initialize_distributed_hpu()
merged_model_dir = None
if peft_path and is_meta:
merged_model_dir = "/tmp/text_generation_merged_peft_model"
if local_rank == 0:
if Path(merged_model_dir).is_dir():
shutil.rmtree(merged_model_dir)
peft_model(model_name_or_path, peft_path, torch.bfloat16, token).save_pretrained(merged_model_dir)
torch.distributed.barrier()

model = model.eval()
ds_inference_kwargs = {"dtype": torch.bfloat16}
@@ -253,7 +264,8 @@ def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta,
# Make sure all devices/nodes have access to the model checkpoints
if is_meta:
checkpoints_json = "checkpoints.json"
write_checkpoints_json(model_name_or_path, local_rank, checkpoints_json, token)
write_checkpoints_json(merged_model_dir if merged_model_dir is not None else model_name_or_path, local_rank,
checkpoints_json, token)

torch.distributed.barrier()

@@ -264,6 +276,50 @@ def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta,
model = deepspeed.init_inference(model, **ds_inference_kwargs)
return model.module


def peft_model(model_name, peft_model, model_dtype, hf_access_token=None):
import importlib.util

if importlib.util.find_spec("peft") is None:
raise ImportError("The `peft` package is not installed, please run: `pip install peft`.")
from peft import AutoPeftModelForCausalLM
from peft.config import PeftConfigMixin

base_model_name = PeftConfigMixin.from_pretrained(
peft_model,
use_auth_token=hf_access_token,
).base_model_name_or_path

base_model_is_local = Path(base_model_name).is_dir()
if not base_model_is_local:
# Check if the base model path to a remote repository on the HF Hub exists
from huggingface_hub import list_repo_files

try:
list_repo_files(base_model_name)
base_model_is_remote = True
except Exception:
base_model_is_remote = False

if base_model_is_local or base_model_is_remote:
model = AutoPeftModelForCausalLM.from_pretrained(peft_model, torch_dtype=model_dtype, low_cpu_mem_usage=True,
use_auth_token=hf_access_token)
else:
# Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model
print(
f"The base model `{base_model_name}` of the LoRA configuration associated"
f" to `{peft_model}` does not exist locally or remotely. Using "
f"`--model_name_or_path {model_name}` as a fall back for the base model."
)
from peft import PeftModel

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, low_cpu_mem_usage=True,
use_auth_token=hf_access_token)
model = PeftModel.from_pretrained(model, peft_model, torch_dtype=model_dtype, low_cpu_mem_usage=True,
use_auth_token=hf_access_token)

return model.merge_and_unload()

def load_model(
model_name,
tokenizer_name,
@@ -376,9 +432,6 @@ def load_model(
logging.info("Optimized Model loaded.")
return

if peft_path and device == "hpu" and use_deepspeed and load_to_meta:
logging.warning("PEFT could not work in deepspeed sharded checkpt loading mode, set load_to_meta to False")
load_to_meta = False
if device == "hpu" and use_deepspeed and load_to_meta:
with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"):
model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
@@ -500,7 +553,7 @@ def load_model(
model.generation_config.eos_token_id = tokenizer.eos_token_id

if device == "hpu":
if peft_path:
if peft_path and not (use_deepspeed and load_to_meta):
from peft import PeftModel
model = PeftModel.from_pretrained(model, peft_path)
model = model.to(torch.bfloat16)
@@ -516,6 +569,7 @@ def load_model(
model = init_deepspeed_inference(
model=model,
model_name_or_path=model_name,
peft_path=peft_path,
use_hpu_graphs=use_hpu_graphs,
is_meta=load_to_meta,
token=hf_access_token,
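The core of this change is a merge-once-then-synchronize pattern: rank 0 merges the PEFT adapter into a full checkpoint under `/tmp`, every rank waits at a barrier, and the merged directory is then passed to `write_checkpoints_json` in place of the original model path. The snippet below is a minimal, standalone sketch of that pattern, not the repository's code; the function name `merge_adapter_for_sharded_load` and the hard-coded bfloat16 dtype are illustrative assumptions.

```python
# Minimal sketch of the rank-0 merge-then-barrier pattern, assuming
# torch.distributed is already initialized; names and the fixed dtype
# are illustrative, not the repository's API.
import shutil
from pathlib import Path

import torch
import torch.distributed as dist
from peft import AutoPeftModelForCausalLM

MERGED_DIR = "/tmp/text_generation_merged_peft_model"  # same scratch dir as the patch


def merge_adapter_for_sharded_load(peft_path: str, local_rank: int) -> str:
    """Merge a LoRA adapter into a full checkpoint once, then make every rank wait."""
    if local_rank == 0:
        if Path(MERGED_DIR).is_dir():
            shutil.rmtree(MERGED_DIR)  # drop a stale merge from a previous run
        model = AutoPeftModelForCausalLM.from_pretrained(
            peft_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
        )
        model.merge_and_unload().save_pretrained(MERGED_DIR)
    dist.barrier()  # other ranks read MERGED_DIR only after rank 0 has finished writing
    return MERGED_DIR  # use this instead of the base model path when building checkpoints.json
```

With the merged directory in place, every worker can build its `checkpoints.json` from a plain Hugging Face checkpoint, which is what allows meta-device (sharded) loading to work with PEFT adapters.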
12 changes: 12 additions & 0 deletions workflows/chatbot/inference/README.md
@@ -107,6 +107,18 @@ python ../utils/gaudi_spawn.py --use_deepspeed --world_size 8 generate.py \

Habana supports HPU graph mode for inference speedup, which is available for bloom, gpt2, opt, gptj, gpt_neox, mpt, llama. You can use the parameter `use_hpu_graphs` to speed up the inference.

You can use `--peft_model_path` to apply your PEFT fine-tuned model during generation.

```bash
python ../utils/gaudi_spawn.py --use_deepspeed --world_size 8 generate.py \
--base_model_path "meta-llama/Llama-2-70b-chat-hf" \
--peft_model_path <peft_model_output_folder> \
--habana \
--use_hpu_graphs \
--use_kv_cache \
--task chat \
--instructions "Transform the following sentence into one that shows contrast. The tree is rotten."
```

# Additional Notes

