This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

add peft model support in deepspeed sharded mode #884

Merged: 3 commits, Dec 9, 2023
65 changes: 58 additions & 7 deletions intel_extension_for_transformers/neural_chat/models/model_utils.py
@@ -46,6 +46,7 @@
WeightOnlyQuantConfig,
BitsAndBytesConfig
)
import shutil

if is_deepspeed_available():
import deepspeed # pylint: disable=E0401
@@ -232,11 +233,19 @@ def import_deepspeed():
print("DeepSpeed is enabled.")


def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta, token=None):
def init_deepspeed_inference(model, model_name_or_path, peft_path, use_hpu_graphs, is_meta, token=None):
# Initialize the model
from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu # pylint: disable=E0401

world_size, rank, local_rank = initialize_distributed_hpu()
merged_model_dir = None
if peft_path and is_meta:
merged_model_dir = "/tmp/text_generation_merged_peft_model"
if local_rank == 0:
if Path(merged_model_dir).is_dir():
shutil.rmtree(merged_model_dir)
peft_model(model_name_or_path, peft_path, torch.bfloat16, token).save_pretrained(merged_model_dir)
torch.distributed.barrier()

model = model.eval()
ds_inference_kwargs = {"dtype": torch.bfloat16}
@@ -245,7 +254,8 @@ def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta,
# Make sure all devices/nodes have access to the model checkpoints
if is_meta:
checkpoints_json = "checkpoints.json"
write_checkpoints_json(model_name_or_path, local_rank, checkpoints_json, token)
write_checkpoints_json(merged_model_dir if merged_model_dir is not None else model_name_or_path, local_rank,
checkpoints_json, token)

torch.distributed.barrier()

@@ -256,6 +266,50 @@ def init_deepspeed_inference(model, model_name_or_path, use_hpu_graphs, is_meta,
model = deepspeed.init_inference(model, **ds_inference_kwargs)
return model.module


def peft_model(model_name, peft_model, model_dtype, hf_access_token=None):
import importlib.util

if importlib.util.find_spec("peft") is None:
raise ImportError("The `peft` package is not installed, please run: `pip install peft`.")
from peft import AutoPeftModelForCausalLM
from peft.config import PeftConfigMixin

base_model_name = PeftConfigMixin.from_pretrained(
peft_model,
use_auth_token=hf_access_token,
).base_model_name_or_path

base_model_is_local = Path(base_model_name).is_dir()
if not base_model_is_local:
# Check if the base model path to a remote repository on the HF Hub exists
from huggingface_hub import list_repo_files

try:
list_repo_files(base_model_name)
base_model_is_remote = True
except Exception:
base_model_is_remote = False

if base_model_is_local or base_model_is_remote:
model = AutoPeftModelForCausalLM.from_pretrained(peft_model, torch_dtype=model_dtype, low_cpu_mem_usage=True,
use_auth_token=hf_access_token)
else:
# Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model
print(
f"The base model `{base_model_name}` of the LoRA configuration associated"
f" to `{peft_model}` does not exist locally or remotely. Using "
f"`--model_name_or_path {model_name}` as a fall back for the base model."
)
from peft import PeftModel

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, low_cpu_mem_usage=True,
use_auth_token=hf_access_token)
model = PeftModel.from_pretrained(model, peft_model, torch_dtype=model_dtype, low_cpu_mem_usage=True,
use_auth_token=hf_access_token)

return model.merge_and_unload()

def load_model(
model_name,
tokenizer_name,
@@ -345,10 +399,6 @@ def load_model(
MODELS[model_name]["tokenizer"] = tokenizer
print("Optimized Model loaded.")
return

if peft_path and device == "hpu" and use_deepspeed and load_to_meta:
print("PEFT could not work in deepspeed sharded checkpt loading mode, set load_to_meta to False")
load_to_meta = False

if device == "hpu" and use_deepspeed and load_to_meta:
with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"):
Expand Down Expand Up @@ -479,7 +529,7 @@ def load_model(
print("Optimized Model loaded.")
return
if device == "hpu":
if peft_path:
if peft_path and not (use_deepspeed and load_to_meta):
from peft import PeftModel
model = PeftModel.from_pretrained(model, peft_path)
model = model.to(torch.bfloat16)
@@ -495,6 +545,7 @@ def load_model(
model = init_deepspeed_inference(
model=model,
model_name_or_path=model_name,
peft_path=peft_path,
use_hpu_graphs=use_hpu_graphs,
is_meta=load_to_meta,
token=hf_access_token,
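For reference, the heart of this change is the new `peft_model` helper above: in DeepSpeed sharded (meta) mode, local rank 0 merges the LoRA adapter into its base model and writes the merged weights to a temporary directory, which `write_checkpoints_json` then hands to DeepSpeed. A simplified, standalone sketch of that merge step (the adapter path below is a placeholder):

```python
import torch
from peft import AutoPeftModelForCausalLM

adapter_path = "<peft_model_output_folder>"            # placeholder: your PEFT/LoRA output
merged_dir = "/tmp/text_generation_merged_peft_model"  # same temp dir used in this PR

# Load the base model with the adapter attached, fold the adapter weights into
# the base model, and save a plain checkpoint that DeepSpeed can shard later.
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
)
model = model.merge_and_unload()
model.save_pretrained(merged_dir)
```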
12 changes: 12 additions & 0 deletions workflows/chatbot/inference/README.md
@@ -107,6 +107,18 @@ python ../utils/gaudi_spawn.py --use_deepspeed --world_size 8 generate.py \

Habana supports HPU graph mode for inference speedup, which is available for bloom, gpt2, opt, gptj, gpt_neox, mpt, llama. You can use the parameter `use_hpu_graphs` to speed up the inference.
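For example, a single-card run with HPU graphs enabled could look like the following (an illustrative sketch; the model path is a placeholder, and the flags mirror the surrounding DeepSpeed examples):

```bash
python generate.py \
--base_model_path "meta-llama/Llama-2-7b-chat-hf" \
--habana \
--use_hpu_graphs \
--use_kv_cache \
--task chat \
--instructions "Transform the following sentence into one that shows contrast. The tree is rotten."
```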

You can use `--peft_model_path` to apply your PEFT fine-tuned model during generation.

```bash
python ../utils/gaudi_spawn.py --use_deepspeed --world_size 8 generate.py \
--base_model_path "meta-llama/Llama-2-70b-chat-hf" \
--peft_model_path <peft_model_output_folder> \
--habana \
--use_hpu_graphs \
--use_kv_cache \
--task chat \
--instructions "Transform the following sentence into one that shows contrast. The tree is rotten."
```

# Additional Notes
