fixing trainer code
deepanker13 committed Jan 10, 2024
1 parent 5931a49 commit cf6419e
Showing 3 changed files with 130 additions and 26 deletions.
74 changes: 74 additions & 0 deletions examples/sdk/train_api.py
@@ -0,0 +1,74 @@
from kubeflow.training.api.training_client import TrainingClient
from kubeflow.storage_init_container.hugging_face import (
    HuggingFaceModelParams,
    HuggingFaceTrainParams,
    HfDatasetParams,
    TRANSFORMER_TYPES,
)
from kubeflow.storage_init_container.s3 import S3DatasetParams
from peft import LoraConfig
from transformers import TrainingArguments
import json

client = TrainingClient(
    config_file="/Users/deepanker/Downloads/deepanker-test-kubectl.cfg"
)

client.train(
    name="deepanker-test",
    namespace="test",
    num_workers=2,
    num_procs_per_worker=0,
    storage_config={
        "size": "10Gi",
        "storage_class": "deepanker-test",
    },
    model_provider_parameters=HuggingFaceModelParams(
        model_uri="hf://Jedalc/codeparrot-gp2-finetune",
        transformer_type=TRANSFORMER_TYPES.AutoModelForCausalLM,
    ),
    dataset_provider_parameters=HfDatasetParams(
        repo_id="imdatta0/ultrachat_10k",
        access_token="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",
    ),
    # dataset_provider_parameters=S3DatasetParams(endpoint_url="http://10.117.63.3", bucket_name="deepanker-test", file_key="list_roles_response.txt", access_key="qEMHyz8wNwLpUWkvfZmQZrj60TE6zX4p", secret_key="qIp_QNLPKI0LJ5X0F8NrypoSMSsw_Gfe" ),
    train_parameters=HuggingFaceTrainParams(
        lora_config=LoraConfig(
            r=8,
            lora_alpha=8,
            target_modules=["c_attn", "c_proj", "w1", "w2"],
            layers_to_transform=list(range(30, 40)),
            # layers_pattern=['lm_head'],
            lora_dropout=0.1,
            bias="none",
            task_type="CAUSAL_LM",
        ),
        training_parameters=TrainingArguments(
            num_train_epochs=2,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=1,
            gradient_checkpointing=True,
            warmup_steps=0.01,
            # max_steps=50, #20,
            learning_rate=1,
            lr_scheduler_type="cosine",
            bf16=False,
            logging_steps=0.01,
            output_dir="",
            optim=f"paged_adamw_32bit",
            save_steps=0.01,
            save_total_limit=3,
            disable_tqdm=False,
            resume_from_checkpoint=True,
            remove_unused_columns=True,
            evaluation_strategy="steps",
            eval_steps=0.01,
            # eval_accumulation_steps=1,
            per_device_eval_batch_size=1,
            # load_best_model_at_end=True,
            report_to="wandb",
            run_name=f"{1}",
        ),
    ),
    resources_per_worker={"gpu": 0, "cpu": 8, "memory": "8Gi"},
)
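The commented-out S3DatasetParams line above packs the whole S3 configuration onto one line; spelled out, the S3-backed alternative to the Hugging Face dataset provider would look roughly like this (a sketch only, reusing the endpoint, bucket, and keys from that comment):

from kubeflow.storage_init_container.s3 import S3DatasetParams

# Values copied from the commented-out example above.
s3_dataset = S3DatasetParams(
    endpoint_url="http://10.117.63.3",
    bucket_name="deepanker-test",
    file_key="list_roles_response.txt",
    access_key="qEMHyz8wNwLpUWkvfZmQZrj60TE6zX4p",
    secret_key="qIp_QNLPKI0LJ5X0F8NrypoSMSsw_Gfe",
)
# Then pass it instead of HfDatasetParams:
# client.train(..., dataset_provider_parameters=s3_dataset, ...)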
9 changes: 6 additions & 3 deletions sdk/python/kubeflow/trainer/hf_dockerfile
@@ -5,11 +5,14 @@ FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
 WORKDIR /app
 
 # Copy the Python package and its source code into the container
-COPY . .
+COPY . /app
 
+# Copy the requirements.txt file into the container
+COPY requirements.txt /app/requirements.txt
+
 # Install any needed packages specified in requirements.txt
-RUN pip install --no-cache-dir -r /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
 # Run storage.py when the container launches
-ENTRYPOINT ["python", "/app/hf_llm_training.py"]
+ENTRYPOINT ["torchrun", "hf_llm_training.py"]

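Switching the ENTRYPOINT from python to torchrun means each worker container starts the script through the PyTorch distributed launcher, which exports the usual rendezvous environment variables. A minimal sketch of what the training code (or the Hugging Face Trainer underneath it) can rely on when launched this way, assuming a standard torchrun invocation:

import os

# torchrun sets these for every worker process; the Hugging Face Trainer reads
# LOCAL_RANK (among others) to configure distributed training.
rank = int(os.environ.get("RANK", "0"))              # global rank of this process
world_size = int(os.environ.get("WORLD_SIZE", "1"))  # total number of processes
local_rank = int(os.environ.get("LOCAL_RANK", "0"))  # rank within this node

print(f"worker {rank}/{world_size}, local_rank={local_rank}")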
73 changes: 50 additions & 23 deletions sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -1,4 +1,5 @@
 import argparse
+import transformers
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -10,38 +11,54 @@
 import torch
 from datasets import load_dataset
 from peft import LoraConfig, get_peft_model
+from urllib.parse import urlparse
+import os
+import json
 
 
-def setup_model_and_tokenizer(token_dir, model_dir):
+def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
     # Set up the model and tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(
-        token_dir, use_fast=False, trust_remote_code=True
+
+    parsed_uri = urlparse(model_uri)
+    model_name = parsed_uri.netloc + parsed_uri.path
+    transformer_type_class = getattr(transformers, transformer_type)
+
+    model = transformer_type_class.from_pretrained(
+        pretrained_model_name_or_path=model_name,
+        cache_dir=model_dir,
+        local_files_only=True,
+        device_map="auto",
     )
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.add_pad_token = True
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_dir,
+    # print(model)
+
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        pretrained_model_name_or_path=model_name,
         cache_dir=model_dir,
         local_files_only=True,
         device_map="auto",
+        trust_remote_code=True,
     )
 
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.add_pad_token = True
+
+    # print(tokenizer)
+
     # Freeze model parameters
     for param in model.parameters():
         param.requires_grad = False
 
     return model, tokenizer
 
 
-def load_and_preprocess_data(dataset_dir, tokenizer):
+def load_and_preprocess_data(dataset_name, dataset_dir):
     # Load and preprocess the dataset
-    train_data = load_dataset(dataset_dir, split="train").map(
-        lambda x: tokenizer(x["text"]), batched=True
-    )
-    train_data = train_data.train_test_split(shuffle=True, test_size=0.1)
-
+    print("loading dataset")
+    dataset = load_dataset(dataset_name, cache_dir=dataset_dir)
+    train_data = dataset["train"]
+    # print(train_data)
     try:
-        eval_data = load_dataset(dataset_dir, split="eval")
+        eval_data = dataset["eval"]
     except Exception as err:
         eval_data = None
+
@@ -50,7 +67,8 @@ def load_and_preprocess_data(dataset_dir, tokenizer):
 
 def setup_peft_model(model, lora_config):
     # Set up the PEFT model
-    lora_config = LoraConfig(**lora_config)
+    lora_config = LoraConfig(**json.loads(lora_config))
+    print(lora_config)
     model = get_peft_model(model, lora_config)
     return model
 
@@ -77,18 +95,27 @@ def parse_arguments():
     parser = argparse.ArgumentParser(
         description="Script for training a model with PEFT configuration."
     )
+
+    parser.add_argument("--model_uri", help="model uri")
+    parser.add_argument("--transformer_type", help="model transformer type")
     parser.add_argument("--model_dir", help="directory containing model")
-    parser.add_argument("--token_dir", help="directory containing tokenizer")
     parser.add_argument("--dataset_dir", help="directory contaning dataset")
-    parser.add_argument("--peft_config", help="peft_config")
-    parser.add_argument("--train_parameters", help="hugging face training parameters")
+    parser.add_argument("--dataset_name", help="dataset name")
+    parser.add_argument("--lora_config", help="lora_config")
+    parser.add_argument(
+        "--training_parameters", help="hugging face training parameters"
+    )
 
     return parser.parse_args()
 
 
 if __name__ == "__main__":
     args = parse_arguments()
-    model, tokenizer = setup_model_and_tokenizer(args.token_dir, args.model_dir)
-    train_data, eval_data = load_and_preprocess_data(args.dataset_dir, tokenizer)
-    model = setup_peft_model(model, args.peft_config)
-    train_model(model, train_data, eval_data, tokenizer, args.train_parameters)
+    model, tokenizer = setup_model_and_tokenizer(
+        args.model_uri, args.transformer_type, args.model_dir
+    )
+    train_data, eval_data = load_and_preprocess_data(
+        args.dataset_name, args.dataset_dir
+    )
+    model = setup_peft_model(model, args.lora_config)
+    train_model(model, train_data, eval_data, tokenizer, args.training_parameters)
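The --lora_config and --training_parameters flags reach the script as strings; setup_peft_model above decodes the LoRA side with json.loads before handing it to LoraConfig. A small illustration of that round trip, using a hypothetical JSON value that mirrors the fields set in examples/sdk/train_api.py:

import json
from peft import LoraConfig

# Hypothetical --lora_config value, mirroring the example client script.
lora_config_arg = json.dumps(
    {"r": 8, "lora_alpha": 8, "lora_dropout": 0.1, "bias": "none", "task_type": "CAUSAL_LM"}
)

# This mirrors what setup_peft_model does with the flag.
lora_config = LoraConfig(**json.loads(lora_config_arg))
print(lora_config.r, lora_config.task_type)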
