diff --git a/examples/sdk/train_api.py b/examples/sdk/train_api.py
new file mode 100644
index 0000000000..f6077a132b
--- /dev/null
+++ b/examples/sdk/train_api.py
@@ -0,0 +1,74 @@
+from kubeflow.training.api.training_client import TrainingClient
+from kubeflow.storage_init_container.hugging_face import (
+    HuggingFaceModelParams,
+    HuggingFaceTrainParams,
+    HfDatasetParams,
+    TRANSFORMER_TYPES,
+)
+from kubeflow.storage_init_container.s3 import S3DatasetParams
+from peft import LoraConfig
+from transformers import TrainingArguments
+import json
+
+client = TrainingClient(
+    config_file="/Users/deepanker/Downloads/deepanker-test-kubectl.cfg"
+)
+
+client.train(
+    name="deepanker-test",
+    namespace="test",
+    num_workers=2,
+    num_procs_per_worker=0,
+    storage_config={
+        "size": "10Gi",
+        "storage_class": "deepanker-test",
+    },
+    model_provider_parameters=HuggingFaceModelParams(
+        model_uri="hf://Jedalc/codeparrot-gp2-finetune",
+        transformer_type=TRANSFORMER_TYPES.AutoModelForCausalLM,
+    ),
+    dataset_provider_parameters=HfDatasetParams(
+        repo_id="imdatta0/ultrachat_10k",
+        access_token="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",
+    ),
+    # dataset_provider_parameters=S3DatasetParams(endpoint_url="http://10.117.63.3", bucket_name="deepanker-test", file_key="list_roles_response.txt", access_key="qEMHyz8wNwLpUWkvfZmQZrj60TE6zX4p", secret_key="qIp_QNLPKI0LJ5X0F8NrypoSMSsw_Gfe" ),
+    train_parameters=HuggingFaceTrainParams(
+        lora_config=LoraConfig(
+            r=8,
+            lora_alpha=8,
+            target_modules=["c_attn", "c_proj", "w1", "w2"],
+            layers_to_transform=list(range(30, 40)),
+            # layers_pattern=['lm_head'],
+            lora_dropout=0.1,
+            bias="none",
+            task_type="CAUSAL_LM",
+        ),
+        training_parameters=TrainingArguments(
+            num_train_epochs=2,
+            per_device_train_batch_size=1,
+            gradient_accumulation_steps=1,
+            gradient_checkpointing=True,
+            warmup_steps=0.01,
+            # max_steps=50, #20,
+            learning_rate=1,
+            lr_scheduler_type="cosine",
+            bf16=False,
+            logging_steps=0.01,
+            output_dir="",
+            optim=f"paged_adamw_32bit",
+            save_steps=0.01,
+            save_total_limit=3,
+            disable_tqdm=False,
+            resume_from_checkpoint=True,
+            remove_unused_columns=True,
+            evaluation_strategy="steps",
+            eval_steps=0.01,
+            # eval_accumulation_steps=1,
+            per_device_eval_batch_size=1,
+            # load_best_model_at_end=True,
+            report_to="wandb",
+            run_name=f"{1}",
+        ),
+    ),
+    resources_per_worker={"gpu": 0, "cpu": 8, "memory": "8Gi"},
+)
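The HuggingFaceTrainParams above are handed to the training container as command-line arguments; the updated hf_llm_training.py further below rebuilds the LoRA configuration from a JSON string via LoraConfig(**json.loads(lora_config)). A minimal sketch of a value that argument accepts, using peft's LoraConfig and the standard-library json module; how the SDK itself serializes HuggingFaceTrainParams is not shown in this diff and is assumed here:

import json

from peft import LoraConfig

# A JSON value that LoraConfig(**json.loads(...)) in hf_llm_training.py accepts.
# The keys mirror the LoraConfig fields used in examples/sdk/train_api.py.
lora_config_json = json.dumps(
    {
        "r": 8,
        "lora_alpha": 8,
        "target_modules": ["c_attn", "c_proj", "w1", "w2"],
        "lora_dropout": 0.1,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
)

# The training script reverses this step before calling get_peft_model().
lora_config = LoraConfig(**json.loads(lora_config_json))
print(lora_config)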
diff --git a/sdk/python/kubeflow/trainer/hf_dockerfile b/sdk/python/kubeflow/trainer/hf_dockerfile
index f0f2ca16bf..c7671aefb9 100644
--- a/sdk/python/kubeflow/trainer/hf_dockerfile
+++ b/sdk/python/kubeflow/trainer/hf_dockerfile
@@ -5,11 +5,14 @@ FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
 WORKDIR /app
 
 # Copy the Python package and its source code into the container
-COPY . .
+COPY . /app
+
+# Copy the requirements.txt file into the container
+COPY requirements.txt /app/requirements.txt
 
 # Install any needed packages specified in requirements.txt
-RUN pip install --no-cache-dir -r /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
 # Run storage.py when the container launches
-ENTRYPOINT ["python", "/app/hf_llm_training.py"]
+ENTRYPOINT ["torchrun", "hf_llm_training.py"]
\ No newline at end of file
diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index 827a1e2ed2..6999dc483a 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -1,4 +1,5 @@
 import argparse
+import transformers
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -10,22 +11,39 @@
 import torch
 from datasets import load_dataset
 from peft import LoraConfig, get_peft_model
+from urllib.parse import urlparse
+import os
+import json
 
 
-def setup_model_and_tokenizer(token_dir, model_dir):
+def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
     # Set up the model and tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(
-        token_dir, use_fast=False, trust_remote_code=True
+
+    parsed_uri = urlparse(model_uri)
+    model_name = parsed_uri.netloc + parsed_uri.path
+    transformer_type_class = getattr(transformers, transformer_type)
+
+    model = transformer_type_class.from_pretrained(
+        pretrained_model_name_or_path=model_name,
+        cache_dir=model_dir,
+        local_files_only=True,
+        device_map="auto",
     )
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.add_pad_token = True
-    model = AutoModelForCausalLM.from_pretrained(
-        model_dir,
+
+    # print(model)
+
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        pretrained_model_name_or_path=model_name,
+        cache_dir=model_dir,
+        local_files_only=True,
         device_map="auto",
-        trust_remote_code=True,
     )
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.add_pad_token = True
+
+    # print(tokenizer)
+
     # Freeze model parameters
     for param in model.parameters():
         param.requires_grad = False
 
@@ -33,15 +51,14 @@ def setup_model_and_tokenizer(token_dir, model_dir):
     return model, tokenizer
 
 
-def load_and_preprocess_data(dataset_dir, tokenizer):
+def load_and_preprocess_data(dataset_name, dataset_dir):
     # Load and preprocess the dataset
-    train_data = load_dataset(dataset_dir, split="train").map(
-        lambda x: tokenizer(x["text"]), batched=True
-    )
-    train_data = train_data.train_test_split(shuffle=True, test_size=0.1)
-
+    print("loading dataset")
+    dataset = load_dataset(dataset_name, cache_dir=dataset_dir)
+    train_data = dataset["train"]
+    # print(train_data)
     try:
-        eval_data = load_dataset(dataset_dir, split="eval")
+        eval_data = dataset["eval"]
     except Exception as err:
         eval_data = None
 
@@ -50,7 +67,8 @@ def load_and_preprocess_data(dataset_dir, tokenizer):
 
 def setup_peft_model(model, lora_config):
     # Set up the PEFT model
-    lora_config = LoraConfig(**lora_config)
+    lora_config = LoraConfig(**json.loads(lora_config))
+    print(lora_config)
     model = get_peft_model(model, lora_config)
     return model
 
@@ -77,18 +95,27 @@
 def parse_arguments():
     parser = argparse.ArgumentParser(
         description="Script for training a model with PEFT configuration."
     )
+
+    parser.add_argument("--model_uri", help="model uri")
+    parser.add_argument("--transformer_type", help="model transformer type")
     parser.add_argument("--model_dir", help="directory containing model")
-    parser.add_argument("--token_dir", help="directory containing tokenizer")
     parser.add_argument("--dataset_dir", help="directory contaning dataset")
-    parser.add_argument("--peft_config", help="peft_config")
-    parser.add_argument("--train_parameters", help="hugging face training parameters")
+    parser.add_argument("--dataset_name", help="dataset name")
+    parser.add_argument("--lora_config", help="lora_config")
+    parser.add_argument(
+        "--training_parameters", help="hugging face training parameters"
+    )
     return parser.parse_args()
 
 
 if __name__ == "__main__":
     args = parse_arguments()
-    model, tokenizer = setup_model_and_tokenizer(args.token_dir, args.model_dir)
-    train_data, eval_data = load_and_preprocess_data(args.dataset_dir, tokenizer)
-    model = setup_peft_model(model, args.peft_config)
-    train_model(model, train_data, eval_data, tokenizer, args.train_parameters)
+    model, tokenizer = setup_model_and_tokenizer(
+        args.model_uri, args.transformer_type, args.model_dir
+    )
+    train_data, eval_data = load_and_preprocess_data(
+        args.dataset_name, args.dataset_dir
+    )
+    model = setup_peft_model(model, args.lora_config)
+    train_model(model, train_data, eval_data, tokenizer, args.training_parameters)
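For reference, the new setup_model_and_tokenizer resolves the Hugging Face repo id from the hf:// URI with urlparse and picks the transformer class by name via getattr(transformers, ...). A quick standalone check of that resolution logic, a sketch only: the URI value comes from the example above, and it is assumed that --transformer_type arrives as the class name string (e.g. "AutoModelForCausalLM"); no model download happens here.

from urllib.parse import urlparse

import transformers

# Mirror the URI handling in setup_model_and_tokenizer().
model_uri = "hf://Jedalc/codeparrot-gp2-finetune"
parsed_uri = urlparse(model_uri)
model_name = parsed_uri.netloc + parsed_uri.path
assert model_name == "Jedalc/codeparrot-gp2-finetune"

# Mirror the transformer_type lookup used before from_pretrained() is called.
transformer_type_class = getattr(transformers, "AutoModelForCausalLM")
print(transformer_type_class)  # the transformers AutoModelForCausalLM class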