From a2fc0cb98c2832ae0927836dde6e76c4181df4a7 Mon Sep 17 00:00:00 2001
From: deepanker13
Date: Tue, 12 Dec 2023 18:32:34 +0530
Subject: [PATCH 01/14] adding training image creation code

---
 .github/workflows/publish-sdk-images.yaml  | 23 +++++
 .../training/training_container/Dockerfile | 17 ++++
 .../training_container/hf_llm_training.py  | 86 +++++++++++++++++++
 .../training_container/requirements.txt    |  3 +
 4 files changed, 129 insertions(+)
 create mode 100644 .github/workflows/publish-sdk-images.yaml
 create mode 100644 sdk/python/kubeflow/training/training_container/Dockerfile
 create mode 100644 sdk/python/kubeflow/training/training_container/hf_llm_training.py
 create mode 100644 sdk/python/kubeflow/training/training_container/requirements.txt

diff --git a/.github/workflows/publish-sdk-images.yaml b/.github/workflows/publish-sdk-images.yaml
new file mode 100644
index 0000000000..0fe412d712
--- /dev/null
+++ b/.github/workflows/publish-sdk-images.yaml
@@ -0,0 +1,23 @@
+name: Publish Training Operator SDK Images
+
+on:
+  - pull_request
+
+jobs:
+  core:
+    name: Publish Image
+    uses: ./.github/workflows/build-and-publish-images.yaml
+    with:
+      component-name: ${{ matrix.component-name }}
+      platforms: linux/amd64,linux/arm64,linux/ppc64le
+      dockerfile: ${{ matrix.dockerfile }}
+    secrets:
+      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
+      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - component-name: train-api-training-image
+            dockerfile: sdk/python/kubeflow/training/training_container/Dockerfile
\ No newline at end of file
diff --git a/sdk/python/kubeflow/training/training_container/Dockerfile b/sdk/python/kubeflow/training/training_container/Dockerfile
new file mode 100644
index 0000000000..d80edcf2c2
--- /dev/null
+++ b/sdk/python/kubeflow/training/training_container/Dockerfile
@@ -0,0 +1,17 @@
+# Use an official Pytorch runtime as a parent image
+FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the Python package and its source code into the container
+ COPY . 
/app + + # Copy the requirements.txt file into the container + COPY requirements.txt /app/requirements.txt + + # Install any needed packages specified in requirements.txt + RUN pip install --no-cache-dir -r requirements.txt + + # Run storage.py when the container launches + ENTRYPOINT ["python", "hf_llm_training.py"] \ No newline at end of file diff --git a/sdk/python/kubeflow/training/training_container/hf_llm_training.py b/sdk/python/kubeflow/training/training_container/hf_llm_training.py new file mode 100644 index 0000000000..4c44243d51 --- /dev/null +++ b/sdk/python/kubeflow/training/training_container/hf_llm_training.py @@ -0,0 +1,86 @@ +import argparse +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + TrainingArguments, + DataCollatorForLanguageModeling, + Trainer +) +import torch +from datasets import load_dataset +from peft import LoraConfig, get_peft_model + + +def setup_model_and_tokenizer(token_dir, model_dir): + # Set up the model and tokenizer + tokenizer = AutoTokenizer.from_pretrained(token_dir, use_fast=False, trust_remote_code=True) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.add_pad_token = True + + model = AutoModelForCausalLM.from_pretrained( + model_dir, + device_map='auto', + trust_remote_code=True, + ) + + # Freeze model parameters + for param in model.parameters(): + param.requires_grad = False + + return model, tokenizer + +def load_and_preprocess_data(dataset_dir, tokenizer): + # Load and preprocess the dataset + train_data = load_dataset(dataset_dir, split='train').map(lambda x: tokenizer(x['text']), batched=True) + train_data = train_data.train_test_split(shuffle=True, test_size=200) + + try: + eval_data = load_dataset(dataset_dir, split='eval') + except Exception as err: + eval_data = None + + return train_data, eval_data + +def setup_peft_model(model, lora_config): + # Set up the PEFT model + lora_config = LoraConfig(**lora_config) + model = get_peft_model(model, lora_config) + return model + +def train_model(model, train_data, eval_data,tokenizer, train_params): + # Train the model + trainer = Trainer( + model=model, + train_dataset=train_data, + eval_dataset=eval_data, + tokenizer=tokenizer, + args=TrainingArguments( + **train_params, + data_collator=DataCollatorForLanguageModeling( + tokenizer, + pad_to_multiple_of=8, + return_tensors="pt", + mlm=False + ) + ) + ) + + trainer.train() + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Script for training a model with PEFT configuration.') + parser.add_argument('--model_dir', help='directory containing model') + parser.add_argument('--token_dir', help='directory containing tokenizer') + parser.add_argument('--dataset_dir', help='directory contaning dataset') + parser.add_argument('--peft_config', help='peft_config') + parser.add_argument('--train_params', help='hugging face training parameters') + + return parser.parse_args() + +if __name__ == "__main__": + args = parse_arguments() + model, tokenizer = setup_model_and_tokenizer(args.token_dir, args.model_dir) + train_data, eval_data = load_and_preprocess_data(args.dataset_dir, tokenizer) + model = setup_peft_model(model, args.peft_config) + train_model(model, train_data, eval_data, tokenizer, args) diff --git a/sdk/python/kubeflow/training/training_container/requirements.txt b/sdk/python/kubeflow/training/training_container/requirements.txt new file mode 100644 index 0000000000..e4c4b2b6c3 --- /dev/null +++ b/sdk/python/kubeflow/training/training_container/requirements.txt @@ -0,0 +1,3 @@ 
+peft==0.7.0 +datasets==2.15.0 +transformers==4.35.2 \ No newline at end of file From 3197a946c2719ae75be4cef476c975d03020dd8c Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Tue, 12 Dec 2023 18:37:29 +0530 Subject: [PATCH 02/14] reformatting code using black --- .../training_container/hf_llm_training.py | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/sdk/python/kubeflow/training/training_container/hf_llm_training.py b/sdk/python/kubeflow/training/training_container/hf_llm_training.py index 4c44243d51..6a8749570f 100644 --- a/sdk/python/kubeflow/training/training_container/hf_llm_training.py +++ b/sdk/python/kubeflow/training/training_container/hf_llm_training.py @@ -5,7 +5,7 @@ AutoConfig, TrainingArguments, DataCollatorForLanguageModeling, - Trainer + Trainer, ) import torch from datasets import load_dataset @@ -14,13 +14,15 @@ def setup_model_and_tokenizer(token_dir, model_dir): # Set up the model and tokenizer - tokenizer = AutoTokenizer.from_pretrained(token_dir, use_fast=False, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + token_dir, use_fast=False, trust_remote_code=True + ) tokenizer.pad_token = tokenizer.eos_token tokenizer.add_pad_token = True model = AutoModelForCausalLM.from_pretrained( model_dir, - device_map='auto', + device_map="auto", trust_remote_code=True, ) @@ -30,25 +32,30 @@ def setup_model_and_tokenizer(token_dir, model_dir): return model, tokenizer + def load_and_preprocess_data(dataset_dir, tokenizer): # Load and preprocess the dataset - train_data = load_dataset(dataset_dir, split='train').map(lambda x: tokenizer(x['text']), batched=True) + train_data = load_dataset(dataset_dir, split="train").map( + lambda x: tokenizer(x["text"]), batched=True + ) train_data = train_data.train_test_split(shuffle=True, test_size=200) try: - eval_data = load_dataset(dataset_dir, split='eval') + eval_data = load_dataset(dataset_dir, split="eval") except Exception as err: eval_data = None return train_data, eval_data + def setup_peft_model(model, lora_config): # Set up the PEFT model lora_config = LoraConfig(**lora_config) model = get_peft_model(model, lora_config) return model -def train_model(model, train_data, eval_data,tokenizer, train_params): + +def train_model(model, train_data, eval_data, tokenizer, train_params): # Train the model trainer = Trainer( model=model, @@ -58,26 +65,27 @@ def train_model(model, train_data, eval_data,tokenizer, train_params): args=TrainingArguments( **train_params, data_collator=DataCollatorForLanguageModeling( - tokenizer, - pad_to_multiple_of=8, - return_tensors="pt", - mlm=False + tokenizer, pad_to_multiple_of=8, return_tensors="pt", mlm=False ) - ) + ), ) trainer.train() + def parse_arguments(): - parser = argparse.ArgumentParser(description='Script for training a model with PEFT configuration.') - parser.add_argument('--model_dir', help='directory containing model') - parser.add_argument('--token_dir', help='directory containing tokenizer') - parser.add_argument('--dataset_dir', help='directory contaning dataset') - parser.add_argument('--peft_config', help='peft_config') - parser.add_argument('--train_params', help='hugging face training parameters') + parser = argparse.ArgumentParser( + description="Script for training a model with PEFT configuration." 
+ ) + parser.add_argument("--model_dir", help="directory containing model") + parser.add_argument("--token_dir", help="directory containing tokenizer") + parser.add_argument("--dataset_dir", help="directory contaning dataset") + parser.add_argument("--peft_config", help="peft_config") + parser.add_argument("--train_params", help="hugging face training parameters") return parser.parse_args() + if __name__ == "__main__": args = parse_arguments() model, tokenizer = setup_model_and_tokenizer(args.token_dir, args.model_dir) From 5c2d48ceccc7c3abcb6966ea23ee8cf8f903d7ba Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Wed, 13 Dec 2023 16:52:37 +0530 Subject: [PATCH 03/14] code review changes --- .github/workflows/publish-sdk-images.yaml | 4 ++-- .../training_container/Dockerfile => trainer/hf_dockerfile} | 3 ++- .../training_container => trainer}/hf_llm_training.py | 2 +- .../{training/training_container => trainer}/requirements.txt | 0 4 files changed, 5 insertions(+), 4 deletions(-) rename sdk/python/kubeflow/{training/training_container/Dockerfile => trainer/hf_dockerfile} (91%) rename sdk/python/kubeflow/{training/training_container => trainer}/hf_llm_training.py (99%) rename sdk/python/kubeflow/{training/training_container => trainer}/requirements.txt (100%) diff --git a/.github/workflows/publish-sdk-images.yaml b/.github/workflows/publish-sdk-images.yaml index 0fe412d712..86e707a0b5 100644 --- a/.github/workflows/publish-sdk-images.yaml +++ b/.github/workflows/publish-sdk-images.yaml @@ -19,5 +19,5 @@ jobs: fail-fast: false matrix: include: - - component-name: train-api-training-image - dockerfile: sdk/python/kubeflow/training/training_container/Dockerfile \ No newline at end of file + - component-name: train-api-hf-image + dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile diff --git a/sdk/python/kubeflow/training/training_container/Dockerfile b/sdk/python/kubeflow/trainer/hf_dockerfile similarity index 91% rename from sdk/python/kubeflow/training/training_container/Dockerfile rename to sdk/python/kubeflow/trainer/hf_dockerfile index d80edcf2c2..d03c458238 100644 --- a/sdk/python/kubeflow/training/training_container/Dockerfile +++ b/sdk/python/kubeflow/trainer/hf_dockerfile @@ -14,4 +14,5 @@ FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime RUN pip install --no-cache-dir -r requirements.txt # Run storage.py when the container launches - ENTRYPOINT ["python", "hf_llm_training.py"] \ No newline at end of file + ENTRYPOINT ["python", "hf_llm_training.py"] + \ No newline at end of file diff --git a/sdk/python/kubeflow/training/training_container/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py similarity index 99% rename from sdk/python/kubeflow/training/training_container/hf_llm_training.py rename to sdk/python/kubeflow/trainer/hf_llm_training.py index 6a8749570f..23ab4bb407 100644 --- a/sdk/python/kubeflow/training/training_container/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -38,7 +38,7 @@ def load_and_preprocess_data(dataset_dir, tokenizer): train_data = load_dataset(dataset_dir, split="train").map( lambda x: tokenizer(x["text"]), batched=True ) - train_data = train_data.train_test_split(shuffle=True, test_size=200) + train_data = train_data.train_test_split(shuffle=True, test_size=0.1) try: eval_data = load_dataset(dataset_dir, split="eval") diff --git a/sdk/python/kubeflow/training/training_container/requirements.txt b/sdk/python/kubeflow/trainer/requirements.txt similarity index 100% rename from 
sdk/python/kubeflow/training/training_container/requirements.txt rename to sdk/python/kubeflow/trainer/requirements.txt From c7260da790c0ad5feea22b0d82d305955f25b3c2 Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Wed, 20 Dec 2023 10:37:30 +0530 Subject: [PATCH 04/14] fixes --- sdk/python/kubeflow/trainer/hf_dockerfile | 7 ++----- sdk/python/kubeflow/trainer/hf_llm_training.py | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sdk/python/kubeflow/trainer/hf_dockerfile b/sdk/python/kubeflow/trainer/hf_dockerfile index d03c458238..0853a233ae 100644 --- a/sdk/python/kubeflow/trainer/hf_dockerfile +++ b/sdk/python/kubeflow/trainer/hf_dockerfile @@ -7,12 +7,9 @@ FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime # Copy the Python package and its source code into the container COPY . /app - # Copy the requirements.txt file into the container - COPY requirements.txt /app/requirements.txt - # Install any needed packages specified in requirements.txt - RUN pip install --no-cache-dir -r requirements.txt + RUN pip install --no-cache-dir -r /app/requirements.txt # Run storage.py when the container launches - ENTRYPOINT ["python", "hf_llm_training.py"] + ENTRYPOINT ["python", "/app/hf_llm_training.py"] \ No newline at end of file diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py index 23ab4bb407..827a1e2ed2 100644 --- a/sdk/python/kubeflow/trainer/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -81,7 +81,7 @@ def parse_arguments(): parser.add_argument("--token_dir", help="directory containing tokenizer") parser.add_argument("--dataset_dir", help="directory contaning dataset") parser.add_argument("--peft_config", help="peft_config") - parser.add_argument("--train_params", help="hugging face training parameters") + parser.add_argument("--train_parameters", help="hugging face training parameters") return parser.parse_args() @@ -91,4 +91,4 @@ def parse_arguments(): model, tokenizer = setup_model_and_tokenizer(args.token_dir, args.model_dir) train_data, eval_data = load_and_preprocess_data(args.dataset_dir, tokenizer) model = setup_peft_model(model, args.peft_config) - train_model(model, train_data, eval_data, tokenizer, args) + train_model(model, train_data, eval_data, tokenizer, args.train_parameters) From 61fb7e192d116b27153b50c326734afac06f4827 Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Thu, 21 Dec 2023 15:44:20 +0530 Subject: [PATCH 05/14] running workflow on push --- .github/workflows/publish-sdk-images.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/publish-sdk-images.yaml b/.github/workflows/publish-sdk-images.yaml index 86e707a0b5..0d6f7abf42 100644 --- a/.github/workflows/publish-sdk-images.yaml +++ b/.github/workflows/publish-sdk-images.yaml @@ -1,6 +1,7 @@ name: Publish Training Operator SDK Images on: + - push - pull_request jobs: From 94d05430ce912e9ef08faa88b1ce62782c6aad25 Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Thu, 21 Dec 2023 15:57:21 +0530 Subject: [PATCH 06/14] correcting context --- .github/workflows/publish-sdk-images.yaml | 1 + sdk/python/kubeflow/trainer/hf_dockerfile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish-sdk-images.yaml b/.github/workflows/publish-sdk-images.yaml index 0d6f7abf42..e39df90d18 100644 --- a/.github/workflows/publish-sdk-images.yaml +++ b/.github/workflows/publish-sdk-images.yaml @@ -22,3 +22,4 @@ jobs: include: - component-name: train-api-hf-image dockerfile: 
sdk/python/kubeflow/trainer/hf_dockerfile + context: sdk/python/kubeflow/trainer diff --git a/sdk/python/kubeflow/trainer/hf_dockerfile b/sdk/python/kubeflow/trainer/hf_dockerfile index 0853a233ae..f0f2ca16bf 100644 --- a/sdk/python/kubeflow/trainer/hf_dockerfile +++ b/sdk/python/kubeflow/trainer/hf_dockerfile @@ -5,7 +5,7 @@ FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime WORKDIR /app # Copy the Python package and its source code into the container - COPY . /app + COPY . . # Install any needed packages specified in requirements.txt RUN pip install --no-cache-dir -r /app/requirements.txt From 5931a4964e2d6dc39d6a49383b6248849fb6de74 Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Thu, 21 Dec 2023 16:35:53 +0530 Subject: [PATCH 07/14] correcting context --- .github/workflows/publish-sdk-images.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/publish-sdk-images.yaml b/.github/workflows/publish-sdk-images.yaml index e39df90d18..432e17e9e8 100644 --- a/.github/workflows/publish-sdk-images.yaml +++ b/.github/workflows/publish-sdk-images.yaml @@ -12,6 +12,7 @@ jobs: component-name: ${{ matrix.component-name }} platforms: linux/amd64,linux/arm64,linux/ppc64le dockerfile: ${{ matrix.dockerfile }} + context: ${{ matrix.context }} secrets: DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} From cf6419e7e703e940305e9c629e47146c2523ded8 Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Fri, 5 Jan 2024 00:31:38 +0530 Subject: [PATCH 08/14] fixing trainer code --- examples/sdk/train_api.py | 74 +++++++++++++++++++ sdk/python/kubeflow/trainer/hf_dockerfile | 9 ++- .../kubeflow/trainer/hf_llm_training.py | 73 ++++++++++++------ 3 files changed, 130 insertions(+), 26 deletions(-) create mode 100644 examples/sdk/train_api.py diff --git a/examples/sdk/train_api.py b/examples/sdk/train_api.py new file mode 100644 index 0000000000..f6077a132b --- /dev/null +++ b/examples/sdk/train_api.py @@ -0,0 +1,74 @@ +from kubeflow.training.api.training_client import TrainingClient +from kubeflow.storage_init_container.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceTrainParams, + HfDatasetParams, + TRANSFORMER_TYPES, +) +from kubeflow.storage_init_container.s3 import S3DatasetParams +from peft import LoraConfig +from transformers import TrainingArguments +import json + +client = TrainingClient( + config_file="/Users/deepanker/Downloads/deepanker-test-kubectl.cfg" +) + +client.train( + name="deepanker-test", + namespace="test", + num_workers=2, + num_procs_per_worker=0, + storage_config={ + "size": "10Gi", + "storage_class": "deepanker-test", + }, + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://Jedalc/codeparrot-gp2-finetune", + transformer_type=TRANSFORMER_TYPES.AutoModelForCausalLM, + ), + dataset_provider_parameters=HfDatasetParams( + repo_id="imdatta0/ultrachat_10k", + access_token="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", + ), + # dataset_provider_parameters=S3DatasetParams(endpoint_url="http://10.117.63.3", bucket_name="deepanker-test", file_key="list_roles_response.txt", access_key="qEMHyz8wNwLpUWkvfZmQZrj60TE6zX4p", secret_key="qIp_QNLPKI0LJ5X0F8NrypoSMSsw_Gfe" ), + train_parameters=HuggingFaceTrainParams( + lora_config=LoraConfig( + r=8, + lora_alpha=8, + target_modules=["c_attn", "c_proj", "w1", "w2"], + layers_to_transform=list(range(30, 40)), + # layers_pattern=['lm_head'], + lora_dropout=0.1, + bias="none", + task_type="CAUSAL_LM", + ), + training_parameters=TrainingArguments( + num_train_epochs=2, + 
per_device_train_batch_size=1, + gradient_accumulation_steps=1, + gradient_checkpointing=True, + warmup_steps=0.01, + # max_steps=50, #20, + learning_rate=1, + lr_scheduler_type="cosine", + bf16=False, + logging_steps=0.01, + output_dir="", + optim=f"paged_adamw_32bit", + save_steps=0.01, + save_total_limit=3, + disable_tqdm=False, + resume_from_checkpoint=True, + remove_unused_columns=True, + evaluation_strategy="steps", + eval_steps=0.01, + # eval_accumulation_steps=1, + per_device_eval_batch_size=1, + # load_best_model_at_end=True, + report_to="wandb", + run_name=f"{1}", + ), + ), + resources_per_worker={"gpu": 0, "cpu": 8, "memory": "8Gi"}, +) diff --git a/sdk/python/kubeflow/trainer/hf_dockerfile b/sdk/python/kubeflow/trainer/hf_dockerfile index f0f2ca16bf..c7671aefb9 100644 --- a/sdk/python/kubeflow/trainer/hf_dockerfile +++ b/sdk/python/kubeflow/trainer/hf_dockerfile @@ -5,11 +5,14 @@ FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime WORKDIR /app # Copy the Python package and its source code into the container - COPY . . + COPY . /app + + # Copy the requirements.txt file into the container + COPY requirements.txt /app/requirements.txt # Install any needed packages specified in requirements.txt - RUN pip install --no-cache-dir -r /app/requirements.txt + RUN pip install --no-cache-dir -r requirements.txt # Run storage.py when the container launches - ENTRYPOINT ["python", "/app/hf_llm_training.py"] + ENTRYPOINT ["torchrun", "hf_llm_training.py"] \ No newline at end of file diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py index 827a1e2ed2..6999dc483a 100644 --- a/sdk/python/kubeflow/trainer/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -1,4 +1,5 @@ import argparse +import transformers from transformers import ( AutoModelForCausalLM, AutoTokenizer, @@ -10,22 +11,39 @@ import torch from datasets import load_dataset from peft import LoraConfig, get_peft_model +from urllib.parse import urlparse +import os +import json -def setup_model_and_tokenizer(token_dir, model_dir): +def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): # Set up the model and tokenizer - tokenizer = AutoTokenizer.from_pretrained( - token_dir, use_fast=False, trust_remote_code=True + + parsed_uri = urlparse(model_uri) + model_name = parsed_uri.netloc + parsed_uri.path + transformer_type_class = getattr(transformers, transformer_type) + + model = transformer_type_class.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=model_dir, + local_files_only=True, + device_map="auto", ) - tokenizer.pad_token = tokenizer.eos_token - tokenizer.add_pad_token = True - model = AutoModelForCausalLM.from_pretrained( - model_dir, + # print(model) + + tokenizer = transformers.AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=model_dir, + local_files_only=True, device_map="auto", - trust_remote_code=True, ) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.add_pad_token = True + + # print(tokenizer) + # Freeze model parameters for param in model.parameters(): param.requires_grad = False @@ -33,15 +51,14 @@ def setup_model_and_tokenizer(token_dir, model_dir): return model, tokenizer -def load_and_preprocess_data(dataset_dir, tokenizer): +def load_and_preprocess_data(dataset_name, dataset_dir): # Load and preprocess the dataset - train_data = load_dataset(dataset_dir, split="train").map( - lambda x: tokenizer(x["text"]), batched=True - ) - train_data = 
train_data.train_test_split(shuffle=True, test_size=0.1) - + print("loading dataset") + dataset = load_dataset(dataset_name, cache_dir=dataset_dir) + train_data = dataset["train"] + # print(train_data) try: - eval_data = load_dataset(dataset_dir, split="eval") + eval_data = dataset["eval"] except Exception as err: eval_data = None @@ -50,7 +67,8 @@ def load_and_preprocess_data(dataset_dir, tokenizer): def setup_peft_model(model, lora_config): # Set up the PEFT model - lora_config = LoraConfig(**lora_config) + lora_config = LoraConfig(**json.loads(lora_config)) + print(lora_config) model = get_peft_model(model, lora_config) return model @@ -77,18 +95,27 @@ def parse_arguments(): parser = argparse.ArgumentParser( description="Script for training a model with PEFT configuration." ) + + parser.add_argument("--model_uri", help="model uri") + parser.add_argument("--transformer_type", help="model transformer type") parser.add_argument("--model_dir", help="directory containing model") - parser.add_argument("--token_dir", help="directory containing tokenizer") parser.add_argument("--dataset_dir", help="directory contaning dataset") - parser.add_argument("--peft_config", help="peft_config") - parser.add_argument("--train_parameters", help="hugging face training parameters") + parser.add_argument("--dataset_name", help="dataset name") + parser.add_argument("--lora_config", help="lora_config") + parser.add_argument( + "--training_parameters", help="hugging face training parameters" + ) return parser.parse_args() if __name__ == "__main__": args = parse_arguments() - model, tokenizer = setup_model_and_tokenizer(args.token_dir, args.model_dir) - train_data, eval_data = load_and_preprocess_data(args.dataset_dir, tokenizer) - model = setup_peft_model(model, args.peft_config) - train_model(model, train_data, eval_data, tokenizer, args.train_parameters) + model, tokenizer = setup_model_and_tokenizer( + args.model_uri, args.transformer_type, args.model_dir + ) + train_data, eval_data = load_and_preprocess_data( + args.dataset_name, args.dataset_dir + ) + model = setup_peft_model(model, args.lora_config) + train_model(model, train_data, eval_data, tokenizer, args.training_parameters) From 32041490e003a5e13d686fd70392173d2ce1fc20 Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Mon, 8 Jan 2024 15:52:03 +0530 Subject: [PATCH 09/14] code review changes --- .github/workflows/publish-example-images.yaml | 4 ++- .github/workflows/publish-sdk-images.yaml | 26 ------------------- sdk/python/kubeflow/trainer/hf_dockerfile | 2 +- 3 files changed, 4 insertions(+), 28 deletions(-) delete mode 100644 .github/workflows/publish-sdk-images.yaml diff --git a/.github/workflows/publish-example-images.yaml b/.github/workflows/publish-example-images.yaml index 616c2f1072..a9e805b60b 100644 --- a/.github/workflows/publish-example-images.yaml +++ b/.github/workflows/publish-example-images.yaml @@ -52,7 +52,9 @@ jobs: - component-name: mxnet-auto-tuning dockerfile: examples/mxnet/tune/Dockerfile context: examples/mxnet/tune - + - component-name: train-api-hf-image + dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile + context: sdk/python/kubeflow/trainer # TODO (tenzen-y): Fix the below broken Dockerfiles # - component-name: pytorch-dist-mnist-mpi # dockerfile: examples/pytorch/mnist/Dockerfile-mpi diff --git a/.github/workflows/publish-sdk-images.yaml b/.github/workflows/publish-sdk-images.yaml deleted file mode 100644 index 432e17e9e8..0000000000 --- a/.github/workflows/publish-sdk-images.yaml +++ /dev/null @@ -1,26 +0,0 @@ 
-name: Publish Training Operator SDK Images - -on: - - push - - pull_request - -jobs: - core: - name: Publish Image - uses: ./.github/workflows/build-and-publish-images.yaml - with: - component-name: ${{ matrix.component-name }} - platforms: linux/amd64,linux/arm64,linux/ppc64le - dockerfile: ${{ matrix.dockerfile }} - context: ${{ matrix.context }} - secrets: - DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} - DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} - - strategy: - fail-fast: false - matrix: - include: - - component-name: train-api-hf-image - dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile - context: sdk/python/kubeflow/trainer diff --git a/sdk/python/kubeflow/trainer/hf_dockerfile b/sdk/python/kubeflow/trainer/hf_dockerfile index c7671aefb9..f0ddd8f9c2 100644 --- a/sdk/python/kubeflow/trainer/hf_dockerfile +++ b/sdk/python/kubeflow/trainer/hf_dockerfile @@ -1,5 +1,5 @@ # Use an official Pytorch runtime as a parent image -FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime +FROM nvcr.io/nvidia/pytorch:23.12-py3 # Set the working directory in the container WORKDIR /app From 4642b9e481d8b7deae10d9cdcc9578c9bfdb1c24 Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Wed, 10 Jan 2024 22:39:24 +0530 Subject: [PATCH 10/14] resolving merge conflict --- .github/workflows/publish-core-images.yaml | 7 +++++-- .github/workflows/publish-example-images.yaml | 3 --- examples/sdk/train_api.py | 18 ++++++------------ sdk/python/kubeflow/trainer/hf_llm_training.py | 7 ++----- 4 files changed, 13 insertions(+), 22 deletions(-) diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index b81185071d..91a7bb0329 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -25,5 +25,8 @@ jobs: - component-name: kubectl-delivery dockerfile: build/images/kubectl-delivery/Dockerfile - component-name: storage-initializer - dockerfile: sdk/python/kubeflow/storage_initializer/Dockerfile - context: sdk/python/kubeflow/storage_initializer \ No newline at end of file + dockerfile: sdk/python/kubeflow/storage_initializer/Dockerfile + context: sdk/python/kubeflow/storage_initializer + - component-name: trainer-huggingface + dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile + context: sdk/python/kubeflow/trainer diff --git a/.github/workflows/publish-example-images.yaml b/.github/workflows/publish-example-images.yaml index a9e805b60b..d3e7f4f549 100644 --- a/.github/workflows/publish-example-images.yaml +++ b/.github/workflows/publish-example-images.yaml @@ -52,9 +52,6 @@ jobs: - component-name: mxnet-auto-tuning dockerfile: examples/mxnet/tune/Dockerfile context: examples/mxnet/tune - - component-name: train-api-hf-image - dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile - context: sdk/python/kubeflow/trainer # TODO (tenzen-y): Fix the below broken Dockerfiles # - component-name: pytorch-dist-mnist-mpi # dockerfile: examples/pytorch/mnist/Dockerfile-mpi diff --git a/examples/sdk/train_api.py b/examples/sdk/train_api.py index f6077a132b..783f9ed5d7 100644 --- a/examples/sdk/train_api.py +++ b/examples/sdk/train_api.py @@ -1,14 +1,12 @@ from kubeflow.training.api.training_client import TrainingClient -from kubeflow.storage_init_container.hugging_face import ( +from kubeflow.storage_initializer.hugging_face import ( HuggingFaceModelParams, HuggingFaceTrainParams, HfDatasetParams, - TRANSFORMER_TYPES, ) -from kubeflow.storage_init_container.s3 import S3DatasetParams from peft import LoraConfig +import 
transformers from transformers import TrainingArguments -import json client = TrainingClient( config_file="/Users/deepanker/Downloads/deepanker-test-kubectl.cfg" @@ -25,13 +23,9 @@ }, model_provider_parameters=HuggingFaceModelParams( model_uri="hf://Jedalc/codeparrot-gp2-finetune", - transformer_type=TRANSFORMER_TYPES.AutoModelForCausalLM, + transformer_type=transformers.AutoModelForCausalLM, ), - dataset_provider_parameters=HfDatasetParams( - repo_id="imdatta0/ultrachat_10k", - access_token="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", - ), - # dataset_provider_parameters=S3DatasetParams(endpoint_url="http://10.117.63.3", bucket_name="deepanker-test", file_key="list_roles_response.txt", access_key="qEMHyz8wNwLpUWkvfZmQZrj60TE6zX4p", secret_key="qIp_QNLPKI0LJ5X0F8NrypoSMSsw_Gfe" ), + dataset_provider_parameters=HfDatasetParams(repo_id="imdatta0/ultrachat_10k"), train_parameters=HuggingFaceTrainParams( lora_config=LoraConfig( r=8, @@ -66,8 +60,8 @@ # eval_accumulation_steps=1, per_device_eval_batch_size=1, # load_best_model_at_end=True, - report_to="wandb", - run_name=f"{1}", + # report_to="wandb", + # run_name=f"{1}", ), ), resources_per_worker={"gpu": 0, "cpu": 8, "memory": "8Gi"}, diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py index 6999dc483a..2bccc0ac4a 100644 --- a/sdk/python/kubeflow/trainer/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -30,8 +30,6 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): device_map="auto", ) - # print(model) - tokenizer = transformers.AutoTokenizer.from_pretrained( pretrained_model_name_or_path=model_name, cache_dir=model_dir, @@ -42,8 +40,6 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): tokenizer.pad_token = tokenizer.eos_token tokenizer.add_pad_token = True - # print(tokenizer) - # Freeze model parameters for param in model.parameters(): param.requires_grad = False @@ -56,7 +52,7 @@ def load_and_preprocess_data(dataset_name, dataset_dir): print("loading dataset") dataset = load_dataset(dataset_name, cache_dir=dataset_dir) train_data = dataset["train"] - # print(train_data) + try: eval_data = dataset["eval"] except Exception as err: @@ -89,6 +85,7 @@ def train_model(model, train_data, eval_data, tokenizer, train_params): ) trainer.train() + print("training done") def parse_arguments(): From 112c581577bb8aa28259a9cd6fdd95c5777ed4da Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Thu, 11 Jan 2024 10:39:49 +0530 Subject: [PATCH 11/14] code review changes --- .github/workflows/publish-core-images.yaml | 7 ++++++- examples/sdk/train_api.py | 15 ++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml index 91a7bb0329..20cfa9d554 100644 --- a/.github/workflows/publish-core-images.yaml +++ b/.github/workflows/publish-core-images.yaml @@ -10,8 +10,9 @@ jobs: uses: ./.github/workflows/build-and-publish-images.yaml with: component-name: ${{ matrix.component-name }} - platforms: linux/amd64,linux/arm64,linux/ppc64le + platforms: ${{ matrix.platforms }} dockerfile: ${{ matrix.dockerfile }} + context: ${{ matrix.context }} secrets: DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} @@ -22,11 +23,15 @@ jobs: include: - component-name: training-operator dockerfile: build/images/training-operator/Dockerfile + platforms: linux/amd64,linux/arm64,linux/ppc64le - component-name: 
kubectl-delivery dockerfile: build/images/kubectl-delivery/Dockerfile + platforms: linux/amd64,linux/arm64,linux/ppc64le - component-name: storage-initializer dockerfile: sdk/python/kubeflow/storage_initializer/Dockerfile context: sdk/python/kubeflow/storage_initializer + platforms: linux/amd64,linux/arm64 - component-name: trainer-huggingface dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile context: sdk/python/kubeflow/trainer + platforms: linux/amd64,linux/arm64 diff --git a/examples/sdk/train_api.py b/examples/sdk/train_api.py index 783f9ed5d7..e71c1c7fd0 100644 --- a/examples/sdk/train_api.py +++ b/examples/sdk/train_api.py @@ -8,19 +8,12 @@ import transformers from transformers import TrainingArguments -client = TrainingClient( - config_file="/Users/deepanker/Downloads/deepanker-test-kubectl.cfg" -) +client = TrainingClient() client.train( - name="deepanker-test", - namespace="test", + name="hf-test", num_workers=2, num_procs_per_worker=0, - storage_config={ - "size": "10Gi", - "storage_class": "deepanker-test", - }, model_provider_parameters=HuggingFaceModelParams( model_uri="hf://Jedalc/codeparrot-gp2-finetune", transformer_type=transformers.AutoModelForCausalLM, @@ -57,11 +50,7 @@ remove_unused_columns=True, evaluation_strategy="steps", eval_steps=0.01, - # eval_accumulation_steps=1, per_device_eval_batch_size=1, - # load_best_model_at_end=True, - # report_to="wandb", - # run_name=f"{1}", ), ), resources_per_worker={"gpu": 0, "cpu": 8, "memory": "8Gi"}, From 1034403b8f88be92ff94c313911c6f79be927c25 Mon Sep 17 00:00:00 2001 From: deepanker13 Date: Thu, 11 Jan 2024 23:37:17 +0530 Subject: [PATCH 12/14] downgrading pytorch version, removing changes for running things locally, adding jupyter notebook --- examples/sdk/train_api.ipynb | 133 ++++++++++++++++++ examples/sdk/train_api.py | 57 -------- sdk/python/kubeflow/trainer/hf_dockerfile | 20 +-- .../kubeflow/trainer/hf_llm_training.py | 36 +++-- sdk/python/kubeflow/trainer/requirements.txt | 6 +- .../kubeflow/training/api/training_client.py | 3 +- sdk/python/kubeflow/training/utils/utils.py | 4 +- 7 files changed, 170 insertions(+), 89 deletions(-) create mode 100644 examples/sdk/train_api.ipynb delete mode 100644 examples/sdk/train_api.py diff --git a/examples/sdk/train_api.ipynb b/examples/sdk/train_api.ipynb new file mode 100644 index 0000000000..76c74a0354 --- /dev/null +++ b/examples/sdk/train_api.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install kubeflow-training extra 'huggingface'\n", + "!pip install -U 'kubeflow-training[huggingface]'" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# import the libraries\n", + "from kubeflow.training.api.training_client import TrainingClient\n", + "from kubeflow.storage_initializer.hugging_face import (\n", + " HuggingFaceModelParams,\n", + " HuggingFaceTrainParams,\n", + " HfDatasetParams,\n", + ")\n", + "from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH\n", + "from peft import LoraConfig\n", + "import transformers\n", + "from transformers import TrainingArguments\n", + "from kubeflow.training import constants" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# create a training client, pass config_file parameter if you want to use kubeconfig other than \"~/.kube/config\"\n", + "client = TrainingClient()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# mention the model, datasets and training parameters\n", + "client.train(\n", + " name=\"huggingface-test\",\n", + " num_workers=2,\n", + " num_procs_per_worker=1,\n", + " # specify the storage class if you don't want to use the default one for the storage-initializer PVC\n", + " # storage_config={\n", + " # \"size\": \"10Gi\",\n", + " # \"storage_class\": \"\",\n", + " # },\n", + " model_provider_parameters=HuggingFaceModelParams(\n", + " model_uri=\"hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n", + " transformer_type=transformers.AutoModelForCausalLM,\n", + " ),\n", + " # it is assumed for text related tasks, you have 'text' column in the dataset.\n", + " # for more info on how dataset is loaded check load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py\n", + " dataset_provider_parameters=HfDatasetParams(repo_id=\"imdatta0/ultrachat_1k\"),\n", + " train_parameters=HuggingFaceTrainParams(\n", + " lora_config=LoraConfig(\n", + " r=8,\n", + " lora_alpha=8,\n", + " lora_dropout=0.1,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + " ),\n", + " training_parameters=TrainingArguments(\n", + " num_train_epochs=1,\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=4,\n", + " gradient_checkpointing=True,\n", + " warmup_steps=0.02,\n", + " learning_rate=1,\n", + " lr_scheduler_type=\"cosine\",\n", + " bf16=False,\n", + " logging_steps=0.01,\n", + " output_dir=INIT_CONTAINER_MOUNT_PATH,\n", + " optim=f\"sgd\",\n", + " save_steps=0.01,\n", + " save_total_limit=3,\n", + " disable_tqdm=False,\n", + " resume_from_checkpoint=True,\n", + " remove_unused_columns=True,\n", + " ),\n", + " ),\n", + " resources_per_worker={\n", + " \"gpu\": 1,\n", + " \"cpu\": 8,\n", + " \"memory\": \"16Gi\",\n", + " }, # remove the gpu key if you don't want to attach gpus to the pods\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check the logs of the job\n", + "client.get_job_logs(name=\"huggingface-test\", job_kind=constants.PYTORCHJOB_KIND)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "myenv3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/sdk/train_api.py b/examples/sdk/train_api.py deleted file mode 100644 index e71c1c7fd0..0000000000 --- a/examples/sdk/train_api.py +++ /dev/null @@ -1,57 +0,0 @@ -from kubeflow.training.api.training_client import TrainingClient -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceTrainParams, - HfDatasetParams, -) -from peft import LoraConfig -import transformers -from transformers import TrainingArguments - -client = TrainingClient() - -client.train( - name="hf-test", - num_workers=2, - num_procs_per_worker=0, - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://Jedalc/codeparrot-gp2-finetune", - transformer_type=transformers.AutoModelForCausalLM, - ), - dataset_provider_parameters=HfDatasetParams(repo_id="imdatta0/ultrachat_10k"), - train_parameters=HuggingFaceTrainParams( - lora_config=LoraConfig( - r=8, - lora_alpha=8, - 
target_modules=["c_attn", "c_proj", "w1", "w2"], - layers_to_transform=list(range(30, 40)), - # layers_pattern=['lm_head'], - lora_dropout=0.1, - bias="none", - task_type="CAUSAL_LM", - ), - training_parameters=TrainingArguments( - num_train_epochs=2, - per_device_train_batch_size=1, - gradient_accumulation_steps=1, - gradient_checkpointing=True, - warmup_steps=0.01, - # max_steps=50, #20, - learning_rate=1, - lr_scheduler_type="cosine", - bf16=False, - logging_steps=0.01, - output_dir="", - optim=f"paged_adamw_32bit", - save_steps=0.01, - save_total_limit=3, - disable_tqdm=False, - resume_from_checkpoint=True, - remove_unused_columns=True, - evaluation_strategy="steps", - eval_steps=0.01, - per_device_eval_batch_size=1, - ), - ), - resources_per_worker={"gpu": 0, "cpu": 8, "memory": "8Gi"}, -) diff --git a/sdk/python/kubeflow/trainer/hf_dockerfile b/sdk/python/kubeflow/trainer/hf_dockerfile index f0ddd8f9c2..d82b715552 100644 --- a/sdk/python/kubeflow/trainer/hf_dockerfile +++ b/sdk/python/kubeflow/trainer/hf_dockerfile @@ -1,18 +1,18 @@ # Use an official Pytorch runtime as a parent image -FROM nvcr.io/nvidia/pytorch:23.12-py3 +FROM nvcr.io/nvidia/pytorch:23.10-py3 - # Set the working directory in the container - WORKDIR /app +# Set the working directory in the container +WORKDIR /app - # Copy the Python package and its source code into the container - COPY . /app +# Copy the Python package and its source code into the container +COPY . /app - # Copy the requirements.txt file into the container +# Copy the requirements.txt file into the container COPY requirements.txt /app/requirements.txt - # Install any needed packages specified in requirements.txt - RUN pip install --no-cache-dir -r requirements.txt +# Install any needed packages specified in requirements.txt +RUN pip install --no-cache-dir -r requirements.txt - # Run storage.py when the container launches - ENTRYPOINT ["torchrun", "hf_llm_training.py"] +# Run storage.py when the container launches +ENTRYPOINT ["torchrun", "hf_llm_training.py"] \ No newline at end of file diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py index 2bccc0ac4a..26c48c08dd 100644 --- a/sdk/python/kubeflow/trainer/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -16,9 +16,8 @@ import json -def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): +def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, train_args): # Set up the model and tokenizer - parsed_uri = urlparse(model_uri) model_name = parsed_uri.netloc + parsed_uri.path transformer_type_class = getattr(transformers, transformer_type) @@ -28,6 +27,7 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): cache_dir=model_dir, local_files_only=True, device_map="auto", + trust_remote_code=True, ) tokenizer = transformers.AutoTokenizer.from_pretrained( @@ -47,16 +47,24 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): return model, tokenizer -def load_and_preprocess_data(dataset_name, dataset_dir): +def load_and_preprocess_data(dataset_name, dataset_dir, transformer_type, tokenizer): # Load and preprocess the dataset print("loading dataset") - dataset = load_dataset(dataset_name, cache_dir=dataset_dir) + transformer_type_class = getattr(transformers, transformer_type) + if transformer_type_class != transformers.AutoModelForImageClassification: + dataset = load_dataset(dataset_name, cache_dir=dataset_dir).map( + lambda x: tokenizer(x["text"]), 
batched=True + ) + else: + dataset = load_dataset(dataset_name, cache_dir=dataset_dir) + train_data = dataset["train"] try: eval_data = dataset["eval"] except Exception as err: eval_data = None + print("Evaluation dataset is not found") return train_data, eval_data @@ -64,26 +72,23 @@ def load_and_preprocess_data(dataset_name, dataset_dir): def setup_peft_model(model, lora_config): # Set up the PEFT model lora_config = LoraConfig(**json.loads(lora_config)) - print(lora_config) + model.enable_input_require_grads() model = get_peft_model(model, lora_config) return model -def train_model(model, train_data, eval_data, tokenizer, train_params): +def train_model(model, train_data, eval_data, tokenizer, train_args): # Train the model trainer = Trainer( model=model, train_dataset=train_data, eval_dataset=eval_data, tokenizer=tokenizer, - args=TrainingArguments( - **train_params, - data_collator=DataCollatorForLanguageModeling( - tokenizer, pad_to_multiple_of=8, return_tensors="pt", mlm=False - ) + args=train_args, + data_collator=DataCollatorForLanguageModeling( + tokenizer, pad_to_multiple_of=8, mlm=False ), ) - trainer.train() print("training done") @@ -108,11 +113,12 @@ def parse_arguments(): if __name__ == "__main__": args = parse_arguments() + train_args = TrainingArguments(**json.loads(args.training_parameters)) model, tokenizer = setup_model_and_tokenizer( - args.model_uri, args.transformer_type, args.model_dir + args.model_uri, args.transformer_type, args.model_dir, train_args ) train_data, eval_data = load_and_preprocess_data( - args.dataset_name, args.dataset_dir + args.dataset_name, args.dataset_dir, args.transformer_type, tokenizer ) model = setup_peft_model(model, args.lora_config) - train_model(model, train_data, eval_data, tokenizer, args.training_parameters) + train_model(model, train_data, eval_data, tokenizer, train_args) diff --git a/sdk/python/kubeflow/trainer/requirements.txt b/sdk/python/kubeflow/trainer/requirements.txt index e4c4b2b6c3..f342311be0 100644 --- a/sdk/python/kubeflow/trainer/requirements.txt +++ b/sdk/python/kubeflow/trainer/requirements.txt @@ -1,3 +1,5 @@ -peft==0.7.0 +peft>=0.3.0 datasets==2.15.0 -transformers==4.35.2 \ No newline at end of file +transformers>=4.20.0 +bitsandbytes>=0.42.0 +einops>=0.6.1 diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 37c782c5c5..a8187de7e0 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -171,8 +171,7 @@ def train( ), ) except Exception as e: - pass # local - # raise RuntimeError("failed to create pvc") + raise RuntimeError("failed to create pvc") if isinstance(model_provider_parameters, HuggingFaceModelParams): mp = "hf" diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py index 09130a4de1..655839225b 100644 --- a/sdk/python/kubeflow/training/utils/utils.py +++ b/sdk/python/kubeflow/training/utils/utils.py @@ -131,7 +131,6 @@ def get_container_spec( raise ValueError("container name or image cannot be none") container_spec = models.V1Container(name=name, image=image) - container_spec.image_pull_policy = "Always" if args: container_spec.args = args @@ -175,8 +174,7 @@ def get_pod_template_spec( name=constants.JOB_PARAMETERS[job_kind]["container"], image=base_image, ) - ], - image_pull_secrets=[models.V1LocalObjectReference(name="regcred")], + ] ), ) From f520329e5e69f4de687381d816d2a087dd95a877 Mon Sep 17 00:00:00 2001 From: 
deepanker13 Date: Thu, 11 Jan 2024 23:45:44 +0530 Subject: [PATCH 13/14] ci fix and removing unused parameter and adding check if pvc exists already --- sdk/python/kubeflow/trainer/hf_llm_training.py | 4 ++-- sdk/python/kubeflow/training/api/training_client.py | 11 ++++++++++- sdk/python/kubeflow/training/utils/utils.py | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py index 26c48c08dd..c39c547c83 100644 --- a/sdk/python/kubeflow/trainer/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -16,7 +16,7 @@ import json -def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, train_args): +def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): # Set up the model and tokenizer parsed_uri = urlparse(model_uri) model_name = parsed_uri.netloc + parsed_uri.path @@ -115,7 +115,7 @@ def parse_arguments(): args = parse_arguments() train_args = TrainingArguments(**json.loads(args.training_parameters)) model, tokenizer = setup_model_and_tokenizer( - args.model_uri, args.transformer_type, args.model_dir, train_args + args.model_uri, args.transformer_type, args.model_dir ) train_data, eval_data = load_and_preprocess_data( args.dataset_name, args.dataset_dir, args.transformer_type, tokenizer diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index a8187de7e0..fe62e0c271 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -171,7 +171,16 @@ def train( ), ) except Exception as e: - raise RuntimeError("failed to create pvc") + pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace) + # Check if the PVC with the specified name exists + for pvc in pvc_list.items: + if pvc.metadata.name == constants.TRAINER_PVC_NAME: + print( + f"PVC '{constants.TRAINER_PVC_NAME}' already exists in namespace '{namespace}'." + ) + break + else: + raise RuntimeError("failed to create pvc") if isinstance(model_provider_parameters, HuggingFaceModelParams): mp = "hf" diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py index 655839225b..d4a9a0e011 100644 --- a/sdk/python/kubeflow/training/utils/utils.py +++ b/sdk/python/kubeflow/training/utils/utils.py @@ -300,7 +300,7 @@ def get_pytorchjob_template( master_pod_template_spec: models.V1PodTemplateSpec = None, worker_pod_template_spec: models.V1PodTemplateSpec = None, num_worker_replicas: Optional[int] = None, - num_procs_per_worker: Optional[int] = None, + num_procs_per_worker: Optional[int] = 0, elastic_policy: Optional[models.KubeflowOrgV1ElasticPolicy] = None, ): # Check if at least one replica is set. 
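PATCH 13/14 above makes `train()` tolerate a storage PVC that already exists instead of failing outright. A minimal standalone sketch of the same idempotent-create pattern with the official `kubernetes` Python client follows; the claim name and size here are illustrative placeholders, not the SDK's real `constants.TRAINER_PVC_NAME` or storage config:

```python
from kubernetes import client, config
from kubernetes.client.rest import ApiException

PVC_NAME = "storage-initializer"  # placeholder; the SDK reads constants.TRAINER_PVC_NAME


def ensure_pvc(core_api: client.CoreV1Api, namespace: str, size: str = "10Gi") -> None:
    """Create the trainer PVC if it does not already exist (idempotent)."""
    body = client.V1PersistentVolumeClaim(
        metadata=client.V1ObjectMeta(name=PVC_NAME),
        spec=client.V1PersistentVolumeClaimSpec(
            access_modes=["ReadWriteOnce"],
            resources=client.V1ResourceRequirements(requests={"storage": size}),
        ),
    )
    try:
        core_api.create_namespaced_persistent_volume_claim(namespace, body)
    except ApiException as e:
        # 409 Conflict means the claim is already there, which is fine;
        # anything else is a genuine failure and should propagate.
        if e.status != 409:
            raise
        print(f"PVC '{PVC_NAME}' already exists in namespace '{namespace}'.")


if __name__ == "__main__":
    config.load_kube_config()
    ensure_pvc(client.CoreV1Api(), namespace="default")
```

Relying on the 409 status code is an alternative to the list-and-scan in the diff above; both amount to treating "already exists" as success rather than raising `RuntimeError("failed to create pvc")`.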
From 95b3e2b3a39580177ef4ce8c6ee98a36532828fb Mon Sep 17 00:00:00 2001
From: deepanker13
Date: Fri, 12 Jan 2024 01:27:12 +0530
Subject: [PATCH 14/14] gpu training example fix

---
 examples/sdk/train_api.ipynb | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/sdk/train_api.ipynb b/examples/sdk/train_api.ipynb
index 76c74a0354..242efe8ebb 100644
--- a/examples/sdk/train_api.ipynb
+++ b/examples/sdk/train_api.ipynb
@@ -73,9 +73,12 @@
     "        ),\n",
     "        training_parameters=TrainingArguments(\n",
     "            num_train_epochs=1,\n",
-    "            per_device_train_batch_size=4,\n",
-    "            gradient_accumulation_steps=4,\n",
+    "            per_device_train_batch_size=1,\n",
+    "            gradient_accumulation_steps=1,\n",
     "            gradient_checkpointing=True,\n",
+    "            gradient_checkpointing_kwargs={\n",
+    "                \"use_reentrant\": False\n",
+    "            },  # this is mandatory if checkpointing is enabled\n",
     "            warmup_steps=0.02,\n",
     "            learning_rate=1,\n",
     "            lr_scheduler_type=\"cosine\",\n",
@@ -93,7 +96,7 @@
     "    resources_per_worker={\n",
     "        \"gpu\": 1,\n",
     "        \"cpu\": 8,\n",
-    "        \"memory\": \"16Gi\",\n",
+    "        \"memory\": \"8Gi\",\n",
     "    },  # remove the gpu key if you don't want to attach gpus to the pods\n",
     ")"
   ]
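Taken end to end, the series settles on a simple contract between the SDK and the trainer container: `hf_llm_training.py` rebuilds `LoraConfig` and `TrainingArguments` via `json.loads(...)`, so whatever launches the `torchrun` entrypoint must pass both as JSON strings on the command line. A rough sketch of that hand-off is below; the model/dataset names and `/workspace` paths are examples only, and in practice the `train()` API assembles this command (and the storage initializer populates the directories) for you:

```python
import json
from dataclasses import asdict

from peft import LoraConfig
from transformers import TrainingArguments

# The same objects a user hands to HuggingFaceTrainParams in the notebook example.
lora = LoraConfig(r=8, lora_alpha=8, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM")
args = TrainingArguments(
    output_dir="/workspace/output", num_train_epochs=1, per_device_train_batch_size=1
)

# hf_llm_training.py calls LoraConfig(**json.loads(args.lora_config)) and
# TrainingArguments(**json.loads(args.training_parameters)), so both objects
# travel as JSON strings; default=str coerces any enum values to plain strings.
cmd = [
    "torchrun", "hf_llm_training.py",
    "--model_uri", "hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "--transformer_type", "AutoModelForCausalLM",
    "--model_dir", "/workspace/model",      # filled in by the storage initializer
    "--dataset_dir", "/workspace/dataset",
    "--dataset_name", "imdatta0/ultrachat_1k",
    "--lora_config", json.dumps(asdict(lora), default=str),
    "--training_parameters", json.dumps(args.to_dict(), default=str),
]
print(" ".join(cmd))
```

Note that `TrainingArguments.to_dict()` already converts enum fields to plain values, but a strict round trip may still require filtering a couple of derived keys before re-instantiating; treat the sketch as the shape of the interface, not a drop-in launcher.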