Adding Training image needed for train api #1963

Merged: 14 commits, Jan 11, 2024
14 changes: 11 additions & 3 deletions .github/workflows/publish-core-images.yaml
@@ -10,8 +10,9 @@ jobs:
uses: ./.github/workflows/build-and-publish-images.yaml
with:
component-name: ${{ matrix.component-name }}
platforms: linux/amd64,linux/arm64,linux/ppc64le
platforms: ${{ matrix.platforms }}
dockerfile: ${{ matrix.dockerfile }}
context: ${{ matrix.context }}
secrets:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -22,8 +23,15 @@
include:
- component-name: training-operator
dockerfile: build/images/training-operator/Dockerfile
platforms: linux/amd64,linux/arm64,linux/ppc64le
- component-name: kubectl-delivery
dockerfile: build/images/kubectl-delivery/Dockerfile
platforms: linux/amd64,linux/arm64,linux/ppc64le
- component-name: storage-initializer
dockerfile: sdk/python/kubeflow/storage_initializer/Dockerfile
context: sdk/python/kubeflow/storage_initializer
platforms: linux/amd64,linux/arm64
- component-name: trainer-huggingface
dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile
context: sdk/python/kubeflow/trainer
platforms: linux/amd64,linux/arm64
1 change: 0 additions & 1 deletion .github/workflows/publish-example-images.yaml
@@ -52,7 +52,6 @@ jobs:
- component-name: mxnet-auto-tuning
dockerfile: examples/mxnet/tune/Dockerfile
context: examples/mxnet/tune

# TODO (tenzen-y): Fix the below broken Dockerfiles
# - component-name: pytorch-dist-mnist-mpi
# dockerfile: examples/pytorch/mnist/Dockerfile-mpi
136 changes: 136 additions & 0 deletions examples/sdk/train_api.ipynb
@@ -0,0 +1,136 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install kubeflow-training extra 'huggingface'\n",
"!pip install -U 'kubeflow-training[huggingface]'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# import the libraries\n",
"from kubeflow.training.api.training_client import TrainingClient\n",
"from kubeflow.storage_initializer.hugging_face import (\n",
" HuggingFaceModelParams,\n",
" HuggingFaceTrainParams,\n",
" HfDatasetParams,\n",
")\n",
"from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH\n",
"from peft import LoraConfig\n",
"import transformers\n",
"from transformers import TrainingArguments\n",
"from kubeflow.training import constants"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# create a training client, pass config_file parameter if you want to use kubeconfig other than \"~/.kube/config\"\n",
"client = TrainingClient()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# mention the model, datasets and training parameters\n",
"client.train(\n",
" name=\"huggingface-test\",\n",
" num_workers=2,\n",
" num_procs_per_worker=1,\n",
" # specify the storage class if you don't want to use the default one for the storage-initializer PVC\n",
" # storage_config={\n",
" # \"size\": \"10Gi\",\n",
" # \"storage_class\": \"<your storage class>\",\n",
" # },\n",
" model_provider_parameters=HuggingFaceModelParams(\n",
" model_uri=\"hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n",
" transformer_type=transformers.AutoModelForCausalLM,\n",
" ),\n",
" # it is assumed for text related tasks, you have 'text' column in the dataset.\n",
" # for more info on how dataset is loaded check load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py\n",
" dataset_provider_parameters=HfDatasetParams(repo_id=\"imdatta0/ultrachat_1k\"),\n",
" train_parameters=HuggingFaceTrainParams(\n",
" lora_config=LoraConfig(\n",
" r=8,\n",
" lora_alpha=8,\n",
" lora_dropout=0.1,\n",
" bias=\"none\",\n",
" task_type=\"CAUSAL_LM\",\n",
" ),\n",
" training_parameters=TrainingArguments(\n",
" num_train_epochs=1,\n",
" per_device_train_batch_size=1,\n",
" gradient_accumulation_steps=1,\n",
" gradient_checkpointing=True,\n",
" gradient_checkpointing_kwargs={\n",
" \"use_reentrant\": False\n",
" }, # this is mandatory if checkpointng is enabled\n",
" warmup_steps=0.02,\n",
" learning_rate=1,\n",
" lr_scheduler_type=\"cosine\",\n",
" bf16=False,\n",
" logging_steps=0.01,\n",
" output_dir=INIT_CONTAINER_MOUNT_PATH,\n",
" optim=f\"sgd\",\n",
" save_steps=0.01,\n",
" save_total_limit=3,\n",
" disable_tqdm=False,\n",
" resume_from_checkpoint=True,\n",
" remove_unused_columns=True,\n",
" ),\n",
" ),\n",
" resources_per_worker={\n",
" \"gpu\": 1,\n",
" \"cpu\": 8,\n",
" \"memory\": \"8Gi\",\n",
" }, # remove the gpu key if you don't want to attach gpus to the pods\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# check the logs of the job\n",
"client.get_job_logs(name=\"huggingface-test\", job_kind=constants.PYTORCHJOB_KIND)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
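Once the run has finished, the PyTorchJob created by train() can be removed with the same client. A brief, hedged sketch (it assumes the installed SDK version exposes TrainingClient.delete_job):

# Hedged sketch: clean up the PyTorchJob created by client.train() above.
# Assumes TrainingClient.delete_job(name, job_kind=...) exists in this SDK version.
from kubeflow.training import constants
from kubeflow.training.api.training_client import TrainingClient

client = TrainingClient()
client.delete_job(name="huggingface-test", job_kind=constants.PYTORCHJOB_KIND)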
18 changes: 18 additions & 0 deletions sdk/python/kubeflow/trainer/hf_dockerfile
@@ -0,0 +1,18 @@
# Use the NVIDIA PyTorch runtime as the parent image
FROM nvcr.io/nvidia/pytorch:23.10-py3

# Set the working directory in the container
WORKDIR /app

# Copy the Python package and its source code into the container
COPY . /app

# Copy the requirements.txt file into the container
COPY requirements.txt /app/requirements.txt

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Run hf_llm_training.py via torchrun when the container launches
ENTRYPOINT ["torchrun", "hf_llm_training.py"]

124 changes: 124 additions & 0 deletions sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -0,0 +1,124 @@
import argparse
import transformers
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
AutoConfig,
TrainingArguments,
DataCollatorForLanguageModeling,
Trainer,
)
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from urllib.parse import urlparse
import os
import json


def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
# Set up the model and tokenizer
parsed_uri = urlparse(model_uri)
model_name = parsed_uri.netloc + parsed_uri.path
transformer_type_class = getattr(transformers, transformer_type)

model = transformer_type_class.from_pretrained(
pretrained_model_name_or_path=model_name,
cache_dir=model_dir,
local_files_only=True,
device_map="auto",
trust_remote_code=True,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_name,
cache_dir=model_dir,
local_files_only=True,
device_map="auto",
)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_pad_token = True

# Freeze model parameters
for param in model.parameters():
param.requires_grad = False

return model, tokenizer


def load_and_preprocess_data(dataset_name, dataset_dir, transformer_type, tokenizer):
# Load and preprocess the dataset
print("loading dataset")
transformer_type_class = getattr(transformers, transformer_type)
if transformer_type_class != transformers.AutoModelForImageClassification:
dataset = load_dataset(dataset_name, cache_dir=dataset_dir).map(
lambda x: tokenizer(x["text"]), batched=True
)
else:
dataset = load_dataset(dataset_name, cache_dir=dataset_dir)

train_data = dataset["train"]

try:
eval_data = dataset["eval"]
except Exception as err:
eval_data = None
print("Evaluation dataset is not found")

return train_data, eval_data


def setup_peft_model(model, lora_config):
# Set up the PEFT model
lora_config = LoraConfig(**json.loads(lora_config))
model.enable_input_require_grads()
model = get_peft_model(model, lora_config)
return model


def train_model(model, train_data, eval_data, tokenizer, train_args):
# Train the model
trainer = Trainer(
model=model,
train_dataset=train_data,
eval_dataset=eval_data,
tokenizer=tokenizer,
args=train_args,
data_collator=DataCollatorForLanguageModeling(
tokenizer, pad_to_multiple_of=8, mlm=False
),
)
trainer.train()
print("training done")


def parse_arguments():
parser = argparse.ArgumentParser(
description="Script for training a model with PEFT configuration."
)

parser.add_argument("--model_uri", help="model uri")
parser.add_argument("--transformer_type", help="model transformer type")
parser.add_argument("--model_dir", help="directory containing model")
parser.add_argument("--dataset_dir", help="directory containing dataset")
parser.add_argument("--dataset_name", help="dataset name")
Member:
Do we add the dataset_name argument for users who want to use this Trainer without the SDK client?
I am asking because in the SDK client we always download the dataset in the storage initializer and store it in the Trainer volume, so we don't need to provide the name.

Contributor Author:
In the same dataset_dir there can be multiple datasets, right?

Member:
But can we use the train API to download more than one dataset?
E.g. in your example, you only download the ultrachat_10k dataset.

Contributor Author:
Yes, if I run with a different dataset name, it will work fine.
@andreyvelich

Member:
Yeah, but every API execution creates a new PyTorchJob and spins up a new Trainer image, so the dataset always represents a single name, doesn't it?
parser.add_argument("--lora_config", help="lora_config")
parser.add_argument(
"--training_parameters", help="hugging face training parameters"
)

return parser.parse_args()


if __name__ == "__main__":
args = parse_arguments()
train_args = TrainingArguments(**json.loads(args.training_parameters))
model, tokenizer = setup_model_and_tokenizer(
args.model_uri, args.transformer_type, args.model_dir
)
train_data, eval_data = load_and_preprocess_data(
args.dataset_name, args.dataset_dir, args.transformer_type, tokenizer
)
model = setup_peft_model(model, args.lora_config)
train_model(model, train_data, eval_data, tokenizer, train_args)
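Following up on the review thread above about --dataset_name: the flags defined in parse_arguments also make it possible to launch this trainer by hand, without the SDK creating the PyTorchJob. Below is a minimal sketch of such a standalone invocation; the /workspace paths are illustrative assumptions, and the model and dataset are expected to have been downloaded into model_dir and dataset_dir beforehand (e.g. by the storage initializer).

# Hedged sketch only: invoke hf_llm_training.py directly with the same flags the SDK would pass.
import json
import subprocess

training_parameters = json.dumps(
    {
        "num_train_epochs": 1,
        "per_device_train_batch_size": 1,
        "output_dir": "/workspace/output",
    }
)
lora_config = json.dumps({"r": 8, "lora_alpha": 8, "task_type": "CAUSAL_LM"})

subprocess.run(
    [
        "torchrun", "hf_llm_training.py",
        "--model_uri", "hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "--transformer_type", "AutoModelForCausalLM",
        "--model_dir", "/workspace/model",
        "--dataset_dir", "/workspace/dataset",
        "--dataset_name", "imdatta0/ultrachat_1k",
        "--lora_config", lora_config,
        "--training_parameters", training_parameters,
    ],
    check=True,
)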
5 changes: 5 additions & 0 deletions sdk/python/kubeflow/trainer/requirements.txt
@@ -0,0 +1,5 @@
peft>=0.3.0
datasets==2.15.0
transformers>=4.20.0
bitsandbytes>=0.42.0
einops>=0.6.1
12 changes: 10 additions & 2 deletions sdk/python/kubeflow/training/api/training_client.py
@@ -171,8 +171,16 @@ def train(
),
)
except Exception as e:
pass # local
# raise RuntimeError("failed to create pvc")
pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace)
# Check if the PVC with the specified name exists
for pvc in pvc_list.items:
if pvc.metadata.name == constants.TRAINER_PVC_NAME:
print(
f"PVC '{constants.TRAINER_PVC_NAME}' already exists in namespace '{namespace}'."
)
break
else:
raise RuntimeError("failed to create pvc")

if isinstance(model_provider_parameters, HuggingFaceModelParams):
mp = "hf"
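The reworked except branch above tolerates a pre-existing trainer PVC instead of silently swallowing the error. For reference, a minimal sketch of the create-or-reuse pattern it implements, written against the standard Kubernetes Python client (the PVC name, size, and namespace are illustrative, not the SDK's actual constants):

# Hedged sketch of create-or-reuse for a PVC; names and sizes are placeholders.
from kubernetes import client, config
from kubernetes.client.rest import ApiException

config.load_kube_config()
core_api = client.CoreV1Api()

pvc = client.V1PersistentVolumeClaim(
    metadata=client.V1ObjectMeta(name="storage-initializer"),  # illustrative name
    spec=client.V1PersistentVolumeClaimSpec(
        access_modes=["ReadWriteOnce"],
        resources=client.V1ResourceRequirements(requests={"storage": "10Gi"}),
    ),
)

try:
    core_api.create_namespaced_persistent_volume_claim("default", pvc)
except ApiException as e:
    if e.status == 409:  # AlreadyExists: reuse the PVC from a previous run
        print("PVC already exists; reusing it")
    else:
        raise RuntimeError("failed to create pvc") from e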
6 changes: 2 additions & 4 deletions sdk/python/kubeflow/training/utils/utils.py
@@ -131,7 +131,6 @@ def get_container_spec(
raise ValueError("container name or image cannot be none")

container_spec = models.V1Container(name=name, image=image)
container_spec.image_pull_policy = "Always"
if args:
container_spec.args = args

@@ -175,8 +174,7 @@ def get_pod_template_spec(
name=constants.JOB_PARAMETERS[job_kind]["container"],
image=base_image,
)
],
image_pull_secrets=[models.V1LocalObjectReference(name="regcred")],
]
),
)

@@ -302,7 +300,7 @@ def get_pytorchjob_template(
master_pod_template_spec: models.V1PodTemplateSpec = None,
worker_pod_template_spec: models.V1PodTemplateSpec = None,
num_worker_replicas: Optional[int] = None,
num_procs_per_worker: Optional[int] = None,
num_procs_per_worker: Optional[int] = 0,
elastic_policy: Optional[models.KubeflowOrgV1ElasticPolicy] = None,
):
# Check if at least one replica is set.