Commit

downgrading pytorch version, removing changes for running things locally, adding jupyter notebook
deepanker13 committed Jan 11, 2024
1 parent 112c581 commit 1034403
Showing 7 changed files with 170 additions and 89 deletions.
133 changes: 133 additions & 0 deletions examples/sdk/train_api.ipynb
@@ -0,0 +1,133 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install kubeflow-training extra 'huggingface'\n",
"!pip install -U 'kubeflow-training[huggingface]'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# import the libraries\n",
"from kubeflow.training.api.training_client import TrainingClient\n",
"from kubeflow.storage_initializer.hugging_face import (\n",
" HuggingFaceModelParams,\n",
" HuggingFaceTrainParams,\n",
" HfDatasetParams,\n",
")\n",
"from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH\n",
"from peft import LoraConfig\n",
"import transformers\n",
"from transformers import TrainingArguments\n",
"from kubeflow.training import constants"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# create a training client, pass config_file parameter if you want to use kubeconfig other than \"~/.kube/config\"\n",
"client = TrainingClient()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# mention the model, datasets and training parameters\n",
"client.train(\n",
" name=\"huggingface-test\",\n",
" num_workers=2,\n",
" num_procs_per_worker=1,\n",
" # specify the storage class if you don't want to use the default one for the storage-initializer PVC\n",
" # storage_config={\n",
" # \"size\": \"10Gi\",\n",
" # \"storage_class\": \"<your storage class>\",\n",
" # },\n",
" model_provider_parameters=HuggingFaceModelParams(\n",
" model_uri=\"hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n",
" transformer_type=transformers.AutoModelForCausalLM,\n",
" ),\n",
" # it is assumed for text related tasks, you have 'text' column in the dataset.\n",
" # for more info on how dataset is loaded check load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py\n",
" dataset_provider_parameters=HfDatasetParams(repo_id=\"imdatta0/ultrachat_1k\"),\n",
" train_parameters=HuggingFaceTrainParams(\n",
" lora_config=LoraConfig(\n",
" r=8,\n",
" lora_alpha=8,\n",
" lora_dropout=0.1,\n",
" bias=\"none\",\n",
" task_type=\"CAUSAL_LM\",\n",
" ),\n",
" training_parameters=TrainingArguments(\n",
" num_train_epochs=1,\n",
" per_device_train_batch_size=4,\n",
" gradient_accumulation_steps=4,\n",
" gradient_checkpointing=True,\n",
" warmup_steps=0.02,\n",
" learning_rate=1,\n",
" lr_scheduler_type=\"cosine\",\n",
" bf16=False,\n",
" logging_steps=0.01,\n",
" output_dir=INIT_CONTAINER_MOUNT_PATH,\n",
" optim=f\"sgd\",\n",
" save_steps=0.01,\n",
" save_total_limit=3,\n",
" disable_tqdm=False,\n",
" resume_from_checkpoint=True,\n",
" remove_unused_columns=True,\n",
" ),\n",
" ),\n",
" resources_per_worker={\n",
" \"gpu\": 1,\n",
" \"cpu\": 8,\n",
" \"memory\": \"16Gi\",\n",
" }, # remove the gpu key if you don't want to attach gpus to the pods\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# check the logs of the job\n",
"client.get_job_logs(name=\"huggingface-test\", job_kind=constants.PYTORCHJOB_KIND)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
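The notebook's final cell only fetches the job logs. As a follow-up, the same TrainingClient can block until the PyTorchJob finishes and then remove it. The sketch below is not part of this commit; the wait_for_job_conditions and delete_job method names and signatures are assumed from the SDK's unified client API and should be verified against the installed kubeflow-training release.

# Hypothetical follow-up cell (not part of this commit).
from kubeflow.training import constants
from kubeflow.training.api.training_client import TrainingClient

client = TrainingClient()

# Block until the PyTorchJob reaches a terminal condition
# (the default expected condition is assumed to be Succeeded).
client.wait_for_job_conditions(
    name="huggingface-test",
    job_kind=constants.PYTORCHJOB_KIND,
)

# Delete the finished job; the storage-initializer PVC created by train()
# is a separate resource and is not removed by this call.
client.delete_job(name="huggingface-test", job_kind=constants.PYTORCHJOB_KIND)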
57 changes: 0 additions & 57 deletions examples/sdk/train_api.py

This file was deleted.

20 changes: 10 additions & 10 deletions sdk/python/kubeflow/trainer/hf_dockerfile
@@ -1,18 +1,18 @@
 # Use an official Pytorch runtime as a parent image
-FROM nvcr.io/nvidia/pytorch:23.12-py3
+FROM nvcr.io/nvidia/pytorch:23.10-py3

-# Set the working directory in the container
-WORKDIR /app
+# Set the working directory in the container
+WORKDIR /app

-# Copy the Python package and its source code into the container
-COPY . /app
+# Copy the Python package and its source code into the container
+COPY . /app

-# Copy the requirements.txt file into the container
+# Copy the requirements.txt file into the container
 COPY requirements.txt /app/requirements.txt

-# Install any needed packages specified in requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
+# Install any needed packages specified in requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt

-# Run storage.py when the container launches
-ENTRYPOINT ["torchrun", "hf_llm_training.py"]
+# Run storage.py when the container launches
+ENTRYPOINT ["torchrun", "hf_llm_training.py"]

36 changes: 21 additions & 15 deletions sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -16,9 +16,8 @@
 import json


-def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
+def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, train_args):
     # Set up the model and tokenizer
-
     parsed_uri = urlparse(model_uri)
     model_name = parsed_uri.netloc + parsed_uri.path
     transformer_type_class = getattr(transformers, transformer_type)
@@ -28,6 +27,7 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
         cache_dir=model_dir,
         local_files_only=True,
         device_map="auto",
+        trust_remote_code=True,
     )

     tokenizer = transformers.AutoTokenizer.from_pretrained(
@@ -47,43 +47,48 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
     return model, tokenizer


-def load_and_preprocess_data(dataset_name, dataset_dir):
+def load_and_preprocess_data(dataset_name, dataset_dir, transformer_type, tokenizer):
     # Load and preprocess the dataset
     print("loading dataset")
-    dataset = load_dataset(dataset_name, cache_dir=dataset_dir)
+    transformer_type_class = getattr(transformers, transformer_type)
+    if transformer_type_class != transformers.AutoModelForImageClassification:
+        dataset = load_dataset(dataset_name, cache_dir=dataset_dir).map(
+            lambda x: tokenizer(x["text"]), batched=True
+        )
+    else:
+        dataset = load_dataset(dataset_name, cache_dir=dataset_dir)

     train_data = dataset["train"]

     try:
         eval_data = dataset["eval"]
     except Exception as err:
         eval_data = None
         print("Evaluation dataset is not found")

     return train_data, eval_data


 def setup_peft_model(model, lora_config):
     # Set up the PEFT model
     lora_config = LoraConfig(**json.loads(lora_config))
     print(lora_config)
     model.enable_input_require_grads()
     model = get_peft_model(model, lora_config)
     return model


-def train_model(model, train_data, eval_data, tokenizer, train_params):
+def train_model(model, train_data, eval_data, tokenizer, train_args):
     # Train the model
     trainer = Trainer(
         model=model,
         train_dataset=train_data,
         eval_dataset=eval_data,
         tokenizer=tokenizer,
-        args=TrainingArguments(
-            **train_params,
-            data_collator=DataCollatorForLanguageModeling(
-                tokenizer, pad_to_multiple_of=8, return_tensors="pt", mlm=False
-            )
+        args=train_args,
+        data_collator=DataCollatorForLanguageModeling(
+            tokenizer, pad_to_multiple_of=8, mlm=False
         ),
     )

     trainer.train()
     print("training done")
@@ -108,11 +113,12 @@ def parse_arguments():

 if __name__ == "__main__":
     args = parse_arguments()
+    train_args = TrainingArguments(**json.loads(args.training_parameters))
     model, tokenizer = setup_model_and_tokenizer(
-        args.model_uri, args.transformer_type, args.model_dir
+        args.model_uri, args.transformer_type, args.model_dir, train_args
     )
     train_data, eval_data = load_and_preprocess_data(
-        args.dataset_name, args.dataset_dir
+        args.dataset_name, args.dataset_dir, args.transformer_type, tokenizer
     )
     model = setup_peft_model(model, args.lora_config)
-    train_model(model, train_data, eval_data, tokenizer, args.training_parameters)
+    train_model(model, train_data, eval_data, tokenizer, train_args)
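The trainer script takes its settings as JSON strings: setup_peft_model() rebuilds the LoRA settings with LoraConfig(**json.loads(lora_config)) and __main__ rebuilds the training settings with TrainingArguments(**json.loads(args.training_parameters)). A minimal sketch of those payloads follows, with flag names inferred from the attributes read in __main__ (verify against parse_arguments()).

# Hypothetical illustration, not part of this commit: the JSON payloads
# hf_llm_training.py expects for its LoRA and TrainingArguments inputs.
import json

# Mirrors the LoraConfig used in the notebook above.
lora_config_json = json.dumps(
    {"r": 8, "lora_alpha": 8, "lora_dropout": 0.1, "bias": "none", "task_type": "CAUSAL_LM"}
)

# output_dir is a placeholder path used here purely for illustration.
training_parameters_json = json.dumps(
    {"num_train_epochs": 1, "per_device_train_batch_size": 4, "output_dir": "/workspace/output"}
)

# Roughly how the container entrypoint would receive them (flag names assumed):
#   torchrun hf_llm_training.py ... \
#       --lora_config "$LORA_CONFIG_JSON" --training_parameters "$TRAINING_PARAMETERS_JSON"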
6 changes: 4 additions & 2 deletions sdk/python/kubeflow/trainer/requirements.txt
@@ -1,3 +1,5 @@
-peft==0.7.0
+peft>=0.3.0
 datasets==2.15.0
-transformers==4.35.2
+transformers>=4.20.0
+bitsandbytes>=0.42.0
+einops>=0.6.1
3 changes: 1 addition & 2 deletions sdk/python/kubeflow/training/api/training_client.py
@@ -171,8 +171,7 @@ def train(
                 ),
             )
         except Exception as e:
-            pass # local
-            # raise RuntimeError("failed to create pvc")
+            raise RuntimeError("failed to create pvc")

         if isinstance(model_provider_parameters, HuggingFaceModelParams):
             mp = "hf"
4 changes: 1 addition & 3 deletions sdk/python/kubeflow/training/utils/utils.py
@@ -131,7 +131,6 @@ def get_container_spec(
         raise ValueError("container name or image cannot be none")

     container_spec = models.V1Container(name=name, image=image)
-    container_spec.image_pull_policy = "Always"
     if args:
         container_spec.args = args

@@ -175,8 +174,7 @@ def get_pod_template_spec(
                     name=constants.JOB_PARAMETERS[job_kind]["container"],
                     image=base_image,
                 )
-            ],
-            image_pull_secrets=[models.V1LocalObjectReference(name="regcred")],
+            ]
         ),
     )

