Adding Training image needed for train api #1963

Merged: 14 commits, Jan 11, 2024
14 changes: 11 additions & 3 deletions .github/workflows/publish-core-images.yaml
@@ -10,8 +10,9 @@ jobs:
uses: ./.github/workflows/build-and-publish-images.yaml
with:
component-name: ${{ matrix.component-name }}
platforms: linux/amd64,linux/arm64,linux/ppc64le
platforms: ${{ matrix.platforms }}
dockerfile: ${{ matrix.dockerfile }}
context: ${{ matrix.context }}
secrets:
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -22,8 +23,15 @@
include:
- component-name: training-operator
dockerfile: build/images/training-operator/Dockerfile
platforms: linux/amd64,linux/arm64,linux/ppc64le
- component-name: kubectl-delivery
dockerfile: build/images/kubectl-delivery/Dockerfile
platforms: linux/amd64,linux/arm64,linux/ppc64le
- component-name: storage-initializer
dockerfile: sdk/python/kubeflow/storage_initializer/Dockerfile
context: sdk/python/kubeflow/storage_initializer
platforms: linux/amd64,linux/arm64
- component-name: trainer-huggingface
dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile
context: sdk/python/kubeflow/trainer
platforms: linux/amd64,linux/arm64
1 change: 0 additions & 1 deletion .github/workflows/publish-example-images.yaml
@@ -52,7 +52,6 @@ jobs:
- component-name: mxnet-auto-tuning
dockerfile: examples/mxnet/tune/Dockerfile
context: examples/mxnet/tune

# TODO (tenzen-y): Fix the below broken Dockerfiles
# - component-name: pytorch-dist-mnist-mpi
# dockerfile: examples/pytorch/mnist/Dockerfile-mpi
136 changes: 136 additions & 0 deletions examples/sdk/train_api.ipynb
@@ -0,0 +1,136 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install kubeflow-training extra 'huggingface'\n",
"!pip install -U 'kubeflow-training[huggingface]'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# import the libraries\n",
"from kubeflow.training.api.training_client import TrainingClient\n",
"from kubeflow.storage_initializer.hugging_face import (\n",
" HuggingFaceModelParams,\n",
" HuggingFaceTrainParams,\n",
" HfDatasetParams,\n",
")\n",
"from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH\n",
"from peft import LoraConfig\n",
"import transformers\n",
"from transformers import TrainingArguments\n",
"from kubeflow.training import constants"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# create a training client, pass config_file parameter if you want to use kubeconfig other than \"~/.kube/config\"\n",
"client = TrainingClient()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# mention the model, datasets and training parameters\n",
"client.train(\n",
" name=\"huggingface-test\",\n",
" num_workers=2,\n",
" num_procs_per_worker=1,\n",
" # specify the storage class if you don't want to use the default one for the storage-initializer PVC\n",
" # storage_config={\n",
" # \"size\": \"10Gi\",\n",
" # \"storage_class\": \"<your storage class>\",\n",
" # },\n",
" model_provider_parameters=HuggingFaceModelParams(\n",
" model_uri=\"hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n",
" transformer_type=transformers.AutoModelForCausalLM,\n",
" ),\n",
" # it is assumed for text related tasks, you have 'text' column in the dataset.\n",
" # for more info on how dataset is loaded check load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py\n",
" dataset_provider_parameters=HfDatasetParams(repo_id=\"imdatta0/ultrachat_1k\"),\n",
" train_parameters=HuggingFaceTrainParams(\n",
" lora_config=LoraConfig(\n",
" r=8,\n",
" lora_alpha=8,\n",
" lora_dropout=0.1,\n",
" bias=\"none\",\n",
" task_type=\"CAUSAL_LM\",\n",
" ),\n",
" training_parameters=TrainingArguments(\n",
" num_train_epochs=1,\n",
" per_device_train_batch_size=1,\n",
" gradient_accumulation_steps=1,\n",
" gradient_checkpointing=True,\n",
" gradient_checkpointing_kwargs={\n",
" \"use_reentrant\": False\n",
" }, # this is mandatory if checkpointng is enabled\n",
" warmup_steps=0.02,\n",
" learning_rate=1,\n",
" lr_scheduler_type=\"cosine\",\n",
" bf16=False,\n",
" logging_steps=0.01,\n",
" output_dir=INIT_CONTAINER_MOUNT_PATH,\n",
" optim=f\"sgd\",\n",
" save_steps=0.01,\n",
" save_total_limit=3,\n",
" disable_tqdm=False,\n",
" resume_from_checkpoint=True,\n",
" remove_unused_columns=True,\n",
" ),\n",
" ),\n",
" resources_per_worker={\n",
" \"gpu\": 1,\n",
" \"cpu\": 8,\n",
" \"memory\": \"8Gi\",\n",
" }, # remove the gpu key if you don't want to attach gpus to the pods\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# check the logs of the job\n",
"client.get_job_logs(name=\"huggingface-test\", job_kind=constants.PYTORCHJOB_KIND)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
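Once the run has finished, the PyTorchJob created by train() can be removed with the same client. A brief, hedged sketch (it assumes the installed SDK version exposes TrainingClient.delete_job):

# Hedged sketch: clean up the PyTorchJob created by client.train() above.
# Assumes TrainingClient.delete_job(name, job_kind=...) exists in this SDK version.
from kubeflow.training import constants
from kubeflow.training.api.training_client import TrainingClient

client = TrainingClient()
client.delete_job(name="huggingface-test", job_kind=constants.PYTORCHJOB_KIND)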
18 changes: 18 additions & 0 deletions sdk/python/kubeflow/trainer/hf_dockerfile
@@ -0,0 +1,18 @@
# Use the NVIDIA PyTorch runtime as the parent image
FROM nvcr.io/nvidia/pytorch:23.10-py3

# Set the working directory in the container
WORKDIR /app

# Copy the Python package and its source code into the container
COPY . /app

# Copy the requirements.txt file into the container
COPY requirements.txt /app/requirements.txt

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Run hf_llm_training.py via torchrun when the container launches
ENTRYPOINT ["torchrun", "hf_llm_training.py"]

124 changes: 124 additions & 0 deletions sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -0,0 +1,124 @@
import argparse
import transformers
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
AutoConfig,
TrainingArguments,
DataCollatorForLanguageModeling,
Trainer,
)
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from urllib.parse import urlparse
import os
import json


def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
# Set up the model and tokenizer
parsed_uri = urlparse(model_uri)
model_name = parsed_uri.netloc + parsed_uri.path
transformer_type_class = getattr(transformers, transformer_type)

model = transformer_type_class.from_pretrained(
pretrained_model_name_or_path=model_name,
cache_dir=model_dir,
local_files_only=True,
device_map="auto",
trust_remote_code=True,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_name,
cache_dir=model_dir,
local_files_only=True,
device_map="auto",
)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_pad_token = True

# Freeze model parameters
for param in model.parameters():
param.requires_grad = False

return model, tokenizer


def load_and_preprocess_data(dataset_name, dataset_dir, transformer_type, tokenizer):
# Load and preprocess the dataset
print("loading dataset")
transformer_type_class = getattr(transformers, transformer_type)
if transformer_type_class != transformers.AutoModelForImageClassification:
dataset = load_dataset(dataset_name, cache_dir=dataset_dir).map(
lambda x: tokenizer(x["text"]), batched=True
)
else:
dataset = load_dataset(dataset_name, cache_dir=dataset_dir)

train_data = dataset["train"]

try:
eval_data = dataset["eval"]
except Exception as err:
eval_data = None
print("Evaluation dataset is not found")

return train_data, eval_data


def setup_peft_model(model, lora_config):
# Set up the PEFT model
lora_config = LoraConfig(**json.loads(lora_config))
model.enable_input_require_grads()
model = get_peft_model(model, lora_config)
return model


def train_model(model, train_data, eval_data, tokenizer, train_args):
# Train the model
trainer = Trainer(
model=model,
train_dataset=train_data,
eval_dataset=eval_data,
tokenizer=tokenizer,
args=train_args,
data_collator=DataCollatorForLanguageModeling(
tokenizer, pad_to_multiple_of=8, mlm=False
),
)
trainer.train()
print("training done")


def parse_arguments():
parser = argparse.ArgumentParser(
description="Script for training a model with PEFT configuration."
)

parser.add_argument("--model_uri", help="model uri")
parser.add_argument("--transformer_type", help="model transformer type")
parser.add_argument("--model_dir", help="directory containing model")
parser.add_argument("--dataset_dir", help="directory containing dataset")
parser.add_argument("--dataset_name", help="dataset name")
Member:
Do we add the dataset_name argument for users who want to use this Trainer without the SDK client?
I am asking because in the SDK client we always download the dataset in the storage initializer and store it in the Trainer volume, so we don't need to provide the name.

Contributor Author:
In the same dataset_dir there can be multiple datasets, right?

Member:
But can we use the train API to download more than one dataset?
E.g. in your example, you only download the ultrachat_10k dataset.

Contributor Author:
Yes, if I run with a different dataset name, it will work fine.
@andreyvelich

Member:
Yeah, but every API execution creates a new PyTorchJob and spins up a new Trainer image, so the dataset always represents a single name, doesn't it?
parser.add_argument("--lora_config", help="lora_config")
parser.add_argument(
"--training_parameters", help="hugging face training parameters"
)

return parser.parse_args()


if __name__ == "__main__":
args = parse_arguments()
train_args = TrainingArguments(**json.loads(args.training_parameters))
model, tokenizer = setup_model_and_tokenizer(
args.model_uri, args.transformer_type, args.model_dir
)
train_data, eval_data = load_and_preprocess_data(
args.dataset_name, args.dataset_dir, args.transformer_type, tokenizer
)
model = setup_peft_model(model, args.lora_config)
train_model(model, train_data, eval_data, tokenizer, train_args)
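Following up on the review thread above about --dataset_name: the flags defined in parse_arguments also make it possible to launch this trainer by hand, without the SDK creating the PyTorchJob. Below is a minimal sketch of such a standalone invocation; the /workspace paths are illustrative assumptions, and the model and dataset are expected to have been downloaded into model_dir and dataset_dir beforehand (e.g. by the storage initializer).

# Hedged sketch only: invoke hf_llm_training.py directly with the same flags the SDK would pass.
import json
import subprocess

training_parameters = json.dumps(
    {
        "num_train_epochs": 1,
        "per_device_train_batch_size": 1,
        "output_dir": "/workspace/output",
    }
)
lora_config = json.dumps({"r": 8, "lora_alpha": 8, "task_type": "CAUSAL_LM"})

subprocess.run(
    [
        "torchrun", "hf_llm_training.py",
        "--model_uri", "hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "--transformer_type", "AutoModelForCausalLM",
        "--model_dir", "/workspace/model",
        "--dataset_dir", "/workspace/dataset",
        "--dataset_name", "imdatta0/ultrachat_1k",
        "--lora_config", lora_config,
        "--training_parameters", training_parameters,
    ],
    check=True,
)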
5 changes: 5 additions & 0 deletions sdk/python/kubeflow/trainer/requirements.txt
@@ -0,0 +1,5 @@
peft>=0.3.0
datasets==2.15.0
transformers>=4.20.0
bitsandbytes>=0.42.0
einops>=0.6.1
12 changes: 10 additions & 2 deletions sdk/python/kubeflow/training/api/training_client.py
@@ -171,8 +171,16 @@ def train(
),
)
except Exception as e:
pass # local
# raise RuntimeError("failed to create pvc")
pvc_list = self.core_api.list_namespaced_persistent_volume_claim(namespace)
# Check if the PVC with the specified name exists
for pvc in pvc_list.items:
if pvc.metadata.name == constants.TRAINER_PVC_NAME:
print(
f"PVC '{constants.TRAINER_PVC_NAME}' already exists in namespace '{namespace}'."
)
break
else:
raise RuntimeError("failed to create pvc")

if isinstance(model_provider_parameters, HuggingFaceModelParams):
mp = "hf"
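The reworked except branch above tolerates a pre-existing trainer PVC instead of silently swallowing the error. For reference, a minimal sketch of the create-or-reuse pattern it implements, written against the standard Kubernetes Python client (the PVC name, size, and namespace are illustrative, not the SDK's actual constants):

# Hedged sketch of create-or-reuse for a PVC; names and sizes are placeholders.
from kubernetes import client, config
from kubernetes.client.rest import ApiException

config.load_kube_config()
core_api = client.CoreV1Api()

pvc = client.V1PersistentVolumeClaim(
    metadata=client.V1ObjectMeta(name="storage-initializer"),  # illustrative name
    spec=client.V1PersistentVolumeClaimSpec(
        access_modes=["ReadWriteOnce"],
        resources=client.V1ResourceRequirements(requests={"storage": "10Gi"}),
    ),
)

try:
    core_api.create_namespaced_persistent_volume_claim("default", pvc)
except ApiException as e:
    if e.status == 409:  # AlreadyExists: reuse the PVC from a previous run
        print("PVC already exists; reusing it")
    else:
        raise RuntimeError("failed to create pvc") from e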
6 changes: 2 additions & 4 deletions sdk/python/kubeflow/training/utils/utils.py
@@ -131,7 +131,6 @@ def get_container_spec(
raise ValueError("container name or image cannot be none")

container_spec = models.V1Container(name=name, image=image)
container_spec.image_pull_policy = "Always"
if args:
container_spec.args = args

@@ -175,8 +174,7 @@ def get_pod_template_spec(
name=constants.JOB_PARAMETERS[job_kind]["container"],
image=base_image,
)
],
image_pull_secrets=[models.V1LocalObjectReference(name="regcred")],
]
),
)

@@ -302,7 +300,7 @@ def get_pytorchjob_template(
master_pod_template_spec: models.V1PodTemplateSpec = None,
worker_pod_template_spec: models.V1PodTemplateSpec = None,
num_worker_replicas: Optional[int] = None,
num_procs_per_worker: Optional[int] = None,
num_procs_per_worker: Optional[int] = 0,
elastic_policy: Optional[models.KubeflowOrgV1ElasticPolicy] = None,
):
# Check if at least one replica is set.