diff --git a/.github/workflows/publish-sdk-images.yaml b/.github/workflows/publish-sdk-images.yaml index 0fe412d712..86e707a0b5 100644 --- a/.github/workflows/publish-sdk-images.yaml +++ b/.github/workflows/publish-sdk-images.yaml @@ -19,5 +19,5 @@ jobs: fail-fast: false matrix: include: - - component-name: train-api-training-image - dockerfile: sdk/python/kubeflow/training/training_container/Dockerfile \ No newline at end of file + - component-name: train-api-hf-image + dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile diff --git a/sdk/python/kubeflow/training/training_container/Dockerfile b/sdk/python/kubeflow/trainer/hf_dockerfile similarity index 91% rename from sdk/python/kubeflow/training/training_container/Dockerfile rename to sdk/python/kubeflow/trainer/hf_dockerfile index d80edcf2c2..d03c458238 100644 --- a/sdk/python/kubeflow/training/training_container/Dockerfile +++ b/sdk/python/kubeflow/trainer/hf_dockerfile @@ -14,4 +14,5 @@ FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime RUN pip install --no-cache-dir -r requirements.txt # Run storage.py when the container launches - ENTRYPOINT ["python", "hf_llm_training.py"] \ No newline at end of file + ENTRYPOINT ["python", "hf_llm_training.py"] + \ No newline at end of file diff --git a/sdk/python/kubeflow/training/training_container/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py similarity index 99% rename from sdk/python/kubeflow/training/training_container/hf_llm_training.py rename to sdk/python/kubeflow/trainer/hf_llm_training.py index 6a8749570f..23ab4bb407 100644 --- a/sdk/python/kubeflow/training/training_container/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -38,7 +38,7 @@ def load_and_preprocess_data(dataset_dir, tokenizer): train_data = load_dataset(dataset_dir, split="train").map( lambda x: tokenizer(x["text"]), batched=True ) - train_data = train_data.train_test_split(shuffle=True, test_size=200) + train_data = train_data.train_test_split(shuffle=True, test_size=0.1) try: eval_data = load_dataset(dataset_dir, split="eval") diff --git a/sdk/python/kubeflow/training/training_container/requirements.txt b/sdk/python/kubeflow/trainer/requirements.txt similarity index 100% rename from sdk/python/kubeflow/training/training_container/requirements.txt rename to sdk/python/kubeflow/trainer/requirements.txt