From 854d6fd0ad24cf7cf654f37828749ae6e3c02cd5 Mon Sep 17 00:00:00 2001
From: deepanker13
Date: Wed, 10 Jan 2024 22:30:57 +0530
Subject: [PATCH] publishing trainer from publish-core-images

---
 .github/workflows/publish-core-images.yaml     |  3 +++
 .github/workflows/publish-example-images.yaml  |  3 ---
 examples/sdk/train_api.py                      | 18 ++++++------------
 sdk/python/kubeflow/trainer/hf_llm_training.py |  7 ++-----
 4 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml
index d24d384cb6..f869114a60 100644
--- a/.github/workflows/publish-core-images.yaml
+++ b/.github/workflows/publish-core-images.yaml
@@ -24,3 +24,6 @@ jobs:
             dockerfile: build/images/training-operator/Dockerfile
           - component-name: kubectl-delivery
             dockerfile: build/images/kubectl-delivery/Dockerfile
+          - component-name: trainer-huggingface
+            dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile
+            context: sdk/python/kubeflow/trainer
\ No newline at end of file
diff --git a/.github/workflows/publish-example-images.yaml b/.github/workflows/publish-example-images.yaml
index a9e805b60b..d3e7f4f549 100644
--- a/.github/workflows/publish-example-images.yaml
+++ b/.github/workflows/publish-example-images.yaml
@@ -52,9 +52,6 @@ jobs:
           - component-name: mxnet-auto-tuning
             dockerfile: examples/mxnet/tune/Dockerfile
             context: examples/mxnet/tune
-          - component-name: train-api-hf-image
-            dockerfile: sdk/python/kubeflow/trainer/hf_dockerfile
-            context: sdk/python/kubeflow/trainer
           # TODO (tenzen-y): Fix the below broken Dockerfiles
           # - component-name: pytorch-dist-mnist-mpi
           #   dockerfile: examples/pytorch/mnist/Dockerfile-mpi
diff --git a/examples/sdk/train_api.py b/examples/sdk/train_api.py
index f6077a132b..783f9ed5d7 100644
--- a/examples/sdk/train_api.py
+++ b/examples/sdk/train_api.py
@@ -1,14 +1,12 @@
 from kubeflow.training.api.training_client import TrainingClient
-from kubeflow.storage_init_container.hugging_face import (
+from kubeflow.storage_initializer.hugging_face import (
     HuggingFaceModelParams,
     HuggingFaceTrainParams,
     HfDatasetParams,
-    TRANSFORMER_TYPES,
 )
-from kubeflow.storage_init_container.s3 import S3DatasetParams
 from peft import LoraConfig
+import transformers
 from transformers import TrainingArguments
-import json
 client = TrainingClient(
     config_file="/Users/deepanker/Downloads/deepanker-test-kubectl.cfg"
 )
@@ -25,13 +23,9 @@
     },
     model_provider_parameters=HuggingFaceModelParams(
         model_uri="hf://Jedalc/codeparrot-gp2-finetune",
-        transformer_type=TRANSFORMER_TYPES.AutoModelForCausalLM,
+        transformer_type=transformers.AutoModelForCausalLM,
     ),
-    dataset_provider_parameters=HfDatasetParams(
-        repo_id="imdatta0/ultrachat_10k",
-        access_token="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",
-    ),
-    # dataset_provider_parameters=S3DatasetParams(endpoint_url="http://10.117.63.3", bucket_name="deepanker-test", file_key="list_roles_response.txt", access_key="qEMHyz8wNwLpUWkvfZmQZrj60TE6zX4p", secret_key="qIp_QNLPKI0LJ5X0F8NrypoSMSsw_Gfe" ),
+    dataset_provider_parameters=HfDatasetParams(repo_id="imdatta0/ultrachat_10k"),
     train_parameters=HuggingFaceTrainParams(
         lora_config=LoraConfig(
             r=8,
@@ -66,8 +60,8 @@
             # eval_accumulation_steps=1,
             per_device_eval_batch_size=1,
             # load_best_model_at_end=True,
-            report_to="wandb",
-            run_name=f"{1}",
+            # report_to="wandb",
+            # run_name=f"{1}",
         ),
     ),
     resources_per_worker={"gpu": 0, "cpu": 8, "memory": "8Gi"},
diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index 6999dc483a..2bccc0ac4a 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -30,8 +30,6 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
         device_map="auto",
     )
 
-    # print(model)
-
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         pretrained_model_name_or_path=model_name,
         cache_dir=model_dir,
@@ -42,8 +40,6 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.add_pad_token = True
 
-    # print(tokenizer)
-
     # Freeze model parameters
     for param in model.parameters():
         param.requires_grad = False
@@ -56,7 +52,7 @@ def load_and_preprocess_data(dataset_name, dataset_dir):
     print("loading dataset")
     dataset = load_dataset(dataset_name, cache_dir=dataset_dir)
     train_data = dataset["train"]
-    # print(train_data)
+
     try:
         eval_data = dataset["eval"]
     except Exception as err:
@@ -89,6 +85,7 @@ def train_model(model, train_data, eval_data, tokenizer, train_params):
     )
 
     trainer.train()
+    print("training done")
 
 
 def parse_arguments():