diff --git a/sdk/python/kubeflow/training/training_container/hf_llm_training.py b/sdk/python/kubeflow/training/training_container/hf_llm_training.py
index 4c44243d51..6a8749570f 100644
--- a/sdk/python/kubeflow/training/training_container/hf_llm_training.py
+++ b/sdk/python/kubeflow/training/training_container/hf_llm_training.py
@@ -5,7 +5,7 @@
     AutoConfig,
     TrainingArguments,
     DataCollatorForLanguageModeling,
-    Trainer
+    Trainer,
 )
 import torch
 from datasets import load_dataset
@@ -14,13 +14,15 @@
 
 def setup_model_and_tokenizer(token_dir, model_dir):
     # Set up the model and tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(token_dir, use_fast=False, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(
+        token_dir, use_fast=False, trust_remote_code=True
+    )
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.add_pad_token = True
 
     model = AutoModelForCausalLM.from_pretrained(
         model_dir,
-        device_map='auto',
+        device_map="auto",
         trust_remote_code=True,
     )
 
@@ -30,25 +32,30 @@ def setup_model_and_tokenizer(token_dir, model_dir):
 
     return model, tokenizer
 
+
 def load_and_preprocess_data(dataset_dir, tokenizer):
     # Load and preprocess the dataset
-    train_data = load_dataset(dataset_dir, split='train').map(lambda x: tokenizer(x['text']), batched=True)
+    train_data = load_dataset(dataset_dir, split="train").map(
+        lambda x: tokenizer(x["text"]), batched=True
+    )
     train_data = train_data.train_test_split(shuffle=True, test_size=200)
 
     try:
-        eval_data = load_dataset(dataset_dir, split='eval')
+        eval_data = load_dataset(dataset_dir, split="eval")
     except Exception as err:
         eval_data = None
 
     return train_data, eval_data
 
+
 def setup_peft_model(model, lora_config):
     # Set up the PEFT model
     lora_config = LoraConfig(**lora_config)
     model = get_peft_model(model, lora_config)
     return model
 
-def train_model(model, train_data, eval_data,tokenizer, train_params):
+
+def train_model(model, train_data, eval_data, tokenizer, train_params):
     # Train the model
     trainer = Trainer(
         model=model,
@@ -58,26 +65,27 @@ def train_model(model, train_data, eval_data,tokenizer, train_params):
         args=TrainingArguments(
             **train_params,
             data_collator=DataCollatorForLanguageModeling(
-                tokenizer,
-                pad_to_multiple_of=8,
-                return_tensors="pt",
-                mlm=False
+                tokenizer, pad_to_multiple_of=8, return_tensors="pt", mlm=False
             )
-        )
+        ),
     )
     trainer.train()
 
+
 def parse_arguments():
-    parser = argparse.ArgumentParser(description='Script for training a model with PEFT configuration.')
-    parser.add_argument('--model_dir', help='directory containing model')
-    parser.add_argument('--token_dir', help='directory containing tokenizer')
-    parser.add_argument('--dataset_dir', help='directory contaning dataset')
-    parser.add_argument('--peft_config', help='peft_config')
-    parser.add_argument('--train_params', help='hugging face training parameters')
+    parser = argparse.ArgumentParser(
+        description="Script for training a model with PEFT configuration."
+    )
+    parser.add_argument("--model_dir", help="directory containing model")
+    parser.add_argument("--token_dir", help="directory containing tokenizer")
+    parser.add_argument("--dataset_dir", help="directory containing dataset")
+    parser.add_argument("--peft_config", help="peft_config")
+    parser.add_argument("--train_params", help="hugging face training parameters")
 
     return parser.parse_args()
 
+
 if __name__ == "__main__":
     args = parse_arguments()
     model, tokenizer = setup_model_and_tokenizer(args.token_dir, args.model_dir)
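
Review note on the collator wiring, which this formatting pass carries over unchanged: data_collator is a parameter of Trainer, not of TrainingArguments, so constructing the collator inside TrainingArguments(...) will fail with a TypeError when TrainingArguments rejects the unexpected keyword. A minimal sketch of the corrected call follows, keeping the names defined in this file; the train_dataset, eval_dataset, and tokenizer keywords are assumptions, since those lines of the Trainer call fall outside the diff context.

from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

def train_model(model, train_data, eval_data, tokenizer, train_params):
    # Sketch only: the collator moves out of TrainingArguments and becomes a
    # direct keyword argument of Trainer, which is where transformers accepts it.
    trainer = Trainer(
        model=model,
        # Assumed dataset wiring: train_data is the DatasetDict produced by
        # train_test_split() in load_and_preprocess_data above.
        train_dataset=train_data["train"],
        eval_dataset=eval_data,
        tokenizer=tokenizer,
        args=TrainingArguments(**train_params),
        data_collator=DataCollatorForLanguageModeling(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", mlm=False
        ),
    )
    trainer.train()

Keeping mlm=False matches the causal-LM objective implied by AutoModelForCausalLM: the collator then builds labels for next-token prediction rather than masked-LM targets.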
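
Relatedly, setup_peft_model unpacks the user-supplied mapping straight into peft's LoraConfig, so the keys of --peft_config must be valid LoraConfig fields. A hypothetical example of such a mapping; the field names r, lora_alpha, lora_dropout, and task_type are real LoraConfig parameters, but the values shown are illustrative only and not taken from this repository.

from peft import LoraConfig

# Illustrative values only; a real run would supply these via --peft_config.
lora_config = {
    "r": 8,                    # rank of the LoRA update matrices
    "lora_alpha": 16,          # scaling factor applied to the update
    "lora_dropout": 0.05,      # dropout on the LoRA layers
    "task_type": "CAUSAL_LM",  # matches the AutoModelForCausalLM setup above
}
config = LoraConfig(**lora_config)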