diff --git a/intel_extension_for_transformers/llm/finetuning/finetuning.py b/intel_extension_for_transformers/llm/finetuning/finetuning.py index bc26b6d15b6..f520c515b8c 100644 --- a/intel_extension_for_transformers/llm/finetuning/finetuning.py +++ b/intel_extension_for_transformers/llm/finetuning/finetuning.py @@ -445,12 +445,10 @@ def concatenate_data(dataset, max_seq_length): ) if training_args.do_eval: - if "test" not in tokenized_datasets: - self.logger.info('Splitting train dataset in train and validation according to `eval_dataset_size`') - tokenized_datasets = tokenized_datasets["train"].train_test_split( - test_size=data_args.eval_dataset_size, shuffle=True, seed=42 - ) - eval_dataset = tokenized_datasets["test"] + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + + eval_dataset = tokenized_datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index 881effab4ef..e3b4b254958 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -228,9 +228,6 @@ class DataArguments: ) }, ) - eval_dataset_size: int = field( - default=500, metadata={"help": "Size of validation dataset."} - ) streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) preprocessing_num_workers: Optional[int] = field( default=None,