Improve FSDP + QLora #68
4 failed, 8 skipped, 528 passed in 30m 46s

540 tests   528 ✅   30m 46s ⏱️
1 suite       8 💤
1 file        4 ❌
Results for commit 2bda20f.
Annotations
Check warning on line 0 in src.tests.trainers.test_trainers.TestTrainHFDPO
github-actions / Test Results
test_seq2seq (src.tests.trainers.test_trainers.TestTrainHFDPO) failed
test.xml [took 0s]
Raw output
ModuleNotFoundError: No module named 'src.utils._vendored'
self = <src.tests.trainers.test_trainers.TestTrainHFDPO object at 0x7fcb87961480>
create_datadreamer = <function create_datadreamer.<locals>._create_datadreamer at 0x7fcb56227f40>
mocker = <pytest_mock.plugin.MockerFixture object at 0x7fcb67748970>
def test_seq2seq(self, create_datadreamer, mocker):
with create_datadreamer():
dataset = DataSource(
"Training Data",
data={
"prompts": [
"The color of the sky is",
"Firetrucks are",
"The color of an apple is",
"The color of grass is",
"The color of clouds are",
"The color of the sun is",
],
"chosen": [
" purple",
" bright yellow",
" orange",
" blue",
" red",
" green",
],
"rejected": [
" blue",
" red",
" red",
" green",
" white",
" yellow",
],
},
)
val_dataset = dataset.take(2)
trainer = TrainHFDPO("T5 Trainer", model_name="google/flan-t5-small")
data_collator_spy = mocker.spy(CustomDataCollatorWithPadding, "__call__")
> train_result = trainer.train(
train_prompts=dataset.output["prompts"],
train_chosen=dataset.output["chosen"],
train_rejected=dataset.output["rejected"],
validation_prompts=val_dataset.output["prompts"],
validation_chosen=val_dataset.output["chosen"],
validation_rejected=val_dataset.output["rejected"],
epochs=1,
batch_size=8,
)
src/tests/trainers/test_trainers.py:1714:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src/trainers/train_hf_dpo.py:472: in train
self._setup_folder_and_resume(
src/trainers/trainer.py:291: in _setup_folder_and_resume
self.__setup_folder_and_resume(**kwargs)
src/trainers/trainer.py:271: in __setup_folder_and_resume
self._train(**kwargs)
src/trainers/train_hf_dpo.py:120: in _train
train_dataset, validation_dataset, _, _ = prepare_inputs_and_outputs(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <T5 Trainer (google/flan-t5-small)>
train_columns = {('train_chosen', 'Train Chosen Generations'): OutputDatasetColumn(column_name='chosen', num_rows=6, dataset=<Dataset ...n Rejected Generations'): OutputDatasetColumn(column_name='rejected', num_rows=6, dataset=<Dataset @ 140511020706272>)}
validation_columns = {('validation_chosen', 'Validation Chosen Generations'): OutputDatasetColumn(column_name='chosen', num_rows=2, dataset...n Rejected Generations'): OutputDatasetColumn(column_name='rejected', num_rows=2, dataset=<Dataset @ 140511020698640>)}
truncate = True, causal = False, dpo = True, reward_pairs = False
reward_scores = False
def prepare_inputs_and_outputs( # noqa: C901
self: "_TrainHFBase",
train_columns: dict[
tuple[str, str], OutputDatasetColumn | OutputIterableDatasetColumn
],
validation_columns: dict[
tuple[str, str], OutputDatasetColumn | OutputIterableDatasetColumn
],
truncate: bool = False,
causal: bool = False,
dpo: bool = False,
reward_pairs: bool = False,
reward_scores: bool = False,
) -> tuple[
Dataset | IterableDataset | _SizedIterableDataset,
Dataset | IterableDataset | _SizedIterableDataset,
dict[Any, int],
bool,
]:
num_proc = (
(
len(os.sched_getaffinity(0))
if hasattr(os, "sched_getaffinity")
else os.cpu_count()
)
if sys.platform != "darwin"
else 1
)
label2id: dict[Any, int] = {}
is_multi_target: bool = False
def get_train_column(
column_name: str,
) -> OutputDatasetColumn | OutputIterableDatasetColumn:
for (train_column_name, _), train_column in train_columns.items():
if train_column_name == column_name:
return train_column
raise KeyError(f"Train column {column_name} not found.") # pragma: no cover
def get_validation_column(
column_name: str,
) -> OutputDatasetColumn | OutputIterableDatasetColumn:
for (
validation_column_name,
_,
), validation_column in validation_columns.items():
if validation_column_name == column_name:
return validation_column
raise KeyError(
f"Validation column {column_name} not found."
) # pragma: no cover
def apply_chat_prompt_template(prompt: str) -> str:
return (
cast(str, self.chat_prompt_template)
.replace("{{system_prompt}}", self.system_prompt or "")
.replace("{{prompt}}", prompt)
)
def tokenize_function(
examples,
column_name: str,
new_column_name: str,
causal: bool,
reward_scores: bool,
): # pragma: no cover
if reward_scores:
prompt, completion = examples[column_name]
if self.chat_prompt_template:
prompt = apply_chat_prompt_template(prompt)
input_ids = self.tokenizer(
prompt + completion,
truncation=truncate,
padding=False,
add_special_tokens=True,
)["input_ids"]
return {
"input_ids": input_ids[: self.tokenizer.model_max_length]
if truncate
else input_ids,
"labels": examples["label"],
}
elif causal:
prompt, completion = examples[column_name]
if self.chat_prompt_template:
prompt = apply_chat_prompt_template(prompt)
prompt_input_ids = self.tokenizer(
prompt, truncation=truncate, padding=False, add_special_tokens=True
)["input_ids"]
completion_input_ids = self.tokenizer(
completion, truncation=truncate, padding=False, add_special_tokens=False
)["input_ids"] + [self.tokenizer.eos_token_id]
prompt_labels = [-100] * len(prompt_input_ids)
input_ids = prompt_input_ids + completion_input_ids
labels = prompt_labels + completion_input_ids
return {
"input_ids": input_ids[: self.tokenizer.model_max_length]
if truncate
else input_ids,
"labels": labels[: self.tokenizer.model_max_length]
if truncate
else labels,
}
elif new_column_name in ["decoder_labels"]:
return {
"labels": self.tokenizer(
examples[column_name],
truncation=truncate,
padding=False,
add_special_tokens=True,
)["input_ids"]
}
else:
prompts = examples[column_name]
if self.chat_prompt_template:
prompts = list(map(apply_chat_prompt_template, prompts))
tokenizer_results = self.tokenizer(
prompts, truncation=truncate, padding=False, add_special_tokens=True
)
return {
new_column_name: tokenizer_results["input_ids"],
f"{new_column_name.replace('input_ids', '')}attention_mask": tokenizer_results[
"attention_mask"
],
}
def tokenize_column_name(
column_name: str,
new_column_name: str,
causal: bool,
reward_scores: bool = False,
) -> Callable:
return partial(
tokenize_function,
column_name=column_name,
new_column_name=new_column_name,
causal=causal,
reward_scores=reward_scores,
)
def tokenize_column(
column: OutputDatasetColumn | OutputIterableDatasetColumn,
new_column_name: str,
name: str,
causal: bool = False,
reward_scores: bool = False,
) -> Dataset | IterableDataset:
column_name = column.column_names[0]
return column.step.map(
name=f"Tokenize {name}",
function=tokenize_column_name(
column_name,
new_column_name=new_column_name,
causal=causal,
reward_scores=reward_scores,
),
batched=not causal and not reward_scores,
remove_columns=column.step.output.column_names,
total_num_rows=column.num_rows,
auto_progress=column.num_rows is not None,
lazy=isinstance(column, OutputIterableDatasetColumn),
progress_interval=sys.maxsize
if isinstance(column, OutputIterableDatasetColumn)
else 120,
save_num_proc=num_proc,
).output.dataset
def rename_column(
column: OutputDatasetColumn | OutputIterableDatasetColumn, new_column_name: str
) -> Dataset | IterableDataset:
column_name = column.column_names[0]
column_dataset = column.step.output.dataset.select_columns(column.column_names)
return (
column_dataset.rename_column(column_name, new_column_name)
if column_name != new_column_name
else column_dataset
)
def label_encode_function(
_, column_name: str, example: dict[str, Any]
) -> dict[str, Any]: # pragma: no cover
if isinstance(example[column_name], list):
row_labels = set(str(label) for label in example[column_name])
return {
column_name: [1 if label in row_labels else 0 for label in label2id]
}
else:
return {column_name: label2id[str(example[column_name])]}
def label2id_column(
column: OutputDatasetColumn | OutputIterableDatasetColumn,
new_column_name: str,
name: str,
) -> Dataset | IterableDataset:
column_name = column.column_names[0]
return rename_column(
column.step.map(
name=f"Encode {name} labels",
function=partial(
label_encode_function, sorted(label2id.keys()), column_name
),
batched=False,
remove_columns=list(
set(column.step.output.column_names).difference(set([column_name]))
),
total_num_rows=column.num_rows,
auto_progress=column.num_rows is not None,
lazy=isinstance(column, OutputIterableDatasetColumn),
progress_interval=sys.maxsize
if isinstance(column, OutputIterableDatasetColumn)
else 120,
save_num_proc=num_proc,
).output[column_name],
new_column_name,
)
def process_column(
column: OutputDatasetColumn | OutputIterableDatasetColumn,
new_column_name: str,
name: str,
) -> Dataset | IterableDataset:
if new_column_name == "label" and reward_scores is False:
return label2id_column(
column=column, new_column_name=new_column_name, name=name
)
else: # pragma: no cover
return rename_column(column=column, new_column_name=new_column_name)
def concatenate_prompts_and_completions(
dataset: Dataset | IterableDataset,
) -> IterableDataset:
iterable_dataset = (
dataset.to_iterable_dataset() if isinstance(dataset, Dataset) else dataset
)
return iterable_dataset.map(
lambda row: {"text": [row["prompt"], row["completion"]]},
remove_columns=["prompt", "completion"],
)
# Calculate label2id
uniq_labels = []
for (new_column_name, name), column in list(train_columns.items()) + list(
validation_columns.items()
):
column_name = column.column_names[0]
def uniqify_labels(labels: set[Any], column_name, example):
nonlocal is_multi_target
if isinstance(example[column_name], list):
is_multi_target = True
is_new = False
for label in example[column_name]:
if label not in labels:
is_new = True
labels.add(label)
return is_new
else:
is_new = example[column_name] not in labels
labels.add(example[column_name])
return is_new
if new_column_name == "label" and reward_scores is False:
uniq_labels_column = column.step.filter(
name=f"Get all {name} label names",
function=partial(uniqify_labels, set(), column_name),
batched=False,
total_num_rows=column.num_rows,
auto_progress=column.num_rows is not None,
lazy=False,
progress_interval=sys.maxsize
if isinstance(column, OutputIterableDatasetColumn)
else 120,
).output[column_name]
uniq_labels_from_column = list(uniq_labels_column)
uniq_labels += (
list(chain.from_iterable(uniq_labels_column))
if len(uniq_labels_from_column) > 0
and isinstance(uniq_labels_from_column[0], list)
else uniq_labels_column
)
uniq_labels = sorted(set(uniq_labels))
for label in uniq_labels:
label2id[str(label)] = len(label2id)
# Create train and validation datasets
train_dataset: Dataset | IterableDataset
validation_dataset: Dataset | IterableDataset
if reward_pairs:
# Check if scores are provided
try:
get_train_column("train_chosen_scores")
has_scores = True
except KeyError:
has_scores = False
# Get data collator
def prepare_for_reward_pairs(row): # pragma: no cover
row = row.copy()
if self.chat_prompt_template:
row["prompt"] = apply_chat_prompt_template(row["prompt"])
row["chosen"] = row["prompt"] + row["chosen"]
row["rejected"] = row["prompt"] + row["rejected"]
reward_results = {}
chosen_tokenizer_results = self.tokenizer(
row["chosen"],
truncation=truncate,
padding=False,
add_special_tokens=True,
)
reward_results["input_ids_chosen"] = chosen_tokenizer_results["input_ids"]
rejected_tokenizer_results = self.tokenizer(
row["rejected"],
truncation=truncate,
padding=False,
add_special_tokens=True,
)
reward_results["input_ids_rejected"] = rejected_tokenizer_results[
"input_ids"
]
if "chosen_scores" in row and "rejected_scores" in row:
reward_results["margin"] = row["chosen_scores"] - row["rejected_scores"]
return reward_results
# Run data collator
train_columns_to_combine = [
rename_column(get_train_column("train_prompts"), "prompt"),
rename_column(get_train_column("train_chosen"), "chosen"),
rename_column(get_train_column("train_rejected"), "rejected"),
]
if has_scores:
train_columns_to_combine.extend(
[
rename_column(
get_train_column("train_chosen_scores"), "chosen_scores"
),
rename_column(
get_train_column("train_rejected_scores"), "rejected_scores"
),
]
)
train_combine_step = DataSource(
"Combine Train Prompts, Chosen Generations, and Rejected Generations",
data=concatenate_datasets(train_columns_to_combine, axis=1),
total_num_rows=get_train_column("train_prompts").num_rows,
auto_progress=get_train_column("train_prompts").num_rows is not None,
)
train_dataset = train_combine_step.map(
name="Prepare Train Dataset for Reward Model Training",
function=prepare_for_reward_pairs,
batched=False,
remove_columns=train_combine_step.output.column_names,
total_num_rows=get_train_column("train_prompts").num_rows,
auto_progress=get_train_column("train_prompts").num_rows is not None,
lazy=isinstance(train_combine_step.output, OutputIterableDataset),
progress_interval=sys.maxsize
if isinstance(train_combine_step.output, OutputIterableDataset)
else 120,
save_num_proc=num_proc,
).output.dataset
validation_columns_to_combine = [
rename_column(get_validation_column("validation_prompts"), "prompt"),
rename_column(get_validation_column("validation_chosen"), "chosen"),
rename_column(get_validation_column("validation_rejected"), "rejected"),
]
if has_scores:
validation_columns_to_combine.extend(
[
rename_column(
get_validation_column("validation_chosen_scores"),
"chosen_scores",
),
rename_column(
get_validation_column("validation_rejected_scores"),
"rejected_scores",
),
]
)
validation_combine_step = DataSource(
"Combine Validation Prompts, Chosen Generations, and Rejected Generations",
data=concatenate_datasets(validation_columns_to_combine, axis=1),
total_num_rows=get_validation_column("validation_prompts").num_rows,
auto_progress=get_validation_column("validation_prompts").num_rows
is not None,
)
validation_dataset = validation_combine_step.map(
name="Prepare Validation Dataset for Reward Model Training",
function=prepare_for_reward_pairs,
batched=False,
remove_columns=validation_combine_step.output.column_names,
total_num_rows=get_validation_column("validation_prompts").num_rows,
auto_progress=get_validation_column("validation_prompts").num_rows
is not None,
lazy=isinstance(validation_combine_step.output, OutputIterableDataset),
progress_interval=sys.maxsize
if isinstance(validation_combine_step.output, OutputIterableDataset)
else 120,
save_num_proc=num_proc,
).output.dataset
elif dpo:
if TYPE_CHECKING: # pragma: no cover
DPODataCollatorWithPadding: Any = None
else:
> from ._vendored._dpo_helper import DPODataCollatorWithPadding
E ModuleNotFoundError: No module named 'src.utils._vendored'
src/utils/hf_training_utils.py:522: ModuleNotFoundError
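All three TestTrainHFDPO failures below (test_seq2seq, test_causal, test_peft) share this same root cause: the lazy import of DPODataCollatorWithPadding at src/utils/hf_training_utils.py:522 cannot resolve because the src.utils._vendored subpackage is not importable in the test environment. The following is a hedged diagnostic sketch, not part of the test suite; the src/utils/_vendored path and file names are inferred from the traceback, not verified against the repository.

# Diagnostic sketch: check whether the vendored subpackage imported lazily by
# hf_training_utils.py is importable, and whether its files exist on disk.
# Paths are assumptions taken from the traceback above.
from importlib.util import find_spec
from pathlib import Path

pkg = "src.utils._vendored"
vendored_dir = Path("src/utils/_vendored")

print(f"{pkg} importable: {find_spec(pkg) is not None}")
print(f"directory exists: {vendored_dir.is_dir()}")
print(f"__init__.py present: {(vendored_dir / '__init__.py').is_file()}")
print(f"_dpo_helper.py present: {(vendored_dir / '_dpo_helper.py').is_file()}")

If the directory exists in the working tree but is not importable in CI, the most common causes are a missing __init__.py or the subpackage being excluded when the project is built or installed; which applies here cannot be determined from this log alone.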
Check warning on line 0 in src.tests.trainers.test_trainers.TestTrainHFDPO
github-actions / Test Results
test_causal (src.tests.trainers.test_trainers.TestTrainHFDPO) failed
test.xml [took 0s]
Raw output
ModuleNotFoundError: No module named 'src.utils._vendored'
self = <src.tests.trainers.test_trainers.TestTrainHFDPO object at 0x7fcb879639a0>
create_datadreamer = <function create_datadreamer.<locals>._create_datadreamer at 0x7fcb56224af0>
mocker = <pytest_mock.plugin.MockerFixture object at 0x7fcb4409c280>
def test_causal(self, create_datadreamer, mocker):
with create_datadreamer():
dataset = DataSource(
"Training Data",
data={
"prompts": [
"The color of the sky is",
"Firetrucks are",
"The color of an apple is",
"The color of grass is",
"The color of clouds are",
"The color of the sun is",
],
"chosen": ["purple", "yellow", "orange", "blue", "red", "green"],
"rejected": ["blue", "red", "red", "green", "white", "yellow"],
},
)
val_dataset = dataset.take(2)
trainer = TrainHFDPO(
"GPT-2 Trainer",
model_name="gpt2",
chat_prompt_template=CHAT_PROMPT_TEMPLATES["guanaco_system"],
system_prompt=SYSTEM_PROMPTS["llama_system"],
)
data_collator_spy = mocker.spy(CustomDataCollatorWithPadding, "__call__")
> train_result = trainer.train(
train_prompts=dataset.output["prompts"],
train_chosen=dataset.output["chosen"],
train_rejected=dataset.output["rejected"],
validation_prompts=val_dataset.output["prompts"],
validation_chosen=val_dataset.output["chosen"],
validation_rejected=val_dataset.output["rejected"],
epochs=1,
batch_size=8,
precompute_ref_log_probs=False, # We test precompute_ref_log_probs here
)
src/tests/trainers/test_trainers.py:1806:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src/trainers/train_hf_dpo.py:472: in train
self._setup_folder_and_resume(
src/trainers/trainer.py:291: in _setup_folder_and_resume
self.__setup_folder_and_resume(**kwargs)
src/trainers/trainer.py:271: in __setup_folder_and_resume
self._train(**kwargs)
src/trainers/train_hf_dpo.py:120: in _train
train_dataset, validation_dataset, _, _ = prepare_inputs_and_outputs(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <GPT-2 Trainer (gpt2)>
train_columns = {('train_chosen', 'Train Chosen Generations'): OutputDatasetColumn(column_name='chosen', num_rows=6, dataset=<Dataset ...n Rejected Generations'): OutputDatasetColumn(column_name='rejected', num_rows=6, dataset=<Dataset @ 140511010452768>)}
validation_columns = {('validation_chosen', 'Validation Chosen Generations'): OutputDatasetColumn(column_name='chosen', num_rows=2, dataset...n Rejected Generations'): OutputDatasetColumn(column_name='rejected', num_rows=2, dataset=<Dataset @ 140511010448640>)}
truncate = True, causal = False, dpo = True, reward_pairs = False
reward_scores = False
elif dpo:
if TYPE_CHECKING: # pragma: no cover
DPODataCollatorWithPadding: Any = None
else:
> from ._vendored._dpo_helper import DPODataCollatorWithPadding
E ModuleNotFoundError: No module named 'src.utils._vendored'
src/utils/hf_training_utils.py:522: ModuleNotFoundError
Check warning on line 0 in src.tests.trainers.test_trainers.TestTrainHFDPO
github-actions / Test Results
test_peft (src.tests.trainers.test_trainers.TestTrainHFDPO) failed
test.xml [took 0s]
Raw output
ModuleNotFoundError: No module named 'src.utils._vendored'
self = <src.tests.trainers.test_trainers.TestTrainHFDPO object at 0x7fcb87963a90>
create_datadreamer = <function create_datadreamer.<locals>._create_datadreamer at 0x7fcb4d525bd0>
mocker = <pytest_mock.plugin.MockerFixture object at 0x7fcb47b60c40>
def test_peft(self, create_datadreamer, mocker):
with create_datadreamer():
dataset = DataSource(
"Training Data",
data={
"prompts": [
"The color of the sky is",
"Firetrucks are",
"The color of an apple is",
"The color of grass is",
"The color of clouds are",
"The color of the sun is",
],
"chosen": [
" purple",
" yellow",
" orange",
" blue",
" red",
" green",
],
"rejected": [
" blue",
" red",
" red",
" green",
" white",
" yellow",
],
},
)
val_dataset = dataset.take(2)
# A warning is thrown if not run on GPU by bitsandbytes imported by PEFT
with ignore_transformers_warnings():
from peft import LoraConfig
peft_config = LoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.05,
bias="none",
target_modules=["c_proj"],
fan_in_fan_out=True,
)
trainer = TrainHFDPO(
"GPT-2 Trainer", model_name="gpt2", peft_config=peft_config
)
> train_result = trainer.train(
train_prompts=dataset.output["prompts"],
train_chosen=dataset.output["chosen"],
train_rejected=dataset.output["rejected"],
validation_prompts=val_dataset.output["prompts"],
validation_chosen=val_dataset.output["chosen"],
validation_rejected=val_dataset.output["rejected"],
epochs=1,
batch_size=8,
)
src/tests/trainers/test_trainers.py:1946:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src/trainers/train_hf_dpo.py:472: in train
self._setup_folder_and_resume(
src/trainers/trainer.py:291: in _setup_folder_and_resume
self.__setup_folder_and_resume(**kwargs)
src/trainers/trainer.py:271: in __setup_folder_and_resume
self._train(**kwargs)
src/trainers/train_hf_dpo.py:120: in _train
train_dataset, validation_dataset, _, _ = prepare_inputs_and_outputs(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <GPT-2 Trainer (gpt2)>
train_columns = {('train_chosen', 'Train Chosen Generations'): OutputDatasetColumn(column_name='chosen', num_rows=6, dataset=<Dataset ...n Rejected Generations'): OutputDatasetColumn(column_name='rejected', num_rows=6, dataset=<Dataset @ 140511048563056>)}
validation_columns = {('validation_chosen', 'Validation Chosen Generations'): OutputDatasetColumn(column_name='chosen', num_rows=2, dataset...n Rejected Generations'): OutputDatasetColumn(column_name='rejected', num_rows=2, dataset=<Dataset @ 140511048560560>)}
truncate = True, causal = False, dpo = True, reward_pairs = False
reward_scores = False
elif dpo:
if TYPE_CHECKING: # pragma: no cover
DPODataCollatorWithPadding: Any = None
else:
> from ._vendored._dpo_helper import DPODataCollatorWithPadding
E ModuleNotFoundError: No module named 'src.utils._vendored'
src/utils/hf_training_utils.py:522: ModuleNotFoundError
Check warning on line 0 in src.tests.utils.test_device_utils.TestDeviceUtils
github-actions / Test Results
test_get_device_env_variables (src.tests.utils.test_device_utils.TestDeviceUtils) failed
test.xml [took 0s]
Raw output
AssertionError: assert {'CUDA_VISIBL...DISABLE': '1'} == {'CUDA_VISIBL...CES': '6,3,4'}
Omitting 1 identical items, use -vv to show
Left contains 1 more item:
{'NCCL_P2P_DISABLE': '1'}
Full diff:
- {'CUDA_VISIBLE_DEVICES': '6,3,4'}
+ {'CUDA_VISIBLE_DEVICES': '6,3,4', 'NCCL_P2P_DISABLE': '1'}
self = <src.tests.utils.test_device_utils.TestDeviceUtils object at 0x7fcb87f1d6c0>
def test_get_device_env_variables(self):
os.environ["CUDA_VISIBLE_DEVICES"] = "6,4,3"
with pytest.raises(AssertionError):
get_device_env_variables([0, 2, 999999, 0, 1, -1, -1])
with pytest.raises(AssertionError):
get_device_env_variables([0, 2, 0, 1])
> assert get_device_env_variables([0, 2, 1]) == {"CUDA_VISIBLE_DEVICES": "6,3,4"}
E AssertionError: assert {'CUDA_VISIBL...DISABLE': '1'} == {'CUDA_VISIBL...CES': '6,3,4'}
E Omitting 1 identical items, use -vv to show
E Left contains 1 more item:
E {'NCCL_P2P_DISABLE': '1'}
E Full diff:
E - {'CUDA_VISIBLE_DEVICES': '6,3,4'}
E + {'CUDA_VISIBLE_DEVICES': '6,3,4', 'NCCL_P2P_DISABLE': '1'}
src/tests/utils/test_device_utils.py:92: AssertionError
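The assertion diff shows that get_device_env_variables now returns NCCL_P2P_DISABLE='1' in addition to the remapped CUDA_VISIBLE_DEVICES, while the test still expects only the CUDA mapping. Below is a hedged sketch of the index remapping the test exercises; remap_devices is a hypothetical stand-in for the real helper in src/tests/utils/test_device_utils's target module and may differ from the actual implementation.

# Hypothetical stand-in for get_device_env_variables: logical device indices
# are translated through the current CUDA_VISIBLE_DEVICES ordering, and NCCL
# peer-to-peer is disabled, matching the behavior seen in the failing assert.
import os

def remap_devices(devices: list[int]) -> dict[str, str]:
    visible = os.environ["CUDA_VISIBLE_DEVICES"].split(",")  # e.g. ["6", "4", "3"]
    return {
        "CUDA_VISIBLE_DEVICES": ",".join(visible[d] for d in devices),
        "NCCL_P2P_DISABLE": "1",
    }

os.environ["CUDA_VISIBLE_DEVICES"] = "6,4,3"
assert remap_devices([0, 2, 1]) == {
    "CUDA_VISIBLE_DEVICES": "6,3,4",
    "NCCL_P2P_DISABLE": "1",
}

If disabling NCCL P2P is intentional for the FSDP + QLoRA changes in this PR, the likely fix is to update the expected dictionary in test_get_device_env_variables to include the new key.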