refactor: Removal of pad_token when padding_free is set #368

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions tuning/sft_trainer.py
@@ -29,8 +29,6 @@
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
-    GPT2Tokenizer,
-    GPTNeoXTokenizerFast,
     LlamaTokenizer,
     LlamaTokenizerFast,
     TrainerCallback,
@@ -136,6 +134,7 @@ def train(
     ):
         raise ValueError("gradient_accumulation_steps has to be an integer >= 1")

+    padding_free = False
     if (
         attention_and_distributed_packing_config is not None
         and attention_and_distributed_packing_config.padding_free is not None
@@ -153,6 +152,7 @@
                 "`--padding_free` argument was called with `packing=True`, "
                 "Trainer should not perform packing when using `--padding_free`"
             )
+        padding_free = True

     task_type = "CAUSAL_LM"
     additional_metrics = {}
@@ -253,9 +253,8 @@ def train(
         special_tokens_dict["bos_token"] = "<s>"
         special_tokens_dict["eos_token"] = "</s>"
         special_tokens_dict["unk_token"] = "<unk>"
-        special_tokens_dict["pad_token"] = "<pad>"
-    elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)):
-        special_tokens_dict["pad_token"] = "<pad>"
+    if not padding_free:
+        special_tokens_dict["pad_token"] = "<pad>"

     max_seq_length = min(train_args.max_seq_length, tokenizer.model_max_length)
     logger.info("Max sequence length is %s", max_seq_length)
@@ -271,7 +270,7 @@
     # add special tokens only when a custom tokenizer is not passed
     if not model_args.tokenizer_name_or_path:
         # TODO: we need to change this, perhaps follow what open instruct does?
-        if tokenizer.pad_token is None:
+        if tokenizer.pad_token is None and not padding_free:
             logger.warning("PAD token set to default, missing in tokenizer")
             special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
         if tokenizer.eos_token is None:
@@ -288,7 +287,9 @@
                 "PAD token set to default, to make it different from eos token"
             )
             if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
-                tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
+                tokenizer.pad_token = (
+                    configs.DEFAULT_PAD_TOKEN if not padding_free else None
+                )
             else:
                 tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN

Collaborator (inline comment on the new `tokenizer.pad_token = (...)` assignment):

Suggested change
    configs.DEFAULT_PAD_TOKEN if not padding_free else None

I don't think we need the else condition to explicitly reset it to None. If padding_free, we'll leave the pad token as it was; we just won't reset it to a unique token. Basically, we don't want to add a new pad token or change the existing pad token when padding_free is set.
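A minimal sketch of the behavior the reviewer appears to be asking for: when padding_free is set, skip the pad-token assignment entirely rather than assigning None. Everything below is illustrative and not code from the PR; the helper name resolve_pad_token, the SimpleNamespace stand-in for a tokenizer, and the literal token values are assumptions.

from types import SimpleNamespace

DEFAULT_PAD_TOKEN = "<PAD>"  # stand-in for configs.DEFAULT_PAD_TOKEN
DEFAULT_EOS_TOKEN = "</s>"   # stand-in for configs.DEFAULT_EOS_TOKEN


def resolve_pad_token(tokenizer, padding_free):
    # Mirrors the hunk above, but drops the `else None` branch: with
    # padding_free the existing pad token is left untouched.
    if tokenizer.pad_token == tokenizer.eos_token:
        if tokenizer.eos_token != DEFAULT_PAD_TOKEN:
            if not padding_free:
                tokenizer.pad_token = DEFAULT_PAD_TOKEN
        else:
            tokenizer.eos_token = DEFAULT_EOS_TOKEN


# Toy usage with an object that only carries the two attributes we touch.
tok = SimpleNamespace(pad_token="</s>", eos_token="</s>")
resolve_pad_token(tok, padding_free=True)
assert tok.pad_token == "</s>"   # left as-is, not reset to None
resolve_pad_token(tok, padding_free=False)
assert tok.pad_token == "<PAD>"  # now gets its own distinct pad token

The only difference from the diff is that the padding_free case is a no-op instead of an explicit None assignment, which is what "we just won't reset it to a unique token" describes.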
