From 4ea1c43a108919fc974ce34348ccf327bec3c03f Mon Sep 17 00:00:00 2001
From: Ita Zaporozhets <31893021+itazap@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:38:20 -0400
Subject: [PATCH] clean_up_tokenization_spaces=False if unset (#31938)

* clean_up_tokenization_spaces=False if unset

* deprecate warning

* updating param for old models

* update models

* make fix-copies

* fix-copies and update bert models

* warning msg

* update prophet and clvp

* updating test since space before is arbitrarily removed

* remove warning for 4.45
---
 src/transformers/models/bert/tokenization_bert.py | 5 +++++
 src/transformers/models/convbert/tokenization_convbert.py | 5 +++++
 .../models/distilbert/tokenization_distilbert.py | 5 +++++
 src/transformers/models/electra/tokenization_electra.py | 5 +++++
 src/transformers/models/funnel/tokenization_funnel.py | 5 +++++
 src/transformers/models/layoutlm/tokenization_layoutlm.py | 5 +++++
 src/transformers/models/lxmert/tokenization_lxmert.py | 5 +++++
 .../models/mobilebert/tokenization_mobilebert.py | 5 +++++
 src/transformers/models/mpnet/tokenization_mpnet.py | 5 +++++
 .../models/prophetnet/tokenization_prophetnet.py | 5 +++++
 .../models/squeezebert/tokenization_squeezebert.py | 5 +++++
 src/transformers/models/tapas/tokenization_tapas.py | 5 +++++
 src/transformers/tokenization_utils_base.py | 2 +-
 tests/models/clvp/test_tokenization_clvp.py | 2 +-
 tests/models/wav2vec2/test_tokenization_wav2vec2.py | 8 ++++----
 .../test_tokenization_wav2vec2_phoneme.py | 2 +-
 16 files changed, 67 insertions(+), 7 deletions(-)

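The library-wide fallback for `clean_up_tokenization_spaces` moves from `True` to `False` in `tokenization_utils_base.py` (changed further down), while the slow WordPiece-style tokenizers touched in this patch keep passing an explicit `clean_up_tokenization_spaces=True`, so their decode output is unchanged. A minimal sketch of what the flag controls, using the slow `BertTokenizer` and assuming the `bert-base-uncased` checkpoint is available; the printed strings are illustrative and not part of the patch:

```python
from transformers import BertTokenizer

# Assumed checkpoint; any slow WordPiece tokenizer behaves the same way.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("hello, world!")

# Cleanup on (old unset default, still the explicit default for the models below):
# the space that the token join puts before punctuation is folded back in.
print(tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True))
# e.g. "hello, world!"

# Cleanup off (new unset default): the raw space-joined tokens are returned.
print(tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
# e.g. "hello , world !"
```
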
""" vocab_files_names = VOCAB_FILES_NAMES @@ -108,6 +111,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, + clean_up_tokenization_spaces=True, **kwargs, ): if not os.path.isfile(vocab_file): @@ -139,6 +143,7 @@ def __init__( mask_token=mask_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py index 87b1eb192e4..610000ce813 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert.py +++ b/src/transformers/models/distilbert/tokenization_distilbert.py @@ -90,6 +90,9 @@ class DistilBertTokenizer(PreTrainedTokenizer): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. """ vocab_files_names = VOCAB_FILES_NAMES @@ -108,6 +111,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, + clean_up_tokenization_spaces=True, **kwargs, ): if not os.path.isfile(vocab_file): @@ -138,6 +142,7 @@ def __init__( mask_token=mask_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py index 9ecbce63f50..2acd86ca083 100644 --- a/src/transformers/models/electra/tokenization_electra.py +++ b/src/transformers/models/electra/tokenization_electra.py @@ -90,6 +90,9 @@ class ElectraTokenizer(PreTrainedTokenizer): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original Electra). + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. """ vocab_files_names = VOCAB_FILES_NAMES @@ -107,6 +110,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, + clean_up_tokenization_spaces=True, **kwargs, ): if not os.path.isfile(vocab_file): @@ -138,6 +142,7 @@ def __init__( mask_token=mask_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py index 68e7d958b74..78499cbee4e 100644 --- a/src/transformers/models/funnel/tokenization_funnel.py +++ b/src/transformers/models/funnel/tokenization_funnel.py @@ -107,6 +107,9 @@ class FunnelTokenizer(PreTrainedTokenizer): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -127,6 +130,7 @@ def __init__( eos_token="", tokenize_chinese_chars=True, strip_accents=None, + clean_up_tokenization_spaces=True, **kwargs, ): if not os.path.isfile(vocab_file): @@ -159,6 +163,7 @@ def __init__( eos_token=eos_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py index b0a57dac5fd..62fb4c524f2 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py @@ -91,6 +91,9 @@ class LayoutLMTokenizer(PreTrainedTokenizer): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original LayoutLM). + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. """ vocab_files_names = VOCAB_FILES_NAMES @@ -108,6 +111,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, + clean_up_tokenization_spaces=True, **kwargs, ): if not os.path.isfile(vocab_file): @@ -139,6 +143,7 @@ def __init__( mask_token=mask_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py index 5800f6b0d4a..8310993160a 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert.py +++ b/src/transformers/models/lxmert/tokenization_lxmert.py @@ -90,6 +90,9 @@ class LxmertTokenizer(PreTrainedTokenizer): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original Lxmert). + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. """ vocab_files_names = VOCAB_FILES_NAMES @@ -107,6 +110,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, + clean_up_tokenization_spaces=True, **kwargs, ): if not os.path.isfile(vocab_file): @@ -138,6 +142,7 @@ def __init__( mask_token=mask_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py b/src/transformers/models/mobilebert/tokenization_mobilebert.py index 972f57fae0a..e4faaf12d38 100644 --- a/src/transformers/models/mobilebert/tokenization_mobilebert.py +++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py @@ -92,6 +92,9 @@ class MobileBertTokenizer(PreTrainedTokenizer): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original MobileBERT). + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -109,6 +112,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, + clean_up_tokenization_spaces=True, **kwargs, ): if not os.path.isfile(vocab_file): @@ -140,6 +144,7 @@ def __init__( mask_token=mask_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py index 8f152fa3434..8d46381f056 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet.py +++ b/src/transformers/models/mpnet/tokenization_mpnet.py @@ -108,6 +108,9 @@ class MPNetTokenizer(PreTrainedTokenizer): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. """ vocab_files_names = VOCAB_FILES_NAMES @@ -128,6 +131,7 @@ def __init__( mask_token="", tokenize_chinese_chars=True, strip_accents=None, + clean_up_tokenization_spaces=True, **kwargs, ): bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token @@ -170,6 +174,7 @@ def __init__( mask_token=mask_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py index b253ca70995..dc8956da093 100644 --- a/src/transformers/models/prophetnet/tokenization_prophetnet.py +++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py @@ -308,6 +308,9 @@ class ProphetNetTokenizer(PreTrainedTokenizer): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. """ vocab_files_names = VOCAB_FILES_NAMES @@ -330,6 +333,7 @@ def __init__( mask_token: Optional[str] = "[MASK]", tokenize_chinese_chars: Optional[bool] = True, strip_accents: Optional[bool] = None, + clean_up_tokenization_spaces: bool = True, **kwargs, ): if not os.path.isfile(vocab_file): @@ -360,6 +364,7 @@ def __init__( mask_token=mask_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert.py b/src/transformers/models/squeezebert/tokenization_squeezebert.py index 191e57c0f8a..9ac72fcc260 100644 --- a/src/transformers/models/squeezebert/tokenization_squeezebert.py +++ b/src/transformers/models/squeezebert/tokenization_squeezebert.py @@ -91,6 +91,9 @@ class SqueezeBertTokenizer(PreTrainedTokenizer): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original SqueezeBERT). 
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -108,6 +111,7 @@ def __init__(
         mask_token="[MASK]",
         tokenize_chinese_chars=True,
         strip_accents=None,
+        clean_up_tokenization_spaces=True,
         **kwargs,
     ):
         if not os.path.isfile(vocab_file):
@@ -139,6 +143,7 @@ def __init__(
             mask_token=mask_token,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )

diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index 867e53ff890..69950396079 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -225,6 +225,9 @@ class TapasTokenizer(PreTrainedTokenizer):
             Minimum length of each question in terms of tokens (will be skipped otherwise).
         max_question_length (`int`, *optional*):
             Maximum length of each question in terms of tokens (will be skipped otherwise).
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -252,6 +255,7 @@ def __init__(
         max_question_length=None,
         model_max_length: int = 512,
         additional_special_tokens: Optional[List[str]] = None,
+        clean_up_tokenization_spaces=True,
         **kwargs,
     ):
         if not is_pandas_available():
@@ -322,6 +326,7 @@ def __init__(
             max_question_length=max_question_length,
             model_max_length=model_max_length,
             additional_special_tokens=additional_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index c6467bb7d7f..c96f7a33147 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1622,7 +1622,7 @@ def __init__(self, **kwargs):
         )

         # By default, cleaning tokenization spaces for both fast and slow tokenizers
-        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
+        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)

         # By default, do not split special tokens for both fast and slow tokenizers
         self.split_special_tokens = kwargs.pop("split_special_tokens", False)
diff --git a/tests/models/clvp/test_tokenization_clvp.py b/tests/models/clvp/test_tokenization_clvp.py
index 71ea9c08c83..aa8d2d22a5b 100644
--- a/tests/models/clvp/test_tokenization_clvp.py
+++ b/tests/models/clvp/test_tokenization_clvp.py
@@ -79,7 +79,7 @@ def get_tokenizer(self, **kwargs):
     # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
-        output_text = "lower newer"
+        output_text = "lower[SPACE]newer"
         return input_text, output_text

     # Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens
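The Wav2Vec2 expectations below change because the cleanup pass used to fold the space in front of the added "!"/"?" tokens back into the preceding token; with `clean_up_tokenization_spaces` left unset (now `False`), the space produced by the word delimiter survives the decode. A minimal sketch of the cleanup rule itself, assuming the static helper on `PreTrainedTokenizerBase` keeps its current replacement list; the sample string is illustrative, not the exact test fixture:

```python
from transformers import PreTrainedTokenizerBase

# Illustrative raw CTC decode output; the space comes from the word delimiter token.
raw = "HELLO!? !?$$$"

# The cleanup pass rewrites " !" -> "!", " ?" -> "?", etc., which only runs when
# clean_up_tokenization_spaces=True; without it the raw string is returned as-is.
print(PreTrainedTokenizerBase.clean_up_tokenization(raw))
# "HELLO!?!?$$$"  (the old expected value in these tests)
```
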
diff --git a/tests/models/wav2vec2/test_tokenization_wav2vec2.py b/tests/models/wav2vec2/test_tokenization_wav2vec2.py
index 88920638277..4a4058891d3 100644
--- a/tests/models/wav2vec2/test_tokenization_wav2vec2.py
+++ b/tests/models/wav2vec2/test_tokenization_wav2vec2.py
@@ -147,8 +147,8 @@ def test_tokenizer_decode_added_tokens(self):

         batch_tokens = tokenizer.batch_decode(sample_ids)
         batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)
-        self.assertEqual(batch_tokens, ["HELLO!?!?$$$", "BYE BYE$$$"])
-        self.assertEqual(batch_tokens_2, ["HELO!?!?", "BYE BYE"])
+        self.assertEqual(batch_tokens, ["HELLO!? !?$$$", "BYE BYE$$$"])
+        self.assertEqual(batch_tokens_2, ["HELO!? !?", "BYE BYE"])

     def test_call(self):
         # Tests that all call wrap to encode_plus and batch_encode_plus
@@ -467,8 +467,8 @@ def test_tokenizer_decode_added_tokens(self):

         batch_tokens = tokenizer.batch_decode(sample_ids)
         batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)
-        self.assertEqual(batch_tokens, ["HELLO!?!?$$$", "BYE BYE$$$"])
-        self.assertEqual(batch_tokens_2, ["HELO!?!?", "BYE BYE"])
+        self.assertEqual(batch_tokens, ["HELLO!? !?$$$", "BYE BYE$$$"])
+        self.assertEqual(batch_tokens_2, ["HELO!? !?", "BYE BYE"])

     def test_special_characters_in_vocab(self):
         sent = "ʈʰ æ æ̃ ˧ kʰ"
diff --git a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
index 4aab34207a6..96bed25ad16 100644
--- a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
+++ b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
@@ -249,7 +249,7 @@ def test_tokenizer_decode_added_tokens(self):
         # fmt: on

         batch_tokens = tokenizer.batch_decode(sample_ids)
-        self.assertEqual(batch_tokens, ["k s ɾ ɾ l ɭʲ!?!? $$$", "j ð s j ð s oːɹ $$$"])
+        self.assertEqual(batch_tokens, ["k s ɾ ɾ l ɭʲ ! ? ! ? $$$", "j ð s j ð s oːɹ $$$"])

     @staticmethod
     def get_from_offsets(offsets, key):
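
For downstream code that relies on the old cleanup behavior once the unset default becomes `False`, the flag is still one keyword away: it can be passed per call to `decode`/`batch_decode`, or once at load time, since `from_pretrained` forwards it to the tokenizer's `__init__`. A minimal sketch with an assumed checkpoint name:

```python
from transformers import AutoTokenizer

# Opt back in for every decode call made by this tokenizer instance.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/wav2vec2-base-960h", clean_up_tokenization_spaces=True
)

# Or opt in per call, leaving the instance-wide default alone:
# tokenizer.batch_decode(sample_ids, clean_up_tokenization_spaces=True)
```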