From 9b992b7a217d91b60056eb05415190cfe1ce1964 Mon Sep 17 00:00:00 2001
From: Ita Zaporozhets <31893021+itazap@users.noreply.github.com>
Date: Fri, 21 Jun 2024 10:48:10 +0200
Subject: [PATCH] SPLIT PR: add user defined symbols and control symbols
 (#31305)

* PR SPLIT: moving original changes for adding user defined symbols

* adding gemma test and generalizing gemma converter

* ruff

* update common test

* update serialization test

* deberta v2 tests updates as rust version adds '.' as a user added token, so a space is not added

* removing commented lines

* applying feedback - use only added_tokens to add and check piece.type instead of trainer_spec for user_defined_symbols

* add comment referencing sentencepiece

---
 src/transformers/convert_slow_tokenizer.py    | 15 ++++++++---
 .../camembert/test_tokenization_camembert.py  | 10 +++++--
 .../test_tokenization_deberta_v2.py           | 26 +++++++++----------
 tests/models/gemma/test_tokenization_gemma.py | 13 ++++++++++
 .../rembert/test_tokenization_rembert.py      |  9 +++++--
 tests/test_tokenization_common.py             | 10 +++++--
 6 files changed, 60 insertions(+), 23 deletions(-)

diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index 1f7fcf12f9f908..9876463011966b 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -622,6 +622,17 @@ def decoder(self, replacement, add_prefix_space):
 
     def converted(self) -> Tokenizer:
         tokenizer = self.tokenizer(self.proto)
+        # Add user defined symbols (type == 4) from sentencepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33)
+        user_defined_symbols = [
+            AddedToken(token, normalized=False, special=False)
+            for token in [p.piece for p in self.proto.pieces if p.type == 4]
+        ]
+        control_symbols = [
+            AddedToken(token, normalized=False, special=True) for token in self.proto.trainer_spec.control_symbols
+        ]
+
+        tokenizer.add_tokens(user_defined_symbols + control_symbols)
+
         # Tokenizer assemble
         normalizer = self.normalizer(self.proto)
         if normalizer is not None:
@@ -1330,10 +1341,6 @@ def tokenizer(self, proto):
             raise Exception(
                 "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
             )
-        user_defined_symbols = [
-            AddedToken(token, normalized=True, special=False) for token in proto.trainer_spec.user_defined_symbols
-        ]
-        tokenizer.add_tokens(user_defined_symbols)
         return tokenizer
 
diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py
index 624338b7f0b118..340d5fc456162c 100644
--- a/tests/models/camembert/test_tokenization_camembert.py
+++ b/tests/models/camembert/test_tokenization_camembert.py
@@ -144,7 +144,7 @@ def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir)
             self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens)
             self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
             self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
-            self.assertDictEqual(expected, tokenizer.added_tokens_decoder)
+            self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
             return tokenizer
 
         new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False)
@@ -198,7 +198,13 @@ def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir)
             self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
             # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
             with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
-                self.assertDictEqual(EXPECTED_ADDED_TOKENS_DECODER, tokenizer_fast.added_tokens_decoder)
+                with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
+                    self.assertTrue(
+                        all(
+                            item in tokenizer.added_tokens_decoder.items()
+                            for item in EXPECTED_ADDED_TOKENS_DECODER.items()
+                        )
+                    )
 
             EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
             with tempfile.TemporaryDirectory() as tmp_dir_4:
diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py
index 55f7e8b542901d..8a0085986a599d 100644
--- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py
+++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py
@@ -89,8 +89,8 @@ def test_sentencepiece_tokenize_and_decode(self):
 
     def test_split_by_punct(self):
         # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", "!", ]
         # fmt: on
 
         tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>", split_by_punct=True)
@@ -105,8 +105,8 @@ def test_split_by_punct(self):
 
     def test_do_lower_case_split_by_punct(self):
         # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", "!", ]
         # fmt: on
 
         tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>", do_lower_case=True, split_by_punct=True)
@@ -121,8 +121,8 @@ def test_do_lower_case_split_by_punct(self):
 
     def test_do_lower_case_split_by_punct_false(self):
         # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "!", ]
         # fmt: on
 
         tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>", do_lower_case=True, split_by_punct=False)
@@ -139,8 +139,8 @@ def test_do_lower_case_split_by_punct_false(self):
 
     def test_do_lower_case_false_split_by_punct(self):
         # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", "!", ]
         # fmt: on
 
         tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>", do_lower_case=False, split_by_punct=True)
@@ -177,7 +177,7 @@ def test_rust_and_python_full_tokenizers(self):
         tokenizer = self.get_tokenizer()
         rust_tokenizer = self.get_rust_tokenizer()
 
-        sequence = "I was born in 92000, and this is falsé."
+        sequence = "I was born in 92000, and this is falsé!"
 
         tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
         rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
@@ -216,10 +216,10 @@ def test_full_tokenizer(self):
         self.assertListEqual(rust_back_tokens, back_tokens_target)
 
         # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]
-        tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", ".", ]
-        back_tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 187]
+        tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "!", ]
+        back_tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "!", ]
         # fmt: on
 
         ids = tokenizer.encode(sequence, add_special_tokens=False)
diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py
index d36f1b7dc17647..c322e6174b426f 100644
--- a/tests/models/gemma/test_tokenization_gemma.py
+++ b/tests/models/gemma/test_tokenization_gemma.py
@@ -193,6 +193,19 @@ def integration_tests(self):
             },
         )
 
+    def test_user_added_tokens(self):
+        # Ensure that user added tokens are not split in the fast tokenizer
+        slow_tokenizer = self.tokenizer
+        fast_tokenizer = self.rust_tokenizer
+
+        user_added_token = ""
+
+        slow_tokens = slow_tokenizer.convert_ids_to_tokens(slow_tokenizer.encode(user_added_token))
+        fast_tokens = slow_tokenizer.convert_ids_to_tokens(fast_tokenizer.encode(user_added_token))
+
+        self.assertTrue(user_added_token in fast_tokens)
+        self.assertEqual(slow_tokens, fast_tokens)
+
     def test_fast_special_tokens(self):
         slow_tokenizer = self.tokenizer
         fast_tokenizer = self.rust_tokenizer
diff --git a/tests/models/rembert/test_tokenization_rembert.py b/tests/models/rembert/test_tokenization_rembert.py
index 46794733f37f4c..9578a6782fce47 100644
--- a/tests/models/rembert/test_tokenization_rembert.py
+++ b/tests/models/rembert/test_tokenization_rembert.py
@@ -172,7 +172,7 @@ def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir)
             self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens)
             self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
             self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
-            self.assertDictEqual(expected, tokenizer.added_tokens_decoder)
+            self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
             return tokenizer
 
         new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
@@ -227,7 +227,12 @@ def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir)
             self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
             # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
             with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
-                self.assertDictEqual(EXPECTED_ADDED_TOKENS_DECODER, tokenizer_fast.added_tokens_decoder)
+                self.assertTrue(
+                    all(
+                        item in tokenizer.added_tokens_decoder.items()
+                        for item in EXPECTED_ADDED_TOKENS_DECODER.items()
+                    )
+                )
 
             EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
             with tempfile.TemporaryDirectory() as tmp_dir_4:
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 8b0ad38795f26c..682565256b8347 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -4228,7 +4228,7 @@ def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir)
             self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens)
             self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
             self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
-            self.assertDictEqual(expected, tokenizer.added_tokens_decoder)
+            self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
             return tokenizer
 
         new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
@@ -4280,7 +4280,13 @@ def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir)
             self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
             # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
             with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
-                self.assertDictEqual(EXPECTED_ADDED_TOKENS_DECODER, tokenizer_fast.added_tokens_decoder)
+                # Fast tokenizer may have user_defined_symbols and control_symbols added, unlike slow
+                self.assertTrue(
+                    all(
+                        item in tokenizer.added_tokens_decoder.items()
+                        for item in EXPECTED_ADDED_TOKENS_DECODER.items()
+                    )
+                )
 
             EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
             with tempfile.TemporaryDirectory() as tmp_dir_4:
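
Note on the convert_slow_tokenizer.py change above: the sketch below (not part of the patch) illustrates the same idea outside of `SpmConverter` — collect sentencepiece pieces of type 4 (USER_DEFINED) plus `trainer_spec.control_symbols` and register them as added tokens on a `tokenizers` object. The `tokenizer.model` path and the bare `Unigram` construction are assumptions for illustration only; in transformers this logic runs inside `SpmConverter.converted` against `self.proto`.

# Hypothetical, self-contained illustration of the logic added in convert_slow_tokenizer.py.
from sentencepiece import sentencepiece_model_pb2 as sp_pb2
from tokenizers import AddedToken, Tokenizer
from tokenizers.models import Unigram

# Parse a sentencepiece model file (the path is an assumption for this example).
proto = sp_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:
    proto.ParseFromString(f.read())

# piece.type == 4 corresponds to USER_DEFINED in sentencepiece_model.proto.
user_defined_symbols = [
    AddedToken(p.piece, normalized=False, special=False) for p in proto.pieces if p.type == 4
]
# Control symbols come from the trainer spec and are registered as special tokens.
control_symbols = [
    AddedToken(tok, normalized=False, special=True) for tok in proto.trainer_spec.control_symbols
]

# Build a plain Unigram tokenizer from the proto vocab, then register the symbols,
# mirroring tokenizer.add_tokens(user_defined_symbols + control_symbols) in the patch.
vocab = [(p.piece, p.score) for p in proto.pieces]
tokenizer = Tokenizer(Unigram(vocab, unk_id=proto.trainer_spec.unk_id, byte_fallback=False))
tokenizer.add_tokens(user_defined_symbols + control_symbols)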