[canary] Test for CanaryTokenizer + refactoring (#8285)
* Test for CanaryTokenizer

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

* Attempt at refactor...

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

---------

Signed-off-by: Piotr Żelasko <petezor@gmail.com>
Showing 3 changed files with 98 additions and 5 deletions.
@@ -0,0 +1,69 @@
from unittest.mock import Mock

import pytest
from omegaconf import OmegaConf

from nemo.collections.asr.parts.mixins import ASRBPEMixin
from nemo.collections.common.tokenizers.canary_tokenizer import SPECIAL_TOKENS, UNUSED_SPECIAL_TOKENS, CanaryTokenizer
from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer, create_spt_model
from nemo.core import Serialization


@pytest.fixture(scope="session") | ||
def special_tokenizer_path(tmp_path_factory) -> str: | ||
tmpdir = tmp_path_factory.mktemp("spl_tokens") | ||
CanaryTokenizer.build_special_tokenizer(tmpdir) | ||
return str(tmpdir) | ||
|
||
|
||
@pytest.fixture(scope="session") | ||
def lang_tokenizer_path(tmp_path_factory) -> str: | ||
tmpdir = tmp_path_factory.mktemp("klingon_tokens") | ||
text_path = tmpdir / "text.txt" | ||
text_path.write_text("a\nb\nc\nd\n") | ||
create_spt_model(text_path, vocab_size=8, sample_size=-1, do_lower_case=False, output_dir=str(tmpdir)) | ||
return str(tmpdir) | ||
|
||
|
||
def test_canary_tokenizer_build_special_tokenizer(tmp_path):
    tokenizer = CanaryTokenizer.build_special_tokenizer(tmp_path)
    expected_tokens = ["<unk>"] + SPECIAL_TOKENS + UNUSED_SPECIAL_TOKENS + ["▁"]
    tokens = []
    for i in range(tokenizer.tokenizer.vocab_size()):
        tokens.append(tokenizer.tokenizer.IdToPiece(i))
    assert expected_tokens == tokens


def test_canary_tokenizer_init_from_cfg(special_tokenizer_path, lang_tokenizer_path):
    class DummyModel(ASRBPEMixin, Serialization):
        pass

    model = DummyModel()
    # Stub register_artifact so the artifact path it receives is returned unchanged.
    model.register_artifact = Mock(side_effect=lambda self, x: x)
    config = OmegaConf.create(
        {
            "type": "agg",
            "dir": None,
            "langs": {
                "spl_tokens": {"dir": special_tokenizer_path, "type": "bpe"},
                "en": {"dir": lang_tokenizer_path, "type": "bpe"},
            },
            "custom_tokenizer": {"_target_": "nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer"},
        }
    )
    model._setup_aggregate_tokenizer(config)
    tokenizer = model.tokenizer

    assert isinstance(tokenizer, CanaryTokenizer)
    assert len(tokenizer.tokenizers_dict) == 2
    assert set(tokenizer.tokenizers_dict.keys()) == {"spl_tokens", "en"}

    assert isinstance(tokenizer.tokenizers_dict["spl_tokens"], SentencePieceTokenizer)
    assert tokenizer.tokenizers_dict["spl_tokens"].vocab_size == 32

    assert isinstance(tokenizer.tokenizers_dict["en"], SentencePieceTokenizer)
    assert tokenizer.tokenizers_dict["en"].vocab_size == 6

    assert tokenizer.text_to_ids("<|startoftranscript|>", lang_id="spl_tokens") == [31, 3]  # "_" comes first
    assert tokenizer.text_to_ids("a", lang_id="en") == [32 + 1, 32 + 2]