Merge pull request #61 from mozilla/icu_tokenizer
Add support for ICU tokenizer
ZJaume authored Jan 13, 2025
2 parents 7be3b4d + 554b720 commit e63da95
Showing 8 changed files with 264 additions and 24 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -198,8 +198,8 @@ This modifier needs a third column in the training data with per-word (technical

```yaml
- Tags: 0.05
-custom_detok_src: null
-custom_detok_trg: zh
+custom_detok_src: "moses:null"
+custom_detok_trg: "moses:zh"
spm_vocab: path/to/vocab.enzh.spm
template: "__source__ {src} __target__ {trg} __done__"
```
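
With this change, the detokenizer options take a `tokenizer:language` spec (`moses:…` or `icu:…`; bare language codes still default to Moses). As a minimal sketch, the same configuration can be built directly in Python — here `None` simply keeps the default space detokenizer, and the vocab path is the README's placeholder:

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

# Same settings as the YAML above, as constructor arguments.
tagger = PlaceholderTagModifier(
    probability=0.05,
    custom_detok_src=None,               # no custom detokenizer on the source side
    custom_detok_trg='moses:zh',         # Moses detokenizer for Chinese
    spm_vocab='path/to/vocab.enzh.spm',  # placeholder path from the README
    template='__source__ {src} __target__ {trg} __done__',
)
```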
@@ -218,8 +218,8 @@ Sometimes we want to just replace the source token with the target token directl
```yml
modifiers:
- Tags: 0.1
-custom_detok_src: null # Null value for the src detokenizer
-custom_detok_trg: zh
+custom_detok_src: "moses:null" # Null value for the src detokenizer
+custom_detok_trg: "moses:zh"
replace: 0.4 # 0.4 of the time Tags is triggered, instead replace the target token with random noise, and use that random noise to tag a corresponding source word.
```

@@ -229,8 +229,8 @@ If alignment information is present, we can augment the training data with inlin
```yml
modifiers:
- Tags: 0.1
-custom_detok_src: null # Null value for the src detokenizer
-custom_detok_trg: zh
+custom_detok_src: "moses:null" # Null value for the src detokenizer
+custom_detok_trg: "moses:zh"
augment: 0.4 # 0.4 of the time Tags is triggered, instead augment the source and the target with random noise. If you want 100% only noise without tag functionality, use augment: 1
```
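
The new `tag` weight added to `PlaceholderTagModifier.__init__` in this diff makes a noise-only setup explicit: with `tag=0` and `augment=1` the modifier never emits tags, which is also the only configuration the new ICU detokenizers accept. A sketch mirroring the new tests in this diff:

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

# Noise-only augmentation; with tag=0 the 'tag' mode is never registered,
# so ICU detokenizers ('icu:en', 'icu:de') are allowed here.
noiser = PlaceholderTagModifier(probability=1, augment=1, tag=0,
                                custom_detok_src='icu:en',
                                custom_detok_trg='icu:de')
for line in noiser(['Hello ▁ world\tHallo ▁ Welt\t0-0 1-1 2-2']):
    print(line)  # source and target gain the same random noise words
```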

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "opustrainer"
version = "0.2"
version = "0.3"
authors = [
{ name="Jelmer van der Linde", email="jelmer.vanderlinde@ed.ac.uk" },
{ name="Nikolay Bogoychev", email="n.bogoych@ed.ac.uk" }
84 changes: 70 additions & 14 deletions src/opustrainer/modifiers/placeholders.py
@@ -5,7 +5,8 @@

from opustrainer.alignments import Pair, parse_alignments, format_alignments
from opustrainer.modifiers import Modifier
-from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, MosesDetokenizer, SentencePieceTokenizer
+from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, SentencePieceTokenizer, \
+make_detokenizer, ICU_WHITESPACE_TOKEN
from opustrainer.modifiers.retokenize import Retokenizer, remap_alignment_pairs
from opustrainer import logger

@@ -231,8 +232,8 @@ class PlaceholderTagModifier(Modifier):
```yaml
modifiers:
- Tags: 0.02
-custom_detok_src: 'zh'
-custom_detok_trg: null
+custom_detok_src: 'moses:zh'
+custom_detok_trg: "moses:null"
template: "__source__ {src} __target__ {trg} __done__"
augment: 0.0 # 0% chance to just insert a random string on both sides
replace: 0.0 # 0% chance to use tags to force translate to a random string
@@ -252,18 +253,27 @@

def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None, custom_detok_trg: Optional[str]=None,
spm_vocab: Optional[Path]=None,
-template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0):
+template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0, tag:float=1):
super().__init__(probability)

self.template = template

# uses Moses detokenizer by default
if custom_detok_src and ':' not in custom_detok_src:
custom_detok_src = f'moses:{custom_detok_src}'
if custom_detok_trg and ':' not in custom_detok_trg:
custom_detok_trg = f'moses:{custom_detok_trg}'

self.custom_detok_src = custom_detok_src
self.custom_detok_trg = custom_detok_trg

self.src_retokenizer = Retokenizer(
-detokenizer=MosesDetokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
+detokenizer=make_detokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
)

self.trg_retokenizer = Retokenizer(
-detokenizer=MosesDetokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
+detokenizer=make_detokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
)
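
For backwards compatibility the constructor still accepts bare language codes and silently upgrades them to Moses specs. A standalone sketch of that normalization (the helper name is hypothetical; the logic mirrors the two `if` blocks above):

```python
def normalize_detok_spec(spec):
    # Hypothetical extraction of the constructor logic above: a bare code
    # like 'zh' is rewritten to 'moses:zh'; explicit specs pass through.
    if spec and ':' not in spec:
        return f'moses:{spec}'
    return spec

assert normalize_detok_spec('zh') == 'moses:zh'    # legacy configs keep working
assert normalize_detok_spec('icu:zh') == 'icu:zh'  # new explicit spec
assert normalize_detok_spec(None) is None          # no custom detokenizer
```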

@@ -281,7 +291,13 @@ def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None,
if replace > 0:
self.modes.append(('replace', replace))

-self.modes.append(('tag', 1.0)) # Weight doesn't matter as long as cumsum => 1.0, it's last on the list anyway
+# the modifier can be used for inline noise augmentation only
+if tag > 0:
+self.modes.append(('tag', tag))
+
+if ({'replace', 'tag'} & {mode for mode,_ in self.modes}) and \
+'icu' in ((self.custom_detok_src or '') + (self.custom_detok_trg or '')):
+raise ValueError('ICU tokenization is not supported with "tag" and "replace" modes')

def __call__(self, batch: List[str]) -> Iterable[str]:
for line in batch:
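
The mode table now carries an explicit weight for every entry instead of a hard-coded trailing `('tag', 1.0)`. A hypothetical standalone mirror of the construction above, to show what different parameters produce:

```python
def build_modes(augment: float = 0, replace: float = 0, tag: float = 1):
    # Hypothetical mirror of the __init__ logic above.
    modes = []
    if augment > 0:
        modes.append(('augment', augment))
    if replace > 0:
        modes.append(('replace', replace))
    if tag > 0:
        modes.append(('tag', tag))
    return modes

print(build_modes(augment=0.4))       # [('augment', 0.4), ('tag', 1)]
print(build_modes(augment=1, tag=0))  # [('augment', 1)] -- noise only; ICU detok allowed
```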
@@ -293,7 +309,7 @@ def __call__(self, batch: List[str]) -> Iterable[str]:
def apply(self, line:str) -> str:
"""Applies tag to words in a line based on alignment info, and then removes the alignment info from the line.
This is used to enable terminology support by tagging random words with their translation.
eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake.
eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake.
By default the detokenizer used is the trivial detokenizer, but we can instead have separate detokenizers on src and trg."
"""

@@ -333,7 +349,7 @@ def apply(self, line:str) -> str:
continue

# Select mode (skip random_weighted_choices*() when 'tag' is the only mode)
-mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else 'tag'
+mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else self.modes[0][0]

if mode == "tag" or mode == "replace":
if mode == "tag":
@@ -375,19 +391,19 @@ def apply(self, line:str) -> str:
# Augment mode adds random noise both on the source and the target without any
# tagging encouraging the model to copy crap from one side to the other.
augment_tokens = get_random_unicode_words()
-source = source[:candidate.src+1] + augment_tokens + source[candidate.src+1:]
-target = target[:candidate.trg+1] + augment_tokens + target[candidate.trg+1:]
+source, num_src_aug_tokens, pos_aug_src = self.insert_augmented(augment_tokens, source, candidate.src+1, self.custom_detok_src)
+target, num_trg_aug_tokens, pos_aug_trg = self.insert_augmented(augment_tokens, target, candidate.trg+1, self.custom_detok_trg)

# Fix up alignment pairs
alignments = (
# pairs before and including the candidate stay the same
alignments[:candidate_index+1]
# fill in the gap created by the added random noise
-+ [Pair(candidate.src + n, candidate.trg + n) for n in range(1, len(augment_tokens) + 1)]
++ [Pair(candidate.src + n_src, candidate.trg + n_trg) for n_src, n_trg in zip(pos_aug_src, pos_aug_trg)]
# pairs after the replaced bit have to be offset by the length of the replacement bit
-+ [Pair(pair.src + len(augment_tokens), pair.trg + len(augment_tokens)) for pair in alignments[candidate_index+1:]]
++ [Pair(pair.src + num_src_aug_tokens, pair.trg + num_trg_aug_tokens) for pair in alignments[candidate_index+1:]]
)
-candidate_offset = candidate_index + len(augment_tokens) + 1
+candidate_offset = candidate_index + min(num_src_aug_tokens, num_trg_aug_tokens) + 1

source_detok, _, source_mapping = self.src_retokenizer.retokenize(source)
target_detok, _, target_mapping = self.trg_retokenizer.retokenize(target)
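
The zip over `pos_aug_src`/`pos_aug_trg` is the heart of this change: with ICU tokens the same noise words can occupy different numbers of positions on each side (extra `▁` separators), so a single `len(augment_tokens)` offset no longer works. An illustrative calculation, assuming `Pair` is the named tuple imported at the top of this file and using made-up offsets:

```python
from opustrainer.alignments import Pair

# Illustrative values: two noise words inserted after source token 0 and
# target token 0. The source needed a leading '▁' separator (offsets 2 and 4,
# 4 tokens added in total); the target did not (offsets 1 and 3, 3 tokens).
candidate = Pair(0, 0)
pos_aug_src, num_src_aug_tokens = [2, 4], 4
pos_aug_trg, num_trg_aug_tokens = [1, 3], 3

gap = [Pair(candidate.src + s, candidate.trg + t)
       for s, t in zip(pos_aug_src, pos_aug_trg)]
print(gap)  # [Pair(src=2, trg=1), Pair(src=4, trg=3)] -- noise words aligned to each other
```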
@@ -398,6 +414,46 @@ def apply(self, line:str) -> str:
else:
return source_detok + "\t" + target_detok

def insert_augmented(self, augment_tokens: List[str], tokens: List[str], position: int, detokenization: str) -> Tuple[List[str], int, List[int]]:
"""
Inserts augmented tokens.
Accounts for possible ICU detokenization, which uses the special symbol "▁" for whitespace tokens.
Such tokens will also be inserted to separate the augmented words.
Returns:
new tokens
number of augmented tokens, including whitespace tokens in the ICU case
alignment positions for the augmented tokens (whitespace tokens are excluded; we don't need alignments for them)
"""
prefix = tokens[:position]
postfix = tokens[position:]
aug_aln_offset = []

if detokenization is not None and "icu" in detokenization:
new_aug_tokens = []
aug_pos_index = 1

if len(prefix) > 0 and prefix[-1] != ICU_WHITESPACE_TOKEN:
new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
aug_pos_index += 1

for token in augment_tokens:
new_aug_tokens.append(token)
# save the offset of the augmented words to use in alignments
aug_aln_offset.append(aug_pos_index)
new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
aug_pos_index += 2

if len(postfix) > 0 and postfix[0] == ICU_WHITESPACE_TOKEN:
new_aug_tokens.pop()

augment_tokens = new_aug_tokens
else:
aug_aln_offset = list(range(1, len(augment_tokens) + 1))

tokens = prefix + augment_tokens + postfix
return tokens, len(augment_tokens), aug_aln_offset

def validate(self, context:List[Modifier]) -> None:
"""Current limitation of the tags modifier is that any other modifier might modify the
inserted tags, which we don't want. So warn users about that if we notice it.
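
A usage sketch of the `insert_augmented` method added above, on ICU-style token lists (inputs recreated by hand; `tagger` is constructed as in the new tests in this diff):

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

tagger = PlaceholderTagModifier(probability=1, augment=1, tag=0,
                                custom_detok_src='icu:en',
                                custom_detok_trg='icu:de')

# IcuTokenizer output keeps whitespace as '▁' tokens.
tokens = ['Hello', '▁', 'world']
new_tokens, n_added, offsets = tagger.insert_augmented(
    ['িৡহ'], tokens, position=1, detokenization='icu:en')
print(new_tokens)  # ['Hello', '▁', 'িৡহ', '▁', 'world'] -- reuses the existing '▁'
print(n_added)     # 2: the noise word plus one inserted '▁' separator
print(offsets)     # [2]: alignment offset of the noise word (the '▁' gets none)
```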
8 changes: 6 additions & 2 deletions src/opustrainer/modifiers/retokenize.py
@@ -33,6 +33,10 @@ def retokenize(self, tokens:TokenList) -> Tuple[str,TokenList,TokenMapping]:

prev_j = 0
for i, old_token_span in enumerate(old_token_spans):
# ICU whitespace tokens have a None span; skip them, leaving an empty mapping entry
if old_token_span is None:
continue

for j, new_token_span in enumerate(new_token_spans[prev_j:], start=prev_j):
prev_j = j
overlap = slice_cmp(old_token_span, new_token_span)
@@ -59,8 +63,8 @@ def remap_alignment_pairs(src_mapping:TokenMapping, trg_mapping:TokenMapping, al
sentence pair.
E.g. if you have
-source-mapping: [0 => [3,4], 1 => [5]],
-target-mapping: [0 => [0], 1 => [1]]
+source-mapping: [0 => [3,4], 1 => [5], 2 => []],
+target-mapping: [0 => [0], 1 => [1], 2 => []]
alignments: [(0,1), (1,1)]
it will return [
(3,1), (4,1), # the [0 => [3,4]] mapping
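
A worked sketch of the updated docstring example, assuming `TokenMapping` is a list of new-token index lists (the empty lists are tokens, such as ICU whitespace, that vanish after retokenization):

```python
from opustrainer.alignments import Pair
from opustrainer.modifiers.retokenize import remap_alignment_pairs

src_mapping = [[3, 4], [5], []]   # source token 2 (a '▁') maps to nothing
trg_mapping = [[0], [1], []]
alignments = [Pair(0, 1), Pair(1, 1)]

print(remap_alignment_pairs(src_mapping, trg_mapping, alignments))
# Expected per the docstring: (3,1) and (4,1) from 0 => [3,4], plus (5,1) from 1 => [5]
```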
72 changes: 72 additions & 0 deletions src/opustrainer/tokenizers.py
@@ -11,12 +11,15 @@
DETOKENIZERS = {
'moses': lambda lang: MosesDetokenizer(lang),
'spaces': lambda: SpaceDetokenizer(),
'icu': lambda lang: IcuDetokenizer(lang),

}

TOKENIZERS = {
'moses': lambda lang: MosesTokenizer(lang),
'spm': lambda vocab: SentencePieceTokenizer(vocab),
'spaces': lambda: SpaceTokenizer(),
'icu': lambda lang: IcuTokenizer(lang),
}
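
`make_detokenizer` (imported by `placeholders.py` above) is not part of this hunk; presumably it splits the spec on `:` and dispatches into these tables. A sketch under that assumption only:

```python
def make_detokenizer(spec: str):
    # Assumed behaviour: 'icu:de' -> DETOKENIZERS['icu']('de'),
    # 'spaces' -> DETOKENIZERS['spaces']().
    name, _, lang = spec.partition(':')
    return DETOKENIZERS[name](lang) if lang else DETOKENIZERS[name]()

detok = make_detokenizer('icu:de')  # an IcuDetokenizer with lang='de'
```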


@@ -126,3 +129,72 @@ def tokenize(self, text:str) -> Tuple[TokenList,TokenSpanList]:
tokens = [text[span] for span in spans]
return tokens, spans

# The same character as in SentencePiece
ICU_WHITESPACE_TOKEN = "▁"
class IcuTokenizer:
"""
Tokenizes text by splitting words and punctuation using ICU segmenter.
Whitespaces will be preserved as a special token ▁ for lossless detokenization.
Requires PyICU; installation steps are at https://pypi.org/project/PyICU/
"""

def __init__(self, lang: str):
self.lang = lang

def tokenize(self, text:str) -> Tuple[TokenList, TokenSpanList]:
from icu import BreakIterator, Locale

bi = BreakIterator.createWordInstance(Locale(self.lang))
bi.setText(text)

tokens = []
start = bi.first()
for end in bi:
token = text[start:end]
if (
token and token != "\n"
): # exclude empty tokens and newlines; whitespace is kept and later replaced with the special ▁ token
tokens.append(token)
start = end

spans: TokenSpanList = []
offset = 0
for token in tokens:
offset = text.find(token, offset)
if offset == -1:
raise RuntimeError(f"Could not find token '{token}' in original text")
spans.append(slice(offset, offset + len(token)))
offset += len(token)

tokens = [token.replace(" ", ICU_WHITESPACE_TOKEN) for token in tokens]
return tokens, spans

class IcuDetokenizer:
"""
Detokenizes tokens back into the original text preserving whitespaces as well.
Spans for whitespaces will be None.
"""

# For compatibility with MosesDetokenizer interface
def __init__(self, lang):
self.lang = lang

def detokenize(self, tokens:TokenList) -> Tuple[str,TokenSpanList]:
text = "".join(tokens).replace(ICU_WHITESPACE_TOKEN, " ")

spans = []
offset = 0

for token in tokens:
if token == ICU_WHITESPACE_TOKEN:
spans.append(None)
continue
# there are some edge cases where a whitespace can appear inside a token
token = token.replace(ICU_WHITESPACE_TOKEN, " ")
offset = text.find(token, offset)
if offset == -1:
raise RuntimeError(f"Could not find token '{token}' in detokenized text")
spans.append(slice(offset, offset + len(token)))
offset += len(token)

return text, spans
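
A round-trip usage sketch for the two new classes (requires PyICU; exact word boundaries may vary with the ICU version):

```python
from opustrainer.tokenizers import IcuTokenizer, IcuDetokenizer

tokenizer = IcuTokenizer('en')
tokens, spans = tokenizer.tokenize('Hello world!')
print(tokens)  # plausibly ['Hello', '▁', 'world', '!'] -- the space survives as '▁'

detokenizer = IcuDetokenizer('en')
text, spans = detokenizer.detokenize(tokens)
assert text == 'Hello world!'  # lossless: '▁' turns back into a space
print(spans)   # the '▁' token gets a None span; real tokens get character slices
```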
39 changes: 39 additions & 0 deletions tests/test_placeholders.py
@@ -40,6 +40,15 @@ def test_tagger_augment(self):
output = tagger(['Hello world\tHallo Welt\t0-0 1-1'])
self.assertEqual(first(output), 'Hello িৡহ world ЇӤӕѣѮ қӃӄЀҲ\tHallo িৡহ Welt ЇӤӕѣѮ қӃӄЀҲ\t0-0 1-1 2-2 3-3 4-4')

def test_tagger_augment_icu(self):
"""Augment mode will add random noise without tags to both source and target
sentence, teaching the model to copy strings it doesn't understand."""
tagger = PlaceholderTagModifier(probability=1, augment=1, tag=0, custom_detok_src='icu:en', custom_detok_trg='icu:de')
tagger.print_alignments = True
output = tagger(['Hello ▁ world\tHallo ▁ Welt\t0-0 1-1 2-2'])
self.assertEqual(first(output), 'Hello িৡহ world ټ؇ۤە٣ٮڛۃ \tHallo িৡহ Welt ټ؇ۤە٣ٮڛۃ \t0-0 1-1 2-2 3-3')


def test_retokenize(self):
"""Pass the spm vocab to the placeholder tag generator so that it can
retokenize the input, and update the alignments accordingly."""
@@ -80,6 +89,36 @@ def test_retokenize(self):
# 7-9 [.] [。] 18-16
])

def test_augment_icu(self):
"""Pass the spm vocab to the placeholder tag generator so that it can
retokenize the input, and update the alignments accordingly."""
tagger = PlaceholderTagModifier(
probability=0.2,
augment=1,
tag=0,
custom_detok_src='icu:en',
custom_detok_trg='icu:zh',
spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue

output = tagger(['\t'.join([
'This ▁ is ▁ a ▁ simple ▁ test ▁ statement ▁ 🤣 .',
#^0 ^1^2 ^3^4^5^6 ^7^8 ^9^10 ^11^12^13
'这 是 一个 简单 的 测试 语 句 ▁ 🤣 ▁ 。',
#^0 ^1 ^2 ^3 ^4 ^5 ^6 ^7^8 ^9^10^11
'0-0 2-1 4-2 6-3 6-4 8-5 10-6 10-7 12-9 13-11',
])])

self.assertEqual(first(output).split('\t'), [
'This িৡহ is a simple test statement 🤣.',
# ['This', ' ', '', '', 'ি', '', '', 'ৡ', '', '', 'হ', ' is', ' a', ' simple', ' test', ' statement', ' ', '', '', '', '🤣', '.']
'这 িৡহ 是一个简单的测试语句 🤣 。',
# ['这', ' ', '', '', 'ি', '', '', 'ৡ', '', '', 'হ', ' 是', '一', '个', '简', '单', '的', '测', '试', '语', '句', ' ', '', '', '', '🤣', ' 。']
'0-0 4-4 4-5 4-6 4-7 4-8 4-9 4-10 5-4 5-5 5-6 5-7 5-8 5-9 5-10 6-4 6-5 6-6 '
'6-7 6-8 6-9 6-10 7-4 7-5 7-6 7-7 7-8 7-9 7-10 8-4 8-5 8-6 8-7 8-8 8-9 8-10 '
'9-4 9-5 9-6 9-7 9-8 9-9 9-10 10-4 10-5 10-6 10-7 10-8 10-9 10-10 11-11 12-12 '
'12-13 13-14 13-15 13-16 14-17 14-18 15-19 15-20 20-25 21-26'
])

def test_retokenize_on_non_trigger(self):
"""Pass the spm vocab to the placeholder tag generator so that it can
retokenize the input, even if probability is 0."""