Merge pull request #61 from mozilla/icu_tokenizer
Add support for ICU tokenizer
ZJaume authored Jan 13, 2025
2 parents 7be3b4d + 554b720 commit e63da95
Showing 8 changed files with 264 additions and 24 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -198,8 +198,8 @@ This modifier needs a third column in the training data with per-word (technical

```yaml
- Tags: 0.05
-custom_detok_src: null
-custom_detok_trg: zh
+custom_detok_src: "moses:null"
+custom_detok_trg: "moses:zh"
spm_vocab: path/to/vocab.enzh.spm
template: "__source__ {src} __target__ {trg} __done__"
```
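
With this change, the detokenizer options take a `tokenizer:language` spec (`moses:…` or `icu:…`; bare language codes still default to Moses). As a minimal sketch, the same configuration can be built directly in Python — here `None` simply keeps the default space detokenizer, and the vocab path is the README's placeholder:

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

# Same settings as the YAML above, as constructor arguments.
tagger = PlaceholderTagModifier(
    probability=0.05,
    custom_detok_src=None,               # no custom detokenizer on the source side
    custom_detok_trg='moses:zh',         # Moses detokenizer for Chinese
    spm_vocab='path/to/vocab.enzh.spm',  # placeholder path from the README
    template='__source__ {src} __target__ {trg} __done__',
)
```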
@@ -218,8 +218,8 @@ Sometimes we want to just replace the source token with the target token directl
```yml
modifiers:
- Tags: 0.1
-custom_detok_src: null # Null value for the src detokenizer
-custom_detok_trg: zh
+custom_detok_src: "moses:null" # Null value for the src detokenizer
+custom_detok_trg: "moses:zh"
replace: 0.4 # 0.4 of the time Tags is triggered, instead replace the target token with random noise, and use that random noise to tag a corresponding source word.
```

@@ -229,8 +229,8 @@ If alignment information is present, we can augment the training data with inlin
```yml
modifiers:
- Tags: 0.1
-custom_detok_src: null # Null value for the src detokenizer
-custom_detok_trg: zh
+custom_detok_src: "moses:null" # Null value for the src detokenizer
+custom_detok_trg: "moses:zh"
augment: 0.4 # 0.4 of the time Tags is triggered, instead augment the source and the target with random noise. If you want 100% only noise without tag functionality, use augment: 1
```
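
The new `tag` weight added to `PlaceholderTagModifier.__init__` in this diff makes a noise-only setup explicit: with `tag=0` and `augment=1` the modifier never emits tags, which is also the only configuration the new ICU detokenizers accept. A sketch mirroring the new tests in this diff:

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

# Noise-only augmentation; with tag=0 the 'tag' mode is never registered,
# so ICU detokenizers ('icu:en', 'icu:de') are allowed here.
noiser = PlaceholderTagModifier(probability=1, augment=1, tag=0,
                                custom_detok_src='icu:en',
                                custom_detok_trg='icu:de')
for line in noiser(['Hello ▁ world\tHallo ▁ Welt\t0-0 1-1 2-2']):
    print(line)  # source and target gain the same random noise words
```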

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "opustrainer"
version = "0.2"
version = "0.3"
authors = [
{ name="Jelmer van der Linde", email="jelmer.vanderlinde@ed.ac.uk" },
{ name="Nikolay Bogoychev", email="n.bogoych@ed.ac.uk" }
84 changes: 70 additions & 14 deletions src/opustrainer/modifiers/placeholders.py
@@ -5,7 +5,8 @@

from opustrainer.alignments import Pair, parse_alignments, format_alignments
from opustrainer.modifiers import Modifier
-from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, MosesDetokenizer, SentencePieceTokenizer
+from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, SentencePieceTokenizer, \
+make_detokenizer, ICU_WHITESPACE_TOKEN
from opustrainer.modifiers.retokenize import Retokenizer, remap_alignment_pairs
from opustrainer import logger

@@ -231,8 +232,8 @@ class PlaceholderTagModifier(Modifier):
```yaml
modifiers:
- Tags: 0.02
-custom_detok_src: 'zh'
-custom_detok_trg: null
+custom_detok_src: 'moses:zh'
+custom_detok_trg: "moses:null"
template: "__source__ {src} __target__ {trg} __done__"
augment: 0.0 # 0% chance to just insert a random string on both sides
replace: 0.0 # 0% chance to use tags to force translate to a random string
@@ -252,18 +253,27 @@

def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None, custom_detok_trg: Optional[str]=None,
spm_vocab: Optional[Path]=None,
-template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0):
+template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0, tag:float=1):
super().__init__(probability)

self.template = template

# uses Moses detokenizer by default
if custom_detok_src and ':' not in custom_detok_src:
custom_detok_src = f'moses:{custom_detok_src}'
if custom_detok_trg and ':' not in custom_detok_trg:
custom_detok_trg = f'moses:{custom_detok_trg}'

self.custom_detok_src = custom_detok_src
self.custom_detok_trg = custom_detok_trg

self.src_retokenizer = Retokenizer(
-detokenizer=MosesDetokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
+detokenizer=make_detokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
)

self.trg_retokenizer = Retokenizer(
-detokenizer=MosesDetokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
+detokenizer=make_detokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
)
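
For backwards compatibility the constructor still accepts bare language codes and silently upgrades them to Moses specs. A standalone sketch of that normalization (the helper name is hypothetical; the logic mirrors the two `if` blocks above):

```python
def normalize_detok_spec(spec):
    # Hypothetical extraction of the constructor logic above: a bare code
    # like 'zh' is rewritten to 'moses:zh'; explicit specs pass through.
    if spec and ':' not in spec:
        return f'moses:{spec}'
    return spec

assert normalize_detok_spec('zh') == 'moses:zh'    # legacy configs keep working
assert normalize_detok_spec('icu:zh') == 'icu:zh'  # new explicit spec
assert normalize_detok_spec(None) is None          # no custom detokenizer
```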

@@ -281,7 +291,13 @@ def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None,
if replace > 0:
self.modes.append(('replace', replace))

-self.modes.append(('tag', 1.0)) # Weight doesn't matter as long as cumsum => 1.0, it's last on the list anyway
+# the modifier can be used for inline noise augmentation only
+if tag > 0:
+self.modes.append(('tag', tag))
+
+if ({'replace', 'tag'} & {mode for mode,_ in self.modes}) and \
+'icu' in ((self.custom_detok_src or '') + (self.custom_detok_trg or '')):
+raise ValueError('ICU tokenization is not supported with "tag" and "replace" modes')

def __call__(self, batch: List[str]) -> Iterable[str]:
for line in batch:
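
The mode table now carries an explicit weight for every entry instead of a hard-coded trailing `('tag', 1.0)`. A hypothetical standalone mirror of the construction above, to show what different parameters produce:

```python
def build_modes(augment: float = 0, replace: float = 0, tag: float = 1):
    # Hypothetical mirror of the __init__ logic above.
    modes = []
    if augment > 0:
        modes.append(('augment', augment))
    if replace > 0:
        modes.append(('replace', replace))
    if tag > 0:
        modes.append(('tag', tag))
    return modes

print(build_modes(augment=0.4))       # [('augment', 0.4), ('tag', 1)]
print(build_modes(augment=1, tag=0))  # [('augment', 1)] -- noise only; ICU detok allowed
```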
@@ -293,7 +309,7 @@ def __call__(self, batch: List[str]) -> Iterable[str]:
def apply(self, line:str) -> str:
"""Applies tag to words in a line based on alignment info, and then removes the alignment info from the line.
This is used to enable terminology support by tagging random words with their translation.
eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake.
eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake.
By default the detokenizer used is the trivial detokenizer, but we can instead have separate detokenizers on src and trg."
"""

@@ -333,7 +349,7 @@ def apply(self, line:str) -> str:
continue

# Select mode (skip random_weighted_choices*() when 'tag' is the only mode)
-mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else 'tag'
+mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else self.modes[0][0]

if mode == "tag" or mode == "replace":
if mode == "tag":
@@ -375,19 +391,19 @@ def apply(self, line:str) -> str:
# Augment mode adds random noise both on the source and the target without any
# tagging encouraging the model to copy crap from one side to the other.
augment_tokens = get_random_unicode_words()
-source = source[:candidate.src+1] + augment_tokens + source[candidate.src+1:]
-target = target[:candidate.trg+1] + augment_tokens + target[candidate.trg+1:]
+source, num_src_aug_tokens, pos_aug_src = self.insert_augmented(augment_tokens, source, candidate.src+1, self.custom_detok_src)
+target, num_trg_aug_tokens, pos_aug_trg = self.insert_augmented(augment_tokens, target, candidate.trg+1, self.custom_detok_trg)

# Fix up alignment pairs
alignments = (
# pairs before and including the candidate stay the same
alignments[:candidate_index+1]
# fill in the gap created by the added random noise
-+ [Pair(candidate.src + n, candidate.trg + n) for n in range(1, len(augment_tokens) + 1)]
++ [Pair(candidate.src + n_src, candidate.trg + n_trg) for n_src, n_trg in zip(pos_aug_src, pos_aug_trg)]
# pairs after the replaced bit have to be offset by the length of the replacement bit
-+ [Pair(pair.src + len(augment_tokens), pair.trg + len(augment_tokens)) for pair in alignments[candidate_index+1:]]
++ [Pair(pair.src + num_src_aug_tokens, pair.trg + num_trg_aug_tokens) for pair in alignments[candidate_index+1:]]
)
-candidate_offset = candidate_index + len(augment_tokens) + 1
+candidate_offset = candidate_index + min(num_src_aug_tokens, num_trg_aug_tokens) + 1

source_detok, _, source_mapping = self.src_retokenizer.retokenize(source)
target_detok, _, target_mapping = self.trg_retokenizer.retokenize(target)
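
The zip over `pos_aug_src`/`pos_aug_trg` is the heart of this change: with ICU tokens the same noise words can occupy different numbers of positions on each side (extra `▁` separators), so a single `len(augment_tokens)` offset no longer works. An illustrative calculation, assuming `Pair` is the named tuple imported at the top of this file and using made-up offsets:

```python
from opustrainer.alignments import Pair

# Illustrative values: two noise words inserted after source token 0 and
# target token 0. The source needed a leading '▁' separator (offsets 2 and 4,
# 4 tokens added in total); the target did not (offsets 1 and 3, 3 tokens).
candidate = Pair(0, 0)
pos_aug_src, num_src_aug_tokens = [2, 4], 4
pos_aug_trg, num_trg_aug_tokens = [1, 3], 3

gap = [Pair(candidate.src + s, candidate.trg + t)
       for s, t in zip(pos_aug_src, pos_aug_trg)]
print(gap)  # [Pair(src=2, trg=1), Pair(src=4, trg=3)] -- noise words aligned to each other
```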
@@ -398,6 +414,46 @@ def apply(self, line:str) -> str:
else:
return source_detok + "\t" + target_detok

def insert_augmented(self, augment_tokens: List[str], tokens: List[str], position: int, detokenization: str) -> Tuple[List[str], int, List[int]]:
"""
Inserts augmented tokens.
Accounts for possible ICU detokenization, which uses the special symbol "▁" for whitespace tokens.
Such tokens will also be inserted to separate the augmented words.
Returns:
new tokens
number of augmented tokens, including whitespace tokens in the ICU case
alignment positions for the augmented tokens (whitespace tokens are excluded; we don't need alignments for them)
"""
prefix = tokens[:position]
postfix = tokens[position:]
aug_aln_offset = []

if detokenization is not None and "icu" in detokenization:
new_aug_tokens = []
aug_pos_index = 1

if len(prefix) > 0 and prefix[-1] != ICU_WHITESPACE_TOKEN:
new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
aug_pos_index += 1

for token in augment_tokens:
new_aug_tokens.append(token)
# save the offset of the augmented words to use in alignments
aug_aln_offset.append(aug_pos_index)
new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
aug_pos_index += 2

if len(postfix) > 0 and postfix[0] == ICU_WHITESPACE_TOKEN:
new_aug_tokens.pop()

augment_tokens = new_aug_tokens
else:
aug_aln_offset = list(range(1, len(augment_tokens) + 1))

tokens = prefix + augment_tokens + postfix
return tokens, len(augment_tokens), aug_aln_offset

def validate(self, context:List[Modifier]) -> None:
"""Current limitation of the tags modifier is that any other modifier might modify the
inserted tags, which we don't want. So warn users about that if we notice it.
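
A usage sketch of the `insert_augmented` method added above, on ICU-style token lists (inputs recreated by hand; `tagger` is constructed as in the new tests in this diff):

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

tagger = PlaceholderTagModifier(probability=1, augment=1, tag=0,
                                custom_detok_src='icu:en',
                                custom_detok_trg='icu:de')

# IcuTokenizer output keeps whitespace as '▁' tokens.
tokens = ['Hello', '▁', 'world']
new_tokens, n_added, offsets = tagger.insert_augmented(
    ['িৡহ'], tokens, position=1, detokenization='icu:en')
print(new_tokens)  # ['Hello', '▁', 'িৡহ', '▁', 'world'] -- reuses the existing '▁'
print(n_added)     # 2: the noise word plus one inserted '▁' separator
print(offsets)     # [2]: alignment offset of the noise word (the '▁' gets none)
```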
8 changes: 6 additions & 2 deletions src/opustrainer/modifiers/retokenize.py
@@ -33,6 +33,10 @@ def retokenize(self, tokens:TokenList) -> Tuple[str,TokenList,TokenMapping]:

prev_j = 0
for i, old_token_span in enumerate(old_token_spans):
# ICU whitespace tokens have a None span; skip them, leaving an empty mapping entry
if old_token_span is None:
continue

for j, new_token_span in enumerate(new_token_spans[prev_j:], start=prev_j):
prev_j = j
overlap = slice_cmp(old_token_span, new_token_span)
@@ -59,8 +63,8 @@ def remap_alignment_pairs(src_mapping:TokenMapping, trg_mapping:TokenMapping, al
sentence pair.
E.g. if you have
-source-mapping: [0 => [3,4], 1 => [5]],
-target-mapping: [0 => [0], 1 => [1]]
+source-mapping: [0 => [3,4], 1 => [5], 2 => []],
+target-mapping: [0 => [0], 1 => [1], 2 => []]
alignments: [(0,1), (1,1)]
it will return [
(3,1), (4,1), # the [0 => [3,4]] mapping
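
A worked sketch of the updated docstring example, assuming `TokenMapping` is a list of new-token index lists (the empty lists are tokens, such as ICU whitespace, that vanish after retokenization):

```python
from opustrainer.alignments import Pair
from opustrainer.modifiers.retokenize import remap_alignment_pairs

src_mapping = [[3, 4], [5], []]   # source token 2 (a '▁') maps to nothing
trg_mapping = [[0], [1], []]
alignments = [Pair(0, 1), Pair(1, 1)]

print(remap_alignment_pairs(src_mapping, trg_mapping, alignments))
# Expected per the docstring: (3,1) and (4,1) from 0 => [3,4], plus (5,1) from 1 => [5]
```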
72 changes: 72 additions & 0 deletions src/opustrainer/tokenizers.py
@@ -11,12 +11,15 @@
DETOKENIZERS = {
'moses': lambda lang: MosesDetokenizer(lang),
'spaces': lambda: SpaceDetokenizer(),
'icu': lambda lang: IcuDetokenizer(lang),

}

TOKENIZERS = {
'moses': lambda lang: MosesTokenizer(lang),
'spm': lambda vocab: SentencePieceTokenizer(vocab),
'spaces': lambda: SpaceTokenizer(),
'icu': lambda lang: IcuTokenizer(lang),
}
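
`make_detokenizer` (imported by `placeholders.py` above) is not part of this hunk; presumably it splits the spec on `:` and dispatches into these tables. A sketch under that assumption only:

```python
def make_detokenizer(spec: str):
    # Assumed behaviour: 'icu:de' -> DETOKENIZERS['icu']('de'),
    # 'spaces' -> DETOKENIZERS['spaces']().
    name, _, lang = spec.partition(':')
    return DETOKENIZERS[name](lang) if lang else DETOKENIZERS[name]()

detok = make_detokenizer('icu:de')  # an IcuDetokenizer with lang='de'
```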


@@ -126,3 +129,72 @@ def tokenize(self, text:str) -> Tuple[TokenList,TokenSpanList]:
tokens = [text[span] for span in spans]
return tokens, spans

# The same character as in SentencePiece
ICU_WHITESPACE_TOKEN = "▁"
class IcuTokenizer:
"""
Tokenizes text by splitting words and punctuation using ICU segmenter.
Whitespaces will be preserved as a special token ▁ for lossless detokenization.
Requires PyICU; installation steps are at https://pypi.org/project/PyICU/
"""

def __init__(self, lang: str):
self.lang = lang

def tokenize(self, text:str) -> Tuple[TokenList, TokenSpanList]:
from icu import BreakIterator, Locale

bi = BreakIterator.createWordInstance(Locale(self.lang))
bi.setText(text)

tokens = []
start = bi.first()
for end in bi:
token = text[start:end]
if (
token and token != "\n"
): # exclude empty tokens and newlines; whitespace is kept and later replaced with the special ▁ token
tokens.append(token)
start = end

spans: TokenSpanList = []
offset = 0
for token in tokens:
offset = text.find(token, offset)
if offset == -1:
raise RuntimeError(f"Could not find token '{token}' in original text")
spans.append(slice(offset, offset + len(token)))
offset += len(token)

tokens = [token.replace(" ", ICU_WHITESPACE_TOKEN) for token in tokens]
return tokens, spans

class IcuDetokenizer:
"""
Detokenizes tokens back into the original text preserving whitespaces as well.
Spans for whitespaces will be None.
"""

# For compatibility with MosesDetokenizer interface
def __init__(self, lang):
self.lang = lang

def detokenize(self, tokens:TokenList) -> Tuple[str,TokenSpanList]:
text = "".join(tokens).replace(ICU_WHITESPACE_TOKEN, " ")

spans = []
offset = 0

for token in tokens:
if token == ICU_WHITESPACE_TOKEN:
spans.append(None)
continue
# there are some edge cases where a whitespace can appear inside a token
token = token.replace(ICU_WHITESPACE_TOKEN, " ")
offset = text.find(token, offset)
if offset == -1:
raise RuntimeError(f"Could not find token '{token}' in detokenized text")
spans.append(slice(offset, offset + len(token)))
offset += len(token)

return text, spans
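
A round-trip usage sketch for the two new classes (requires PyICU; exact word boundaries may vary with the ICU version):

```python
from opustrainer.tokenizers import IcuTokenizer, IcuDetokenizer

tokenizer = IcuTokenizer('en')
tokens, spans = tokenizer.tokenize('Hello world!')
print(tokens)  # plausibly ['Hello', '▁', 'world', '!'] -- the space survives as '▁'

detokenizer = IcuDetokenizer('en')
text, spans = detokenizer.detokenize(tokens)
assert text == 'Hello world!'  # lossless: '▁' turns back into a space
print(spans)   # the '▁' token gets a None span; real tokens get character slices
```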
39 changes: 39 additions & 0 deletions tests/test_placeholders.py
@@ -40,6 +40,15 @@ def test_tagger_augment(self):
output = tagger(['Hello world\tHallo Welt\t0-0 1-1'])
self.assertEqual(first(output), 'Hello িৡহ world ЇӤӕѣѮ қӃӄЀҲ\tHallo িৡহ Welt ЇӤӕѣѮ қӃӄЀҲ\t0-0 1-1 2-2 3-3 4-4')

def test_tagger_augment_icu(self):
"""Augment mode will add random noise without tags to both source and target
sentence, teaching the model to copy strings it doesn't understand."""
tagger = PlaceholderTagModifier(probability=1, augment=1, tag=0, custom_detok_src='icu:en', custom_detok_trg='icu:de')
tagger.print_alignments = True
output = tagger(['Hello ▁ world\tHallo ▁ Welt\t0-0 1-1 2-2'])
self.assertEqual(first(output), 'Hello িৡহ world ټ؇ۤە٣ٮڛۃ \tHallo িৡহ Welt ټ؇ۤە٣ٮڛۃ \t0-0 1-1 2-2 3-3')


def test_retokenize(self):
"""Pass the spm vocab to the placeholder tag generator so that it can
retokenize the input, and update the alignments accordingly."""
@@ -80,6 +89,36 @@ def test_retokenize(self):
# 7-9 [.] [。] 18-16
])

def test_augment_icu(self):
"""Pass the spm vocab to the placeholder tag generator so that it can
retokenize the input, and update the alignments accordingly."""
tagger = PlaceholderTagModifier(
probability=0.2,
augment=1,
tag=0,
custom_detok_src='icu:en',
custom_detok_trg='icu:zh',
spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue

output = tagger(['\t'.join([
'This ▁ is ▁ a ▁ simple ▁ test ▁ statement ▁ 🤣 .',
#^0 ^1^2 ^3^4^5^6 ^7^8 ^9^10 ^11^12^13
'这 是 一个 简单 的 测试 语 句 ▁ 🤣 ▁ 。',
#^0 ^1 ^2 ^3 ^4 ^5 ^6 ^7^8 ^9^10^11
'0-0 2-1 4-2 6-3 6-4 8-5 10-6 10-7 12-9 13-11',
])])

self.assertEqual(first(output).split('\t'), [
'This িৡহ is a simple test statement 🤣.',
# ['This', ' ', '', '', 'ি', '', '', 'ৡ', '', '', 'হ', ' is', ' a', ' simple', ' test', ' statement', ' ', '', '', '', '🤣', '.']
'这 িৡহ 是一个简单的测试语句 🤣 。',
# ['这', ' ', '', '', 'ি', '', '', 'ৡ', '', '', 'হ', ' 是', '一', '个', '简', '单', '的', '测', '试', '语', '句', ' ', '', '', '', '🤣', ' 。']
'0-0 4-4 4-5 4-6 4-7 4-8 4-9 4-10 5-4 5-5 5-6 5-7 5-8 5-9 5-10 6-4 6-5 6-6 '
'6-7 6-8 6-9 6-10 7-4 7-5 7-6 7-7 7-8 7-9 7-10 8-4 8-5 8-6 8-7 8-8 8-9 8-10 '
'9-4 9-5 9-6 9-7 9-8 9-9 9-10 10-4 10-5 10-6 10-7 10-8 10-9 10-10 11-11 12-12 '
'12-13 13-14 13-15 13-16 14-17 14-18 15-19 15-20 20-25 21-26'
])

def test_retokenize_on_non_trigger(self):
"""Pass the spm vocab to the placeholder tag generator so that it can
retokenize the input, even if probability is 0."""