[TTS] added missing German phoneme tokenizer. (#5070)
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
XuesongYang authored and web-flow committed Oct 4, 2022
1 parent a71712b commit fc44b20
Showing 2 changed files with 72 additions and 0 deletions.
@@ -233,6 +233,77 @@ def __init__(
)


class GermanPhonemesTokenizer(BaseCharsTokenizer):
    # fmt: off
    PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally
        ',', '.', '!', '?', '-',
        ':', ';', '/', '"', '(',
        ')', '[', ']', '{', '}',
    )
    # fmt: on

    def __init__(
        self,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=None,
        text_preprocessing_func=german_text_preprocessing,
    ):
        """German phoneme-based tokenizer.
        Args:
            punct: Whether to reserve grapheme for basic punctuation or not.
            apostrophe: Whether to use apostrophe or not.
            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any other non-None value);
                if None, no blank is added to labels.
            pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
            non_default_punct_list: List of punctuation marks which will be used instead of the default ones.
            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
                Currently, it only applies the lower() function.
        """

        de_ipa = "abdefhijklmnoprstuvwxyzçðøŋœɐɑɒɔəɛɜɡɪɹɾʃʊʌʒː̃"
        de_suprasegmentals = "12"
        super().__init__(
            chars=de_ipa + de_suprasegmentals,
            punct=punct,
            apostrophe=apostrophe,
            add_blank_at=add_blank_at,
            pad_with_space=pad_with_space,
            non_default_punct_list=non_default_punct_list,
            text_preprocessing_func=text_preprocessing_func,
        )

    def encode(self, text):
        """See base class."""
        cs, space, tokens = [], self.tokens[self.space], set(self.tokens)

        text = self.text_preprocessing_func(text)
        for c in text:
            # Add space if last one isn't one
            if c == space and len(cs) > 0 and cs[-1] != space:
                cs.append(c)
            # Add next char
            elif (c.isalnum() or c == "'" or c == "\u0303") and c in tokens:
                cs.append(c)
            # Add punct
            elif (c in self.PUNCT_LIST) and self.punct:
                cs.append(c)
            # Warn about unknown char
            elif c != space:
                logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")

        # Remove trailing spaces
        while cs[-1] == space:
            cs.pop()

        if self.pad_with_space:
            cs = [space] + cs + [space]

        return [self._token2id[p] for p in cs]


class EnglishPhonemesTokenizer(BaseTokenizer):
# fmt: off
PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally
1 change: 1 addition & 0 deletions nemo/collections/tts/torch/tts_tokenizers.py
@@ -20,5 +20,6 @@
    EnglishCharsTokenizer,
    EnglishPhonemesTokenizer,
    GermanCharsTokenizer,
    GermanPhonemesTokenizer,
    IPATokenizer,
)
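
A minimal usage sketch of the new tokenizer (not part of the commit): it assumes the class is importable from nemo.collections.tts.torch.tts_tokenizers, as the second changed file above suggests, and the phoneme string is a made-up, already-phonemized German input.

from nemo.collections.tts.torch.tts_tokenizers import GermanPhonemesTokenizer

# Instantiate with defaults: punctuation kept, german_text_preprocessing (lowercasing) applied.
tokenizer = GermanPhonemesTokenizer()

# Input is expected to be IPA phonemes plus the "1"/"2" stress markers, not raw graphemes;
# characters outside the vocabulary are logged as warnings and skipped.
token_ids = tokenizer.encode("halo1, vɛlt2!")
print(token_ids)              # list of integer token ids
print(len(tokenizer.tokens))  # vocabulary size, including punctuation marks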
