Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TTS] added missing German phoneme tokenizer #5070

Merged
merged 1 commit into from
Oct 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,77 @@ def __init__(
)


class GermanPhonemesTokenizer(BaseCharsTokenizer):
    # fmt: off
    PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally
        ',', '.', '!', '?', '-',
        ':', ';', '/', '"', '(',
        ')', '[', ']', '{', '}',
    )
    # fmt: on

    def __init__(
        self,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=None,
        text_preprocessing_func=german_text_preprocessing,
    ):
        """German phoneme-based tokenizer.
        Args:
            punct: Whether to reserve grapheme for basic punctuation or not.
            apostrophe: Whether to use apostrophe or not.
            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
                if None then no blank in labels.
            pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
            non_default_punct_list: List of punctuation marks which will be used instead of the default ones.
            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
                Currently, it only applies lower() function.
        """

        # German IPA phoneme inventory; the final character is the combining
        # tilde (U+0303) used as a nasalization diacritic.
        de_ipa = "abdefhijklmnoprstuvwxyzçðøŋœɐɑɒɔəɛɜɡɪɹɾʃʊʌʒː̃"
        # Stress/suprasegmental markers emitted by the phonemizer.
        de_suprasegmentals = "12"
        super().__init__(
            chars=de_ipa + de_suprasegmentals,
            punct=punct,
            apostrophe=apostrophe,
            add_blank_at=add_blank_at,
            pad_with_space=pad_with_space,
            non_default_punct_list=non_default_punct_list,
            text_preprocessing_func=text_preprocessing_func,
        )

    def encode(self, text):
        """See base class."""
        cs, space, tokens = [], self.tokens[self.space], set(self.tokens)

        text = self.text_preprocessing_func(text)
        for c in text:
            # Add space if last one isn't one
            if c == space and len(cs) > 0 and cs[-1] != space:
                cs.append(c)
            # Add next char; U+0303 (combining tilde) is allowed alongside
            # alphanumerics and the apostrophe.
            elif (c.isalnum() or c == "'" or c == "\u0303") and c in tokens:
                cs.append(c)
            # Add punct
            elif (c in self.PUNCT_LIST) and self.punct:
                cs.append(c)
            # Warn about unknown char
            elif c != space:
                logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")

        # Remove trailing spaces. Guard against an empty `cs` (e.g. empty input
        # or text made up entirely of unknown characters), which would otherwise
        # raise IndexError on cs[-1].
        while cs and cs[-1] == space:
            cs.pop()

        if self.pad_with_space:
            cs = [space] + cs + [space]

        return [self._token2id[p] for p in cs]


class EnglishPhonemesTokenizer(BaseTokenizer):
# fmt: off
PUNCT_LIST = ( # Derived from LJSpeech and "/" additionally
Expand Down
1 change: 1 addition & 0 deletions nemo/collections/tts/torch/tts_tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@
EnglishCharsTokenizer,
EnglishPhonemesTokenizer,
GermanCharsTokenizer,
GermanPhonemesTokenizer,
IPATokenizer,
)