# NOTE(review): extracted from a git diff that adds this class to
# nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py.
# The same diff also re-exports GermanPhonemesTokenizer from
# nemo/collections/tts/torch/tts_tokenizers.py — keep that export in sync.
class GermanPhonemesTokenizer(BaseCharsTokenizer):
    # fmt: off
    PUNCT_LIST = (  # Derived from LJSpeech and "/" additionally
        ',', '.', '!', '?', '-',
        ':', ';', '/', '"', '(',
        ')', '[', ']', '{', '}',
    )
    # fmt: on

    def __init__(
        self,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=None,
        text_preprocessing_func=german_text_preprocessing,
    ):
        """German phoneme-based tokenizer.

        Args:
            punct: Whether to reserve grapheme for basic punctuation or not.
            apostrophe: Whether to use apostrophe or not.
            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
                if None then no blank in labels.
            pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
            non_default_punct_list: List of punctuation marks which will be used instead of default.
            text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer.
                Currently, it only applies lower() function.
        """

        # German IPA phoneme inventory plus combining tilde (U+0303) for nasal vowels.
        de_ipa = "abdefhijklmnoprstuvwxyzçðøŋœɐɑɒɔəɛɜɡɪɹɾʃʊʌʒː̃"
        # Stress / suprasegmental markers.
        de_suprasegmentals = "12"
        super().__init__(
            chars=de_ipa + de_suprasegmentals,
            punct=punct,
            apostrophe=apostrophe,
            add_blank_at=add_blank_at,
            pad_with_space=pad_with_space,
            non_default_punct_list=non_default_punct_list,
            text_preprocessing_func=text_preprocessing_func,
        )

    def encode(self, text):
        """See base class."""
        cs, space, tokens = [], self.tokens[self.space], set(self.tokens)

        text = self.text_preprocessing_func(text)
        for c in text:
            # Add space if last one isn't one
            if c == space and len(cs) > 0 and cs[-1] != space:
                cs.append(c)
            # Add next char; "\u0303" is the combining tilde used for nasal vowels,
            # which isalnum() does not recognize, so it is allowed explicitly.
            elif (c.isalnum() or c == "'" or c == "\u0303") and c in tokens:
                cs.append(c)
            # Add punct
            elif (c in self.PUNCT_LIST) and self.punct:
                cs.append(c)
            # Warn about unknown char
            elif c != space:
                logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")

        # Remove trailing spaces. Guard against an empty `cs` (empty input, or
        # input containing only spaces/unknown symbols), which previously
        # raised IndexError on `cs[-1]`.
        while cs and cs[-1] == space:
            cs.pop()

        if self.pad_with_space:
            cs = [space] + cs + [space]

        return [self._token2id[p] for p in cs]