Commit cdf768a

[TTS][zh] refine hardcoded lowercase for ASCII letters.
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
XuesongYang committed Jun 1, 2023
1 parent 8672af6 commit cdf768a
Showing 3 changed files with 72 additions and 39 deletions.

7 changes: 3 additions & 4 deletions nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
@@ -185,10 +185,9 @@ def any_locale_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
     return _word_tokenize(words)


-# TODO @xueyang: deprecate language-specific text preprocessing and use any_locale_text_preprocessing.
-def spanish_text_preprocessing(text):
+def spanish_text_preprocessing(text: str) -> str:
     return text.lower()


-def chinese_text_preprocessing(text):
-    return text.lower()
+def chinese_text_preprocessing(text: str) -> str:
+    return text
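
The user-visible effect of this hunk: `chinese_text_preprocessing` no longer lowercases its input, so the case of ASCII letters in mixed Chinese/English text survives preprocessing and can be handled downstream by the G2P module. A minimal before/after sketch (stand-in reimplementations, not the NeMo module itself):

    def chinese_text_preprocessing_old(text: str) -> str:
        return text.lower()  # old behavior: ASCII case destroyed before G2P

    def chinese_text_preprocessing_new(text: str) -> str:
        return text  # new behavior: case preserved; the G2P stage decides what to do

    text = "我买了一个iPhone。"
    print(chinese_text_preprocessing_old(text))  # 我买了一个iphone。
    print(chinese_text_preprocessing_new(text))  # 我买了一个iPhone。
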
31 changes: 16 additions & 15 deletions nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -692,7 +692,7 @@ def __init__(
         sep='|',  # To be able to distinguish between 2/3 letters codes.
         add_blank_at=None,
         pad_with_space=False,
-        text_preprocessing_func=lambda text: chinese_text_preprocessing(text),
+        text_preprocessing_func=chinese_text_preprocessing,
     ):
         """Chinese phoneme-based tokenizer.
         Args:
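
Dropping the `lambda` wrapper in favor of the bare function reference is behavior-preserving: the two are interchangeable as callables, and the direct reference avoids a needless indirection and introspects (and pickles) more cleanly. A quick illustration with a stand-in function:

    wrapped = lambda text: str.strip(text)  # old style: needless wrapper
    direct = str.strip                      # new style: the function itself
    assert wrapped(" hi ") == direct(" hi ") == "hi"
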
@@ -716,12 +716,16 @@ def __init__(
         if silence is not None:
             self.silence, tokens = len(tokens), tokens + [silence]  # Silence

-        self.phonemes_list = g2p.phonemes_list
-        self.tones_list = g2p.tones_list
+        self.phoneme_list = g2p.phoneme_list
+        self.tone_list = g2p.tone_list
+        self.ascii_letter_list = g2p.ascii_letter_list
+        self.ascii_letter_case = g2p.ascii_letter_case

-        tokens.extend(self.phonemes_list)
-        tokens.extend(self.tones_list)
-        tokens.extend(string.ascii_lowercase)
+        tokens.extend(self.phoneme_list)
+        tokens.extend(self.tone_list)
+        tokens.extend(self.ascii_letter_list)
+
+        self.text_preprocessing_func = text_preprocessing_func

         if apostrophe:
             tokens.append("'")  # Apostrophe
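
The substantive change: the tokenizer's symbol inventory is no longer partly hardcoded (`string.ascii_lowercase`) but is sourced entirely from the G2P object, so the letter set and its case convention always match what the G2P emits. A hypothetical sketch of that contract (attribute names mirror the diff; the values are illustrative):

    # Hypothetical G2P stand-in exposing the attributes the tokenizer now reads.
    class FakeG2p:
        phoneme_list = ["#UO3", "#JH"]              # prefixed phoneme symbols
        tone_list = ["#1", "#2", "#3", "#4", "#5"]  # prefixed tone digits
        ascii_letter_list = list("ABCabc")          # contents depend on ascii_letter_case
        ascii_letter_case = "mixed"

    tokens = []
    tokens.extend(FakeG2p.phoneme_list)
    tokens.extend(FakeG2p.tone_list)
    tokens.extend(FakeG2p.ascii_letter_list)  # was: tokens.extend(string.ascii_lowercase)
    print(len(tokens))  # 13
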
@@ -737,15 +741,12 @@ def __init__(

         self.punct = punct
         self.pad_with_space = pad_with_space
-
-        self.text_preprocessing_func = text_preprocessing_func
         self.g2p = g2p

-    def encode(self, text):
+    def encode(self, text: str) -> List[int]:
         """See base class for more information."""
-
         text = self.text_preprocessing_func(text)
-        g2p_text = self.g2p(text)  # TODO: handle infer
+        g2p_text = self.g2p(text)
         return self.encode_from_g2p(g2p_text, text)

     def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None):
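
Structurally `encode` is unchanged: preprocess, run G2P, hand off to `encode_from_g2p`. What changed is what flows through it, since the default preprocessing no longer lowercases. A toy trace of the pipeline with stand-in components (not the real G2P):

    from typing import List

    def preprocess(text: str) -> str:  # stands in for chinese_text_preprocessing
        return text                    # identity: no forced lowercasing anymore

    def g2p(text: str) -> List[str]:   # stands in for self.g2p(text)
        return list(text)              # the real G2P emits phonemes/tones/letters

    def encode(text: str) -> List[str]:
        text = preprocess(text)
        g2p_text = g2p(text)
        return g2p_text                # the real code continues into encode_from_g2p

    print(encode("Hi你好"))  # ['H', 'i', '你', '好'] — ASCII case intact
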
@@ -762,15 +763,15 @@ def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None):
             # Add space if last one isn't one
             if p == space and len(ps) > 0 and ps[-1] != space:
                 ps.append(p)
-            # Add next phoneme or char (if chars=True)
-            elif (p.isalnum() or p == "'" or p in self.phonemes_list or p in self.tones_list) and p in tokens:
+            # Add next phoneme or tone or ascii letter or apostrophe.
+            elif (p.isalnum() or p == "'" or p in self.phoneme_list + self.tone_list + self.ascii_letter_list) and p in tokens:
                 ps.append(p)
-            # Add punct
+            # Add punctuation
             elif (p in self.PUNCT_LIST) and self.punct:
                 ps.append(p)
             # Warn about unknown char/phoneme
             elif p != space:
-                message = f"Text: [{''.join(g2p_text)}] contains unknown char/phoneme: [{p}]."
+                message = f"Text: [{' '.join(g2p_text)}] contains unknown char/phoneme: [{p}]."
                 if raw_text is not None:
                     message += f"Original text: [{raw_text}]. Symbol will be skipped."
                 logging.warning(message)
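
The switch from `''.join` to `' '.join` in the warning matters because G2P symbols are now multi-character strings (prefixed phonemes and tones) that run together illegibly without a separator:

    g2p_text = ["#UO3", "#3", "A"]
    print(f"[{''.join(g2p_text)}]")   # [#UO3#3A]   — old: symbol boundaries lost
    print(f"[{' '.join(g2p_text)}]")  # [#UO3 #3 A] — new: boundaries visible
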
73 changes: 53 additions & 20 deletions nemo/collections/tts/g2p/models/zh_cn_pinyin.py
@@ -16,7 +16,9 @@
 from collections import defaultdict
 from typing import Dict, List, Optional, Union

+from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import get_grapheme_character_set
 from nemo.collections.tts.g2p.models.base import BaseG2p
+from nemo.collections.tts.g2p.utils import set_grapheme_case
 from nemo.utils import logging

@@ -26,24 +28,34 @@ def __init__(
         phoneme_dict: Union[str, pathlib.Path, Dict[str, List[str]]],
         phoneme_prefix: str = "#",
         tone_prefix: str = "#",
+        ascii_letter_prefix: str = "",
+        ascii_letter_case: str = "lower",
         word_tokenize_func=None,
         apply_to_oov_word=None,
         mapping_file: Optional[str] = None,
         word_segmenter: Optional[str] = None,
     ):
-        """Chinese G2P module. This module first converts Chinese characters into pinyin sequences using pypinyin, then pinyin sequences would
-        be further converted into phoneme sequences using pinyin_dict_nv_22.10.txt dict file. For Chinese and English bilingual sentences, the English words
-        would be converted into letters.
+        """
+        Chinese G2P module. This module first converts Chinese characters into pinyin sequences using pypinyin, then
+        pinyin sequences would be further converted into phoneme sequences by looking them up in the `phoneme_dict`.
+        This G2P module also works with Chinese/English bilingual sentences where English words would be converted
+        into letters. It is advised to attach prefix symbols for Chinese phonemes and tones to discriminate them
+        from English letters to avoid any potential symbol set overlaps.
         Args:
             phoneme_dict (str, Path, Dict): Path to pinyin_dict_nv_22.10.txt dict file or a dict object.
             phoneme_prefix (str): Prepend a special symbol to any phonemes in order to distinguish phonemes from
                 graphemes because there may be overlaps between the two sets. Phoneme dictionary typically applies
                 uppercase initials and finals. It is suggested to choose a prefix that
                 is not used or preserved somewhere else. Default to "#".
             tone_prefix (str): Prepend a special symbol to any tone digits. Default to "#".
+            ascii_letter_prefix (str): Prepend a special symbol to any ASCII letters. Default to "".
+            ascii_letter_case (str): Specify the case chosen from `"lower"`, `"upper"`, or `"mixed"`, and process the
+                cases of non-Chinese words. Default to `"lower"`.
             word_tokenize_func: Function for tokenizing text to words.
-                It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation and flag whether to leave unchanged or not.
-                It is expected that unchangeable word representation will be represented as List[str], other cases are represented as str.
+                It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation
+                and flag whether to leave unchanged or not.
+                It is expected that unchangeable word representation will be represented as List[str], other cases are
+                represented as str.
                 It is useful to mark word as unchangeable which is already in phoneme representation.
             apply_to_oov_word: Function that will be applied to out of phoneme_dict word.
             word_segmenter: method that will be applied to segment utterances into words for better polyphone disambiguation.
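
A construction sketch for the two new arguments. The class name and import path below are inferred from this file's location, and the dictionary path is a placeholder — treat all of them as assumptions rather than verified API documentation:

    from nemo.collections.tts.g2p.models.zh_cn_pinyin import ChineseG2p

    g2p = ChineseG2p(
        phoneme_dict="pinyin_dict_nv_22.10.txt",  # placeholder local path
        phoneme_prefix="#",         # phonemes become "#UO3" instead of "UO3"
        tone_prefix="#",            # tone digits become "#1".."#5"
        ascii_letter_prefix="",     # keep English letters unprefixed
        ascii_letter_case="mixed",  # preserve the original case of English words
        word_segmenter="jieba",     # optional, for polyphone disambiguation
    )
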
@@ -58,13 +70,27 @@ def __init__(
             phoneme_prefix = ""
         if tone_prefix is None:
             tone_prefix = ""
+        if ascii_letter_prefix is None:
+            ascii_letter_prefix = ""

+        # phonemes
         phoneme_dict = (
             self._parse_as_pinyin_dict(phoneme_dict, phoneme_prefix)
             if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path)
             else phoneme_dict
         )
-        self.phonemes_list = list({pron for prons in phoneme_dict.values() for pron in prons})
+        self.phoneme_list = list({pron for prons in phoneme_dict.values() for pron in prons})
+
+        # tones
+        self.tone_dict = {str(x): tone_prefix + str(x) for x in range(1, 6)}
+        self.tone_list = list(self.tone_dict.values())
+
+        # ascii letters
+        self.ascii_letter_dict = {
+            x: ascii_letter_prefix + x for x in get_grapheme_character_set(locale="en-US", case=ascii_letter_case)
+        }
+        self.ascii_letter_list = sorted(self.ascii_letter_dict)
+        self.ascii_letter_case = ascii_letter_case

         if apply_to_oov_word is None:
             logging.warning(
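
The constructor now builds three explicit symbol inventories. A sketch of their shapes, using `string.ascii_letters` to model the `"mixed"`-case output of `get_grapheme_character_set` (an assumption about that helper):

    import string

    tone_prefix, ascii_letter_prefix = "#", ""

    tone_dict = {str(x): tone_prefix + str(x) for x in range(1, 6)}
    print(tone_dict)  # {'1': '#1', '2': '#2', '3': '#3', '4': '#4', '5': '#5'}

    # Modeling get_grapheme_character_set(locale="en-US", case="mixed"):
    ascii_letter_dict = {x: ascii_letter_prefix + x for x in string.ascii_letters}
    print(sorted(ascii_letter_dict)[:4])  # ['A', 'B', 'C', 'D']
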
@@ -81,9 +107,6 @@ def __init__(
                 mapping_file=mapping_file,
             )

-        self.tones = {str(x): tone_prefix + str(x) for x in range(1, 6)}
-        self.tones_list = list(self.tones.values())
-
         if word_segmenter == "jieba":
             try:
                 import jieba

@@ -120,26 +143,33 @@ def _parse_as_pinyin_dict(
                    continue

                parts = line.split('\t')
-                # lowercase the Chinese syllables because pypinyin requires lowercase inputs.
+                # Convert the cases of Chinese syllables loaded from the dictionary to lowercase to match the lowercase
+                # Chinese syllable outputs generated by the function `pypinyin.lazy_pinyin`. Note that the function
+                # `pypinyin.lazy_pinyin` preserves the cases of ASCII letters.
                syllable = parts[0].lower()
                pronunciation = parts[1].split()

-                # add phoneme prefix to distinguish from other symbols.
+                # add a prefix to distinguish phoneme symbols from non-phoneme symbols.
                pronunciation_with_prefix = [phoneme_prefix + pron for pron in pronunciation]
                g2p_dict[syllable] = pronunciation_with_prefix

        return g2p_dict

    def __call__(self, text: str) -> List[str]:
        """
-        errors func handle below is to process the bilingual situation,
-        where English words would be split into letters.
-        e.g. 我今天去了Apple Store, 买了一个iPhone。
-        would return a list
-        ['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e',
-        ' ', 'S', 't', 'o', 'r', 'e', ',', ' ', 'mai3', 'le5', 'yi2',
-        'ge4', 'i', 'P', 'h', 'o', 'n', 'e', '。']
+        This forward pass function translates Chinese characters into pinyin sequences and then converts the pinyin
+        into phonemes. It is primarily designed to process texts containing Chinese characters, but we have
+        extended its support to handle texts that include both Chinese and English. This extension was mainly
+        necessitated by the limited availability of bilingual datasets. The `errors` argument used in the
+        `pypinyin.lazy_pinyin` function below is used to process non-Chinese words, where each English word is split
+        into letters.
+        For example, the text "我今天去了Apple Store, 买了一个iPhone。" would be converted into the list
+        `['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e', ' ', 'S', 't', 'o', 'r', 'e', ',', ' ', 'mai3',
+        'le5', 'yi2', 'ge4', 'i', 'P', 'h', 'o', 'n', 'e', '。']`
        """
+        text = set_grapheme_case(text, case=self.ascii_letter_case)
+
        pinyin_seq = []
        words_list = self.word_segmenter(text)
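
The one functional addition to `__call__` is the `set_grapheme_case` normalization before segmentation. A stand-in sketch of what that step is assumed to do (normalize ASCII letter case only; Python's case methods leave Chinese characters untouched):

    # Stand-in for nemo.collections.tts.g2p.utils.set_grapheme_case (assumed behavior).
    def set_grapheme_case(text: str, case: str = "lower") -> str:
        if case == "lower":
            return text.lower()
        if case == "upper":
            return text.upper()
        return text  # "mixed": keep the original case

    print(set_grapheme_case("我买了一个iPhone。", case="lower"))  # 我买了一个iphone。
    print(set_grapheme_case("我买了一个iPhone。", case="mixed"))  # 我买了一个iPhone。
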
@@ -154,15 +184,18 @@ def __call__(self, text: str) -> List[str]:
            )
        phoneme_seq = []
        for pinyin in pinyin_seq:
+            # only pinyin has tones while non-pinyin doesn't.
            tone_hyp = pinyin[-1]
-            if tone_hyp in self.tones:
+            if tone_hyp in self.tone_dict:
                syllable = pinyin[:-1]
                assert syllable in self.phoneme_dict, f"Syllable <{syllable}> does not exist in the dictionary."
                phoneme_seq += self.phoneme_dict[syllable]
-                phoneme_seq.append(self.tones[tone_hyp])
+                phoneme_seq.append(self.tone_dict[tone_hyp])
+            # All pinyin would end up with a number in 1-5, which represents tones of the pinyin.
+            # For symbols which are not pinyin, such as English letters and Chinese punctuation, we directly
+            # use them as inputs.
+            elif tone_hyp in self.ascii_letter_dict:
+                phoneme_seq.append(self.ascii_letter_dict[tone_hyp])
            else:
                phoneme_seq.append(pinyin)
        return phoneme_seq
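
Putting the new branch together, a toy trace of the loop above with miniature inventories (entries illustrative, not the real dictionary):

    tone_dict = {"1": "#1", "3": "#3"}
    phoneme_dict = {"wo": ["#UO3"]}
    ascii_letter_dict = {"A": "A", "i": "i"}  # identity mapping when prefix is ""

    phoneme_seq = []
    for pinyin in ["wo3", "A", "i", "。"]:
        tone_hyp = pinyin[-1]
        if tone_hyp in tone_dict:            # a pinyin syllable ends in a tone digit
            phoneme_seq += phoneme_dict[pinyin[:-1]]
            phoneme_seq.append(tone_dict[tone_hyp])
        elif tone_hyp in ascii_letter_dict:  # new: ASCII letters routed explicitly
            phoneme_seq.append(ascii_letter_dict[tone_hyp])
        else:                                # punctuation and other symbols pass through
            phoneme_seq.append(pinyin)

    print(phoneme_seq)  # ['#UO3', '#3', 'A', 'i', '。']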
