Commit cdf768a

[TTS][zh] refine hardcoded lowercase for ASCII letters.
Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com>
XuesongYang committed Jun 1, 2023
1 parent 8672af6 commit cdf768a
Showing 3 changed files with 72 additions and 39 deletions.

7 changes: 3 additions & 4 deletions nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py
@@ -185,10 +185,9 @@ def any_locale_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
     return _word_tokenize(words)


-# TODO @xueyang: deprecate language-specific text preprocessing and use any_locale_text_preprocessing.
-def spanish_text_preprocessing(text):
+def spanish_text_preprocessing(text: str) -> str:
     return text.lower()


-def chinese_text_preprocessing(text):
-    return text.lower()
+def chinese_text_preprocessing(text: str) -> str:
+    return text
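
The user-visible effect of this hunk: `chinese_text_preprocessing` no longer lowercases its input, so the case of ASCII letters in mixed Chinese/English text survives preprocessing and can be handled downstream by the G2P module. A minimal before/after sketch (stand-in reimplementations, not the NeMo module itself):

    def chinese_text_preprocessing_old(text: str) -> str:
        return text.lower()  # old behavior: ASCII case destroyed before G2P

    def chinese_text_preprocessing_new(text: str) -> str:
        return text  # new behavior: case preserved; the G2P stage decides what to do

    text = "我买了一个iPhone。"
    print(chinese_text_preprocessing_old(text))  # 我买了一个iphone。
    print(chinese_text_preprocessing_new(text))  # 我买了一个iPhone。
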
31 changes: 16 additions & 15 deletions nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py
@@ -692,7 +692,7 @@ def __init__(
         sep='|',  # To be able to distinguish between 2/3 letters codes.
         add_blank_at=None,
         pad_with_space=False,
-        text_preprocessing_func=lambda text: chinese_text_preprocessing(text),
+        text_preprocessing_func=chinese_text_preprocessing,
     ):
         """Chinese phoneme-based tokenizer.
         Args:
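
Dropping the `lambda` wrapper in favor of the bare function reference is behavior-preserving: the two are interchangeable as callables, and the direct reference avoids a needless indirection and introspects (and pickles) more cleanly. A quick illustration with a stand-in function:

    wrapped = lambda text: str.strip(text)  # old style: needless wrapper
    direct = str.strip                      # new style: the function itself
    assert wrapped(" hi ") == direct(" hi ") == "hi"
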
@@ -716,12 +716,16 @@ def __init__(
         if silence is not None:
             self.silence, tokens = len(tokens), tokens + [silence]  # Silence

-        self.phonemes_list = g2p.phonemes_list
-        self.tones_list = g2p.tones_list
+        self.phoneme_list = g2p.phoneme_list
+        self.tone_list = g2p.tone_list
+        self.ascii_letter_list = g2p.ascii_letter_list
+        self.ascii_letter_case = g2p.ascii_letter_case

-        tokens.extend(self.phonemes_list)
-        tokens.extend(self.tones_list)
-        tokens.extend(string.ascii_lowercase)
+        tokens.extend(self.phoneme_list)
+        tokens.extend(self.tone_list)
+        tokens.extend(self.ascii_letter_list)
+
+        self.text_preprocessing_func = text_preprocessing_func

         if apostrophe:
             tokens.append("'")  # Apostrophe
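
The substantive change: the tokenizer's symbol inventory is no longer partly hardcoded (`string.ascii_lowercase`) but is sourced entirely from the G2P object, so the letter set and its case convention always match what the G2P emits. A hypothetical sketch of that contract (attribute names mirror the diff; the values are illustrative):

    # Hypothetical G2P stand-in exposing the attributes the tokenizer now reads.
    class FakeG2p:
        phoneme_list = ["#UO3", "#JH"]              # prefixed phoneme symbols
        tone_list = ["#1", "#2", "#3", "#4", "#5"]  # prefixed tone digits
        ascii_letter_list = list("ABCabc")          # contents depend on ascii_letter_case
        ascii_letter_case = "mixed"

    tokens = []
    tokens.extend(FakeG2p.phoneme_list)
    tokens.extend(FakeG2p.tone_list)
    tokens.extend(FakeG2p.ascii_letter_list)  # was: tokens.extend(string.ascii_lowercase)
    print(len(tokens))  # 13
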
@@ -737,15 +741,12 @@ def __init__(

         self.punct = punct
         self.pad_with_space = pad_with_space
-
-        self.text_preprocessing_func = text_preprocessing_func
         self.g2p = g2p

-    def encode(self, text):
+    def encode(self, text: str) -> List[int]:
         """See base class for more information."""
-
         text = self.text_preprocessing_func(text)
-        g2p_text = self.g2p(text)  # TODO: handle infer
+        g2p_text = self.g2p(text)
         return self.encode_from_g2p(g2p_text, text)

     def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None):
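
Structurally `encode` is unchanged: preprocess, run G2P, hand off to `encode_from_g2p`. What changed is what flows through it, since the default preprocessing no longer lowercases. A toy trace of the pipeline with stand-in components (not the real G2P):

    from typing import List

    def preprocess(text: str) -> str:  # stands in for chinese_text_preprocessing
        return text                    # identity: no forced lowercasing anymore

    def g2p(text: str) -> List[str]:   # stands in for self.g2p(text)
        return list(text)              # the real G2P emits phonemes/tones/letters

    def encode(text: str) -> List[str]:
        text = preprocess(text)
        g2p_text = g2p(text)
        return g2p_text                # the real code continues into encode_from_g2p

    print(encode("Hi你好"))  # ['H', 'i', '你', '好'] — ASCII case intact
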
@@ -762,15 +763,15 @@ def encode_from_g2p(self, g2p_text: List[str], raw_text: Optional[str] = None):
             # Add space if last one isn't one
             if p == space and len(ps) > 0 and ps[-1] != space:
                 ps.append(p)
-            # Add next phoneme or char (if chars=True)
-            elif (p.isalnum() or p == "'" or p in self.phonemes_list or p in self.tones_list) and p in tokens:
+            # Add next phoneme or tone or ascii letter or apostrophe.
+            elif (p.isalnum() or p == "'" or p in self.phoneme_list + self.tone_list + self.ascii_letter_list) and p in tokens:
                 ps.append(p)
-            # Add punct
+            # Add punctuation
             elif (p in self.PUNCT_LIST) and self.punct:
                 ps.append(p)
             # Warn about unknown char/phoneme
             elif p != space:
-                message = f"Text: [{''.join(g2p_text)}] contains unknown char/phoneme: [{p}]."
+                message = f"Text: [{' '.join(g2p_text)}] contains unknown char/phoneme: [{p}]."
                 if raw_text is not None:
                     message += f"Original text: [{raw_text}]. Symbol will be skipped."
                 logging.warning(message)
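
The switch from `''.join` to `' '.join` in the warning matters because G2P symbols are now multi-character strings (prefixed phonemes and tones) that run together illegibly without a separator:

    g2p_text = ["#UO3", "#3", "A"]
    print(f"[{''.join(g2p_text)}]")   # [#UO3#3A]   — old: symbol boundaries lost
    print(f"[{' '.join(g2p_text)}]")  # [#UO3 #3 A] — new: boundaries visible
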
73 changes: 53 additions & 20 deletions nemo/collections/tts/g2p/models/zh_cn_pinyin.py
@@ -16,7 +16,9 @@
 from collections import defaultdict
 from typing import Dict, List, Optional, Union

+from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import get_grapheme_character_set
 from nemo.collections.tts.g2p.models.base import BaseG2p
+from nemo.collections.tts.g2p.utils import set_grapheme_case
 from nemo.utils import logging

@@ -26,24 +28,34 @@ def __init__(
         phoneme_dict: Union[str, pathlib.Path, Dict[str, List[str]]],
         phoneme_prefix: str = "#",
         tone_prefix: str = "#",
+        ascii_letter_prefix: str = "",
+        ascii_letter_case: str = "lower",
         word_tokenize_func=None,
         apply_to_oov_word=None,
         mapping_file: Optional[str] = None,
         word_segmenter: Optional[str] = None,
     ):
-        """Chinese G2P module. This module first converts Chinese characters into pinyin sequences using pypinyin, then pinyin sequences would
-        be further converted into phoneme sequences using pinyin_dict_nv_22.10.txt dict file. For Chinese and English bilingual sentences, the English words
-        would be converted into letters.
+        """
+        Chinese G2P module. This module first converts Chinese characters into pinyin sequences using pypinyin, then
+        pinyin sequences would be further converted into phoneme sequences by looking them up in the `phoneme_dict`.
+        This G2P module also works with Chinese/English bilingual sentences where English words would be converted
+        into letters. It is advised to attach prefix symbols for Chinese phonemes and tones to discriminate them
+        from English letters to avoid any potential symbol set overlaps.
         Args:
             phoneme_dict (str, Path, Dict): Path to pinyin_dict_nv_22.10.txt dict file or a dict object.
             phoneme_prefix (str): Prepend a special symbol to any phonemes in order to distinguish phonemes from
                 graphemes because there may be overlaps between the two sets. Phoneme dictionary typically applies
                 uppercase initials and finals. It is suggested to choose a prefix that
                 is not used or preserved somewhere else. Default to "#".
             tone_prefix (str): Prepend a special symbol to any tone digits. Default to "#".
+            ascii_letter_prefix (str): Prepend a special symbol to any ASCII letters. Default to "".
+            ascii_letter_case (str): Specify the case chosen from `"lower"`, `"upper"`, or `"mixed"`, and process the
+                cases of non-Chinese words. Default to `"lower"`.
             word_tokenize_func: Function for tokenizing text to words.
-                It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation and flag whether to leave unchanged or not.
-                It is expected that unchangeable word representation will be represented as List[str], other cases are represented as str.
+                It has to return List[Tuple[Union[str, List[str]], bool]] where every tuple denotes word representation
+                and flag whether to leave unchanged or not.
+                It is expected that unchangeable word representation will be represented as List[str], other cases are
+                represented as str.
                 It is useful to mark word as unchangeable which is already in phoneme representation.
             apply_to_oov_word: Function that will be applied to out of phoneme_dict word.
             word_segmenter: method that will be applied to segment utterances into words for better polyphone disambiguation.
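
A construction sketch for the two new arguments. The class name and import path below are inferred from this file's location, and the dictionary path is a placeholder — treat all of them as assumptions rather than verified API documentation:

    from nemo.collections.tts.g2p.models.zh_cn_pinyin import ChineseG2p

    g2p = ChineseG2p(
        phoneme_dict="pinyin_dict_nv_22.10.txt",  # placeholder local path
        phoneme_prefix="#",         # phonemes become "#UO3" instead of "UO3"
        tone_prefix="#",            # tone digits become "#1".."#5"
        ascii_letter_prefix="",     # keep English letters unprefixed
        ascii_letter_case="mixed",  # preserve the original case of English words
        word_segmenter="jieba",     # optional, for polyphone disambiguation
    )
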
@@ -58,13 +70,27 @@ def __init__(
             phoneme_prefix = ""
         if tone_prefix is None:
             tone_prefix = ""
+        if ascii_letter_prefix is None:
+            ascii_letter_prefix = ""

+        # phonemes
         phoneme_dict = (
             self._parse_as_pinyin_dict(phoneme_dict, phoneme_prefix)
             if isinstance(phoneme_dict, str) or isinstance(phoneme_dict, pathlib.Path)
             else phoneme_dict
         )
-        self.phonemes_list = list({pron for prons in phoneme_dict.values() for pron in prons})
+        self.phoneme_list = list({pron for prons in phoneme_dict.values() for pron in prons})
+
+        # tones
+        self.tone_dict = {str(x): tone_prefix + str(x) for x in range(1, 6)}
+        self.tone_list = list(self.tone_dict.values())
+
+        # ascii letters
+        self.ascii_letter_dict = {
+            x: ascii_letter_prefix + x for x in get_grapheme_character_set(locale="en-US", case=ascii_letter_case)
+        }
+        self.ascii_letter_list = sorted(self.ascii_letter_dict)
+        self.ascii_letter_case = ascii_letter_case

         if apply_to_oov_word is None:
             logging.warning(
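
The constructor now builds three explicit symbol inventories. A sketch of their shapes, using `string.ascii_letters` to model the `"mixed"`-case output of `get_grapheme_character_set` (an assumption about that helper):

    import string

    tone_prefix, ascii_letter_prefix = "#", ""

    tone_dict = {str(x): tone_prefix + str(x) for x in range(1, 6)}
    print(tone_dict)  # {'1': '#1', '2': '#2', '3': '#3', '4': '#4', '5': '#5'}

    # Modeling get_grapheme_character_set(locale="en-US", case="mixed"):
    ascii_letter_dict = {x: ascii_letter_prefix + x for x in string.ascii_letters}
    print(sorted(ascii_letter_dict)[:4])  # ['A', 'B', 'C', 'D']
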
@@ -81,9 +107,6 @@ def __init__(
                 mapping_file=mapping_file,
             )

-        self.tones = {str(x): tone_prefix + str(x) for x in range(1, 6)}
-        self.tones_list = list(self.tones.values())
-
         if word_segmenter == "jieba":
             try:
                 import jieba

@@ -120,26 +143,33 @@ def _parse_as_pinyin_dict(
                    continue

                parts = line.split('\t')
-                # lowercase the Chinese syllables because pypinyin requires lowercase inputs.
+                # Convert the cases of Chinese syllables loaded from the dictionary to lowercase to match the lowercase
+                # Chinese syllable outputs generated by the function `pypinyin.lazy_pinyin`. Note that the function
+                # `pypinyin.lazy_pinyin` preserves the cases of ASCII letters.
                syllable = parts[0].lower()
                pronunciation = parts[1].split()

-                # add phoneme prefix to distinguish from other symbols.
+                # add a prefix to distinguish phoneme symbols from non-phoneme symbols.
                pronunciation_with_prefix = [phoneme_prefix + pron for pron in pronunciation]
                g2p_dict[syllable] = pronunciation_with_prefix

        return g2p_dict

    def __call__(self, text: str) -> List[str]:
        """
-        errors func handle below is to process the bilingual situation,
-        where English words would be split into letters.
-        e.g. 我今天去了Apple Store, 买了一个iPhone。
-        would return a list
-        ['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e',
-        ' ', 'S', 't', 'o', 'r', 'e', ',', ' ', 'mai3', 'le5', 'yi2',
-        'ge4', 'i', 'P', 'h', 'o', 'n', 'e', '。']
+        This forward pass function translates Chinese characters into pinyin sequences and then converts the pinyin
+        into phonemes. It is primarily designed to process texts containing Chinese characters, but we have
+        extended its support to handle texts that include both Chinese and English. This extension was mainly
+        necessitated by the limited availability of bilingual datasets. The `errors` argument used in the
+        `pypinyin.lazy_pinyin` function below is used to process non-Chinese words, where each English word is split
+        into letters.
+        For example, the text "我今天去了Apple Store, 买了一个iPhone。" would be converted into the list
+        `['wo3', 'jin1', 'tian1', 'qu4', 'le5', 'A', 'p', 'p', 'l', 'e', ' ', 'S', 't', 'o', 'r', 'e', ',', ' ', 'mai3',
+        'le5', 'yi2', 'ge4', 'i', 'P', 'h', 'o', 'n', 'e', '。']`
        """
+        text = set_grapheme_case(text, case=self.ascii_letter_case)
+
        pinyin_seq = []
        words_list = self.word_segmenter(text)
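
The one functional addition to `__call__` is the `set_grapheme_case` normalization before segmentation. A stand-in sketch of what that step is assumed to do (normalize ASCII letter case only; Python's case methods leave Chinese characters untouched):

    # Stand-in for nemo.collections.tts.g2p.utils.set_grapheme_case (assumed behavior).
    def set_grapheme_case(text: str, case: str = "lower") -> str:
        if case == "lower":
            return text.lower()
        if case == "upper":
            return text.upper()
        return text  # "mixed": keep the original case

    print(set_grapheme_case("我买了一个iPhone。", case="lower"))  # 我买了一个iphone。
    print(set_grapheme_case("我买了一个iPhone。", case="mixed"))  # 我买了一个iPhone。
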
@@ -154,15 +184,18 @@ def __call__(self, text: str) -> List[str]:
            )
        phoneme_seq = []
        for pinyin in pinyin_seq:
+            # only pinyin has tones while non-pinyin doesn't.
            tone_hyp = pinyin[-1]
-            if tone_hyp in self.tones:
+            if tone_hyp in self.tone_dict:
                syllable = pinyin[:-1]
                assert syllable in self.phoneme_dict, f"Syllable <{syllable}> does not exist in the dictionary."
                phoneme_seq += self.phoneme_dict[syllable]
-                phoneme_seq.append(self.tones[tone_hyp])
+                phoneme_seq.append(self.tone_dict[tone_hyp])
+            # All pinyin would end up with a number in 1-5, which represents tones of the pinyin.
+            # For symbols which are not pinyin, such as English letters and Chinese punctuation, we directly
+            # use them as inputs.
+            elif tone_hyp in self.ascii_letter_dict:
+                phoneme_seq.append(self.ascii_letter_dict[tone_hyp])
            else:
                phoneme_seq.append(pinyin)
        return phoneme_seq
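
Putting the new branch together, a toy trace of the loop above with miniature inventories (entries illustrative, not the real dictionary):

    tone_dict = {"1": "#1", "3": "#3"}
    phoneme_dict = {"wo": ["#UO3"]}
    ascii_letter_dict = {"A": "A", "i": "i"}  # identity mapping when prefix is ""

    phoneme_seq = []
    for pinyin in ["wo3", "A", "i", "。"]:
        tone_hyp = pinyin[-1]
        if tone_hyp in tone_dict:            # a pinyin syllable ends in a tone digit
            phoneme_seq += phoneme_dict[pinyin[:-1]]
            phoneme_seq.append(tone_dict[tone_hyp])
        elif tone_hyp in ascii_letter_dict:  # new: ASCII letters routed explicitly
            phoneme_seq.append(ascii_letter_dict[tone_hyp])
        else:                                # punctuation and other symbols pass through
            phoneme_seq.append(pinyin)

    print(phoneme_seq)  # ['#UO3', '#3', 'A', 'i', '。']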
