diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 5e701c085f..e87eb0766b 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -233,6 +233,12 @@ def split_sentence(text, lang, text_split_length=250):
             # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
         ]
     ],
+    "hi": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # Hindi doesn't typically use abbreviations in the same way as Latin-based scripts.
+        ]
+    ],
 }
 
 
@@ -429,6 +435,18 @@ def expand_abbreviations_multilingual(text, lang="en"):
             ("°", " 도 "),
         ]
     ],
+    "hi": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " और "),
+            ("@", " ऐट दी रेट "),
+            ("%", " प्रतिशत "),
+            ("#", " हैश "),
+            ("$", " डॉलर "),
+            ("£", " पाउंड "),
+            ("°", " डिग्री "),
+        ]
+    ],
 }
 
 
@@ -454,6 +472,7 @@ def expand_symbols_multilingual(text, lang="en"):
     "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
     "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
     "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
+    "hi": re.compile(r"([0-9]+)(st|nd|rd|th)"),  # TODO: these are the English ordinal suffixes; confirm the markers used in Hindi text
 }
 _number_re = re.compile(r"[0-9]+")
 _currency_re = {
@@ -505,6 +524,7 @@ def _expand_currency(m, lang="en", currency="USD"):
         "tr": ", ",
         "hu": ", ",
         "ko": ", ",
+        "hi": ", ",
     }
 
     if amount.is_integer():
@@ -644,7 +664,7 @@ def check_input_length(self, txt, lang):
             )
 
     def preprocess_text(self, txt, lang):
-        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
+        if lang in {"ar", "cs", "de", "en", "es", "fr", "hi", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
             txt = multilingual_cleaners(txt, lang)
             if lang == "zh":
                 txt = chinese_transliterate(txt)
@@ -652,9 +672,6 @@ def preprocess_text(self, txt, lang):
                 txt = korean_transliterate(txt)
         elif lang == "ja":
             txt = japanese_cleaners(txt, self.katsu)
-        elif lang == "hi":
-            # @manmay will implement this
-            txt = basic_cleaners(txt)
         else:
             raise NotImplementedError(f"Language '{lang}' is not supported.")
         return txt
@@ -777,6 +794,9 @@ def test_expand_numbers_multilingual():
         ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
         ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
         ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
+        # Hindi
+        ("12.5 सेकंड में।", "साढ़े बारह सेकंड में।", "hi"),
+        ("50 सैनिक थे।", "पचास सैनिक थे।", "hi"),
     ]
     for a, b, lang in test_cases:
         out = expand_numbers_multilingual(a, lang=lang)
@@ -846,6 +866,7 @@ def test_symbols_multilingual():
         ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
         ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
         ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
+        ("मेरे पास 14% बैटरी है।", "मेरे पास 14 प्रतिशत बैटरी है।", "hi"),
     ]
 
     for a, b, lang in test_cases: