Skip to content

Commit

Permalink
feat(xtts): support hindi in tokenizer (#64)
Browse files Browse the repository at this point in the history
Added proper tokenizer support for the Hindi language, which prevents a crash when fine-tuning on Hindi.

Co-authored-by: Akshat Bhardwaj <157223825+akshatrocky@users.noreply.github.com>
  • Loading branch information
eginhard and Akshat Bhardwaj authored Sep 12, 2024
1 parent 233dfb5 commit 1920328
Showing 1 changed file with 25 additions and 4 deletions.
29 changes: 25 additions & 4 deletions TTS/tts/layers/xtts/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,12 @@ def split_sentence(text, lang, text_split_length=250):
# Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
]
],
"hi": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
# Hindi doesn't typically use abbreviations in the same way as Latin-based scripts.
]
],
}


Expand Down Expand Up @@ -429,6 +435,18 @@ def expand_abbreviations_multilingual(text, lang="en"):
("°", " 도 "),
]
],
"hi": [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " और "),
("@", " ऐट दी रेट "),
("%", " प्रतिशत "),
("#", " हैश "),
("$", " डॉलर "),
("£", " पाउंड "),
("°", " डिग्री "),
]
],
}


Expand All @@ -454,6 +472,7 @@ def expand_symbols_multilingual(text, lang="en"):
"tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
"hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
"ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
"hi": re.compile(r"([0-9]+)(st|nd|rd|th)"), # To check
}
_number_re = re.compile(r"[0-9]+")
_currency_re = {
Expand Down Expand Up @@ -505,6 +524,7 @@ def _expand_currency(m, lang="en", currency="USD"):
"tr": ", ",
"hu": ", ",
"ko": ", ",
"hi": ", ",
}

if amount.is_integer():
Expand Down Expand Up @@ -644,17 +664,14 @@ def check_input_length(self, txt, lang):
)

def preprocess_text(self, txt, lang):
if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
if lang in {"ar", "cs", "de", "en", "es", "fr", "hi", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
txt = multilingual_cleaners(txt, lang)
if lang == "zh":
txt = chinese_transliterate(txt)
if lang == "ko":
txt = korean_transliterate(txt)
elif lang == "ja":
txt = japanese_cleaners(txt, self.katsu)
elif lang == "hi":
# @manmay will implement this
txt = basic_cleaners(txt)
else:
raise NotImplementedError(f"Language '{lang}' is not supported.")
return txt
Expand Down Expand Up @@ -777,6 +794,9 @@ def test_expand_numbers_multilingual():
("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
# Hindi
("12.5 सेकंड में।", "साढ़े बारह सेकंड में।", "hi"),
("50 सैनिक थे।", "पचास सैनिक थे।", "hi"),
]
for a, b, lang in test_cases:
out = expand_numbers_multilingual(a, lang=lang)
Expand Down Expand Up @@ -846,6 +866,7 @@ def test_symbols_multilingual():
("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
("मेरे पास 14% बैटरी है।", "मेरे पास चौदह प्रतिशत बैटरी है।", "hi"),
]

for a, b, lang in test_cases:
Expand Down

0 comments on commit 1920328

Please sign in to comment.