Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export the English TTS model from MeloTTS #1509

Merged
merged 3 commits into from
Nov 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 60 additions & 6 deletions .github/workflows/export-melo-tts-to-onnx.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
name: test.wav
path: scripts/melo-tts/test.wav

- name: Publish to huggingface
- name: Publish to huggingface (Chinese + English)
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
Expand All @@ -61,14 +61,14 @@ jobs:
git fetch
git pull
echo "pwd: $PWD"
ls -lh ../scripts/melo-tts
ls -lh ../scripts/melo-tts/zh_en

rm -rf ./

cp -v ../scripts/melo-tts/*.onnx .
cp -v ../scripts/melo-tts/lexicon.txt .
cp -v ../scripts/melo-tts/tokens.txt .
cp -v ../scripts/melo-tts/README.md .
cp -v ../scripts/melo-tts/zh_en/*.onnx .
cp -v ../scripts/melo-tts/zh_en/lexicon.txt .
cp -v ../scripts/melo-tts/zh_en/tokens.txt .
cp -v ../scripts/melo-tts/zh_en/README.md .

curl -SL -O https://raw.githubusercontent.com/myshell-ai/MeloTTS/main/LICENSE

Expand Down Expand Up @@ -102,6 +102,60 @@ jobs:
tar cjvf $dst.tar.bz2 $dst
rm -rf $dst

- name: Publish to huggingface (English)
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"

rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false

git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/vits-melo-tts-en huggingface
cd huggingface
git fetch
git pull
echo "pwd: $PWD"
ls -lh ../scripts/melo-tts/en

rm -rf ./

cp -v ../scripts/melo-tts/en/*.onnx .
cp -v ../scripts/melo-tts/en/lexicon.txt .
cp -v ../scripts/melo-tts/en/tokens.txt .
cp -v ../scripts/melo-tts/en/README.md .

curl -SL -O https://raw.githubusercontent.com/myshell-ai/MeloTTS/main/LICENSE

git lfs track "*.onnx"
git add .

ls -lh

git status

git diff

git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/vits-melo-tts-en main || true

cd ..

rm -rf huggingface/.git*
dst=vits-melo-tts-en

mv huggingface $dst

tar cjvf $dst.tar.bz2 $dst
rm -rf $dst

- name: Release
uses: svenstaro/upload-release-action@v2
with:
Expand Down
3 changes: 2 additions & 1 deletion scripts/melo-tts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
Models in this directory are converted from
https://github.com/myshell-ai/MeloTTS

Note there is only a single female speaker in the model.
Note there is only a single female speaker in the model for Chinese+English TTS,
whereas there are 5 female speakers in the model for English TTS.
221 changes: 221 additions & 0 deletions scripts/melo-tts/export-onnx-en.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
#!/usr/bin/env python3
# This script exports the English-only TTS model.
# It has 5 speakers.
# {'EN-US': 0, 'EN-BR': 1, 'EN_INDIA': 2, 'EN-AU': 3, 'EN-Default': 4}

from typing import Any, Dict

import onnx
import torch
from melo.api import TTS
from melo.text import language_id_map, language_tone_start_map
from melo.text.chinese import pinyin_to_symbol_map
from melo.text.english import eng_dict, refine_syllables
from pypinyin import Style, lazy_pinyin, phrases_dict, pinyin_dict


def generate_tokens(symbol_list):
    """Write the symbol table to ``tokens.txt`` in the current directory.

    Every output line has the form ``<symbol> <index>``, where the index is
    the zero-based position of the symbol inside ``symbol_list``.

    Args:
      symbol_list:
        The symbols to dump, in the order that defines their IDs.
    """
    with open("tokens.txt", "w", encoding="utf-8") as token_file:
        token_file.writelines(
            f"{symbol} {index}\n" for index, symbol in enumerate(symbol_list)
        )


def add_new_english_words(lexicon):
    """Register extra English words in the pronunciation lexicon.

    Args:
      lexicon:
        The word -> syllables mapping to extend. It is modified in-place,
        so nothing is returned.
    """

    # The entries follow the format of
    # https://github.com/myshell-ai/MeloTTS/blob/main/melo/text/cmudict.rep
    #
    # In cmudict.rep a dash "-" separates the syllables of a word. Here each
    # syllable becomes its own inner list of phones.
    #
    # Note: The key may be lowercase, uppercase, or both; either is fine.
    new_words = {
        # Example 1. The word kaldi is not present in cmudict.rep.
        # The line one would add to cmudict.rep is
        #
        #   KALDI  K AH0 - L D IH0
        #
        # "K AH0" and "L D IH0" are separated by a dash, so ["K", "AH0"]
        # goes into one list and ["L", "D", "IH0"] into a separate list.
        "kaldi": [["K", "AH0"], ["L", "D", "IH0"]],
        # Example 2. The word SF.
        # The line one would add to cmudict.rep is
        #
        #   SF  EH1 S - EH1 F
        #
        "SF": [["EH1", "S"], ["EH1", "F"]],
        # Please add your new words here
    }

    lexicon.update(new_words)


def generate_lexicon():
    """Write ``lexicon.txt`` (word, phones, tones) to the current directory.

    The extra words from add_new_english_words() are first merged into
    ``eng_dict`` (in-place). Then, for every word, refine_syllables() yields
    its phones and tones; each tone is shifted by the tone start offset of
    the "EN" language before being written.

    Each output line has the form ``<lowercased word> <phones> <tones>``
    with all fields separated by single spaces.
    """
    add_new_english_words(eng_dict)

    with open("lexicon.txt", "w", encoding="utf-8") as lexicon_file:
        # The offset is the same for every word, so look it up only once.
        tone_offset = language_tone_start_map["EN"]

        for word, syllables in eng_dict.items():
            phones, tones = refine_syllables(syllables)

            phone_field = " ".join(phones)
            tone_field = " ".join(str(tone + tone_offset) for tone in tones)

            lexicon_file.write(f"{word.lower()} {phone_field} {tone_field}\n")


def add_meta_data(filename: str, meta_data: Dict[str, Any]):
    """Replace the metadata of an ONNX model file. The file is rewritten in-place.

    All metadata entries already stored in the model are discarded before
    the new ones are added.

    Args:
      filename:
        Path of the ONNX model to update.
      meta_data:
        Key-value pairs to store. Every value is converted with ``str()``.
    """
    onnx_model = onnx.load(filename)

    # Drop whatever metadata the model already carries.
    del onnx_model.metadata_props[:]

    for name, raw_value in meta_data.items():
        entry = onnx_model.metadata_props.add()
        entry.key = name
        entry.value = str(raw_value)

    onnx.save(onnx_model, filename)


class ModelWrapper(torch.nn.Module):
    """Wrap a MeloTTS object so that it can be traced by torch.onnx.export().

    forward() takes only the inputs listed in its signature; the language IDs
    and the (all-zero) BERT features required by ``infer()`` are built
    internally.
    """

    def __init__(self, model: "TTS"):
        """
        Args:
          model:
            The object to wrap. forward() calls ``model.model.infer(...)``
            and ``model.language`` is used to look up the language ID in
            ``language_id_map``. In this script it is the ``TTS`` instance
            created in main().
        """
        super().__init__()
        self.model = model
        self.lang_id = language_id_map[model.language]

    def forward(
        self,
        x,
        x_lengths,
        tones,
        sid,
        noise_scale,
        length_scale,
        noise_scale_w,
        max_len=None,
    ):
        """Run inference and return the first output of ``infer()``.

        Args:
          x:
            A 2-D tensor of token IDs with shape (N, L). main() passes
            dtype torch.int64.
          x_lengths:
            A 1-D tensor of shape (N,). main() passes the number of tokens
            of each row of ``x`` as torch.int64.
          tones:
            A 2-D tensor with the same shape as ``x``.
          sid:
            The speaker ID. main() passes a torch.int64 tensor of shape (1,).
          noise_scale:
            Forwarded unchanged to ``infer()``.
          length_scale:
            Forwarded unchanged to ``infer()``.
          noise_scale_w:
            Forwarded unchanged to ``infer()``.
          max_len:
            Unused. It is kept only so that the signature stays unchanged.

        Returns:
          Element 0 of the value returned by ``infer()``.
        """
        batch_size = x.shape[0]
        num_tokens = x.shape[1]

        # No BERT features are computed here; zeros of the expected sizes
        # are fed instead. They are created on the device of x so that the
        # wrapper also works when the inputs do not live on the default device.
        bert = torch.zeros(
            batch_size, 1024, num_tokens, dtype=torch.float32, device=x.device
        )
        ja_bert = torch.zeros(
            batch_size, 768, num_tokens, dtype=torch.float32, device=x.device
        )

        # Odd positions get the language ID while even positions stay 0.
        # NOTE(review): presumably the even positions are the blank tokens
        # inserted between symbols when add_blank is enabled -- confirm.
        lang_id = torch.zeros_like(x)
        lang_id[:, 1::2] = self.lang_id

        return self.model.model.infer(
            x=x,
            x_lengths=x_lengths,
            sid=sid,
            tone=tones,
            language=lang_id,
            bert=bert,
            ja_bert=ja_bert,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0]


def main():
    """Export the English TTS model to ``model.onnx`` and attach its metadata.

    Besides the ONNX file, ``lexicon.txt`` and ``tokens.txt`` are also
    written to the current directory.
    """
    generate_lexicon()

    tts = TTS(language="EN", device="cpu")
    generate_tokens(tts.hps["symbols"])

    wrapper = ModelWrapper(tts)

    # Example inputs for the export. Token IDs are random and tones are all
    # zeros; the batch axis is added after the lengths have been computed.
    token_ids = torch.randint(low=0, high=10, size=(60,), dtype=torch.int64)
    print(token_ids.shape)

    # Insertion order matters: it defines both the positional arguments
    # passed to forward() and the input names of the exported graph.
    example_inputs = {
        "x": token_ids.unsqueeze(0),
        "x_lengths": torch.tensor([token_ids.size(0)], dtype=torch.int64),
        "tones": torch.zeros_like(token_ids).unsqueeze(0),
        "sid": torch.tensor([1], dtype=torch.int64),
        "noise_scale": torch.tensor([1.0], dtype=torch.float32),
        "length_scale": torch.tensor([1.0], dtype=torch.float32),
        "noise_scale_w": torch.tensor([1.0], dtype=torch.float32),
    }

    onnx_path = "model.onnx"

    torch.onnx.export(
        wrapper,
        tuple(example_inputs.values()),
        onnx_path,
        opset_version=13,
        input_names=list(example_inputs),
        output_names=["y"],
        dynamic_axes={
            "x": {0: "N", 1: "L"},
            "x_lengths": {0: "N"},
            "tones": {0: "N", 1: "L"},
            "y": {0: "N", 1: "S", 2: "T"},
        },
    )

    data_config = tts.hps.data
    add_meta_data(
        onnx_path,
        {
            "model_type": "melo-vits",
            "comment": "melo",
            "version": 2,
            "language": "English",
            "add_blank": int(data_config.add_blank),
            "n_speakers": len(data_config.spk2id),  # 5
            "jieba": 0,
            "sample_rate": data_config.sampling_rate,
            "bert_dim": 1024,
            "ja_bert_dim": 768,
            "speaker_id": 0,
            "lang_id": language_id_map[tts.language],
            "tone_start": language_tone_start_map[tts.language],
            "url": "https://github.com/myshell-ai/MeloTTS",
            "license": "MIT license",
            "description": "MeloTTS is a high-quality multi-lingual text-to-speech library by MyShell.ai",
        },
    )


# Run the export only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions scripts/melo-tts/export-onnx.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
# This script exports the ZH_EN TTS model, which supports both Chinese and English.
# This model has only 1 speaker.

from typing import Any, Dict

import onnx
Expand Down
20 changes: 20 additions & 0 deletions scripts/melo-tts/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,24 @@ tail tokens.txt

./test.py

mkdir zh_en
mv -v *.onnx zh_en/
mv -v lexicon.txt zh_en
mv -v tokens.txt zh_en
cp -v README.md zh_en

ls -lh
echo "---"
ls -lh zh_en

./export-onnx-en.py

mkdir en
mv -v *.onnx en/
mv -v lexicon.txt en
mv -v tokens.txt en
cp -v README.md en

ls -lh en

ls -lh
4 changes: 0 additions & 4 deletions sherpa-onnx/csrc/macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,6 @@
#define SHERPA_ONNX_READ_META_DATA_STR_ALLOW_EMPTY(dst, src_key) \
do { \
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
if (value.empty()) { \
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
exit(-1); \
} \
\
dst = std::move(value); \
} while (0)
Expand Down
Loading
Loading