Skip to content

Commit

Permalink
Add C++ support for MatchaTTS models not from icefall. (#1834)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Feb 10, 2025
1 parent 7d62ccf commit 9559a10
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 7 deletions.
23 changes: 23 additions & 0 deletions .github/scripts/test-offline-tts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,28 @@ for sid in $(seq 0 10); do
done
rm -rf kokoro-en-v0_19

# Test section for the matcha-tts-fa_en-male model.
log "------------------------------------------------------------"
log "matcha-tts-fa_en-male"
log "------------------------------------------------------------"
# Download the model archive from the tts-models release, unpack it into
# ./matcha-tts-fa_en-male, and delete the tarball once it is extracted.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-tts-fa_en-male.tar.bz2
tar xvf matcha-tts-fa_en-male.tar.bz2
rm matcha-tts-fa_en-male.tar.bz2

# Download the hifigan_v2 vocoder; it is passed below via --matcha-vocoder.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

# Run the executable under test ($EXE is defined outside this section) with the
# downloaded acoustic model, vocoder, token table and espeak-ng data directory.
# The input text mixes English and Persian sentences; the generated audio is
# written to ./tts/test-matcha-fa-en-male.wav.
$EXE \
--matcha-acoustic-model=./matcha-tts-fa_en-male/model.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--matcha-tokens=./matcha-tts-fa_en-male/tokens.txt \
--matcha-data-dir=./matcha-tts-fa_en-male/espeak-ng-data \
--output-filename=./tts/test-matcha-fa-en-male.wav \
--num-threads=2 \
"How are you doing today? این یک نمونه ی تست فارسی است. This is a test."

# Remove the downloaded model directory and vocoder, then list the wav files
# present in ./tts so the output of this run shows up in the log.
rm -rf matcha-tts-fa_en-male
rm hifigan_v2.onnx
ls -lh tts/*.wav

log "------------------------------------------------------------"
log "matcha-icefall-en_US-ljspeech"
log "------------------------------------------------------------"
Expand All @@ -64,6 +86,7 @@ $EXE \

rm hifigan_v2.onnx
rm -rf matcha-icefall-en_US-ljspeech
ls -lh tts/*.wav

log "------------------------------------------------------------"
log "matcha-icefall-zh-baker"
Expand Down
18 changes: 14 additions & 4 deletions scripts/apk/generate-tts-apk-script.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,18 +397,28 @@ def get_matcha_models() -> List[TtsModel]:
m.dict_dir = m.model_dir + "/dict"
m.vocoder = "hifigan_v2.onnx"

english_models = [
english_persian_models = [
TtsModel(
model_dir="matcha-icefall-en_US-ljspeech",
acoustic_model_name="model-steps-3.onnx",
lang="en",
)
),
TtsModel(
model_dir="matcha-tts-fa_en-male",
acoustic_model_name="model.onnx",
lang="fa",
),
TtsModel(
model_dir="matcha-tts-fa_en-female",
acoustic_model_name="model.onnx",
lang="fa",
),
]
for m in english_models:
for m in english_persian_models:
m.data_dir = f"{m.model_dir}/espeak-ng-data"
m.vocoder = "hifigan_v2.onnx"

return chinese_models + english_models
return chinese_models + english_persian_models


def get_kokoro_models() -> List[TtsModel]:
Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/offline-tts-matcha-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
}

std::vector<TokenIDs> token_ids =
frontend_->ConvertTextToTokenIds(text, "en-US");
frontend_->ConvertTextToTokenIds(text, meta_data.voice);

if (token_ids.empty() ||
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
Expand Down
2 changes: 2 additions & 0 deletions sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ struct OfflineTtsMatchaModelMetaData {
int32_t has_espeak = 0;
int32_t use_eos_bos = 0;
int32_t pad_id = 0;

std::string voice;
};

} // namespace sherpa_onnx
Expand Down
23 changes: 21 additions & 2 deletions sherpa-onnx/csrc/offline-tts-matcha-model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,32 @@ class OfflineTtsMatchaModel::Impl {
Ort::Value sid_tensor =
Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1);

std::array<float, 2> scales = {noise_scale, length_scale};
int64_t scales_shape = 2;

Ort::Value scales_tensor = Ort::Value::CreateTensor(
memory_info, scales.data(), scales.size(), &scales_shape, 1);

std::vector<Ort::Value> inputs;
inputs.reserve(5);
inputs.push_back(std::move(x));
inputs.push_back(std::move(x_length));
inputs.push_back(std::move(noise_scale_tensor));
inputs.push_back(std::move(length_scale_tensor));
if (input_names_[2] == "scales") {
// for models from
// https://github.com/shivammehta25/Matcha-TTS
inputs.push_back(std::move(scales_tensor));
} else {
// for models from icefall
inputs.push_back(std::move(noise_scale_tensor));
inputs.push_back(std::move(length_scale_tensor));
}

if (input_names_.size() == 5 && input_names_.back() == "sid") {
// for models from icefall
inputs.push_back(std::move(sid_tensor));

// Note that multi-speaker TTS models from
// https://github.com/shivammehta25/Matcha-TTS are not supported yet.
}

auto out =
Expand Down Expand Up @@ -145,6 +162,8 @@ class OfflineTtsMatchaModel::Impl {
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos");
SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id");
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice",
"en-us");
}

private:
Expand Down

0 comments on commit 9559a10

Please sign in to comment.