From a00d3b482123e65e3544e40ec0360e3d66d47faa Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 2 Jan 2025 15:15:30 +0800 Subject: [PATCH] Add Java API for Matcha-TTS models. (#1673) --- .github/workflows/run-java-test.yaml | 7 ++ .gitignore | 1 + .../NonStreamingTtsMatchaEn.java | 60 +++++++++ .../NonStreamingTtsMatchaZh.java | 66 ++++++++++ .../run-non-streaming-tts-matcha-en.sh | 45 +++++++ .../run-non-streaming-tts-matcha-zh.sh | 44 +++++++ sherpa-onnx/csrc/piper-phonemize-lexicon.cc | 5 + sherpa-onnx/java-api/Makefile | 1 + .../onnx/OfflineTtsMatchaModelConfig.java | 116 ++++++++++++++++++ .../sherpa/onnx/OfflineTtsModelConfig.java | 12 ++ .../onnx/OfflineTtsVitsModelConfig.java | 4 +- 11 files changed, 359 insertions(+), 2 deletions(-) create mode 100644 java-api-examples/NonStreamingTtsMatchaEn.java create mode 100644 java-api-examples/NonStreamingTtsMatchaZh.java create mode 100755 java-api-examples/run-non-streaming-tts-matcha-en.sh create mode 100755 java-api-examples/run-non-streaming-tts-matcha-zh.sh create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsMatchaModelConfig.java diff --git a/.github/workflows/run-java-test.yaml b/.github/workflows/run-java-test.yaml index cca94422e..b375f01e5 100644 --- a/.github/workflows/run-java-test.yaml +++ b/.github/workflows/run-java-test.yaml @@ -235,6 +235,13 @@ jobs: shell: bash run: | cd ./java-api-examples + + ./run-non-streaming-tts-matcha-zh.sh + ./run-non-streaming-tts-matcha-en.sh + + rm -rf matcha-icefall-* + rm hifigan_v2.onnx + ./run-non-streaming-tts-piper-en.sh rm -rf vits-piper-* diff --git a/.gitignore b/.gitignore index eeec52d9c..e5226f0e1 100644 --- a/.gitignore +++ b/.gitignore @@ -126,3 +126,4 @@ sherpa-onnx-moonshine-base-en-int8 harmony-os/SherpaOnnxHar/sherpa_onnx/LICENSE harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md matcha-icefall-zh-baker +matcha-icefall-en_US-ljspeech diff --git a/java-api-examples/NonStreamingTtsMatchaEn.java 
b/java-api-examples/NonStreamingTtsMatchaEn.java
new file mode 100644
index 000000000..bda41f061
--- /dev/null
+++ b/java-api-examples/NonStreamingTtsMatchaEn.java
@@ -0,0 +1,60 @@
+// Copyright 2025 Xiaomi Corporation
+
+// This file shows how to use a matcha English model to convert text to
+// speech, save the result as a wave file, and print timing statistics.
+import com.k2fsa.sherpa.onnx.*;
+
+public class NonStreamingTtsMatchaEn {
+  public static void main(String[] args) {
+    // please visit
+    // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+    // to download model files
+    String acousticModel = "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx";
+    String vocoder = "./hifigan_v2.onnx";
+    String tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt";
+    String dataDir = "./matcha-icefall-en_US-ljspeech/espeak-ng-data";
+    String text =
+        "Today as always, men fall into two groups: slaves and free men. Whoever does not have"
+            + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a"
+            + " businessman, an official, or a scholar.";
+
+    OfflineTtsMatchaModelConfig matchaModelConfig =
+        OfflineTtsMatchaModelConfig.builder()
+            .setAcousticModel(acousticModel)
+            .setVocoder(vocoder)
+            .setTokens(tokens)
+            .setDataDir(dataDir)
+            .build();
+
+    OfflineTtsModelConfig modelConfig =
+        OfflineTtsModelConfig.builder()
+            .setMatcha(matchaModelConfig)
+            .setNumThreads(1)
+            .setDebug(true)
+            .build();
+
+    OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();
+    OfflineTts tts = new OfflineTts(config);
+
+    int sid = 0;
+    float speed = 1.0f;
+    long start = System.currentTimeMillis();
+    GeneratedAudio audio = tts.generate(text, sid, speed);
+    long stop = System.currentTimeMillis();
+
+    float timeElapsedSeconds = (stop - start) / 1000.0f;
+
+    float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
+    float realTimeFactor = timeElapsedSeconds / audioDuration;
+
+    String waveFilename = "tts-matcha-en.wav";
+    audio.save(waveFilename);
+    System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
+    System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
+    System.out.printf("-- real-time factor (RTF): %.3f\n", realTimeFactor);
+    System.out.printf("-- text: %s\n", text);
+    System.out.printf("-- Saved to %s\n", waveFilename);
+
+    tts.release();
+  }
+}
diff --git a/java-api-examples/NonStreamingTtsMatchaZh.java b/java-api-examples/NonStreamingTtsMatchaZh.java
new file mode 100644
index 000000000..dec24dbb3
--- /dev/null
+++ b/java-api-examples/NonStreamingTtsMatchaZh.java
@@ -0,0 +1,66 @@
+// Copyright 2025 Xiaomi Corporation
+
+// This file shows how to use a matcha Chinese TTS model to convert text to
+// speech, save the result as a wave file, and print timing statistics.
+import com.k2fsa.sherpa.onnx.*;
+
+public class NonStreamingTtsMatchaZh {
+  public static void main(String[] args) {
+    // please visit
+    // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+    // to download model files
+    String acousticModel = "./matcha-icefall-zh-baker/model-steps-3.onnx";
+    String vocoder = "./hifigan_v2.onnx";
+    String tokens = "./matcha-icefall-zh-baker/tokens.txt";
+    String lexicon = "./matcha-icefall-zh-baker/lexicon.txt";
+    String dictDir = "./matcha-icefall-zh-baker/dict";
+    String ruleFsts =
+        "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst";
+    String text =
+        "某某银行的副行长和一些行政领导表示,他们去过长江"
+            + "和长白山; 经济不断增长。"
+            + "2024年12月31号,拨打110或者18920240511。"
+            + "123456块钱。";
+
+    OfflineTtsMatchaModelConfig matchaModelConfig =
+        OfflineTtsMatchaModelConfig.builder()
+            .setAcousticModel(acousticModel)
+            .setVocoder(vocoder)
+            .setTokens(tokens)
+            .setLexicon(lexicon)
+            .setDictDir(dictDir)
+            .build();
+
+    OfflineTtsModelConfig modelConfig =
+        OfflineTtsModelConfig.builder()
+            .setMatcha(matchaModelConfig)
+            .setNumThreads(1)
+            .setDebug(true)
+            .build();
+
+    OfflineTtsConfig config =
+        OfflineTtsConfig.builder().setModel(modelConfig).setRuleFsts(ruleFsts).build();
+    OfflineTts tts = new OfflineTts(config);
+
+    int sid = 0;
+    float speed = 1.0f;
+    long start = System.currentTimeMillis();
+    GeneratedAudio audio = tts.generate(text, sid, speed);
+    long stop = System.currentTimeMillis();
+
+    float timeElapsedSeconds = (stop - start) / 1000.0f;
+
+    float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
+    float realTimeFactor = timeElapsedSeconds / audioDuration;
+
+    String waveFilename = "tts-matcha-zh.wav";
+    audio.save(waveFilename);
+    System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
+    System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
+    System.out.printf("-- real-time factor (RTF): %.3f\n", realTimeFactor);
+    System.out.printf("-- text: %s\n", text);
+    System.out.printf("-- Saved to %s\n", waveFilename);
+
+    tts.release();
+  }
+}
diff --git a/java-api-examples/run-non-streaming-tts-matcha-en.sh b/java-api-examples/run-non-streaming-tts-matcha-en.sh
new file mode 100755
index 000000000..ce0289fc9
--- /dev/null
+++ b/java-api-examples/run-non-streaming-tts-matcha-en.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+set -ex
+
+if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
+  mkdir -p ../build
+  pushd ../build
+  cmake \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    -DSHERPA_ONNX_ENABLE_JNI=ON \
+    ..
+
+  make -j4
+  ls -lh lib
+  popd
+fi
+
+if [ !
-f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 + tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 + rm matcha-icefall-en_US-ljspeech.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + NonStreamingTtsMatchaEn.java diff --git a/java-api-examples/run-non-streaming-tts-matcha-zh.sh b/java-api-examples/run-non-streaming-tts-matcha-zh.sh new file mode 100755 index 000000000..a339e298a --- /dev/null +++ b/java-api-examples/run-non-streaming-tts-matcha-zh.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker +# to download more models +if [ ! 
-f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 + tar xvf matcha-icefall-zh-baker.tar.bz2 + rm matcha-icefall-zh-baker.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + NonStreamingTtsMatchaZh.java diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index 9bc93ce98..982260003 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -366,6 +366,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( #endif #if __OHOS__ +template PiperPhonemizeLexicon::PiperPhonemizeLexicon( + NativeResourceManager *mgr, const std::string &tokens, + const std::string &data_dir, + const OfflineTtsVitsModelMetaData &vits_meta_data); + template PiperPhonemizeLexicon::PiperPhonemizeLexicon( NativeResourceManager *mgr, const std::string &tokens, const std::string &data_dir, diff --git a/sherpa-onnx/java-api/Makefile b/sherpa-onnx/java-api/Makefile index 8b9278fc0..816d2139f 100644 --- a/sherpa-onnx/java-api/Makefile +++ b/sherpa-onnx/java-api/Makefile @@ -35,6 +35,7 @@ java_files += OfflineRecognizerResult.java java_files += OfflineStream.java java_files += OfflineRecognizer.java +java_files += OfflineTtsMatchaModelConfig.java java_files += OfflineTtsVitsModelConfig.java java_files += OfflineTtsModelConfig.java java_files += OfflineTtsConfig.java diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsMatchaModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsMatchaModelConfig.java new file mode 100644 index 000000000..8a95aea75 --- /dev/null +++ 
b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsMatchaModelConfig.java @@ -0,0 +1,116 @@ +// Copyright 2025 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineTtsMatchaModelConfig { + private final String acousticModel; + private final String vocoder; + private final String lexicon; + private final String tokens; + private final String dataDir; + private final String dictDir; + private final float noiseScale; + private final float lengthScale; + + private OfflineTtsMatchaModelConfig(Builder builder) { + this.acousticModel = builder.acousticModel; + this.vocoder = builder.vocoder; + this.lexicon = builder.lexicon; + this.tokens = builder.tokens; + this.dataDir = builder.dataDir; + this.dictDir = builder.dictDir; + this.noiseScale = builder.noiseScale; + this.lengthScale = builder.lengthScale; + } + + public static Builder builder() { + return new Builder(); + } + + public String getAcousticModel() { + return acousticModel; + } + + public String getVocoder() { + return vocoder; + } + + public String getLexicon() { + return lexicon; + } + + public String getTokens() { + return tokens; + } + + public String getDataDir() { + return dataDir; + } + + public String getDictDir() { + return dictDir; + } + + public float getLengthScale() { + return lengthScale; + } + + public float getNoiseScale() { + return noiseScale; + } + + public static class Builder { + private String acousticModel = ""; + private String vocoder = ""; + private String lexicon = ""; + private String tokens = ""; + private String dataDir = ""; + private String dictDir = ""; + private float noiseScale = 1.0f; + private float lengthScale = 1.0f; + + public OfflineTtsMatchaModelConfig build() { + return new OfflineTtsMatchaModelConfig(this); + } + + public Builder setAcousticModel(String acousticModel) { + this.acousticModel = acousticModel; + return this; + } + + public Builder setVocoder(String vocoder) { + this.vocoder = vocoder; + return this; + } + + public Builder 
setTokens(String tokens) { + this.tokens = tokens; + return this; + } + + public Builder setLexicon(String lexicon) { + this.lexicon = lexicon; + return this; + } + + public Builder setDataDir(String dataDir) { + this.dataDir = dataDir; + return this; + } + + public Builder setDictDir(String dictDir) { + this.dictDir = dictDir; + return this; + } + + public Builder setNoiseScale(float noiseScale) { + this.noiseScale = noiseScale; + return this; + } + + public Builder setLengthScale(float lengthScale) { + this.lengthScale = lengthScale; + return this; + } + } +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsModelConfig.java index 52960217c..ff3589b13 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsModelConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsModelConfig.java @@ -4,12 +4,14 @@ public class OfflineTtsModelConfig { private final OfflineTtsVitsModelConfig vits; + private final OfflineTtsMatchaModelConfig matcha; private final int numThreads; private final boolean debug; private final String provider; private OfflineTtsModelConfig(Builder builder) { this.vits = builder.vits; + this.matcha = builder.matcha; this.numThreads = builder.numThreads; this.debug = builder.debug; this.provider = builder.provider; @@ -23,8 +25,13 @@ public OfflineTtsVitsModelConfig getVits() { return vits; } + public OfflineTtsMatchaModelConfig getMatcha() { + return matcha; + } + public static class Builder { private OfflineTtsVitsModelConfig vits = OfflineTtsVitsModelConfig.builder().build(); + private OfflineTtsMatchaModelConfig matcha = OfflineTtsMatchaModelConfig.builder().build(); private int numThreads = 1; private boolean debug = true; private String provider = "cpu"; @@ -38,6 +45,11 @@ public Builder setVits(OfflineTtsVitsModelConfig vits) { return this; } + public Builder setMatcha(OfflineTtsMatchaModelConfig matcha) { + this.matcha 
= matcha; + return this; + } + public Builder setNumThreads(int numThreads) { this.numThreads = numThreads; return this; diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsVitsModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsVitsModelConfig.java index 4cfc9eebd..35bfd41c6 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsVitsModelConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsVitsModelConfig.java @@ -60,9 +60,9 @@ public float getNoiseScaleW() { } public static class Builder { - private String model; + private String model = ""; private String lexicon = ""; - private String tokens; + private String tokens = ""; private String dataDir = ""; private String dictDir = ""; private float noiseScale = 0.667f;