From ea7c45b60cac64521cca60a33b10b88fdad20ed0 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 19 Oct 2023 17:38:23 +0800 Subject: [PATCH] Add C API for offline TTS. (#373) --- .github/scripts/test-offline-tts.sh | 98 ++++++++++ .github/workflows/linux-gpu.yaml | 12 +- .github/workflows/linux.yaml | 17 +- .github/workflows/macos.yaml | 12 +- .github/workflows/pkg-config.yaml | 18 ++ .github/workflows/windows-x64-cuda.yaml | 10 + .github/workflows/windows-x64.yaml | 12 +- .github/workflows/windows-x86.yaml | 12 +- .gitignore | 1 + c-api-examples/CMakeLists.txt | 3 + c-api-examples/Makefile | 12 +- c-api-examples/README.md | 9 + c-api-examples/offline-tts-c-api.c | 203 ++++++++++++++++++++ sherpa-onnx/c-api/c-api.cc | 109 +++++++++-- sherpa-onnx/c-api/c-api.h | 56 ++++++ sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc | 2 +- 16 files changed, 562 insertions(+), 24 deletions(-) create mode 100755 .github/scripts/test-offline-tts.sh create mode 100644 c-api-examples/offline-tts-c-api.c diff --git a/.github/scripts/test-offline-tts.sh b/.github/scripts/test-offline-tts.sh new file mode 100755 index 000000000..15be2d921 --- /dev/null +++ b/.github/scripts/test-offline-tts.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +set -e + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +echo "EXE is $EXE" +echo "PATH: $PATH" + +which $EXE + +# test waves are saved in ./tts +mkdir ./tts + +log "------------------------------------------------------------" +log "vits-ljs test" +log "------------------------------------------------------------" + +repo_url=https://huggingface.co/csukuangfj/vits-ljs +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model from $repo_url" +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "*.onnx" +ls -lh *.onnx +popd + +$EXE \ + --vits-model=$repo/vits-ljs.onnx \ + 
--vits-lexicon=$repo/lexicon.txt \ + --vits-tokens=$repo/tokens.txt \ + --output-filename=./tts/vits-ljs.wav \ + 'liliana, the most beautiful and lovely assistant of our team!' + +ls -lh ./tts + +rm -rfv $repo + +log "------------------------------------------------------------" +log "vits-vctk test" +log "------------------------------------------------------------" + +repo_url=https://huggingface.co/csukuangfj/vits-vctk +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model from $repo_url" +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "*.onnx" +ls -lh *.onnx +popd + +for sid in 0 10 90; do + $EXE \ + --vits-model=$repo/vits-vctk.onnx \ + --vits-lexicon=$repo/lexicon.txt \ + --vits-tokens=$repo/tokens.txt \ + --sid=$sid \ + --output-filename=./tts/vits-vctk-${sid}.wav \ + 'liliana, the most beautiful and lovely assistant of our team!' +done + +rm -rfv $repo + +ls -lh tts/ + +log "------------------------------------------------------------" +log "vits-zh-aishell3" +log "------------------------------------------------------------" + +repo_url=https://huggingface.co/csukuangfj/vits-zh-aishell3 +log "Start testing ${repo_url}" +repo=$(basename $repo_url) +log "Download pretrained model from $repo_url" +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +pushd $repo +git lfs pull --include "*.onnx" +ls -lh *.onnx +popd + +for sid in 0 10 90; do + $EXE \ + --vits-model=$repo/vits-aishell3.onnx \ + --vits-lexicon=$repo/lexicon.txt \ + --vits-tokens=$repo/tokens.txt \ + --sid=$sid \ + --output-filename=./tts/vits-aishell3-${sid}.wav \ + '林美丽最美丽' +done + +rm -rfv $repo + +ls -lh ./tts/ diff --git a/.github/workflows/linux-gpu.yaml b/.github/workflows/linux-gpu.yaml index de4e3e5af..db01420f3 100644 --- a/.github/workflows/linux-gpu.yaml +++ b/.github/workflows/linux-gpu.yaml @@ -12,6 +12,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - 
'.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -26,6 +27,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -48,7 +50,7 @@ jobs: build_type: [Release, Debug] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 @@ -75,6 +77,14 @@ jobs: file build/bin/sherpa-onnx readelf -d build/bin/sherpa-onnx + - name: Test offline TTS + shell: bash + run: | + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline-tts + + .github/scripts/test-offline-tts.sh + - name: Test online paraformer shell: bash run: | diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 1bfb327ee..4fb48c4cf 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -12,6 +12,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -26,6 +27,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -49,7 +51,7 @@ jobs: shared_lib: [ON, OFF] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 @@ -76,6 +78,14 @@ jobs: file build/bin/sherpa-onnx readelf -d build/bin/sherpa-onnx + - name: Test offline TTS + shell: bash + run: | + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline-tts + + .github/scripts/test-offline-tts.sh + - name: Test online paraformer shell: bash run: | @@ -150,3 +160,8 @@ jobs: file_glob: true overwrite: true 
file: sherpa-onnx-*linux-x64.tar.bz2 + + - uses: actions/upload-artifact@v3 + with: + name: tts-generated-test-files + path: tts diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index 5cc6e83e2..721e1db3c 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -12,6 +12,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -24,6 +25,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -44,7 +46,7 @@ jobs: build_type: [Release, Debug] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 @@ -74,6 +76,14 @@ jobs: otool -L build/bin/sherpa-onnx otool -l build/bin/sherpa-onnx + - name: Test offline TTS + shell: bash + run: | + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline-tts + + .github/scripts/test-offline-tts.sh + - name: Test online paraformer shell: bash run: | diff --git a/.github/workflows/pkg-config.yaml b/.github/workflows/pkg-config.yaml index 6445ae5dc..4a898e373 100644 --- a/.github/workflows/pkg-config.yaml +++ b/.github/workflows/pkg-config.yaml @@ -9,6 +9,7 @@ on: - '*' paths: - '.github/workflows/pkg-config.yaml' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -19,6 +20,7 @@ on: - master paths: - '.github/workflows/pkg-config.yaml' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -92,9 +94,20 @@ jobs: run: | export PKG_CONFIG_PATH=$PWD/build/install:$PKG_CONFIG_PATH cd c-api-examples + gcc -o decode-file-c-api $(pkg-config --cflags sherpa-onnx) ./decode-file-c-api.c $(pkg-config 
--libs sherpa-onnx) ./decode-file-c-api --help + gcc -o offline-tts-c-api $(pkg-config --cflags sherpa-onnx) ./offline-tts-c-api.c $(pkg-config --libs sherpa-onnx) + ./offline-tts-c-api --help + + - name: Test offline TTS C API + shell: bash + run: | + export PATH=$PWD/c-api-examples:$PATH + export EXE=offline-tts-c-api + .github/scripts/test-offline-tts.sh + - name: Test online transducer (C API) shell: bash run: | @@ -102,3 +115,8 @@ jobs: export EXE=decode-file-c-api .github/scripts/test-online-transducer.sh + + - uses: actions/upload-artifact@v3 + with: + name: tts-generated-test-files + path: tts diff --git a/.github/workflows/windows-x64-cuda.yaml b/.github/workflows/windows-x64-cuda.yaml index afbb501c4..cd181fb12 100644 --- a/.github/workflows/windows-x64-cuda.yaml +++ b/.github/workflows/windows-x64-cuda.yaml @@ -12,6 +12,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -24,6 +25,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -64,6 +66,14 @@ jobs: ls -lh ./bin/Release/sherpa-onnx.exe + - name: Test offline TTS + shell: bash + run: | + export PATH=$PWD/build/bin/Release:$PATH + export EXE=sherpa-onnx-offline-tts.exe + + .github/scripts/test-offline-tts.sh + - name: Test online paraformer for windows x64 shell: bash run: | diff --git a/.github/workflows/windows-x64.yaml b/.github/workflows/windows-x64.yaml index c491b37d6..9e826f977 100644 --- a/.github/workflows/windows-x64.yaml +++ b/.github/workflows/windows-x64.yaml @@ -12,6 +12,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - 
'.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -24,6 +25,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -45,7 +47,7 @@ jobs: shared_lib: [ON, OFF] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 @@ -65,6 +67,14 @@ jobs: ls -lh ./bin/Release/sherpa-onnx.exe + - name: Test offline TTS + shell: bash + run: | + export PATH=$PWD/build/bin/Release:$PATH + export EXE=sherpa-onnx-offline-tts.exe + + .github/scripts/test-offline-tts.sh + - name: Test online paraformer for windows x64 shell: bash run: | diff --git a/.github/workflows/windows-x86.yaml b/.github/workflows/windows-x86.yaml index e74aa54b2..c6fa61ea2 100644 --- a/.github/workflows/windows-x86.yaml +++ b/.github/workflows/windows-x86.yaml @@ -12,6 +12,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -24,6 +25,7 @@ on: - '.github/scripts/test-online-paraformer.sh' - '.github/scripts/test-offline-transducer.sh' - '.github/scripts/test-offline-ctc.sh' + - '.github/scripts/test-offline-tts.sh' - 'CMakeLists.txt' - 'cmake/**' - 'sherpa-onnx/csrc/*' @@ -45,7 +47,7 @@ jobs: shared_lib: [ON, OFF] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 @@ -65,6 +67,14 @@ jobs: ls -lh ./bin/Release/sherpa-onnx.exe + - name: Test offline TTS + shell: bash + run: | + export PATH=$PWD/build/bin/Release:$PATH + export EXE=sherpa-onnx-offline-tts.exe + + .github/scripts/test-offline-tts.sh + - name: Test online paraformer for windows x86 shell: bash run: | diff --git a/.gitignore b/.gitignore index b5b678525..cd68dd483 100644 
--- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ run-bilingual*.sh run-*-zipformer.sh run-zh.sh decode-file-c-api +offline-tts-c-api run-decode-file-c-api.sh sherpa-onnx-ffmpeg build-ios diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index 38347a3c2..95983cd86 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -3,3 +3,6 @@ include(cargs) include_directories(${CMAKE_SOURCE_DIR}) add_executable(decode-file-c-api decode-file-c-api.c) target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs) + +add_executable(offline-tts-c-api offline-tts-c-api.c) +target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs) diff --git a/c-api-examples/Makefile b/c-api-examples/Makefile index 18ddda27e..767e0c5b0 100644 --- a/c-api-examples/Makefile +++ b/c-api-examples/Makefile @@ -4,9 +4,19 @@ CUR_DIR :=$(shell pwd) CFLAGS := -I ../ -I ../build/_deps/cargs-src/include/ LDFLAGS := -L ../build/lib LDFLAGS += -L ../build/_deps/onnxruntime-src/lib -LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-native-fbank-core -lcargs +LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lcargs LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/lib LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/_deps/onnxruntime-src/lib +.PHONY: all clean + +all: decode-file-c-api offline-tts-c-api + decode-file-c-api: decode-file-c-api.c $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +offline-tts-c-api: offline-tts-c-api.c + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +clean: + $(RM) ./decode-file-c-api ./offline-tts-c-api diff --git a/c-api-examples/README.md b/c-api-examples/README.md index 85f2e505f..a7e6f5fd4 100644 --- a/c-api-examples/README.md +++ b/c-api-examples/README.md @@ -6,4 +6,13 @@ Please refer to the documentation https://k2-fsa.github.io/sherpa/onnx/c-api/index.html for details. 
+ +## File descriptions + +- [decode-file-c-api.c](./decode-file-c-api.c) This file shows how to use the C API + for speech recognition with a streaming model. + +- [offline-tts-c-api.c](./offline-tts-c-api.c) This file shows how to use the C API + to convert text to speech with a non-streaming model. + [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx diff --git a/c-api-examples/offline-tts-c-api.c b/c-api-examples/offline-tts-c-api.c new file mode 100644 index 000000000..54ba4d099 --- /dev/null +++ b/c-api-examples/offline-tts-c-api.c @@ -0,0 +1,203 @@ +// c-api-examples/offline-tts-c-api.c +// +// Copyright (c) 2023 Xiaomi Corporation + +// This file shows how to use sherpa-onnx C API +// to convert text to speech using an offline model. + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "cargs.h" +#include "sherpa-onnx/c-api/c-api.h" + +static struct cag_option options[] = { + {.identifier = 'h', + .access_letters = "h", + .access_name = "help", + .description = "Show help"}, + {.access_name = "vits-model", + .value_name = "/path/to/xxx.onnx", + .identifier = '0', + .description = "Path to VITS model"}, + {.access_name = "vits-lexicon", + .value_name = "/path/to/lexicon.txt", + .identifier = '1', + .description = "Path to lexicon.txt for VITS models"}, + {.access_name = "vits-tokens", + .value_name = "/path/to/tokens.txt", + .identifier = '2', + .description = "Path to tokens.txt for VITS models"}, + {.access_name = "vits-noise-scale", + .value_name = "0.667", + .identifier = '3', + .description = "noise_scale for VITS models"}, + {.access_name = "vits-noise-scale-w", + .value_name = "0.8", + .identifier = '4', + .description = "noise_scale_w for VITS models"}, + {.access_name = "vits-length-scale", + .value_name = "1.0", + .identifier = '5', + .description = + "length_scale for VITS models. Default to 1. You can tune it " + "to change the speech speed. small -> faster; large -> slower. 
"}, + {.access_name = "num-threads", + .value_name = "1", + .identifier = '6', + .description = "Number of threads"}, + {.access_name = "provider", + .value_name = "cpu", + .identifier = '7', + .description = "Provider: cpu (default), cuda, coreml"}, + {.access_name = "debug", + .value_name = "0", + .identifier = '8', + .description = "1 to show debug messages while loading the model"}, + {.access_name = "sid", + .value_name = "0", + .identifier = '9', + .description = "Speaker ID. Default to 0. Note it is not used for " + "single-speaker models."}, + {.access_name = "output-filename", + .value_name = "./generated.wav", + .identifier = 'a', + .description = + "Filename to save the generated audio. Default to ./generated.wav"}, +}; + +static void ShowUsage() { + const char *kUsageMessage = + "Offline text-to-speech with sherpa-onnx C API" + "\n" + "./offline-tts-c-api \\\n" + " --vits-model=/path/to/model.onnx \\\n" + " --vits-lexicon=/path/to/lexicon.txt \\\n" + " --vits-tokens=/path/to/tokens.txt \\\n" + " --sid=0 \\\n" + " --output-filename=./generated.wav \\\n" + " 'some text within single quotes on linux/macos or use double quotes on " + "windows'\n" + "\n" + "It will generate a file ./generated.wav as specified by " + "--output-filename.\n" + "\n" + "You can download a test model from\n" + "https://huggingface.co/csukuangfj/vits-ljs\n" + "\n" + "For instance, you can use:\n" + "wget " + "https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx\n" + "wget " + "https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt\n" + "wget " + "https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt\n" + "\n" + "./offline-tts-c-api \\\n" + " --vits-model=./vits-ljs.onnx \\\n" + " --vits-lexicon=./lexicon.txt \\\n" + " --vits-tokens=./tokens.txt \\\n" + " --sid=0 \\\n" + " --output-filename=./generated.wav \\\n" + " 'liliana, the most beautiful and lovely assistant of our team!'\n" + "\n" + "Please see\n" + 
"https://k2-fsa.github.io/sherpa/onnx/tts/index.html\n" + "for details.\n\n"; + + fprintf(stderr, "%s", kUsageMessage); + cag_option_print(options, CAG_ARRAY_SIZE(options), stderr); + exit(0); +} + +int32_t main(int32_t argc, char *argv[]) { + cag_option_context context; + char identifier; + const char *value; + + cag_option_prepare(&context, options, CAG_ARRAY_SIZE(options), argc, argv); + + SherpaOnnxOfflineTtsConfig config; + memset(&config, 0, sizeof(config)); + + int32_t sid = 0; + const char *filename = strdup("./generated.wav"); + const char *text; + + while (cag_option_fetch(&context)) { + identifier = cag_option_get(&context); + value = cag_option_get_value(&context); + switch (identifier) { + case '0': + config.model.vits.model = value; + break; + case '1': + config.model.vits.lexicon = value; + break; + case '2': + config.model.vits.tokens = value; + break; + case '3': + config.model.vits.noise_scale = atof(value); + break; + case '4': + config.model.vits.noise_scale_w = atof(value); + break; + case '5': + config.model.vits.length_scale = atof(value); + break; + case '6': + config.model.num_threads = atoi(value); + break; + case '7': + config.model.provider = value; + break; + case '8': + config.model.debug = atoi(value); + break; + case '9': + sid = atoi(value); + break; + case 'a': + free((void *)filename); + filename = strdup(value); + break; + case 'h': + // fall through + default: + ShowUsage(); + } + } + + if (!config.model.vits.model || !config.model.vits.lexicon || + !config.model.vits.tokens) { + ShowUsage(); + } + + // the last arg is the text + text = argv[argc - 1]; + if (text[0] == '-') { + fprintf(stderr, "\n***Please input your text!***\n\n"); + fprintf(stderr, "\n---------------Usage---------------\n\n"); + ShowUsage(); + } + + SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); + + const SherpaOnnxGeneratedAudio *audio = + SherpaOnnxOfflineTtsGenerate(tts, text, sid); + + SherpaOnnxDestroyOfflineWriteWave(audio, filename); + 
+ SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); + SherpaOnnxDestroyOfflineTts(tts); + + fprintf(stderr, "Input text is: %s\n", text); + fprintf(stderr, "Speaker ID is: %d\n", sid); + fprintf(stderr, "Saved to: %s\n", filename); + + free((void *)filename); + + return 0; +} diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index ca97da25f..c07592e7d 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -12,8 +12,10 @@ #include "sherpa-onnx/csrc/circular-buffer.h" #include "sherpa-onnx/csrc/display.h" #include "sherpa-onnx/csrc/offline-recognizer.h" +#include "sherpa-onnx/csrc/offline-tts.h" #include "sherpa-onnx/csrc/online-recognizer.h" #include "sherpa-onnx/csrc/voice-activity-detector.h" +#include "sherpa-onnx/csrc/wave-writer.h" struct SherpaOnnxOnlineRecognizer { std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl; @@ -204,12 +206,14 @@ const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( } void DestroyOnlineRecognizerResult(const SherpaOnnxOnlineRecognizerResult *r) { - delete[] r->text; - delete[] r->json; - delete[] r->tokens; - delete[] r->tokens_arr; - delete[] r->timestamps; - delete r; + if (r) { + delete[] r->text; + delete[] r->json; + delete[] r->tokens; + delete[] r->tokens_arr; + delete[] r->timestamps; + delete r; + } } void Reset(SherpaOnnxOnlineRecognizer *recognizer, @@ -385,9 +389,11 @@ const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( void DestroyOfflineRecognizerResult( const SherpaOnnxOfflineRecognizerResult *r) { - delete[] r->text; - delete[] r->timestamps; - delete r; + if (r) { + delete[] r->text; + delete[] r->timestamps; + delete r; + } } // ============================================================ @@ -493,18 +499,16 @@ int32_t SherpaOnnxVoiceActivityDetectorDetected( return p->impl->IsSpeechDetected(); } -void SherpaOnnxVoiceActivityDetectorPop( - SherpaOnnxVoiceActivityDetector *p) { +void SherpaOnnxVoiceActivityDetectorPop(SherpaOnnxVoiceActivityDetector *p) { p->impl->Pop(); } -void 
SherpaOnnxVoiceActivityDetectorClear( - SherpaOnnxVoiceActivityDetector *p) { +void SherpaOnnxVoiceActivityDetectorClear(SherpaOnnxVoiceActivityDetector *p) { p->impl->Clear(); } -const SherpaOnnxSpeechSegment * -SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) { +const SherpaOnnxSpeechSegment *SherpaOnnxVoiceActivityDetectorFront( + SherpaOnnxVoiceActivityDetector *p) { const sherpa_onnx::SpeechSegment &segment = p->impl->Front(); SherpaOnnxSpeechSegment *ans = new SherpaOnnxSpeechSegment; @@ -517,10 +521,81 @@ SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) { } void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) { - delete[] p->samples; - delete p; + if (p) { + delete[] p->samples; + delete p; + } } void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) { p->impl->Reset(); } + +struct SherpaOnnxOfflineTts { + std::unique_ptr<sherpa_onnx::OfflineTts> impl; +}; + +SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( + const SherpaOnnxOfflineTtsConfig *config) { + sherpa_onnx::OfflineTtsConfig tts_config; + + tts_config.model.vits.model = SHERPA_ONNX_OR(config->model.vits.model, ""); + tts_config.model.vits.lexicon = + SHERPA_ONNX_OR(config->model.vits.lexicon, ""); + tts_config.model.vits.tokens = SHERPA_ONNX_OR(config->model.vits.tokens, ""); + tts_config.model.vits.noise_scale = + SHERPA_ONNX_OR(config->model.vits.noise_scale, 0.667); + tts_config.model.vits.noise_scale_w = + SHERPA_ONNX_OR(config->model.vits.noise_scale_w, 0.8); + tts_config.model.vits.length_scale = + SHERPA_ONNX_OR(config->model.vits.length_scale, 1.0); + + tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); + tts_config.model.debug = config->model.debug; + tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); + + if (tts_config.model.debug) { + fprintf(stderr, "%s\n", tts_config.ToString().c_str()); + } + + SherpaOnnxOfflineTts *tts = new SherpaOnnxOfflineTts; + + tts->impl = 
std::make_unique<sherpa_onnx::OfflineTts>(tts_config); + + return tts; +} + +void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; } + +const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate( + const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid) { + sherpa_onnx::GeneratedAudio audio = tts->impl->Generate(text, sid); + + if (audio.samples.empty()) { + return nullptr; + } + + SherpaOnnxGeneratedAudio *ans = new SherpaOnnxGeneratedAudio; + + float *samples = new float[audio.samples.size()]; + std::copy(audio.samples.begin(), audio.samples.end(), samples); + + ans->samples = samples; + ans->n = audio.samples.size(); + ans->sample_rate = audio.sample_rate; + + return ans; +} + +SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTtsGeneratedAudio( + const SherpaOnnxGeneratedAudio *p) { + if (p) { + delete[] p->samples; + delete p; + } +} + +int32_t SherpaOnnxDestroyOfflineWriteWave(const SherpaOnnxGeneratedAudio *p, + const char *filename) { + return sherpa_onnx::WriteWave(filename, p->sample_rate, p->samples, p->n); +} diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 05b79a806..2898df30c 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -595,6 +595,62 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment( SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset( SherpaOnnxVoiceActivityDetector *p); +// ============================================================ +// For offline Text-to-Speech (i.e., non-streaming TTS) +// ============================================================ +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { + const char *model; + const char *lexicon; + const char *tokens; + + float noise_scale; + float noise_scale_w; + float length_scale; // < 1, faster in speed; > 1, slower in speed +} SherpaOnnxOfflineTtsVitsModelConfig; + +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { + SherpaOnnxOfflineTtsVitsModelConfig vits; + int32_t num_threads; + int32_t 
debug; + const char *provider; +} SherpaOnnxOfflineTtsModelConfig; + +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { + SherpaOnnxOfflineTtsModelConfig model; +} SherpaOnnxOfflineTtsConfig; + +SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { + const float *samples; // in the range [-1, 1] + int32_t n; // number of samples + int32_t sample_rate; +} SherpaOnnxGeneratedAudio; + +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; + +// Create an instance of offline TTS. Free the returned pointer with +// SherpaOnnxDestroyOfflineTts() to avoid a memory leak. +SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( + const SherpaOnnxOfflineTtsConfig *config); + +// Free the pointer returned by SherpaOnnxCreateOfflineTts() +SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts); + +// Generate audio from the given text and speaker id (sid). +// Free the returned pointer with SherpaOnnxDestroyOfflineTtsGeneratedAudio() +// to avoid a memory leak. Returns NULL if no samples are generated. +SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate( + const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid); + +SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTtsGeneratedAudio( + const SherpaOnnxGeneratedAudio *p); + +// Write the generated audio to a wave file. +// The saved wave file contains a single channel and has 16-bit samples. +// +// Return 1 if the write succeeded; return 0 on failure. 
+SHERPA_ONNX_API int32_t SherpaOnnxDestroyOfflineWriteWave( + const SherpaOnnxGeneratedAudio *p, const char *filename); + #if defined(__GNUC__) #pragma GCC diagnostic pop #endif diff --git a/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc b/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc index 6097468a2..835d09e34 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc @@ -40,7 +40,7 @@ wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt Please see https://k2-fsa.github.io/sherpa/onnx/tts/index.html -or detailes. +for details. )usage"; sherpa_onnx::ParseOptions po(kUsageMessage);