diff --git a/.gitignore b/.gitignore index f76303f..a24ff2e 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,4 @@ release/* *.exe deps.zip +README.md.backup diff --git a/EnglishPhoneticProcessor.cpp b/EnglishPhoneticProcessor.cpp index 3aff2ba..7bb17e3 100644 --- a/EnglishPhoneticProcessor.cpp +++ b/EnglishPhoneticProcessor.cpp @@ -9,7 +9,7 @@ bool EnglishPhoneticProcessor::Initialize(Phonemizer* InPhn) Phoner = InPhn; Tokenizer.SetAllowedChars(Phoner->GetGraphemeChars()); - Tokenizer.SetNumberText(Phoner->GetNumTxt(),Phoner->GetNumTxtLang()); + @@ -17,19 +17,19 @@ bool EnglishPhoneticProcessor::Initialize(Phonemizer* InPhn) } -std::string EnglishPhoneticProcessor::ProcessTextPhonetic(const std::string& InText, const std::vector &InPhonemes, const std::vector& InDict, ETTSLanguage::Enum InLanguage, bool IsTac) +std::string EnglishPhoneticProcessor::ProcessTextPhonetic(const std::string& InText, const std::vector &InPhonemes, const std::vector& InDict, ETTSLanguageType::Enum InLanguageType, bool IsTac) { if (!Phoner) return "ERROR"; - vector Words = Tokenizer.Tokenize(InText,InLanguage,IsTac); + vector Words = Tokenizer.Tokenize(InText,IsTac); string Assemble = ""; - // If language is negative, this is char-based model. - if (InLanguage < 0) + + if (InLanguageType == ETTSLanguageType::Char) { for (size_t w = 0; w < Words.size();w++) { diff --git a/EnglishPhoneticProcessor.h b/EnglishPhoneticProcessor.h index 7056289..d179de3 100644 --- a/EnglishPhoneticProcessor.h +++ b/EnglishPhoneticProcessor.h @@ -18,9 +18,11 @@ class EnglishPhoneticProcessor public: bool Initialize(Phonemizer *InPhn); - std::string ProcessTextPhonetic(const std::string& InText, const std::vector &InPhonemes, const std::vector& InDict, ETTSLanguage::Enum InLanguage, bool IsTac); + std::string ProcessTextPhonetic(const std::string& InText, const std::vector &InPhonemes, const std::vector& InDict, ETTSLanguageType::Enum InLanguageType, bool IsTac); EnglishPhoneticProcessor(); EnglishPhoneticProcessor(Phonemizer *InPhn); ~EnglishPhoneticProcessor(); + + inline TextTokenizer& GetTokenizer() {return Tokenizer;} }; diff --git a/README.md b/README.md index 5db47d6..65872af 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# TensorVox +# TensorVox -[![Join the chat at https://gitter.im/TensorVox/community](https://badges.gitter.im/TensorVox/community.svg)](https://gitter.im/TensorVox/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![](https://dcbadge.vercel.app/api/server/yqFDAWH)](https://discord.gg/yqFDAWH) TensorVox is an application designed to enable user-friendly and lightweight neural speech synthesis in the desktop, aimed at increasing accessibility to such technology. -Powered mainly by [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) and also by [Coqui-TTS](https://github.com/coqui-ai/TTS), it is written in pure C++/Qt, using the Tensorflow C API for interacting with the models. This way, we can perform inference without having to install gigabytes worth of Python libraries, just a 100MB DLL. +Powered mainly by [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) and also by [Coqui-TTS](https://github.com/coqui-ai/TTS) and [VITS](https://github.com/jaywalnut310/vits), it is written in pure C++/Qt, using the Tensorflow C API for interacting with Tensorflow models (first two), and LibTorch for PyTorch ones. This way, we can perform inference without having to install gigabytes worth of Python libraries, just a few DLLs. 
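Review note on the EnglishPhoneticProcessor hunk above: the refactor replaces the old "negative language ID means char-based" convention with an explicit `ETTSLanguageType` check. A condensed sketch of the new dispatch, assuming the members shown in this diff — the real method also takes the phoneme list and phonetic dictionary, which are elided here:

```cpp
// Sketch only: dispatch on language type instead of on a negative language ID.
// Phoner->ProcessWord is the existing G2P entry point.
std::string EnglishPhoneticProcessor::ProcessTextPhonetic(
    const std::string& InText, ETTSLanguageType::Enum InLanguageType, bool IsTac)
{
    if (!Phoner)
        return "ERROR";

    std::vector<std::string> Words = Tokenizer.Tokenize(InText, IsTac);
    std::string Assemble;

    if (InLanguageType == ETTSLanguageType::Char) {
        // Char-based model: feed graphemes through untouched.
        for (const std::string& Word : Words)
            Assemble += Word + " ";
    } else {
        // Phoneme-based model: run each word through the phonemizer.
        for (const std::string& Word : Words)
            Assemble += Phoner->ProcessWord(Word) + " ";
    }
    return Assemble;
}
```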
![Interface with Tac2 model loaded](https://i.imgur.com/wtPzzNh.png) @@ -20,14 +20,14 @@ If you're interested in using your own model, first you need to train then expor ## Supported architectures -TensorVox supports models from two main repos: +TensorVox supports models from three repos: - **TensorFlowTTS**: FastSpeech2, Tacotron2, both char and phoneme based and Multi-Band MelGAN. Here's a Colab notebook demonstrating how to export the LJSpeech pretrained, char-based Tacotron2 model: [](https://colab.research.google.com/drive/1KLqZ1rkD4Enw7zpTgXGL6if7e5s0UeWa?usp=sharing) - **Coqui-TTS:** Tacotron2 (phoneme-based IPA) and Multi-Band MelGAN, after converting from PyTorch to Tensorflow. Here's a notebook showing how to export the LJSpeech DDC model: [](https://colab.research.google.com/drive/15CdGEAu_-KezV1XxwzVfQiFSm0tveBkC?usp=sharing) + - **jaywalnut310/VITS:** VITS, a fully E2E (end-to-end) model that uses stressed IPA as phonemes. Export notebook: [](https://colab.research.google.com/drive/1BSGE5DQYweXBWrwPOmb6CRPUU8H5mBvb?usp=sharing) - -Those two examples should provide you with enough guidance to understand what is needed. If you're looking to train a model specifically for this purpose then I recommend TensorFlowTTS, as it is the one with the best support. -As for languages, out-of-the-box support is provided for English (both Coqui and TFTTS), German and Spanish (only TensorFlowTTS); that is, you won't have to modify any code. +Those three examples should provide you with enough guidance to understand what is needed. If you're looking to train a model specifically for this purpose, I recommend TensorFlowTTS, as it has the best support, or VITS, as it's the closest thing to perfect. +As for languages, out-of-the-box support is provided for English (Coqui, TFTTS and VITS), German and Spanish (TensorFlowTTS only); that is, you won't have to do anything. You can also add languages without modifying code, as long as the phoneme set is IPA (stressed or unstressed), ARPA, or GlobalPhone (open an issue and I'll explain how). ## Build instructions @@ -39,16 +39,18 @@ Currently, only Windows 10 x64 (although I've heard reports of it running on 8.1 **Primed build (with all provided libraries):** - 1. Download [precompiled binary dependencies and includes](https://drive.google.com/file/d/1ufLQvH-Me2NLmzNBkjcyD13WTyHb35aB/view?usp=sharing) + 1. Download [precompiled binary dependencies and includes](https://drive.google.com/file/d/1N6IxSpsgemS94z_v82toXhiNs2tLXkz6/view?usp=sharing) 2. Unzip it so that the `deps` folder is in the same place as the .pro and main source files. 3. Open the project with Qt Creator, add your compiler and compile -Note that to try your shiny new executable you'll need to download the program as described above and insert the `models` folder where your new build is output. +Note that to try your shiny new executable you'll need to download a release of the program as described above and replace its executable with your new one, so that all the DLLs are in place. TODO: Add instructions for compile from scratch.
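Review note: in code terms, "fully E2E" means the VITS wrapper returns waveform samples directly, so the vocoder stage is skipped entirely. A hedged condensation of the flow that `Voice::Vocalize` implements later in this diff — `SynthesizeIDs` is a hypothetical name and argument handling is simplified:

```cpp
// Sketch: E2E (VITS) vs. two-stage (text2mel + vocoder) synthesis.
std::vector<float> Voice::SynthesizeIDs(const std::vector<int32_t>& InputIDs)
{
    if (VoxInfo.Architecture.Text2Mel == EText2MelModel::VITS) {
        // Fully E2E: the "mel predictor" already returns audio samples.
        return MelPredictor->DoInference(InputIDs, {/*Speed*/ 1.0f}, {}).Data;
    }

    // Two-stage: text2mel first, then the Multi-Band MelGAN vocoder.
    TFTensor<float> Mel = MelPredictor->DoInference(InputIDs, {}, {});
    return Vocoder.DoInference(Mel).Data;
}
```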
## Externals (and thanks) + - **LibTorch**: https://pytorch.org/cppdocs/installing.html + - **Tensorflow C API**: [https://www.tensorflow.org/install/lang_c](https://www.tensorflow.org/install/lang_c) - **CppFlow** (TF C API -> C++ wrapper): [https://github.com/serizba/cppflow](https://github.com/serizba/cppflow) - **AudioFile** (for WAV export): [https://github.com/adamstark/AudioFile](https://github.com/adamstark/AudioFile) diff --git a/TensorVox.pro b/TensorVox.pro index 34db052..c37464f 100644 --- a/TensorVox.pro +++ b/TensorVox.pro @@ -44,6 +44,7 @@ SOURCES += \ tacotron2.cpp \ tfg2p.cpp \ track.cpp \ + vits.cpp \ voicemanager.cpp \ voxer.cpp @@ -84,6 +85,7 @@ HEADERS += \ tacotron2.h \ tfg2p.h \ track.h \ + vits.h \ voicemanager.h \ voxer.h @@ -103,8 +105,9 @@ else: unix:!android: target.path = /opt/$${TARGET}/bin DEFINES += _CRT_SECURE_NO_WARNINGS INCLUDEPATH += $$PWD/deps/include +INCLUDEPATH += $$PWD/deps/include/libtorch INCLUDEPATH += $$PWD/ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow -win32: LIBS += -L$$PWD/deps/lib/ tensorflow.lib r8bsrc64.lib rnnoise64.lib LogitechLEDLib.lib LibNumberText64.lib +win32: LIBS += -L$$PWD/deps/lib/ tensorflow.lib r8bsrc64.lib rnnoise64.lib LogitechLEDLib.lib LibNumberText64.lib c10.lib torch.lib torch_cpu.lib win32: LIBS += Advapi32.lib User32.lib Psapi.lib @@ -115,7 +118,7 @@ RESOURCES += \ win32:RC_ICONS += winicon.ico -VERSION = 0.9.0.0 +VERSION = 1.0.0.0 CONFIG += force_debug_info QMAKE_CXXFLAGS += /std:c++17 /utf-8 -DPSAPI_VERSION=1 diff --git a/TextTokenizer.cpp b/TextTokenizer.cpp index cfd2425..602bdc5 100644 --- a/TextTokenizer.cpp +++ b/TextTokenizer.cpp @@ -140,7 +140,7 @@ void TextTokenizer::SetNumberText(Numbertext &INum, const string &Lang) -vector TextTokenizer::Tokenize(const std::string & InTxt,ETTSLanguage::Enum Language,bool IsTacotron) +vector TextTokenizer::Tokenize(const std::string & InTxt,bool IsTacotron) { vector ProcessedTokens; diff --git a/TextTokenizer.h b/TextTokenizer.h index 550c3c7..5cc20c9 100644 --- a/TextTokenizer.h +++ b/TextTokenizer.h @@ -28,7 +28,7 @@ class TextTokenizer void SetNumberText(Numbertext& INum,const std::string& Lang); - std::vector Tokenize(const std::string& InTxt, ETTSLanguage::Enum Language = ETTSLanguage::EnglishPhn, bool IsTacotron = false); + std::vector Tokenize(const std::string& InTxt, bool IsTacotron = false); void SetAllowedChars(const std::string &value); }; diff --git a/Voice.cpp b/Voice.cpp index 1eebdff..259a492 100644 --- a/Voice.cpp +++ b/Voice.cpp @@ -4,6 +4,7 @@ std::vector Voice::CharsToID(const std::string & RawInTxt) { + std::cout << "CharsToID: " << RawInTxt << "\n"; std::vector VecPhones; std::u32string InTxt = VoxUtil::StrToU32(RawInTxt); @@ -30,6 +31,7 @@ std::vector Voice::CharsToID(const std::string & RawInTxt) std::vector Voice::PhonemesToID(const std::string & RawInTxt) { + std::cout << "PhonemesToID: " << RawInTxt << "\n"; ZStringDelimiter Delim(RawInTxt); Delim.AddDelimiter(" "); std::u32string InTxt = VoxUtil::StrToU32(RawInTxt); @@ -114,16 +116,28 @@ Voice::Voice(const std::string & VoxPath, const std::string &inName, Phonemizer VoxInfo = VoxUtil::ReadModelJSON(VoxPath + "/info.json"); - if (VoxInfo.Architecture.Text2Mel == EText2MelModel::Tacotron2) + const int32_t Tex2MelArch = VoxInfo.Architecture.Text2Mel; + + if (Tex2MelArch == EText2MelModel::Tacotron2) MelPredictor = std::make_unique(); - else + else if (Tex2MelArch == EText2MelModel::FastSpeech2) MelPredictor = std::make_unique(); + else + MelPredictor = std::make_unique(); + + + 
std::string MelPredInit = VoxPath + "/melgen"; + if (Tex2MelArch == EText2MelModel::VITS) + MelPredInit = VoxPath + "/vits.pt"; - MelPredictor->Initialize(VoxPath + "/melgen",(ETTSRepo::Enum)VoxInfo.Architecture.Repo); + MelPredictor->Initialize(MelPredInit,(ETTSRepo::Enum)VoxInfo.Architecture.Repo); - Vocoder.Initialize(VoxPath + "/vocoder"); + + if (Tex2MelArch != EText2MelModel::VITS) // No vocoder necessary for fully E2E TTS + Vocoder.Initialize(VoxPath + "/vocoder"); + if (InPhn) Processor.Initialize(InPhn); @@ -147,16 +161,22 @@ Voice::Voice(const std::string & VoxPath, const std::string &inName, Phonemizer void Voice::AddPhonemizer(Phonemizer *InPhn) { Processor.Initialize(InPhn); + Processor.GetTokenizer().SetNumberText(NumTxt,VoxCommon::CommonLangConst); } +void Voice::LoadNumberText(const std::string &NumTxtPath) +{ + NumTxt.load(VoxCommon::CommonLangConst,NumTxtPath); +} + std::string Voice::PhonemizeStr(const std::string &Prompt) { return Processor.ProcessTextPhonetic(Prompt,Phonemes,CurrentDict, - (ETTSLanguage::Enum)VoxInfo.Language, + (ETTSLanguageType::Enum)VoxInfo.LangType, true); // default voxistac to true to preserve punctuation. } @@ -167,14 +187,16 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea - bool VoxIsTac = VoxInfo.Architecture.Text2Mel == EText2MelModel::Tacotron2; + const int32_t Text2MelN = VoxInfo.Architecture.Text2Mel; + + bool VoxIsTac = Text2MelN != EText2MelModel::FastSpeech2; std::string PromptToFeed = Prompt; - if (VoxInfo.Language > -1) + if (VoxInfo.LangType != ETTSLanguageType::Char) PromptToFeed += VoxInfo.EndPadding; std::string PhoneticTxt = Processor.ProcessTextPhonetic(PromptToFeed,Phonemes,CurrentDict, - (ETTSLanguage::Enum)VoxInfo.Language, + (ETTSLanguageType::Enum)VoxInfo.LangType, VoxIsTac); TFTensor Mel; TFTensor Attention; @@ -184,7 +206,7 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea // Note to self: always check for negative or positive language by checking that it is lower than 0 // if we try greater than 0, English is missed. 
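Review note: the "note to self" comment above is obsolete under the V1 language spec introduced by this PR — `info.json` now carries a `Name-Method` string rather than a signed integer. A sketch of the mapping that `ReadModelJSON` performs (see VoxCommon.cpp below); `ParseLangSpec` is a hypothetical helper and the table mirrors the PR's `V1LangTypes`:

```cpp
#include <map>
#include <string>
#include <utility>

// Hypothetical helper: "English-ARPA" -> {"English", ETTSLanguageType::ARPA}.
// Assumes a well-formed "Name-Method" string.
std::pair<std::string, int32_t> ParseLangSpec(const std::string& FullName)
{
    static const std::map<std::string, int32_t> Types = {
        {"IPA", ETTSLanguageType::IPA},
        {"IPAStressed", ETTSLanguageType::IPA},
        {"ARPA", ETTSLanguageType::ARPA},
        {"Char", ETTSLanguageType::Char},
        {"GlobalPhone", ETTSLanguageType::GlobalPhone}};

    const size_t Dash = FullName.find('-');
    return {FullName.substr(0, Dash),             // s_Language, e.g. "English"
            Types.at(FullName.substr(Dash + 1))}; // LangType
}
```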
- if (VoxInfo.Language < 0){ + if (VoxInfo.LangType == ETTSLanguageType::Char){ InputIDs = CharsToID(PhoneticTxt); InputIDs.push_back(std::stoi(VoxInfo.EndPadding)); @@ -192,7 +214,7 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea } else { - if (VoxInfo.s_Language.find("IPA") != std::string::npos) + if (VoxInfo.LangType == ETTSLanguageType::IPA) InputIDs = CharsToID(PhoneticTxt); else InputIDs = PhonemesToID(PhoneticTxt); @@ -206,22 +228,39 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea std::vector IntArgs; - if (VoxIsTac) + + if (Text2MelN == EText2MelModel::Tacotron2) { Mel = ((Tacotron2*)MelPredictor.get())->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID, EmotionID); Attention = ((Tacotron2*)MelPredictor.get())->Attention; } - else + else if (Text2MelN == EText2MelModel::FastSpeech2) { FloatArgs = {Speed,Energy,F0}; Mel = ((FastSpeech2*)MelPredictor.get())->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID, EmotionID); + }else + { + FloatArgs = {Speed}; + TFTensor Audio = MelPredictor.get()->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID,EmotionID); + Attention = ((VITS*)MelPredictor.get())->Attention; + + std::vector AudioData = Audio.Data; + + Mel.Shape.push_back(-1); // Tell the plotter that we have no mel to plot + + // As VITS is fully E2E, we return here + + return {AudioData,Attention,Mel}; + } + // Vocoder inference + TFTensor AuData = Vocoder.DoInference(Mel); std::vector AudioData; @@ -269,7 +308,7 @@ void Voice::SetDictEntries(const std::vector &InEntries) { for (const DictEntry& Entr : InEntries) { - if (Entr.Language != VoxInfo.s_Language) + if (Entr.Language != VoxInfo.s_Language_Fullname) continue; CurrentDict.push_back(Entr); diff --git a/Voice.h b/Voice.h index f63d982..cfb021c 100644 --- a/Voice.h +++ b/Voice.h @@ -4,6 +4,8 @@ #include "tacotron2.h" #include "MultiBandMelGAN.h" #include "EnglishPhoneticProcessor.h" +#include "vits.h" +#include "Numbertext.hxx" #include "phoneticdict.h" @@ -51,6 +53,8 @@ class Voice std::string ModelInfo; std::vector CharsToID(const std::string &RawInTxt); + + Numbertext NumTxt; public: /* Voice constructor, arguments obligatory. -> VoxPath: Path of folder where models are contained. 
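Review note: worth highlighting from the `Vocalize` hunk above — the encoder choice now keys off `LangType`. A compressed sketch, assuming the `Voice` members shown in this diff (`EncodeForModel` is a hypothetical name):

```cpp
// Sketch: IPA and char-based models are encoded one ID per UTF-32 character,
// while ARPA/GlobalPhone models get one ID per space-delimited phoneme token.
std::vector<int32_t> Voice::EncodeForModel(const std::string& PhoneticTxt)
{
    if (VoxInfo.LangType == ETTSLanguageType::Char ||
        VoxInfo.LangType == ETTSLanguageType::IPA)
        return CharsToID(PhoneticTxt);

    return PhonemesToID(PhoneticTxt);
}
```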
@@ -70,6 +74,7 @@ class Voice Voice(const std::string& VoxPath, const std::string& inName,Phonemizer* InPhn); void AddPhonemizer(Phonemizer* InPhn); + void LoadNumberText(const std::string& NumTxtPath); std::string PhonemizeStr(const std::string& Prompt); diff --git a/VoxCommon.cpp b/VoxCommon.cpp index 3054edc..fbc4458 100644 --- a/VoxCommon.cpp +++ b/VoxCommon.cpp @@ -4,14 +4,34 @@ using namespace nlohmann; #include #include // std::wstring_convert -const std::vector Text2MelNames = {"FastSpeech2","Tacotron2"}; -const std::vector VocoderNames = {"Multi-Band MelGAN","MelGAN-STFT"}; -const std::vector RepoNames = {"TensorflowTTS","Coqui-TTS"}; +const std::vector Text2MelNames = {"FastSpeech2","Tacotron2","VITS"}; +const std::vector VocoderNames = {"Multi-Band MelGAN","MelGAN-STFT",""}; +const std::vector RepoNames = {"TensorflowTTS","Coqui-TTS","jaywalnut310"}; const std::vector LanguageNames = {"English","Spanish", "German", "EnglishIPA"}; const std::vector LangaugeNamesNumToWords = {"en", "es","de","en"}; + + +#include "ext/ZCharScanner.h" + +const std::map LegacyToV1Lang = { + {-3,"German-Char"}, + {0,"English-ARPA"}, + {-1,"English-Char"}, + {3,"English-IPA"}, + {1,"Spanish-GlobalPhone"} + }; + +const std::map V1LangTypes ={ + {"IPA",ETTSLanguageType::IPA}, + {"IPAStressed",ETTSLanguageType::IPA}, + {"ARPA",ETTSLanguageType::ARPA}, + {"Char",ETTSLanguageType::Char}, + {"GlobalPhone",ETTSLanguageType::GlobalPhone} +}; + void VoxUtil::ExportWAV(const std::string & Filename, const std::vector& Data, unsigned SampleRate) { AudioFile::AudioBuffer Buffer; Buffer.resize(1); @@ -83,11 +103,30 @@ VoiceInfo VoxUtil::ReadModelJSON(const std::string &InfoFilename) CuArch.s_Vocoder = VocoderNames[CuArch.Vocoder]; // Language value for the info - int32_t RawLang = JS["language"].get(); + auto LangVal = JS["language"]; + + + std::string LanguageFullName; - // Language value for the vectors - int32_t LanguageValue = ProcessLanguageValue(RawLang); + if (LangVal.is_string()){ // V1 Language type standard model; see ETTSLanguageType enum desc on header + LanguageFullName = LangVal.get(); + + }else{ + // Convert legacy language to V1 + int32_t LegacyLang = JS["language"].get(); + LanguageFullName = LegacyToV1Lang.find(LegacyLang)->second; + + + } + + ZStringDelimiter LangDel(LanguageFullName); + LangDel.AddDelimiter("-"); + + std::string LangName = LangDel[0]; + std::string LangTypeStr = LangDel[1]; + + int32_t LangType = V1LangTypes.find(LangTypeStr)->second; @@ -95,9 +134,11 @@ VoiceInfo VoxUtil::ReadModelJSON(const std::string &InfoFilename) std::string EndToken = JS["pad"].get(); // If it's phonetic then it's the token str, like "@EOS" - if (RawLang > -1) + if (LangType != ETTSLanguageType::Char && EndToken.size()) EndToken = " " + EndToken; // In this case we add a space for separation since we directly append the value to the prompt + + VoiceInfo Inf{JS["name"].get(), JS["author"].get(), JS["version"].get(), @@ -105,10 +146,11 @@ VoiceInfo VoxUtil::ReadModelJSON(const std::string &InfoFilename) CuArch, JS["note"].get(), JS["sarate"].get(), - RawLang, - LanguageNames[LanguageValue], - LangaugeNamesNumToWords[LanguageValue], - EndToken}; + LangName, + LanguageFullName, + EndToken, + LangType + }; if (Inf.Note.size() > MaxNoteSize) Inf.Note = Inf.Note.substr(0,MaxNoteSize); diff --git a/VoxCommon.hpp b/VoxCommon.hpp index 32009f4..27b365b 100644 --- a/VoxCommon.hpp +++ b/VoxCommon.hpp @@ -3,16 +3,37 @@ VoxCommon.hpp : Defines common data structures and constants to be used with TensorVox */ #include + 
+#undef slots // https://github.com/pytorch/pytorch/issues/19405 + + +#pragma warning(push, 0) // LibTorch spams us with warnings +#include // One-stop header. +#pragma warning(pop) + +#define slots Q_SLOTS + #include #include "ext/AudioFile.hpp" #include "ext/CppFlow/ops.h" #include "ext/CppFlow/model.h" + + + #include + + #define IF_RETURN(cond,ret) if (cond){return ret;} const uint32_t CommonSampleRate = 48000; +namespace VoxCommon{ +const std::string CommonLangConst = "_std"; + + +} + // https://github.com/almogh52/rnnoise-cmake/blob/d981adb2e797216f456cfcf158f73761a29981f8/examples/rnnoise_demo.c#L31 const uint32_t RNNoiseFrameSize = 480; typedef std::vector> TensorVec; @@ -29,14 +50,16 @@ struct TFTensor { namespace ETTSRepo { enum Enum{ TensorflowTTS = 0, - CoquiTTS + CoquiTTS, + jaywalnut310 // OG VITS repo }; } namespace EText2MelModel { enum Enum{ FastSpeech2 = 0, - Tacotron2 + Tacotron2, + VITS }; } @@ -44,14 +67,17 @@ enum Enum{ namespace EVocoderModel{ enum Enum{ MultiBandMelGAN = 0, - MelGANSTFT // there is no architectural changes so we can use mb-melgan class for melgan-stft + MelGANSTFT, // there is no architectural changes so we can use mb-melgan class for melgan-stft + NullVocoder // For fully E2E models }; } +// ===========DEPRECATED=============== // Negative numbers denote character-based language, positive for phoneme based. Standard is char-equivalent language idx = negative(phn-based) // In case of English, since -0 doesn't exist, we use -1. // For example, German phonetic would be 3, and character based would be -3 // IPA-phn-based are mainly for Coqui +// ===========DEPRECATED=============== namespace ETTSLanguage{ enum Enum{ GermanChar = -3, @@ -65,6 +91,23 @@ enum Enum{ } +/* Language Spec Standard V1: +- Language is specified with a string from the JSON and the type is saved instead of relying +on ETTSLanguage enum. 
+-- The string is LanguageName-Method; for example English-StressedIPA, English-ARPA, German-Char +- Both the pre-V1 standard and the current one are supported +- The V1 standard does not require code changes to add new languages + +*/ + +namespace ETTSLanguageType{ +enum Enum{ + ARPA = 0, + Char, + IPA, + GlobalPhone +}; +} struct ArchitectureInfo{ @@ -89,11 +132,11 @@ struct VoiceInfo{ uint32_t SampleRate; - int32_t Language; - std::string s_Language; - std::string s_Language_Num; + std::string s_Language; // Language name = English-ARPA -> "English" + std::string s_Language_Fullname; // Full language name = "English-ARPA" std::string EndPadding; + int32_t LangType; @@ -101,6 +144,7 @@ namespace VoxUtil { + std::string U32ToStr(const std::u32string& InU32); std::u32string StrToU32(const std::string& InStr); @@ -109,6 +153,28 @@ namespace VoxUtil { VoiceInfo ReadModelJSON(const std::string& InfoFilename); + + // Copy PyTorch tensor + + template <typename D> + TFTensor<D> CopyTensor(at::Tensor& InTens){ + D* Data = InTens.data<D>(); + std::vector<int64_t> Shape = InTens.sizes().vec(); + + size_t TotalSize = 1; + + for (const int64_t& Dim : Shape) + TotalSize *= Dim; + + std::vector<D> DataVec = std::vector<D>(Data,Data + TotalSize); + + return TFTensor<D>{DataVec,Shape,TotalSize}; + + + } + + + // Copy CppFlow (TF) tensor template <typename D> TFTensor<D> CopyTensor(cppflow::tensor& InTens) { @@ -123,8 +189,8 @@ namespace VoxUtil { } - template <typename T, typename V> - bool FindInVec(V In, const std::vector<T>& Vec, size_t& OutIdx, size_t start = 0) { + template <typename T, typename VXVec1> + bool FindInVec(VXVec1 In, const std::vector<T>& Vec, size_t& OutIdx, size_t start = 0) { for (size_t xx = start;xx < Vec.size();xx++) { if (Vec[xx] == In) { @@ -139,8 +205,8 @@ namespace VoxUtil { return false; } - template <typename T, typename V> - bool FindInVec2(V In, const std::vector<T>& Vec, size_t& OutIdx, size_t start = 0) { + template <typename T, typename VXVec1> + bool FindInVec2(VXVec1 In, const std::vector<T>& Vec, size_t& OutIdx, size_t start = 0) { for (size_t xx = start;xx < Vec.size();xx++) { if (Vec[xx] == In) { diff --git a/g2p_train/train_and_export.py index 9326eba..0f04618 100644 --- a/g2p_train/train_and_export.py +++ b/g2p_train/train_and_export.py @@ -145,7 +145,7 @@ def main(): txtpadded, phnpadded, txtsize, phnsize, phn_wi, txt_wi, words, phns = preprocess(args.dict_path,args.char_tok_phn) yf = open(args.config_path,"r") - config = yaml.load(yf) + config = yaml.load(yf,Loader=yaml.FullLoader) yf.close() print("Finished preprocessing. 
Getting model") diff --git a/mainwindow.cpp b/mainwindow.cpp index 08bb150..96d5ef5 100644 --- a/mainwindow.cpp +++ b/mainwindow.cpp @@ -589,6 +589,8 @@ void MainWindow::PlayBuffer(QBuffer *pBuff,bool ByUser, int32_t RowID) if (MelSpec.Shape[0] != -1) PlotSpec(MelSpec,( ((float)NumSamples) / ((float)CommonSampleRate))); + else + ui->tabMetrics->setTabEnabled(1,false); @@ -776,7 +778,7 @@ void MainWindow::ProcessCurlies(QString &ModTxt) // Curlie processing not supported in IPA - if (GetCurrentVoice()->GetInfo().s_Language.find("IPA") != std::string::npos) + if (GetCurrentVoice()->GetInfo().LangType == ETTSLanguageType::IPA) { QMessageBox::critical((QWidget*)pDarkFw,"Warning","Curly brace phonetic text input processing not supported in IPA"); @@ -798,9 +800,9 @@ void MainWindow::ProcessCurlies(QString &ModTxt) - // Only English requires all phn input to be uppercase + // Only ARPA requires all phn input to be uppercase - if (GetCurrentVoice()->GetInfo().Language == 0) + if (GetCurrentVoice()->GetInfo().LangType == ETTSLanguageType::ARPA) Tk = Tk.toUpper(); NewTokens.push_back("@" + Tk); @@ -920,8 +922,12 @@ void MainWindow::on_btnLoad_clicked() LogiLedFlashLighting(0,100,100,5000,500); - if (VoMan[VoID]->GetInfo().Architecture.Text2Mel != EText2MelModel::Tacotron2) - ui->tabMetrics->setTabEnabled(2,false); + if (VoMan[VoID]->GetInfo().Architecture.Text2Mel == EText2MelModel::FastSpeech2) + ui->tabMetrics->setTabEnabled(2,false); // FS2 has no attention + + if (VoMan[VoID]->GetInfo().Architecture.Text2Mel == EText2MelModel::VITS) + ui->tabMetrics->setTabEnabled(1,false); // VITS has no mel + @@ -1187,10 +1193,19 @@ void MainWindow::HandleIsMultiSpeaker(size_t inVid) ArchitectureInfo Inf = CurrentVoice.GetInfo().Architecture; - if (Inf.Text2Mel == EText2MelModel::FastSpeech2) + if (Inf.Text2Mel == EText2MelModel::FastSpeech2 || Inf.Text2Mel == EText2MelModel::VITS) { ui->grpFs2Params->show(); - ui->chkBiPad->setEnabled(true); + + + bool IsFs2 = Inf.Text2Mel == EText2MelModel::FastSpeech2; + + ui->SubEnergy_2->setVisible(IsFs2); + ui->SubF0_2->setVisible(IsFs2); + + ui->chkBiPad->setEnabled(IsFs2); + + } else { @@ -1274,7 +1289,7 @@ void MainWindow::on_actionOverrides_triggered() } - if (VoMan[CurrentIndex]->GetInfo().Language < 0){ + if (VoMan[CurrentIndex]->GetInfo().LangType == ETTSLanguageType::Char){ QMessageBox::critical(FwParent,"Error","Phonetic overrides dictionary is not available for character-based models. 
Please use a phoneme-based model."); return; @@ -1289,7 +1304,7 @@ void MainWindow::on_actionOverrides_triggered() PhdDialog Dlg(FwParent); Dlg.Entrs = PhonDict.Entries; - Dlg.CurrentLang = VoMan[CurrentIndex]->GetInfo().s_Language; + Dlg.CurrentLang = VoMan[CurrentIndex]->GetInfo().s_Language_Fullname; FDlg.setContent(&Dlg); FDlg.ContentDlg(&Dlg); @@ -1316,6 +1331,7 @@ void MainWindow::SetDict() VoMan.SetDict(PhonDict.Entries); for (Voice*& Vo : VoMan.GetVoices()) { + Vo->SetDictEntries(PhonDict.Entries); } @@ -1528,6 +1544,12 @@ void MainWindow::on_tabMetrics_currentChanged(int index) ui->tabMetrics->setMinimumHeight(150); + } + if (index == 2) + { + ui->tabMetrics->setSizePolicy(QSizePolicy::Policy::Expanding,QSizePolicy::Policy::Expanding); + ui->tabMetrics->setMinimumHeight(225); + } update(); @@ -1565,6 +1587,7 @@ void MainWindow::PlotSpec(const TFTensor &InMel,float TimeInSecs) { UpdateIfDoSlides(); ui->widSpec->DoPlot(InMel,TimeInSecs); + ui->tabMetrics->setTabEnabled(1,true); } @@ -1584,7 +1607,7 @@ void MainWindow::on_actExAtt_triggered() { if (!ui->tabMetrics->isTabEnabled(2)) { - QMessageBox::critical(FwParent,"Error","There is no attention map to export. Only Tacotron 2 models generate alignment."); + QMessageBox::critical(FwParent,"Error","There is no attention map to export. Only Tacotron 2 or VITS models generate alignment."); return; @@ -1602,6 +1625,13 @@ void MainWindow::on_actExAtt_triggered() void MainWindow::on_actExSpec_triggered() { + if (!ui->tabMetrics->isTabEnabled(1)) + { + QMessageBox::critical(FwParent,"Error","There is no spectrogram to export."); + return; + + + } QString ofname = QFileDialog::getSaveFileName(FwParent, tr("Export PNG file"), "Spect", tr("PNG image (*.png)")); if (!ofname.size()) diff --git a/mainwindow.ui b/mainwindow.ui index 2122d4b..eb4a4af 100644 --- a/mainwindow.ui +++ b/mainwindow.ui @@ -115,108 +115,144 @@ + + + 0 + 4 + + - FastSpeech2 Parameters + Control Parameters + + 1 + + + QLayout::SetMinimumSize + - - - - - Energy - - - - - - - 200 - - - 100 - - - Qt::Horizontal - - - - - - - 100% - - - - + + + + 0 + 0 + + + + + + + Energy + + + + + + + 200 + + + 100 + + + Qt::Horizontal + + + + + + + 100% + + + + + - - - - - Speed - - - - - - - 200 - - - 100 - - - Qt::Horizontal - - - - - - - 100% - - - - + + + + 0 + 0 + + + + + + + Speed + + + + + + + 200 + + + 100 + + + Qt::Horizontal + + + + + + + 100% + + + + + - - - - - - 0 - 0 - - - - F0 - - - - - - - 200 - - - 100 - - - Qt::Horizontal - - - - - - - 100% - - - - + + + + 0 + 0 + + + + + + + + 0 + 0 + + + + F0 + + + + + + + 200 + + + 100 + + + Qt::Horizontal + + + + + + + 100% + + + + + @@ -339,7 +375,7 @@ 4 - 9 + 14 @@ -435,7 +471,7 @@ 0 - 1 + 4 diff --git a/melgen.h b/melgen.h index 0c74563..f4dc15b 100644 --- a/melgen.h +++ b/melgen.h @@ -1,18 +1,21 @@ #ifndef MELGEN_H #define MELGEN_H + + #include "ext/CppFlow/ops.h" #include "ext/CppFlow/model.h" - #include "VoxCommon.hpp" + #include // MelGen: base virtual class for mel generators class MelGen { private: - ETTSRepo::Enum CurrentRepo; + public: + ETTSRepo::Enum CurrentRepo; MelGen(); MelGen(const std::string& SavedModelFolder,ETTSRepo::Enum InTTSRepo); diff --git a/modelinfodlg.cpp b/modelinfodlg.cpp index 02d3d01..c97e8b4 100644 --- a/modelinfodlg.cpp +++ b/modelinfodlg.cpp @@ -23,7 +23,12 @@ void ModelInfoDlg::SetInfo(const QString &ModelName, const QString &Info, int32_ ui->lblModelTitle->setText(ModelName); - ui->lblModelArchitecture->setText("Architecture: " + Repo + " " + MelGen + " & " + Vocoder); + QString ArchShow = 
"Architecture: " + Repo + " " + MelGen; + + if (Vocoder.size()) + ArchShow += " & " + Vocoder; + + ui->lblModelArchitecture->setText(ArchShow); ui->lblSampleRate->setText("Sampling rate: " + QString::number(SampleRate / 1000) + "KHz"); QString ImgPath = QApplication::applicationDirPath() + "/models/" + ModelName + "/image.png"; diff --git a/phonemizer.cpp b/phonemizer.cpp index 0a76a98..f7ea486 100644 --- a/phonemizer.cpp +++ b/phonemizer.cpp @@ -157,12 +157,11 @@ Phonemizer::Phonemizer() } -bool Phonemizer::Initialize(const std::string InPath, const std::string &NLangName,bool Minimal) +bool Phonemizer::Initialize(const std::string InPath, bool Minimal) { IsMinimal = Minimal; - NumTxt.load(NLangName,InPath + "/" + NLangName + ".sor"); - NumTxtLang = NLangName; + // Load char indices CharId = GetDelimitedFile(InPath + "/char2id.txt"); diff --git a/phonemizer.h b/phonemizer.h index afb4fc3..feb35e9 100644 --- a/phonemizer.h +++ b/phonemizer.h @@ -4,7 +4,6 @@ #include #include #include -#include struct IdStr{ int32_t ID; @@ -30,7 +29,6 @@ class Phonemizer std::vector DictBuckets; - Numbertext NumTxt; std::string NumTxtLang; bool IsMinimal; @@ -54,7 +52,7 @@ class Phonemizer std::string PhnLanguage; public: - int32_t PhnLangID; + std::string PhnLangID; public: Phonemizer(); /* @@ -65,14 +63,14 @@ class Phonemizer * -- phn2id.txt: Translation from output ID from the model to phoneme * - A model/ folder where a G2P-Tensorflow model was saved as SavedModel * - dict.txt: Phonetic dictionary. First it searches the word there and if it can't be found then it uses the model. - * - (two-char name).sor: Name of num2text. + * * * If Minimal == true, it only requires the .sor and char2id (for determining allowed graphemes only, * the IDs can be arbitrary in this case) * A Minimal phonemizer only serves to hold values useful to the processor and tokenizer, for char-based models. 
*/ - bool Initialize(const std::string InPath, const std::string& NLangName, bool Minimal); + bool Initialize(const std::string InPath, bool Minimal); std::string ProcessWord(const std::string& InWord, float Temperature = 0.1f); @@ -81,8 +79,6 @@ class Phonemizer std::string GetGraphemeChars(); - inline Numbertext& GetNumTxt() {return NumTxt;} - ~Phonemizer(); inline const std::string& GetNumTxtLang() {return NumTxtLang;} diff --git a/phoneticdict.cpp b/phoneticdict.cpp index 484766e..dee09e6 100644 --- a/phoneticdict.cpp +++ b/phoneticdict.cpp @@ -1,5 +1,18 @@ #include "phoneticdict.h" #include "ext/ZFile.h" +#include <map> + +const std::map<std::string,std::string> LegToV1{ + {"English","English-ARPA"}, + {"Spanish","Spanish-GlobalPhone"} +}; + +void AutoConvertToV1(std::string& LangStr){ + auto It = LegToV1.find(LangStr); + if (It != LegToV1.end()) + LangStr = It->second; + +} ZFILE_IOVR(DictEntry,inentr){ right << inentr.Word; @@ -12,6 +25,9 @@ ZFILE_OOVR(DictEntry,entr){ right >> entr.Word; right >> entr.PhSpelling; right >> entr.Language; + + AutoConvertToV1(entr.Language); + return right; } diff --git a/vits.cpp b/vits.cpp new file mode 100644 index 0000000..d238b12 --- /dev/null +++ b/vits.cpp @@ -0,0 +1,80 @@ +#include "vits.h" + + +std::vector<int64_t> VITS::ZeroPadVec(const std::vector<int32_t> &InIDs) +{ + std::vector<int64_t> NewIDs; + NewIDs.reserve(InIDs.size() * 2); + + for (auto CharID : InIDs) + { + NewIDs.push_back(0); + NewIDs.push_back((int64_t)CharID); + + } + // Add final 0 + NewIDs.push_back(0); + + return NewIDs; + +} + +VITS::VITS() +{ + +} + +bool VITS::Initialize(const std::string &SavedModelFolder, ETTSRepo::Enum InTTSRepo) +{ + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). + + Model = torch::jit::load(SavedModelFolder); + + } + catch (const c10::Error& e) { + return false; + + } + + CurrentRepo = InTTSRepo; + return true; +} + +TFTensor<float> VITS::DoInference(const std::vector<int32_t> &InputIDs, const std::vector<float> &ArgsFloat, const std::vector<int32_t> ArgsInt, int32_t SpeakerID, int32_t EmotionID) +{ + std::vector<int64_t> PaddedIDs = ZeroPadVec(InputIDs); + std::vector<int64_t> inLen = { (int64_t)PaddedIDs.size() }; + + + // ZDisket: Is this really necessary? 
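// Review note (not part of the patch): regarding the question above, the
// requires_grad(false) on the TensorOptions below is most likely redundant for
// TorchScript inference. The conventional belt-and-braces alternative would be
// a scoped guard around the forward call instead, e.g.:
//
//     torch::NoGradGuard NoGrad;  // RAII: disables autograd in this scope
//     c10::IValue Output = Model.get_method("infer_ts")(inputs);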
+ torch::TensorOptions Opts = torch::TensorOptions().requires_grad(false); + + auto InIDS = torch::tensor(PaddedIDs, Opts).unsqueeze(0); + auto InLens = torch::tensor(inLen, Opts); + auto InLenScale = torch::tensor({ ArgsFloat[0]}, Opts); + + + std::vector<torch::jit::IValue> inputs{ InIDS,InLens,InLenScale }; + + // Infer + + c10::IValue Output = Model.get_method("infer_ts")(inputs); + + // Output = tuple (audio,att) + + auto OutputT = Output.toTuple(); + + // Grab audio + // [1, frames] -> [frames] + auto AuTens = OutputT.get()->elements()[0].toTensor().squeeze(); + + // Grab Attention + // [1, 1, x, y] -> [x, y] -> [y,x] -> [1, y, x] + auto AttTens = OutputT.get()->elements()[1].toTensor().squeeze().transpose(0,1).unsqueeze(0); + + Attention = VoxUtil::CopyTensor<float>(AttTens); + + return VoxUtil::CopyTensor<float>(AuTens); + +} diff --git a/vits.h b/vits.h new file mode 100644 index 0000000..d2fc766 --- /dev/null +++ b/vits.h @@ -0,0 +1,47 @@ +#ifndef VITS_H +#define VITS_H + + +#include "melgen.h" + + + + + +// VITS is a fully E2E model; no separate vocoder needed +class VITS : public MelGen +{ +private: + torch::jit::script::Module Model; + + // Most VITS models require zero-interspersed input IDs + std::vector<int64_t> ZeroPadVec(const std::vector<int32_t>& InIDs); + +public: + TFTensor<float> Attention; + + VITS(); + + // Since VITS runs on PyTorch, we override the loader + /* + Initialize and load the model + + -> SavedModelFolder: Not a folder, but the path to the TorchScripted .pt file + <- Returns: (bool)Success + */ + virtual bool Initialize(const std::string& SavedModelFolder, ETTSRepo::Enum InTTSRepo); + + + /* + Do inference on a VITS model. + + -> InputIDs: Input IDs of tokens for inference + -> SpeakerID: ID of the speaker in the model to do inference on. If single speaker, always leave at 0. If multispeaker, refer to your model. + -> ArgsFloat[0]: Length scale. 
+ + <- Returns: TFTensor with shape {frames} of audio data + */ + TFTensor<float> DoInference(const std::vector<int32_t>& InputIDs,const std::vector<float>& ArgsFloat,const std::vector<int32_t> ArgsInt, int32_t SpeakerID = 0, int32_t EmotionID = -1); +}; + +#endif // VITS_H diff --git a/voicemanager.cpp b/voicemanager.cpp index 808cdfc..f8a4415 100644 --- a/voicemanager.cpp +++ b/voicemanager.cpp @@ -2,12 +2,12 @@ #define SAFE_DELETE(pdel)if (pdel){delete pdel;} #include -Phonemizer* VoiceManager::LoadPhonemizer(const QString& InPhnLang,const QString& InNumberLang,int32_t InLangNum) +Phonemizer* VoiceManager::LoadPhonemizer(const QString& InPhnLang,int32_t InLangNum) { for (Phonemizer*& Phn : Phonemizers) { - if (Phn->PhnLangID == InLangNum) + if (Phn->GetPhnLanguage() == InPhnLang.toStdString()) return Phn; @@ -18,10 +18,10 @@ Phonemizer* VoiceManager::LoadPhonemizer(const QString& InPhnLang,const QString& // Initialize regularly or minimally CreatePhn->Initialize(QString(QCoreApplication::applicationDirPath() + "/g2p/" + InPhnLang).toStdString(), - InNumberLang.toStdString(), InLangNum < 0); + InLangNum == ETTSLanguageType::Char); CreatePhn->SetPhnLanguage(InPhnLang.toStdString()); - CreatePhn->PhnLangID = InLangNum; + Phonemizers.push_back(CreatePhn); @@ -34,9 +34,13 @@ size_t VoiceManager::LoadVoice(const QString &Voname) { Voice* NuVoice = new Voice(QString(QCoreApplication::applicationDirPath() + "/models/" + Voname).toStdString(),Voname.toStdString(),nullptr); - QString PLang = QString::fromStdString(NuVoice->GetInfo().s_Language); - QString NLang = QString::fromStdString(NuVoice->GetInfo().s_Language_Num); - NuVoice->AddPhonemizer(LoadPhonemizer(PLang,NLang,NuVoice->GetInfo().Language)); + QString PLang = QString::fromStdString(NuVoice->GetInfo().s_Language_Fullname); + NuVoice->AddPhonemizer(LoadPhonemizer(PLang,NuVoice->GetInfo().LangType)); + + std::string NumTxtPath = QString(QCoreApplication::applicationDirPath() + "/num2txt/" + + QString::fromStdString(NuVoice->GetInfo().s_Language) + ".sor").toStdString(); + + NuVoice->LoadNumberText(NumTxtPath); Voices.push_back(NuVoice); Voices[Voices.size() - 1]->SetDictEntries(ManDict); diff --git a/voicemanager.h b/voicemanager.h index 8ca1320..cc740ee 100644 --- a/voicemanager.h +++ b/voicemanager.h @@ -12,7 +12,7 @@ class VoiceManager std::vector<Phonemizer*> Phonemizers; - Phonemizer* LoadPhonemizer(const QString& InPhnLang, const QString& InNumberLang, int32_t InLangNum); + Phonemizer* LoadPhonemizer(const QString& InPhnLang, int32_t InLangNum);
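Review note: to close out the new files, a hedged usage sketch of the `VITS` wrapper declared above. The model path, token IDs, and sample rate are illustrative only; in the application the IDs come from `Voice`'s phonemization path and the sample rate from the model's `info.json`:

```cpp
#include "vits.h"
#include "VoxCommon.hpp"

int main()
{
    VITS Vits;

    // Initialize takes the TorchScripted .pt file, not a folder.
    if (!Vits.Initialize("models/MyVoice/vits.pt", ETTSRepo::jaywalnut310))
        return 1; // torch::jit::load threw; bad path or wrong file

    std::vector<int32_t> InputIDs = {12, 43, 7, 19}; // hypothetical token IDs
    // ArgsFloat[0] is the length scale (1.0 = normal speed); no int args needed.
    TFTensor<float> Audio = Vits.DoInference(InputIDs, {1.0f}, {});

    // Hypothetical rate; the real one is read from the model's info.json.
    VoxUtil::ExportWAV("out.wav", Audio.Data, 22050);
    return 0;
}
```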