Merge pull request #12 from ZDisket/vits
Add VITS (PyTorch) and V1 Language Standard
ZDisket authored Oct 16, 2022
2 parents 35bae2d + 34eac5d commit 0b233df
Showing 23 changed files with 557 additions and 181 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -77,3 +77,4 @@ release/*
*.exe

deps.zip
+README.md.backup
10 changes: 5 additions & 5 deletions EnglishPhoneticProcessor.cpp
@@ -9,27 +9,27 @@ bool EnglishPhoneticProcessor::Initialize(Phonemizer* InPhn)

Phoner = InPhn;
Tokenizer.SetAllowedChars(Phoner->GetGraphemeChars());
Tokenizer.SetNumberText(Phoner->GetNumTxt(),Phoner->GetNumTxtLang());




return true;
}


-std::string EnglishPhoneticProcessor::ProcessTextPhonetic(const std::string& InText, const std::vector<u32string> &InPhonemes, const std::vector<DictEntry>& InDict, ETTSLanguage::Enum InLanguage, bool IsTac)
+std::string EnglishPhoneticProcessor::ProcessTextPhonetic(const std::string& InText, const std::vector<u32string> &InPhonemes, const std::vector<DictEntry>& InDict, ETTSLanguageType::Enum InLanguageType, bool IsTac)
{
if (!Phoner)
return "ERROR";



-vector<string> Words = Tokenizer.Tokenize(InText,InLanguage,IsTac);
+vector<string> Words = Tokenizer.Tokenize(InText,IsTac);

string Assemble = "";

-// If language is negative, this is char-based model.
-if (InLanguage < 0)
+if (InLanguageType == ETTSLanguageType::Char)
{
for (size_t w = 0; w < Words.size();w++)
{
4 changes: 3 additions & 1 deletion EnglishPhoneticProcessor.h
@@ -18,9 +18,11 @@ class EnglishPhoneticProcessor

public:
bool Initialize(Phonemizer *InPhn);
-std::string ProcessTextPhonetic(const std::string& InText, const std::vector<std::u32string> &InPhonemes, const std::vector<DictEntry>& InDict, ETTSLanguage::Enum InLanguage, bool IsTac);
+std::string ProcessTextPhonetic(const std::string& InText, const std::vector<std::u32string> &InPhonemes, const std::vector<DictEntry>& InDict, ETTSLanguageType::Enum InLanguageType, bool IsTac);
EnglishPhoneticProcessor();
EnglishPhoneticProcessor(Phonemizer *InPhn);
~EnglishPhoneticProcessor();

inline TextTokenizer& GetTokenizer() {return Tokenizer;}
};

20 changes: 11 additions & 9 deletions README.md
@@ -1,10 +1,10 @@
-# TensorVox
+# TensorVox

[![Join the chat at https://gitter.im/TensorVox/community](https://badges.gitter.im/TensorVox/community.svg)](https://gitter.im/TensorVox/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![](https://dcbadge.vercel.app/api/server/yqFDAWH)](https://discord.gg/yqFDAWH)

TensorVox is an application designed to enable user-friendly and lightweight neural speech synthesis in the desktop, aimed at increasing accessibility to such technology.

-Powered mainly by [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) and also by [Coqui-TTS](https://github.com/coqui-ai/TTS), it is written in pure C++/Qt, using the Tensorflow C API for interacting with the models. This way, we can perform inference without having to install gigabytes worth of Python libraries, just a 100MB DLL.
+Powered mainly by [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) and also by [Coqui-TTS](https://github.com/coqui-ai/TTS) and [VITS](https://github.com/jaywalnut310/vits), it is written in pure C++/Qt, using the Tensorflow C API for interacting with Tensorflow models (first two), and LibTorch for PyTorch ones. This way, we can perform inference without having to install gigabytes worth of Python libraries, just a few DLLs.

![Interface with Tac2 model loaded](https://i.imgur.com/wtPzzNh.png)

@@ -20,14 +20,14 @@ If you're interested in using your own model, first you need to train then expor

## Supported architectures

-TensorVox supports models from two main repos:
+TensorVox supports models from three repos:

- **TensorFlowTTS**: FastSpeech2, Tacotron2, both char and phoneme based and Multi-Band MelGAN. Here's a Colab notebook demonstrating how to export the LJSpeech pretrained, char-based Tacotron2 model: [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/drive/1KLqZ1rkD4Enw7zpTgXGL6if7e5s0UeWa?usp=sharing)
- **Coqui-TTS:** Tacotron2 (phoneme-based IPA) and Multi-Band MelGAN, after converting from PyTorch to Tensorflow. Here's a notebook showing how to export the LJSpeech DDC model: [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/drive/15CdGEAu_-KezV1XxwzVfQiFSm0tveBkC?usp=sharing)
+- **jaywalnut310/VITS:** VITS, a fully E2E model (stressed IPA as phonemes). Export notebook: [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/drive/1BSGE5DQYweXBWrwPOmb6CRPUU8H5mBvb?usp=sharing)


-Those two examples should provide you with enough guidance to understand what is needed. If you're looking to train a model specifically for this purpose then I recommend TensorFlowTTS, as it is the one with the best support.
-As for languages, out-of-the-box support is provided for English (both Coqui and TFTTS), German and Spanish (only TensorFlowTTS); that is, you won't have to modify any code.
+Those examples should provide you with enough guidance to understand what is needed. If you're looking to train a model specifically for this purpose, I recommend TensorFlowTTS, as it has the best support, and VITS, as it's the closest thing to perfect.
+As for languages, out-of-the-box support is provided for English (Coqui, TFTTS, and VITS), German, and Spanish (TensorFlowTTS only); that is, you won't have to do anything. You can also add languages without modifying code, as long as the phoneme set is IPA (stressed or unstressed), ARPA, or GlobalPhone (open an issue and I'll explain how).


## Build instructions
@@ -39,16 +39,18 @@ Currently, only Windows 10 x64 (although I've heard reports of it running on 8.1

**Primed build (with all provided libraries):**

-1. Download [precompiled binary dependencies and includes](https://drive.google.com/file/d/1ufLQvH-Me2NLmzNBkjcyD13WTyHb35aB/view?usp=sharing)
+1. Download [precompiled binary dependencies and includes](https://drive.google.com/file/d/1N6IxSpsgemS94z_v82toXhiNs2tLXkz6/view?usp=sharing)
2. Unzip it so that the `deps` folder is in the same place as the .pro and main source files.
3. Open the project with Qt Creator, add your compiler and compile

-Note that to try your shiny new executable you'll need to download the program as described above and insert the `models` folder where your new build is output.
+Note that to try your shiny new executable you'll need to download a release of the program as described above and replace its executable with your new one, so that all the DLLs are in place.

TODO: Add instructions for compiling from scratch.

## Externals (and thanks)

+- **LibTorch**: https://pytorch.org/cppdocs/installing.html

- **Tensorflow C API**: [https://www.tensorflow.org/install/lang_c](https://www.tensorflow.org/install/lang_c)
- **CppFlow** (TF C API -> C++ wrapper): [https://github.com/serizba/cppflow](https://github.com/serizba/cppflow)
- **AudioFile** (for WAV export): [https://github.com/adamstark/AudioFile](https://github.com/adamstark/AudioFile)
7 changes: 5 additions & 2 deletions TensorVox.pro
@@ -44,6 +44,7 @@ SOURCES += \
tacotron2.cpp \
tfg2p.cpp \
track.cpp \
+vits.cpp \
voicemanager.cpp \
voxer.cpp

@@ -84,6 +85,7 @@ HEADERS += \
tacotron2.h \
tfg2p.h \
track.h \
+vits.h \
voicemanager.h \
voxer.h

@@ -103,8 +105,9 @@ else: unix:!android: target.path = /opt/$${TARGET}/bin
DEFINES += _CRT_SECURE_NO_WARNINGS

INCLUDEPATH += $$PWD/deps/include
+INCLUDEPATH += $$PWD/deps/include/libtorch
INCLUDEPATH += $$PWD/ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow
-win32: LIBS += -L$$PWD/deps/lib/ tensorflow.lib r8bsrc64.lib rnnoise64.lib LogitechLEDLib.lib LibNumberText64.lib
+win32: LIBS += -L$$PWD/deps/lib/ tensorflow.lib r8bsrc64.lib rnnoise64.lib LogitechLEDLib.lib LibNumberText64.lib c10.lib torch.lib torch_cpu.lib
win32: LIBS += Advapi32.lib User32.lib Psapi.lib


@@ -115,7 +118,7 @@ RESOURCES += \

win32:RC_ICONS += winicon.ico

-VERSION = 0.9.0.0
+VERSION = 1.0.0.0
CONFIG += force_debug_info

QMAKE_CXXFLAGS += /std:c++17 /utf-8 -DPSAPI_VERSION=1
2 changes: 1 addition & 1 deletion TextTokenizer.cpp
@@ -140,7 +140,7 @@ void TextTokenizer::SetNumberText(Numbertext &INum, const string &Lang)



-vector<string> TextTokenizer::Tokenize(const std::string & InTxt,ETTSLanguage::Enum Language,bool IsTacotron)
+vector<string> TextTokenizer::Tokenize(const std::string & InTxt,bool IsTacotron)
{
vector<string> ProcessedTokens;

2 changes: 1 addition & 1 deletion TextTokenizer.h
@@ -28,7 +28,7 @@ class TextTokenizer

void SetNumberText(Numbertext& INum,const std::string& Lang);

-std::vector<std::string> Tokenize(const std::string& InTxt, ETTSLanguage::Enum Language = ETTSLanguage::EnglishPhn, bool IsTacotron = false);
+std::vector<std::string> Tokenize(const std::string& InTxt, bool IsTacotron = false);
void SetAllowedChars(const std::string &value);
};

65 changes: 52 additions & 13 deletions Voice.cpp
@@ -4,6 +4,7 @@
std::vector<int32_t> Voice::CharsToID(const std::string & RawInTxt)
{

+std::cout << "CharsToID: " << RawInTxt << "\n";
std::vector<int32_t> VecPhones;

std::u32string InTxt = VoxUtil::StrToU32(RawInTxt);
@@ -30,6 +31,7 @@ std::vector<int32_t> Voice::CharsToID(const std::string & RawInTxt)

std::vector<int32_t> Voice::PhonemesToID(const std::string & RawInTxt)
{
+std::cout << "PhonemesToID: " << RawInTxt << "\n";
ZStringDelimiter Delim(RawInTxt);
Delim.AddDelimiter(" ");
std::u32string InTxt = VoxUtil::StrToU32(RawInTxt);
@@ -114,16 +116,28 @@ Voice::Voice(const std::string & VoxPath, const std::string &inName, Phonemizer

VoxInfo = VoxUtil::ReadModelJSON(VoxPath + "/info.json");

-if (VoxInfo.Architecture.Text2Mel == EText2MelModel::Tacotron2)
+const int32_t Tex2MelArch = VoxInfo.Architecture.Text2Mel;
+
+if (Tex2MelArch == EText2MelModel::Tacotron2)
 MelPredictor = std::make_unique<Tacotron2>();
-else
+else if (Tex2MelArch == EText2MelModel::FastSpeech2)
 MelPredictor = std::make_unique<FastSpeech2>();
+else
+MelPredictor = std::make_unique<VITS>();


+std::string MelPredInit = VoxPath + "/melgen";
+
+if (Tex2MelArch == EText2MelModel::VITS)
+MelPredInit = VoxPath + "/vits.pt";

-MelPredictor->Initialize(VoxPath + "/melgen",(ETTSRepo::Enum)VoxInfo.Architecture.Repo);
+MelPredictor->Initialize(MelPredInit,(ETTSRepo::Enum)VoxInfo.Architecture.Repo);

-Vocoder.Initialize(VoxPath + "/vocoder");
+if (Tex2MelArch != EText2MelModel::VITS) // No vocoder necessary for fully E2E TTS
+Vocoder.Initialize(VoxPath + "/vocoder");


if (InPhn)
Processor.Initialize(InPhn);
@@ -147,16 +161,22 @@ Voice::Voice(const std::string & VoxPath, const std::string &inName, Phonemizer
void Voice::AddPhonemizer(Phonemizer *InPhn)
{
Processor.Initialize(InPhn);
+Processor.GetTokenizer().SetNumberText(NumTxt,VoxCommon::CommonLangConst);


}

+void Voice::LoadNumberText(const std::string &NumTxtPath)
+{
+NumTxt.load(VoxCommon::CommonLangConst,NumTxtPath);
+}

std::string Voice::PhonemizeStr(const std::string &Prompt)
{


return Processor.ProcessTextPhonetic(Prompt,Phonemes,CurrentDict,
-(ETTSLanguage::Enum)VoxInfo.Language,
+(ETTSLanguageType::Enum)VoxInfo.LangType,
true); // default voxistac to true to preserve punctuation.

}
@@ -167,14 +187,16 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea



-bool VoxIsTac = VoxInfo.Architecture.Text2Mel == EText2MelModel::Tacotron2;
+const int32_t Text2MelN = VoxInfo.Architecture.Text2Mel;
+
+bool VoxIsTac = Text2MelN != EText2MelModel::FastSpeech2;

std::string PromptToFeed = Prompt;
-if (VoxInfo.Language > -1)
+if (VoxInfo.LangType != ETTSLanguageType::Char)
PromptToFeed += VoxInfo.EndPadding;

std::string PhoneticTxt = Processor.ProcessTextPhonetic(PromptToFeed,Phonemes,CurrentDict,
-(ETTSLanguage::Enum)VoxInfo.Language,
+(ETTSLanguageType::Enum)VoxInfo.LangType,
VoxIsTac);
TFTensor<float> Mel;
TFTensor<float> Attention;
@@ -184,15 +206,15 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea

-// Note to self: always check for negative or positive language by checking that it is lower than 0
-// if we try greater than 0, English is missed.
-if (VoxInfo.Language < 0){
+if (VoxInfo.LangType == ETTSLanguageType::Char){
InputIDs = CharsToID(PhoneticTxt);
InputIDs.push_back(std::stoi(VoxInfo.EndPadding));


}
else
{
-if (VoxInfo.s_Language.find("IPA") != std::string::npos)
+if (VoxInfo.LangType == ETTSLanguageType::IPA)
InputIDs = CharsToID(PhoneticTxt);
else
InputIDs = PhonemesToID(PhoneticTxt);
@@ -206,22 +228,39 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea
std::vector<int32_t> IntArgs;


-if (VoxIsTac)
+if (Text2MelN == EText2MelModel::Tacotron2)
{

Mel = ((Tacotron2*)MelPredictor.get())->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID, EmotionID);
Attention = ((Tacotron2*)MelPredictor.get())->Attention;

}
-else
+else if (Text2MelN == EText2MelModel::FastSpeech2)
{

FloatArgs = {Speed,Energy,F0};

Mel = ((FastSpeech2*)MelPredictor.get())->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID, EmotionID);

+}else
+{
+FloatArgs = {Speed};
+TFTensor<float> Audio = MelPredictor.get()->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID,EmotionID);
+Attention = ((VITS*)MelPredictor.get())->Attention;
+
+std::vector<float> AudioData = Audio.Data;
+
+Mel.Shape.push_back(-1); // Tell the plotter that we have no mel to plot
+
+// As VITS is fully E2E, we return here
+
+return {AudioData,Attention,Mel};
+
+}

// Vocoder inference


TFTensor<float> AuData = Vocoder.DoInference(Mel);
std::vector<float> AudioData;
@@ -269,7 +308,7 @@ void Voice::SetDictEntries(const std::vector<DictEntry> &InEntries)
{
for (const DictEntry& Entr : InEntries)
{
-if (Entr.Language != VoxInfo.s_Language)
+if (Entr.Language != VoxInfo.s_Language_Fullname)
continue;

CurrentDict.push_back(Entr);
5 changes: 5 additions & 0 deletions Voice.h
@@ -4,6 +4,8 @@
#include "tacotron2.h"
#include "MultiBandMelGAN.h"
#include "EnglishPhoneticProcessor.h"
+#include "vits.h"
+#include "Numbertext.hxx"

#include "phoneticdict.h"

@@ -51,6 +53,8 @@ class Voice
std::string ModelInfo;

std::vector<int32_t> CharsToID(const std::string &RawInTxt);

+Numbertext NumTxt;
public:
/* Voice constructor, arguments obligatory.
-> VoxPath: Path of folder where models are contained.
@@ -70,6 +74,7 @@ class Voice
Voice(const std::string& VoxPath, const std::string& inName,Phonemizer* InPhn);

void AddPhonemizer(Phonemizer* InPhn);
+void LoadNumberText(const std::string& NumTxtPath);


std::string PhonemizeStr(const std::string& Prompt);