Capacitron (#977)
* new CI config

* initial Capacitron implementation

* delete old unused file

* fix empty formatting changes

* update losses and training script

* fix previous commit

* fix commit

* Add Capacitron test and first round of test fixes

* revert formatter change

* add changes to the synthesizer

* add stepwise gradual lr scheduler and changes to the recipe

* add inference script for dev use

* feat: add posterior inference arguments to synth methods
- added reference wav and text args for posterior inference
- some formatting

* fix: add espeak flag to base_tts and dataset APIs
- use_espeak_phonemes flag was not implemented in those APIs
- espeak is now able to be utilised for phoneme generation
- necessary phonemizer for the Capacitron model

* chore: update training script and style
- training script includes the espeak flag and other hyperparams
- made style

* chore: fix linting

* feat: add Tacotron 2 support

* leftover from dev

* chore: rename parser args

* feat: extract optimizers
- created a separate optimizer class to merge the two optimizers

* chore: revert arbitrary trainer changes

* fmt: revert formatting bug

* formatting again

* formatting fixed

* fix: log func

* fix: update optimizer
- Implemented load_state_dict for continuing training

* fix: clean optimizer init for standard models

* improvement: purge espeak flags and add training scripts

* Delete capacitronT2.py

delete old training script, new one is pushed

* feat: capacitron trainer methods
- extracted Capacitron-specific training operations from the trainer into custom
methods in taco1 and taco2 models

* chore: renaming and merging capacitron and gst style args

* fix: bug fixes from the previous commit

* fix: implement state_dict method on CapacitronOptimizer

* fix: call method

* fix: inference naming

* Delete train_capacitron.py

* fix: synthesize

* feat: update tests

* chore: fix style

* Delete capacitron_inference.py

* fix: fix train tts t2 capacitron tests

* fix: double forward in T2 train step

* fix: double forward in T1 train step

* fix: run make style

* fix: remove unused import

* fix: test for T1 capacitron

* fix: make lint

* feat: add blizzard2013 recipes

* make style

* fix: update recipes

* chore: make style

* Plot test sentences in Tacotron

* chore: make style and fix import

* fix: call forward first before problematic floordiv op

* fix: update recipes

* feat: add min_audio_len to recipes

* aux_input["style_mel"]

* chore: make style

* Make capacitron T2 recipe more stable

* Remove T1 capacitron Ljspeech

* feat: implement new grad clipping routine and update configs

* make style

* Add pretrained checkpoints

* Add default vocoder

* Change trainer package

* Fix grad clip issue for tacotron

* Fix scheduler issue with tacotron

Co-authored-by: Eren Gölge <egolge@coqui.ai>
Co-authored-by: WeberJulian <julian.weber@hotmail.fr>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
4 people authored May 20, 2022
1 parent ee99a6c commit 8be21ec
Showing 20 changed files with 1,194 additions and 39 deletions.
30 changes: 30 additions & 0 deletions TTS/.models.json
@@ -119,6 +119,26 @@
"license": "apache 2.0",
"contact": "egolge@coqui.com"
}
},
"blizzard2013": {
"capacitron-t2-c50": {
"description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
"commit": "d6284e7",
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
"author": "Adam Froghyar @a-froghyar",
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
},
"capacitron-t2-c150": {
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
"commit": "d6284e7",
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
"author": "Adam Froghyar @a-froghyar",
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
}
}
},
"es": {
@@ -379,6 +399,16 @@
"contact": "egolge@coqui.ai"
}
},
"blizzard2013": {
"hifigan_v2": {
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
"commit": "d6284e7",
"author": "Adam Froghyar @a-froghyar",
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
}
},
"vctk": {
"hifigan_v2": {
"description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
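The two new Blizzard2013 Capacitron entries (and their HiFiGAN vocoder) become downloadable by name. A rough sketch of how such an entry resolves, assuming the stock `ModelManager` from `TTS.utils.manage` pointed at the bundled `.models.json`; the exact constructor arguments are my assumption, not part of this diff:

```python
from pathlib import Path

import TTS
from TTS.utils.manage import ModelManager

# Point the manager at the bundled .models.json edited above (assumed setup).
manager = ModelManager(Path(TTS.__file__).parent / ".models.json")

# Resolve the new entry by its name, derived from the json hierarchy (type/lang/dataset/model).
model_path, config_path, model_item = manager.download_model(
    "tts_models/en/blizzard2013/capacitron-t2-c50"
)
print(model_item["default_vocoder"])  # -> vocoder_models/en/blizzard2013/hifigan_v2
```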
6 changes: 6 additions & 0 deletions TTS/bin/synthesize.py
@@ -172,6 +172,10 @@ def main():
default=None,
)
parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
parser.add_argument(
"--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
)
parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
parser.add_argument(
"--list_speaker_idxs",
help="List available speaker ids for the defined multi-speaker model.",
@@ -308,6 +312,8 @@ def main():
args.language_idx,
args.speaker_wav,
reference_wav=args.reference_wav,
style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx,
)

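With the two new flags wired into the synthesizer call, prosody transfer can be driven from the command line. A hedged usage sketch (file paths and the example text are placeholders):

```python
# Run the synthesize CLI with a Capacitron prosody reference (illustrative paths).
import subprocess

subprocess.run(
    [
        "python", "TTS/bin/synthesize.py",
        "--model_name", "tts_models/en/blizzard2013/capacitron-t2-c150",
        "--text", "Nevertheless, we are not discouraged.",
        "--capacitron_style_wav", "/path/to/prosody_reference.wav",       # reference audio
        "--capacitron_style_text", "Transcript of the reference audio.",  # its transcription
        "--out_path", "capacitron_output.wav",
    ],
    check=True,
)
```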
44 changes: 44 additions & 0 deletions TTS/tts/configs/shared_configs.py
@@ -48,6 +48,50 @@ def check_values(
check_argument("gst_num_style_tokens", c, restricted=True, min_val=1, max_val=1000)


@dataclass
class CapacitronVAEConfig(Coqpit):
"""Defines the capacitron VAE Module
Args:
capacitron_capacity (int):
Defines the variational capacity limit of the prosody embeddings. Defaults to 150.
capacitron_VAE_embedding_dim (int):
Defines the size of the Capacitron embedding vector dimension. Defaults to 128.
capacitron_use_text_summary_embeddings (bool):
If True, use a text summary embedding in Capacitron. Defaults to True.
capacitron_text_summary_embedding_dim (int):
Defines the size of the capacitron text embedding vector dimension. Defaults to 128.
capacitron_use_speaker_embedding (bool):
If True, use speaker embeddings in Capacitron. Defaults to False.
capacitron_VAE_loss_alpha (float):
Weight for the VAE loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25.
capacitron_grad_clip (float):
Gradient clipping value for all gradients except beta. Defaults to 5.0.
"""

capacitron_loss_alpha: int = 1
capacitron_capacity: int = 150
capacitron_VAE_embedding_dim: int = 128
capacitron_use_text_summary_embeddings: bool = True
capacitron_text_summary_embedding_dim: int = 128
capacitron_use_speaker_embedding: bool = False
capacitron_VAE_loss_alpha: float = 0.25
capacitron_grad_clip: float = 5.0

def check_values(
self,
):
"""Check config fields"""
c = asdict(self)
super().check_values()
check_argument("capacitron_capacity", c, restricted=True, min_val=10, max_val=500)
check_argument("capacitron_VAE_embedding_dim", c, restricted=True, min_val=16, max_val=1024)
check_argument("capacitron_use_speaker_embedding", c, restricted=False)
check_argument("capacitron_text_summary_embedding_dim", c, restricted=False, min_val=16, max_val=512)
check_argument("capacitron_VAE_loss_alpha", c, restricted=False)
check_argument("capacitron_grad_clip", c, restricted=False)


@dataclass
class CharactersConfig(Coqpit):
"""Defines arguments for the `BaseCharacters` or `BaseVocabulary` and their subclasses.
9 changes: 8 additions & 1 deletion TTS/tts/configs/tacotron_config.py
@@ -1,7 +1,7 @@
from dataclasses import dataclass, field
from typing import List

from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig
from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig


@dataclass
@@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig):
gst_style_input (str):
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
this is not defined, the model uses a zero vector as an input. Defaults to None.
use_capacitron_vae (bool):
Enable / disable the use of Capacitron modules. Defaults to False.
capacitron_vae (CapacitronVAEConfig):
Instance of `CapacitronVAEConfig` class.
num_chars (int):
Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
num_speakers (int):
@@ -143,6 +147,9 @@ class TacotronConfig(BaseTTSConfig):
gst: GSTConfig = None
gst_style_input: str = None

use_capacitron_vae: bool = False
capacitron_vae: CapacitronVAEConfig = None

# model specific params
num_speakers: int = 1
num_chars: int = 0
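Taken together with the `CapacitronVAEConfig` added in `shared_configs.py`, enabling the module comes down to two config fields. A minimal sketch, assuming a Tacotron 2 setup; the values shown mirror the dataclass defaults rather than a tuned recipe:

```python
from TTS.tts.configs.shared_configs import CapacitronVAEConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config

config = Tacotron2Config(
    use_capacitron_vae=True,
    capacitron_vae=CapacitronVAEConfig(
        capacitron_capacity=150,                      # variational capacity limit C
        capacitron_VAE_embedding_dim=128,             # prosody embedding size
        capacitron_use_text_summary_embeddings=True,  # condition on a text summary
        capacitron_grad_clip=5.0,                     # clip all gradients except beta
    ),
)
```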
57 changes: 57 additions & 0 deletions TTS/tts/layers/losses.py
@@ -281,6 +281,10 @@ class TacotronLoss(torch.nn.Module):
def __init__(self, c, ga_sigma=0.4):
super().__init__()
self.stopnet_pos_weight = c.stopnet_pos_weight
self.use_capacitron_vae = c.use_capacitron_vae
if self.use_capacitron_vae:
self.capacitron_capacity = c.capacitron_vae.capacitron_capacity
self.capacitron_vae_loss_alpha = c.capacitron_vae.capacitron_VAE_loss_alpha
self.ga_alpha = c.ga_alpha
self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha
self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha
@@ -308,6 +312,9 @@ def __init__(self, c, ga_sigma=0.4):
# pylint: disable=not-callable
self.criterion_st = BCELossMasked(pos_weight=torch.tensor(self.stopnet_pos_weight)) if c.stopnet else None

# For dev purposes only
self.criterion_capacitron_reconstruction_loss = nn.L1Loss(reduction="sum")

def forward(
self,
postnet_output,
@@ -317,6 +324,7 @@ def forward(
stopnet_output,
stopnet_target,
stop_target_length,
capacitron_vae_outputs,
output_lens,
decoder_b_output,
alignments,
@@ -348,6 +356,55 @@
return_dict["decoder_loss"] = decoder_loss
return_dict["postnet_loss"] = postnet_loss

if self.use_capacitron_vae:
# extract capacitron vae infos
posterior_distribution, prior_distribution, beta = capacitron_vae_outputs

# KL divergence term between the posterior and the prior
kl_term = torch.mean(torch.distributions.kl_divergence(posterior_distribution, prior_distribution))

# Limit the mutual information between the data and latent space by the variational capacity limit
kl_capacity = kl_term - self.capacitron_capacity

# pass beta through softplus to keep it positive
beta = torch.nn.functional.softplus(beta)[0]

# This is the term going to the main Adam optimizer; we detach beta because
# beta is optimized by a separate SGD optimizer below
capacitron_vae_loss = beta.detach() * kl_capacity

# Normalize the capacitron_vae_loss as in L1Loss or MSELoss.
# After this, both the standard loss and the capacitron_vae_loss are on the same scale,
# so we don't need to use L1Loss and MSELoss in "sum" reduction mode.
# Note: the batch size is not considered because the L1Loss was computed in "sum" mode
# and divided by the batch size, so not dividing the capacitron_vae_loss by B is legitimate.

# get B T D dimension from input
B, T, D = mel_input.size()
# normalize
if self.config.loss_masking:
# if using masked loss, compute the effective T from the mask
T = output_lens.sum() / B

# Only for dev purposes to be able to compare the reconstruction loss with the values in the
# original Capacitron paper
return_dict["capaciton_reconstruction_loss"] = (
self.criterion_capacitron_reconstruction_loss(decoder_output, mel_input) / decoder_output.size(0)
) + kl_capacity

capacitron_vae_loss = capacitron_vae_loss / (T * D)
capacitron_vae_loss = capacitron_vae_loss * self.capacitron_vae_loss_alpha

# This is the term to purely optimise beta and to pass into the SGD optimizer
beta_loss = torch.negative(beta) * kl_capacity.detach()

loss += capacitron_vae_loss

return_dict["capacitron_vae_loss"] = capacitron_vae_loss
return_dict["capacitron_vae_beta_loss"] = beta_loss
return_dict["capacitron_vae_kl_term"] = kl_term
return_dict["capacitron_beta"] = beta

stop_loss = (
self.criterion_st(stopnet_output, stopnet_target, stop_target_length)
if self.config.stopnet
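In equation form (my notation, summarizing the code above): with posterior q, prior p, capacity C, weight alpha, and sg[·] denoting `detach`, the block computes

```latex
\beta = \operatorname{softplus}(\tilde{\beta}), \qquad
\mathcal{L}_{\text{VAE}} = \frac{\alpha}{T\,D}\,\operatorname{sg}[\beta]\,
    \bigl(\operatorname{KL}(q\,\|\,p) - C\bigr), \qquad
\mathcal{L}_{\beta} = -\,\beta\,\operatorname{sg}\!\bigl[\operatorname{KL}(q\,\|\,p) - C\bigr].
```

Only the first term is added to the loss handed to the main Adam optimizer; the beta term is returned as `capacitron_vae_beta_loss` for the separate SGD optimizer, which raises beta while the KL term exceeds the capacity C and lowers it once the KL falls below it, as in the Capacitron paper (https://arxiv.org/abs/1906.03402).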