From 8be21ec38734e780e787d07d7e979392d7d63f24 Mon Sep 17 00:00:00 2001 From: a-froghyar Date: Fri, 20 May 2022 16:17:11 +0200 Subject: [PATCH] Capacitron (#977) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * new CI config * initial Capacitron implementation * delete old unused file * fix empty formatting changes * update losses and training script * fix previous commit * fix commit * Add Capacitron test and first round of test fixes * revert formatter change * add changes to the synthesizer * add stepwise gradual lr scheduler and changes to the recipe * add inference script for dev use * feat: add posterior inference arguments to synth methods - added reference wav and text args for posterior inference - some formatting * fix: add espeak flag to base_tts and dataset APIs - use_espeak_phonemes flag was not implemented in those APIs - espeak is now able to be utilised for phoneme generation - necessary phonemizer for the Capacitron model * chore: update training script and style - training script includes the espeak flag and other hyperparams - made style * chore: fix linting * feat: add Tacotron 2 support * leftover from dev * chore:rename parser args * feat: extract optimizers - created a separate optimizer class to merge the two optimizers * chore: revert arbitrary trainer changes * fmt: revert formatting bug * formatting again * formatting fixed * fix: log func * fix: update optimizer - Implemented load_state_dict for continuing training * fix: clean optimizer init for standard models * improvement: purge espeak flags and add training scripts * Delete capacitronT2.py delete old training script, new one is pushed * feat: capacitron trainer methods - extracted capacitron specific training operations from the trainer into custom methods in taco1 and taco2 models * chore: renaming and merging capacitron and gst style args * fix: bug fixes from the previous commit * fix: implement state_dict method on CapacitronOptimizer * fix: call method * fix: inference naming * Delete train_capacitron.py * fix: synthesize * feat: update tests * chore: fix style * Delete capacitron_inference.py * fix: fix train tts t2 capacitron tests * fix: double forward in T2 train step * fix: double forward in T1 train step * fix: run make style * fix: remove unused import * fix: test for T1 capacitron * fix: make lint * feat: add blizzard2013 recipes * make style * fix: update recipes * chore: make style * Plot test sentences in Tacotron * chore: make style and fix import * fix: call forward first before problematic floordiv op * fix: update recipes * feat: add min_audio_len to recipes * aux_input["style_mel"] * chore: make style * Make capacitron T2 recipe more stable * Remove T1 capacitron Ljspeech * feat: implement new grad clipping routine and update configs * make style * Add pretrained checkpoints * Add default vocoder * Change trainer package * Fix grad clip issue for tacotron * Fix scheduler issue with tacotron Co-authored-by: Eren Gölge Co-authored-by: WeberJulian Co-authored-by: Eren Gölge --- TTS/.models.json | 30 +++ TTS/bin/synthesize.py | 6 + TTS/tts/configs/shared_configs.py | 44 ++++ TTS/tts/configs/tacotron_config.py | 9 +- TTS/tts/layers/losses.py | 57 +++++ TTS/tts/layers/tacotron/capacitron_layers.py | 205 ++++++++++++++++++ TTS/tts/layers/tacotron/gst_layers.py | 2 +- TTS/tts/models/base_tacotron.py | 79 ++++++- TTS/tts/models/tacotron.py | 106 +++++++-- TTS/tts/models/tacotron2.py | 98 ++++++++- TTS/tts/utils/synthesis.py | 37 +++- 
TTS/utils/capacitron_optimizer.py | 65 ++++++ TTS/utils/io.py | 2 + TTS/utils/synthesizer.py | 9 +- recipes/blizzard2013/README.md | 12 + .../train_capacitron_t1.py | 101 +++++++++ .../train_capacitron_t2.py | 117 ++++++++++ .../train_capacitron_t2.py | 115 ++++++++++ tests/tts_tests/test_tacotron2_model.py | 69 +++++- tests/tts_tests/test_tacotron_model.py | 70 +++++- 20 files changed, 1194 insertions(+), 39 deletions(-) create mode 100644 TTS/tts/layers/tacotron/capacitron_layers.py create mode 100644 TTS/utils/capacitron_optimizer.py create mode 100644 recipes/blizzard2013/README.md create mode 100644 recipes/blizzard2013/tacotron1-Capacitron/train_capacitron_t1.py create mode 100644 recipes/blizzard2013/tacotron2-Capacitron/train_capacitron_t2.py create mode 100644 recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py diff --git a/TTS/.models.json b/TTS/.models.json index 4870bc1f1c..660d479cf1 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -119,6 +119,26 @@ "license": "apache 2.0", "contact": "egolge@coqui.com" } + }, + "blizzard2013": { + "capacitron-t2-c50": { + "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip", + "commit": "d6284e7", + "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2", + "author": "Adam Froghyar @a-froghyar", + "license": "apache 2.0", + "contact": "adamfroghyar@gmail.com" + }, + "capacitron-t2-c150": { + "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip", + "commit": "d6284e7", + "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2", + "author": "Adam Froghyar @a-froghyar", + "license": "apache 2.0", + "contact": "adamfroghyar@gmail.com" + } } }, "es": { @@ -379,6 +399,16 @@ "contact": "egolge@coqui.ai" } }, + "blizzard2013": { + "hifigan_v2": { + "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip", + "commit": "d6284e7", + "author": "Adam Froghyar @a-froghyar", + "license": "apache 2.0", + "contact": "adamfroghyar@gmail.com" + } + }, "vctk": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts", diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index dc6e30b404..4e93535af6 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -172,6 +172,10 @@ def main(): default=None, ) parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None) + parser.add_argument( + "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None + ) + parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None) parser.add_argument( "--list_speaker_idxs", help="List available speaker ids for the defined multi-speaker model.", @@ -308,6 +312,8 @@ def main(): args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, + style_wav=args.capacitron_style_wav, + style_text=args.capacitron_style_text, reference_speaker_name=args.reference_speaker_idx, ) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 
b782117cb0..4704687c26 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -48,6 +48,50 @@ def check_values( check_argument("gst_num_style_tokens", c, restricted=True, min_val=1, max_val=1000) +@dataclass +class CapacitronVAEConfig(Coqpit): + """Defines the Capacitron VAE module. + Args: + capacitron_capacity (int): + Defines the variational capacity limit of the prosody embeddings. Defaults to 150. + capacitron_VAE_embedding_dim (int): + Defines the size of the Capacitron embedding vector dimension. Defaults to 128. + capacitron_use_text_summary_embeddings (bool): + If True, use a text summary embedding in Capacitron. Defaults to True. + capacitron_text_summary_embedding_dim (int): + Defines the size of the Capacitron text embedding vector dimension. Defaults to 128. + capacitron_use_speaker_embedding (bool): + If True, use speaker embeddings in Capacitron. Defaults to False. + capacitron_VAE_loss_alpha (float): + Weight for the VAE loss of the Tacotron model. If set to less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25. + capacitron_grad_clip (float): + Gradient clipping value for all gradients except beta. Defaults to 5.0. + """ + + capacitron_loss_alpha: int = 1 + capacitron_capacity: int = 150 + capacitron_VAE_embedding_dim: int = 128 + capacitron_use_text_summary_embeddings: bool = True + capacitron_text_summary_embedding_dim: int = 128 + capacitron_use_speaker_embedding: bool = False + capacitron_VAE_loss_alpha: float = 0.25 + capacitron_grad_clip: float = 5.0 + + def check_values( + self, + ): + """Check config fields""" + c = asdict(self) + super().check_values() + check_argument("capacitron_capacity", c, restricted=True, min_val=10, max_val=500) + check_argument("capacitron_VAE_embedding_dim", c, restricted=True, min_val=16, max_val=1024) + check_argument("capacitron_use_speaker_embedding", c, restricted=False) + check_argument("capacitron_text_summary_embedding_dim", c, restricted=False, min_val=16, max_val=512) + check_argument("capacitron_VAE_loss_alpha", c, restricted=False) + check_argument("capacitron_grad_clip", c, restricted=False) + + @dataclass class CharactersConfig(Coqpit): """Defines arguments for the `BaseCharacters` or `BaseVocabulary` and their subclasses. diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 5193c22438..e25609ffcf 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, field from typing import List -from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig +from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig @dataclass @@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig): gst_style_input (str): Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and this is not defined, the model uses a zero vector as an input. Defaults to None. + use_capacitron_vae (bool): + Enable / disable the use of Capacitron modules. Defaults to False. + capacitron_vae (CapacitronVAEConfig): + Instance of `CapacitronVAEConfig` class. num_chars (int): Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
num_speakers (int): @@ -143,6 +147,9 @@ class TacotronConfig(BaseTTSConfig): gst: GSTConfig = None gst_style_input: str = None + use_capacitron_vae: bool = False + capacitron_vae: CapacitronVAEConfig = None + # model specific params num_speakers: int = 1 num_chars: int = 0 diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index e03cf0840c..1f0961b303 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -281,6 +281,10 @@ class TacotronLoss(torch.nn.Module): def __init__(self, c, ga_sigma=0.4): super().__init__() self.stopnet_pos_weight = c.stopnet_pos_weight + self.use_capacitron_vae = c.use_capacitron_vae + if self.use_capacitron_vae: + self.capacitron_capacity = c.capacitron_vae.capacitron_capacity + self.capacitron_vae_loss_alpha = c.capacitron_vae.capacitron_VAE_loss_alpha self.ga_alpha = c.ga_alpha self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha @@ -308,6 +312,9 @@ def __init__(self, c, ga_sigma=0.4): # pylint: disable=not-callable self.criterion_st = BCELossMasked(pos_weight=torch.tensor(self.stopnet_pos_weight)) if c.stopnet else None + # For dev purposes only + self.criterion_capacitron_reconstruction_loss = nn.L1Loss(reduction="sum") + def forward( self, postnet_output, @@ -317,6 +324,7 @@ def forward( stopnet_output, stopnet_target, stop_target_length, + capacitron_vae_outputs, output_lens, decoder_b_output, alignments, @@ -348,6 +356,55 @@ def forward( return_dict["decoder_loss"] = decoder_loss return_dict["postnet_loss"] = postnet_loss + if self.use_capacitron_vae: + # extract capacitron vae info + posterior_distribution, prior_distribution, beta = capacitron_vae_outputs + + # KL divergence term between the posterior and the prior + kl_term = torch.mean(torch.distributions.kl_divergence(posterior_distribution, prior_distribution)) + + # Limit the mutual information between the data and latent space by the variational capacity limit + kl_capacity = kl_term - self.capacitron_capacity + + # pass beta through softplus to keep it positive + beta = torch.nn.functional.softplus(beta)[0] + + # This is the term going to the main ADAM optimiser; we detach beta because + # beta is optimised by a separate SGD optimiser below + capacitron_vae_loss = beta.detach() * kl_capacity + + # normalize the capacitron_vae_loss as in L1Loss or MSELoss. + # After this, both the standard loss and capacitron_vae_loss will be in the same scale. + # For this reason we don't need to use L1Loss and MSELoss in "sum" reduction mode. + # Note: the batch is not considered because the L1Loss was calculated in "sum" mode + # and divided by the batch size, so not dividing the capacitron_vae_loss by B is legitimate.
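Editor's note: a minimal, self-contained sketch (not part of the patch) of the detach pattern behind the two loss terms built in this hunk — capacitron_vae_loss moves only the model parameters through the main optimizer, while beta_loss moves only beta through the separate SGD optimizer. The KL value below is a toy stand-in chosen purely for illustration.

import torch  # minimal sketch of the Capacitron two-term objective

beta_param = torch.nn.Parameter(torch.zeros(1))     # raw beta, updated only by the secondary SGD optimizer
kl_term = torch.tensor(160.0, requires_grad=True)   # toy stand-in for KL(q||p) coming from the model graph
capacity = 150.0                                    # capacitron_capacity

beta = torch.nn.functional.softplus(beta_param)[0]  # softplus keeps beta positive
kl_capacity = kl_term - capacity                    # signed distance to the capacity limit

# Main-optimizer term: beta is detached, so gradients flow only into the model side (kl_term here)
capacitron_vae_loss = beta.detach() * kl_capacity

# SGD term: kl_capacity is detached, so gradients flow only into beta; the negation makes beta
# grow while the KL exceeds the capacity and shrink once the KL drops below the limit
beta_loss = torch.negative(beta) * kl_capacity.detach()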
+ + # get B T D dimension from input + B, T, D = mel_input.size() + # normalize + if self.config.loss_masking: + # if the loss is masked, use the mean valid length as T + T = output_lens.sum() / B + + # Only for dev purposes to be able to compare the reconstruction loss with the values in the + # original Capacitron paper + return_dict["capacitron_reconstruction_loss"] = ( + self.criterion_capacitron_reconstruction_loss(decoder_output, mel_input) / decoder_output.size(0) + ) + kl_capacity + + capacitron_vae_loss = capacitron_vae_loss / (T * D) + capacitron_vae_loss = capacitron_vae_loss * self.capacitron_vae_loss_alpha + + # This is the term to purely optimise beta and to pass into the SGD optimizer + beta_loss = torch.negative(beta) * kl_capacity.detach() + + loss += capacitron_vae_loss + + return_dict["capacitron_vae_loss"] = capacitron_vae_loss + return_dict["capacitron_vae_beta_loss"] = beta_loss + return_dict["capacitron_vae_kl_term"] = kl_term + return_dict["capacitron_beta"] = beta + stop_loss = ( self.criterion_st(stopnet_output, stopnet_target, stop_target_length) if self.config.stopnet diff --git a/TTS/tts/layers/tacotron/capacitron_layers.py b/TTS/tts/layers/tacotron/capacitron_layers.py new file mode 100644 index 0000000000..56fe44bc33 --- /dev/null +++ b/TTS/tts/layers/tacotron/capacitron_layers.py @@ -0,0 +1,205 @@ +import torch +from torch import nn +from torch.distributions.multivariate_normal import MultivariateNormal as MVN +from torch.nn import functional as F + + +class CapacitronVAE(nn.Module): + """Effective Use of Variational Embedding Capacity for prosody transfer. + + See https://arxiv.org/abs/1906.03402""" + + def __init__( + self, + num_mel, + capacitron_VAE_embedding_dim, + encoder_output_dim=256, + reference_encoder_out_dim=128, + speaker_embedding_dim=None, + text_summary_embedding_dim=None, + ): + super().__init__() + # Init distributions + self.prior_distribution = MVN( + torch.zeros(capacitron_VAE_embedding_dim), torch.eye(capacitron_VAE_embedding_dim) + ) + self.approximate_posterior_distribution = None + # the reference encoder outputs a fixed-size embedding of reference_encoder_out_dim + self.encoder = ReferenceEncoder(num_mel, out_dim=reference_encoder_out_dim) + + # Init beta, the Lagrange-like multiplier for the KL capacity constraint + self.beta = torch.nn.Parameter(torch.log(torch.exp(torch.Tensor([1.0])) - 1), requires_grad=True) + mlp_input_dimension = reference_encoder_out_dim + + if text_summary_embedding_dim is not None: + self.text_summary_net = TextSummary(text_summary_embedding_dim, encoder_output_dim=encoder_output_dim) + mlp_input_dimension += text_summary_embedding_dim + if speaker_embedding_dim is not None: + # TODO: Test a multispeaker model!
+ mlp_input_dimension += speaker_embedding_dim + self.post_encoder_mlp = PostEncoderMLP(mlp_input_dimension, capacitron_VAE_embedding_dim) + + def forward(self, reference_mel_info=None, text_info=None, speaker_embedding=None): + # Use reference + if reference_mel_info is not None: + reference_mels = reference_mel_info[0] # [batch_size, num_frames, num_mels] + mel_lengths = reference_mel_info[1] # [batch_size] + enc_out = self.encoder(reference_mels, mel_lengths) + + # concat speaker_embedding and/or text summary embedding + if text_info is not None: + text_inputs = text_info[0] # [batch_size, num_characters, num_embedding] + input_lengths = text_info[1] + text_summary_out = self.text_summary_net(text_inputs, input_lengths).to(reference_mels.device) + enc_out = torch.cat([enc_out, text_summary_out], dim=-1) + if speaker_embedding is not None: + enc_out = torch.cat([enc_out, speaker_embedding], dim=-1) + + # Feed the output of the ref encoder and information about text/speaker into + # an MLP to produce the parameters for the approximate posterior distribution + mu, sigma = self.post_encoder_mlp(enc_out) + # convert to cpu because prior_distribution was created on cpu + mu = mu.cpu() + sigma = sigma.cpu() + + # Sample from the posterior: z ~ q(z|x) + self.approximate_posterior_distribution = MVN(mu, torch.diag_embed(sigma)) + VAE_embedding = self.approximate_posterior_distribution.rsample() + # Infer from the model, bypasses encoding + else: + # Sample from the prior: z ~ p(z) + VAE_embedding = self.prior_distribution.sample().unsqueeze(0) + + # reshape to [batch_size, 1, capacitron_VAE_embedding_dim] + return VAE_embedding.unsqueeze(1), self.approximate_posterior_distribution, self.prior_distribution, self.beta + + +class ReferenceEncoder(nn.Module): + """NN module creating a fixed size prosody embedding from a spectrogram. + + inputs: mel spectrograms [batch_size, num_spec_frames, num_mel] + outputs: [batch_size, embedding_dim] + """ + + def __init__(self, num_mel, out_dim): + + super().__init__() + self.num_mel = num_mel + filters = [1] + [32, 32, 64, 64, 128, 128] + num_layers = len(filters) - 1 + convs = [ + nn.Conv2d( + in_channels=filters[i], out_channels=filters[i + 1], kernel_size=(3, 3), stride=(2, 2), padding=(2, 2) + ) + for i in range(num_layers) + ] + self.convs = nn.ModuleList(convs) + self.training = False + self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]]) + + post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers) + self.recurrence = nn.LSTM( + input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False + ) + + def forward(self, inputs, input_lengths): + batch_size = inputs.size(0) + x = inputs.view(batch_size, 1, -1, self.num_mel) # [batch_size, num_channels==1, num_frames, num_mel] + valid_lengths = input_lengths.float() # [batch_size] + for conv, bn in zip(self.convs, self.bns): + x = conv(x) + x = bn(x) + x = F.relu(x) + + # Create the post conv width mask based on the valid lengths of the output of the convolution.
# The valid length of the output of a convolution on a varying-length input is + # ceil(input_length/stride) + 1 for stride=2 and padding=2 + # For example (kernel_size=3, stride=2, padding=2): + # 0 0 x x x x x 0 0 -> Input = 5, 0 is zero padding, x is valid values coming from padding=2 in conv2d + # _____ + # x _____ + # x _____ + # x ____ + # x + # x x x x -> Output valid length = 4 + # Since every example in the batch is zero padded and therefore has a separate valid_length, + # we need to mask off all the values AFTER the valid length for each example in the batch. + # Otherwise, the convolutions create noise and a lot of spurious information + valid_lengths = (valid_lengths / 2).float() + valid_lengths = torch.ceil(valid_lengths).to(dtype=torch.int64) + 1 # 2 is stride -- size: [batch_size] + post_conv_max_width = x.size(2) + + mask = torch.arange(post_conv_max_width).to(inputs.device).expand( + len(valid_lengths), post_conv_max_width + ) < valid_lengths.unsqueeze(1) + mask = mask.expand(1, 1, -1, -1).transpose(2, 0).transpose(-1, 2) # [batch_size, 1, post_conv_max_width, 1] + x = x * mask + + x = x.transpose(1, 2) + # x: 4D tensor [batch_size, post_conv_width, + # num_channels==128, post_conv_height] + + post_conv_width = x.size(1) + x = x.contiguous().view(batch_size, post_conv_width, -1) + # x: 3D tensor [batch_size, post_conv_width, + # num_channels*post_conv_height] + + # Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding + post_conv_input_lengths = valid_lengths + packed_seqs = nn.utils.rnn.pack_padded_sequence( + x, post_conv_input_lengths.tolist(), batch_first=True, enforce_sorted=False + ) # dynamic rnn sequence padding + self.recurrence.flatten_parameters() + _, (ht, _) = self.recurrence(packed_seqs) + last_output = ht[-1] + + return last_output.to(inputs.device) # [B, 128] + + @staticmethod + def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs): + """Height of spec after n convolutions with fixed kernel/stride/pad.""" + for _ in range(n_convs): + height = (height - kernel_size + 2 * pad) // stride + 1 + return height + + +class TextSummary(nn.Module): + def __init__(self, embedding_dim, encoder_output_dim): + super().__init__() + self.lstm = nn.LSTM( + encoder_output_dim, # text embedding dimension from the text encoder + embedding_dim, # fixed length output summary the lstm creates from the input + batch_first=True, + bidirectional=False, + ) + + def forward(self, inputs, input_lengths): + # Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding + packed_seqs = nn.utils.rnn.pack_padded_sequence( + inputs, input_lengths.tolist(), batch_first=True, enforce_sorted=False + ) # dynamic rnn sequence padding + self.lstm.flatten_parameters() + _, (ht, _) = self.lstm(packed_seqs) + last_output = ht[-1] + return last_output + + +class PostEncoderMLP(nn.Module): + def __init__(self, input_size, hidden_size): + super().__init__() + self.hidden_size = hidden_size + modules = [ + nn.Linear(input_size, hidden_size), # Hidden Layer + nn.Tanh(), + nn.Linear(hidden_size, hidden_size * 2), + ] # Output layer twice the size for mean and variance + self.net = nn.Sequential(*modules) + self.softplus = nn.Softplus() + + def forward(self, _input): + mlp_output = self.net(_input) + # The mean parameter is unconstrained + mu = mlp_output[:, : self.hidden_size] + # The standard deviation must be positive.
Parameterise with a softplus + sigma = self.softplus(mlp_output[:, self.hidden_size :]) + return mu, sigma diff --git a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py index 7d751bc047..ec622e4db8 100644 --- a/TTS/tts/layers/tacotron/gst_layers.py +++ b/TTS/tts/layers/tacotron/gst_layers.py @@ -139,7 +139,7 @@ def forward(self, query, key): keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - # score = softmax(QK^T / (d_k ** 0.5)) + # score = softmax(QK^T / (d_k**0.5)) scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k] scores = scores / (self.key_dim**0.5) scores = F.softmax(scores, dim=3) diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 54939c61c1..c0f4c3392d 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -1,6 +1,6 @@ import copy from abc import abstractmethod -from typing import Dict +from typing import Dict, Tuple import torch from coqpit import Coqpit @@ -10,7 +10,9 @@ from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.tokenizer import TTSTokenizer +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.generic_utils import format_aux_input from TTS.utils.io import load_fsspec from TTS.utils.training import gradual_training_scheduler @@ -47,6 +49,11 @@ def __init__( self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim self.gst_layer = None + # Capacitron + if self.capacitron_vae and self.use_capacitron_vae: + self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim # add capacitron embedding dim + self.capacitron_vae_layer = None + # additional layers self.decoder_backward = None self.coarse_decoder = None @@ -125,6 +132,53 @@ def init_from_config(config: Coqpit): speaker_manager = SpeakerManager.init_from_config(config) return BaseTacotron(config, ap, tokenizer, speaker_manager) + ########################## + # TEST AND LOG FUNCTIONS # + ########################## + + def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + """Generic test run for `tts` models used by `Trainer`. + + You can override this for a different behaviour. + + Args: + assets (dict): A dict of training assets. For `tts` models, it must include `{'audio_processor': ap}`. + + Returns: + Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. 
+ """ + print(" | > Synthesizing test sentences.") + test_audios = {} + test_figures = {} + test_sentences = self.config.test_sentences + aux_inputs = self._get_test_aux_input() + for idx, sen in enumerate(test_sentences): + outputs_dict = synthesis( + self, + sen, + self.config, + "cuda" in str(next(self.parameters()).device), + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], + use_griffin_lim=True, + do_trim_silence=False, + ) + test_audios["{}-audio".format(idx)] = outputs_dict["wav"] + test_figures["{}-prediction".format(idx)] = plot_spectrogram( + outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False + ) + test_figures["{}-alignment".format(idx)] = plot_alignment( + outputs_dict["outputs"]["alignments"], output_fig=False + ) + return {"figures": test_figures, "audios": test_audios} + + def test_log( + self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + ) -> None: + logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) + logger.test_figures(steps, outputs["figures"]) + ############################# # COMMON COMPUTE FUNCTIONS ############################# @@ -160,7 +214,9 @@ def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mas ) # scale_factor = self.decoder.r_init / self.decoder.r alignments_backward = torch.nn.functional.interpolate( - alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest" + alignments_backward.transpose(1, 2), + size=alignments.shape[1], + mode="nearest", ).transpose(1, 2) decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2) decoder_outputs_backward = decoder_outputs_backward[:, :T, :] @@ -193,6 +249,25 @@ def compute_gst(self, inputs, style_input, speaker_embedding=None): inputs = self._concat_speaker_embedding(inputs, gst_outputs) return inputs + def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None): + """Capacitron Variational Autoencoder""" + (VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer( + reference_mel_info, + text_info, + speaker_embedding, # pylint: disable=not-callable + ) + + VAE_outputs = VAE_outputs.to(inputs.device) + encoder_output = self._concat_speaker_embedding( + inputs, VAE_outputs + ) # concatenate to the output of the basic tacotron encoder + return ( + encoder_output, + posterior_distribution, + prior_distribution, + capacitron_beta, + ) + @staticmethod def _add_speaker_embedding(outputs, embedded_speakers): embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 8341f5bbd2..7bfa6ba5e4 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,11 +1,13 @@ # coding: utf-8 -from typing import Dict, List, Union +from typing import Dict, List, Tuple, Union import torch from torch import nn from torch.cuda.amp.autocast_mode import autocast +from trainer.trainer_utils import get_optimizer, get_scheduler +from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG from TTS.tts.models.base_tacotron import BaseTacotron @@ -13,6 +15,7 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, 
plot_spectrogram +from TTS.utils.capacitron_optimizer import CapacitronOptimizer class Tacotron(BaseTacotron): @@ -51,6 +54,9 @@ def __init__( if self.use_gst: self.decoder_in_features += self.gst.gst_embedding_dim + if self.use_capacitron_vae: + self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim + # embedding layer self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0) self.embedding.weight.data.normal_(0, 0.3) @@ -90,6 +96,20 @@ def __init__( gst_embedding_dim=self.gst.gst_embedding_dim, ) + # Capacitron layers + if self.capacitron_vae and self.use_capacitron_vae: + self.capacitron_vae_layer = CapacitronVAE( + num_mel=self.decoder_output_dim, + encoder_output_dim=self.encoder_in_features, + capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim, + speaker_embedding_dim=self.embedded_speaker_dim + if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding + else None, + text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None, + ) + # backward pass decoder if self.bidirectional_decoder: self._init_backward_decoder() @@ -146,6 +166,19 @@ def forward( # pylint: disable=dangerous-default-value # B x 1 x speaker_embed_dim embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) + # Capacitron + if self.capacitron_vae and self.use_capacitron_vae: + # B x capacitron_VAE_embedding_dim + encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding( + encoder_outputs, + reference_mel_info=[mel_specs, mel_lengths], + text_info=[inputs, text_lengths] + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None, + speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None, + ) + else: + capacitron_vae_outputs = None # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in @@ -178,6 +211,7 @@ def forward( # pylint: disable=dangerous-default-value "decoder_outputs": decoder_outputs, "alignments": alignments, "stop_tokens": stop_tokens, + "capacitron_vae_outputs": capacitron_vae_outputs, } ) return outputs @@ -190,6 +224,28 @@ def inference(self, text_input, aux_input=None): if self.gst and self.use_gst: # B x gst_dim encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) + if self.capacitron_vae and self.use_capacitron_vae: + if aux_input["style_text"] is not None: + style_text_embedding = self.embedding(aux_input["style_text"]) + style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to( + encoder_outputs.device + ) # pylint: disable=not-callable + reference_mel_length = ( + torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device) + if aux_input["style_mel"] is not None + else None + ) # pylint: disable=not-callable + # B x capacitron_VAE_embedding_dim + encoder_outputs, *_ = self.compute_capacitron_VAE_embedding( + encoder_outputs, + reference_mel_info=[aux_input["style_mel"], reference_mel_length] + if aux_input["style_mel"] is not None + else None, + text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None, + speaker_embedding=aux_input["d_vectors"] + if self.capacitron_vae.capacitron_use_speaker_embedding + else None, + ) if 
self.num_speakers > 1: if not self.use_d_vector_file: # B x 1 x speaker_embed_dim @@ -215,12 +271,19 @@ def inference(self, text_input, aux_input=None): } return outputs - def train_step(self, batch, criterion): - """Perform a single training step by fetching the right set if samples from the batch. + def before_backward_pass(self, loss_dict, optimizer) -> None: + # Extracting custom training specific operations for capacitron + # from the trainer + if self.use_capacitron_vae: + loss_dict["capacitron_vae_beta_loss"].backward() + optimizer.first_step() + + def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]: + """Perform a single training step by fetching the right set of samples from the batch. Args: - batch ([type]): [description] - criterion ([type]): [description] + batch ([Dict]): A dictionary of input tensors. + criterion ([torch.nn.Module]): Callable criterion to compute model loss. """ text_input = batch["text_input"] text_lengths = batch["text_lengths"] @@ -232,14 +295,8 @@ def train_step(self, batch, criterion): speaker_ids = batch["speaker_ids"] d_vectors = batch["d_vectors"] - # forward pass model - outputs = self.forward( - text_input, - text_lengths, - mel_input, - mel_lengths, - aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, - ) + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) # set the [alignment] lengths wrt reduction factor for guided attention if mel_lengths.max() % self.decoder.r != 0: @@ -249,9 +306,6 @@ def train_step(self, batch, criterion): else: alignment_lengths = mel_lengths // self.decoder.r - aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) - # compute loss with autocast(enabled=False): # use float32 for the criterion loss_dict = criterion( @@ -262,6 +316,7 @@ def train_step(self, batch, criterion): outputs["stop_tokens"].float(), stop_targets.float(), stop_target_lengths, + outputs["capacitron_vae_outputs"] if self.capacitron_vae else None, mel_lengths, None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(), outputs["alignments"].float(), @@ -275,6 +330,25 @@ def train_step(self, batch, criterion): loss_dict["align_error"] = align_error return outputs, loss_dict + def get_optimizer(self) -> List: + if self.use_capacitron_vae: + return CapacitronOptimizer(self.config, self.named_parameters()) + return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) + + def get_scheduler(self, optimizer: object): + opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer + return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt) + + def before_gradient_clipping(self): + if self.use_capacitron_vae: + # Capacitron model specific gradient clipping + model_params_to_clip = [] + for name, param in self.named_parameters(): + if param.requires_grad: + if name != "capacitron_vae_layer.beta": + model_params_to_clip.append(param) + torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip) + def _create_logs(self, batch, outputs, ap): postnet_outputs = outputs["model_outputs"] decoder_outputs = outputs["decoder_outputs"] diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index d4e665e347..95d339f17d 100644 --- a/TTS/tts/models/tacotron2.py +++ 
b/TTS/tts/models/tacotron2.py @@ -5,7 +5,9 @@ import torch from torch import nn from torch.cuda.amp.autocast_mode import autocast +from trainer.trainer_utils import get_optimizer, get_scheduler +from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet from TTS.tts.models.base_tacotron import BaseTacotron @@ -13,6 +15,7 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.capacitron_optimizer import CapacitronOptimizer class Tacotron2(BaseTacotron): @@ -65,6 +68,9 @@ def __init__( if self.use_gst: self.decoder_in_features += self.gst.gst_embedding_dim + if self.use_capacitron_vae: + self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim + # embedding layer self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0) @@ -102,6 +108,20 @@ def __init__( gst_embedding_dim=self.gst.gst_embedding_dim, ) + # Capacitron VAE Layers + if self.capacitron_vae and self.use_capacitron_vae: + self.capacitron_vae_layer = CapacitronVAE( + num_mel=self.decoder_output_dim, + encoder_output_dim=self.encoder_in_features, + capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim, + speaker_embedding_dim=self.embedded_speaker_dim + if self.capacitron_vae.capacitron_use_speaker_embedding + else None, + text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None, + ) + # backward pass decoder if self.bidirectional_decoder: self._init_backward_decoder() @@ -166,6 +186,20 @@ def forward( # pylint: disable=dangerous-default-value embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) + # capacitron + if self.capacitron_vae and self.use_capacitron_vae: + # B x capacitron_VAE_embedding_dim + encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding( + encoder_outputs, + reference_mel_info=[mel_specs, mel_lengths], + text_info=[embedded_inputs.transpose(1, 2), text_lengths] + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None, + speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None, + ) + else: + capacitron_vae_outputs = None + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r @@ -197,6 +231,7 @@ def forward( # pylint: disable=dangerous-default-value "decoder_outputs": decoder_outputs, "alignments": alignments, "stop_tokens": stop_tokens, + "capacitron_vae_outputs": capacitron_vae_outputs, } ) return outputs @@ -217,6 +252,29 @@ def inference(self, text, aux_input=None): # B x gst_dim encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) + if self.capacitron_vae and self.use_capacitron_vae: + if aux_input["style_text"] is not None: + style_text_embedding = self.embedding(aux_input["style_text"]) + style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to( + encoder_outputs.device + ) # pylint: disable=not-callable + reference_mel_length = ( + torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device) + if 
aux_input["style_mel"] is not None + else None + ) # pylint: disable=not-callable + # B x capacitron_VAE_embedding_dim + encoder_outputs, *_ = self.compute_capacitron_VAE_embedding( + encoder_outputs, + reference_mel_info=[aux_input["style_mel"], reference_mel_length] + if aux_input["style_mel"] is not None + else None, + text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None, + speaker_embedding=aux_input["d_vectors"] + if self.capacitron_vae.capacitron_use_speaker_embedding + else None, + ) + if self.num_speakers > 1: if not self.use_d_vector_file: embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None] @@ -242,6 +300,13 @@ def inference(self, text, aux_input=None): } return outputs + def before_backward_pass(self, loss_dict, optimizer) -> None: + # Extracting custom training specific operations for capacitron + # from the trainer + if self.use_capacitron_vae: + loss_dict["capacitron_vae_beta_loss"].backward() + optimizer.first_step() + def train_step(self, batch: Dict, criterion: torch.nn.Module): """A single training step. Forward pass and loss computation. @@ -258,14 +323,8 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): speaker_ids = batch["speaker_ids"] d_vectors = batch["d_vectors"] - # forward pass model - outputs = self.forward( - text_input, - text_lengths, - mel_input, - mel_lengths, - aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, - ) + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) # set the [alignment] lengths wrt reduction factor for guided attention if mel_lengths.max() % self.decoder.r != 0: @@ -275,9 +334,6 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): else: alignment_lengths = mel_lengths // self.decoder.r - aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) - # compute loss with autocast(enabled=False): # use float32 for the criterion loss_dict = criterion( @@ -288,6 +344,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): outputs["stop_tokens"].float(), stop_targets.float(), stop_target_lengths, + outputs["capacitron_vae_outputs"] if self.capacitron_vae else None, mel_lengths, None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(), outputs["alignments"].float(), @@ -301,6 +358,25 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict + def get_optimizer(self) -> List: + if self.use_capacitron_vae: + return CapacitronOptimizer(self.config, self.named_parameters()) + return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) + + def get_scheduler(self, optimizer: object): + opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer + return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt) + + def before_gradient_clipping(self): + if self.use_capacitron_vae: + # Capacitron model specific gradient clipping + model_params_to_clip = [] + for name, param in self.named_parameters(): + if param.requires_grad: + if name != "capacitron_vae_layer.beta": + model_params_to_clip.append(param) + torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip) + def _create_logs(self, batch, outputs, ap): """Create 
dashboard log information.""" postnet_outputs = outputs["model_outputs"] diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index f9e132513a..a74300dc94 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -26,6 +26,7 @@ def run_model_torch( inputs: torch.Tensor, speaker_id: int = None, style_mel: torch.Tensor = None, + style_text: str = None, d_vector: torch.Tensor = None, language_id: torch.Tensor = None, ) -> Dict: @@ -53,6 +54,7 @@ def run_model_torch( "speaker_ids": speaker_id, "d_vectors": d_vector, "style_mel": style_mel, + "style_text": style_text, "language_ids": language_id, }, ) @@ -115,6 +117,7 @@ def synthesis( use_cuda, speaker_id=None, style_wav=None, + style_text=None, use_griffin_lim=False, do_trim_silence=False, d_vector=None, @@ -140,7 +143,12 @@ def synthesis( Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None. style_wav (str | Dict[str, float]): - Path or tensor to/of a waveform used for computing the style embedding. Defaults to None. + Path to (or tensor of) a waveform used for computing the style embedding based on GST or Capacitron. + Defaults to None, meaning that Capacitron models will sample from the prior distribution to + generate random but realistic prosody. + + style_text (str): + Transcription of style_wav for Capacitron models. Defaults to None. enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. Defaults to False. @@ -154,13 +162,19 @@ def synthesis( language_id (int): Language ID passed to the language embedding layer in multi-lingual model. Defaults to None. """ - # GST processing + # GST or Capacitron processing + # TODO: need to handle the case of setting both gst and capacitron to true somewhere style_mel = None if CONFIG.has("gst") and CONFIG.gst and style_wav is not None: if isinstance(style_wav, dict): style_mel = style_wav else: style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda) + + if CONFIG.has("capacitron_vae") and CONFIG.use_capacitron_vae and style_wav is not None: + style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda) + style_mel = style_mel.transpose(1, 2) # [1, time, depth] + # convert text to sequence of token IDs text_inputs = np.asarray( model.tokenizer.text_to_ids(text, language=language_id), @@ -177,11 +191,28 @@ def synthesis( language_id = id_to_torch(language_id, cuda=use_cuda) if not isinstance(style_mel, dict): + # GST or Capacitron style mel style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) + if style_text is not None: + style_text = np.asarray( + model.tokenizer.text_to_ids(style_text, language=language_id), + dtype=np.int32, + ) + style_text = numpy_to_torch(style_text, torch.long, cuda=use_cuda) + style_text = style_text.unsqueeze(0) + text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) text_inputs = text_inputs.unsqueeze(0) # synthesize voice - outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id) + outputs = run_model_torch( + model, + text_inputs, + speaker_id, + style_mel, + style_text, + d_vector=d_vector, + language_id=language_id, + ) model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() alignments = outputs["alignments"] diff --git a/TTS/utils/capacitron_optimizer.py b/TTS/utils/capacitron_optimizer.py new file mode 100644 index 0000000000..c9f075afac --- /dev/null +++ b/TTS/utils/capacitron_optimizer.py @@ -0,0 +1,65 @@ +from typing import
Generator + +from trainer.trainer_utils import get_optimizer + + +class CapacitronOptimizer: + """Double optimizer class for the Capacitron model.""" + + def __init__(self, config: dict, model_params: Generator) -> None: + self.primary_params, self.secondary_params = self.split_model_parameters(model_params) + + optimizer_names = list(config.optimizer_params.keys()) + optimizer_parameters = list(config.optimizer_params.values()) + + self.primary_optimizer = get_optimizer( + optimizer_names[0], + optimizer_parameters[0], + config.lr, + parameters=self.primary_params, + ) + + self.secondary_optimizer = get_optimizer( + optimizer_names[1], + self.extract_optimizer_parameters(optimizer_parameters[1]), + optimizer_parameters[1]["lr"], + parameters=self.secondary_params, + ) + + self.param_groups = self.primary_optimizer.param_groups + + def first_step(self): + self.secondary_optimizer.step() + self.secondary_optimizer.zero_grad() + self.primary_optimizer.zero_grad() + + def step(self): + self.primary_optimizer.step() + + def zero_grad(self): + self.primary_optimizer.zero_grad() + self.secondary_optimizer.zero_grad() + + def load_state_dict(self, state_dict): + self.primary_optimizer.load_state_dict(state_dict[0]) + self.secondary_optimizer.load_state_dict(state_dict[1]) + + def state_dict(self): + return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()] + + @staticmethod + def split_model_parameters(model_params: Generator) -> list: + primary_params = [] + secondary_params = [] + for name, param in model_params: + if param.requires_grad: + if name == "capacitron_vae_layer.beta": + secondary_params.append(param) + else: + primary_params.append(param) + return [iter(primary_params), iter(secondary_params)] + + @staticmethod + def extract_optimizer_parameters(params: dict) -> dict: + """Extract parameters that are not the learning rate""" + return {k: v for k, v in params.items() if k != "lr"} diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 304df5ed21..0b32f77ab2 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -106,6 +106,8 @@ def save_model(config, model, optimizer, scaler, current_step, epoch, output_pat model_state = model.state_dict() if isinstance(optimizer, list): optimizer_state = [optim.state_dict() for optim in optimizer] + elif optimizer.__class__.__name__ == "CapacitronOptimizer": + optimizer_state = [optimizer.primary_optimizer.state_dict(), optimizer.secondary_optimizer.state_dict()] else: optimizer_state = optimizer.state_dict() if optimizer is not None else None diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2c28861324..9ce528a3b4 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -1,5 +1,5 @@ import time -from typing import List, Union +from typing import List import numpy as np import pysbd @@ -178,8 +178,9 @@ def tts( text: str = "", speaker_name: str = "", language_name: str = "", - speaker_wav: Union[str, List[str]] = None, + speaker_wav=None, style_wav=None, + style_text=None, reference_wav=None, reference_speaker_name=None, ) -> List[int]: @@ -191,6 +192,7 @@ def tts( language_name (str, optional): language id for multi-language models. Defaults to "". speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None. + style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None. reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. 
reference_speaker_name ([type], optional): speaker id of the reference waveform. Defaults to None. Returns: @@ -273,10 +275,11 @@ def tts( CONFIG=self.tts_config, use_cuda=self.use_cuda, speaker_id=speaker_id, - language_id=language_id, style_wav=style_wav, + style_text=style_text, use_griffin_lim=use_gl, d_vector=speaker_embedding, + language_id=language_id, ) waveform = outputs["wav"] mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() diff --git a/recipes/blizzard2013/README.md b/recipes/blizzard2013/README.md new file mode 100644 index 0000000000..9dcb739728 --- /dev/null +++ b/recipes/blizzard2013/README.md @@ -0,0 +1,12 @@ +# How to get the Blizzard 2013 Dataset + +The Capacitron model is a variational-encoder extension of standard Tacotron-based models for modelling prosody. + +To take full advantage of the model, it is advised to train it with a dataset that contains a significant amount of prosodic information in the utterances. A tested candidate for such applications is the Blizzard 2013 dataset from the Blizzard Challenge, which contains many hours of high-quality audiobook recordings. + +To get a license and download link for this dataset, you need to visit the [website](https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html) of the Centre for Speech Technology Research of the University of Edinburgh. + +You will get access to the raw dataset in a couple of days. There are a few preprocessing steps you need to do to be able to use the high-fidelity dataset: + +1. Get the forced time alignments for the Blizzard dataset from [here](https://github.com/mueller91/tts_alignments). +2. Segment the high-fidelity audiobook files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation) (see the expected layout sketched below).
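Editor's note: the recipes added below load the segmented corpus with Coqui's LJSpeech formatter (`BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", ...)`), so the segmented data directory is expected to follow an LJSpeech-style layout, roughly as sketched here (file names are illustrative, not prescribed by the patch):

```
/srv/data/blizzard2013/segmented/
├── metadata.csv   # pipe-separated lines: <utterance_id>|<transcription>|<normalized transcription>
└── wavs/
    ├── <utterance_id>.wav
    └── ...
```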
\ No newline at end of file diff --git a/recipes/blizzard2013/tacotron1-Capacitron/train_capacitron_t1.py b/recipes/blizzard2013/tacotron1-Capacitron/train_capacitron_t1.py new file mode 100644 index 0000000000..52c6098fa2 --- /dev/null +++ b/recipes/blizzard2013/tacotron1-Capacitron/train_capacitron_t1.py @@ -0,0 +1,101 @@ +import os + +from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig +from TTS.tts.configs.tacotron_config import TacotronConfig +from TTS.tts.datasets import load_tts_samples +from TTS.tts.models.tacotron import Tacotron +from TTS.tts.utils.text.tokenizer import TTSTokenizer +from TTS.utils.audio import AudioProcessor + +output_path = os.path.dirname(os.path.abspath(__file__)) + +data_path = "/srv/data/" + +# Using LJSpeech-like dataset processing for the blizzard dataset +dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path) + +audio_config = BaseAudioConfig( + sample_rate=24000, + do_trim_silence=True, + trim_db=60.0, + signal_norm=True, + mel_fmin=80.0, + mel_fmax=12000, + spec_gain=20.0, + log_func="np.log10", + ref_level_db=20, + preemphasis=0.0, + min_level_db=-100, +) + +# Using the standard Capacitron config +capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0) + +config = TacotronConfig( + run_name="Blizzard-Capacitron-T1", + audio=audio_config, + capacitron_vae=capacitron_config, + use_capacitron_vae=True, + batch_size=128, # Tune this to your gpu + max_audio_len=6 * 24000, # Tune this to your gpu + min_audio_len=0.5 * 24000, + eval_batch_size=16, + num_loader_workers=12, + num_eval_loader_workers=8, + precompute_num_workers=24, + run_eval=True, + test_delay_epochs=5, + ga_alpha=0.0, + r=2, + optimizer="CapacitronOptimizer", + optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}}, + attention_type="graves", + attention_heads=5, + epochs=1000, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phonemizer="espeak", + phoneme_cache_path=os.path.join(data_path, "phoneme_cache"), + stopnet_pos_weight=15, + print_step=50, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config], + lr=1e-3, + lr_scheduler="StepwiseGradualLR", + lr_scheduler_params={"gradual_learning_rates": [[0, 1e-3], [2e4, 5e-4], [4e4, 3e-4], [6e4, 1e-4], [8e4, 5e-5]]}, + scheduler_after_epoch=False, # scheduler doesn't work without this flag + # Need to experiment with these below for capacitron + loss_masking=False, + decoder_loss_alpha=1.0, + postnet_loss_alpha=1.0, + postnet_diff_spec_alpha=0.0, + decoder_diff_spec_alpha=0.0, + decoder_ssim_alpha=0.0, + postnet_ssim_alpha=0.0, +) + +ap = AudioProcessor(**config.audio.to_dict()) + +tokenizer, config = TTSTokenizer.init_from_config(config) + +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) + +model = Tacotron(config, ap, tokenizer, speaker_manager=None) + +trainer = Trainer( + TrainerArgs(), + config, + output_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, +) + +# 🚀 +trainer.fit() diff --git a/recipes/blizzard2013/tacotron2-Capacitron/train_capacitron_t2.py b/recipes/blizzard2013/tacotron2-Capacitron/train_capacitron_t2.py new file mode 100644 index 0000000000..cf27b9dfd1 --- /dev/null +++ b/recipes/blizzard2013/tacotron2-Capacitron/train_capacitron_t2.py @@
-0,0 +1,117 @@ +import os + +from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig +from TTS.tts.configs.tacotron2_config import Tacotron2Config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.models.tacotron2 import Tacotron2 +from TTS.tts.utils.text.tokenizer import TTSTokenizer +from TTS.utils.audio import AudioProcessor + +output_path = os.path.dirname(os.path.abspath(__file__)) + +data_path = "/srv/data/blizzard2013/segmented" + +# Using LJSpeech-like dataset processing for the blizzard dataset +dataset_config = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + path=data_path, +) + +audio_config = BaseAudioConfig( + sample_rate=24000, + do_trim_silence=True, + trim_db=60.0, + signal_norm=True, + mel_fmin=80.0, + mel_fmax=12000, + spec_gain=25.0, + log_func="np.log10", + ref_level_db=20, + preemphasis=0.0, + min_level_db=-100, +) + +# Using the standard Capacitron config +capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0) + +config = Tacotron2Config( + run_name="Blizzard-Capacitron-T2", + audio=audio_config, + capacitron_vae=capacitron_config, + use_capacitron_vae=True, + batch_size=246, # Tune this to your gpu + max_audio_len=6 * 24000, # Tune this to your gpu + min_audio_len=1 * 24000, + eval_batch_size=16, + num_loader_workers=12, + num_eval_loader_workers=8, + precompute_num_workers=24, + run_eval=True, + test_delay_epochs=5, + ga_alpha=0.0, + r=2, + optimizer="CapacitronOptimizer", + optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}}, + attention_type="dynamic_convolution", + grad_clip=0.0, # Important!
diff --git a/recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py b/recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py
new file mode 100644
index 0000000000..6bb0aed782
--- /dev/null
+++ b/recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py
@@ -0,0 +1,115 @@
+import os
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.config.shared_configs import BaseAudioConfig
+from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
+from TTS.tts.configs.tacotron2_config import Tacotron2Config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.tacotron2 import Tacotron2
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+
+data_path = "/srv/data/"
+
+# Standard LJSpeech dataset processing
+dataset_config = BaseDatasetConfig(
+    name="ljspeech",
+    meta_file_train="metadata.csv",
+    path=data_path,
+)
+
+audio_config = BaseAudioConfig(
+    sample_rate=22050,
+    do_trim_silence=True,
+    trim_db=60.0,
+    signal_norm=False,
+    mel_fmin=0.0,
+    mel_fmax=11025,
+    spec_gain=1.0,
+    log_func="np.log",
+    ref_level_db=20,
+    preemphasis=0.0,
+)
+
+# Using the standard Capacitron config
+capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0, capacitron_capacity=50)
+
+config = Tacotron2Config(
+    run_name="Capacitron-Tacotron2",
+    audio=audio_config,
+    capacitron_vae=capacitron_config,
+    use_capacitron_vae=True,
+    batch_size=128,  # Tune this to your GPU
+    max_audio_len=8 * 22050,  # Tune this to your GPU
+    min_audio_len=1 * 22050,
+    eval_batch_size=16,
+    num_loader_workers=8,
+    num_eval_loader_workers=8,
+    precompute_num_workers=24,
+    run_eval=True,
+    test_delay_epochs=25,
+    ga_alpha=0.0,
+    r=2,
+    optimizer="CapacitronOptimizer",
+    optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
+    attention_type="dynamic_convolution",
+    grad_clip=0.0,  # Important! The standard grad_clip is overridden by capacitron_grad_clip
+    double_decoder_consistency=False,
+    epochs=1000,
+    text_cleaner="phoneme_cleaners",
+    use_phonemes=True,
+    phoneme_language="en-us",
+    phonemizer="espeak",
+    phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
+    stopnet_pos_weight=15,
+    print_step=25,
+    print_eval=True,
+    mixed_precision=False,
+    sort_by_audio_len=True,
+    seq_len_norm=True,
+    output_path=output_path,
+    datasets=[dataset_config],
+    lr=1e-3,
+    lr_scheduler="StepwiseGradualLR",
+    lr_scheduler_params={
+        "gradual_learning_rates": [
+            [0, 1e-3],
+            [2e4, 5e-4],
+            [4e4, 3e-4],
+            [6e4, 1e-4],
+            [8e4, 5e-5],
+        ]
+    },
+    scheduler_after_epoch=False,  # scheduler doesn't work without this flag (StepwiseGradualLR steps per iteration, not per epoch)
+    # Need to experiment with the loss weights below for Capacitron
+    loss_masking=False,
+    decoder_loss_alpha=1.0,
+    postnet_loss_alpha=1.0,
+    postnet_diff_spec_alpha=0.0,
+    decoder_diff_spec_alpha=0.0,
+    decoder_ssim_alpha=0.0,
+    postnet_ssim_alpha=0.0,
+)
+
+ap = AudioProcessor(**config.audio.to_dict())
+
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+
+model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
+
+trainer = Trainer(
+    TrainerArgs(),
+    config,
+    output_path,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+    training_assets={"audio_processor": ap},
+)
+
+trainer.fit()
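This recipe sets capacitron_capacity=50, the KL capacity target from the Capacitron paper. Conceptually, the VAE term weights the gap between the posterior KL and that capacity by the learned beta that the SGD half of the optimizer updates. A back-of-the-envelope sketch of that objective under those assumptions, not the patch's exact loss code:

import torch

def capacitron_vae_term_sketch(kl_term: torch.Tensor, beta: torch.Tensor, capacity: float = 50.0) -> torch.Tensor:
    # A Lagrange-style penalty: the learned multiplier `beta` weights the
    # distance between the posterior KL and the target capacity, pulling the
    # KL toward `capacity` instead of letting it collapse to zero.
    return beta * (kl_term - capacity)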
diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py
index df184a6a26..77c291f7b5 100644
--- a/tests/tts_tests/test_tacotron2_model.py
+++ b/tests/tts_tests/test_tacotron2_model.py
@@ -6,7 +6,7 @@
 from torch import nn, optim
 
 from tests import get_tests_input_path
-from TTS.tts.configs.shared_configs import GSTConfig
+from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
 from TTS.tts.configs.tacotron2_config import Tacotron2Config
 from TTS.tts.layers.losses import MSELossMasked
 from TTS.tts.models.tacotron2 import Tacotron2
@@ -260,6 +260,73 @@ def test_train_step(self):
         count += 1
 
 
+class TacotronCapacitronTrainTest(unittest.TestCase):
+    @staticmethod
+    def test_train_step():
+        config = Tacotron2Config(
+            num_chars=32,
+            num_speakers=10,
+            use_speaker_embedding=True,
+            out_channels=80,
+            decoder_output_dim=80,
+            use_capacitron_vae=True,
+            capacitron_vae=CapacitronVAEConfig(),
+            optimizer="CapacitronOptimizer",
+            optimizer_params={
+                "RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
+                "SGD": {"lr": 1e-5, "momentum": 0.9},
+            },
+        )
+
+        batch = {}
+        batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device)
+        batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device)
+        batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0]
+        batch["text_lengths"][0] = 128
+        batch["mel_input"] = torch.rand(8, 120, config.audio["num_mels"]).to(device)
+        batch["mel_lengths"] = torch.randint(20, 120, (8,)).long().to(device)
+        batch["mel_lengths"] = torch.sort(batch["mel_lengths"], descending=True)[0]
+        batch["mel_lengths"][0] = 120
+        batch["stop_targets"] = torch.zeros(8, 120, 1).float().to(device)
+        batch["stop_target_lengths"] = torch.randint(0, 120, (8,)).to(device)
+        batch["speaker_ids"] = torch.randint(0, 5, (8,)).long().to(device)
+        batch["d_vectors"] = None
+
+        for idx in batch["mel_lengths"]:
+            batch["stop_targets"][:, int(idx.item()) :, 0] = 1.0
+
+        batch["stop_targets"] = batch["stop_targets"].view(
+            batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
+        )
+        batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
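+        # The decoder emits config.r frames per step, so the per-frame stop
+        # targets built above are grouped into windows of r and collapsed to
+        # a single stop flag per decoder step.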
+
+        model = Tacotron2(config).to(device)
+        criterion = model.get_criterion()
+        optimizer = model.get_optimizer()
+
+        model.train()
+        model_ref = copy.deepcopy(model)
+        count = 0
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
+            assert (param - param_ref).sum() == 0, param
+            count += 1
+        for _ in range(10):
+            _, loss_dict = model.train_step(batch, criterion)
+            optimizer.zero_grad()
+            loss_dict["capacitron_vae_beta_loss"].backward()
+            optimizer.first_step()
+            loss_dict["loss"].backward()
+            optimizer.step()
+        # check parameter changes
+        count = 0
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
+            # ignore the pre-highway layer since it works conditionally
+            assert (param != param_ref).any(), "param {} with shape {} not updated! \n{}\n{}".format(
+                count, param.shape, param, param_ref
+            )
+            count += 1
+
+
 class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
     """Test multi-speaker Tacotron2 with Global Style Tokens and d-vector inputs."""
diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py
index 6e0e712bba..07351a6ae0 100644
--- a/tests/tts_tests/test_tacotron_model.py
+++ b/tests/tts_tests/test_tacotron_model.py
@@ -6,7 +6,7 @@
 from torch import nn, optim
 
 from tests import get_tests_input_path
-from TTS.tts.configs.shared_configs import GSTConfig
+from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
 from TTS.tts.configs.tacotron_config import TacotronConfig
 from TTS.tts.layers.losses import L1LossMasked
 from TTS.tts.models.tacotron import Tacotron
@@ -248,6 +248,74 @@ def test_train_step():
         count += 1
 
 
+class TacotronCapacitronTrainTest(unittest.TestCase):
+    @staticmethod
+    def test_train_step():
+        config = TacotronConfig(
+            num_chars=32,
+            num_speakers=10,
+            use_speaker_embedding=True,
+            out_channels=513,
+            decoder_output_dim=80,
+            use_capacitron_vae=True,
+            capacitron_vae=CapacitronVAEConfig(),
+            optimizer="CapacitronOptimizer",
+            optimizer_params={
+                "RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
+                "SGD": {"lr": 1e-5, "momentum": 0.9},
+            },
+        )
+
+        batch = {}
+        batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device)
+        batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device)
+        batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0]
+        batch["text_lengths"][0] = 128
+        batch["linear_input"] = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device)
+        batch["mel_input"] = torch.rand(8, 120, config.audio["num_mels"]).to(device)
+        batch["mel_lengths"] = torch.randint(20, 120, (8,)).long().to(device)
+        batch["mel_lengths"] = torch.sort(batch["mel_lengths"], descending=True)[0]
+        batch["mel_lengths"][0] = 120
+        batch["stop_targets"] = torch.zeros(8, 120, 1).float().to(device)
+        batch["stop_target_lengths"] = torch.randint(0, 120, (8,)).to(device)
+        batch["speaker_ids"] = torch.randint(0, 5, (8,)).long().to(device)
+        batch["d_vectors"] = None
+
+        for idx in batch["mel_lengths"]:
+            batch["stop_targets"][:, int(idx.item()) :, 0] = 1.0
+
+        batch["stop_targets"] = batch["stop_targets"].view(
+            batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
+        )
+        batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
+
+        model = Tacotron(config).to(device)
+        criterion = model.get_criterion()
+        optimizer = model.get_optimizer()
+        model.train()
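+        # The loop below follows the CapacitronOptimizer protocol: backpropagate
+        # the beta loss and call first_step() (the SGD update of beta), then
+        # backpropagate the main loss and call step() (the RAdam update).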
+        print(" > Num parameters for Tacotron with Capacitron VAE model: %s" % (count_parameters(model)))
+        model_ref = copy.deepcopy(model)
+        count = 0
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
+            assert (param - param_ref).sum() == 0, param
+            count += 1
+        for _ in range(10):
+            _, loss_dict = model.train_step(batch, criterion)
+            optimizer.zero_grad()
+            loss_dict["capacitron_vae_beta_loss"].backward()
+            optimizer.first_step()
+            loss_dict["loss"].backward()
+            optimizer.step()
+        # check parameter changes
+        count = 0
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
+            # ignore the pre-highway layer since it works conditionally
+            assert (param != param_ref).any(), "param {} with shape {} not updated! \n{}\n{}".format(
+                count, param.shape, param, param_ref
+            )
+            count += 1
+
+
 class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():