NVIDIA · hsiehjackson · May 22, 2023 · May 22, 2023 · May 22, 2023 · May 22, 2023
diff --git a/examples/tts/conf/fastpitch_align_44100_adapter.yaml b/examples/tts/conf/fastpitch_align_44100_adapter.yaml
@@ -32,6 +32,9 @@ phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
 
 model:
+  unfreeze_aligner: false
+  unfreeze_duration_predictor: false
+  unfreeze_pitch_predictor: false
   learn_alignment: true
   bin_loss_warmup_epochs: 100
 

diff --git a/examples/tts/fastpitch_finetune_adapters.py b/examples/tts/fastpitch_finetune_adapters.py
@@ -107,6 +107,18 @@ def main(cfg):
     if adapter_global_cfg is not None:
         add_global_adapter_cfg(model, adapter_global_cfg)
 
+    if cfg.model.get("unfreeze_aligner", False):
+        for name, param in model.fastpitch.aligner.named_parameters():
+            param.requires_grad = True
+
+    if cfg.model.get("unfreeze_duration_predictor", False):
+        for name, param in model.fastpitch.duration_predictor.named_parameters():
+            param.requires_grad = True
+
+    if cfg.model.get("unfreeze_pitch_predictor", False):
+        for name, param in model.fastpitch.pitch_predictor.named_parameters():
+            param.requires_grad = True
+
     # Add adapters
     model.add_adapter(name=adapter_name, cfg=cfg.model.adapter)
     assert model.is_adapter_available()

diff --git a/nemo/collections/tts/losses/aligner_loss.py b/nemo/collections/tts/losses/aligner_loss.py
@@ -22,11 +22,12 @@
 
 
 class ForwardSumLoss(Loss):
-    def __init__(self, blank_logprob=-1):
+    def __init__(self, blank_logprob=-1, loss_scale=1.0):
         super().__init__()
         self.log_softmax = torch.nn.LogSoftmax(dim=-1)
         self.ctc_loss = torch.nn.CTCLoss(zero_infinity=True)
         self.blank_logprob = blank_logprob
+        self.loss_scale = loss_scale
 
     @property
     def input_types(self):
@@ -67,13 +68,15 @@ def forward(self, attn_logprob, in_lens, out_lens):
 
         # Evaluate CTC loss
         cost = self.ctc_loss(attn_logprob, target_seqs, input_lengths=query_lens, target_lengths=key_lens)
+        cost *= self.loss_scale
 
         return cost
 
 
 class BinLoss(Loss):
-    def __init__(self):
+    def __init__(self, loss_scale=1.0):
         super().__init__()
+        self.loss_scale = loss_scale
 
     @property
     def input_types(self):
@@ -91,4 +94,6 @@ def output_types(self):
     @typecheck()
     def forward(self, hard_attention, soft_attention):
         log_sum = torch.log(torch.clamp(soft_attention[hard_attention == 1], min=1e-12)).sum()
-        return -log_sum / hard_attention.sum()
+        loss = -log_sum / hard_attention.sum()
+        loss *= self.loss_scale
+        return loss
diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py
@@ -138,9 +138,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
 
         self.aligner = None
         if self.learn_alignment:
+            aligner_loss_scale = cfg.aligner_loss_scale if "aligner_loss_scale" in cfg else 1.0
             self.aligner = instantiate(self._cfg.alignment_module)
-            self.forward_sum_loss_fn = ForwardSumLoss()
-            self.bin_loss_fn = BinLoss()
+            self.forward_sum_loss_fn = ForwardSumLoss(loss_scale=aligner_loss_scale)
+            self.bin_loss_fn = BinLoss(loss_scale=aligner_loss_scale)
 
         self.preprocessor = instantiate(self._cfg.preprocessor)
         input_fft = instantiate(self._cfg.input_fft, **input_fft_kwargs)