NVIDIA · blisc · Feb 11, 2022 · Jan 14, 2022 · Jan 18, 2022 · Jan 19, 2022
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -2017,7 +2017,7 @@ pipeline {
       }
     }
 
-           
+
     stage('L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval') {
       when {
         anyOf {

diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml
@@ -77,6 +77,7 @@ model:
       _target_: nemo.collections.tts.torch.g2ps.EnglishG2p
       phoneme_dict: ${phoneme_dict_path}
       heteronyms: ${heteronyms_path}
+      phoneme_probability: 0.5
 
   train_ds:
     dataset:
@@ -101,12 +102,13 @@ model:
       pitch_norm: true
       pitch_mean: ${model.pitch_mean}
       pitch_std: ${model.pitch_std}
+      use_beta_binomial_interpolator: true
 
     dataloader_params:
       drop_last: false
       shuffle: true
       batch_size: 32
-      num_workers: 12
+      num_workers: 0
 
   validation_ds:
     dataset:
@@ -131,12 +133,13 @@ model:
       pitch_norm: true
       pitch_mean: ${model.pitch_mean}
       pitch_std: ${model.pitch_std}
+      use_beta_binomial_interpolator: true
 
     dataloader_params:
       drop_last: false
       shuffle: false
       batch_size: 32
-      num_workers: 8
+      num_workers: 0
 
   preprocessor:
     _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor

diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py
@@ -310,20 +310,20 @@ def training_step(self, batch, batch_idx):
 
             self.tb_logger.add_image(
                 "train_mel_target",
-                plot_spectrogram_to_numpy(mels[0].data.cpu().numpy()),
+                plot_spectrogram_to_numpy(mels[0].data.cpu().float().numpy()),
                 self.global_step,
                 dataformats="HWC",
             )
-            spec_predict = mels_pred[0].data.cpu().numpy()
+            spec_predict = mels_pred[0].data.cpu().float().numpy()
             self.tb_logger.add_image(
                 "train_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC",
             )
             if self.learn_alignment:
-                attn = attn_hard[0].data.cpu().numpy().squeeze()
+                attn = attn_hard[0].data.cpu().float().numpy().squeeze()
                 self.tb_logger.add_image(
                     "train_attn", plot_alignment_to_numpy(attn.T), self.global_step, dataformats="HWC",
                 )
-                soft_attn = attn_soft[0].data.cpu().numpy().squeeze()
+                soft_attn = attn_soft[0].data.cpu().float().numpy().squeeze()
                 self.tb_logger.add_image(
                     "train_soft_attn", plot_alignment_to_numpy(soft_attn.T), self.global_step, dataformats="HWC",
                 )
@@ -394,11 +394,11 @@ def validation_epoch_end(self, outputs):
         if isinstance(self.logger, TensorBoardLogger):
             self.tb_logger.add_image(
                 "val_mel_target",
-                plot_spectrogram_to_numpy(spec_target[0].data.cpu().numpy()),
+                plot_spectrogram_to_numpy(spec_target[0].data.cpu().float().numpy()),
                 self.global_step,
                 dataformats="HWC",
             )
-            spec_predict = spec_predict[0].data.cpu().numpy()
+            spec_predict = spec_predict[0].data.cpu().float().numpy()
             self.tb_logger.add_image(
                 "val_mel_predicted", plot_spectrogram_to_numpy(spec_predict), self.global_step, dataformats="HWC",
             )

diff --git a/nemo/collections/tts/torch/data.py b/nemo/collections/tts/torch/data.py
@@ -134,9 +134,12 @@ def __init__(
 
         # Initialize text tokenizer
         self.text_tokenizer = text_tokenizer
+
+        self.phoneme_probability = None
         if isinstance(self.text_tokenizer, BaseTokenizer):
             self.text_tokenizer_pad_id = text_tokenizer.pad
             self.tokens = text_tokenizer.tokens
+            self.phoneme_probability = getattr(self.text_tokenizer, "phoneme_probability", None)
         else:
             if text_tokenizer_pad_id is None:
                 raise ValueError(f"text_tokenizer_pad_id must be specified if text_tokenizer is not BaseTokenizer")
@@ -146,6 +149,7 @@ def __init__(
 
             self.text_tokenizer_pad_id = text_tokenizer_pad_id
             self.tokens = tokens
+        self.cache_text = self.phoneme_probability is None
 
         # Initialize text normalizer is specified
         self.text_normalizer = text_normalizer
@@ -179,15 +183,14 @@ def __init__(
 
                     if "normalized_text" not in item:
                         text = item["text"]
-
                         if self.text_normalizer is not None:
                             text = self.text_normalizer_call(text, **self.text_normalizer_call_kwargs)
-
                         file_info["normalized_text"] = text
-                        file_info["text_tokens"] = self.text_tokenizer(text)
                     else:
                         file_info["normalized_text"] = item["normalized_text"]
-                        file_info["text_tokens"] = self.text_tokenizer(item["normalized_text"])
+
+                    if self.cache_text:
+                        file_info["text_tokens"] = self.text_tokenizer(file_info["normalized_text"])
 
                     data.append(file_info)
 
@@ -241,6 +244,7 @@ def __init__(
             hop_length=self.hop_len,
             win_length=self.win_length,
             window=window_fn(self.win_length, periodic=False).to(torch.float) if window_fn else None,
+            return_complex=True,
         )
 
         # Initialize sup_data_path, sup_data_types and run preprocessing methods for every supplementary data type
@@ -331,6 +335,13 @@ def add_align_prior_matrix(self, **kwargs):
         self.align_prior_matrix_folder.mkdir(exist_ok=True, parents=True)
 
         self.use_beta_binomial_interpolator = kwargs.pop('use_beta_binomial_interpolator', False)
+        if not self.cache_text:
+            if not self.use_beta_binomial_interpolator:
+                logging.warning(
+                    "phoneme_probability is not None, but use_beta_binomial_interpolator=False, we"
+                    " set use_beta_binomial_interpolator=True manually to use phoneme_probability."
+                )
+            self.use_beta_binomial_interpolator = True
 
         if self.use_beta_binomial_interpolator:
             self.beta_binomial_interpolator = BetaBinomialInterpolator()
@@ -386,9 +397,13 @@ def __getitem__(self, index):
         features = self.featurizer.process(sample["audio_filepath"], trim=self.trim)
         audio, audio_length = features, torch.tensor(features.shape[0]).long()
 
-        # Load text
-        text = torch.tensor(sample["text_tokens"]).long()
-        text_length = torch.tensor(len(sample["text_tokens"])).long()
+        if "text_tokens" in sample:
+            text = torch.tensor(sample["text_tokens"]).long()
+            text_length = torch.tensor(len(sample["text_tokens"])).long()
+        else:
+            tokenized = self.text_tokenizer(sample["normalized_text"])
+            text = torch.tensor(tokenized).long()
+            text_length = torch.tensor(len(tokenized)).long()
 
         # Load mel if needed
         log_mel, log_mel_length = None, None
@@ -417,9 +432,10 @@ def __getitem__(self, index):
         # Load alignment prior matrix if needed
         align_prior_matrix = None
         if AlignPriorMatrix in self.sup_data_types_set:
+            align_prior_matrix = None
             if self.use_beta_binomial_interpolator:
                 mel_len = self.get_log_mel(audio).shape[2]
                 align_prior_matrix = torch.from_numpy(self.beta_binomial_interpolator(mel_len, text_length.item()))
             else:
                 prior_path = self.align_prior_matrix_folder / f"{rel_audio_path_as_text_id}.pt"
 

diff --git a/nemo/collections/tts/torch/g2ps.py b/nemo/collections/tts/torch/g2ps.py
@@ -14,8 +14,10 @@
 
 import abc
 import pathlib
+import random
 import re
 import time
+from typing import Optional
 
 import nltk
 import torch
@@ -53,6 +55,7 @@ def __init__(
         ignore_ambiguous_words=True,
         heteronyms=None,
         encoding='latin-1',
+        phoneme_probability: Optional[float] = None,
     ):
         """English G2P module. This module converts words from grapheme to phoneme representation using phoneme_dict in CMU dict format.
         Optionally, it can ignore words which are heteronyms, ambiguous or marked as unchangeable by word_tokenize_func (see code for details).
@@ -67,6 +70,7 @@ def __init__(
             ignore_ambiguous_words: Whether to not handle word via phoneme_dict with ambiguous phoneme sequences. Defaults to True.
             heteronyms (str, Path, List): Path to file with heteronyms (every line is new word) or list of words.
             encoding: Encoding type.
+            phoneme_probability (Optional[float]): Probability (0. < p < 1.) that each word is phonemized; otherwise the word is returned unchanged as graphemes. Defaults to None, which is the same as 1.
         """
         phoneme_dict = (
             self._parse_as_cmu_dict(phoneme_dict, encoding)
@@ -91,6 +95,8 @@ def __init__(
             if isinstance(heteronyms, str) or isinstance(heteronyms, pathlib.Path)
             else heteronyms
         )
+        self.phoneme_probability = phoneme_probability
+        self._rng = random.Random()
 
     @staticmethod
     def _parse_as_cmu_dict(phoneme_dict_path=None, encoding='latin-1'):
@@ -163,6 +169,9 @@ def parse_one_word(self, word: str):
         `status` will be `False` if word wasn't handled, `True` otherwise.
         """
 
+        if self.phoneme_probability is not None and self._rng.random() > self.phoneme_probability:
+            return word, True
+
         # punctuation
         if re.search("[a-zA-Z]", word) is None:
             return list(word), True

diff --git a/nemo/collections/tts/torch/tts_tokenizers.py b/nemo/collections/tts/torch/tts_tokenizers.py
@@ -282,6 +282,9 @@ def __init__(
              Note that lower() function shouldn't applied here, because text can contains phonemes (it will be handled by g2p).
         """
 
+        self.phoneme_probability = None
+        if hasattr(g2p, "phoneme_probability"):
+            self.phoneme_probability = g2p.phoneme_probability
         tokens = []
         self.space, tokens = len(tokens), tokens + [space]  # Space
 
@@ -295,7 +298,12 @@ def __init__(
             vowels = [f'{p}{s}' for p, s in itertools.product(vowels, (0, 1, 2))]
         tokens.extend(vowels)
 
-        if chars:
+        if chars or self.phoneme_probability is not None:
+            if not chars:
+                logging.warning(
+                    "phoneme_probability was not None, characters will be enabled even though "
+                    "chars was set to False."
+                )
             tokens.extend(string.ascii_lowercase)
 
         if apostrophe:
@@ -308,7 +316,7 @@ def __init__(
 
         super().__init__(tokens, oov=oov, sep=sep, add_blank_at=add_blank_at)
 
-        self.chars = chars
+        self.chars = chars if self.phoneme_probability is None else True
         self.punct = punct
         self.stresses = stresses
         self.pad_with_space = pad_with_space
@@ -321,7 +329,7 @@ def encode(self, text):
         ps, space, tokens = [], self.tokens[self.space], set(self.tokens)
 
         text = self.text_preprocessing_func(text)
-        g2p_text = self.g2p(text)
+        g2p_text = self.g2p(text)  # TODO: handle infer
 
         for p in g2p_text:  # noqa
             # Remove stress
-Original file line number
+Diff line change
@@ Expand Up / @@ -2017,7 +2017,7 @@ pipeline { @@
           }
         }
         stage('L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval') {
           when {
             anyOf {
@@ Expand Down @@