Commit

Merge branch 'main' into parallel_prompt_tuning
vadam5 committed Feb 16, 2022
2 parents 3cfd023 + b5012d0 commit a31a264
Showing 16 changed files with 121 additions and 78 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
@@ -81,9 +81,9 @@ RUN --mount=from=nemo-src,target=/tmp/nemo cd /tmp/nemo && pip install ".[all]"
python -c "import nemo.collections.tts as nemo_tts" && \
python -c "import nemo_text_processing.text_normalization as text_normalization"

# TODO: Try to remove once 21.07 container is the base container
# TODO: Update to newer numba 0.56.0RC1 for 22.02 container
# install pinned numba version
RUN conda install -c conda-forge numba=0.54.1
# RUN conda install -c conda-forge numba==0.54.1

# copy scripts/examples/tests into container for end user
WORKDIR /workspace/nemo
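
The hunk above adjusts the pinned-numba section: the TODO now points at numba 0.56.0RC1 for the 22.02 container, and one of the two conda install lines is commented out. A quick, hypothetical sanity check in the spirit of the python -c "import ..." probes earlier in the Dockerfile (the printed version is an assumption, not something the commit asserts):

    import numba
    print(numba.__version__)  # e.g. 0.54.1 under the old pin; 0.56.x once the 22.02 TODO lands
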
12 changes: 7 additions & 5 deletions Jenkinsfile
@@ -718,7 +718,7 @@ pipeline {
parallel {
stage('SGD-GEN') {
steps {
sh 'cd examples/nlp/dialogue_state_tracking_generative && \
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue_state_tracking_generative && \
python sgd_gen.py \
model.dataset.data_dir=/home/TestData/nlp/sgd_small \
model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\
@@ -734,16 +734,18 @@
trainer.val_check_interval=0.0 \
trainer.gpus=[0] \
model.dataset.use_cache=false \
model.tokenizer.special_tokens={pad_token:"endoftext"}\
model.language_model.pretrained_model_name=gpt2 \
model.tokenizer.special_tokens={pad_token:"endoftext"} \
model.tokenizer.tokenizer_name=gpt2 \
model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\
model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \
trainer.accelerator=ddp \
exp_manager=null && \
rm -rf sgd_gen_outputs'
}
}
stage('SGD-GEN Backward compatible with SGDQA') {
steps {
sh 'cd examples/nlp/dialogue_state_tracking_generative && \
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/nlp/dialogue_state_tracking_generative && \
python sgd_gen.py \
model.dataset.data_dir=/home/TestData/nlp/sgd_small \
model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \
@@ -760,7 +762,7 @@
model.language_model.pretrained_model_name=bert-base-cased \
trainer.accelerator=ddp \
exp_manager=null && \
rm -rf sgd_gen_bert_outputs'
rm -rf sgd_gen_bert_outputs && TRANSFORMERS_OFFLINE=1'
}
}
}
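
Both SGD-GEN stages now run with TRANSFORMERS_OFFLINE=0, so the Hugging Face loaders invoked by sgd_gen.py are not forced into offline mode, and the GPT-2 stage switches to a tokenizer_name/vocab_file pair plus a local pretrained_model_name under /home/TestData/nlp/gpt2. A rough sketch of what that combination means on the CI box, assuming the directory holds the usual GPT-2 assets (vocab.json, merges, config, pytorch_model.bin); the exact calls NeMo makes internally are not shown in this diff:

    import os
    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    os.environ["TRANSFORMERS_OFFLINE"] = "0"  # permit, but do not require, Hugging Face Hub access
    tokenizer = GPT2Tokenizer.from_pretrained("/home/TestData/nlp/gpt2")  # or simply "gpt2" when online
    model = GPT2LMHeadModel.from_pretrained("/home/TestData/nlp/gpt2")
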
examples/nlp/dialogue_state_tracking_generative/conf/dialogue_config.yaml
@@ -29,10 +29,12 @@ trainer:
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it
checkpoint_callback: False # Provided by exp_manager
logger: False # Provided by exp_manager

model:
tensor_model_parallel_size: 1
nemo_path: null # filename to save the model and associated artifacts to .nemo file
library: huggingface # huggingface or megatron
tokenizer:
tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece
vocab_file: null # path to vocab file
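
The config hunk above (presumably conf/dialogue_config.yaml, given the @hydra_runner(config_path="conf", config_name="dialogue_config") decorator later in this diff) adds two model keys: tensor_model_parallel_size and library. A minimal, illustrative sketch of how a training script might branch on them; the key names mirror the YAML, everything else is assumed:

    from omegaconf import OmegaConf

    cfg = OmegaConf.create({"model": {"tensor_model_parallel_size": 1, "library": "huggingface"}})
    if cfg.model.library == "megatron":
        pass  # load a Megatron-LM checkpoint, possibly sharded across tensor-parallel ranks
    elif cfg.model.library == "huggingface":
        pass  # load a Hugging Face language model
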
15 changes: 13 additions & 2 deletions examples/nlp/dialogue_state_tracking_generative/sgd_gen.py
@@ -106,18 +106,29 @@

from nemo.collections.nlp.models.dialogue_state_tracking_generative.dialogue_gpt_model import DialogueGPTModel
from nemo.collections.nlp.models.dialogue_state_tracking_sgdqa.sgdqa_model import SGDQAModel
from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.app_state import AppState
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="dialogue_config")
def main(cfg: DictConfig) -> None:
pl.seed_everything(42)
logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
trainer = pl.Trainer(**cfg.trainer)

plugin = NLPDDPPlugin()
trainer = pl.Trainer(**cfg.trainer, plugins=plugin)

exp_manager(trainer, cfg.get("exp_manager", None))

app_state = AppState()
if cfg.model.tensor_model_parallel_size > 1:
app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

if 'bert' in cfg.model.language_model.pretrained_model_name:
model_class = SGDQAModel
elif 'gpt' in cfg.model.language_model.pretrained_model_name.lower():
@@ -155,7 +166,7 @@ def main(cfg: DictConfig) -> None:

if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
gpu = 1 if cfg.trainer.gpus != 0 else 0
trainer = pl.Trainer(gpus=gpu)
trainer = pl.Trainer(gpus=gpu, plugins=plugin, precision=16)
model.setup_multiple_test_data(test_data_config=cfg.model.test_ds)
if model.prepare_test(trainer):
trainer.test(model)
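
main() now constructs the trainer with an NLPDDPPlugin and, when tensor_model_parallel_size > 1, records the model-parallel size and rank on AppState via compute_model_parallel_rank; the later test-only trainer reuses the same plugin and runs at 16-bit precision. As a back-of-the-envelope illustration of the rank bookkeeping (the modulo rule below is an assumption about how consecutive local ranks are grouped, not a quote of NeMo's implementation):

    def model_parallel_rank(local_rank: int, model_parallel_size: int) -> int:
        # assumed grouping: consecutive GPUs on a node form one tensor-parallel group
        return local_rank % model_parallel_size

    # e.g. with tensor_model_parallel_size=2 on a 4-GPU node,
    # local ranks 0, 1, 2, 3 map to model-parallel ranks 0, 1, 0, 1
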
7 changes: 4 additions & 3 deletions nemo/collections/asr/parts/preprocessing/features.py
@@ -169,8 +169,8 @@ def inverse(self, magnitude, phase):

if self.window is not None:
window_sum = librosa.filters.window_sumsquare(
self.window,
magnitude.size(-1),
window=self.window,
n_frames=magnitude.size(-1),
hop_length=self.hop_length,
win_length=self.win_length,
n_fft=self.filter_length,
@@ -302,7 +302,8 @@ def __init__(
highfreq = highfreq or sample_rate / 2

filterbanks = torch.tensor(
librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq), dtype=torch.float
librosa.filters.mel(sr=sample_rate, n_fft=self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq),
dtype=torch.float,
).unsqueeze(0)
self.register_buffer("fb", filterbanks)

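Both librosa calls above (window_sumsquare and filters.mel) move from positional to keyword arguments; librosa 0.9 made most of these parameters keyword-only, so the old positional form raises a TypeError. A small standalone sketch with made-up parameter values:

    import librosa

    # one row per mel filter, n_fft // 2 + 1 frequency bins per row
    mel_fb = librosa.filters.mel(sr=16000, n_fft=512, n_mels=80, fmin=0.0, fmax=8000.0)
    print(mel_fb.shape)  # (80, 257)
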
4 changes: 3 additions & 1 deletion nemo/collections/asr/parts/preprocessing/perturb.py
@@ -162,7 +162,9 @@ def perturb(self, data):
return

new_sr = int(self._sr * speed_rate)
data._samples = librosa.core.resample(data._samples, self._sr, new_sr, res_type=self._res_type)
data._samples = librosa.core.resample(
data._samples, orig_sr=self._sr, target_sr=new_sr, res_type=self._res_type
)


class TimeStretchPerturbation(Perturbation):
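
The speed-perturbation resample call follows the same pattern, passing the sampling rates as orig_sr/target_sr keywords. A hypothetical call with values in the spirit of the surrounding code:

    import numpy as np
    import librosa

    samples = np.zeros(16000, dtype=np.float32)  # 1 s of audio at 16 kHz
    slowed = librosa.core.resample(samples, orig_sr=16000, target_sr=int(16000 * 0.9))
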
4 changes: 2 additions & 2 deletions nemo/collections/asr/parts/preprocessing/segment.py
@@ -72,10 +72,10 @@ def __init__(self, samples, sample_rate, target_sr=None, trim=False, trim_db=60,
"""
samples = self._convert_samples_to_float32(samples)
if target_sr is not None and target_sr != sample_rate:
samples = librosa.core.resample(samples, sample_rate, target_sr)
samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
sample_rate = target_sr
if trim:
samples, _ = librosa.effects.trim(samples, trim_db)
samples, _ = librosa.effects.trim(samples, top_db=trim_db)
self._samples = samples
self._sample_rate = sample_rate
if self._samples.ndim >= 2:
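
Likewise, librosa.effects.trim now receives the silence threshold as the top_db keyword rather than positionally. A brief illustrative call (values assumed):

    import numpy as np
    import librosa

    audio = np.concatenate([np.zeros(8000), 0.1 * np.random.randn(16000), np.zeros(8000)]).astype(np.float32)
    trimmed, index = librosa.effects.trim(audio, top_db=60)  # index holds the [start, end] samples kept
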
20 changes: 8 additions & 12 deletions nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py
@@ -281,11 +281,11 @@ def _greedy_decode(
# out_len: [seq_len]

# Initialize blank state and empty label set in Hypothesis
hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[])
hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None)

if partial_hypotheses is not None:
if len(partial_hypotheses.y_sequence) > 0:
hypothesis.y_sequence.append(partial_hypotheses.y_sequence[-1].cpu().numpy())
hypothesis.last_token = partial_hypotheses.last_token
if partial_hypotheses.dec_state is not None:
hypothesis.dec_state = self.decoder.batch_concat_states([partial_hypotheses.dec_state])
hypothesis.dec_state = _states_to_device(hypothesis.dec_state, x.device)

@@ -308,11 +308,10 @@ def _greedy_decode(
while not_blank and (self.max_symbols is None or symbols_added < self.max_symbols):
# In the first timestep, we initialize the network with RNNT Blank
# In later timesteps, we provide previous predicted label as input.
last_label = (
self._SOS
if (hypothesis.y_sequence == [] and hypothesis.dec_state is None)
else hypothesis.y_sequence[-1]
)
if hypothesis.last_token is None and hypothesis.dec_state is None:
last_label = self._SOS
else:
last_label = label_collate([[hypothesis.last_token]])

# Perform prediction network and joint network steps.
g, hidden_prime = self._pred_step(last_label, hypothesis.dec_state)
@@ -347,6 +346,7 @@
hypothesis.score += float(v)
hypothesis.timestep.append(time_idx)
hypothesis.dec_state = hidden_prime
hypothesis.last_token = k

# Increment token counter.
symbols_added += 1
@@ -359,10 +359,6 @@
# Unpack the hidden states
hypothesis.dec_state = self.decoder.batch_select_state(hypothesis.dec_state, 0)

# Remove the original input label if partial hypothesis was provided
if partial_hypotheses is not None:
hypothesis.y_sequence = hypothesis.y_sequence[1:]

return hypothesis


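The greedy RNNT decoder now carries the last emitted token on the hypothesis itself (last_token) instead of peeking at y_sequence[-1], so resuming from a partial hypothesis no longer requires seeding y_sequence and stripping that seed at the end. The sketch below is illustrative only, with a simplified dataclass standing in for rnnt_utils.Hypothesis:

    from dataclasses import dataclass, field
    from typing import Any, List, Optional

    @dataclass
    class Hypothesis:  # simplified stand-in for rnnt_utils.Hypothesis
        score: float = 0.0
        y_sequence: List[int] = field(default_factory=list)
        dec_state: Optional[Any] = None
        timestep: List[int] = field(default_factory=list)
        last_token: Optional[int] = None

    def next_decoder_input(hyp: Hypothesis, sos_id: int) -> int:
        # first step: feed the RNNT blank/SOS token; afterwards, feed the last emitted token
        if hyp.last_token is None and hyp.dec_state is None:
            return sos_id
        return hyp.last_token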