Merge branch 'main' into multilang_asr_models
titu1994 authored Mar 31, 2022
2 parents f4470dd + ca8a7e0 commit 98db7d1
Showing 17 changed files with 92 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/changelog-build.yml
@@ -25,7 +25,7 @@ jobs:
      - name: Build Changelog
        id: github_tag
-       uses: mikepenz/release-changelog-builder-action@v2.8.0
+       uses: mikepenz/release-changelog-builder-action@v2.9.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
18 changes: 9 additions & 9 deletions .github/workflows/config/changelog-config.json
@@ -1,43 +1,43 @@
{
  "categories": [
    {
-     "title": "## ASR",
+     "title": "## ASR \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["asr"]
    },
    {
-     "title": "## TTS",
+     "title": "## TTS \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["tts"]
    },
    {
-     "title": "## NLP / NMT",
+     "title": "## NLP / NMT \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["nlp", "nmt", "megatron"]
    },
    {
-     "title": "## Text Normalization / Inverse Text Normalization",
+     "title": "## Text Normalization / Inverse Text Normalization \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["tn", "itn"]
    },
    {
-     "title": "## NeMo Tools",
+     "title": "## NeMo Tools \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["tools"]
    },
    {
-     "title": "## Export",
+     "title": "## Export \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["export"]
    },
    {
-     "title": "## Documentation",
+     "title": "## Documentation \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["docs"]
    },
    {
-     "title": "## Bugfixes",
+     "title": "## Bugfixes \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["bug"]
    }
  ],
  "ignore_labels": [
    "ignore"
  ],
  "sort": "ASC",
- "template": "${{CHANGELOG}}\nUncategorized:\n${{UNCATEGORIZED}}\n\n",
+ "template": "\n${{CHANGELOG}}\nUncategorized:\n${{UNCATEGORIZED}}\n\n",
  "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
  "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
  "label_extractor": [
2 changes: 1 addition & 1 deletion Dockerfile
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:22.02-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:22.03-py3


# build an image that includes only the nemo dependencies, ensures that dependencies
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -1,7 +1,7 @@
pipeline {
  agent {
    docker {
-     image 'gitlab-master.nvidia.com/sandeepsub/nemo_containers:nemo-180-2202-ci-apex-2998a235292bdf1c7de1500ff3baaa96732b6a4f'
+     image 'gitlab-master.nvidia.com/sandeepsub/nemo_containers:nemo-180-2203-ci-apex-8cc91ceaa8faa64451d90e11b8ad4732393b32aa'
      args '--device=/dev/nvidia0 --gpus all -e TRANSFORMERS_OFFLINE=1 --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g'
    }
  }
6 changes: 3 additions & 3 deletions README.rst
@@ -182,7 +182,7 @@ Megatron GPT training requires NVIDIA Apex to be installed.
    git clone https://github.com/NVIDIA/apex
    cd apex
-   git checkout 89edb8196546187247c487a0494f7b6767ff0dc5
+   git checkout 8cc91ceaa8faa64451d90e11b8ad4732393b32aa
    pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./

Docker containers:
@@ -194,13 +194,13 @@ To build a nemo container with Dockerfile from a branch, please run
    DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest .

-If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 22.01-py3 and then installing from GitHub.
+If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 22.03-py3 and then installing from GitHub.

.. code-block:: bash

    docker run --gpus all -it --rm -v <nemo_github_folder>:/NeMo --shm-size=8g \
    -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \
-   stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:22.02-py3
+   stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:22.03-py3
Examples
--------
30 changes: 30 additions & 0 deletions nemo/collections/common/tokenizers/en_ja_tokenizers.py
@@ -11,8 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import re
from typing import List

+import ipadic
+import MeCab
+from pangu import spacing
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
@@ -55,3 +59,29 @@ def normalize(self, text) -> str:
            return self.normalizer.normalize(text)
        else:
            return text
+
+
+class JaMecabProcessor:
+    """
+    Tokenizer, Detokenizer and Normalizer utilities for Japanese & English, based on MeCab.
+    """
+
+    def __init__(self):
+        self.mecab_tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS + " -Owakati")
+
+    def detokenize(self, text: List[str]) -> str:
+        RE_WS_IN_FW = re.compile(
+            r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])'
+        )
+
+        detokenize = lambda s: spacing(RE_WS_IN_FW.sub(r'\1', s)).strip()
+        return detokenize(' '.join(text))
+
+    def tokenize(self, text) -> str:
+        """
+        Tokenizes text using MeCab. Returns a string of tokens.
+        """
+        return self.mecab_tokenizer.parse(text).strip()
+
+    def normalize(self, text) -> str:
+        return text
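A minimal usage sketch of the new `JaMecabProcessor` (assumes the `mecab-python3`, `ipadic`, and `pangu` packages behind the imports above are installed; the example outputs in the comments are illustrative):

```python
from nemo.collections.common.tokenizers.en_ja_tokenizers import JaMecabProcessor

processor = JaMecabProcessor()

# MeCab's "-Owakati" mode segments Japanese into space-separated tokens.
tokens = processor.tokenize("私は猫が好きです。")  # e.g. "私 は 猫 が 好き です 。"

# detokenize() removes the whitespace between fullwidth (CJK) characters again,
# then pangu re-balances spacing between CJK and Latin text.
restored = processor.detokenize(tokens.split())  # e.g. "私は猫が好きです。"
```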
13 changes: 6 additions & 7 deletions nemo/collections/nlp/data/glue_benchmark/gpt_ptune_dataset.py
@@ -21,16 +21,15 @@
import functools
import json
import re
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional

import numpy as np
import torch
-from sympy import substitution

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
-from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import (
-    make_attention_mask_3d,
-    make_history_mask_3d,
+from nemo.collections.nlp.modules.common.megatron.utils import (
+    make_inference_attention_mask_3d,
+    make_inference_history_mask_3d,
)
from nemo.core.classes import Dataset
from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType, RegressionValuesType
@@ -369,8 +368,8 @@ def collate_fn(self, batch):
        label_position = torch.cat([label_start.unsqueeze(1), label_position.unsqueeze(1)], 1)
        loss_mask[labels == SMALL_NUM] = 0

-       input_attn_mask = make_attention_mask_3d(enc_input, enc_input, self.pad_id)
-       input_attn_mask = (input_attn_mask * make_history_mask_3d(enc_input)).long()
+       input_attn_mask = make_inference_attention_mask_3d(enc_input, enc_input, self.pad_id)
+       input_attn_mask = (input_attn_mask * make_inference_history_mask_3d(enc_input)).long()

        return {
            'enc_input': enc_input,
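The renamed helpers now come from `nemo.collections.nlp.modules.common.megatron.utils`. As a rough, self-contained sketch of what such 3D inference masks compute (the function bodies below are reconstructed for illustration and are not NeMo's actual implementations):

```python
import torch

def pad_attention_mask_3d(source, target, pad_id):
    # [batch, target_len, source_len]: True where query and key positions
    # both hold real (non-padding) tokens.
    return (target != pad_id).unsqueeze(2) & (source != pad_id).unsqueeze(1)

def history_mask_3d(tokens):
    # Causal (lower-triangular) mask: position i may attend only to j <= i.
    batch, length = tokens.shape
    causal = torch.tril(torch.ones(length, length, dtype=torch.bool, device=tokens.device))
    return causal.unsqueeze(0).expand(batch, -1, -1)

enc_input = torch.tensor([[11, 12, 13, 0]])  # 0 = padding
input_attn_mask = (pad_attention_mask_3d(enc_input, enc_input, pad_id=0)
                   & history_mask_3d(enc_input)).long()
# Row i of input_attn_mask[0] allows positions 0..i, with the padded column zeroed out.
```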
nemo/collections/nlp/models/language_modeling/megatron_ptune_gpt_model.py
@@ -21,15 +21,13 @@
from torch import Tensor

from nemo.collections.nlp.data.glue_benchmark.gpt_ptune_dataset import GPTPTuneDataset, GPTPTuneInferenceDataset
-from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import (
-    make_attention_mask_3d,
-    make_history_mask_3d,
-)
from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.utils import (
    average_losses_across_data_parallel_group,
    build_position_ids,
+    make_inference_attention_mask_3d,
+    make_inference_history_mask_3d,
)
from nemo.collections.nlp.modules.common.prompt_encoder import PromptEncoder
from nemo.utils import logging
@@ -238,8 +236,10 @@ def decode(self, enc_query, enc_taskname, label_position, num_tokens_to_generate
        label_start = label_position[:, 0].clone()

        for _ in range(num_tokens_to_generate):
-           attn_mask = make_attention_mask_3d(predicted_tokens_dec, predicted_tokens_dec, self.pad_token_id)
-           attn_mask = attn_mask * make_history_mask_3d(predicted_tokens_dec)
+           attn_mask = make_inference_attention_mask_3d(
+               predicted_tokens_dec, predicted_tokens_dec, self.pad_token_id
+           )
+           attn_mask = attn_mask * make_inference_history_mask_3d(predicted_tokens_dec)

            attn_mask = attn_mask < 0.5
nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py
@@ -34,7 +34,7 @@
from nemo.collections.common.parts import transformer_weights_init
from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelProcessor
from nemo.collections.common.tokenizers.chinese_tokenizers import ChineseProcessor
-from nemo.collections.common.tokenizers.en_ja_tokenizers import EnJaProcessor
+from nemo.collections.common.tokenizers.en_ja_tokenizers import EnJaProcessor, JaMecabProcessor
from nemo.collections.common.tokenizers.indic_tokenizers import IndicProcessor
from nemo.collections.common.tokenizers.moses_tokenizers import MosesProcessor
from nemo.collections.nlp.data import TarredTranslationDataset, TranslationDataset
@@ -427,9 +427,13 @@ def eval_epoch_end(self, outputs, mode, global_rank):
                _translations += [t for (t, g) in tr_and_gt[rank]]
                _ground_truths += [g for (t, g) in tr_and_gt[rank]]

-           if self.tgt_language in ['ja']:
+           if self.multilingual and isinstance(self.tgt_language, ListConfig):
+               tgt_language = self.tgt_language[dataloader_idx]
+           else:
+               tgt_language = self.tgt_language
+           if tgt_language in ['ja', 'ja-mecab']:
                sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="ja-mecab")
-           elif self.tgt_language in ['zh']:
+           elif tgt_language in ['zh']:
                sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="zh")
            else:
                sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="13a")
@@ -862,7 +866,9 @@ def setup_pre_and_post_processing_utils(
        if encoder_tokenizer_library == 'byte-level':
            source_processor = ByteLevelProcessor()
        elif (source_lang == 'en' and target_lang == 'ja') or (source_lang == 'ja' and target_lang == 'en'):
-           source_processor = EnJaProcessor(source_lang)
+           self.source_processor = EnJaProcessor(source_lang)
+       elif source_lang == 'ja-mecab':
+           self.source_processor = JaMecabProcessor()
        elif source_lang == 'zh':
            source_processor = ChineseProcessor()
        elif source_lang == 'hi':
@@ -875,7 +881,9 @@
        if decoder_tokenizer_library == 'byte-level':
            target_processor = ByteLevelProcessor()
        elif (source_lang == 'en' and target_lang == 'ja') or (source_lang == 'ja' and target_lang == 'en'):
-           target_processor = EnJaProcessor(target_lang)
+           self.target_processor = EnJaProcessor(target_lang)
+       elif target_lang == 'ja-mecab':
+           self.target_processor = JaMecabProcessor()
        elif target_lang == 'zh':
            target_processor = ChineseProcessor()
        elif target_lang == 'hi':
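The per-dataloader lookup above boils down to choosing a sacrebleu tokenizer per target language; a condensed sketch of that selection logic (assumes `sacrebleu` with its optional Japanese dependencies is installed; the helper name is hypothetical):

```python
from sacrebleu import corpus_bleu

def eval_bleu(translations, ground_truths, tgt_language):
    if tgt_language in ('ja', 'ja-mecab'):
        tokenize = 'ja-mecab'  # Japanese has no spaces; segment with MeCab first
    elif tgt_language == 'zh':
        tokenize = 'zh'        # character-level tokenization for Chinese
    else:
        tokenize = '13a'       # sacrebleu's default for space-delimited languages
    return corpus_bleu(translations, [ground_truths], tokenize=tokenize)
```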
4 changes: 4 additions & 0 deletions nemo_text_processing/text_normalization/normalize.py
@@ -207,6 +207,10 @@ def normalize(
        Returns: spoken form
        """
+       assert (
+           len(text.split()) < 500
+       ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"
+
        original_text = text
        if punct_pre_process:
            text = pre_process(text)
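Callers with long inputs now have to chunk before calling `normalize`; a minimal caller-side sketch (the helper below is hypothetical and uses naive whitespace chunking, where splitting on sentence boundaries, as the error message suggests, would preserve more context):

```python
def normalize_long_text(normalizer, text: str, max_words: int = 400) -> str:
    # Hypothetical helper: split into chunks safely under the 500-word limit,
    # normalize each chunk, and rejoin the spoken forms.
    words = text.split()
    chunks = (' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words))
    return ' '.join(normalizer.normalize(chunk) for chunk in chunks)
```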
nemo_text_processing/text_normalization/normalize_with_audio.py
@@ -124,6 +124,10 @@ def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, v
        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """
+
+       assert (
+           len(text.split()) < 500
+       ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"
        original_text = text

        if self.lang == "en":
3 changes: 2 additions & 1 deletion reinstall.sh
@@ -20,7 +20,8 @@ ${PIP} uninstall -y nemo_cv
${PIP} install -U setuptools

# TODO: check if we need this for 22.03
-if [ "${NVIDIA_PYTORCH_VERSION}" = "22.01" ] || [ "${NVIDIA_PYTORCH_VERSION}" = "22.02" ]
+if [ "${NVIDIA_PYTORCH_VERSION}" = "22.01" ] || [ "${NVIDIA_PYTORCH_VERSION}" = "22.02" ] || [ "${NVIDIA_PYTORCH_VERSION}" = "22.03" ]
+
then
echo 'Installing NeMo in NVIDIA PyTorch container:' ${NVIDIA_PYTORCH_VERSION} 'so will not install numba'
else
2 changes: 2 additions & 0 deletions tests/collections/asr/test_asr_datasets.py
@@ -164,6 +164,7 @@ def test_tarred_bpe_dataset(self, test_data_dir):
            count += 1
        assert count == 32

+   @pytest.mark.pleasefixme
    @pytest.mark.skipif(not HAVE_DALI, reason="NVIDIA DALI is not installed or incompatible version")
    @pytest.mark.unit
    def test_dali_char_dataset(self, test_data_dir):
@@ -413,6 +414,7 @@ def test_dali_char_vs_ref_dataset(self, test_data_dir):
        assert np.mean(err) < 0.0001
        assert np.max(err) < 0.01

+   @pytest.mark.pleasefixme
    @pytest.mark.skipif(not HAVE_DALI, reason="NVIDIA DALI is not installed or incompatible version")
    @pytest.mark.unit
    def test_tarred_dali_char_dataset(self, test_data_dir):
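`pleasefixme` is a custom pytest marker used here to park known-broken tests. For this to work without warnings, the marker presumably has to be registered and then deselected in CI; a hedged sketch of that wiring (the exact config location in NeMo's CI is not shown in this diff):

```python
# conftest.py — hypothetical registration so pytest recognizes the custom marker.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "pleasefixme: test is known to be broken and is deselected until fixed"
    )
```

CI would then run `pytest -m "not pleasefixme"` to skip the quarantined tests while keeping them in the tree.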
6 changes: 4 additions & 2 deletions tests/collections/nlp/test_huggingface.py
@@ -49,7 +49,8 @@ def test_get_pretrained_bert_model(self):
        self.omega_conf.language_model.pretrained_model_name = 'bert-base-uncased'
        model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf)
        assert isinstance(model, nemo_nlp.modules.BertEncoder)
-       do_export(model, "bert-base-uncased")
+       # TODO: Fix
+       # do_export(model, "bert-base-uncased")

    @pytest.mark.with_downloads()
    @pytest.mark.unit
@@ -73,7 +74,8 @@ def test_get_pretrained_albert_model(self):
        self.omega_conf.language_model.pretrained_model_name = 'albert-base-v1'
        model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf)
        assert isinstance(model, nemo_nlp.modules.AlbertEncoder)
-       do_export(model, "albert-base-v1")
+       # TODO: fix
+       # do_export(model, "albert-base-v1")

    @pytest.mark.with_downloads()
    @pytest.mark.unit
2 changes: 2 additions & 0 deletions tests/collections/nlp/test_nlp_exportables.py
@@ -99,6 +99,7 @@ def test_IntentSlotClassificationModel_export_to_onnx(self, dummy_data):
        assert onnx_model.graph.output[0].name == 'intent_logits'
        assert onnx_model.graph.output[1].name == 'slot_logits'

+   @pytest.mark.pleasefixme
    @pytest.mark.with_downloads()
    @pytest.mark.run_only_on('GPU')
    @pytest.mark.unit
@@ -129,6 +130,7 @@ def test_PunctuationCapitalizationModel_export_to_onnx(self):
        assert onnx_model.graph.output[0].name == 'punct_logits'
        assert onnx_model.graph.output[1].name == 'capit_logits'

+   @pytest.mark.pleasefixme
    @pytest.mark.with_downloads()
    @pytest.mark.run_only_on('GPU')
    @pytest.mark.unit
1 change: 1 addition & 0 deletions tests/collections/tts/test_tts_exportables.py
@@ -39,6 +39,7 @@ def hifigan_model():


class TestExportable:
+   @pytest.mark.pleasefixme
    @pytest.mark.run_only_on('GPU')
    @pytest.mark.unit
    def test_FastPitchModel_export_to_onnx(self, fastpitch_model):
14 changes: 3 additions & 11 deletions tutorials/asr/ASR_with_NeMo.ipynb
@@ -25,15 +25,6 @@
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.7"
-   },
-   "pycharm": {
-     "stem_cell": {
-       "cell_type": "raw",
-       "source": [],
-       "metadata": {
-         "collapsed": false
-       }
-     }
    }
  },
  "cells": [
@@ -801,6 +792,7 @@
    "# Setup the test data loader and make sure the model is on GPU\n",
    "first_asr_model.setup_test_data(test_data_config=params['model']['validation_ds'])\n",
    "first_asr_model.cuda()\n",
+   "first_asr_model.eval()\n",
    "\n",
    "# We will be computing Word Error Rate (WER) metric between our hypothesis and predictions.\n",
    "# WER is computed as numerator/denominator.\n",
@@ -967,7 +959,7 @@
    "trainer = pl.Trainer(amp_level='O1', precision=16)\n",
    "\n",
    "# Trainer with a distributed backend:\n",
-   "trainer = pl.Trainer(devices=2, num_nodes=2, accelerator='gpu', strategy='dp')\n",
+   "trainer = pl.Trainer(devices=2, num_nodes=2, accelerator='gpu', strategy='ddp')\n",
    "\n",
    "# Of course, you can combine these flags as well.\n",
    "```\n",
@@ -1173,4 +1165,4 @@
    "outputs": []
    }
  ]
-}
\ No newline at end of file
+}
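The added `first_asr_model.eval()` call matters because evaluation mode freezes dropout (and batch-norm statistics), making repeated WER measurements deterministic; a generic PyTorch illustration of this effect, not NeMo-specific:

```python
import torch

# A toy model with dropout, standing in for the ASR model in the notebook.
model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Dropout(p=0.5))
x = torch.ones(1, 4)

model.train()
a, b = model(x), model(x)      # generally differ: dropout is active

model.eval()
with torch.no_grad():
    c, d = model(x), model(x)  # identical: dropout is disabled
assert torch.equal(c, d)
```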
