Merge branch 'main' into multilang_asr_models
titu1994 authored Mar 31, 2022
2 parents f4470dd + ca8a7e0 commit 98db7d1
Showing 17 changed files with 92 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/changelog-build.yml
@@ -25,7 +25,7 @@ jobs:
      - name: Build Changelog
        id: github_tag
-       uses: mikepenz/release-changelog-builder-action@v2.8.0
+       uses: mikepenz/release-changelog-builder-action@v2.9.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
18 changes: 9 additions & 9 deletions .github/workflows/config/changelog-config.json
@@ -1,43 +1,43 @@
{
  "categories": [
    {
-     "title": "## ASR",
+     "title": "## ASR \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["asr"]
    },
    {
-     "title": "## TTS",
+     "title": "## TTS \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["tts"]
    },
    {
-     "title": "## NLP / NMT",
+     "title": "## NLP / NMT \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["nlp", "nmt", "megatron"]
    },
    {
-     "title": "## Text Normalization / Inverse Text Normalization",
+     "title": "## Text Normalization / Inverse Text Normalization \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["tn", "itn"]
    },
    {
-     "title": "## NeMo Tools",
+     "title": "## NeMo Tools \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["tools"]
    },
    {
-     "title": "## Export",
+     "title": "## Export \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["export"]
    },
    {
-     "title": "## Documentation",
+     "title": "## Documentation \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["docs"]
    },
    {
-     "title": "## Bugfixes",
+     "title": "## Bugfixes \n\n<details><summary>Changelog</summary>\n\n</details>\n\n",
      "labels": ["bug"]
    }
  ],
  "ignore_labels": [
    "ignore"
  ],
  "sort": "ASC",
- "template": "${{CHANGELOG}}\nUncategorized:\n${{UNCATEGORIZED}}\n\n",
+ "template": "\n${{CHANGELOG}}\nUncategorized:\n${{UNCATEGORIZED}}\n\n",
  "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
  "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
  "label_extractor": [
2 changes: 1 addition & 1 deletion Dockerfile
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:22.02-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:22.03-py3


# build an image that includes only the nemo dependencies, ensures that dependencies
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -1,7 +1,7 @@
pipeline {
  agent {
    docker {
-     image 'gitlab-master.nvidia.com/sandeepsub/nemo_containers:nemo-180-2202-ci-apex-2998a235292bdf1c7de1500ff3baaa96732b6a4f'
+     image 'gitlab-master.nvidia.com/sandeepsub/nemo_containers:nemo-180-2203-ci-apex-8cc91ceaa8faa64451d90e11b8ad4732393b32aa'
      args '--device=/dev/nvidia0 --gpus all -e TRANSFORMERS_OFFLINE=1 --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g'
    }
  }
6 changes: 3 additions & 3 deletions README.rst
@@ -182,7 +182,7 @@ Megatron GPT training requires NVIDIA Apex to be installed.
    git clone https://github.com/NVIDIA/apex
    cd apex
-   git checkout 89edb8196546187247c487a0494f7b6767ff0dc5
+   git checkout 8cc91ceaa8faa64451d90e11b8ad4732393b32aa
    pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./

Docker containers:
@@ -194,13 +194,13 @@ To build a nemo container with Dockerfile from a branch, please run
    DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest .

-If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 22.01-py3 and then installing from GitHub.
+If you choose to work with the main branch, we recommend using NVIDIA's PyTorch container version 22.03-py3 and then installing from GitHub.

.. code-block:: bash

    docker run --gpus all -it --rm -v <nemo_github_folder>:/NeMo --shm-size=8g \
    -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \
-   stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:22.02-py3
+   stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:22.03-py3
Examples
--------
30 changes: 30 additions & 0 deletions nemo/collections/common/tokenizers/en_ja_tokenizers.py
@@ -11,8 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import re
from typing import List

+import ipadic
+import MeCab
+from pangu import spacing
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
@@ -55,3 +59,29 @@ def normalize(self, text) -> str:
            return self.normalizer.normalize(text)
        else:
            return text
+
+
+class JaMecabProcessor:
+    """
+    Tokenizer, Detokenizer and Normalizer utilities for Japanese & English, based on MeCab.
+    """
+
+    def __init__(self):
+        self.mecab_tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS + " -Owakati")
+
+    def detokenize(self, text: List[str]) -> str:
+        RE_WS_IN_FW = re.compile(
+            r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])'
+        )
+
+        detokenize = lambda s: spacing(RE_WS_IN_FW.sub(r'\1', s)).strip()
+        return detokenize(' '.join(text))
+
+    def tokenize(self, text) -> str:
+        """
+        Tokenizes text using MeCab. Returns a string of tokens.
+        """
+        return self.mecab_tokenizer.parse(text).strip()
+
+    def normalize(self, text) -> str:
+        return text
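A minimal usage sketch of the new `JaMecabProcessor` (assumes the `mecab-python3`, `ipadic`, and `pangu` packages behind the imports above are installed; the example outputs in the comments are illustrative):

```python
from nemo.collections.common.tokenizers.en_ja_tokenizers import JaMecabProcessor

processor = JaMecabProcessor()

# MeCab's "-Owakati" mode segments Japanese into space-separated tokens.
tokens = processor.tokenize("私は猫が好きです。")  # e.g. "私 は 猫 が 好き です 。"

# detokenize() removes the whitespace between fullwidth (CJK) characters again,
# then pangu re-balances spacing between CJK and Latin text.
restored = processor.detokenize(tokens.split())  # e.g. "私は猫が好きです。"
```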
13 changes: 6 additions & 7 deletions nemo/collections/nlp/data/glue_benchmark/gpt_ptune_dataset.py
@@ -21,16 +21,15 @@
import functools
import json
import re
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional

import numpy as np
import torch
-from sympy import substitution

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
-from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import (
-    make_attention_mask_3d,
-    make_history_mask_3d,
+from nemo.collections.nlp.modules.common.megatron.utils import (
+    make_inference_attention_mask_3d,
+    make_inference_history_mask_3d,
)
from nemo.core.classes import Dataset
from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType, RegressionValuesType
@@ -369,8 +368,8 @@ def collate_fn(self, batch):
        label_position = torch.cat([label_start.unsqueeze(1), label_position.unsqueeze(1)], 1)
        loss_mask[labels == SMALL_NUM] = 0

-       input_attn_mask = make_attention_mask_3d(enc_input, enc_input, self.pad_id)
-       input_attn_mask = (input_attn_mask * make_history_mask_3d(enc_input)).long()
+       input_attn_mask = make_inference_attention_mask_3d(enc_input, enc_input, self.pad_id)
+       input_attn_mask = (input_attn_mask * make_inference_history_mask_3d(enc_input)).long()

        return {
            'enc_input': enc_input,
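The renamed helpers now come from `nemo.collections.nlp.modules.common.megatron.utils`. As a rough, self-contained sketch of what such 3D inference masks compute (the function bodies below are reconstructed for illustration and are not NeMo's actual implementations):

```python
import torch

def pad_attention_mask_3d(source, target, pad_id):
    # [batch, target_len, source_len]: True where query and key positions
    # both hold real (non-padding) tokens.
    return (target != pad_id).unsqueeze(2) & (source != pad_id).unsqueeze(1)

def history_mask_3d(tokens):
    # Causal (lower-triangular) mask: position i may attend only to j <= i.
    batch, length = tokens.shape
    causal = torch.tril(torch.ones(length, length, dtype=torch.bool, device=tokens.device))
    return causal.unsqueeze(0).expand(batch, -1, -1)

enc_input = torch.tensor([[11, 12, 13, 0]])  # 0 = padding
input_attn_mask = (pad_attention_mask_3d(enc_input, enc_input, pad_id=0)
                   & history_mask_3d(enc_input)).long()
# Row i of input_attn_mask[0] allows positions 0..i, with the padded column zeroed out.
```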
nemo/collections/nlp/models/language_modeling/megatron_ptune_gpt_model.py
@@ -21,15 +21,13 @@
from torch import Tensor

from nemo.collections.nlp.data.glue_benchmark.gpt_ptune_dataset import GPTPTuneDataset, GPTPTuneInferenceDataset
-from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import (
-    make_attention_mask_3d,
-    make_history_mask_3d,
-)
from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.utils import (
    average_losses_across_data_parallel_group,
    build_position_ids,
+    make_inference_attention_mask_3d,
+    make_inference_history_mask_3d,
)
from nemo.collections.nlp.modules.common.prompt_encoder import PromptEncoder
from nemo.utils import logging
@@ -238,8 +236,10 @@ def decode(self, enc_query, enc_taskname, label_position, num_tokens_to_generate
        label_start = label_position[:, 0].clone()

        for _ in range(num_tokens_to_generate):
-           attn_mask = make_attention_mask_3d(predicted_tokens_dec, predicted_tokens_dec, self.pad_token_id)
-           attn_mask = attn_mask * make_history_mask_3d(predicted_tokens_dec)
+           attn_mask = make_inference_attention_mask_3d(
+               predicted_tokens_dec, predicted_tokens_dec, self.pad_token_id
+           )
+           attn_mask = attn_mask * make_inference_history_mask_3d(predicted_tokens_dec)

            attn_mask = attn_mask < 0.5
nemo/collections/nlp/models/machine_translation/mt_enc_dec_model.py
@@ -34,7 +34,7 @@
from nemo.collections.common.parts import transformer_weights_init
from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelProcessor
from nemo.collections.common.tokenizers.chinese_tokenizers import ChineseProcessor
-from nemo.collections.common.tokenizers.en_ja_tokenizers import EnJaProcessor
+from nemo.collections.common.tokenizers.en_ja_tokenizers import EnJaProcessor, JaMecabProcessor
from nemo.collections.common.tokenizers.indic_tokenizers import IndicProcessor
from nemo.collections.common.tokenizers.moses_tokenizers import MosesProcessor
from nemo.collections.nlp.data import TarredTranslationDataset, TranslationDataset
@@ -427,9 +427,13 @@ def eval_epoch_end(self, outputs, mode, global_rank):
                _translations += [t for (t, g) in tr_and_gt[rank]]
                _ground_truths += [g for (t, g) in tr_and_gt[rank]]

-           if self.tgt_language in ['ja']:
+           if self.multilingual and isinstance(self.tgt_language, ListConfig):
+               tgt_language = self.tgt_language[dataloader_idx]
+           else:
+               tgt_language = self.tgt_language
+           if tgt_language in ['ja', 'ja-mecab']:
                sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="ja-mecab")
-           elif self.tgt_language in ['zh']:
+           elif tgt_language in ['zh']:
                sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="zh")
            else:
                sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="13a")
@@ -862,7 +866,9 @@ def setup_pre_and_post_processing_utils(
        if encoder_tokenizer_library == 'byte-level':
            source_processor = ByteLevelProcessor()
        elif (source_lang == 'en' and target_lang == 'ja') or (source_lang == 'ja' and target_lang == 'en'):
-           source_processor = EnJaProcessor(source_lang)
+           self.source_processor = EnJaProcessor(source_lang)
+       elif source_lang == 'ja-mecab':
+           self.source_processor = JaMecabProcessor()
        elif source_lang == 'zh':
            source_processor = ChineseProcessor()
        elif source_lang == 'hi':
@@ -875,7 +881,9 @@
        if decoder_tokenizer_library == 'byte-level':
            target_processor = ByteLevelProcessor()
        elif (source_lang == 'en' and target_lang == 'ja') or (source_lang == 'ja' and target_lang == 'en'):
-           target_processor = EnJaProcessor(target_lang)
+           self.target_processor = EnJaProcessor(target_lang)
+       elif target_lang == 'ja-mecab':
+           self.target_processor = JaMecabProcessor()
        elif target_lang == 'zh':
            target_processor = ChineseProcessor()
        elif target_lang == 'hi':
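The per-dataloader lookup above boils down to choosing a sacrebleu tokenizer per target language; a condensed sketch of that selection logic (assumes `sacrebleu` with its optional Japanese dependencies is installed; the helper name is hypothetical):

```python
from sacrebleu import corpus_bleu

def eval_bleu(translations, ground_truths, tgt_language):
    if tgt_language in ('ja', 'ja-mecab'):
        tokenize = 'ja-mecab'  # Japanese has no spaces; segment with MeCab first
    elif tgt_language == 'zh':
        tokenize = 'zh'        # character-level tokenization for Chinese
    else:
        tokenize = '13a'       # sacrebleu's default for space-delimited languages
    return corpus_bleu(translations, [ground_truths], tokenize=tokenize)
```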
4 changes: 4 additions & 0 deletions nemo_text_processing/text_normalization/normalize.py
@@ -207,6 +207,10 @@ def normalize(
        Returns: spoken form
        """
+       assert (
+           len(text.split()) < 500
+       ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"
+
        original_text = text
        if punct_pre_process:
            text = pre_process(text)
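Callers with long inputs now have to chunk before calling `normalize`; a minimal caller-side sketch (the helper below is hypothetical and uses naive whitespace chunking, where splitting on sentence boundaries, as the error message suggests, would preserve more context):

```python
def normalize_long_text(normalizer, text: str, max_words: int = 400) -> str:
    # Hypothetical helper: split into chunks safely under the 500-word limit,
    # normalize each chunk, and rejoin the spoken forms.
    words = text.split()
    chunks = (' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words))
    return ' '.join(normalizer.normalize(chunk) for chunk in chunks)
```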
nemo_text_processing/text_normalization/normalize_with_audio.py
@@ -124,6 +124,10 @@ def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, v
        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """
+
+       assert (
+           len(text.split()) < 500
+       ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"
        original_text = text

        if self.lang == "en":
3 changes: 2 additions & 1 deletion reinstall.sh
@@ -20,7 +20,8 @@ ${PIP} uninstall -y nemo_cv
${PIP} install -U setuptools

# TODO: check if we need this for 22.03
-if [ "${NVIDIA_PYTORCH_VERSION}" = "22.01" ] || [ "${NVIDIA_PYTORCH_VERSION}" = "22.02" ]
+if [ "${NVIDIA_PYTORCH_VERSION}" = "22.01" ] || [ "${NVIDIA_PYTORCH_VERSION}" = "22.02" ] || [ "${NVIDIA_PYTORCH_VERSION}" = "22.03" ]
+
then
echo 'Installing NeMo in NVIDIA PyTorch container:' ${NVIDIA_PYTORCH_VERSION} 'so will not install numba'
else
2 changes: 2 additions & 0 deletions tests/collections/asr/test_asr_datasets.py
@@ -164,6 +164,7 @@ def test_tarred_bpe_dataset(self, test_data_dir):
            count += 1
        assert count == 32

+   @pytest.mark.pleasefixme
    @pytest.mark.skipif(not HAVE_DALI, reason="NVIDIA DALI is not installed or incompatible version")
    @pytest.mark.unit
    def test_dali_char_dataset(self, test_data_dir):
@@ -413,6 +414,7 @@ def test_dali_char_vs_ref_dataset(self, test_data_dir):
        assert np.mean(err) < 0.0001
        assert np.max(err) < 0.01

+   @pytest.mark.pleasefixme
    @pytest.mark.skipif(not HAVE_DALI, reason="NVIDIA DALI is not installed or incompatible version")
    @pytest.mark.unit
    def test_tarred_dali_char_dataset(self, test_data_dir):
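`pleasefixme` is a custom pytest marker used here to park known-broken tests. For this to work without warnings, the marker presumably has to be registered and then deselected in CI; a hedged sketch of that wiring (the exact config location in NeMo's CI is not shown in this diff):

```python
# conftest.py — hypothetical registration so pytest recognizes the custom marker.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "pleasefixme: test is known to be broken and is deselected until fixed"
    )
```

CI would then run `pytest -m "not pleasefixme"` to skip the quarantined tests while keeping them in the tree.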
6 changes: 4 additions & 2 deletions tests/collections/nlp/test_huggingface.py
@@ -49,7 +49,8 @@ def test_get_pretrained_bert_model(self):
        self.omega_conf.language_model.pretrained_model_name = 'bert-base-uncased'
        model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf)
        assert isinstance(model, nemo_nlp.modules.BertEncoder)
-       do_export(model, "bert-base-uncased")
+       # TODO: Fix
+       # do_export(model, "bert-base-uncased")

    @pytest.mark.with_downloads()
    @pytest.mark.unit
@@ -73,7 +74,8 @@ def test_get_pretrained_albert_model(self):
        self.omega_conf.language_model.pretrained_model_name = 'albert-base-v1'
        model = nemo_nlp.modules.get_lm_model(cfg=self.omega_conf)
        assert isinstance(model, nemo_nlp.modules.AlbertEncoder)
-       do_export(model, "albert-base-v1")
+       # TODO: fix
+       # do_export(model, "albert-base-v1")

    @pytest.mark.with_downloads()
    @pytest.mark.unit
2 changes: 2 additions & 0 deletions tests/collections/nlp/test_nlp_exportables.py
@@ -99,6 +99,7 @@ def test_IntentSlotClassificationModel_export_to_onnx(self, dummy_data):
        assert onnx_model.graph.output[0].name == 'intent_logits'
        assert onnx_model.graph.output[1].name == 'slot_logits'

+   @pytest.mark.pleasefixme
    @pytest.mark.with_downloads()
    @pytest.mark.run_only_on('GPU')
    @pytest.mark.unit
@@ -129,6 +130,7 @@ def test_PunctuationCapitalizationModel_export_to_onnx(self):
        assert onnx_model.graph.output[0].name == 'punct_logits'
        assert onnx_model.graph.output[1].name == 'capit_logits'

+   @pytest.mark.pleasefixme
    @pytest.mark.with_downloads()
    @pytest.mark.run_only_on('GPU')
    @pytest.mark.unit
1 change: 1 addition & 0 deletions tests/collections/tts/test_tts_exportables.py
@@ -39,6 +39,7 @@ def hifigan_model():


class TestExportable:
+   @pytest.mark.pleasefixme
    @pytest.mark.run_only_on('GPU')
    @pytest.mark.unit
    def test_FastPitchModel_export_to_onnx(self, fastpitch_model):
14 changes: 3 additions & 11 deletions tutorials/asr/ASR_with_NeMo.ipynb
@@ -25,15 +25,6 @@
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.7"
-   },
-   "pycharm": {
-     "stem_cell": {
-       "cell_type": "raw",
-       "source": [],
-       "metadata": {
-         "collapsed": false
-       }
-     }
    }
  },
  "cells": [
@@ -801,6 +792,7 @@
    "# Setup the test data loader and make sure the model is on GPU\n",
    "first_asr_model.setup_test_data(test_data_config=params['model']['validation_ds'])\n",
    "first_asr_model.cuda()\n",
+   "first_asr_model.eval()\n",
    "\n",
    "# We will be computing Word Error Rate (WER) metric between our hypothesis and predictions.\n",
    "# WER is computed as numerator/denominator.\n",
@@ -967,7 +959,7 @@
    "trainer = pl.Trainer(amp_level='O1', precision=16)\n",
    "\n",
    "# Trainer with a distributed backend:\n",
-   "trainer = pl.Trainer(devices=2, num_nodes=2, accelerator='gpu', strategy='dp')\n",
+   "trainer = pl.Trainer(devices=2, num_nodes=2, accelerator='gpu', strategy='ddp')\n",
    "\n",
    "# Of course, you can combine these flags as well.\n",
    "```\n",
@@ -1173,4 +1165,4 @@
    "outputs": []
    }
  ]
-}
\ No newline at end of file
+}
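The added `first_asr_model.eval()` call matters because evaluation mode freezes dropout (and batch-norm statistics), making repeated WER measurements deterministic; a generic PyTorch illustration of this effect, not NeMo-specific:

```python
import torch

# A toy model with dropout, standing in for the ASR model in the notebook.
model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Dropout(p=0.5))
x = torch.ones(1, 4)

model.train()
a, b = model(x), model(x)      # generally differ: dropout is active

model.eval()
with torch.no_grad():
    c, d = model(x), model(x)  # identical: dropout is disabled
assert torch.equal(c, d)
```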
