
Commit

Merge branch 'master' into type_docs_tests
Signed-off-by: Oleksii Kuchaiev <okuchaiev@nvidia.com>
okuchaiev committed Feb 14, 2020
2 parents bcd26d2 + 3e04e09 commit d980b64
Showing 23 changed files with 570 additions and 344 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -94,6 +94,8 @@ To release a new version, please update the changelog as followed:
- Updated licenses
- Updated nemo's use of the logging library. `from nemo import logging` is now the recommended way of using the NeMo logger. `neural_factory.logger` and all other instances of `logger` are now deprecated and planned for removal in the next version. Please see PR 267 for complete change information.
([PR #267](https://github.com/NVIDIA/NeMo/pull/267), [PR #283](https://github.com/NVIDIA/NeMo/pull/283), [PR #305](https://github.com/NVIDIA/NeMo/pull/305), [PR #311](https://github.com/NVIDIA/NeMo/pull/311)) - @blisc
- Changed Distributed Data Parallel from Apex to Torch
([PR #336](https://github.com/NVIDIA/NeMo/pull/336)) - @blisc

- Added TRADE (dialogue state tracking model) on MultiWOZ dataset
([PR #322](https://github.com/NVIDIA/NeMo/pull/322)) - @chiphuyen, @VahidooX
@@ -108,6 +110,8 @@ To release a new version, please update the changelog as followed:
([PR #308](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia

### Removed
- gradient_predivide_factor arg of train() now has no effect
([PR #336](https://github.com/NVIDIA/NeMo/pull/336)) - @blisc
- Dropped support of the following ASR configs: jasper10x4.yaml, quartznet10x5.yaml, quartznet15x5_in.yaml, quartznet5x3.yaml, quartznet5x5.yaml, quartznet_an4.yaml. They are moved to experimental/configs and can still be used with v0.9 for use in replicating paper results
([PR #354](https://github.com/NVIDIA/NeMo/pull/354)) - @blisc

6 changes: 6 additions & 0 deletions docs/sources/source/nlp/bert_pretraining.rst
@@ -6,6 +6,12 @@ Make sure you have ``nemo`` and ``nemo_nlp`` installed before starting this tuto

The code used in this tutorial can be found at ``examples/nlp/language_modeling/bert_pretraining.py``.

.. tip::
    Pretrained BERT models can be found at
    `https://ngc.nvidia.com/catalog/models/nvidia:bertlargeuncasedfornemo <https://ngc.nvidia.com/catalog/models/nvidia:bertlargeuncasedfornemo>`__
    `https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedfornemo <https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedfornemo>`__
    `https://ngc.nvidia.com/catalog/models/nvidia:bertbasecasedfornemo <https://ngc.nvidia.com/catalog/models/nvidia:bertbasecasedfornemo>`__

Introduction
------------

4 changes: 4 additions & 0 deletions docs/sources/source/nlp/joint_intent_slot_filling.rst
@@ -9,6 +9,10 @@ There are four pre-trained BERT models that we can select from using the argumen
using the script for loading pre-trained models from `pytorch_transformers`. See the list of available pre-trained models
`here <https://huggingface.co/pytorch-transformers/pretrained_models.html>`__.

.. tip::

    For BERT pretraining in NeMo and for pretrained model checkpoints, see `BERT pretraining <https://nvidia.github.io/NeMo/nlp/bert_pretraining.html>`__.


Preliminaries
-------------
6 changes: 6 additions & 0 deletions docs/sources/source/nlp/ner.rst
@@ -4,6 +4,12 @@ Tutorial
Make sure you have ``nemo`` and ``nemo_nlp`` installed before starting this
tutorial. See the :ref:`installation` section for more details.

.. tip::

    For BERT pretraining in NeMo and for pretrained model checkpoints, see `BERT pretraining <https://nvidia.github.io/NeMo/nlp/bert_pretraining.html>`__.



Introduction
------------

2 changes: 2 additions & 0 deletions docs/sources/source/nlp/punctuation.rst
@@ -7,6 +7,8 @@ An ASR system typically generates text with no punctuation and capitalization of
.. tip::

    We recommend you try this example in the Jupyter notebook ``examples/nlp/token_classification/PunctuationWithBERT.ipynb``.
    For BERT pretraining in NeMo and for pretrained model checkpoints, see `BERT pretraining <https://nvidia.github.io/NeMo/nlp/bert_pretraining.html>`__.


Task Description
----------------
6 changes: 6 additions & 0 deletions docs/sources/source/nlp/question_answering.rst
@@ -13,6 +13,12 @@ The pretrained back-bone models can be specified by `--model_type` and the speci
See the list of available pre-trained models
`here <https://huggingface.co/transformers/pretrained_models.html>`__.

.. tip::

    For BERT pretraining in NeMo and for pretrained model checkpoints, see `BERT pretraining <https://nvidia.github.io/NeMo/nlp/bert_pretraining.html>`__.



Preliminaries
-------------

31 changes: 28 additions & 3 deletions examples/nlp/asr_postprocessor/asr_postprocessor.py
@@ -26,6 +26,7 @@
    eval_epochs_done_callback_wer,
    eval_iter_callback,
)
from nemo.core import WeightShareTransform
from nemo.core.callbacks import CheckpointCallback
from nemo.utils.lr_policies import SquareAnnealing

@@ -126,9 +127,33 @@
)

# tie all embeddings weights
t_log_softmax.mlp.layer0.weight = encoder.bert.embeddings.word_embeddings.weight
decoder.embedding_layer.token_embedding.weight = encoder.bert.embeddings.word_embeddings.weight
decoder.embedding_layer.position_embedding.weight = encoder.bert.embeddings.position_embeddings.weight
# t_log_softmax.mlp.layer0.weight = encoder.bert.embeddings.word_embeddings.weight
# decoder.embedding_layer.token_embedding.weight = encoder.bert.embeddings.word_embeddings.weight
# decoder.embedding_layer.position_embedding.weight = encoder.bert.embeddings.position_embeddings.weight
t_log_softmax.tie_weights_with(
    encoder,
    weight_names=["mlp.layer0.weight"],
    name2name_and_transform={
        "mlp.layer0.weight": ("bert.embeddings.word_embeddings.weight", WeightShareTransform.SAME)
    },
)
decoder.tie_weights_with(
    encoder,
    weight_names=["embedding_layer.token_embedding.weight"],
    name2name_and_transform={
        "embedding_layer.token_embedding.weight": ("bert.embeddings.word_embeddings.weight", WeightShareTransform.SAME)
    },
)
decoder.tie_weights_with(
    encoder,
    weight_names=["embedding_layer.position_embedding.weight"],
    name2name_and_transform={
        "embedding_layer.position_embedding.weight": (
            "bert.embeddings.position_embeddings.weight",
            WeightShareTransform.SAME,
        )
    },
)


def create_pipeline(dataset, tokens_in_batch, clean=False, training=True):
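The calls above replace the direct attribute assignments that are now commented out. As a rough sketch of the pattern (the module and weight names below are placeholders, not code taken from any one script), tying a weight of one NeMo module to a weight of another looks like this:

from nemo.core import WeightShareTransform

# `classifier` and `encoder` stand in for any two instantiated NeMo neural modules.
# Judging from the usage in this commit, weight_names lists weights on the calling
# module, and name2name_and_transform maps each of them to a tuple of
# (weight name on the other module, transform to apply when sharing).
classifier.tie_weights_with(
    encoder,
    weight_names=["mlp.last_linear_layer.weight"],
    name2name_and_transform={
        "mlp.last_linear_layer.weight": ("embeddings.word_embeddings.weight", WeightShareTransform.SAME)
    },
)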
19 changes: 17 additions & 2 deletions examples/nlp/language_modeling/bert_pretraining.py
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

"""
To pretrain BERT on raw text dataset run
@@ -70,6 +69,15 @@
350000 iterations on a DGX1 with 8 V100 32GB GPUs with AMP O1 optimization
should finish under 5 days and yield an MRPC score of ACC/F1 85.05/89.35.
More information about BERT pretraining can be found at
https://nvidia.github.io/NeMo/nlp/bert_pretraining.html
Pretrained BERT models can be found at
https://ngc.nvidia.com/catalog/models/nvidia:bertlargeuncasedfornemo
https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedfornemo
https://ngc.nvidia.com/catalog/models/nvidia:bertbasecasedfornemo
"""
import argparse
import math
@@ -215,7 +223,14 @@
# tie weights of MLM softmax layer and embedding layer of the encoder
if mlm_classifier.mlp.last_linear_layer.weight.shape != bert_model.bert.embeddings.word_embeddings.weight.shape:
    raise ValueError("Final classification layer does not match embedding layer.")
mlm_classifier.mlp.last_linear_layer.weight = bert_model.bert.embeddings.word_embeddings.weight
# mlm_classifier.mlp.last_linear_layer.weight = bert_model.bert.embeddings.word_embeddings.weight
mlm_classifier.tie_weights_with(
    bert_model,
    weight_names=["mlp.last_linear_layer.weight"],
    name2name_and_transform={
        "mlp.last_linear_layer.weight": ("bert.embeddings.word_embeddings.weight", nemo_core.WeightShareTransform.SAME)
    },
)


def create_pipeline(data_file, batch_size, preprocessed_data=False, batches_per_step=1, **kwargs):
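The shape guard above reflects the usual precondition for weight tying: the output projection and the embedding table must have identical shapes. A plain-PyTorch illustration of that constraint and of the old direct-assignment style of tying (dimensions are made up; this is not NeMo code):

import torch.nn as nn

vocab_size, hidden_size = 30522, 768  # illustrative BERT-base-like dimensions

embedding = nn.Embedding(vocab_size, hidden_size)             # weight: [vocab_size, hidden_size]
output_proj = nn.Linear(hidden_size, vocab_size, bias=False)  # weight: [vocab_size, hidden_size]

# Tying is only well defined when both weight matrices have the same shape,
# which is what the ValueError check in bert_pretraining.py verifies before tying.
assert output_proj.weight.shape == embedding.weight.shape
output_proj.weight = embedding.weight  # the projection now reuses the embedding parameter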
11 changes: 9 additions & 2 deletions examples/nlp/language_modeling/language_modeling_transformer.py
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================

import math

import nemo
@@ -22,6 +21,7 @@
import nemo.collections.nlp.nm.trainables.common.token_classification_nm
from nemo.collections.nlp.callbacks.lm_transformer_callback import eval_epochs_done_callback, eval_iter_callback
from nemo.collections.nlp.data.datasets.lm_transformer_dataset import LanguageModelDataDesc
from nemo.core import WeightShareTransform
from nemo.utils.lr_policies import CosineAnnealing

parser = nemo.utils.NemoArgParser(description='LM Transformer')
@@ -114,7 +114,7 @@
)

# tie weight of embedding and log_softmax layers
log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight
# log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight
log_softmax.tie_weights_with(
    encoder,
    weight_names=["mlp.layer0.weight"],
    name2name_and_transform={
        "mlp.layer0.weight": ("embedding_layer.token_embedding.weight", WeightShareTransform.SAME)
    },
)


def create_pipeline(
@@ -24,6 +24,7 @@
import nemo
import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.callbacks.machine_translation_callback import eval_epochs_done_callback, eval_iter_callback
from nemo.core import WeightShareTransform
from nemo.utils.lr_policies import get_lr_policy

parser = nemo.utils.NemoArgParser(description='Transformer for Neural Machine Translation')
@@ -165,8 +166,25 @@
)

if tie_weight:
    log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight
    decoder.embedding_layer.token_embedding.weight = encoder.embedding_layer.token_embedding.weight
    # log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight
    log_softmax.tie_weights_with(
        encoder,
        weight_names=["mlp.last_linear_layer.weight"],
        name2name_and_transform={
            "mlp.last_linear_layer.weight": ("embedding_layer.token_embedding.weight", WeightShareTransform.SAME)
        },
    )
    # decoder.embedding_layer.token_embedding.weight = encoder.embedding_layer.token_embedding.weight
    decoder.tie_weights_with(
        encoder,
        weight_names=["embedding_layer.token_embedding.weight"],
        name2name_and_transform={
            "embedding_layer.token_embedding.weight": (
                "embedding_layer.token_embedding.weight",
                WeightShareTransform.SAME,
            )
        },
    )


def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, training=True):
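In this translation example the tie is three-way: the log-softmax projection and the decoder token embedding are both bound to the encoder token embedding, which only makes sense with a shared source/target vocabulary. A rough plain-PyTorch equivalent of that three-way sharing (sizes are illustrative, not taken from the script):

import torch.nn as nn

vocab_size, d_model = 32000, 512  # assumes a joint (shared) source/target vocabulary

encoder_embedding = nn.Embedding(vocab_size, d_model)
decoder_embedding = nn.Embedding(vocab_size, d_model)
generator = nn.Linear(d_model, vocab_size, bias=False)

# Point all three modules at the same parameter so they update together during training.
decoder_embedding.weight = encoder_embedding.weight
generator.weight = encoder_embedding.weight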