
Commit 2693024
Merge branch 'tacotron2_retrain' of https://github.com/NVIDIA/NeMo into tacotron2_retrain
treacker committed May 11, 2022
2 parents cfa290f + 5f81452 commit 2693024
Showing 7 changed files with 164 additions and 56 deletions.
99 changes: 99 additions & 0 deletions Jenkinsfile
@@ -1555,6 +1555,105 @@ pipeline {
}
}
}
stage('Punctuation & Capitalization, Different ways of passing labels to model') {
when {
anyOf {
branch 'r1.9.0'
changeRequest target: 'r1.9.0'
}
}
failFast true
stages {
stage('Punctuation & Capitalization, Using model.common_dataset_parameters.label_vocab_dir') {
steps {
sh 'cd examples/nlp/token_classification && \
label_vocab_dir=label_vocab_dir && \
mkdir -p ${label_vocab_dir} && \
punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \
capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \
printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \
printf "O\nU\n" > "${capit_label_vocab}" && \
CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \
model.train_ds.use_tarred_dataset=false \
model.train_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.validation_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.language_model.pretrained_model_name=distilbert-base-uncased \
model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \
model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \
model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
+exp_manager.explicit_log_dir=/home/TestData/nlp/token_classification_punctuation/output \
+do_testing=false && \
CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \
+do_training=false \
+do_testing=true \
~model.train_ds \
~model.validation_ds \
model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
pretrained_model=/home/TestData/nlp/token_classification_punctuation/output/checkpoints/Punctuation_and_Capitalization.nemo \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
exp_manager=null && \
rm -r "${label_vocab_dir}" && \
rm -rf /home/TestData/nlp/token_classification_punctuation/output/*'
}
}
stage('Punctuation & Capitalization, Using model.common_dataset_parameters.{punct,capit}_label_ids') {
steps {
sh 'cd examples/nlp/token_classification && \
conf_path=/home/TestData/nlp/token_classification_punctuation && \
conf_name=punctuation_capitalization_config_with_ids && \
cp conf/punctuation_capitalization_config.yaml "${conf_path}/${conf_name}.yaml" && \
sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \
"${conf_path}/${conf_name}.yaml" && \
sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \
"${conf_path}/${conf_name}.yaml" && \
CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \
--config-path "${conf_path}" \
--config-name "${conf_name}" \
model.train_ds.use_tarred_dataset=false \
model.train_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.validation_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.language_model.pretrained_model_name=distilbert-base-uncased \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
+exp_manager.explicit_log_dir=/home/TestData/nlp/token_classification_punctuation/output \
+do_testing=false && \
CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \
+do_training=false \
+do_testing=true \
~model.train_ds \
~model.validation_ds \
model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
pretrained_model=/home/TestData/nlp/token_classification_punctuation/output/checkpoints/Punctuation_and_Capitalization.nemo \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
exp_manager=null && \
rm -rf /home/TestData/nlp/token_classification_punctuation/output/* && \
rm "${conf_path}/${conf_name}.yaml"'
}
}
}
}
stage('Punctuation & Capitalization inference') {
when {
anyOf {
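The Jenkinsfile hunk above adds two stages that drive the same punctuation-and-capitalization training run with two ways of supplying the label vocabularies: the first writes CSV files and points model.common_dataset_parameters.label_vocab_dir at them, while the second sed-patches the equivalent {punct,capit}_label_ids dictionaries into a copy of the YAML config. A minimal Python sketch of those two inputs (file names and label sets are taken from the stages above; the assumption that line order defines the label ids, pad label 'O' first, is mine):

# Sketch: the label-vocabulary files used by the first stage, and the
# equivalent inline mappings that the second stage patches into the config.
from pathlib import Path

label_vocab_dir = Path("label_vocab_dir")
label_vocab_dir.mkdir(exist_ok=True)

# One label per line; line order is assumed to define the ids (0, 1, 2, ...).
(label_vocab_dir / "punct_label_vocab.csv").write_text("O\n,\n.\n?\n")
(label_vocab_dir / "capit_label_vocab.csv").write_text("O\nU\n")

# Inline form used by the second stage:
punct_label_ids = {"O": 0, ",": 1, ".": 2, "?": 3}
capit_label_ids = {"O": 0, "U": 1}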
@@ -19,6 +19,7 @@
import pickle
import re
import shutil
import tempfile
from collections import deque
from pathlib import Path
from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Type, Union
@@ -160,36 +161,44 @@ def process_fragment(
special_tokens=special_tokens,
use_fast=use_fast_tokenizer,
)
tmp_text = output_dir / f'tmp_text_{fragment_idx}.txt'
tmp_labels = output_dir / f'tmp_labels_{fragment_idx}.txt'
with text_file.open() as tf, labels_file.open() as lf, tmp_text.open('w') as otf, tmp_labels.open('w') as olf:
tf.seek(text_start_pos)
lf.seek(label_start_pos)
for _ in range(lines_per_dataset_fragment):
text_line = tf.readline()
if not text_line:
break
otf.write(text_line)
olf.write(lf.readline())
dataset = BertPunctuationCapitalizationDataset(
tmp_text,
tmp_labels,
max_seq_length,
tokenizer,
tokens_in_batch=tokens_in_batch,
pad_label=pad_label,
punct_label_ids=punct_label_ids,
capit_label_ids=capit_label_ids,
n_jobs=0,
use_cache=False,
add_masks_and_segment_ids_to_batch=False,
verbose=False,
tokenization_progress_queue=tokenization_progress_queue,
batch_mark_up_progress_queue=batch_mark_up_progress_queue,
batch_building_progress_queue=batch_building_progress_queue,
)
tmp_text.unlink()
tmp_labels.unlink()
tmp_text: Optional[str] = None
tmp_labels: Optional[str] = None
try:
otfd, tmp_text = tempfile.mkstemp(suffix='.txt', prefix=f'text_{fragment_idx}_', dir=output_dir, text=True)
olfd, tmp_labels = tempfile.mkstemp(suffix='.txt', prefix=f'labels_{fragment_idx}_', dir=output_dir, text=True)
with text_file.open() as tf, labels_file.open() as lf, os.fdopen(otfd, 'w') as otf, os.fdopen(
olfd, 'w'
) as olf:
tf.seek(text_start_pos)
lf.seek(label_start_pos)
for _ in range(lines_per_dataset_fragment):
text_line = tf.readline()
if not text_line:
break
otf.write(text_line)
olf.write(lf.readline())
dataset = BertPunctuationCapitalizationDataset(
tmp_text,
tmp_labels,
max_seq_length,
tokenizer,
tokens_in_batch=tokens_in_batch,
pad_label=pad_label,
punct_label_ids=punct_label_ids,
capit_label_ids=capit_label_ids,
n_jobs=0,
use_cache=False,
add_masks_and_segment_ids_to_batch=False,
verbose=False,
tokenization_progress_queue=tokenization_progress_queue,
batch_mark_up_progress_queue=batch_mark_up_progress_queue,
batch_building_progress_queue=batch_building_progress_queue,
)
finally:
if tmp_text is not None and os.path.exists(tmp_text):
os.remove(tmp_text)
if tmp_labels is not None and os.path.exists(tmp_labels):
os.remove(tmp_labels)
dataset.features_pkl.unlink()
tar_ctr = 0
current_file_name = output_dir / TAR_FRAGMENT_TMPL_IN_PROGRESS.format(fragment_idx=fragment_idx, file_idx=tar_ctr)
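The hunk above, from what appears to be the tarred-dataset creation code for punctuation and capitalization, replaces the hand-named tmp_text_*/tmp_labels_* files (previously unlinked only on the success path) with tempfile.mkstemp handles that are wrapped via os.fdopen and removed in a finally block, so worker fragments cannot collide on file names and temporaries are cleaned up even on failure. A self-contained sketch of that pattern, with placeholder contents standing in for the real fragment copying:

import os
import tempfile
from pathlib import Path
from typing import Optional

output_dir = Path(".")   # placeholder; the real code writes next to the tar fragments
fragment_idx = 0

tmp_text: Optional[str] = None
tmp_labels: Optional[str] = None
try:
    # mkstemp returns an already-open file descriptor plus a unique path.
    otfd, tmp_text = tempfile.mkstemp(suffix='.txt', prefix=f'text_{fragment_idx}_', dir=output_dir, text=True)
    olfd, tmp_labels = tempfile.mkstemp(suffix='.txt', prefix=f'labels_{fragment_idx}_', dir=output_dir, text=True)
    with os.fdopen(otfd, 'w') as otf, os.fdopen(olfd, 'w') as olf:
        otf.write('a text line\n')   # stand-in for copying the fragment's lines
        olf.write('O O O\n')
    # ... build the dataset from tmp_text / tmp_labels here ...
finally:
    # Remove the temporaries even if dataset construction raises.
    for path in (tmp_text, tmp_labels):
        if path is not None and os.path.exists(path):
            os.remove(path)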
@@ -638,16 +638,16 @@ def _check_label_config_parameters(self) -> None:
)

def _extract_label_vocab_files_from_config(self) -> Tuple[Optional[Path], Optional[Path]]:
if self._cfg.common_dataset_parameters.label_vocab_dir is None:
if self._is_model_being_restored():
punct_label_vocab_file = self._cfg.class_labels.punct_labels_file
capit_label_vocab_file = self._cfg.class_labels.capit_labels_file
else:
punct_label_vocab_file, capit_label_vocab_file = None, None
if self._is_model_being_restored():
punct_label_vocab_file = self._cfg.class_labels.punct_labels_file
capit_label_vocab_file = self._cfg.class_labels.capit_labels_file
else:
label_vocab_dir = Path(self._cfg.common_dataset_parameters.label_vocab_dir).expanduser()
punct_label_vocab_file = label_vocab_dir / self._cfg.class_labels.punct_labels_file
capit_label_vocab_file = label_vocab_dir / self._cfg.class_labels.capit_labels_file
if self._cfg.common_dataset_parameters.label_vocab_dir is None:
punct_label_vocab_file, capit_label_vocab_file = None, None
else:
label_vocab_dir = Path(self._cfg.common_dataset_parameters.label_vocab_dir).expanduser()
punct_label_vocab_file = label_vocab_dir / self._cfg.class_labels.punct_labels_file
capit_label_vocab_file = label_vocab_dir / self._cfg.class_labels.capit_labels_file
return punct_label_vocab_file, capit_label_vocab_file

def _set_label_ids(self) -> None:
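The reordering above changes the precedence inside _extract_label_vocab_files_from_config: a model being restored from a checkpoint now takes the label file names straight from class_labels even when common_dataset_parameters.label_vocab_dir is set, and only a model built from scratch consults label_vocab_dir (returning None, None when it is absent). A rough standalone restatement of that logic (hypothetical free function, not the actual class method):

from pathlib import Path
from typing import Optional, Tuple

def extract_label_vocab_files(cfg, is_being_restored: bool) -> Tuple[Optional[Path], Optional[Path]]:
    if is_being_restored:
        # Restored checkpoints already carry their label files under class_labels.
        return Path(cfg.class_labels.punct_labels_file), Path(cfg.class_labels.capit_labels_file)
    if cfg.common_dataset_parameters.label_vocab_dir is None:
        return None, None
    label_vocab_dir = Path(cfg.common_dataset_parameters.label_vocab_dir).expanduser()
    return (
        label_vocab_dir / cfg.class_labels.punct_labels_file,
        label_vocab_dir / cfg.class_labels.capit_labels_file,
    )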
22 changes: 11 additions & 11 deletions nemo_text_processing/text_normalization/ru/data/measurements.tsv
@@ -84,17 +84,17 @@
га гектарами
га гектаре
га гектарах
м² квадратный метр -0.1
м² квадратные метры -0.1
м² квадратного метра -0.1
м² квадратных метров -0.1
м² квадратному метру -0.1
м² квадратным метрам -0.1
м² квадратные метры -0.1
м² квадратным метром -0.1
м² квадратными метрами -0.1
м² квадратном метре -0.1
м² квадратных метрах -0.1
м² квадратный метр -0.11
м² квадратные метры -0.11
м² квадратного метра -0.11
м² квадратных метров -0.11
м² квадратному метру -0.11
м² квадратным метрам -0.11
м² квадратные метры -0.11
м² квадратным метром -0.11
м² квадратными метрами -0.11
м² квадратном метре -0.11
м² квадратных метрах -0.11
кв. м. квадратный метр -0.1
кв. м. квадратные метры -0.1
кв. м. квадратного метра -0.1
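In the rows above, the third column of every м² entry changes from -0.1 to -0.11 while the кв. м. entries keep -0.1; the column is presumably a weight used by the Russian normalization grammar, and the change appears intended to break the tie between the two spellings. A small sketch for inspecting the affected rows (the path is the one in the diff header; the "weight" reading of the third column is an assumption):

import csv

path = "nemo_text_processing/text_normalization/ru/data/measurements.tsv"
with open(path, encoding="utf-8") as f:
    for row in csv.reader(f, delimiter="\t"):
        if row and row[0] in ("м²", "кв. м."):
            # Columns: abbreviation, expansion, optional weight.
            print(row)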
4 changes: 2 additions & 2 deletions scripts/speaker_tasks/pathsfiles_to_manifest.py
@@ -50,15 +50,15 @@ def get_dict_from_wavlist(pathlist):
path_dict = od()
pathlist = sorted(pathlist)
for line_path in pathlist:
uniq_id = os.path.basename(line_path).split('.')[0]
uniq_id = os.path.splitext(os.path.basename(line_path))[0]
path_dict[uniq_id] = line_path
return path_dict


def get_dict_from_list(data_pathlist, uniqids):
path_dict = {}
for line_path in data_pathlist:
uniq_id = os.path.basename(line_path).split('.')[0]
uniq_id = os.path.splitext(os.path.basename(line_path))[0]
if uniq_id in uniqids:
path_dict[uniq_id] = line_path
else:
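The two changed lines above swap os.path.basename(...).split('.')[0] for os.path.splitext(os.path.basename(...))[0], so a base name that itself contains dots keeps its full unique id instead of being cut at the first dot. For example (hypothetical file name):

import os

path = "/data/audio/speaker1.session2.wav"   # hypothetical name containing a dot
print(os.path.basename(path).split(".")[0])           # 'speaker1'          -- truncated id
print(os.path.splitext(os.path.basename(path))[0])    # 'speaker1.session2' -- full unique id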
6 changes: 3 additions & 3 deletions tutorials/nlp/02_NLP_Tokenizers.ipynb
@@ -98,7 +98,7 @@
"\n",
"Hugging Face and Megatron tokenizers (which uses Hugging Face underneath) can be automatically instantiated by only `tokenizer_name`, which downloads the corresponding `vocab_file` from the internet. \n",
"\n",
"For SentencePieceTokenizer, WordTokenizer, and CharTokenizers `tokenizer_model` or/and `vocab_file` can be generated offline in advance using [`scripts/tokenizers/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/stable/scripts/process_asr_text_tokenizer.py)\n",
"For SentencePieceTokenizer, WordTokenizer, and CharTokenizers `tokenizer_model` or/and `vocab_file` can be generated offline in advance using [`scripts/tokenizers/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/stable/scripts/tokenizers/process_asr_text_tokenizer.py)\n",
"\n",
"The tokenizers in NeMo are designed to be used interchangeably, especially when\n",
"used in combination with a BERT-based model.\n",
@@ -381,7 +381,7 @@
"id": "ykwKmREuPQE-"
},
"source": [
"We use the [`scripts/tokenizers/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/stable/scripts/process_asr_text_tokenizer.py) script to create a custom tokenizer model with its own vocabulary from an input file"
"We use the [`scripts/tokenizers/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/stable/scripts/tokenizers/process_asr_text_tokenizer.py) script to create a custom tokenizer model with its own vocabulary from an input file"
]
},
{
@@ -585,4 +585,4 @@
},
"nbformat": 4,
"nbformat_minor": 1
}
}
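Both edits in this notebook repair the link target so it matches the scripts/tokenizers/process_asr_text_tokenizer.py path already shown in the link text. For the surrounding cell about instantiating a Hugging Face tokenizer from its name alone, a minimal illustration using the transformers library directly (not NeMo's wrapper; the model name is just an example that also appears in the CI stages above):

from transformers import AutoTokenizer

# The vocabulary is downloaded automatically from the tokenizer name.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer.tokenize("NeMo tokenizers are designed to be interchangeable."))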
2 changes: 1 addition & 1 deletion tutorials/text_processing/WFST_Tutorial.ipynb
@@ -6754,7 +6754,7 @@
"Our last step is to create a universal Verbalizer for all classes. This is very similar to development of `ClassifierFst`, except that the Verbalizer breaks its normalization task into two components:\n",
"- `VerbalizeFst`, which removes formatting for each token\n",
"- `VerbalizeFinalFst`, which extends `VerbalizeFst` across all tokens in a string\n",
"Why two componenets when `tokenize_and_classify` was one? Because Sparrowhawk performs all the functionality of `VerbalizeFinalFst`, so its inclusion would break deployment. However, without it, your NeMo grammar would be unable to function at base. So we separate the two to allow the best of both world."
"Why two components when `tokenize_and_classify` was one? Because Sparrowhawk performs all the functionality of `VerbalizeFinalFst`, so its inclusion would break deployment. However, without it, your NeMo grammar would be unable to function at base. So we separate the two to allow the best of both world."
]
},
{
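The corrected cell explains the split: VerbalizeFst strips the class formatting from a single token, while VerbalizeFinalFst extends that rule across every token in the string and is supplied by Sparrowhawk at deployment time. As a plain-Python analogy of that two-level structure (toy token representation, not the actual pynini/GraphFst API):

def verbalize_token(token: dict) -> str:
    # Single-token step: drop the formatting, keep only the spoken form.
    return token["spoken"]

def verbalize_final(tokens: list) -> str:
    # Whole-string step: apply the single-token rule across all tokens.
    return " ".join(verbalize_token(t) for t in tokens)

print(verbalize_final([{"spoken": "twenty"}, {"spoken": "dollars"}]))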
