
Commit 2693024
Merge branch 'tacotron2_retrain' of https://github.com/NVIDIA/NeMo into tacotron2_retrain
treacker committed May 11, 2022
2 parents cfa290f + 5f81452 commit 2693024
Showing 7 changed files with 164 additions and 56 deletions.
99 changes: 99 additions & 0 deletions Jenkinsfile
@@ -1555,6 +1555,105 @@ pipeline {
}
}
}
stage('Punctuation & Capitalization, Different ways of passing labels to model') {
when {
anyOf {
branch 'r1.9.0'
changeRequest target: 'r1.9.0'
}
}
failFast true
stages {
stage('Punctuation & Capitalization, Using model.common_dataset_parameters.label_vocab_dir') {
steps {
sh 'cd examples/nlp/token_classification && \
label_vocab_dir=label_vocab_dir && \
mkdir -p ${label_vocab_dir} && \
punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \
capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \
printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \
printf "O\nU\n" > "${capit_label_vocab}" && \
CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \
model.train_ds.use_tarred_dataset=false \
model.train_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.validation_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.language_model.pretrained_model_name=distilbert-base-uncased \
model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \
model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \
model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
+exp_manager.explicit_log_dir=/home/TestData/nlp/token_classification_punctuation/output \
+do_testing=false && \
CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \
+do_training=false \
+do_testing=true \
~model.train_ds \
~model.validation_ds \
model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
pretrained_model=/home/TestData/nlp/token_classification_punctuation/output/checkpoints/Punctuation_and_Capitalization.nemo \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
exp_manager=null && \
rm -r "${label_vocab_dir}" && \
rm -rf /home/TestData/nlp/token_classification_punctuation/output/*'
}
}
stage('Punctuation & Capitalization, Using model.common_dataset_parameters.{punct,capit}_label_ids') {
steps {
sh 'cd examples/nlp/token_classification && \
conf_path=/home/TestData/nlp/token_classification_punctuation && \
conf_name=punctuation_capitalization_config_with_ids && \
cp conf/punctuation_capitalization_config.yaml "${conf_path}/${conf_name}.yaml" && \
sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \
"${conf_path}/${conf_name}.yaml" && \
sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \
"${conf_path}/${conf_name}.yaml" && \
CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \
--config-path "${conf_path}" \
--config-name "${conf_name}" \
model.train_ds.use_tarred_dataset=false \
model.train_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.validation_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
model.language_model.pretrained_model_name=distilbert-base-uncased \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
+exp_manager.explicit_log_dir=/home/TestData/nlp/token_classification_punctuation/output \
+do_testing=false && \
CUDA_LAUNCH_BLOCKING=1 python punctuation_capitalization_train_evaluate.py \
+do_training=false \
+do_testing=true \
~model.train_ds \
~model.validation_ds \
model.test_ds.ds_item=/home/TestData/nlp/token_classification_punctuation \
pretrained_model=/home/TestData/nlp/token_classification_punctuation/output/checkpoints/Punctuation_and_Capitalization.nemo \
+model.train_ds.use_cache=false \
+model.validation_ds.use_cache=false \
+model.test_ds.use_cache=false \
trainer.devices=[0,1] \
trainer.strategy=ddp \
trainer.max_epochs=1 \
exp_manager=null && \
rm -rf /home/TestData/nlp/token_classification_punctuation/output/* && \
rm "${conf_path}/${conf_name}.yaml"'
}
}
}
}
stage('Punctuation & Capitalization inference') {
when {
anyOf {
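The Jenkinsfile hunk above adds two stages that drive the same punctuation-and-capitalization training run with two ways of supplying the label vocabularies: the first writes CSV files and points model.common_dataset_parameters.label_vocab_dir at them, while the second sed-patches the equivalent {punct,capit}_label_ids dictionaries into a copy of the YAML config. A minimal Python sketch of those two inputs (file names and label sets are taken from the stages above; the assumption that line order defines the label ids, pad label 'O' first, is mine):

# Sketch: the label-vocabulary files used by the first stage, and the
# equivalent inline mappings that the second stage patches into the config.
from pathlib import Path

label_vocab_dir = Path("label_vocab_dir")
label_vocab_dir.mkdir(exist_ok=True)

# One label per line; line order is assumed to define the ids (0, 1, 2, ...).
(label_vocab_dir / "punct_label_vocab.csv").write_text("O\n,\n.\n?\n")
(label_vocab_dir / "capit_label_vocab.csv").write_text("O\nU\n")

# Inline form used by the second stage:
punct_label_ids = {"O": 0, ",": 1, ".": 2, "?": 3}
capit_label_ids = {"O": 0, "U": 1}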
@@ -19,6 +19,7 @@
import pickle
import re
import shutil
import tempfile
from collections import deque
from pathlib import Path
from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Type, Union
@@ -160,36 +161,44 @@ def process_fragment(
special_tokens=special_tokens,
use_fast=use_fast_tokenizer,
)
tmp_text = output_dir / f'tmp_text_{fragment_idx}.txt'
tmp_labels = output_dir / f'tmp_labels_{fragment_idx}.txt'
with text_file.open() as tf, labels_file.open() as lf, tmp_text.open('w') as otf, tmp_labels.open('w') as olf:
tf.seek(text_start_pos)
lf.seek(label_start_pos)
for _ in range(lines_per_dataset_fragment):
text_line = tf.readline()
if not text_line:
break
otf.write(text_line)
olf.write(lf.readline())
dataset = BertPunctuationCapitalizationDataset(
tmp_text,
tmp_labels,
max_seq_length,
tokenizer,
tokens_in_batch=tokens_in_batch,
pad_label=pad_label,
punct_label_ids=punct_label_ids,
capit_label_ids=capit_label_ids,
n_jobs=0,
use_cache=False,
add_masks_and_segment_ids_to_batch=False,
verbose=False,
tokenization_progress_queue=tokenization_progress_queue,
batch_mark_up_progress_queue=batch_mark_up_progress_queue,
batch_building_progress_queue=batch_building_progress_queue,
)
tmp_text.unlink()
tmp_labels.unlink()
tmp_text: Optional[str] = None
tmp_labels: Optional[str] = None
try:
otfd, tmp_text = tempfile.mkstemp(suffix='.txt', prefix=f'text_{fragment_idx}_', dir=output_dir, text=True)
olfd, tmp_labels = tempfile.mkstemp(suffix='.txt', prefix=f'labels_{fragment_idx}_', dir=output_dir, text=True)
with text_file.open() as tf, labels_file.open() as lf, os.fdopen(otfd, 'w') as otf, os.fdopen(
olfd, 'w'
) as olf:
tf.seek(text_start_pos)
lf.seek(label_start_pos)
for _ in range(lines_per_dataset_fragment):
text_line = tf.readline()
if not text_line:
break
otf.write(text_line)
olf.write(lf.readline())
dataset = BertPunctuationCapitalizationDataset(
tmp_text,
tmp_labels,
max_seq_length,
tokenizer,
tokens_in_batch=tokens_in_batch,
pad_label=pad_label,
punct_label_ids=punct_label_ids,
capit_label_ids=capit_label_ids,
n_jobs=0,
use_cache=False,
add_masks_and_segment_ids_to_batch=False,
verbose=False,
tokenization_progress_queue=tokenization_progress_queue,
batch_mark_up_progress_queue=batch_mark_up_progress_queue,
batch_building_progress_queue=batch_building_progress_queue,
)
finally:
if tmp_text is not None and os.path.exists(tmp_text):
os.remove(tmp_text)
if tmp_labels is not None and os.path.exists(tmp_labels):
os.remove(tmp_labels)
dataset.features_pkl.unlink()
tar_ctr = 0
current_file_name = output_dir / TAR_FRAGMENT_TMPL_IN_PROGRESS.format(fragment_idx=fragment_idx, file_idx=tar_ctr)
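The hunk above, from what appears to be the tarred-dataset creation code for punctuation and capitalization, replaces the hand-named tmp_text_*/tmp_labels_* files (previously unlinked only on the success path) with tempfile.mkstemp handles that are wrapped via os.fdopen and removed in a finally block, so worker fragments cannot collide on file names and temporaries are cleaned up even on failure. A self-contained sketch of that pattern, with placeholder contents standing in for the real fragment copying:

import os
import tempfile
from pathlib import Path
from typing import Optional

output_dir = Path(".")   # placeholder; the real code writes next to the tar fragments
fragment_idx = 0

tmp_text: Optional[str] = None
tmp_labels: Optional[str] = None
try:
    # mkstemp returns an already-open file descriptor plus a unique path.
    otfd, tmp_text = tempfile.mkstemp(suffix='.txt', prefix=f'text_{fragment_idx}_', dir=output_dir, text=True)
    olfd, tmp_labels = tempfile.mkstemp(suffix='.txt', prefix=f'labels_{fragment_idx}_', dir=output_dir, text=True)
    with os.fdopen(otfd, 'w') as otf, os.fdopen(olfd, 'w') as olf:
        otf.write('a text line\n')   # stand-in for copying the fragment's lines
        olf.write('O O O\n')
    # ... build the dataset from tmp_text / tmp_labels here ...
finally:
    # Remove the temporaries even if dataset construction raises.
    for path in (tmp_text, tmp_labels):
        if path is not None and os.path.exists(path):
            os.remove(path)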
@@ -638,16 +638,16 @@ def _check_label_config_parameters(self) -> None:
)

def _extract_label_vocab_files_from_config(self) -> Tuple[Optional[Path], Optional[Path]]:
if self._cfg.common_dataset_parameters.label_vocab_dir is None:
if self._is_model_being_restored():
punct_label_vocab_file = self._cfg.class_labels.punct_labels_file
capit_label_vocab_file = self._cfg.class_labels.capit_labels_file
else:
punct_label_vocab_file, capit_label_vocab_file = None, None
if self._is_model_being_restored():
punct_label_vocab_file = self._cfg.class_labels.punct_labels_file
capit_label_vocab_file = self._cfg.class_labels.capit_labels_file
else:
label_vocab_dir = Path(self._cfg.common_dataset_parameters.label_vocab_dir).expanduser()
punct_label_vocab_file = label_vocab_dir / self._cfg.class_labels.punct_labels_file
capit_label_vocab_file = label_vocab_dir / self._cfg.class_labels.capit_labels_file
if self._cfg.common_dataset_parameters.label_vocab_dir is None:
punct_label_vocab_file, capit_label_vocab_file = None, None
else:
label_vocab_dir = Path(self._cfg.common_dataset_parameters.label_vocab_dir).expanduser()
punct_label_vocab_file = label_vocab_dir / self._cfg.class_labels.punct_labels_file
capit_label_vocab_file = label_vocab_dir / self._cfg.class_labels.capit_labels_file
return punct_label_vocab_file, capit_label_vocab_file

def _set_label_ids(self) -> None:
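The reordering above changes the precedence inside _extract_label_vocab_files_from_config: a model being restored from a checkpoint now takes the label file names straight from class_labels even when common_dataset_parameters.label_vocab_dir is set, and only a model built from scratch consults label_vocab_dir (returning None, None when it is absent). A rough standalone restatement of that logic (hypothetical free function, not the actual class method):

from pathlib import Path
from typing import Optional, Tuple

def extract_label_vocab_files(cfg, is_being_restored: bool) -> Tuple[Optional[Path], Optional[Path]]:
    if is_being_restored:
        # Restored checkpoints already carry their label files under class_labels.
        return Path(cfg.class_labels.punct_labels_file), Path(cfg.class_labels.capit_labels_file)
    if cfg.common_dataset_parameters.label_vocab_dir is None:
        return None, None
    label_vocab_dir = Path(cfg.common_dataset_parameters.label_vocab_dir).expanduser()
    return (
        label_vocab_dir / cfg.class_labels.punct_labels_file,
        label_vocab_dir / cfg.class_labels.capit_labels_file,
    )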
22 changes: 11 additions & 11 deletions nemo_text_processing/text_normalization/ru/data/measurements.tsv
@@ -84,17 +84,17 @@
га гектарами
га гектаре
га гектарах
м² квадратный метр -0.1
м² квадратные метры -0.1
м² квадратного метра -0.1
м² квадратных метров -0.1
м² квадратному метру -0.1
м² квадратным метрам -0.1
м² квадратные метры -0.1
м² квадратным метром -0.1
м² квадратными метрами -0.1
м² квадратном метре -0.1
м² квадратных метрах -0.1
м² квадратный метр -0.11
м² квадратные метры -0.11
м² квадратного метра -0.11
м² квадратных метров -0.11
м² квадратному метру -0.11
м² квадратным метрам -0.11
м² квадратные метры -0.11
м² квадратным метром -0.11
м² квадратными метрами -0.11
м² квадратном метре -0.11
м² квадратных метрах -0.11
кв. м. квадратный метр -0.1
кв. м. квадратные метры -0.1
кв. м. квадратного метра -0.1
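In the rows above, the third column of every м² entry changes from -0.1 to -0.11 while the кв. м. entries keep -0.1; the column is presumably a weight used by the Russian normalization grammar, and the change appears intended to break the tie between the two spellings. A small sketch for inspecting the affected rows (the path is the one in the diff header; the "weight" reading of the third column is an assumption):

import csv

path = "nemo_text_processing/text_normalization/ru/data/measurements.tsv"
with open(path, encoding="utf-8") as f:
    for row in csv.reader(f, delimiter="\t"):
        if row and row[0] in ("м²", "кв. м."):
            # Columns: abbreviation, expansion, optional weight.
            print(row)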
4 changes: 2 additions & 2 deletions scripts/speaker_tasks/pathsfiles_to_manifest.py
@@ -50,15 +50,15 @@ def get_dict_from_wavlist(pathlist):
path_dict = od()
pathlist = sorted(pathlist)
for line_path in pathlist:
uniq_id = os.path.basename(line_path).split('.')[0]
uniq_id = os.path.splitext(os.path.basename(line_path))[0]
path_dict[uniq_id] = line_path
return path_dict


def get_dict_from_list(data_pathlist, uniqids):
path_dict = {}
for line_path in data_pathlist:
uniq_id = os.path.basename(line_path).split('.')[0]
uniq_id = os.path.splitext(os.path.basename(line_path))[0]
if uniq_id in uniqids:
path_dict[uniq_id] = line_path
else:
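The two changed lines above swap os.path.basename(...).split('.')[0] for os.path.splitext(os.path.basename(...))[0], so a base name that itself contains dots keeps its full unique id instead of being cut at the first dot. For example (hypothetical file name):

import os

path = "/data/audio/speaker1.session2.wav"   # hypothetical name containing a dot
print(os.path.basename(path).split(".")[0])           # 'speaker1'          -- truncated id
print(os.path.splitext(os.path.basename(path))[0])    # 'speaker1.session2' -- full unique id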
6 changes: 3 additions & 3 deletions tutorials/nlp/02_NLP_Tokenizers.ipynb
@@ -98,7 +98,7 @@
"\n",
"Hugging Face and Megatron tokenizers (which uses Hugging Face underneath) can be automatically instantiated by only `tokenizer_name`, which downloads the corresponding `vocab_file` from the internet. \n",
"\n",
"For SentencePieceTokenizer, WordTokenizer, and CharTokenizers `tokenizer_model` or/and `vocab_file` can be generated offline in advance using [`scripts/tokenizers/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/stable/scripts/process_asr_text_tokenizer.py)\n",
"For SentencePieceTokenizer, WordTokenizer, and CharTokenizers `tokenizer_model` or/and `vocab_file` can be generated offline in advance using [`scripts/tokenizers/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/stable/scripts/tokenizers/process_asr_text_tokenizer.py)\n",
"\n",
"The tokenizers in NeMo are designed to be used interchangeably, especially when\n",
"used in combination with a BERT-based model.\n",
@@ -381,7 +381,7 @@
"id": "ykwKmREuPQE-"
},
"source": [
"We use the [`scripts/tokenizers/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/stable/scripts/process_asr_text_tokenizer.py) script to create a custom tokenizer model with its own vocabulary from an input file"
"We use the [`scripts/tokenizers/process_asr_text_tokenizer.py`](https://github.com/NVIDIA/NeMo/blob/stable/scripts/tokenizers/process_asr_text_tokenizer.py) script to create a custom tokenizer model with its own vocabulary from an input file"
]
},
{
@@ -585,4 +585,4 @@
},
"nbformat": 4,
"nbformat_minor": 1
}
}
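Both edits in this notebook repair the link target so it matches the scripts/tokenizers/process_asr_text_tokenizer.py path already shown in the link text. For the surrounding cell about instantiating a Hugging Face tokenizer from its name alone, a minimal illustration using the transformers library directly (not NeMo's wrapper; the model name is just an example that also appears in the CI stages above):

from transformers import AutoTokenizer

# The vocabulary is downloaded automatically from the tokenizer name.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer.tokenize("NeMo tokenizers are designed to be interchangeable."))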
2 changes: 1 addition & 1 deletion tutorials/text_processing/WFST_Tutorial.ipynb
@@ -6754,7 +6754,7 @@
"Our last step is to create a universal Verbalizer for all classes. This is very similar to development of `ClassifierFst`, except that the Verbalizer breaks its normalization task into two components:\n",
"- `VerbalizeFst`, which removes formatting for each token\n",
"- `VerbalizeFinalFst`, which extends `VerbalizeFst` across all tokens in a string\n",
"Why two componenets when `tokenize_and_classify` was one? Because Sparrowhawk performs all the functionality of `VerbalizeFinalFst`, so its inclusion would break deployment. However, without it, your NeMo grammar would be unable to function at base. So we separate the two to allow the best of both world."
"Why two components when `tokenize_and_classify` was one? Because Sparrowhawk performs all the functionality of `VerbalizeFinalFst`, so its inclusion would break deployment. However, without it, your NeMo grammar would be unable to function at base. So we separate the two to allow the best of both world."
]
},
{
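The corrected cell explains the split: VerbalizeFst strips the class formatting from a single token, while VerbalizeFinalFst extends that rule across every token in the string and is supplied by Sparrowhawk at deployment time. As a plain-Python analogy of that two-level structure (toy token representation, not the actual pynini/GraphFst API):

def verbalize_token(token: dict) -> str:
    # Single-token step: drop the formatting, keep only the spoken form.
    return token["spoken"]

def verbalize_final(tokens: list) -> str:
    # Whole-string step: apply the single-token rule across all tokens.
    return " ".join(verbalize_token(t) for t in tokens)

print(verbalize_final([{"spoken": "twenty"}, {"spoken": "dollars"}]))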
