NVIDIA · XuesongYang · Jul 8, 2022 · Jul 5, 2022 · Jul 8, 2022 · Jul 8, 2022
diff --git a/examples/tts/conf/aligner.yaml b/examples/tts/conf/aligner.yaml
@@ -19,7 +19,7 @@ lowfreq: 0
 highfreq: 8000
 window: hann
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 

diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml
@@ -27,7 +27,7 @@ lowfreq: 0
 highfreq: null
 window: hann
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 

diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml
@@ -27,7 +27,7 @@ lowfreq: 0
 highfreq: 8000
 window: hann
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 

diff --git a/examples/tts/conf/mixer-tts.yaml b/examples/tts/conf/mixer-tts.yaml
@@ -27,7 +27,7 @@ lowfreq: 0
 highfreq: 8000
 window: hann
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 

diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml
@@ -9,7 +9,7 @@ validation_datasets: ???
 sup_data_path: null
 sup_data_types: null
 
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921"
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
 

diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py
@@ -47,7 +47,7 @@
 @dataclass
 class G2PConfig:
     _target_: str = "nemo.collections.tts.torch.g2ps.EnglishG2p"
-    phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.01"
+    phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.07"
     heteronyms: str = "scripts/tts_dataset_files/heteronyms-030921"
     phoneme_probability: float = 0.5
 

diff --git a/nemo/collections/tts/torch/g2ps.py b/nemo/collections/tts/torch/g2ps.py
@@ -139,7 +139,7 @@ def _parse_as_cmu_dict(phoneme_dict_path=None, encoding='latin-1'):
                 f"English g2p_dict will be used from nltk.corpus.cmudict.dict(), because phoneme_dict_path=None. "
                 "Note that nltk.corpus.cmudict.dict() has old version (0.6) of CMUDict. "
                 "You can use the latest official version of CMUDict (0.7b) with additional changes from NVIDIA directly from NeMo "
-                "using the path scripts/tts_dataset_files/cmudict-0.7b_nv22.01."
+                "using the path scripts/tts_dataset_files/cmudict-0.7b_nv22.07."
             )
 
             return nltk.corpus.cmudict.dict()

diff --git a/nemo/collections/tts/torch/tts_dataset.yaml b/nemo/collections/tts/torch/tts_dataset.yaml
@@ -42,5 +42,5 @@ tts_dataset:
     pad_with_space: True
     g2p:
       _target_: nemo.collections.tts.torch.g2ps.EnglishG2p
-      phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01"
+      phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07"
       heteronyms: "scripts/tts_dataset_files/heteronyms-030921"
diff --git a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml
@@ -4,7 +4,7 @@ manifest_filepath: "train_manifest.json"
 sup_data_path: "sup_data"
 sup_data_types: [ "align_prior_matrix", "pitch" ]
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921"
 
 dataset:

diff --git a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml
@@ -4,7 +4,7 @@ manifest_filepath: "train_manifest.json"
 sup_data_path: "sup_data"
 sup_data_types: [ "align_prior_matrix", "pitch" ]
 whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
-phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01"
+phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07"
 heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921"
 
 dataset:

diff --git a/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 b/scripts/tts_dataset_files/cmudict-0.7b_nv22.07
@@ -133889,6 +133889,7 @@ CSP  S IY1 EH1 S P IY1
 CTR  S IH1 T IY1 AA1 R
 CUDA  K UW1 D AH0
 CUDNN  K UW1 D IY1 EH2 N EH2 N
+CUSTOMIZABLE  K AH2 S T AH0 M AY1 Z AH0 B AH0 L
 CYBERCRIME  S AY1 B ER0 K R AY1 M
 DATACENTER  D EY1 T AH0 S EH2 N T ER0
 DDOS  D IY2 D AO1 S
@@ -133957,8 +133958,8 @@ NGC  EH1 N JH IY1 S IY1
 NGX  EH1 N JH IY1 EH1 K S
 NHS  EH1 N EY1 CH EH1 S
 NIH  EH1 N AY1 EY1 CH
-NVIDIA  EH0 N V IY1 D IY0 AH0
-NVIDIA'S  EH0 N V IY1 D IY0 AH0 Z
+NVIDIA  EH0 N V IH1 D IY0 AH0
+NVIDIA'S  EH0 N V IH1 D IY0 AH0 Z
 NVLINK  EH1 N V IY1 L IH1 NG K
 NVME  EH1 N V IY1 EH1 M IY1
 NVSWITCH  EH1 N V IH1 S W IH1 CH

diff --git a/tutorials/tts/FastPitch_Finetuning.ipynb b/tutorials/tts/FastPitch_Finetuning.ipynb
@@ -244,7 +244,7 @@
             "source": [
                 "# additional files\n",
                 "!mkdir -p tts_dataset_files && cd tts_dataset_files \\\n",
-                "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n",
+                "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.07 \\\n",
                 "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-030921 \\\n",
                 "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv \\\n",
                 "&& cd .."
@@ -286,7 +286,7 @@
                 "  train_dataset=./6097_manifest_train_dur_5_mins_local.json \\\n",
                 "  validation_datasets=./6097_manifest_dev_ns_all_local.json \\\n",
                 "  sup_data_path=./fastpitch_sup_data \\\n",
-                "  phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.01 \\\n",
+                "  phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.07 \\\n",
                 "  heteronyms_path=tts_dataset_files/heteronyms-030921 \\\n",
                 "  whitelist_path=tts_dataset_files/lj_speech.tsv \\\n",
                 "  exp_manager.exp_dir=./ljspeech_to_6097_no_mixing_5_mins \\\n",
@@ -318,7 +318,7 @@
                 "  sup_data_path=./fastpitch_sup_data`\n",
                 "  * We tell the script what manifest files to train and eval on, as well as where supplementary data is located (or will be calculated and saved during training if not provided).\n",
                 "  \n",
-                "* `phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.01 \n",
+                "* `phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.07 \n",
                 "heteronyms_path=tts_dataset_files/heteronyms-030921\n",
                 "whitelist_path=tts_dataset_files/lj_speech.tsv \n",
                 "`\n",
@@ -718,4 +718,4 @@
     },
     "nbformat": 4,
     "nbformat_minor": 5
-}
+}
diff --git a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb
@@ -226,7 +226,7 @@
                 "\n",
                 "# additional files\n",
                 "!mkdir -p tts_dataset_files && cd tts_dataset_files \\\n",
-                "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n",
+                "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.07 \\\n",
                 "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-030921 \\\n",
                 "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv \\\n",
                 "&& cd .."
@@ -428,7 +428,7 @@
                 "\n",
                 "# Grapheme-to-phoneme module\n",
                 "g2p = EnglishG2p(\n",
-                "    phoneme_dict=\"tts_dataset_files/cmudict-0.7b_nv22.01\",\n",
+                "    phoneme_dict=\"tts_dataset_files/cmudict-0.7b_nv22.07\",\n",
                 "    heteronyms=\"tts_dataset_files/heteronyms-030921\"\n",
                 ")\n",
                 "\n",
@@ -554,7 +554,7 @@
                 "validation_datasets=tests/data/asr/an4_val.json \\\n",
                 "sup_data_types=\"['align_prior_matrix', 'pitch']\" \\\n",
                 "sup_data_path={mixer_tts_sup_data_path} \\\n",
-                "phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.01 \\\n",
+                "phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.07 \\\n",
                 "heteronyms_path=tts_dataset_files/heteronyms-030921 \\\n",
                 "whitelist_path=tts_dataset_files/lj_speech.tsv \\\n",
                 "pitch_mean={pitch_mean} \\\n",
@@ -606,4 +606,4 @@
     },
     "nbformat": 4,
     "nbformat_minor": 5
-}
+}
diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb
@@ -163,7 +163,7 @@
     "# We will also need a few extra files for handling text.\n",
     "!(mkdir -p scripts/tts_dataset_files \\\n",
     "  && cd scripts/tts_dataset_files \\\n",
-    "  && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n",
+    "  && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.07 \\\n",
     "  && wget wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-030921 \\\n",
     "  && cd ..)\n",
     "        \n",
@@ -231,7 +231,7 @@
     "sup_data_path: null\n",
     "sup_data_types: null\n",
     "\n",
-    "phoneme_dict_path: \"scripts/tts_dataset_files/cmudict-0.7b_nv22.01\"\n",
+    "phoneme_dict_path: \"scripts/tts_dataset_files/cmudict-0.7b_nv22.07\"\n",
     "heteronyms_path: \"scripts/tts_dataset_files/heteronyms-030921\"\n",
     "whitelist_path: \"nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv\"\n",
     "```\n",