diff --git a/examples/tts/conf/aligner.yaml b/examples/tts/conf/aligner.yaml index 88ab5906df9c..7b2fa9660b82 100644 --- a/examples/tts/conf/aligner.yaml +++ b/examples/tts/conf/aligner.yaml @@ -19,7 +19,7 @@ lowfreq: 0 highfreq: 8000 window: hann -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07" heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/examples/tts/conf/fastpitch_align_44100.yaml b/examples/tts/conf/fastpitch_align_44100.yaml index 7c7e9fe3b433..314c7c32e694 100644 --- a/examples/tts/conf/fastpitch_align_44100.yaml +++ b/examples/tts/conf/fastpitch_align_44100.yaml @@ -27,7 +27,7 @@ lowfreq: 0 highfreq: null window: hann -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07" heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/examples/tts/conf/fastpitch_align_v1.05.yaml b/examples/tts/conf/fastpitch_align_v1.05.yaml index 47a32c7a897a..de4c60308d0e 100644 --- a/examples/tts/conf/fastpitch_align_v1.05.yaml +++ b/examples/tts/conf/fastpitch_align_v1.05.yaml @@ -27,7 +27,7 @@ lowfreq: 0 highfreq: 8000 window: hann -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07" heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/examples/tts/conf/mixer-tts.yaml b/examples/tts/conf/mixer-tts.yaml index aac8ba92048f..a7b3941c4fef 100644 --- a/examples/tts/conf/mixer-tts.yaml +++ b/examples/tts/conf/mixer-tts.yaml @@ -27,7 +27,7 @@ lowfreq: 0 highfreq: 8000 window: hann -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07" heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/examples/tts/conf/tacotron2.yaml b/examples/tts/conf/tacotron2.yaml index 31d4ea5d1c0e..664c78d0a151 100644 --- a/examples/tts/conf/tacotron2.yaml +++ b/examples/tts/conf/tacotron2.yaml @@ -9,7 +9,7 @@ validation_datasets: ??? sup_data_path: null sup_data_types: null -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07" heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index e1336a414610..ae33ad412ae6 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -47,7 +47,7 @@ @dataclass class G2PConfig: _target_: str = "nemo.collections.tts.torch.g2ps.EnglishG2p" - phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" + phoneme_dict: str = "scripts/tts_dataset_files/cmudict-0.7b_nv22.07" heteronyms: str = "scripts/tts_dataset_files/heteronyms-030921" phoneme_probability: float = 0.5 diff --git a/nemo/collections/tts/torch/g2ps.py b/nemo/collections/tts/torch/g2ps.py index a6286aa5710c..1f9f845a1dc9 100644 --- a/nemo/collections/tts/torch/g2ps.py +++ b/nemo/collections/tts/torch/g2ps.py @@ -139,7 +139,7 @@ def _parse_as_cmu_dict(phoneme_dict_path=None, encoding='latin-1'): f"English g2p_dict will be used from nltk.corpus.cmudict.dict(), because phoneme_dict_path=None. " "Note that nltk.corpus.cmudict.dict() has old version (0.6) of CMUDict. " "You can use the latest official version of CMUDict (0.7b) with additional changes from NVIDIA directly from NeMo " - "using the path scripts/tts_dataset_files/cmudict-0.7b_nv22.01." + "using the path scripts/tts_dataset_files/cmudict-0.7b_nv22.07." ) return nltk.corpus.cmudict.dict() diff --git a/nemo/collections/tts/torch/tts_dataset.yaml b/nemo/collections/tts/torch/tts_dataset.yaml index e7d122d2c6da..013b61f74bb9 100644 --- a/nemo/collections/tts/torch/tts_dataset.yaml +++ b/nemo/collections/tts/torch/tts_dataset.yaml @@ -42,5 +42,5 @@ tts_dataset: pad_with_space: True g2p: _target_: nemo.collections.tts.torch.g2ps.EnglishG2p - phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" + phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07" heteronyms: "scripts/tts_dataset_files/heteronyms-030921" diff --git a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml index 3be38d1259c5..86667cd499d9 100644 --- a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml +++ b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_fastpitch_align.yaml @@ -4,7 +4,7 @@ manifest_filepath: "train_manifest.json" sup_data_path: "sup_data" sup_data_types: [ "align_prior_matrix", "pitch" ] whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07" heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" dataset: diff --git a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml index 8ccb969c288c..d17b6252e9dc 100644 --- a/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml +++ b/scripts/dataset_processing/tts/ljspeech/ds_conf/ds_for_mixer_tts.yaml @@ -4,7 +4,7 @@ manifest_filepath: "train_manifest.json" sup_data_path: "sup_data" sup_data_types: [ "align_prior_matrix", "pitch" ] whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.07" heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" dataset: diff --git a/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 b/scripts/tts_dataset_files/cmudict-0.7b_nv22.07 similarity index 99% rename from scripts/tts_dataset_files/cmudict-0.7b_nv22.01 rename to scripts/tts_dataset_files/cmudict-0.7b_nv22.07 index ec41c21ec18f..ebc75bdbbecd 100644 --- a/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 +++ b/scripts/tts_dataset_files/cmudict-0.7b_nv22.07 @@ -133889,6 +133889,7 @@ CSP S IY1 EH1 S P IY1 CTR S IH1 T IY1 AA1 R CUDA K UW1 D AH0 CUDNN K UW1 D IY1 EH2 N EH2 N +CUSTOMIZABLE K AH2 S T AH0 M AY1 Z AH0 B AH0 L CYBERCRIME S AY1 B ER0 K R AY1 M DATACENTER D EY1 T AH0 S EH2 N T ER0 DDOS D IY2 D AO1 S @@ -133957,8 +133958,8 @@ NGC EH1 N JH IY1 S IY1 NGX EH1 N JH IY1 EH1 K S NHS EH1 N EY1 CH EH1 S NIH EH1 N AY1 EY1 CH -NVIDIA EH0 N V IY1 D IY0 AH0 -NVIDIA'S EH0 N V IY1 D IY0 AH0 Z +NVIDIA EH0 N V IH1 D IY0 AH0 +NVIDIA'S EH0 N V IH1 D IY0 AH0 Z NVLINK EH1 N V IY1 L IH1 NG K NVME EH1 N V IY1 EH1 M IY1 NVSWITCH EH1 N V IH1 S W IH1 CH diff --git a/tutorials/tts/FastPitch_Finetuning.ipynb b/tutorials/tts/FastPitch_Finetuning.ipynb index b9099362ffa8..19b509502ef3 100755 --- a/tutorials/tts/FastPitch_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Finetuning.ipynb @@ -244,7 +244,7 @@ "source": [ "# additional files\n", "!mkdir -p tts_dataset_files && cd tts_dataset_files \\\n", - "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n", + "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.07 \\\n", "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-030921 \\\n", "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv \\\n", "&& cd .." @@ -286,7 +286,7 @@ " train_dataset=./6097_manifest_train_dur_5_mins_local.json \\\n", " validation_datasets=./6097_manifest_dev_ns_all_local.json \\\n", " sup_data_path=./fastpitch_sup_data \\\n", - " phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.01 \\\n", + " phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.07 \\\n", " heteronyms_path=tts_dataset_files/heteronyms-030921 \\\n", " whitelist_path=tts_dataset_files/lj_speech.tsv \\\n", " exp_manager.exp_dir=./ljspeech_to_6097_no_mixing_5_mins \\\n", @@ -318,7 +318,7 @@ " sup_data_path=./fastpitch_sup_data`\n", " * We tell the script what manifest files to train and eval on, as well as where supplementary data is located (or will be calculated and saved during training if not provided).\n", " \n", - "* `phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.01 \n", + "* `phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.07 \n", "heteronyms_path=tts_dataset_files/heteronyms-030921\n", "whitelist_path=tts_dataset_files/lj_speech.tsv \n", "`\n", @@ -718,4 +718,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb index 012a90bfb7a8..6c555c32b5a5 100644 --- a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb @@ -226,7 +226,7 @@ "\n", "# additional files\n", "!mkdir -p tts_dataset_files && cd tts_dataset_files \\\n", - "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n", + "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.07 \\\n", "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-030921 \\\n", "&& wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv \\\n", "&& cd .." @@ -428,7 +428,7 @@ "\n", "# Grapheme-to-phoneme module\n", "g2p = EnglishG2p(\n", - " phoneme_dict=\"tts_dataset_files/cmudict-0.7b_nv22.01\",\n", + " phoneme_dict=\"tts_dataset_files/cmudict-0.7b_nv22.07\",\n", " heteronyms=\"tts_dataset_files/heteronyms-030921\"\n", ")\n", "\n", @@ -554,7 +554,7 @@ "validation_datasets=tests/data/asr/an4_val.json \\\n", "sup_data_types=\"['align_prior_matrix', 'pitch']\" \\\n", "sup_data_path={mixer_tts_sup_data_path} \\\n", - "phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.01 \\\n", + "phoneme_dict_path=tts_dataset_files/cmudict-0.7b_nv22.07 \\\n", "heteronyms_path=tts_dataset_files/heteronyms-030921 \\\n", "whitelist_path=tts_dataset_files/lj_speech.tsv \\\n", "pitch_mean={pitch_mean} \\\n", @@ -606,4 +606,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index 0ad3f114d458..3602e4fec24f 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -163,7 +163,7 @@ "# We will also need a few extra files for handling text.\n", "!(mkdir -p scripts/tts_dataset_files \\\n", " && cd scripts/tts_dataset_files \\\n", - " && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.01 \\\n", + " && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/cmudict-0.7b_nv22.07 \\\n", " && wget wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tts_dataset_files/heteronyms-030921 \\\n", " && cd ..)\n", " \n", @@ -231,7 +231,7 @@ "sup_data_path: null\n", "sup_data_types: null\n", "\n", - "phoneme_dict_path: \"scripts/tts_dataset_files/cmudict-0.7b_nv22.01\"\n", + "phoneme_dict_path: \"scripts/tts_dataset_files/cmudict-0.7b_nv22.07\"\n", "heteronyms_path: \"scripts/tts_dataset_files/heteronyms-030921\"\n", "whitelist_path: \"nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv\"\n", "```\n",