diff --git a/examples/tts/conf/zh/fastpitch_align_24finals_22050_ipa.yaml b/examples/tts/conf/zh/fastpitch_align_24finals_22050_ipa.yaml new file mode 100644 index 000000000000..ffb1017d5002 --- /dev/null +++ b/examples/tts/conf/zh/fastpitch_align_24finals_22050_ipa.yaml @@ -0,0 +1,253 @@ +# This config contains the default values for training a FastPitch model with an aligner using a 22.05 kHz sampling +# rate. If you want to train the model on another dataset, you can change the config values according to your dataset. +# Most dataset-specific arguments are at the top of the config file; see below. + +name: FastPitch + +train_dataset: ??? +validation_datasets: ??? +sup_data_path: ??? +sup_data_types: [ "align_prior_matrix", "pitch" ] + +# Default values from librosa.pyin +pitch_fmin: 65.40639132514966 +pitch_fmax: 1986.977294921875 + +# These frame-wise values depend on pitch_fmin and pitch_fmax; you can get them +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 221.4948272705078 for the SFbilingual dataset. +pitch_std: ??? # e.g. 64.6528930664063 for the SFbilingual dataset. 
+ +# Default values for dataset with sample_rate=22050 +sample_rate: 22050 +n_mel_channels: 80 +n_window_size: 1024 +n_window_stride: 256 +n_fft: 1024 +lowfreq: 0 +highfreq: null +window: hann + +phoneme_dict_path: "scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt" + +model: + learn_alignment: true + bin_loss_warmup_epochs: 100 + + n_speakers: 1 + max_token_duration: 75 + symbols_embedding_dim: 384 + pitch_embedding_kernel_size: 3 + + pitch_fmin: ${pitch_fmin} + pitch_fmax: ${pitch_fmax} + + pitch_mean: ${pitch_mean} + pitch_std: ${pitch_std} + + sample_rate: ${sample_rate} + n_mel_channels: ${n_mel_channels} + n_window_size: ${n_window_size} + n_window_stride: ${n_window_stride} + n_fft: ${n_fft} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + window: ${window} + + text_normalizer: + _target_: nemo_text_processing.text_normalization.normalize.Normalizer + lang: zh + input_case: cased + + text_normalizer_call_kwargs: + verbose: false + punct_pre_process: true + punct_post_process: true + + text_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer + punct: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p + phoneme_dict: ${phoneme_dict_path} + word_segmenter: jieba # Only jieba is supported now. + phoneme_prefix: "" + phoneme_case: lower + tone_prefix: "#" + ascii_letter_prefix: "" + ascii_letter_case: upper + + train_ds: + dataset: + _target_: nemo.collections.tts.data.dataset.TTSDataset + manifest_filepath: ${train_dataset} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + max_duration: null # change to null to include longer audios. 
+ min_duration: 0.1 + ignore_file: null + trim: true + trim_top_db: 50 + trim_frame_length: ${model.n_window_size} + trim_hop_length: ${model.n_window_stride} + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} + pitch_norm: true + pitch_mean: ${model.pitch_mean} + pitch_std: ${model.pitch_std} + + dataloader_params: + drop_last: false + shuffle: true + batch_size: 32 + num_workers: 12 + pin_memory: true + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.dataset.TTSDataset + manifest_filepath: ${validation_datasets} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + max_duration: null # change to null to include longer audios. + min_duration: 0.1 + ignore_file: null + trim: true + trim_top_db: 50 + trim_frame_length: ${model.n_window_size} + trim_hop_length: ${model.n_window_stride} + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} + pitch_norm: true + pitch_mean: ${model.pitch_mean} + pitch_std: ${model.pitch_std} + + dataloader_params: + drop_last: false + shuffle: false + batch_size: 32 + num_workers: 2 + pin_memory: true + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + features: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + n_fft: ${model.n_fft} + n_window_size: ${model.n_window_size} + window_size: false + n_window_stride: ${model.n_window_stride} + window_stride: false + pad_to: 1 + pad_value: 0 + sample_rate: ${model.sample_rate} + window: ${model.window} + normalize: null + preemph: null + dither: 0.0 + frame_splicing: 1 + log: true + log_zero_guard_type: add + log_zero_guard_value: 1e-05 + mag_power: 1.0 + + input_fft: #n_embed and padding_idx are added 
by the model + _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + d_embed: ${model.symbols_embedding_dim} + + output_fft: + _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + + alignment_module: + _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder + n_text_channels: ${model.symbols_embedding_dim} + + duration_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + pitch_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + optim: + name: adamw + lr: 1e-3 + betas: [0.9, 0.999] + weight_decay: 1e-6 + + sched: + name: NoamAnnealing + warmup_steps: 1000 + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim + +trainer: + num_nodes: 1 + devices: -1 # number of gpus + accelerator: gpu + strategy: ddp + precision: 16 + max_epochs: 5000 + accumulate_grad_batches: 1 + gradient_clip_val: 1000.0 + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 5 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/examples/tts/conf/zh/fastpitch_align_24finals_22050_pinyin.yaml 
b/examples/tts/conf/zh/fastpitch_align_24finals_22050_pinyin.yaml new file mode 100644 index 000000000000..0a37064ff63f --- /dev/null +++ b/examples/tts/conf/zh/fastpitch_align_24finals_22050_pinyin.yaml @@ -0,0 +1,253 @@ +# This config contains the default values for training a FastPitch model with an aligner using a 22.05 kHz sampling +# rate. If you want to train the model on another dataset, you can change the config values according to your dataset. +# Most dataset-specific arguments are at the top of the config file; see below. + +name: FastPitch + +train_dataset: ??? +validation_datasets: ??? +sup_data_path: ??? +sup_data_types: [ "align_prior_matrix", "pitch" ] + +# Default values from librosa.pyin +pitch_fmin: 65.40639132514966 +pitch_fmax: 1986.977294921875 + +# These frame-wise values depend on pitch_fmin and pitch_fmax; you can get them +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 221.4948272705078 for the SFbilingual dataset. +pitch_std: ??? # e.g. 64.6528930664063 for the SFbilingual dataset. 
+ +# Default values for dataset with sample_rate=22050 +sample_rate: 22050 +n_mel_channels: 80 +n_window_size: 1024 +n_window_stride: 256 +n_fft: 1024 +lowfreq: 0 +highfreq: null +window: hann + +phoneme_dict_path: "scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt" + +model: + learn_alignment: true + bin_loss_warmup_epochs: 100 + + n_speakers: 1 + max_token_duration: 75 + symbols_embedding_dim: 384 + pitch_embedding_kernel_size: 3 + + pitch_fmin: ${pitch_fmin} + pitch_fmax: ${pitch_fmax} + + pitch_mean: ${pitch_mean} + pitch_std: ${pitch_std} + + sample_rate: ${sample_rate} + n_mel_channels: ${n_mel_channels} + n_window_size: ${n_window_size} + n_window_stride: ${n_window_stride} + n_fft: ${n_fft} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + window: ${window} + + text_normalizer: + _target_: nemo_text_processing.text_normalization.normalize.Normalizer + lang: zh + input_case: cased + + text_normalizer_call_kwargs: + verbose: false + punct_pre_process: true + punct_post_process: true + + text_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer + punct: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p + phoneme_dict: ${phoneme_dict_path} + word_segmenter: jieba # Only jieba is supported now. + phoneme_prefix: "" + phoneme_case: lower + tone_prefix: "#" + ascii_letter_prefix: "" + ascii_letter_case: upper + + train_ds: + dataset: + _target_: nemo.collections.tts.data.dataset.TTSDataset + manifest_filepath: ${train_dataset} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + max_duration: null # change to null to include longer audios. 
+ min_duration: 0.1 + ignore_file: null + trim: true + trim_top_db: 50 + trim_frame_length: ${model.n_window_size} + trim_hop_length: ${model.n_window_stride} + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} + pitch_norm: true + pitch_mean: ${model.pitch_mean} + pitch_std: ${model.pitch_std} + + dataloader_params: + drop_last: false + shuffle: true + batch_size: 32 + num_workers: 12 + pin_memory: true + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.dataset.TTSDataset + manifest_filepath: ${validation_datasets} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + max_duration: null # change to null to include longer audios. + min_duration: 0.1 + ignore_file: null + trim: true + trim_top_db: 50 + trim_frame_length: ${model.n_window_size} + trim_hop_length: ${model.n_window_stride} + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} + pitch_norm: true + pitch_mean: ${model.pitch_mean} + pitch_std: ${model.pitch_std} + + dataloader_params: + drop_last: false + shuffle: false + batch_size: 32 + num_workers: 2 + pin_memory: true + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + features: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + n_fft: ${model.n_fft} + n_window_size: ${model.n_window_size} + window_size: false + n_window_stride: ${model.n_window_stride} + window_stride: false + pad_to: 1 + pad_value: 0 + sample_rate: ${model.sample_rate} + window: ${model.window} + normalize: null + preemph: null + dither: 0.0 + frame_splicing: 1 + log: true + log_zero_guard_type: add + log_zero_guard_value: 1e-05 + mag_power: 1.0 + + input_fft: #n_embed and padding_idx are added 
by the model + _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + d_embed: ${model.symbols_embedding_dim} + + output_fft: + _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + + alignment_module: + _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder + n_text_channels: ${model.symbols_embedding_dim} + + duration_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + pitch_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + optim: + name: adamw + lr: 1e-3 + betas: [0.9, 0.999] + weight_decay: 1e-6 + + sched: + name: NoamAnnealing + warmup_steps: 1000 + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim + +trainer: + num_nodes: 1 + devices: -1 # number of gpus + accelerator: gpu + strategy: ddp + precision: 16 + max_epochs: 5000 + accumulate_grad_batches: 1 + gradient_clip_val: 1000.0 + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 5 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/examples/tts/conf/zh/fastpitch_align_22050.yaml b/examples/tts/conf/zh/fastpitch_align_36finals_22050_ipa.yaml similarity 
index 96% rename from examples/tts/conf/zh/fastpitch_align_22050.yaml rename to examples/tts/conf/zh/fastpitch_align_36finals_22050_ipa.yaml index 6754986ec89a..da848f7b40c2 100644 --- a/examples/tts/conf/zh/fastpitch_align_22050.yaml +++ b/examples/tts/conf/zh/fastpitch_align_36finals_22050_ipa.yaml @@ -28,7 +28,7 @@ lowfreq: 0 highfreq: null window: hann -phoneme_dict_path: "scripts/tts_dataset_files/zh/pinyin_dict_nv_22.10.txt" +phoneme_dict_path: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt" model: learn_alignment: true @@ -73,6 +73,11 @@ model: _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p phoneme_dict: ${phoneme_dict_path} word_segmenter: jieba # Only jieba is supported now. + phoneme_prefix: "" + phoneme_case: lower + tone_prefix: "#" + ascii_letter_prefix: "" + ascii_letter_case: upper train_ds: dataset: diff --git a/examples/tts/conf/zh/fastpitch_align_36finals_22050_pinyin.yaml b/examples/tts/conf/zh/fastpitch_align_36finals_22050_pinyin.yaml new file mode 100644 index 000000000000..96902ef9aeca --- /dev/null +++ b/examples/tts/conf/zh/fastpitch_align_36finals_22050_pinyin.yaml @@ -0,0 +1,253 @@ +# This config contains the default values for training a FastPitch model with an aligner using a 22.05 kHz sampling +# rate. If you want to train the model on another dataset, you can change the config values according to your dataset. +# Most dataset-specific arguments are at the top of the config file; see below. + +name: FastPitch + +train_dataset: ??? +validation_datasets: ??? +sup_data_path: ??? +sup_data_types: [ "align_prior_matrix", "pitch" ] + +# Default values from librosa.pyin +pitch_fmin: 65.40639132514966 +pitch_fmax: 1986.977294921875 + +# These frame-wise values depend on pitch_fmin and pitch_fmax; you can get them +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 221.4948272705078 for the SFbilingual dataset. +pitch_std: ??? # e.g. 64.6528930664063 for the SFbilingual dataset. 
+ +# Default values for dataset with sample_rate=22050 +sample_rate: 22050 +n_mel_channels: 80 +n_window_size: 1024 +n_window_stride: 256 +n_fft: 1024 +lowfreq: 0 +highfreq: null +window: hann + +phoneme_dict_path: "scripts/tts_dataset_files/zh/36finals/pinyin_dict_nv23.05.txt" + +model: + learn_alignment: true + bin_loss_warmup_epochs: 100 + + n_speakers: 1 + max_token_duration: 75 + symbols_embedding_dim: 384 + pitch_embedding_kernel_size: 3 + + pitch_fmin: ${pitch_fmin} + pitch_fmax: ${pitch_fmax} + + pitch_mean: ${pitch_mean} + pitch_std: ${pitch_std} + + sample_rate: ${sample_rate} + n_mel_channels: ${n_mel_channels} + n_window_size: ${n_window_size} + n_window_stride: ${n_window_stride} + n_fft: ${n_fft} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + window: ${window} + + text_normalizer: + _target_: nemo_text_processing.text_normalization.normalize.Normalizer + lang: zh + input_case: cased + + text_normalizer_call_kwargs: + verbose: false + punct_pre_process: true + punct_post_process: true + + text_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer + punct: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p + phoneme_dict: ${phoneme_dict_path} + word_segmenter: jieba # Only jieba is supported now. + phoneme_prefix: "" + phoneme_case: lower + tone_prefix: "#" + ascii_letter_prefix: "" + ascii_letter_case: upper + + train_ds: + dataset: + _target_: nemo.collections.tts.data.dataset.TTSDataset + manifest_filepath: ${train_dataset} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + max_duration: null # change to null to include longer audios. 
+ min_duration: 0.1 + ignore_file: null + trim: true + trim_top_db: 50 + trim_frame_length: ${model.n_window_size} + trim_hop_length: ${model.n_window_stride} + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} + pitch_norm: true + pitch_mean: ${model.pitch_mean} + pitch_std: ${model.pitch_std} + + dataloader_params: + drop_last: false + shuffle: true + batch_size: 32 + num_workers: 12 + pin_memory: true + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.dataset.TTSDataset + manifest_filepath: ${validation_datasets} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + max_duration: null # change to null to include longer audios. + min_duration: 0.1 + ignore_file: null + trim: true + trim_top_db: 50 + trim_frame_length: ${model.n_window_size} + trim_hop_length: ${model.n_window_stride} + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} + pitch_norm: true + pitch_mean: ${model.pitch_mean} + pitch_std: ${model.pitch_std} + + dataloader_params: + drop_last: false + shuffle: false + batch_size: 32 + num_workers: 2 + pin_memory: true + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + features: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + n_fft: ${model.n_fft} + n_window_size: ${model.n_window_size} + window_size: false + n_window_stride: ${model.n_window_stride} + window_stride: false + pad_to: 1 + pad_value: 0 + sample_rate: ${model.sample_rate} + window: ${model.window} + normalize: null + preemph: null + dither: 0.0 + frame_splicing: 1 + log: true + log_zero_guard_type: add + log_zero_guard_value: 1e-05 + mag_power: 1.0 + + input_fft: #n_embed and padding_idx are added 
by the model + _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + d_embed: ${model.symbols_embedding_dim} + + output_fft: + _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + + alignment_module: + _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder + n_text_channels: ${model.symbols_embedding_dim} + + duration_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + pitch_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + optim: + name: adamw + lr: 1e-3 + betas: [0.9, 0.999] + weight_decay: 1e-6 + + sched: + name: NoamAnnealing + warmup_steps: 1000 + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim + +trainer: + num_nodes: 1 + devices: -1 # number of gpus + accelerator: gpu + strategy: ddp + precision: 16 + max_epochs: 5000 + accumulate_grad_batches: 1 + gradient_clip_val: 1000.0 + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 5 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py 
b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index abcbdb1661b9..4193cf00eb85 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -694,7 +694,10 @@ def __init__( pad_with_space=False, text_preprocessing_func=chinese_text_preprocessing, ): - """Chinese phoneme-based tokenizer. + """ + Chinese phoneme-based tokenizer. + Note: This tokenizer for now covers Chinese phonemes/tones and English letters because our dataset contains + both Chinese and English graphemes. Args: g2p: Grapheme to phoneme module. punct: Whether to reserve grapheme for basic punctuation or not. diff --git a/scripts/dataset_processing/tts/sfbilingual/ds_conf/ds_for_fastpitch_align.yaml b/scripts/dataset_processing/tts/sfbilingual/ds_conf/ds_for_fastpitch_align.yaml index 7337163f3bff..235d061a3a34 100755 --- a/scripts/dataset_processing/tts/sfbilingual/ds_conf/ds_for_fastpitch_align.yaml +++ b/scripts/dataset_processing/tts/sfbilingual/ds_conf/ds_for_fastpitch_align.yaml @@ -3,7 +3,7 @@ name: "ds_for_fastpitch_align" manifest_filepath: "train_manifest.json" sup_data_path: "sup_data" sup_data_types: [ "align_prior_matrix", "pitch" ] -phoneme_dict_path: "scripts/tts_dataset_files/zh/pinyin_dict_nv_22.10.txt" +phoneme_dict_path: "scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt" dataset: _target_: nemo.collections.tts.data.dataset.TTSDataset diff --git a/scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt b/scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt new file mode 100644 index 000000000000..f639042d41ec --- /dev/null +++ b/scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt @@ -0,0 +1,426 @@ +;;; # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +;;; # +;;; # Licensed under the Apache License, Version 2.0 (the "License"); +;;; # you may not use this file except in compliance with the License. 
+;;; # You may obtain a copy of the License at +;;; # +;;; # http://www.apache.org/licenses/LICENSE-2.0 +;;; # +;;; # Unless required by applicable law or agreed to in writing, software +;;; # distributed under the License is distributed on an "AS IS" BASIS, +;;; # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;;; # See the License for the specific language governing permissions and +;;; # limitations under the License. +;;; +A a +AI ai +AN a n +ANG a ŋ +AO au +BA p a +BAI p ai +BAN p a n +BANG p a ŋ +BAO p au +BEI p ei +BEN p ə n +BENG p ə ŋ +BI p i +BIAN p i a n +BIAO p i au +BIE p j e +BIN p i n +BING p i ŋ +BO p o +BU p u +CA tsʰ a +CAI tsʰ ai +CAN tsʰ a n +CANG tsʰ a ŋ +CAO tsʰ au +CE tsʰ ɤ +CEN tsʰ ə n +CENG tsʰ ə ŋ +CHA ʈʂʰ a +CHAI ʈʂʰ ai +CHAN ʈʂʰ a n +CHANG ʈʂʰ a ŋ +CHAO ʈʂʰ au +CHE ʈʂʰ ɤ +CHEN ʈʂʰ ə n +CHENG ʈʂʰ ə ŋ +CHI ʈʂʰ i +CHONG ʈʂʰ ʊ ŋ +CHOU ʈʂʰ ou +CHU ʈʂʰ u +CHUAI ʈʂʰ u ai +CHUAN ʈʂʰ u a n +CHUANG ʈʂʰ u a ŋ +CHUI ʈʂʰ w ei +CHUN ʈʂʰ w ə n +CHUO ʈʂʰ u o +CI tsʰ i +CONG tsʰ ʊ ŋ +COU tsʰ ou +CU tsʰ u +CUAN tsʰ u a n +CUI tsʰ w ei +CUN tsʰ w ə n +CUO tsʰ u o +DA t a +DAI t ai +DAN t a n +DANG t a ŋ +DAO t au +DE t ɤ +DEI t ei +DEN t ə n +DENG t ə ŋ +DI t i +DIA t i a +DIAN t i a n +DIAO t i au +DIE t j e +DING t i ŋ +DIU t j ou +DONG t ʊ ŋ +DOU t ou +DU t u +DUAN t u a n +DUI t w ei +DUN t w ə n +DUO t u o +E ɤ +EI ei +EN ə n +ER ɚ +FA f a +FAI f ai +FAN f a n +FANG f a ŋ +FEI f ei +FEN f ə n +FENG f ə ŋ +FO f o +FOU f ou +FU f u +GA k a +GAI k ai +GAN k a n +GANG k a ŋ +GAO k au +GE k ɤ +GEI k ei +GEN k ə n +GENG k ə ŋ +GONG k ʊ ŋ +GOU k ou +GU k u +GUA k u a +GUAI k u ai +GUAN k u a n +GUANG k u a ŋ +GUI k w ei +GUN k w ə n +GUO k u o +HA x a +HAI x ai +HAN x a n +HANG x a ŋ +HAO x au +HE x ɤ +HEI x ei +HEN x ə n +HENG x ə ŋ +HONG x ʊ ŋ +HOU x ou +HU x u +HUA x u a +HUAI x u ai +HUAN x u a n +HUANG x u a ŋ +HUI x w ei +HUN x w ə n +HUO x u o +JI tɕ i +JIA tɕ i a +JIAN tɕ i a n +JIANG tɕ i a ŋ +JIAO tɕ i au +JIE tɕ j e +JIN 
tɕ i n +JING tɕ i ŋ +JIONG tɕ i ʊ ŋ +JIU tɕ j ou +JU tɕ y +JUAN tɕ y a n +JUE tɕ ɥ e +JUN tɕ y n +KA kʰ a +KAI kʰ ai +KAN kʰ a n +KANG kʰ a ŋ +KAO kʰ au +KE kʰ ɤ +KEI kʰ ei +KEN kʰ ə n +KENG kʰ ə ŋ +KIU kʰ j ou +KONG kʰ ʊ ŋ +KOU kʰ ou +KU kʰ u +KUA kʰ u a +KUAI kʰ u ai +KUAN kʰ u a n +KUANG kʰ u a ŋ +KUI kʰ w ei +KUN kʰ w ə n +KUO kʰ u o +LA l a +LAI l ai +LAN l a n +LANG l a ŋ +LAO l au +LE l ɤ +LEI l ei +LENG l ə ŋ +LI l i +LIA l i a +LIAN l i a n +LIANG l i a ŋ +LIAO l i au +LIE l j e +LIN l i n +LING l i ŋ +LIU l j ou +LO l o +LONG l ʊ ŋ +LOU l ou +LU l u +LUAN l u a n +LUN l w ə n +LUO l u o +LV l y +LVE l ɥ e +MA m a +MAI m ai +MAN m a n +MANG m a ŋ +MAO m au +ME m ɤ +MEI m ei +MEN m ə n +MENG m ə ŋ +MI m i +MIAN m i a n +MIAO m i au +MIE m j e +MIN m i n +MING m i ŋ +MIU m j ou +MO m o +MOU m ou +MU m u +NA n a +NAI n ai +NAN n a n +NANG n a ŋ +NAO n au +NE n ɤ +NEI n ei +NEN n ə n +NENG n ə ŋ +NG n ə n +NI n i +NIAN n i a n +NIANG n i a ŋ +NIAO n i au +NIE n j e +NIN n i n +NING n i ŋ +NIU n j ou +NONG n ʊ ŋ +NOU n ou +NU n u +NUAN n u a n +NUO n u o +NV n y +NVE n ɥ e +O o +OU ou +PA pʰ a +PAI pʰ ai +PAN pʰ a n +PANG pʰ a ŋ +PAO pʰ au +PEI pʰ ei +PEN pʰ ə n +PENG pʰ ə ŋ +PI pʰ i +PIAN pʰ i a n +PIAO pʰ i au +PIE pʰ j e +PIN pʰ i n +PING pʰ i ŋ +PO pʰ o +POU pʰ ou +PU pʰ u +QI tɕʰ i +QIA tɕʰ i a +QIAN tɕʰ i a n +QIANG tɕʰ i a ŋ +QIAO tɕʰ i au +QIE tɕʰ j e +QIN tɕʰ i n +QING tɕʰ i ŋ +QIONG tɕʰ i ʊ ŋ +QIU tɕʰ j ou +QU tɕʰ y +QUAN tɕʰ y a n +QUE tɕʰ ɥ e +QUN tɕʰ y n +RAN ʐ a n +RANG ʐ a ŋ +RAO ʐ au +RE ʐ ɤ +REN ʐ ə n +RENG ʐ ə ŋ +RI ʐ i +RONG ʐ ʊ ŋ +ROU ʐ ou +RU ʐ u +RUAN ʐ u a n +RUI ʐ w ei +RUN ʐ w ə n +RUO ʐ u o +SA s a +SAI s ai +SAN s a n +SANG s a ŋ +SAO s au +SE s ɤ +SEI s ei +SEN s ə n +SENG s ə ŋ +SHA ʂ a +SHAI ʂ ai +SHAN ʂ a n +SHANG ʂ a ŋ +SHAO ʂ au +SHE ʂ ɤ +SHEI ʂ ei +SHEN ʂ ə n +SHENG ʂ ə ŋ +SHI ʂ i +SHOU ʂ ou +SHU ʂ u +SHUA ʂ u a +SHUAI ʂ u ai +SHUAN ʂ u a n +SHUANG ʂ u a ŋ +SHUI ʂ w ei +SHUN ʂ w ə n +SHUO ʂ u o +SI s i +SONG s ʊ ŋ +SOU s ou +SU 
s u +SUAN s u a n +SUI s w ei +SUN s w ə n +SUO s u o +TA tʰ a +TAI tʰ ai +TAN tʰ a n +TANG tʰ a ŋ +TAO tʰ au +TE tʰ ɤ +TEI tʰ ei +TENG tʰ ə ŋ +TI tʰ i +TIAN tʰ i a n +TIAO tʰ i au +TIE tʰ j e +TING tʰ i ŋ +TONG tʰ ʊ ŋ +TOU tʰ ou +TU tʰ u +TUAN tʰ u a n +TUI tʰ w ei +TUN tʰ w ə n +TUO tʰ u o +WA w a +WAI w ai +WAN w a n +WANG w a ŋ +WEI w ei +WEN w ə n +WENG w ə ŋ +WO w o +WU w u +XI ɕ i +XIA ɕ i a +XIAN ɕ i a n +XIANG ɕ i a ŋ +XIAO ɕ i au +XIE ɕ j e +XIN ɕ i n +XING ɕ i ŋ +XIONG ɕ i ʊ ŋ +XIU ɕ j ou +XU ɕ y +XUAN ɕ y a n +XUE ɕ ɥ e +XUN ɕ y n +YA j a +YAN j a n +YANG j a ŋ +YAO j au +YE j ɤ +YI j i +YIN j i n +YING j i ŋ +YO j o +YONG j ʊ ŋ +YOU j ou +YU j y +YUAN j y a n +YUE j ɥ e +YUN j y n +ZA ts a +ZAI ts ai +ZAN ts a n +ZANG ts a ŋ +ZAO ts au +ZE ts ɤ +ZEI ts ei +ZEN ts ə n +ZENG ts ə ŋ +ZHA ʈʂ a +ZHAI ʈʂ ai +ZHAN ʈʂ a n +ZHANG ʈʂ a ŋ +ZHAO ʈʂ au +ZHE ʈʂ ɤ +ZHEI ʈʂ ei +ZHEN ʈʂ ə n +ZHENG ʈʂ ə ŋ +ZHI ʈʂ i +ZHONG ʈʂ ʊ ŋ +ZHOU ʈʂ ou +ZHU ʈʂ u +ZHUA ʈʂ u a +ZHUAI ʈʂ u ai +ZHUAN ʈʂ u a n +ZHUANG ʈʂ u a ŋ +ZHUI ʈʂ w ei +ZHUN ʈʂ w ə n +ZHUO ʈʂ u o +ZI ts i +ZONG ts ʊ ŋ +ZOU ts ou +ZU ts u +ZUAN ts u a n +ZUI ts w ei +ZUN ts w ə n +ZUO ts u o diff --git a/scripts/tts_dataset_files/zh/pinyin_dict_nv_22.10.txt b/scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt similarity index 85% rename from scripts/tts_dataset_files/zh/pinyin_dict_nv_22.10.txt rename to scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt index 952429c1a47b..6c87c343e8fa 100644 --- a/scripts/tts_dataset_files/zh/pinyin_dict_nv_22.10.txt +++ b/scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt @@ -1,3 +1,17 @@ +;;; # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +;;; # +;;; # Licensed under the Apache License, Version 2.0 (the "License"); +;;; # you may not use this file except in compliance with the License. 
+;;; # You may obtain a copy of the License at +;;; # +;;; # http://www.apache.org/licenses/LICENSE-2.0 +;;; # +;;; # Unless required by applicable law or agreed to in writing, software +;;; # distributed under the License is distributed on an "AS IS" BASIS, +;;; # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;;; # See the License for the specific language governing permissions and +;;; # limitations under the License. +;;; A ^ A AI ^ AI AN ^ AN diff --git a/scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt b/scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt new file mode 100644 index 000000000000..edac6fd11d2d --- /dev/null +++ b/scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt @@ -0,0 +1,427 @@ +;;; # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +;;; # +;;; # Licensed under the Apache License, Version 2.0 (the "License"); +;;; # you may not use this file except in compliance with the License. +;;; # You may obtain a copy of the License at +;;; # +;;; # http://www.apache.org/licenses/LICENSE-2.0 +;;; # +;;; # Unless required by applicable law or agreed to in writing, software +;;; # distributed under the License is distributed on an "AS IS" BASIS, +;;; # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;;; # See the License for the specific language governing permissions and +;;; # limitations under the License. 
+;;; +A a +AI ai +AN a n +ANG a ŋ +AO au +BA p a +BAI p ai +BAN p a n +BANG p a ŋ +BAO p au +BEI p ei +BEN p ə n +BENG p ə ŋ +BI p i +BIAN p j ɛ n +BIANG p j a ŋ +BIAO p j au +BIE p j e +BIN p i n +BING p i ŋ +BO p o +BU p u +CA tsʰ a +CAI tsʰ ai +CAN tsʰ a n +CANG tsʰ a ŋ +CAO tsʰ au +CE tsʰ ɤ +CEI tsʰ ei +CEN tsʰ ə n +CENG tsʰ ə ŋ +CHA ʈʂʰ a +CHAI ʈʂʰ ai +CHAN ʈʂʰ a n +CHANG ʈʂʰ a ŋ +CHAO ʈʂʰ au +CHE ʈʂʰ ɤ +CHEN ʈʂʰ ə n +CHENG ʈʂʰ ə ŋ +CHI ʈʂʰ i +CHONG ʈʂʰ ʊ ŋ +CHOU ʈʂʰ ou +CHU ʈʂʰ u +CHUA ʈʂʰ w a +CHUAI ʈʂʰ w ai +CHUAN ʈʂʰ w a n +CHUANG ʈʂʰ w a ŋ +CHUI ʈʂʰ w ei +CHUN ʈʂʰ w ə n +CHUO ʈʂʰ w o +CI tsʰ i +CONG tsʰ ʊ ŋ +COU tsʰ ou +CU tsʰ u +CUAN tsʰ w a n +CUI tsʰ w ei +CUN tsʰ w ə n +CUO tsʰ w o +DA t a +DAI t ai +DAN t a n +DANG t a ŋ +DAO t au +DE t ɤ +DEI t ei +DEN t ə n +DENG t ə ŋ +DI t i +DIA t j a +DIAN t j ɛ n +DIAO t j au +DIE t j e +DING t i ŋ +DIU t j ou +DONG t ʊ ŋ +DOU t ou +DU t u +DUAN t w a n +DUI t w ei +DUN t w ə n +DUO t w o +E ɤ +EI ei +EN ə n +ENG ə ŋ +ER ɚ +FA f a +FAN f a n +FANG f a ŋ +FEI f ei +FEN f ə n +FENG f ə ŋ +FO f o +FOU f ou +FU f u +GA k a +GAI k ai +GAN k a n +GANG k a ŋ +GAO k au +GE k ɤ +GEI k ei +GEN k ə n +GENG k ə ŋ +GONG k ʊ ŋ +GOU k ou +GU k u +GUA k w a +GUAI k w ai +GUAN k w a n +GUANG k w a ŋ +GUI k w ei +GUN k w ə n +GUO k w o +HA x a +HAI x ai +HAN x a n +HANG x a ŋ +HAO x au +HE x ɤ +HEI x ei +HEN x ə n +HENG x ə ŋ +HONG x ʊ ŋ +HOU x ou +HU x u +HUA x w a +HUAI x w ai +HUAN x w a n +HUANG x w a ŋ +HUI x w ei +HUN x w ə n +HUO x w o +JI tɕ i +JIA tɕ j a +JIAN tɕ j ɛ n +JIANG tɕ j a ŋ +JIAO tɕ j au +JIE tɕ j e +JIN tɕ i n +JING tɕ i ŋ +JIONG tɕ j ʊ ŋ +JIU tɕ j ou +JU tɕ y +JUAN tɕ ɥ ɛ n +JUE tɕ ɥ e +JUN tɕ y n +KA kʰ a +KAI kʰ ai +KAN kʰ a n +KANG kʰ a ŋ +KAO kʰ au +KE kʰ ɤ +KEI kʰ ei +KEN kʰ ə n +KENG kʰ ə ŋ +KONG kʰ ʊ ŋ +KOU kʰ ou +KU kʰ u +KUA kʰ w a +KUAI kʰ w ai +KUAN kʰ w a n +KUANG kʰ w a ŋ +KUI kʰ w ei +KUN kʰ w ə n +KUO kʰ w o +LA l a +LAI l ai +LAN l a n +LANG l a ŋ +LAO l au +LE l ɤ +LEI l ei +LENG l ə ŋ +LI 
l i +LIA l j a +LIAN l j ɛ n +LIANG l j a ŋ +LIAO l j au +LIE l j e +LIN l i n +LING l i ŋ +LIU l j ou +LO l o +LONG l ʊ ŋ +LOU l ou +LU l u +LUAN l w a n +LUN l w ə n +LUO l w o +LV l y +LVE l ɥ e +MA m a +MAI m ai +MAN m a n +MANG m a ŋ +MAO m au +ME m ɤ +MEI m ei +MEN m ə n +MENG m ə ŋ +MI m i +MIAN m j ɛ n +MIAO m j au +MIE m j e +MIN m i n +MING m i ŋ +MIU m j ou +MO m o +MOU m ou +MU m u +NA n a +NAI n ai +NAN n a n +NANG n a ŋ +NAO n au +NE n ɤ +NEI n ei +NEN n ə n +NENG n ə ŋ +NI n i +NIAN n j ɛ n +NIANG n j a ŋ +NIAO n j au +NIE n j e +NIN n i n +NING n i ŋ +NIU n j ou +NONG n ʊ ŋ +NOU n ou +NU n u +NUAN n w a n +NUO n w o +NV n y +NVE n ɥ e +O o +OU ou +PA pʰ a +PAI pʰ ai +PAN pʰ a n +PANG pʰ a ŋ +PAO pʰ au +PEI pʰ ei +PEN pʰ ə n +PENG pʰ ə ŋ +PI pʰ i +PIAN pʰ j ɛ n +PIAO pʰ j au +PIE pʰ j e +PIN pʰ i n +PING pʰ i ŋ +PO pʰ o +POU pʰ ou +PU pʰ u +QI tɕʰ i +QIA tɕʰ j a +QIAN tɕʰ j ɛ n +QIANG tɕʰ j a ŋ +QIAO tɕʰ j au +QIE tɕʰ j e +QIN tɕʰ i n +QING tɕʰ i ŋ +QIONG tɕʰ j ʊ ŋ +QIU tɕʰ j ou +QU tɕʰ y +QUAN tɕʰ ɥ ɛ n +QUE tɕʰ ɥ e +QUN tɕʰ y n +RAN ʐ a n +RANG ʐ a ŋ +RAO ʐ au +RE ʐ ɤ +REN ʐ ə n +RENG ʐ ə ŋ +RI ʐ i +RONG ʐ ʊ ŋ +ROU ʐ ou +RU ʐ u +RUA ʐ w a +RUAN ʐ w a n +RUI ʐ w ei +RUN ʐ w ə n +RUO ʐ w o +SA s a +SAI s ai +SAN s a n +SANG s a ŋ +SAO s au +SE s ɤ +SEN s ə n +SENG s ə ŋ +SHA ʂ a +SHAI ʂ ai +SHAN ʂ a n +SHANG ʂ a ŋ +SHAO ʂ au +SHE ʂ ɤ +SHEI ʂ ei +SHEN ʂ ə n +SHENG ʂ ə ŋ +SHI ʂ i +SHOU ʂ ou +SHU ʂ u +SHUA ʂ w a +SHUAI ʂ w ai +SHUAN ʂ w a n +SHUANG ʂ w a ŋ +SHUI ʂ w ei +SHUN ʂ w ə n +SHUO ʂ w o +SI s i +SONG s ʊ ŋ +SOU s ou +SU s u +SUAN s w a n +SUI s w ei +SUN s w ə n +SUO s w o +TA tʰ a +TAI tʰ ai +TAN tʰ a n +TANG tʰ a ŋ +TAO tʰ au +TE tʰ ɤ +TEI tʰ ei +TENG tʰ ə ŋ +TI tʰ i +TIAN tʰ j ɛ n +TIAO tʰ j au +TIE tʰ j e +TING tʰ i ŋ +TONG tʰ ʊ ŋ +TOU tʰ ou +TU tʰ u +TUAN tʰ w a n +TUI tʰ w ei +TUN tʰ w ə n +TUO tʰ w o +WA w a +WAI w ai +WAN w a n +WANG w a ŋ +WEI w ei +WEN w ə n +WENG w ə ŋ +WO w o +WU u +XI ɕ i +XIA ɕ j a +XIAN ɕ j ɛ n +XIANG ɕ j a ŋ 
+XIAO ɕ j au +XIE ɕ j e +XIN ɕ i n +XING ɕ i ŋ +XIONG ɕ j ʊ ŋ +XIU ɕ j ou +XU ɕ y +XUAN ɕ ɥ ɛ n +XUE ɕ ɥ e +XUN ɕ y n +YA j a +YAN j ɛ n +YANG j a ŋ +YAO j au +YE j e +YI i +YIN i n +YING i ŋ +YO o +YONG j ʊ ŋ +YOU j ou +YU y +YUAN ɥ ɛ n +YUE ɥ e +YUN y n +ZA ts a +ZAI ts ai +ZAN ts a n +ZANG ts a ŋ +ZAO ts au +ZE ts ɤ +ZEI ts ei +ZEN ts ə n +ZENG ts ə ŋ +ZHA ʈʂ a +ZHAI ʈʂ ai +ZHAN ʈʂ a n +ZHANG ʈʂ a ŋ +ZHAO ʈʂ au +ZHE ʈʂ ɤ +ZHEI ʈʂ ei +ZHEN ʈʂ ə n +ZHENG ʈʂ ə ŋ +ZHI ʈʂ i +ZHONG ʈʂ ʊ ŋ +ZHOU ʈʂ ou +ZHU ʈʂ u +ZHUA ʈʂ w a +ZHUAI ʈʂ w ai +ZHUAN ʈʂ w a n +ZHUANG ʈʂ w a ŋ +ZHUI ʈʂ w ei +ZHUN ʈʂ w ə n +ZHUO ʈʂ w o +ZI ts i +ZONG ts ʊ ŋ +ZOU ts ou +ZU ts u +ZUAN ts w a n +ZUI ts w ei +ZUN ts w ə n +ZUO ts w o diff --git a/scripts/tts_dataset_files/zh/36finals/pinyin_dict_nv23.05.txt b/scripts/tts_dataset_files/zh/36finals/pinyin_dict_nv23.05.txt new file mode 100644 index 000000000000..16319ce41b4f --- /dev/null +++ b/scripts/tts_dataset_files/zh/36finals/pinyin_dict_nv23.05.txt @@ -0,0 +1,442 @@ +;;; # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +;;; # +;;; # Licensed under the Apache License, Version 2.0 (the "License"); +;;; # you may not use this file except in compliance with the License. +;;; # You may obtain a copy of the License at +;;; # +;;; # http://www.apache.org/licenses/LICENSE-2.0 +;;; # +;;; # Unless required by applicable law or agreed to in writing, software +;;; # distributed under the License is distributed on an "AS IS" BASIS, +;;; # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;;; # See the License for the specific language governing permissions and +;;; # limitations under the License. 
+;;; +;;; Notes ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +;;; [May 24, 2023] +;;; - Added syllables: {'BIANG', 'CEI', 'CHUA', 'ENG', 'RUA'} +;;; - Removed syllables that do not exist in standard Mandarin: {'FAI', 'KIU', 'NG', 'SEI'} +;;; - Increased the number of finals to 36 from 24 and updated all syllable entries. +;;; - Initials and finals are directly obtained by running, +;;; ``` +;;; from pypinyin.contrib.tone_convert import to_initials, to_finals +;;; initial = to_initials(syllable).upper() +;;; final = to_finals(syllable).upper() +;;; ``` +;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; +A A +AI AI +AN AN +ANG ANG +AO AO +BA B A +BAI B AI +BAN B AN +BANG B ANG +BAO B AO +BEI B EI +BEN B EN +BENG B ENG +BI B I +BIAN B IAN +BIANG B IANG +BIAO B IAO +BIE B IE +BIN B IN +BING B ING +BO B O +BU B U +CA C A +CAI C AI +CAN C AN +CANG C ANG +CAO C AO +CE C E +CEI C EI +CEN C EN +CENG C ENG +CHA CH A +CHAI CH AI +CHAN CH AN +CHANG CH ANG +CHAO CH AO +CHE CH E +CHEN CH EN +CHENG CH ENG +CHI CH I +CHONG CH ONG +CHOU CH OU +CHU CH U +CHUA CH UA +CHUAI CH UAI +CHUAN CH UAN +CHUANG CH UANG +CHUI CH UEI +CHUN CH UEN +CHUO CH UO +CI C I +CONG C ONG +COU C OU +CU C U +CUAN C UAN +CUI C UEI +CUN C UEN +CUO C UO +DA D A +DAI D AI +DAN D AN +DANG D ANG +DAO D AO +DE D E +DEI D EI +DEN D EN +DENG D ENG +DI D I +DIA D IA +DIAN D IAN +DIAO D IAO +DIE D IE +DING D ING +DIU D IOU +DONG D ONG +DOU D OU +DU D U +DUAN D UAN +DUI D UEI +DUN D UEN +DUO D UO +E E +EI EI +EN EN +ENG ENG +ER ER +FA F A +FAN F AN +FANG F ANG +FEI F EI +FEN F EN +FENG F ENG +FO F O +FOU F OU +FU F U +GA G A +GAI G AI +GAN G AN +GANG G ANG +GAO G AO +GE G E +GEI G EI +GEN G EN +GENG G ENG +GONG G ONG +GOU G OU +GU G U +GUA G UA +GUAI G UAI +GUAN G UAN +GUANG G UANG +GUI G UEI +GUN G UEN +GUO G UO +HA H A +HAI H AI +HAN H AN +HANG H ANG +HAO H AO +HE H E +HEI H EI +HEN H EN +HENG H ENG +HONG H ONG +HOU H OU +HU H U +HUA H UA 
+HUAI H UAI +HUAN H UAN +HUANG H UANG +HUI H UEI +HUN H UEN +HUO H UO +JI J I +JIA J IA +JIAN J IAN +JIANG J IANG +JIAO J IAO +JIE J IE +JIN J IN +JING J ING +JIONG J IONG +JIU J IOU +JU J V +JUAN J VAN +JUE J VE +JUN J VN +KA K A +KAI K AI +KAN K AN +KANG K ANG +KAO K AO +KE K E +KEI K EI +KEN K EN +KENG K ENG +KONG K ONG +KOU K OU +KU K U +KUA K UA +KUAI K UAI +KUAN K UAN +KUANG K UANG +KUI K UEI +KUN K UEN +KUO K UO +LA L A +LAI L AI +LAN L AN +LANG L ANG +LAO L AO +LE L E +LEI L EI +LENG L ENG +LI L I +LIA L IA +LIAN L IAN +LIANG L IANG +LIAO L IAO +LIE L IE +LIN L IN +LING L ING +LIU L IOU +LO L O +LONG L ONG +LOU L OU +LU L U +LUAN L UAN +LUN L UEN +LUO L UO +LV L V +LVE L VE +MA M A +MAI M AI +MAN M AN +MANG M ANG +MAO M AO +ME M E +MEI M EI +MEN M EN +MENG M ENG +MI M I +MIAN M IAN +MIAO M IAO +MIE M IE +MIN M IN +MING M ING +MIU M IOU +MO M O +MOU M OU +MU M U +NA N A +NAI N AI +NAN N AN +NANG N ANG +NAO N AO +NE N E +NEI N EI +NEN N EN +NENG N ENG +NI N I +NIAN N IAN +NIANG N IANG +NIAO N IAO +NIE N IE +NIN N IN +NING N ING +NIU N IOU +NONG N ONG +NOU N OU +NU N U +NUAN N UAN +NUO N UO +NV N V +NVE N VE +O O +OU OU +PA P A +PAI P AI +PAN P AN +PANG P ANG +PAO P AO +PEI P EI +PEN P EN +PENG P ENG +PI P I +PIAN P IAN +PIAO P IAO +PIE P IE +PIN P IN +PING P ING +PO P O +POU P OU +PU P U +QI Q I +QIA Q IA +QIAN Q IAN +QIANG Q IANG +QIAO Q IAO +QIE Q IE +QIN Q IN +QING Q ING +QIONG Q IONG +QIU Q IOU +QU Q V +QUAN Q VAN +QUE Q VE +QUN Q VN +RAN R AN +RANG R ANG +RAO R AO +RE R E +REN R EN +RENG R ENG +RI R I +RONG R ONG +ROU R OU +RU R U +RUA R UA +RUAN R UAN +RUI R UEI +RUN R UEN +RUO R UO +SA S A +SAI S AI +SAN S AN +SANG S ANG +SAO S AO +SE S E +SEN S EN +SENG S ENG +SHA SH A +SHAI SH AI +SHAN SH AN +SHANG SH ANG +SHAO SH AO +SHE SH E +SHEI SH EI +SHEN SH EN +SHENG SH ENG +SHI SH I +SHOU SH OU +SHU SH U +SHUA SH UA +SHUAI SH UAI +SHUAN SH UAN +SHUANG SH UANG +SHUI SH UEI +SHUN SH UEN +SHUO SH UO +SI S I +SONG S ONG +SOU S OU +SU S U +SUAN S UAN +SUI S UEI 
+SUN S UEN +SUO S UO +TA T A +TAI T AI +TAN T AN +TANG T ANG +TAO T AO +TE T E +TEI T EI +TENG T ENG +TI T I +TIAN T IAN +TIAO T IAO +TIE T IE +TING T ING +TONG T ONG +TOU T OU +TU T U +TUAN T UAN +TUI T UEI +TUN T UEN +TUO T UO +WA UA +WAI UAI +WAN UAN +WANG UANG +WEI UEI +WEN UEN +WENG UENG +WO UO +WU U +XI X I +XIA X IA +XIAN X IAN +XIANG X IANG +XIAO X IAO +XIE X IE +XIN X IN +XING X ING +XIONG X IONG +XIU X IOU +XU X V +XUAN X VAN +XUE X VE +XUN X VN +YA IA +YAN IAN +YANG IANG +YAO IAO +YE IE +YI I +YIN IN +YING ING +YO O +YONG IONG +YOU IOU +YU V +YUAN VAN +YUE VE +YUN VN +ZA Z A +ZAI Z AI +ZAN Z AN +ZANG Z ANG +ZAO Z AO +ZE Z E +ZEI Z EI +ZEN Z EN +ZENG Z ENG +ZHA ZH A +ZHAI ZH AI +ZHAN ZH AN +ZHANG ZH ANG +ZHAO ZH AO +ZHE ZH E +ZHEI ZH EI +ZHEN ZH EN +ZHENG ZH ENG +ZHI ZH I +ZHONG ZH ONG +ZHOU ZH OU +ZHU ZH U +ZHUA ZH UA +ZHUAI ZH UAI +ZHUAN ZH UAN +ZHUANG ZH UANG +ZHUI ZH UEI +ZHUN ZH UEN +ZHUO ZH UO +ZI Z I +ZONG Z ONG +ZOU Z OU +ZU Z U +ZUAN Z UAN +ZUI Z UEI +ZUN Z UEN +ZUO Z UO diff --git a/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb b/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb index 9c4ea4369534..2a12b417a271 100644 --- a/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb @@ -103,7 +103,7 @@ "source": [ "# let's download the files we need to run this tutorial\n", "!mkdir -p NeMoChineseTTS\n", - "!cd NeMoChineseTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/zh/pinyin_dict_nv_22.10.txt\n", + "!cd NeMoChineseTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt\n", "!cd NeMoChineseTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/dataset_processing/tts/sfbilingual/get_data.py\n", "!cd NeMoChineseTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/dataset_processing/tts/sfbilingual/ds_conf/ds_for_fastpitch_align.yaml\n", "!cd 
NeMoChineseTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/fastpitch.py\n",