diff --git a/examples/tts/conf/zh/fastpitch_align_24finals_22050_pinyin.yaml b/examples/tts/conf/zh/fastpitch_align_22050.yaml
similarity index 92%
rename from examples/tts/conf/zh/fastpitch_align_24finals_22050_pinyin.yaml
rename to examples/tts/conf/zh/fastpitch_align_22050.yaml
index 0a37064ff63f..98555102071c 100644
--- a/examples/tts/conf/zh/fastpitch_align_24finals_22050_pinyin.yaml
+++ b/examples/tts/conf/zh/fastpitch_align_22050.yaml
@@ -28,6 +28,12 @@ lowfreq: 0
 highfreq: null
 window: hann
 
+# Four candidate values for `phoneme_dict_path` are provided for Chinese, as listed below:
+#     1) (default) 24-final Pinyin: "scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt",
+#     2) IPA converted from 24-final Pinyin: "scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt",
+#     3) 36-final Pinyin: "scripts/tts_dataset_files/zh/36finals/pinyin_dict_nv23.05.txt",
+#     4) IPA converted from 36-final Pinyin: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
+# We suggest choosing the IPA symbol set converted from 36-final Pinyin, because better audio quality was observed.
 phoneme_dict_path: "scripts/tts_dataset_files/zh/24finals/pinyin_dict_nv_22.10.txt"
 
 model:
diff --git a/examples/tts/conf/zh/fastpitch_align_24finals_22050_ipa.yaml b/examples/tts/conf/zh/fastpitch_align_24finals_22050_ipa.yaml
deleted file mode 100644
index ffb1017d5002..000000000000
--- a/examples/tts/conf/zh/fastpitch_align_24finals_22050_ipa.yaml
+++ /dev/null
@@ -1,253 +0,0 @@
-# This config contains the default values for training FastPitch model with aligner using 22KHz sampling
-# rate. If you want to train model on other dataset, you can change config values according to your dataset.
-# Most dataset-specific arguments are in the head of the config file, see below.
-
-name: FastPitch
-
-train_dataset: ???
-validation_datasets: ???
-sup_data_path: ???
-sup_data_types: [ "align_prior_matrix", "pitch" ]
-
-# Default values from librosa.pyin
-pitch_fmin: 65.40639132514966
-pitch_fmax: 1986.977294921875
-
-# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
-# by running `scripts/dataset_processing/tts/extract_sup_data.py`
-pitch_mean: ???  # e.g. 221.4948272705078 for SFbilingual dataset.
-pitch_std:  ???  # e.g.  64.6528930664063 for SFbilingual dataset.
-
-# Default values for dataset with sample_rate=22050
-sample_rate: 22050
-n_mel_channels: 80
-n_window_size: 1024
-n_window_stride: 256
-n_fft: 1024
-lowfreq: 0
-highfreq: null
-window: hann
-
-phoneme_dict_path: "scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt"
-
-model:
-  learn_alignment: true
-  bin_loss_warmup_epochs: 100
-
-  n_speakers: 1
-  max_token_duration: 75
-  symbols_embedding_dim: 384
-  pitch_embedding_kernel_size: 3
-
-  pitch_fmin: ${pitch_fmin}
-  pitch_fmax: ${pitch_fmax}
-
-  pitch_mean: ${pitch_mean}
-  pitch_std: ${pitch_std}
-
-  sample_rate: ${sample_rate}
-  n_mel_channels: ${n_mel_channels}
-  n_window_size: ${n_window_size}
-  n_window_stride: ${n_window_stride}
-  n_fft: ${n_fft}
-  lowfreq: ${lowfreq}
-  highfreq: ${highfreq}
-  window: ${window}
-
-  text_normalizer:
-    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
-    lang: zh
-    input_case: cased
-
-  text_normalizer_call_kwargs:
-    verbose: false
-    punct_pre_process: true
-    punct_post_process: true
-
-  text_tokenizer:
-    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer
-    punct: true
-    apostrophe: true
-    pad_with_space: true
-    g2p:
-      _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
-      phoneme_dict: ${phoneme_dict_path}
-      word_segmenter: jieba # Only jieba is supported now.
-      phoneme_prefix: ""
-      phoneme_case: lower
-      tone_prefix: "#"
-      ascii_letter_prefix: ""
-      ascii_letter_case: upper
-
-  train_ds:
-    dataset:
-      _target_: nemo.collections.tts.data.dataset.TTSDataset
-      manifest_filepath: ${train_dataset}
-      sample_rate: ${model.sample_rate}
-      sup_data_path: ${sup_data_path}
-      sup_data_types: ${sup_data_types}
-      n_fft: ${model.n_fft}
-      win_length: ${model.n_window_size}
-      hop_length: ${model.n_window_stride}
-      window: ${model.window}
-      n_mels: ${model.n_mel_channels}
-      lowfreq: ${model.lowfreq}
-      highfreq: ${model.highfreq}
-      max_duration: null # change to null to include longer audios.
-      min_duration: 0.1
-      ignore_file: null
-      trim: true
-      trim_top_db: 50
-      trim_frame_length: ${model.n_window_size}
-      trim_hop_length: ${model.n_window_stride}
-      pitch_fmin: ${model.pitch_fmin}
-      pitch_fmax: ${model.pitch_fmax}
-      pitch_norm: true
-      pitch_mean: ${model.pitch_mean}
-      pitch_std: ${model.pitch_std}
-
-    dataloader_params:
-      drop_last: false
-      shuffle: true
-      batch_size: 32
-      num_workers: 12
-      pin_memory: true
-
-  validation_ds:
-    dataset:
-      _target_: nemo.collections.tts.data.dataset.TTSDataset
-      manifest_filepath: ${validation_datasets}
-      sample_rate: ${model.sample_rate}
-      sup_data_path: ${sup_data_path}
-      sup_data_types: ${sup_data_types}
-      n_fft: ${model.n_fft}
-      win_length: ${model.n_window_size}
-      hop_length: ${model.n_window_stride}
-      window: ${model.window}
-      n_mels: ${model.n_mel_channels}
-      lowfreq: ${model.lowfreq}
-      highfreq: ${model.highfreq}
-      max_duration: null # change to null to include longer audios.
-      min_duration: 0.1
-      ignore_file: null
-      trim: true
-      trim_top_db: 50
-      trim_frame_length: ${model.n_window_size}
-      trim_hop_length: ${model.n_window_stride}
-      pitch_fmin: ${model.pitch_fmin}
-      pitch_fmax: ${model.pitch_fmax}
-      pitch_norm: true
-      pitch_mean: ${model.pitch_mean}
-      pitch_std: ${model.pitch_std}
-
-    dataloader_params:
-      drop_last: false
-      shuffle: false
-      batch_size: 32
-      num_workers: 2
-      pin_memory: true
-
-  preprocessor:
-    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
-    features: ${model.n_mel_channels}
-    lowfreq: ${model.lowfreq}
-    highfreq: ${model.highfreq}
-    n_fft: ${model.n_fft}
-    n_window_size: ${model.n_window_size}
-    window_size: false
-    n_window_stride: ${model.n_window_stride}
-    window_stride: false
-    pad_to: 1
-    pad_value: 0
-    sample_rate: ${model.sample_rate}
-    window: ${model.window}
-    normalize: null
-    preemph: null
-    dither: 0.0
-    frame_splicing: 1
-    log: true
-    log_zero_guard_type: add
-    log_zero_guard_value: 1e-05
-    mag_power: 1.0
-
-  input_fft: #n_embed and padding_idx are added by the model
-    _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
-    n_layer: 6
-    n_head: 1
-    d_model: ${model.symbols_embedding_dim}
-    d_head: 64
-    d_inner: 1536
-    kernel_size: 3
-    dropout: 0.1
-    dropatt: 0.1
-    dropemb: 0.0
-    d_embed: ${model.symbols_embedding_dim}
-
-  output_fft:
-    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
-    n_layer: 6
-    n_head: 1
-    d_model: ${model.symbols_embedding_dim}
-    d_head: 64
-    d_inner: 1536
-    kernel_size: 3
-    dropout: 0.1
-    dropatt: 0.1
-    dropemb: 0.0
-
-  alignment_module:
-    _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
-    n_text_channels: ${model.symbols_embedding_dim}
-
-  duration_predictor:
-    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
-    input_size: ${model.symbols_embedding_dim}
-    kernel_size: 3
-    filter_size: 256
-    dropout: 0.1
-    n_layers: 2
-
-  pitch_predictor:
-    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
-    input_size: ${model.symbols_embedding_dim}
-    kernel_size: 3
-    filter_size: 256
-    dropout: 0.1
-    n_layers: 2
-
-  optim:
-    name: adamw
-    lr: 1e-3
-    betas: [0.9, 0.999]
-    weight_decay: 1e-6
-
-    sched:
-      name: NoamAnnealing
-      warmup_steps: 1000
-      last_epoch: -1
-      d_model: 1 # Disable scaling based on model dim
-
-trainer:
-  num_nodes: 1
-  devices: -1 # number of gpus
-  accelerator: gpu
-  strategy: ddp
-  precision: 16
-  max_epochs: 5000
-  accumulate_grad_batches: 1
-  gradient_clip_val: 1000.0
-  enable_checkpointing: false # Provided by exp_manager
-  logger: false # Provided by exp_manager
-  log_every_n_steps: 100
-  check_val_every_n_epoch: 5
-  benchmark: false
-
-exp_manager:
-  exp_dir: null
-  name: ${name}
-  create_tensorboard_logger: true
-  create_checkpoint_callback: true
-  checkpoint_callback_params:
-    monitor: val_loss
-  resume_if_exists: false
-  resume_ignore_no_checkpoint: false
diff --git a/examples/tts/conf/zh/fastpitch_align_36finals_22050_ipa.yaml b/examples/tts/conf/zh/fastpitch_align_36finals_22050_ipa.yaml
deleted file mode 100644
index da848f7b40c2..000000000000
--- a/examples/tts/conf/zh/fastpitch_align_36finals_22050_ipa.yaml
+++ /dev/null
@@ -1,253 +0,0 @@
-# This config contains the default values for training FastPitch model with aligner using 22KHz sampling 
-# rate. If you want to train model on other dataset, you can change config values according to your dataset. 
-# Most dataset-specific arguments are in the head of the config file, see below.
-
-name: FastPitch
-
-train_dataset: ???
-validation_datasets: ???
-sup_data_path: ???
-sup_data_types: [ "align_prior_matrix", "pitch" ]
-
-# Default values from librosa.pyin
-pitch_fmin: 65.40639132514966
-pitch_fmax: 1986.977294921875
-
-# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
-# by running `scripts/dataset_processing/tts/extract_sup_data.py`
-pitch_mean: ???  # e.g. 221.4948272705078 for SFbilingual dataset.
-pitch_std:  ???  # e.g.  64.6528930664063 for SFbilingual dataset.
-
-# Default values for dataset with sample_rate=22050
-sample_rate: 22050
-n_mel_channels: 80
-n_window_size: 1024
-n_window_stride: 256
-n_fft: 1024
-lowfreq: 0
-highfreq: null
-window: hann
-
-phoneme_dict_path: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
-
-model:
-  learn_alignment: true
-  bin_loss_warmup_epochs: 100
-
-  n_speakers: 1
-  max_token_duration: 75
-  symbols_embedding_dim: 384
-  pitch_embedding_kernel_size: 3
-
-  pitch_fmin: ${pitch_fmin}
-  pitch_fmax: ${pitch_fmax}
-
-  pitch_mean: ${pitch_mean}
-  pitch_std: ${pitch_std}
-
-  sample_rate: ${sample_rate}
-  n_mel_channels: ${n_mel_channels}
-  n_window_size: ${n_window_size}
-  n_window_stride: ${n_window_stride}
-  n_fft: ${n_fft}
-  lowfreq: ${lowfreq}
-  highfreq: ${highfreq}
-  window: ${window}
-
-  text_normalizer:
-    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
-    lang: zh
-    input_case: cased
-
-  text_normalizer_call_kwargs:
-    verbose: false
-    punct_pre_process: true
-    punct_post_process: true
-
-  text_tokenizer:
-    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer
-    punct: true
-    apostrophe: true
-    pad_with_space: true
-    g2p:
-      _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
-      phoneme_dict: ${phoneme_dict_path}
-      word_segmenter: jieba # Only jieba is supported now.
-      phoneme_prefix: "" 
-      phoneme_case: lower      
-      tone_prefix: "#"
-      ascii_letter_prefix: ""
-      ascii_letter_case: upper
-
-  train_ds:
-    dataset:
-      _target_: nemo.collections.tts.data.dataset.TTSDataset
-      manifest_filepath: ${train_dataset}
-      sample_rate: ${model.sample_rate}
-      sup_data_path: ${sup_data_path}
-      sup_data_types: ${sup_data_types}
-      n_fft: ${model.n_fft}
-      win_length: ${model.n_window_size}
-      hop_length: ${model.n_window_stride}
-      window: ${model.window}
-      n_mels: ${model.n_mel_channels}
-      lowfreq: ${model.lowfreq}
-      highfreq: ${model.highfreq}
-      max_duration: null # change to null to include longer audios.
-      min_duration: 0.1
-      ignore_file: null
-      trim: true
-      trim_top_db: 50
-      trim_frame_length: ${model.n_window_size}
-      trim_hop_length: ${model.n_window_stride}
-      pitch_fmin: ${model.pitch_fmin}
-      pitch_fmax: ${model.pitch_fmax}
-      pitch_norm: true
-      pitch_mean: ${model.pitch_mean}
-      pitch_std: ${model.pitch_std}
-
-    dataloader_params:
-      drop_last: false
-      shuffle: true
-      batch_size: 32
-      num_workers: 12
-      pin_memory: true
-
-  validation_ds:
-    dataset:
-      _target_: nemo.collections.tts.data.dataset.TTSDataset
-      manifest_filepath: ${validation_datasets}
-      sample_rate: ${model.sample_rate}
-      sup_data_path: ${sup_data_path}
-      sup_data_types: ${sup_data_types}
-      n_fft: ${model.n_fft}
-      win_length: ${model.n_window_size}
-      hop_length: ${model.n_window_stride}
-      window: ${model.window}
-      n_mels: ${model.n_mel_channels}
-      lowfreq: ${model.lowfreq}
-      highfreq: ${model.highfreq}
-      max_duration: null # change to null to include longer audios.
-      min_duration: 0.1
-      ignore_file: null
-      trim: true
-      trim_top_db: 50
-      trim_frame_length: ${model.n_window_size}
-      trim_hop_length: ${model.n_window_stride}
-      pitch_fmin: ${model.pitch_fmin}
-      pitch_fmax: ${model.pitch_fmax}
-      pitch_norm: true
-      pitch_mean: ${model.pitch_mean}
-      pitch_std: ${model.pitch_std}
-
-    dataloader_params:
-      drop_last: false
-      shuffle: false
-      batch_size: 32
-      num_workers: 2
-      pin_memory: true
-
-  preprocessor:
-    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
-    features: ${model.n_mel_channels}
-    lowfreq: ${model.lowfreq}
-    highfreq: ${model.highfreq}
-    n_fft: ${model.n_fft}
-    n_window_size: ${model.n_window_size}
-    window_size: false
-    n_window_stride: ${model.n_window_stride}
-    window_stride: false
-    pad_to: 1
-    pad_value: 0
-    sample_rate: ${model.sample_rate}
-    window: ${model.window}
-    normalize: null
-    preemph: null
-    dither: 0.0
-    frame_splicing: 1
-    log: true
-    log_zero_guard_type: add
-    log_zero_guard_value: 1e-05
-    mag_power: 1.0
-
-  input_fft: #n_embed and padding_idx are added by the model
-    _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
-    n_layer: 6
-    n_head: 1
-    d_model: ${model.symbols_embedding_dim}
-    d_head: 64
-    d_inner: 1536
-    kernel_size: 3
-    dropout: 0.1
-    dropatt: 0.1
-    dropemb: 0.0
-    d_embed: ${model.symbols_embedding_dim}
-
-  output_fft:
-    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
-    n_layer: 6
-    n_head: 1
-    d_model: ${model.symbols_embedding_dim}
-    d_head: 64
-    d_inner: 1536
-    kernel_size: 3
-    dropout: 0.1
-    dropatt: 0.1
-    dropemb: 0.0
-
-  alignment_module:
-    _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
-    n_text_channels: ${model.symbols_embedding_dim}
-
-  duration_predictor:
-    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
-    input_size: ${model.symbols_embedding_dim}
-    kernel_size: 3
-    filter_size: 256
-    dropout: 0.1
-    n_layers: 2
-
-  pitch_predictor:
-    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
-    input_size: ${model.symbols_embedding_dim}
-    kernel_size: 3
-    filter_size: 256
-    dropout: 0.1
-    n_layers: 2
-
-  optim:
-    name: adamw
-    lr: 1e-3
-    betas: [0.9, 0.999]
-    weight_decay: 1e-6
-
-    sched:
-      name: NoamAnnealing
-      warmup_steps: 1000
-      last_epoch: -1
-      d_model: 1 # Disable scaling based on model dim
-
-trainer:
-  num_nodes: 1
-  devices: -1 # number of gpus
-  accelerator: gpu
-  strategy: ddp
-  precision: 16
-  max_epochs: 5000
-  accumulate_grad_batches: 1
-  gradient_clip_val: 1000.0
-  enable_checkpointing: false # Provided by exp_manager
-  logger: false # Provided by exp_manager
-  log_every_n_steps: 100
-  check_val_every_n_epoch: 5
-  benchmark: false
-
-exp_manager:
-  exp_dir: null
-  name: ${name}
-  create_tensorboard_logger: true
-  create_checkpoint_callback: true
-  checkpoint_callback_params:
-    monitor: val_loss
-  resume_if_exists: false
-  resume_ignore_no_checkpoint: false
diff --git a/examples/tts/conf/zh/fastpitch_align_36finals_22050_pinyin.yaml b/examples/tts/conf/zh/fastpitch_align_36finals_22050_pinyin.yaml
deleted file mode 100644
index 96902ef9aeca..000000000000
--- a/examples/tts/conf/zh/fastpitch_align_36finals_22050_pinyin.yaml
+++ /dev/null
@@ -1,253 +0,0 @@
-# This config contains the default values for training FastPitch model with aligner using 22KHz sampling 
-# rate. If you want to train model on other dataset, you can change config values according to your dataset. 
-# Most dataset-specific arguments are in the head of the config file, see below.
-
-name: FastPitch
-
-train_dataset: ???
-validation_datasets: ???
-sup_data_path: ???
-sup_data_types: [ "align_prior_matrix", "pitch" ]
-
-# Default values from librosa.pyin
-pitch_fmin: 65.40639132514966
-pitch_fmax: 1986.977294921875
-
-# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
-# by running `scripts/dataset_processing/tts/extract_sup_data.py`
-pitch_mean: ???  # e.g. 221.4948272705078 for SFbilingual dataset.
-pitch_std:  ???  # e.g.  64.6528930664063 for SFbilingual dataset.
-
-# Default values for dataset with sample_rate=22050
-sample_rate: 22050
-n_mel_channels: 80
-n_window_size: 1024
-n_window_stride: 256
-n_fft: 1024
-lowfreq: 0
-highfreq: null
-window: hann
-
-phoneme_dict_path: "scripts/tts_dataset_files/zh/36finals/pinyin_dict_nv23.05.txt"
-
-model:
-  learn_alignment: true
-  bin_loss_warmup_epochs: 100
-
-  n_speakers: 1
-  max_token_duration: 75
-  symbols_embedding_dim: 384
-  pitch_embedding_kernel_size: 3
-
-  pitch_fmin: ${pitch_fmin}
-  pitch_fmax: ${pitch_fmax}
-
-  pitch_mean: ${pitch_mean}
-  pitch_std: ${pitch_std}
-
-  sample_rate: ${sample_rate}
-  n_mel_channels: ${n_mel_channels}
-  n_window_size: ${n_window_size}
-  n_window_stride: ${n_window_stride}
-  n_fft: ${n_fft}
-  lowfreq: ${lowfreq}
-  highfreq: ${highfreq}
-  window: ${window}
-
-  text_normalizer:
-    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
-    lang: zh
-    input_case: cased
-
-  text_normalizer_call_kwargs:
-    verbose: false
-    punct_pre_process: true
-    punct_post_process: true
-
-  text_tokenizer:
-    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer
-    punct: true
-    apostrophe: true
-    pad_with_space: true
-    g2p:
-      _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
-      phoneme_dict: ${phoneme_dict_path}
-      word_segmenter: jieba # Only jieba is supported now.
-      phoneme_prefix: "" 
-      phoneme_case: lower      
-      tone_prefix: "#"
-      ascii_letter_prefix: ""
-      ascii_letter_case: upper
-
-  train_ds:
-    dataset:
-      _target_: nemo.collections.tts.data.dataset.TTSDataset
-      manifest_filepath: ${train_dataset}
-      sample_rate: ${model.sample_rate}
-      sup_data_path: ${sup_data_path}
-      sup_data_types: ${sup_data_types}
-      n_fft: ${model.n_fft}
-      win_length: ${model.n_window_size}
-      hop_length: ${model.n_window_stride}
-      window: ${model.window}
-      n_mels: ${model.n_mel_channels}
-      lowfreq: ${model.lowfreq}
-      highfreq: ${model.highfreq}
-      max_duration: null # change to null to include longer audios.
-      min_duration: 0.1
-      ignore_file: null
-      trim: true
-      trim_top_db: 50
-      trim_frame_length: ${model.n_window_size}
-      trim_hop_length: ${model.n_window_stride}
-      pitch_fmin: ${model.pitch_fmin}
-      pitch_fmax: ${model.pitch_fmax}
-      pitch_norm: true
-      pitch_mean: ${model.pitch_mean}
-      pitch_std: ${model.pitch_std}
-
-    dataloader_params:
-      drop_last: false
-      shuffle: true
-      batch_size: 32
-      num_workers: 12
-      pin_memory: true
-
-  validation_ds:
-    dataset:
-      _target_: nemo.collections.tts.data.dataset.TTSDataset
-      manifest_filepath: ${validation_datasets}
-      sample_rate: ${model.sample_rate}
-      sup_data_path: ${sup_data_path}
-      sup_data_types: ${sup_data_types}
-      n_fft: ${model.n_fft}
-      win_length: ${model.n_window_size}
-      hop_length: ${model.n_window_stride}
-      window: ${model.window}
-      n_mels: ${model.n_mel_channels}
-      lowfreq: ${model.lowfreq}
-      highfreq: ${model.highfreq}
-      max_duration: null # change to null to include longer audios.
-      min_duration: 0.1
-      ignore_file: null
-      trim: true
-      trim_top_db: 50
-      trim_frame_length: ${model.n_window_size}
-      trim_hop_length: ${model.n_window_stride}
-      pitch_fmin: ${model.pitch_fmin}
-      pitch_fmax: ${model.pitch_fmax}
-      pitch_norm: true
-      pitch_mean: ${model.pitch_mean}
-      pitch_std: ${model.pitch_std}
-
-    dataloader_params:
-      drop_last: false
-      shuffle: false
-      batch_size: 32
-      num_workers: 2
-      pin_memory: true
-
-  preprocessor:
-    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
-    features: ${model.n_mel_channels}
-    lowfreq: ${model.lowfreq}
-    highfreq: ${model.highfreq}
-    n_fft: ${model.n_fft}
-    n_window_size: ${model.n_window_size}
-    window_size: false
-    n_window_stride: ${model.n_window_stride}
-    window_stride: false
-    pad_to: 1
-    pad_value: 0
-    sample_rate: ${model.sample_rate}
-    window: ${model.window}
-    normalize: null
-    preemph: null
-    dither: 0.0
-    frame_splicing: 1
-    log: true
-    log_zero_guard_type: add
-    log_zero_guard_value: 1e-05
-    mag_power: 1.0
-
-  input_fft: #n_embed and padding_idx are added by the model
-    _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
-    n_layer: 6
-    n_head: 1
-    d_model: ${model.symbols_embedding_dim}
-    d_head: 64
-    d_inner: 1536
-    kernel_size: 3
-    dropout: 0.1
-    dropatt: 0.1
-    dropemb: 0.0
-    d_embed: ${model.symbols_embedding_dim}
-
-  output_fft:
-    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
-    n_layer: 6
-    n_head: 1
-    d_model: ${model.symbols_embedding_dim}
-    d_head: 64
-    d_inner: 1536
-    kernel_size: 3
-    dropout: 0.1
-    dropatt: 0.1
-    dropemb: 0.0
-
-  alignment_module:
-    _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
-    n_text_channels: ${model.symbols_embedding_dim}
-
-  duration_predictor:
-    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
-    input_size: ${model.symbols_embedding_dim}
-    kernel_size: 3
-    filter_size: 256
-    dropout: 0.1
-    n_layers: 2
-
-  pitch_predictor:
-    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
-    input_size: ${model.symbols_embedding_dim}
-    kernel_size: 3
-    filter_size: 256
-    dropout: 0.1
-    n_layers: 2
-
-  optim:
-    name: adamw
-    lr: 1e-3
-    betas: [0.9, 0.999]
-    weight_decay: 1e-6
-
-    sched:
-      name: NoamAnnealing
-      warmup_steps: 1000
-      last_epoch: -1
-      d_model: 1 # Disable scaling based on model dim
-
-trainer:
-  num_nodes: 1
-  devices: -1 # number of gpus
-  accelerator: gpu
-  strategy: ddp
-  precision: 16
-  max_epochs: 5000
-  accumulate_grad_batches: 1
-  gradient_clip_val: 1000.0
-  enable_checkpointing: false # Provided by exp_manager
-  logger: false # Provided by exp_manager
-  log_every_n_steps: 100
-  check_val_every_n_epoch: 5
-  benchmark: false
-
-exp_manager:
-  exp_dir: null
-  name: ${name}
-  create_tensorboard_logger: true
-  create_checkpoint_callback: true
-  checkpoint_callback_params:
-    monitor: val_loss
-  resume_if_exists: false
-  resume_ignore_no_checkpoint: false