Skip to content

Commit

Permalink
[TTS] Remove nested TTS configs (#7154)
Browse files Browse the repository at this point in the history
* [TTS] Remove nested TTS configs

Signed-off-by: Ryan <rlangman@nvidia.com>

* [TTS] Modify tutorial to support multiple sampling rates

Signed-off-by: Ryan <rlangman@nvidia.com>

* [TTS] Clarify min_duration unit

Signed-off-by: Ryan <rlangman@nvidia.com>

* [TTS] Default 22.05kHz highfreq to null

Signed-off-by: Ryan <rlangman@nvidia.com>

---------

Signed-off-by: Ryan <rlangman@nvidia.com>
Signed-off-by: jubick1337 <mattyson.so@gmail.com>
  • Loading branch information
rlangman authored and jubick1337 committed Aug 8, 2023
1 parent 2c9f05e commit 52ba772
Show file tree
Hide file tree
Showing 9 changed files with 574 additions and 92 deletions.
286 changes: 286 additions & 0 deletions examples/tts/conf/fastpitch/fastpitch_22050.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
# This config contains the default values for training an English 22.05kHz FastPitch model.
# If you want to train a model on another dataset, change the config values to match your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.

# Experiment name; exp_manager.name interpolates this value.
name: FastPitch

# Training-run parameters. '???' is the OmegaConf marker for a mandatory value
# that has to be supplied by the user (e.g. as a command-line override).
max_epochs: '???'
batch_size: 32
weighted_sampling_steps_per_epoch: null

# Speaker setup and optional feature statistics.
n_speakers: '???'
speaker_path: null
feature_stats_path: null

# Dataset metadata for the training, validation and logging datasets.
train_ds_meta: '???'
val_ds_meta: '???'
log_ds_meta: '???'

# Inputs for the IPA grapheme-to-phoneme module.
phoneme_dict_path: '???'
heteronyms_path: '???'

# Artifact logging and the vocoder used by the audio logging callback.
log_dir: '???'
vocoder_type: '???'
vocoder_name: null
vocoder_checkpoint_path: null

# The below feature config should match the feature.yaml config used during preprocessing.
sample_rate: 22050
win_length: 1024
hop_length: 256

# Mel spectrogram featurizer. Sample rate and window/hop lengths are
# interpolated from the top-level feature settings.
mel_feature:
  _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer
  sample_rate: ${sample_rate}
  win_length: ${win_length}
  hop_length: ${hop_length}
  mel_dim: 80
  lowfreq: 0
  highfreq: null

# Pitch featurizer. Shares sample rate and window/hop lengths with the
# top-level feature settings.
pitch_feature:
  _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer
  sample_rate: ${sample_rate}
  win_length: ${win_length}
  hop_length: ${hop_length}
  pitch_fmin: 60
  pitch_fmax: 640

# Energy featurizer; reuses the mel_feature node as its spectrogram featurizer.
energy_feature:
  _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer
  spec_featurizer: ${mel_feature}

# Named featurizers that the dataset sections reference via ${featurizers}.
featurizers:
  pitch: ${pitch_feature}
  energy: ${energy_feature}


# FastPitch model section.
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file; confirm the hierarchy against the upstream config before merging.
model:
  learn_alignment: true
  bin_loss_warmup_epochs: 100

  n_speakers: ${n_speakers}
  n_mel_channels: ${mel_feature.mel_dim}
  min_token_duration: 1
  max_token_duration: 75
  symbols_embedding_dim: 384
  pitch_embedding_kernel_size: 3
  energy_embedding_kernel_size: 3
  speaker_emb_condition_prosody: true
  speaker_emb_condition_aligner: true
  use_log_energy: false
  dur_loss_scale: 0.1
  pitch_loss_scale: 0.1
  energy_loss_scale: 0.1
  aligner_loss_scale: 0.1

  # Mel preprocessor. Feature count and frequency bounds come from mel_feature;
  # n_fft and n_window_size both use win_length.
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    features: ${mel_feature.mel_dim}
    lowfreq: ${mel_feature.lowfreq}
    highfreq: ${mel_feature.highfreq}
    n_fft: ${win_length}
    n_window_size: ${win_length}
    window_size: false
    n_window_stride: ${hop_length}
    window_stride: false
    pad_to: 1
    pad_value: 0
    sample_rate: ${sample_rate}
    window: hann
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    log: true
    log_zero_guard_type: add
    log_zero_guard_value: 1.0
    mag_power: 1.0
    mel_norm: null

  # IPA tokenizer with a nested grapheme-to-phoneme (g2p) module.
  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
    punct: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.8
      ignore_ambiguous_words: false
      use_chars: true
      use_stresses: true

  # Per-speaker mean/variance normalization of pitch, using feature_stats_path.
  pitch_processor:
    _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization
    field: pitch
    stats_path: ${feature_stats_path}

  # Per-speaker mean/variance normalization of energy, using feature_stats_path.
  energy_processor:
    _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization
    field: energy
    stats_path: ${feature_stats_path}

  # Training dataset and dataloader.
  train_ds:
    dataset:
      _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset
      dataset_meta: ${train_ds_meta}
      weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch}
      sample_rate: ${sample_rate}
      speaker_path: ${speaker_path}
      align_prior_hop_length: ${hop_length}
      featurizers: ${featurizers}
      feature_processors:
        pitch: ${model.pitch_processor}
        energy: ${model.energy_processor}
      # Duration bounds; presumably in seconds - TODO confirm against
      # TextToSpeechDataset.
      min_duration: 0.1
      max_duration: 10.0

    dataloader_params:
      batch_size: ${batch_size}
      num_workers: 4

  # Validation dataset and dataloader; no duration filtering is configured.
  validation_ds:
    dataset:
      _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset
      dataset_meta: ${val_ds_meta}
      sample_rate: ${sample_rate}
      speaker_path: ${speaker_path}
      align_prior_hop_length: ${hop_length}
      featurizers: ${featurizers}
      feature_processors:
        pitch: ${model.pitch_processor}
        energy: ${model.energy_processor}

    dataloader_params:
      batch_size: ${batch_size}
      num_workers: 2

  # Artifact logging: what to generate, and the dataset the artifacts are
  # generated from.
  log_config:
    log_dir: ${log_dir}
    log_epochs: [10, 50]
    epoch_frequency: 100
    log_tensorboard: false
    log_wandb: false

    generators:
      - _target_: nemo.collections.tts.parts.utils.callbacks.FastPitchArtifactGenerator
        log_spectrogram: true
        log_alignment: true
        audio_params:
          _target_: nemo.collections.tts.parts.utils.callbacks.LogAudioParams
          log_audio_gta: true
          vocoder_type: ${vocoder_type}
          vocoder_name: ${vocoder_name}
          vocoder_checkpoint_path: ${vocoder_checkpoint_path}

    dataset:
      _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset
      text_tokenizer: ${model.text_tokenizer}
      sample_rate: ${sample_rate}
      speaker_path: ${speaker_path}
      align_prior_hop_length: ${hop_length}
      featurizers: ${featurizers}

      feature_processors:
        pitch: ${model.pitch_processor}
        energy: ${model.energy_processor}

      dataset_meta: ${log_ds_meta}

    dataloader_params:
      batch_size: 8
      num_workers: 2

  # Text-side transformer encoder; model and embedding widths follow
  # symbols_embedding_dim.
  input_fft:
    _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
    n_layer: 6
    n_head: 2
    d_model: ${model.symbols_embedding_dim}
    d_head: 64
    d_inner: 1536
    kernel_size: 3
    dropout: 0.1
    dropatt: 0.1
    dropemb: 0.0
    d_embed: ${model.symbols_embedding_dim}

  # Output-side transformer decoder.
  output_fft:
    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
    n_layer: 6
    n_head: 1
    d_model: ${model.symbols_embedding_dim}
    d_head: 64
    d_inner: 1536
    kernel_size: 3
    dropout: 0.1
    dropatt: 0.1
    dropemb: 0.0

  # Alignment encoder used when learn_alignment is enabled.
  alignment_module:
    _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
    n_text_channels: ${model.symbols_embedding_dim}
    dist_type: cosine
    temperature: 15.0

  # The duration, pitch and energy predictors share one architecture.
  duration_predictor:
    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
    input_size: ${model.symbols_embedding_dim}
    kernel_size: 3
    filter_size: 256
    dropout: 0.1
    n_layers: 2

  pitch_predictor:
    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
    input_size: ${model.symbols_embedding_dim}
    kernel_size: 3
    filter_size: 256
    dropout: 0.1
    n_layers: 2

  energy_predictor:
    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
    input_size: ${model.symbols_embedding_dim}
    kernel_size: 3
    filter_size: 256
    dropout: 0.1
    n_layers: 2

  # Optimizer with a nested learning-rate schedule.
  optim:
    name: adamw
    lr: 1e-3
    betas: [0.9, 0.999]
    weight_decay: 1e-6

    sched:
      name: NoamAnnealing
      warmup_steps: 1000
      last_epoch: -1
      d_model: 1  # Disable scaling based on model dim

# Trainer settings. Checkpointing and logging are switched off here because
# exp_manager provides them.
trainer:
  num_nodes: 1
  devices: 1
  accelerator: gpu
  strategy: ddp
  precision: 16
  max_epochs: ${max_epochs}
  accumulate_grad_batches: 1
  gradient_clip_val: 10.0
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 100
  check_val_every_n_epoch: 10
  benchmark: false

# Experiment manager: TensorBoard logger, checkpoint callback and resume
# behaviour.
# NOTE(review): nesting reconstructed from a flattened copy; the resume_* keys
# are placed at the exp_manager level, not under checkpoint_callback_params -
# confirm against the upstream config.
exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
  resume_if_exists: false
  resume_ignore_no_checkpoint: false
Loading

0 comments on commit 52ba772

Please sign in to comment.