diff --git a/examples/tts/conf/fastpitch/fastpitch_22050.yaml b/examples/tts/conf/fastpitch/fastpitch_22050.yaml new file mode 100644 index 000000000000..846d09edcfee --- /dev/null +++ b/examples/tts/conf/fastpitch/fastpitch_22050.yaml @@ -0,0 +1,286 @@ +# This config contains the default values for training an English 22.05kHz FastPitch model. +# If you want to train a model on another dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. + +name: FastPitch + +max_epochs: ??? +batch_size: 32 +weighted_sampling_steps_per_epoch: null + +n_speakers: ??? +speaker_path: null +feature_stats_path: null + +train_ds_meta: ??? +val_ds_meta: ??? +log_ds_meta: ??? + +phoneme_dict_path: ??? +heteronyms_path: ??? + +log_dir: ??? +vocoder_type: ??? +vocoder_name: null +vocoder_checkpoint_path: null + +# The below feature config should match the feature config (e.g. feature/feature_22050.yaml) used during preprocessing. +sample_rate: 22050 +win_length: 1024 +hop_length: 256 + +mel_feature: + _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer + sample_rate: ${sample_rate} + win_length: ${win_length} + hop_length: ${hop_length} + mel_dim: 80 + lowfreq: 0 + highfreq: null + +pitch_feature: + _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer + sample_rate: ${sample_rate} + win_length: ${win_length} + hop_length: ${hop_length} + pitch_fmin: 60 + pitch_fmax: 640 + +energy_feature: + _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer + spec_featurizer: ${mel_feature} + +featurizers: + pitch: ${pitch_feature} + energy: ${energy_feature} + + +model: + learn_alignment: true + bin_loss_warmup_epochs: 100 + + n_speakers: ${n_speakers} + n_mel_channels: ${mel_feature.mel_dim} + min_token_duration: 1 + max_token_duration: 75 + symbols_embedding_dim: 384 + pitch_embedding_kernel_size: 3 + energy_embedding_kernel_size: 3 +
speaker_emb_condition_prosody: true + speaker_emb_condition_aligner: true + use_log_energy: false + dur_loss_scale: 0.1 + pitch_loss_scale: 0.1 + energy_loss_scale: 0.1 + aligner_loss_scale: 0.1 + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + features: ${mel_feature.mel_dim} + lowfreq: ${mel_feature.lowfreq} + highfreq: ${mel_feature.highfreq} + n_fft: ${win_length} + n_window_size: ${win_length} + window_size: false + n_window_stride: ${hop_length} + window_stride: false + pad_to: 1 + pad_value: 0 + sample_rate: ${sample_rate} + window: hann + normalize: null + preemph: null + dither: 0.0 + frame_splicing: 1 + log: true + log_zero_guard_type: add + log_zero_guard_value: 1.0 + mag_power: 1.0 + mel_norm: null + + text_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer + punct: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + phoneme_probability: 0.8 + ignore_ambiguous_words: false + use_chars: true + use_stresses: true + + pitch_processor: + _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization + field: pitch + stats_path: ${feature_stats_path} + + energy_processor: + _target_: nemo.collections.tts.parts.preprocessing.feature_processors.MeanVarianceSpeakerNormalization + field: energy + stats_path: ${feature_stats_path} + + train_ds: + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset + dataset_meta: ${train_ds_meta} + weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} + sample_rate: ${sample_rate} + speaker_path: ${speaker_path} + align_prior_hop_length: ${hop_length} + featurizers: ${featurizers} + feature_processors: + pitch: ${model.pitch_processor} + energy: ${model.energy_processor} + min_duration: 0.1 + max_duration: 10.0 
+ + dataloader_params: + batch_size: ${batch_size} + num_workers: 4 + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset + dataset_meta: ${val_ds_meta} + sample_rate: ${sample_rate} + speaker_path: ${speaker_path} + align_prior_hop_length: ${hop_length} + featurizers: ${featurizers} + feature_processors: + pitch: ${model.pitch_processor} + energy: ${model.energy_processor} + + dataloader_params: + batch_size: ${batch_size} + num_workers: 2 + + log_config: + log_dir: ${log_dir} + log_epochs: [10, 50] + epoch_frequency: 100 + log_tensorboard: false + log_wandb: false + + generators: + - _target_: nemo.collections.tts.parts.utils.callbacks.FastPitchArtifactGenerator + log_spectrogram: true + log_alignment: true + audio_params: + _target_: nemo.collections.tts.parts.utils.callbacks.LogAudioParams + log_audio_gta: true + vocoder_type: ${vocoder_type} + vocoder_name: ${vocoder_name} + vocoder_checkpoint_path: ${vocoder_checkpoint_path} + + dataset: + _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset + text_tokenizer: ${model.text_tokenizer} + sample_rate: ${sample_rate} + speaker_path: ${speaker_path} + align_prior_hop_length: ${hop_length} + featurizers: ${featurizers} + + feature_processors: + pitch: ${model.pitch_processor} + energy: ${model.energy_processor} + + dataset_meta: ${log_ds_meta} + + dataloader_params: + batch_size: 8 + num_workers: 2 + + input_fft: + _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder + n_layer: 6 + n_head: 2 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + d_embed: ${model.symbols_embedding_dim} + + output_fft: + _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + + 
alignment_module: + _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder + n_text_channels: ${model.symbols_embedding_dim} + dist_type: cosine + temperature: 15.0 + + duration_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + pitch_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + energy_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + + optim: + name: adamw + lr: 1e-3 + betas: [0.9, 0.999] + weight_decay: 1e-6 + + sched: + name: NoamAnnealing + warmup_steps: 1000 + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim + +trainer: + num_nodes: 1 + devices: 1 + accelerator: gpu + strategy: ddp + precision: 16 + max_epochs: ${max_epochs} + accumulate_grad_batches: 1 + gradient_clip_val: 10.0 + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 10 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/examples/tts/conf/fastpitch/fastpitch.yaml b/examples/tts/conf/fastpitch/fastpitch_44100.yaml similarity index 81% rename from examples/tts/conf/fastpitch/fastpitch.yaml rename to examples/tts/conf/fastpitch/fastpitch_44100.yaml index 39d5f395afbc..da9e9a29b1e7 100644 --- a/examples/tts/conf/fastpitch/fastpitch.yaml +++ b/examples/tts/conf/fastpitch/fastpitch_44100.yaml @@ -1,12 +1,9 @@ -# This config contains the default values 
for training an English FastPitch model. +# This config contains the default values for training an English 44.1kHz FastPitch model. # If you want to train a model on other dataset, you can change config values according to your dataset. # Most dataset-specific arguments are in the head of the config file, see below. name: FastPitch -defaults: - - feature: ??? - max_epochs: ??? batch_size: 32 weighted_sampling_steps_per_epoch: null @@ -27,12 +24,43 @@ vocoder_type: ??? vocoder_name: null vocoder_checkpoint_path: null +# The below feature config should match the feature config (e.g. feature/feature_44100.yaml) used during preprocessing. +sample_rate: 44100 +win_length: 2048 +hop_length: 512 + +mel_feature: + _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer + sample_rate: ${sample_rate} + win_length: ${win_length} + hop_length: ${hop_length} + mel_dim: 80 + lowfreq: 0 + highfreq: null + +pitch_feature: + _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer + sample_rate: ${sample_rate} + win_length: ${win_length} + hop_length: ${hop_length} + pitch_fmin: 60 + pitch_fmax: 640 + +energy_feature: + _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer + spec_featurizer: ${mel_feature} + +featurizers: + pitch: ${pitch_feature} + energy: ${energy_feature} + + model: learn_alignment: true bin_loss_warmup_epochs: 100 n_speakers: ${n_speakers} - n_mel_channels: ${feature.mel_feature.mel_dim} + n_mel_channels: ${mel_feature.mel_dim} min_token_duration: 1 max_token_duration: 75 symbols_embedding_dim: 384 @@ -48,17 +76,17 @@ model: preprocessor: _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${feature.mel_feature.mel_dim} - lowfreq: ${feature.mel_feature.lowfreq} - highfreq: ${feature.mel_feature.highfreq} - n_fft: ${feature.win_length} - n_window_size: ${feature.win_length} + features: ${mel_feature.mel_dim} + lowfreq: ${mel_feature.lowfreq} + highfreq: ${mel_feature.highfreq} + n_fft:
${win_length} + n_window_size: ${win_length} window_size: false - n_window_stride: ${feature.hop_length} + n_window_stride: ${hop_length} window_stride: false pad_to: 1 pad_value: 0 - sample_rate: ${feature.sample_rate} + sample_rate: ${sample_rate} window: hann normalize: null preemph: null @@ -99,10 +127,10 @@ model: _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset dataset_meta: ${train_ds_meta} weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} - sample_rate: ${feature.sample_rate} + sample_rate: ${sample_rate} speaker_path: ${speaker_path} - align_prior_hop_length: ${feature.hop_length} - featurizers: ${feature.featurizers} + align_prior_hop_length: ${hop_length} + featurizers: ${featurizers} feature_processors: pitch: ${model.pitch_processor} energy: ${model.energy_processor} @@ -117,10 +145,10 @@ model: dataset: _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset dataset_meta: ${val_ds_meta} - sample_rate: ${feature.sample_rate} + sample_rate: ${sample_rate} speaker_path: ${speaker_path} - align_prior_hop_length: ${feature.hop_length} - featurizers: ${feature.featurizers} + align_prior_hop_length: ${hop_length} + featurizers: ${featurizers} feature_processors: pitch: ${model.pitch_processor} energy: ${model.energy_processor} @@ -150,10 +178,10 @@ model: dataset: _target_: nemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDataset text_tokenizer: ${model.text_tokenizer} - sample_rate: ${feature.sample_rate} + sample_rate: ${sample_rate} speaker_path: ${speaker_path} - align_prior_hop_length: ${feature.hop_length} - featurizers: ${feature.featurizers} + align_prior_hop_length: ${hop_length} + featurizers: ${featurizers} feature_processors: pitch: ${model.pitch_processor} diff --git a/examples/tts/conf/feature/feature_22050.yaml b/examples/tts/conf/feature/feature_22050.yaml index 1b159bc66ddf..8071eb7933bb 100644 --- a/examples/tts/conf/feature/feature_22050.yaml +++ 
b/examples/tts/conf/feature/feature_22050.yaml @@ -4,25 +4,25 @@ hop_length: 256 mel_feature: _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer - sample_rate: ${..sample_rate} - win_length: ${..win_length} - hop_length: ${..hop_length} + sample_rate: ${sample_rate} + win_length: ${win_length} + hop_length: ${hop_length} mel_dim: 80 lowfreq: 0 - highfreq: 8000 + highfreq: null pitch_feature: _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer - sample_rate: ${..sample_rate} - win_length: ${..win_length} - hop_length: ${..hop_length} + sample_rate: ${sample_rate} + win_length: ${win_length} + hop_length: ${hop_length} pitch_fmin: 60 pitch_fmax: 640 energy_feature: _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer - spec_featurizer: ${..mel_feature} + spec_featurizer: ${mel_feature} featurizers: - pitch: ${..pitch_feature} - energy: ${..energy_feature} + pitch: ${pitch_feature} + energy: ${energy_feature} diff --git a/examples/tts/conf/feature/feature_44100.yaml b/examples/tts/conf/feature/feature_44100.yaml index e852a93a2d6c..0cfc27f4dab3 100644 --- a/examples/tts/conf/feature/feature_44100.yaml +++ b/examples/tts/conf/feature/feature_44100.yaml @@ -4,25 +4,25 @@ hop_length: 512 mel_feature: _target_: nemo.collections.tts.parts.preprocessing.features.MelSpectrogramFeaturizer - sample_rate: ${..sample_rate} - win_length: ${..win_length} - hop_length: ${..hop_length} + sample_rate: ${sample_rate} + win_length: ${win_length} + hop_length: ${hop_length} mel_dim: 80 lowfreq: 0 highfreq: null pitch_feature: _target_: nemo.collections.tts.parts.preprocessing.features.PitchFeaturizer - sample_rate: ${..sample_rate} - win_length: ${..win_length} - hop_length: ${..hop_length} + sample_rate: ${sample_rate} + win_length: ${win_length} + hop_length: ${hop_length} pitch_fmin: 60 pitch_fmax: 640 energy_feature: _target_: nemo.collections.tts.parts.preprocessing.features.EnergyFeaturizer - 
spec_featurizer: ${..mel_feature} + spec_featurizer: ${mel_feature} featurizers: - pitch: ${..pitch_feature} - energy: ${..energy_feature} + pitch: ${pitch_feature} + energy: ${energy_feature} diff --git a/examples/tts/conf/hifigan/sample/sample_22050.yaml b/examples/tts/conf/hifigan/sample/sample_22050.yaml deleted file mode 100644 index 18bc206e2566..000000000000 --- a/examples/tts/conf/hifigan/sample/sample_22050.yaml +++ /dev/null @@ -1,3 +0,0 @@ -# Audio dataset sampling config for 22.05khz sampling rate -train_n_samples: 8192 -val_n_samples: 66048 diff --git a/examples/tts/conf/hifigan/sample/sample_44100.yaml b/examples/tts/conf/hifigan/sample/sample_44100.yaml deleted file mode 100644 index d8315623bbbe..000000000000 --- a/examples/tts/conf/hifigan/sample/sample_44100.yaml +++ /dev/null @@ -1,3 +0,0 @@ -# Audio dataset sampling config for 44.1khz sampling rate -train_n_samples: 16384 -val_n_samples: 131072 diff --git a/examples/tts/conf/hifigan/hifigan_data.yaml b/examples/tts/conf/hifigan_dataset/hifigan_22050.yaml similarity index 71% rename from examples/tts/conf/hifigan/hifigan_data.yaml rename to examples/tts/conf/hifigan_dataset/hifigan_22050.yaml index 62ce3344636e..0b72e810aa9e 100644 --- a/examples/tts/conf/hifigan/hifigan_data.yaml +++ b/examples/tts/conf/hifigan_dataset/hifigan_22050.yaml @@ -1,14 +1,9 @@ -# This config contains the default values for training a HiFi-GAN model. +# This config contains the default values for training a 22.05kHz HiFi-GAN model. # If you want to train model on other dataset, you can change config values according to your dataset. # Most dataset-specific arguments are in the head of the config file, see below. name: "HifiGan" -defaults: - - feature: ??? - - sample: ??? - - model/generator: ??? - max_epochs: ??? batch_size: 16 weighted_sampling_steps_per_epoch: null @@ -19,6 +14,19 @@ log_ds_meta: ??? log_dir: ??? +mel_dim: 80 +lowfreq: 0 +highfreq: null + +# Change these values depending on your sampling rate. 
+sample_rate: 22050 +win_length: 1024 +hop_length: 256 +upsample_rates: [8, 8, 2, 2] +train_n_samples: 8192 +val_min_duration_seconds: 3.0 +val_n_samples: 66048 + model: max_epochs: ${max_epochs} @@ -27,16 +35,16 @@ model: preprocessor: _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures - nfilt: ${feature.mel_feature.mel_dim} - lowfreq: ${feature.mel_feature.lowfreq} - highfreq: ${feature.mel_feature.highfreq} - n_fft: ${feature.win_length} - n_window_size: ${feature.win_length} - n_window_stride: ${feature.hop_length} + nfilt: ${mel_dim} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + n_fft: ${win_length} + n_window_size: ${win_length} + n_window_stride: ${hop_length} pad_to: 0 pad_value: 0 exact_pad: true - sample_rate: ${feature.sample_rate} + sample_rate: ${sample_rate} window: hann normalize: null preemph: null @@ -49,12 +57,21 @@ model: mel_norm: null use_grads: false + generator: + _target_: nemo.collections.tts.modules.hifigan_modules.Generator + resblock: 1 + upsample_rates: ${upsample_rates} + upsample_kernel_sizes: [16, 16, 4, 4] + upsample_initial_channel: 512 + resblock_kernel_sizes: [3, 7, 11] + resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + train_ds: dataset: _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} - sample_rate: ${feature.sample_rate} - n_samples: ${sample.train_n_samples} + sample_rate: ${sample_rate} + n_samples: ${train_n_samples} min_duration: 0.4 max_duration: null dataset_meta: ${train_ds_meta} @@ -66,9 +83,9 @@ model: validation_ds: dataset: _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${feature.sample_rate} - n_samples: ${sample.val_n_samples} - min_duration: 3.0 + sample_rate: ${sample_rate} + n_samples: ${val_n_samples} + min_duration: ${val_min_duration_seconds} max_duration: null dataset_meta: ${val_ds_meta} @@ -88,7 +105,7 @@ model: dataset: _target_: 
nemo.collections.tts.data.vocoder_dataset.VocoderDataset - sample_rate: ${feature.sample_rate} + sample_rate: ${sample_rate} n_samples: null min_duration: null max_duration: null diff --git a/examples/tts/conf/hifigan_dataset/hifigan_44100.yaml b/examples/tts/conf/hifigan_dataset/hifigan_44100.yaml new file mode 100644 index 000000000000..537dc67cec38 --- /dev/null +++ b/examples/tts/conf/hifigan_dataset/hifigan_44100.yaml @@ -0,0 +1,151 @@ +# This config contains the default values for training a 44.1kHz HiFi-GAN model. +# If you want to train a model on another dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. + +name: "HifiGan" + +max_epochs: ??? +batch_size: 16 +weighted_sampling_steps_per_epoch: null + +train_ds_meta: ??? +val_ds_meta: ??? +log_ds_meta: ??? + +log_dir: ??? + +mel_dim: 80 +lowfreq: 0 +highfreq: null + +# Change these values depending on your sampling rate. Keep the product of upsample_rates equal to hop_length (here 8*8*4*2 = 512). +sample_rate: 44100 +win_length: 2048 +hop_length: 512 +upsample_rates: [8, 8, 4, 2] +train_n_samples: 16384 +val_min_duration_seconds: 3.0 +val_n_samples: 131072 + +model: + + max_epochs: ${max_epochs} + steps_per_epoch: ${weighted_sampling_steps_per_epoch} + l1_loss_factor: 60 + + preprocessor: + _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures + nfilt: ${mel_dim} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + n_fft: ${win_length} + n_window_size: ${win_length} + n_window_stride: ${hop_length} + pad_to: 0 + pad_value: 0 + exact_pad: true + sample_rate: ${sample_rate} + window: hann + normalize: null + preemph: null + dither: 0.0 + frame_splicing: 1 + log: true + log_zero_guard_type: add + log_zero_guard_value: 1.0 + mag_power: 1.0 + mel_norm: null + use_grads: false + + generator: + _target_: nemo.collections.tts.modules.hifigan_modules.Generator + resblock: 1 + upsample_rates: ${upsample_rates} + upsample_kernel_sizes: [16, 16, 4, 4] + upsample_initial_channel: 512 +
resblock_kernel_sizes: [3, 7, 11] + resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + + train_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch} + sample_rate: ${sample_rate} + n_samples: ${train_n_samples} + min_duration: 0.4 + max_duration: null + dataset_meta: ${train_ds_meta} + + dataloader_params: + batch_size: ${batch_size} + num_workers: 4 + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${sample_rate} + n_samples: ${val_n_samples} + min_duration: ${val_min_duration_seconds} + max_duration: null + dataset_meta: ${val_ds_meta} + + dataloader_params: + batch_size: ${batch_size} + num_workers: 2 + + log_config: + log_dir: ${log_dir} + log_epochs: [10, 50] + epoch_frequency: 100 + log_tensorboard: false + log_wandb: false + + generators: + - _target_: nemo.collections.tts.parts.utils.callbacks.VocoderArtifactGenerator + + dataset: + _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset + sample_rate: ${sample_rate} + n_samples: null + min_duration: null + max_duration: null + trunc_duration: 15.0 + dataset_meta: ${log_ds_meta} + + dataloader_params: + batch_size: 4 + num_workers: 2 + + optim: + _target_: torch.optim.AdamW + lr: 2e-4 + betas: [0.8, 0.99] + weight_decay: 1e-6 + sched: + name: ExponentialLR + gamma: 0.999 + +trainer: + num_nodes: 1 + devices: 1 + accelerator: gpu + strategy: ddp + precision: 16 + max_epochs: ${max_epochs} + accumulate_grad_batches: 1 + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 10 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + create_wandb_logger: false + checkpoint_callback_params: + monitor: val_loss + resume_if_exists: false + 
resume_ignore_no_checkpoint: false diff --git a/tutorials/tts/FastPitch_Data_Preparation.ipynb b/tutorials/tts/FastPitch_Data_Preparation.ipynb index 8a3094c76a56..46778759d5cb 100644 --- a/tutorials/tts/FastPitch_Data_Preparation.ipynb +++ b/tutorials/tts/FastPitch_Data_Preparation.ipynb @@ -99,6 +99,7 @@ { "cell_type": "code", "source": [ + "\n", "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", "# comment out the below lines and set NEMO_ROOT_DIR to your local path.\n", "!git clone -b $BRANCH https://github.com/NVIDIA/NeMo.git $NEMO_ROOT_DIR" @@ -443,7 +444,7 @@ "# Directory with raw audio data\n", "input_audio_dir = DATA_DIR / \"audio\"\n", "# Directory to write preprocessed audio to\n", - "output_audio_dir = DATA_DIR / \"audio_44khz\"\n", + "output_audio_dir = DATA_DIR / \"audio_preprocessed\"\n", "# Whether to overwrite existing audio, if it exists in the output directory\n", "overwrite_audio = True\n", "# Whether to overwrite output manifest, if it exists\n", @@ -627,9 +628,18 @@ "source": [ "feature_script = NEMO_SCRIPT_DIR / \"compute_features.py\"\n", "\n", - "feature_config_path = NEMO_CONFIG_DIR / \"feature\" / \"feature_44100.yaml\"\n", - "audio_dir = DATA_DIR / \"audio_44khz\"\n", - "feature_dir = DATA_DIR / \"features_44khz\"\n", + "sample_rate = 44100\n", + "\n", + "if sample_rate == 22050:\n", + " feature_config_filename = \"feature_22050.yaml\"\n", + "elif sample_rate == 44100:\n", + " feature_config_filename = \"feature_44100.yaml\"\n", + "else:\n", + " raise ValueError(f\"Unsupported sampling rate {sample_rate}\")\n", + "\n", + "feature_config_path = NEMO_CONFIG_DIR / \"feature\" / feature_config_filename\n", + "audio_dir = DATA_DIR / \"audio_preprocessed\"\n", + "feature_dir = DATA_DIR / \"features\"\n", "num_workers = 4\n", "\n", "def compute_features(data_type):\n", @@ -723,7 +733,7 @@ "feature_stats_script = NEMO_SCRIPT_DIR / \"compute_feature_stats.py\"\n", "\n", 
"train_manifest_filepath = DATA_DIR / \"train_manifest.json\"\n", - "output_stats_path = DATA_DIR / \"feature_stats_44khz.json\"\n", + "output_stats_path = DATA_DIR / \"feature_stats.json\"\n", "\n", "args = [\n", " f\"--feature_config_path={feature_config_path}\",\n", @@ -800,7 +810,7 @@ "cell_type": "code", "source": [ "dataset_name = \"vctk\"\n", - "audio_dir = DATA_DIR / \"audio_44khz\"\n", + "audio_dir = DATA_DIR / \"audio_preprocessed\"\n", "train_manifest_filepath = DATA_DIR / \"train_manifest.json\"\n", "dev_manifest_filepath = DATA_DIR / \"dev_manifest.json\"" ], @@ -822,10 +832,14 @@ "sample_rate = 44100\n", "\n", "# Config files specifying all HiFi-GAN parameters\n", - "hifigan_config_dir = NEMO_CONFIG_DIR / \"hifigan\"\n", - "hifigan_config_filename = \"hifigan_data.yaml\"\n", - "feature_config = f\"feature_{sample_rate}\"\n", - "sample_config = f\"sample_{sample_rate}\"\n", + "hifigan_config_dir = NEMO_CONFIG_DIR / \"hifigan_dataset\"\n", + "\n", + "if sample_rate == 22050:\n", + " hifigan_config_filename = \"hifigan_22050.yaml\"\n", + "elif sample_rate == 44100:\n", + " hifigan_config_filename = \"hifigan_44100.yaml\"\n", + "else:\n", + " raise ValueError(f\"Unsupported sampling rate {sample_rate}\")\n", "\n", "# Name of the experiment that will determine where it is saved locally and in TensorBoard and WandB\n", "run_id = \"test_run\"\n", @@ -834,13 +848,6 @@ "# Directory where predicted audio will be stored periodically throughout training\n", "hifigan_log_dir = hifigan_exp_output_dir / \"logs\"\n", "\n", - "if sample_rate == 22050:\n", - " generator_config = \"v1\"\n", - "elif sample_rate == 44100:\n", - " generator_config = \"v1_44100\"\n", - "else:\n", - " raise ValueError(f\"Unsupported sampling rate {sample_rate}\")\n", - "\n", "if torch.cuda.is_available():\n", " accelerator=\"gpu\"\n", " batch_size = 16\n", @@ -850,11 +857,7 @@ "\n", "args = [\n", " f\"--config-path={hifigan_config_dir}\",\n", - " f\"--config-dir={NEMO_CONFIG_DIR}\",\n", " 
f\"--config-name={hifigan_config_filename}\",\n", - " f\"feature={feature_config}\",\n", - " f\"sample={sample_config}\",\n", - " f'model/generator={generator_config}',\n", " f\"max_epochs={epochs}\",\n", " f\"weighted_sampling_steps_per_epoch={steps_per_epoch}\",\n", " f\"batch_size={batch_size}\",\n", @@ -986,8 +989,13 @@ "\n", "# Config files specifying all FastPitch parameters\n", "fastpitch_config_dir = NEMO_CONFIG_DIR / \"fastpitch\"\n", - "fastpitch_config_filename = \"fastpitch.yaml\"\n", - "feature_config = f\"feature_{sample_rate}\"\n", + "\n", + "if sample_rate == 22050:\n", + " fastpitch_config_filename = \"fastpitch_22050.yaml\"\n", + "elif sample_rate == 44100:\n", + " fastpitch_config_filename = \"fastpitch_44100.yaml\"\n", + "else:\n", + " raise ValueError(f\"Unsupported sampling rate {sample_rate}\")\n", "\n", "# Metadata files and directories\n", "dataset_file_dir = NEMO_DIR / \"scripts\" / \"tts_dataset_files\"\n", @@ -995,8 +1003,8 @@ "heteronyms_path = dataset_file_dir / \"heteronyms-052722\"\n", "\n", "speaker_path = DATA_DIR / \"speakers.json\"\n", - "feature_dir = DATA_DIR / \"features_44khz\"\n", - "stats_path = DATA_DIR / \"feature_stats_44khz.json\"\n", + "feature_dir = DATA_DIR / \"features\"\n", + "stats_path = DATA_DIR / \"feature_stats.json\"\n", "\n", "def get_latest_checkpoint(checkpoint_dir):\n", " output_path = None\n", @@ -1031,9 +1039,7 @@ "\n", "args = [\n", " f\"--config-path={fastpitch_config_dir}\",\n", - " f\"--config-dir={NEMO_CONFIG_DIR}\",\n", " f\"--config-name={fastpitch_config_filename}\",\n", - " f\"feature={feature_config}\",\n", " f\"n_speakers={num_speakers}\",\n", " f\"speaker_path={speaker_path}\",\n", " f\"max_epochs={epochs}\",\n",