From b9a9c4041716d80e1209a6c65f0123ed911bfc54 Mon Sep 17 00:00:00 2001 From: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com> Date: Tue, 18 Apr 2023 00:45:50 +0800 Subject: [PATCH 01/23] [TTS] FastPitch adapter fine-tune and conditional layer normalization (#6416) [TTS] FastPitch adapter fine-tune and conditional layer normalization (#6416) --------- Signed-off-by: hsiehjackson Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conf/fastpitch_align_44100_adapter.yaml | 286 ++++++++++++++ examples/tts/fastpitch_finetune_adapters.py | 141 +++++++ nemo/collections/tts/models/fastpitch.py | 21 +- nemo/collections/tts/modules/__init__.py | 1 + nemo/collections/tts/modules/adapters.py | 147 +++++++ nemo/collections/tts/modules/aligner.py | 11 +- nemo/collections/tts/modules/fastpitch.py | 100 ++--- nemo/collections/tts/modules/submodules.py | 99 ++++- nemo/collections/tts/modules/transformer.py | 98 +++-- nemo/collections/tts/parts/mixins/__init__.py | 15 + .../parts/mixins/fastpitch_adapter_mixins.py | 368 ++++++++++++++++++ .../tts/modules/test_submodules.py | 48 +++ 12 files changed, 1244 insertions(+), 91 deletions(-) create mode 100644 examples/tts/conf/fastpitch_align_44100_adapter.yaml create mode 100644 examples/tts/fastpitch_finetune_adapters.py create mode 100644 nemo/collections/tts/modules/adapters.py create mode 100644 nemo/collections/tts/parts/mixins/__init__.py create mode 100644 nemo/collections/tts/parts/mixins/fastpitch_adapter_mixins.py create mode 100644 tests/collections/tts/modules/test_submodules.py diff --git a/examples/tts/conf/fastpitch_align_44100_adapter.yaml b/examples/tts/conf/fastpitch_align_44100_adapter.yaml new file mode 100644 index 000000000000..032ab1da501f --- /dev/null +++ b/examples/tts/conf/fastpitch_align_44100_adapter.yaml @@ -0,0 +1,286 @@ +# This config contains the default values for training FastPitch speaker adaptation +# If you want to train model on other dataset, you can change config values according to your dataset. +# Most dataset-specific arguments are in the head of the config file, see below. + +name: FastPitch + +train_dataset: ??? +validation_datasets: ??? +sup_data_path: ??? +sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id"] + +# Default values from librosa.pyin +pitch_fmin: 65.40639132514966 +pitch_fmax: 2093.004522404789 + +# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values +# by running `scripts/dataset_processing/tts/extract_sup_data.py` +pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech +pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech + +# Default values for dataset with sample_rate=44100 +sample_rate: 44100 +n_mel_channels: 80 +n_window_size: 2048 +n_window_stride: 512 +n_fft: 2048 +lowfreq: 0 +highfreq: 8000 +window: hann + +phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" +heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" + +model: + learn_alignment: true + bin_loss_warmup_epochs: 100 + + n_speakers: 1 + max_token_duration: 75 + symbols_embedding_dim: 384 + speaker_embedding_dim: 384 + pitch_embedding_kernel_size: 3 + + pitch_fmin: ${pitch_fmin} + pitch_fmax: ${pitch_fmax} + + pitch_mean: ${pitch_mean} + pitch_std: ${pitch_std} + + sample_rate: ${sample_rate} + n_mel_channels: ${n_mel_channels} + n_window_size: ${n_window_size} + n_window_stride: ${n_window_stride} + n_fft: ${n_fft} + lowfreq: ${lowfreq} + highfreq: ${highfreq} + window: ${window} + + text_normalizer: + _target_: nemo_text_processing.text_normalization.normalize.Normalizer + lang: en + input_case: cased + + text_normalizer_call_kwargs: + verbose: false + punct_pre_process: true + punct_post_process: true + + text_tokenizer: + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer + punct: true + stresses: true + chars: true + apostrophe: true + pad_with_space: true + g2p: + _target_: nemo.collections.tts.g2p.modules.EnglishG2p + phoneme_dict: ${phoneme_dict_path} + heteronyms: ${heteronyms_path} + phoneme_probability: 0.5 + + adapter: + # Config of the adapter training/eval script. + adapter_name: "adapter" # Name of the adapter, used by the script + adapter_module_name: "encoder+decoder+duration_predictor+pitch_predictor+aligner" # Name of the adapter module. Combine multiple modules with '+' between module names. + adapter_state_dict_name: "adapters.pt" # If the individual adapters must be saved, a file name can be provided here. null disables this. + + # Config of the adapter module itself + _target_: nemo.collections.common.parts.adapter_modules.LinearAdapter + in_features: ${model.symbols_embedding_dim} # User must provide the output dimension of the layers of the model, which is the input dimension of this adapter. + dim: 256 # The hidden dimension of the adapter, as chosen by user, but small values are preferred to reduce param count. + activation: swish + norm_position: 'pre' # Can be `pre` or `post` + dropout: 0.0 # float, dropout for the adapter + + # Adapter strategy config + adapter_strategy: + _target_: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy + stochastic_depth: 0.0 # float, setting to > 0 will enable stochastic depth for each adapter block. + l2_lambda: 0.0 # float, setting to > 0 will enable l2 norm auxiliary loss for each adapter's output. + + # Optional global config available to all adapters at a global level. + # A global config is shared across every layer of the adapters, defining global properties rather + # than properties local to the adapter (as defined above). + # This can be useful in order to select *which type of adapter* is added, *what adapters to enable*, + # and further global operations that can decide dynamically how to support the requested adapter. 
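+  # The `check_*` flags under `global_cfg` below are read by the adapter mixin's
+  # `check_valid_model_with_adapter_support_()` check. They can also be overridden per run,
+  # e.g. (illustrative Hydra override): model.adapter.global_cfg.check_encoder_adapter=False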
+ global_cfg: + check_encoder_adapter: True # determines whether to check if encoder adapter modules is supported + check_decoder_adapter: True # determines whether to check if decoder adapter modules is supported + check_duration_predictor_adapter: True # determines whether to check if duration_predictor adapter modules is supported + check_pitch_predictor_adapter: True # determines whether to check if pitch_predictor adapter modules is supported + check_aligner_adapter: True # determines whether to check if aligner adapter modules is supported + + train_ds: + dataset: + _target_: nemo.collections.tts.data.dataset.TTSDataset + manifest_filepath: ${train_dataset} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + max_duration: null + min_duration: 0.1 + ignore_file: null + trim: false + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} + pitch_norm: true + pitch_mean: ${model.pitch_mean} + pitch_std: ${model.pitch_std} + use_beta_binomial_interpolator: true + + dataloader_params: + drop_last: false + shuffle: true + batch_size: 32 + num_workers: 12 + pin_memory: true + + validation_ds: + dataset: + _target_: nemo.collections.tts.data.dataset.TTSDataset + manifest_filepath: ${validation_datasets} + sample_rate: ${model.sample_rate} + sup_data_path: ${sup_data_path} + sup_data_types: ${sup_data_types} + n_fft: ${model.n_fft} + win_length: ${model.n_window_size} + hop_length: ${model.n_window_stride} + window: ${model.window} + n_mels: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + max_duration: null + min_duration: 0.1 + ignore_file: null + trim: false + pitch_fmin: ${model.pitch_fmin} + pitch_fmax: ${model.pitch_fmax} + pitch_norm: true + pitch_mean: ${model.pitch_mean} + pitch_std: ${model.pitch_std} + use_beta_binomial_interpolator: true + + dataloader_params: + drop_last: false + shuffle: false + batch_size: 32 + num_workers: 8 + pin_memory: true + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + features: ${model.n_mel_channels} + lowfreq: ${model.lowfreq} + highfreq: ${model.highfreq} + n_fft: ${model.n_fft} + n_window_size: ${model.n_window_size} + window_size: false + n_window_stride: ${model.n_window_stride} + window_stride: false + pad_to: 1 + pad_value: 0 + sample_rate: ${model.sample_rate} + window: ${model.window} + normalize: null + preemph: null + dither: 0.0 + frame_splicing: 1 + log: true + log_zero_guard_type: add + log_zero_guard_value: 1e-05 + mag_power: 1.0 + + input_fft: #n_embed and padding_idx are added by the model + _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + d_embed: ${model.symbols_embedding_dim} + condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] + + output_fft: + _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder + n_layer: 6 + n_head: 1 + d_model: ${model.symbols_embedding_dim} + d_head: 64 + d_inner: 1536 + kernel_size: 3 + dropout: 0.1 + dropatt: 0.1 + dropemb: 0.0 + condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] + + 
alignment_module: + _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder + n_text_channels: ${model.symbols_embedding_dim} + condition_types: [ "add" ] # options: [ "add", "cat" ] + + duration_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] + + pitch_predictor: + _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor + input_size: ${model.symbols_embedding_dim} + kernel_size: 3 + filter_size: 256 + dropout: 0.1 + n_layers: 2 + condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] + + optim: + name: adamw + lr: 1e-3 + betas: [0.9, 0.999] + weight_decay: 1e-6 + + sched: + name: NoamAnnealing + warmup_steps: 1000 + last_epoch: -1 + d_model: 1 # Disable scaling based on model dim + +trainer: + num_nodes: 1 + devices: 1 + accelerator: gpu + strategy: ddp + precision: 16 + max_epochs: 1000 + accumulate_grad_batches: 1 + gradient_clip_val: 1000.0 + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + log_every_n_steps: 100 + check_val_every_n_epoch: 1 + benchmark: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + resume_if_exists: false + resume_ignore_no_checkpoint: false diff --git a/examples/tts/fastpitch_finetune_adapters.py b/examples/tts/fastpitch_finetune_adapters.py new file mode 100644 index 000000000000..396552b0f4fd --- /dev/null +++ b/examples/tts/fastpitch_finetune_adapters.py @@ -0,0 +1,141 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
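+
+# This script freezes a pre-trained FastPitch model, adds adapter modules to the modules
+# listed in `model.adapter.adapter_module_name`, and fine-tunes only the adapter weights.
+# A typical invocation might look like the following sketch; all paths, the pitch statistics
+# and the `+init_from_nemo_model` checkpoint are placeholders to be filled in by the user
+# (the exact `init_from_*` key depends on how the pre-trained checkpoint is provided):
+#
+#   python fastpitch_finetune_adapters.py \
+#       --config-name=fastpitch_align_44100_adapter.yaml \
+#       train_dataset=<train_manifest.json> \
+#       validation_datasets=<val_manifest.json> \
+#       sup_data_path=<sup_data_dir> \
+#       pitch_mean=<speaker_pitch_mean> pitch_std=<speaker_pitch_std> \
+#       +init_from_nemo_model=<pretrained_fastpitch.nemo> \
+#       exp_manager.exp_dir=<adapter_finetune_exp_dir>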
+ +import os +from dataclasses import is_dataclass + +import pytorch_lightning as pl +from omegaconf import DictConfig, OmegaConf, open_dict + +from nemo.collections.common.callbacks import LogEpochTimeCallback +from nemo.collections.tts.models import FastPitchModel +from nemo.core import adapter_mixins +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +def update_model_config_to_support_adapter(config) -> DictConfig: + with open_dict(config): + enc_adapter_metadata = adapter_mixins.get_registered_adapter(config.input_fft._target_) + if enc_adapter_metadata is not None: + config.input_fft._target_ = enc_adapter_metadata.adapter_class_path + + dec_adapter_metadata = adapter_mixins.get_registered_adapter(config.output_fft._target_) + if dec_adapter_metadata is not None: + config.output_fft._target_ = dec_adapter_metadata.adapter_class_path + + pitch_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.pitch_predictor._target_) + if pitch_predictor_adapter_metadata is not None: + config.pitch_predictor._target_ = pitch_predictor_adapter_metadata.adapter_class_path + + duration_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.duration_predictor._target_) + if duration_predictor_adapter_metadata is not None: + config.duration_predictor._target_ = duration_predictor_adapter_metadata.adapter_class_path + + aligner_adapter_metadata = adapter_mixins.get_registered_adapter(config.alignment_module._target_) + if aligner_adapter_metadata is not None: + config.alignment_module._target_ = aligner_adapter_metadata.adapter_class_path + + return config + + +def add_global_adapter_cfg(model, global_adapter_cfg): + # Convert to DictConfig from dict or Dataclass + if is_dataclass(global_adapter_cfg): + global_adapter_cfg = OmegaConf.structured(global_adapter_cfg) + + if not isinstance(global_adapter_cfg, DictConfig): + global_adapter_cfg = DictConfig(global_adapter_cfg) + + # Update the model.cfg with information about the new adapter global cfg + with open_dict(global_adapter_cfg), open_dict(model.cfg): + if 'adapters' not in model.cfg: + model.cfg.adapters = OmegaConf.create({}) + + # Add the global config for adapters to the model's internal config + model.cfg.adapters[model.adapter_global_cfg_key] = global_adapter_cfg + + # Update all adapter modules (that already exist) with this global adapter config + model.update_adapter_cfg(model.cfg.adapters) + + +@hydra_runner(config_path="conf", config_name="fastpitch_align_44100_adapter") +def main(cfg): + if hasattr(cfg.model.optim, 'sched'): + logging.warning("You are using an optimizer scheduler while finetuning. 
Are you sure this is intended?") + if cfg.model.optim.lr > 1e-3 or cfg.model.optim.lr < 1e-5: + logging.warning("The recommended learning rate for finetuning is 2e-4") + + trainer = pl.Trainer(**cfg.trainer) + exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None)) + # Initialize FastPitchModel + model = FastPitchModel(cfg=update_model_config_to_support_adapter(cfg.model), trainer=trainer) + model.maybe_init_from_pretrained_checkpoint(cfg=cfg) + + # Extract adapter parameters + with open_dict(cfg.model.adapter): + # Extract the name of the adapter (must be given for training) + adapter_name = cfg.model.adapter.pop("adapter_name", "adapter") + # Extract the name of the modules where adapters need to be added (must be given for training) + adapter_module_name = cfg.model.adapter.pop("adapter_module_name", None) + # Name of the adapter checkpoint which will be saved after training + adapter_state_dict_name = cfg.model.adapter.pop("adapter_state_dict_name", None) + + # augment adapter name with module name, if not provided by user + if adapter_module_name is not None and ':' not in adapter_name: + adapter_name = f'{adapter_module_name}:{adapter_name}' + + # Extract the global adapter config, if provided + adapter_global_cfg = cfg.model.adapter.pop(model.adapter_global_cfg_key, None) + + # Freeze model + model.freeze() + + # Setup adapters + if adapter_global_cfg is not None: + add_global_adapter_cfg(model, adapter_global_cfg) + + # Add adapters + model.add_adapter(name=adapter_name, cfg=cfg.model.adapter) + assert model.is_adapter_available() + # enable adapters + model.set_enabled_adapters(enabled=False) + model.set_enabled_adapters(adapter_name, enabled=True) + + # Set model to training mode. + model = model.train() + # Then, Unfreeze just the adapter weights that were enabled above (no part of model) + model.unfreeze_enabled_adapters() + # summarize the model + model.summarize() + + lr_logger = pl.callbacks.LearningRateMonitor() + epoch_time_logger = LogEpochTimeCallback() + trainer.callbacks.extend([lr_logger, epoch_time_logger]) + trainer.fit(model) + + # Save the adapter state dict after training has completed + if adapter_state_dict_name is not None: + state_path = exp_log_dir if exp_log_dir is not None else os.getcwd() + ckpt_path = os.path.join(state_path, "checkpoints") + if os.path.exists(ckpt_path): + state_path = ckpt_path + + # Save the adapter modules in a seperate file + model.save_adapters(os.path.join(state_path, adapter_state_dict_name)) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 7d4a110df86f..76eaae2f9ba2 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -26,6 +26,7 @@ from nemo.collections.tts.losses.fastpitchloss import DurationLoss, EnergyLoss, MelLoss, PitchLoss from nemo.collections.tts.models.base import SpectrogramGenerator from nemo.collections.tts.modules.fastpitch import FastPitchModule +from nemo.collections.tts.parts.mixins import FastPitchAdapterModelMixin from nemo.collections.tts.parts.utils.helpers import ( batch_from_ragged, plot_alignment_to_numpy, @@ -74,7 +75,7 @@ class TextTokenizerConfig: text_tokenizer: TextTokenizer = TextTokenizer() -class FastPitchModel(SpectrogramGenerator, Exportable): +class FastPitchModel(SpectrogramGenerator, Exportable, FastPitchAdapterModelMixin): """FastPitch model (https://arxiv.org/abs/2006.06873) that is used to 
generate mel spectrogram from text.""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): @@ -138,11 +139,22 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): output_fft = instantiate(self._cfg.output_fft) duration_predictor = instantiate(self._cfg.duration_predictor) pitch_predictor = instantiate(self._cfg.pitch_predictor) + energy_embedding_kernel_size = cfg.get("energy_embedding_kernel_size", 0) + energy_predictor = instantiate(self._cfg.get("energy_predictor", None)) + + # [TODO] may remove if we change the pre-trained config speaker_emb_condition_prosody = cfg.get("speaker_emb_condition_prosody", False) speaker_emb_condition_decoder = cfg.get("speaker_emb_condition_decoder", False) speaker_emb_condition_aligner = cfg.get("speaker_emb_condition_aligner", False) - energy_embedding_kernel_size = cfg.get("energy_embedding_kernel_size", 0) - energy_predictor = instantiate(self._cfg.get("energy_predictor", None)) + if cfg.n_speakers > 1: + input_fft.cond_input.condition_types.append("add") + if speaker_emb_condition_prosody: + duration_predictor.cond_input.condition_types.append("add") + pitch_predictor.cond_input.condition_types.append("add") + if speaker_emb_condition_decoder: + output_fft.cond_input.condition_types.append("add") + if speaker_emb_condition_aligner and self.aligner is not None: + self.aligner.cond_input.condition_types.append("add") self.fastpitch = FastPitchModule( input_fft, @@ -157,9 +169,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): energy_embedding_kernel_size, cfg.n_mel_channels, cfg.max_token_duration, - speaker_emb_condition_prosody, - speaker_emb_condition_decoder, - speaker_emb_condition_aligner, ) self._input_types = self._output_types = None self.export_config = { diff --git a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index 1354de22339d..ec7563d1966b 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import nemo.collections.tts.modules.adapters from nemo.collections.tts.modules.tacotron2 import Decoder as Taco2Decoder from nemo.collections.tts.modules.tacotron2 import Encoder as Taco2Encoder from nemo.collections.tts.modules.tacotron2 import Postnet as Taco2Postnet diff --git a/nemo/collections/tts/modules/adapters.py b/nemo/collections/tts/modules/adapters.py new file mode 100644 index 000000000000..df5bdff84dc5 --- /dev/null +++ b/nemo/collections/tts/modules/adapters.py @@ -0,0 +1,147 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
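+
+# The classes in this file wrap existing FastPitch submodules (FFTransformerEncoder,
+# FFTransformerDecoder, AlignmentEncoder, TemporalPredictor) with
+# `adapter_mixins.AdapterModuleMixin` and register them at the bottom of the file, so that
+# a config `_target_` can be swapped for its adapter-capable variant before instantiation.
+# Illustrative sketch of how such a lookup is used (see also
+# `update_model_config_to_support_adapter` in examples/tts/fastpitch_finetune_adapters.py):
+#
+#   from nemo.core import adapter_mixins
+#   meta = adapter_mixins.get_registered_adapter(cfg.input_fft._target_)
+#   if meta is not None:
+#       cfg.input_fft._target_ = meta.adapter_class_path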
+ +from typing import List, Optional + +from omegaconf import DictConfig + +from nemo.collections.asr.parts.utils import adapter_utils +from nemo.collections.tts.modules.aligner import AlignmentEncoder +from nemo.collections.tts.modules.fastpitch import TemporalPredictor +from nemo.collections.tts.modules.transformer import FFTransformerDecoder, FFTransformerEncoder +from nemo.core.classes import adapter_mixins + + +class FFTransformerDecoderAdapter(FFTransformerDecoder, adapter_mixins.AdapterModuleMixin): + """ Inherit from FFTransformerDecoder and add support for adapter""" + + def add_adapter(self, name: str, cfg: dict): + cfg = self._update_adapter_cfg_input_dim(cfg) + for fft_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin + fft_layer.add_adapter(name, cfg) + + def is_adapter_available(self) -> bool: + return any([FFT_layer.is_adapter_available() for FFT_layer in self.layers]) + + def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): + for FFT_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin + FFT_layer.set_enabled_adapters(name=name, enabled=enabled) + + def get_enabled_adapters(self) -> List[str]: + names = set([]) + for FFT_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin + names.update(FFT_layer.get_enabled_adapters()) + + names = sorted(list(names)) + return names + + def _update_adapter_cfg_input_dim(self, cfg: DictConfig): + cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self.d_model) + return cfg + + +class FFTransformerEncoderAdapter( + FFTransformerDecoderAdapter, FFTransformerEncoder, adapter_mixins.AdapterModuleMixin +): + """ Inherit from FFTransformerEncoder and add support for adapter""" + + pass + + +class AlignmentEncoderAdapter(AlignmentEncoder, adapter_mixins.AdapterModuleMixin): + """ Inherit from AlignmentEncoder and add support for adapter""" + + def add_adapter(self, name: str, cfg: dict): + + for i, conv_layer in enumerate(self.key_proj): + if i % 2 == 0: + cfg = self._update_adapter_cfg_input_dim(cfg, conv_layer.conv.out_channels) + conv_layer.add_adapter(name, cfg) + + for i, conv_layer in enumerate(self.query_proj): + if i % 2 == 0: + cfg = self._update_adapter_cfg_input_dim(cfg, conv_layer.conv.out_channels) + conv_layer.add_adapter(name, cfg) + + def is_adapter_available(self) -> bool: + return any( + [conv_layer.is_adapter_available() for i, conv_layer in enumerate(self.key_proj) if i % 2 == 0] + + [conv_layer.is_adapter_available() for i, conv_layer in enumerate(self.query_proj) if i % 2 == 0] + ) + + def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): + for i, conv_layer in enumerate(self.key_proj): + if i % 2 == 0: + conv_layer.set_enabled_adapters(name=name, enabled=enabled) + for i, conv_layer in enumerate(self.query_proj): + if i % 2 == 0: + conv_layer.set_enabled_adapters(name=name, enabled=enabled) + + def get_enabled_adapters(self) -> List[str]: + names = set([]) + for i, conv_layer in enumerate(self.key_proj): + if i % 2 == 0: + names.update(conv_layer.get_enabled_adapters()) + for i, conv_layer in enumerate(self.query_proj): + if i % 2 == 0: + names.update(conv_layer.get_enabled_adapters()) + + names = sorted(list(names)) + return names + + def _update_adapter_cfg_input_dim(self, cfg: DictConfig, module_dim: int): + cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=module_dim) + return cfg + + +class TemporalPredictorAdapter(TemporalPredictor, adapter_mixins.AdapterModuleMixin): + """ Inherit from 
TemporalPredictor and add support for adapter""" + + def add_adapter(self, name: str, cfg: dict): + cfg = self._update_adapter_cfg_input_dim(cfg) + for conv_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin + conv_layer.add_adapter(name, cfg) + + def is_adapter_available(self) -> bool: + return any([conv_layer.is_adapter_available() for conv_layer in self.layers]) + + def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): + for conv_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin + conv_layer.set_enabled_adapters(name=name, enabled=enabled) + + def get_enabled_adapters(self) -> List[str]: + names = set([]) + for conv_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin + names.update(conv_layer.get_enabled_adapters()) + + names = sorted(list(names)) + return names + + def _update_adapter_cfg_input_dim(self, cfg: DictConfig): + cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self.filter_size) + return cfg + + +"""Register any additional information""" +if adapter_mixins.get_registered_adapter(FFTransformerEncoder) is None: + adapter_mixins.register_adapter(base_class=FFTransformerEncoder, adapter_class=FFTransformerEncoderAdapter) + +if adapter_mixins.get_registered_adapter(FFTransformerDecoder) is None: + adapter_mixins.register_adapter(base_class=FFTransformerDecoder, adapter_class=FFTransformerDecoderAdapter) + +if adapter_mixins.get_registered_adapter(AlignmentEncoder) is None: + adapter_mixins.register_adapter(base_class=AlignmentEncoder, adapter_class=AlignmentEncoderAdapter) + +if adapter_mixins.get_registered_adapter(TemporalPredictor) is None: + adapter_mixins.register_adapter(base_class=TemporalPredictor, adapter_class=TemporalPredictorAdapter) diff --git a/nemo/collections/tts/modules/aligner.py b/nemo/collections/tts/modules/aligner.py index 2b03b7aea219..bc170742df23 100644 --- a/nemo/collections/tts/modules/aligner.py +++ b/nemo/collections/tts/modules/aligner.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. + import torch from torch import nn -from nemo.collections.tts.modules.submodules import ConvNorm +from nemo.collections.tts.modules.submodules import ConditionalInput, ConvNorm from nemo.collections.tts.parts.utils.helpers import binarize_attention_parallel @@ -23,10 +24,11 @@ class AlignmentEncoder(torch.nn.Module): """Module for alignment text and mel spectrogram. """ def __init__( - self, n_mel_channels=80, n_text_channels=512, n_att_channels=80, temperature=0.0005, + self, n_mel_channels=80, n_text_channels=512, n_att_channels=80, temperature=0.0005, condition_types=[] ): super().__init__() self.temperature = temperature + self.cond_input = ConditionalInput(n_text_channels, n_text_channels, condition_types) self.softmax = torch.nn.Softmax(dim=3) self.log_softmax = torch.nn.LogSoftmax(dim=3) @@ -151,13 +153,12 @@ def forward(self, queries, keys, mask=None, attn_prior=None, conditioning=None): keys (torch.tensor): B x C2 x T2 tensor (text data). mask (torch.tensor): B x T2 x 1 tensor, binary mask for variable length entries (True = mask element, False = leave unchanged). attn_prior (torch.tensor): prior for attention matrix. - conditioning (torch.tensor): B x T2 x 1 conditioning embedding + conditioning (torch.tensor): B x 1 x C2 conditioning embedding Output: attn (torch.tensor): B x 1 x T1 x T2 attention mask. Final dim T2 should sum to 1. 
attn_logprob (torch.tensor): B x 1 x T1 x T2 log-prob attention mask. """ - if conditioning is not None: - keys = keys + conditioning.transpose(1, 2) + keys = self.cond_input(keys.transpose(1, 2), conditioning).transpose(1, 2) keys_enc = self.key_proj(keys) # B x n_attn_dims x T2 queries_enc = self.query_proj(queries) # B x n_attn_dims x T1 diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py index eaee68a23517..83ec35d58693 100644 --- a/nemo/collections/tts/modules/fastpitch.py +++ b/nemo/collections/tts/modules/fastpitch.py @@ -44,8 +44,9 @@ import torch +from nemo.collections.tts.modules.submodules import ConditionalInput, ConditionalLayerNorm from nemo.collections.tts.parts.utils.helpers import binarize_attention_parallel, regulate_len -from nemo.core.classes import NeuralModule, typecheck +from nemo.core.classes import NeuralModule, adapter_mixins, typecheck from nemo.core.neural_types.elements import ( EncodedRepresentation, Index, @@ -79,40 +80,54 @@ def average_features(pitch, durs): return pitch_avg -class ConvReLUNorm(torch.nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=1, dropout=0.0): +class ConvReLUNorm(torch.nn.Module, adapter_mixins.AdapterModuleMixin): + def __init__(self, in_channels, out_channels, kernel_size=1, dropout=0.0, condition_dim=384, condition_types=[]): super(ConvReLUNorm, self).__init__() self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, padding=(kernel_size // 2)) - self.norm = torch.nn.LayerNorm(out_channels) + self.norm = ConditionalLayerNorm(out_channels, condition_dim=condition_dim, condition_types=condition_types) self.dropout = torch.nn.Dropout(dropout) - def forward(self, signal): + def forward(self, signal, conditioning=None): out = torch.nn.functional.relu(self.conv(signal)) - out = self.norm(out.transpose(1, 2)).transpose(1, 2) - return self.dropout(out) + out = self.norm(out.transpose(1, 2), conditioning).transpose(1, 2) + out = self.dropout(out) + + if self.is_adapter_available(): + out = self.forward_enabled_adapters(out.transpose(1, 2)).transpose(1, 2) + + return out class TemporalPredictor(NeuralModule): """Predicts a single float per each temporal location""" - def __init__(self, input_size, filter_size, kernel_size, dropout, n_layers=2): + def __init__(self, input_size, filter_size, kernel_size, dropout, n_layers=2, condition_types=[]): super(TemporalPredictor, self).__init__() - self.layers = torch.nn.Sequential( - *[ + self.cond_input = ConditionalInput(input_size, input_size, condition_types) + self.layers = torch.nn.ModuleList() + for i in range(n_layers): + self.layers.append( ConvReLUNorm( - input_size if i == 0 else filter_size, filter_size, kernel_size=kernel_size, dropout=dropout + input_size if i == 0 else filter_size, + filter_size, + kernel_size=kernel_size, + dropout=dropout, + condition_dim=input_size, + condition_types=condition_types, ) - for i in range(n_layers) - ] - ) + ) self.fc = torch.nn.Linear(filter_size, 1, bias=True) + # Use for adapter input dimension + self.filter_size = filter_size + @property def input_types(self): return { "enc": NeuralType(('B', 'T', 'D'), EncodedRepresentation()), "enc_mask": NeuralType(('B', 'T', 1), TokenDurationType()), + "conditioning": NeuralType(('B', 'T', 'D'), EncodedRepresentation(), optional=True), } @property @@ -121,14 +136,20 @@ def output_types(self): "out": NeuralType(('B', 'T'), EncodedRepresentation()), } - def forward(self, enc, enc_mask): + def forward(self, enc, enc_mask, 
conditioning=None): + enc = self.cond_input(enc, conditioning) out = enc * enc_mask - out = self.layers(out.transpose(1, 2)).transpose(1, 2) + out = out.transpose(1, 2) + + for layer in self.layers: + out = layer(out, conditioning=conditioning) + + out = out.transpose(1, 2) out = self.fc(out) * enc_mask return out.squeeze(-1) -class FastPitchModule(NeuralModule): +class FastPitchModule(NeuralModule, adapter_mixins.AdapterModuleMixin): def __init__( self, encoder_module: NeuralModule, @@ -143,9 +164,6 @@ def __init__( energy_embedding_kernel_size: int, n_mel_channels: int = 80, max_token_duration: int = 75, - speaker_emb_condition_prosody: bool = False, - speaker_emb_condition_decoder: bool = False, - speaker_emb_condition_aligner: bool = False, ): super().__init__() @@ -158,9 +176,6 @@ def __init__( self.learn_alignment = aligner is not None self.use_duration_predictor = True self.binarize = False - self.speaker_emb_condition_prosody = speaker_emb_condition_prosody - self.speaker_emb_condition_decoder = speaker_emb_condition_decoder - self.speaker_emb_condition_aligner = speaker_emb_condition_aligner if n_speakers > 1: self.speaker_emb = torch.nn.Embedding(n_speakers, symbols_embedding_dim) @@ -245,33 +260,28 @@ def forward( # Calculate speaker embedding if self.speaker_emb is None or speaker is None: - spk_emb = 0 + spk_emb = None else: spk_emb = self.speaker_emb(speaker).unsqueeze(1) # Input FFT enc_out, enc_mask = self.encoder(input=text, conditioning=spk_emb) - if self.speaker_emb_condition_prosody: - prosody_input = enc_out + spk_emb - else: - prosody_input = enc_out - log_durs_predicted = self.duration_predictor(prosody_input, enc_mask) + + # Predict duration + log_durs_predicted = self.duration_predictor(enc_out, enc_mask, conditioning=spk_emb) durs_predicted = torch.clamp(torch.exp(log_durs_predicted) - 1, 0, self.max_token_duration) attn_soft, attn_hard, attn_hard_dur, attn_logprob = None, None, None, None if self.learn_alignment and spec is not None: text_emb = self.encoder.word_emb(text) - if self.speaker_emb_condition_aligner and not isinstance(spk_emb, int): - attn_soft, attn_logprob = self.aligner( - spec, text_emb.permute(0, 2, 1), enc_mask == 0, attn_prior, conditioning=spk_emb - ) - else: - attn_soft, attn_logprob = self.aligner(spec, text_emb.permute(0, 2, 1), enc_mask == 0, attn_prior) + attn_soft, attn_logprob = self.aligner( + spec, text_emb.permute(0, 2, 1), enc_mask == 0, attn_prior, conditioning=spk_emb + ) attn_hard = binarize_attention_parallel(attn_soft, input_lens, mel_lens) attn_hard_dur = attn_hard.sum(2)[:, 0, :] # Predict pitch - pitch_predicted = self.pitch_predictor(prosody_input, enc_mask) + pitch_predicted = self.pitch_predictor(enc_out, enc_mask, conditioning=spk_emb) if pitch is not None: if self.learn_alignment and pitch.shape[-1] != pitch_predicted.shape[-1]: # Pitch during training is per spectrogram frame, but during inference, it should be per character @@ -320,10 +330,7 @@ def forward( ) # Output FFT - if self.speaker_emb_condition_decoder: - dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens, conditioning=spk_emb) - else: - dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens) + dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens, conditioning=spk_emb) spect = self.proj(dec_out).transpose(1, 2) return ( spect, @@ -349,17 +356,13 @@ def infer(self, *, text, pitch=None, speaker=None, energy=None, pace=1.0, volume # Input FFT enc_out, enc_mask = self.encoder(input=text, conditioning=spk_emb) - if 
self.speaker_emb_condition_prosody: - prosody_input = enc_out + spk_emb - else: - prosody_input = enc_out # Predict duration and pitch - log_durs_predicted = self.duration_predictor(prosody_input, enc_mask) + log_durs_predicted = self.duration_predictor(enc_out, enc_mask, conditioning=spk_emb) durs_predicted = torch.clamp( torch.exp(log_durs_predicted) - 1.0, self.min_token_duration, self.max_token_duration ) - pitch_predicted = self.pitch_predictor(prosody_input, enc_mask) + pitch + pitch_predicted = self.pitch_predictor(enc_out, enc_mask, conditioning=spk_emb) + pitch pitch_emb = self.pitch_emb(pitch_predicted.unsqueeze(1)) enc_out = enc_out + pitch_emb.transpose(1, 2) @@ -380,10 +383,7 @@ def infer(self, *, text, pitch=None, speaker=None, energy=None, pace=1.0, volume volume_extended = volume_extended.squeeze(-1).float() # Output FFT - if self.speaker_emb_condition_decoder: - dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens, conditioning=spk_emb) - else: - dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens) + dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens, conditioning=spk_emb) spect = self.proj(dec_out).transpose(1, 2) return ( spect.to(torch.float), diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index 275468d60634..44ed0a92d776 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -19,6 +19,17 @@ from torch.autograd import Variable from torch.nn import functional as F +from nemo.core.classes import adapter_mixins + + +SUPPORTED_CONDITION_TYPES = ["add", "concat", "layernorm"] + + +def check_support_condition_types(condition_types): + for tp in condition_types: + if tp not in SUPPORTED_CONDITION_TYPES: + raise ValueError(f"Unknown conditioning type {tp}") + def masked_instance_norm( input: Tensor, mask: Tensor, weight: Tensor, bias: Tensor, momentum: float, eps: float = 1e-5, @@ -122,7 +133,7 @@ def forward(self, x): return self.linear_layer(x) -class ConvNorm(torch.nn.Module): +class ConvNorm(torch.nn.Module, adapter_mixins.AdapterModuleMixin): __constants__ = ['use_partial_padding'] use_partial_padding: bool @@ -176,6 +187,10 @@ def forward(self, signal, mask=None): ret = self.conv(signal) if self.norm is not None: ret = self.norm(ret) + + if self.is_adapter_available(): + ret = self.forward_enabled_adapters(ret.transpose(1, 2)).transpose(1, 2) + return ret @@ -410,3 +425,85 @@ def forward(self, forward_input: Tuple[torch.Tensor, torch.Tensor]): output = output + res_skip_acts return self.end(output) + + +class ConditionalLayerNorm(torch.nn.LayerNorm): + """ + This module is used to condition torch.nn.LayerNorm. + If we don't have any conditions, this will be a normal LayerNorm. 
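+    When "layernorm" is in `condition_types`, the affine parameters of the underlying
+    LayerNorm are disabled and the scale and bias are instead predicted from the
+    conditioning embedding by two linear layers. Those layers are initialized so that the
+    predicted scale is 1 and the predicted bias is 0, i.e. the module starts out
+    equivalent to an unconditioned LayerNorm.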
+ """ + + def __init__(self, hidden_dim, condition_dim=None, condition_types=[]): + check_support_condition_types(condition_types) + self.condition = "layernorm" in condition_types + super().__init__(hidden_dim, elementwise_affine=not self.condition) + + if self.condition: + self.cond_weight = torch.nn.Linear(condition_dim, hidden_dim) + self.cond_bias = torch.nn.Linear(condition_dim, hidden_dim) + self.init_parameters() + + def init_parameters(self): + torch.nn.init.constant_(self.cond_weight.weight, 0.0) + torch.nn.init.constant_(self.cond_weight.bias, 1.0) + torch.nn.init.constant_(self.cond_bias.weight, 0.0) + torch.nn.init.constant_(self.cond_bias.bias, 0.0) + + def forward(self, inputs, conditioning=None): + inputs = super().forward(inputs) + + # Normalize along channel + if self.condition: + if conditioning is None: + raise ValueError( + 'You should add additional data types as conditions e.g. speaker id or reference audio' + ) + inputs = inputs * self.cond_weight(conditioning) + inputs = inputs + self.cond_bias(conditioning) + + return inputs + + +class ConditionalInput(torch.nn.Module): + """ + This module is used to condition any model inputs. + If we don't have any conditions, this will be a normal pass. + """ + + def __init__(self, hidden_dim, condition_dim, condition_types=[]): + check_support_condition_types(condition_types) + super().__init__() + self.support_types = ["add", "concat"] + self.condition_types = [tp for tp in condition_types if tp in self.support_types] + self.hidden_dim = hidden_dim + self.condition_dim = condition_dim + + if "add" in self.condition_types and condition_dim != hidden_dim: + self.add_proj = torch.nn.Linear(condition_dim, hidden_dim) + + if "concat" in self.condition_types: + self.concat_proj = torch.nn.Linear(hidden_dim + condition_dim, hidden_dim) + + def forward(self, inputs, conditioning=None): + """ + Args: + inputs (torch.tensor): B x T x C tensor. + conditioning (torch.tensor): B x 1 x C conditioning embedding. + """ + if len(self.condition_types) > 0: + if conditioning is None: + raise ValueError( + 'You should add additional data types as conditions e.g. 
speaker id or reference audio' + ) + + if "add" in self.condition_types: + if self.condition_dim != self.hidden_dim: + conditioning = self.add_proj(conditioning) + inputs = inputs + conditioning + + if "concat" in self.condition_types: + conditioning = conditionting.repeat(1, inputs.shape[1], 1) + inputs = torch.cat([inputs, conditioning]) + inputs = self.concat_proj(inputs) + + return inputs diff --git a/nemo/collections/tts/modules/transformer.py b/nemo/collections/tts/modules/transformer.py index 0d2f8f417f4e..3dda8c522dcc 100644 --- a/nemo/collections/tts/modules/transformer.py +++ b/nemo/collections/tts/modules/transformer.py @@ -17,9 +17,9 @@ import torch.nn as nn import torch.nn.functional as F -from nemo.collections.tts.modules.submodules import LinearNorm +from nemo.collections.tts.modules.submodules import ConditionalInput, ConditionalLayerNorm, LinearNorm from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths -from nemo.core.classes import NeuralModule, typecheck +from nemo.core.classes import NeuralModule, adapter_mixins, typecheck from nemo.core.neural_types.elements import EncodedRepresentation, LengthsType, MaskType, TokenIndex from nemo.core.neural_types.neural_type import NeuralType @@ -51,7 +51,7 @@ def forward(self, pos_seq, bsz=None): class PositionwiseConvFF(nn.Module): - def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False): + def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False, condition_types=[]): super(PositionwiseConvFF, self).__init__() self.d_model = d_model @@ -68,17 +68,17 @@ def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False): nn.Conv1d(d_inner, d_model, kernel_size[1], 1, (kernel_size[1] // 2)), nn.Dropout(dropout), ) - self.layer_norm = nn.LayerNorm(d_model) + self.layer_norm = ConditionalLayerNorm(d_model, condition_dim=d_model, condition_types=condition_types) self.pre_lnorm = pre_lnorm - def forward(self, inp): - return self._forward(inp) + def forward(self, inp, conditioning=None): + return self._forward(inp, conditioning) - def _forward(self, inp): + def _forward(self, inp, conditioning=None): if self.pre_lnorm: # layer normalization + positionwise feed-forward core_out = inp.transpose(1, 2) - core_out = self.CoreNet(self.layer_norm(core_out).to(inp.dtype)) + core_out = self.CoreNet(self.layer_norm(core_out, conditioning).to(inp.dtype)) core_out = core_out.transpose(1, 2) # residual connection @@ -90,13 +90,13 @@ def _forward(self, inp): core_out = core_out.transpose(1, 2) # residual connection + layer normalization - output = self.layer_norm(inp + core_out).to(inp.dtype) + output = self.layer_norm(inp + core_out, conditioning).to(inp.dtype) return output class MultiHeadAttn(nn.Module): - def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, pre_lnorm=False): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, pre_lnorm=False, condition_types=[]): super(MultiHeadAttn, self).__init__() self.n_head = n_head @@ -109,17 +109,17 @@ def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, pre_lnorm=Fals self.drop = nn.Dropout(dropout) self.dropatt = nn.Dropout(dropatt) self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) - self.layer_norm = nn.LayerNorm(d_model) + self.layer_norm = ConditionalLayerNorm(d_model, condition_dim=d_model, condition_types=condition_types) - def forward(self, inp, attn_mask=None): - return self._forward(inp, attn_mask) + def forward(self, inp, attn_mask=None, conditioning=None): + return 
self._forward(inp, attn_mask, conditioning) - def _forward(self, inp, attn_mask=None): + def _forward(self, inp, attn_mask=None, conditioning=None): residual = inp if self.pre_lnorm: # layer normalization - inp = self.layer_norm(inp) + inp = self.layer_norm(inp, conditioning) n_head, d_head = self.n_head, self.d_head @@ -157,29 +157,47 @@ def _forward(self, inp, attn_mask=None): output = residual + attn_out else: # residual connection + layer normalization - output = self.layer_norm(residual + attn_out) + output = self.layer_norm(residual + attn_out, conditioning) return output -class TransformerLayer(nn.Module): - def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, **kwargs): +class TransformerLayer(nn.Module, adapter_mixins.AdapterModuleMixin): + def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, condition_types=[], **kwargs): super(TransformerLayer, self).__init__() - self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) - self.pos_ff = PositionwiseConvFF(d_model, d_inner, kernel_size, dropout, pre_lnorm=kwargs.get('pre_lnorm')) + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, condition_types=condition_types, **kwargs) + self.pos_ff = PositionwiseConvFF( + d_model, d_inner, kernel_size, dropout, pre_lnorm=kwargs.get('pre_lnorm'), condition_types=condition_types + ) - def forward(self, dec_inp, mask=None): - output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2)) + def forward(self, dec_inp, mask=None, conditioning=None): + output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2), conditioning=conditioning) output *= mask - output = self.pos_ff(output) + output = self.pos_ff(output, conditioning) output *= mask + + if self.is_adapter_available(): + output = self.forward_enabled_adapters(output) + output *= mask + return output class FFTransformerDecoder(NeuralModule): def __init__( - self, n_layer, n_head, d_model, d_head, d_inner, kernel_size, dropout, dropatt, dropemb=0.0, pre_lnorm=False + self, + n_layer, + n_head, + d_model, + d_head, + d_inner, + kernel_size, + dropout, + dropatt, + dropemb=0.0, + pre_lnorm=False, + condition_types=[], ): super(FFTransformerDecoder, self).__init__() self.d_model = d_model @@ -189,11 +207,20 @@ def __init__( self.pos_emb = PositionalEmbedding(self.d_model) self.drop = nn.Dropout(dropemb) self.layers = nn.ModuleList() + self.cond_input = ConditionalInput(d_model, d_model, condition_types) for _ in range(n_layer): self.layers.append( TransformerLayer( - n_head, d_model, d_head, d_inner, kernel_size, dropout, dropatt=dropatt, pre_lnorm=pre_lnorm + n_head, + d_model, + d_head, + d_inner, + kernel_size, + dropout, + dropatt=dropatt, + pre_lnorm=pre_lnorm, + condition_types=condition_types, ) ) @@ -213,16 +240,18 @@ def output_types(self): } @typecheck() - def forward(self, input, seq_lens, conditioning=0): + def forward(self, input, seq_lens, conditioning=None): return self._forward(input, mask_from_lens(seq_lens).unsqueeze(2), conditioning) def _forward(self, inp, mask, conditioning): pos_seq = torch.arange(inp.size(1), device=inp.device).to(inp.dtype) pos_emb = self.pos_emb(pos_seq) * mask - out = self.drop(inp + pos_emb + conditioning) + inp += pos_emb + inp = self.cond_input(inp, conditioning) + out = self.drop(inp) for layer in self.layers: - out = layer(out, mask=mask) + out = layer(out, mask=mask, conditioning=conditioning) # out = self.drop(out) return out, mask @@ -244,9 +273,20 @@ def __init__( n_embed=None, d_embed=None, padding_idx=0, + 
condition_types=[], ): super(FFTransformerEncoder, self).__init__( - n_layer, n_head, d_model, d_head, d_inner, kernel_size, dropout, dropatt, dropemb, pre_lnorm + n_layer, + n_head, + d_model, + d_head, + d_inner, + kernel_size, + dropout, + dropatt, + dropemb, + pre_lnorm, + condition_types, ) self.padding_idx = padding_idx diff --git a/nemo/collections/tts/parts/mixins/__init__.py b/nemo/collections/tts/parts/mixins/__init__.py new file mode 100644 index 000000000000..bca487f8d96c --- /dev/null +++ b/nemo/collections/tts/parts/mixins/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo.collections.tts.parts.mixins.fastpitch_adapter_mixins import FastPitchAdapterModelMixin diff --git a/nemo/collections/tts/parts/mixins/fastpitch_adapter_mixins.py b/nemo/collections/tts/parts/mixins/fastpitch_adapter_mixins.py new file mode 100644 index 000000000000..375cf1fe51ee --- /dev/null +++ b/nemo/collections/tts/parts/mixins/fastpitch_adapter_mixins.py @@ -0,0 +1,368 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +from omegaconf import DictConfig, open_dict + +from nemo.core.classes.mixins.adapter_mixins import AdapterModelPTMixin, AdapterModuleMixin +from nemo.utils import logging, logging_mode + + +class FastPitchAdapterModelMixin(AdapterModelPTMixin): + """ FastPitch Adapter Mixin that can augment any Encoder module with Adapter module support. + This mixin class should be used only with a top level ModelPT subclass, that includes an `encoder` submodule. + This mixin class adds several utility methods which are propagated to the `encoder`. + An Adapter module is any Pytorch nn.Module that possess a few properties : + - It's input and output dimension are the same, while the hidden dimension need not be the same. + - The final layer of the Adapter module is zero-initialized, so that the residual connection to the adapter + yields the original output. + This mixin adds the following instance variables to the class this inherits it: + - `adapter_layer`: A torch.nn.ModuleDict(), whose keys are the names of the adapter (globally unique), + and values are the Adapter nn.Module(). + - `adapter_cfg`: A OmegaConf DictConfig object that holds the config of the adapters that are initialized. 
+ - `adapter_global_cfg_key`: A str representing a key in the model config that can be provided by the user. + The value resolves to `global_cfg`, and can be overridden via `model.cfg.adapters.global_cfg.*`. + **Note**: This module **is** responsible for maintaining its config. At the ModelPT level, it will access and + write Adapter config information to `self.cfg.adapters`. + """ + + def setup_adapters(self): + """ + Utility method that is called in the ASR ModelPT-implementation constructor, so as to restore any + adapters that were previously added. + This method should be called just once at constructor time. + """ + supports_adapters = False + + # At least the encoder must extend AdapterModuleMixin + if hasattr(self.fastpitch, 'encoder') and isinstance(self.fastpitch.encoder, AdapterModuleMixin): + supports_adapters |= True + + if hasattr(self.fastpitch, 'decoder') and isinstance(self.fastpitch.decoder, AdapterModuleMixin): + supports_adapters |= True + + if hasattr(self.fastpitch, 'duration_predictor') and isinstance( + self.fastpitch.duration_predictor, AdapterModuleMixin + ): + supports_adapters |= True + + if hasattr(self.fastpitch, 'pitch_predictor') and isinstance( + self.fastpitch.pitch_predictor, AdapterModuleMixin + ): + supports_adapters |= True + + if hasattr(self.fastpitch, 'aligner') and isinstance(self.fastpitch.aligner, AdapterModuleMixin): + supports_adapters |= True + + # If adapters are supported, setup the adapter config + any modules (pre-existing adapter modules) + if supports_adapters: + super().setup_adapters() + + def add_adapter(self, name: str, cfg: DictConfig): + """ + Add an Adapter module to this model. + Args: + name: A globally unique name for the adapter. Will be used to access, enable and disable adapters. + cfg: A DictConfig that contains at the bare minimum `__target__` to instantiate a new Adapter module. + """ + # setup the config for adapters + super().add_adapter(name=name, cfg=cfg) + + # Resolve module name and adapter name + module_name, _ = self.resolve_adapter_module_name_(name) + + # Use + as a splitter, in order to share one name across multiple modules + if '+' in module_name: + module_names = module_name.split('+') + else: + module_names = [module_name] + + with open_dict(self.cfg): + for module_name in module_names: + # Check if encoder adapters should be added + if module_name == 'encoder': + # Dispatch the call to the encoder. + self.fastpitch.encoder.add_adapter(name=name, cfg=cfg) + + # Check if decoder adapters should be added + if module_name in ('', 'decoder'): + # Dispatch call to the decoder. (default use decoder) + self.fastpitch.decoder.add_adapter(name=name, cfg=cfg) + + # Check if duration_predictor adapters should be added + if module_name in ('', 'duration_predictor'): + # Dispatch call to the duration_predictor. (default use duration_predictor) + self.fastpitch.duration_predictor.add_adapter(name=name, cfg=cfg) + + # Check if pitch_predictor adapters should be added + if module_name in ('', 'pitch_predictor'): + # Dispatch call to the pitch_predictor. (default use pitch_predictor) + self.fastpitch.pitch_predictor.add_adapter(name=name, cfg=cfg) + + # Check if aligner adapters should be added + if module_name in ('', 'aligner'): + # Dispatch call to the aligner. (default use aligner) + self.fastpitch.aligner.add_adapter(name=name, cfg=cfg) + + def is_adapter_available(self) -> bool: + """ + Checks if any Adapter module has been instantiated. + Returns: + bool, determining if any Adapter module has been instantiated. 
Returns true even if the adapters are + enabled or disabled, false only if no adapters exist. + """ + config_contains_adapter = super().is_adapter_available() + + # Forward the method call to the individual modules + if hasattr(self.fastpitch, 'encoder') and isinstance(self.fastpitch.encoder, AdapterModuleMixin): + config_contains_adapter |= self.fastpitch.encoder.is_adapter_available() + + if hasattr(self.fastpitch, 'decoder') and isinstance(self.fastpitch.decoder, AdapterModuleMixin): + config_contains_adapter |= self.fastpitch.decoder.is_adapter_available() + + if hasattr(self.fastpitch, 'duration_predictor') and isinstance( + self.fastpitch.duration_predictor, AdapterModuleMixin + ): + config_contains_adapter |= self.fastpitch.duration_predictor.is_adapter_available() + + if hasattr(self.fastpitch, 'pitch_predictor') and isinstance( + self.fastpitch.pitch_predictor, AdapterModuleMixin + ): + config_contains_adapter |= self.fastpitch.pitch_predictor.is_adapter_available() + + if hasattr(self.fastpitch, 'aligner') and isinstance(self.fastpitch.aligner, AdapterModuleMixin): + config_contains_adapter |= self.fastpitch.aligner.is_adapter_available() + + return config_contains_adapter + + def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): + """ + Updated the internal adapter config, determining if an adapter (or all adapters) are either + enabled or disabled. + A common user pattern would be to disable all adapters (either after adding them, or restoring a model + with pre-existing adapters) and then simply enable one of the adapters. + .. code:: + model.set_enabled_adapters(enabled=False) + model.set_enabled_adapters(name=, enabled=True) + Args: + name: Optional str. If a str name is given, the config will be updated to the value of `enabled`. + If no name is given, then all adapters will be enabled/disabled. + enabled: Bool, determines if the adapter(s) will be enabled/disabled. + """ + super().set_enabled_adapters(name=name, enabled=enabled) + + # Resolve the module name and adapter name + if name is not None: + module_name, _ = self.resolve_adapter_module_name_(name) + else: + module_name = None + + # Use + as a splitter, in order to share one name across multiple modules + if module_name is not None and '+' in module_name: + module_names = module_name.split('+') + else: + module_names = [module_name] + + for module_name in module_names: + # Check if encoder adapters should be used + # Dispatch the call to the encoder. + if name is None or module_name == 'encoder': + if self.fastpitch.encoder.is_adapter_available(): + self.fastpitch.encoder.set_enabled_adapters(name=name, enabled=enabled) + + # Dispatch the call to the decoder. + if name is None or module_name in ('', 'decoder'): + if self.fastpitch.decoder.is_adapter_available(): + self.fastpitch.decoder.set_enabled_adapters(name=name, enabled=enabled) + + # Dispatch the call to the duration_predictor. + if name is None or module_name in ('', 'duration_predictor'): + if self.fastpitch.duration_predictor.is_adapter_available(): + self.fastpitch.duration_predictor.set_enabled_adapters(name=name, enabled=enabled) + + # Dispatch the call to the pitch_predictor. + if name is None or module_name in ('', 'pitch_predictor'): + if self.fastpitch.pitch_predictor.is_adapter_available(): + self.fastpitch.pitch_predictor.set_enabled_adapters(name=name, enabled=enabled) + + # Dispatch the call to the aligner. 
+ if name is None or module_name in ('', 'aligner'): + if self.fastpitch.aligner.is_adapter_available(): + self.fastpitch.aligner.set_enabled_adapters(name=name, enabled=enabled) + + def get_enabled_adapters(self) -> List[str]: + """ + Returns a list of all enabled adapters. + Returns: + A list of str names of each enabled adapter(s). + """ + enabled_adapters = super().get_enabled_adapters() + + # Check if encoder adapters should be used or are enabled + if hasattr(self.fastpitch, 'encoder') and isinstance(self.fastpitch.encoder, AdapterModuleMixin): + enabled_adapters.extend(self.fastpitch.encoder.get_enabled_adapters()) + + if hasattr(self.fastpitch, 'decoder') and isinstance(self.fastpitch.decoder, AdapterModuleMixin): + enabled_adapters.extend(self.fastpitch.decoder.get_enabled_adapters()) + + if hasattr(self.fastpitch, 'duration_predictor') and isinstance( + self.fastpitch.duration_predictor, AdapterModuleMixin + ): + enabled_adapters.extend(self.fastpitch.duration_predictor.get_enabled_adapters()) + + if hasattr(self.fastpitch, 'pitch_predictor') and isinstance( + self.fastpitch.pitch_predictor, AdapterModuleMixin + ): + enabled_adapters.extend(self.fastpitch.pitch_predictor.get_enabled_adapters()) + + if hasattr(self.fastpitch, 'aligner') and isinstance(self.fastpitch.aligner, AdapterModuleMixin): + enabled_adapters.extend(self.fastpitch.aligner.get_enabled_adapters()) + + enabled_adapters = list(sorted(list(set(enabled_adapters)))) + + return enabled_adapters + + def check_valid_model_with_adapter_support_(self): + """ + Utility method to test if the subclass of this mixin is an appropriate subclass of ModelPT itself. + """ + # Obtain the global adapter config if possible, otherwise use sensible defaults. + global_cfg = self._get_global_cfg() + + # Test whether the encoder supports adapters + use_encoder_adapter = global_cfg.get('check_encoder_adapter', False) + if use_encoder_adapter: + if not hasattr(self.fastpitch, 'encoder'): + logging.warning( + "Cannot add adapter to this object as it does not have an `fastpitch.encoder` sub-module!", + mode=logging_mode.ONCE, + ) + + if hasattr(self.fastpitch, 'encoder') and not isinstance(self.fastpitch.encoder, AdapterModuleMixin): + logging.warning( + f'{self.fastpitch.encoder.__class__.__name__} does not implement `AdapterModuleMixin`', + mode=logging_mode.ONCE, + ) + + # Test whether the decoder supports adapters + use_decoder_adapter = global_cfg.get('check_decoder_adapter', True) + if use_decoder_adapter: + if not hasattr(self.fastpitch, 'decoder'): + logging.warning( + "Cannot add adapter to this object as it does not have an `fastpitch.decoder` sub-module!", + mode=logging_mode.ONCE, + ) + + if hasattr(self.fastpitch, 'decoder') and not isinstance(self.fastpitch.decoder, AdapterModuleMixin): + logging.warning( + f'{self.fastpitch.decoder.__class__.__name__} does not implement `AdapterModuleMixin`', + mode=logging_mode.ONCE, + ) + + # Test whether the duration_predictor supports adapters + use_duration_predictor_adapter = global_cfg.get('check_duration_predictor_adapter', True) + if use_duration_predictor_adapter: + if not hasattr(self.fastpitch, 'duration_predictor'): + logging.warning( + "Cannot add adapter to this object as it does not have an `fastpitch.duration_predictor` sub-module!", + mode=logging_mode.ONCE, + ) + + if hasattr(self.fastpitch, 'duration_predictor') and not isinstance( + self.fastpitch.duration_predictor, AdapterModuleMixin + ): + logging.warning( + f'{self.fastpitch.duration_predictor.__class__.__name__} does 
not implement `AdapterModuleMixin`', + mode=logging_mode.ONCE, + ) + + # Test whether the pitch_predictor supports adapters + use_pitch_predictor_adapter = global_cfg.get('check_pitch_predictor_adapter', True) + if use_pitch_predictor_adapter: + if not hasattr(self.fastpitch, 'pitch_predictor'): + logging.warning( + "Cannot add adapter to this object as it does not have an `fastpitch.pitch_predictor` sub-module!", + mode=logging_mode.ONCE, + ) + + if hasattr(self.fastpitch, 'pitch_predictor') and not isinstance( + self.fastpitch.pitch_predictor, AdapterModuleMixin + ): + logging.warning( + f'{self.fastpitch.pitch_predictor.__class__.__name__} does not implement `AdapterModuleMixin`', + mode=logging_mode.ONCE, + ) + + # Test whether the aligner supports adapters + use_aligner_adapter = global_cfg.get('check_aligner_adapter', True) + if use_aligner_adapter: + if not hasattr(self.fastpitch, 'aligner'): + logging.warning( + "Cannot add adapter to this object as it does not have an `fastpitch.aligner` sub-module!", + mode=logging_mode.ONCE, + ) + + if hasattr(self.fastpitch, 'aligner') and not isinstance(self.fastpitch.aligner, AdapterModuleMixin): + logging.warning( + f'{self.fastpitch.aligner.__class__.__name__} does not implement `AdapterModuleMixin`', + mode=logging_mode.ONCE, + ) + + def resolve_adapter_module_name_(self, name: str) -> (str, str): + """ + Utility method to resolve a given global/module adapter name to its components. + Always returns a tuple representing (module_name, adapter_name). ":" is used as the + delimiter for denoting the module name vs the adapter name. + Will attempt to also resolve a given adapter_name alone back to (module_name, adapter_name) + if the metadata config exists for access. + Args: + name: A global adapter, or a module adapter name (with structure module_name:adapter_name). + Returns: + A tuple representing (module_name, adapter_name). If a global adapter is provided, + module_name is set to ''. + """ + module_name, adapter_name = super().resolve_adapter_module_name_(name) + + # Use + as a splitter, in order to share one name across multiple modules + if '+' in module_name: + module_names = module_name.split('+') + else: + module_names = [module_name] + + # resolve name and module only for valid modules + valid_module_names = self.adapter_module_names + + for mod_name in module_names: + if mod_name not in valid_module_names: + raise ValueError(f"Provided module name `{mod_name}` is not in valid list : {valid_module_names}") + + return (module_name, adapter_name) + + def _get_global_cfg(self): + """ + Utility method, to either extract or construct the global config inside adapters config. + """ + global_config = DictConfig({}) + if 'adapters' in self.cfg and self.adapter_global_cfg_key in self.cfg.adapters: + global_config = self.adapter_cfg[self.adapter_global_cfg_key] + return global_config + + @property + def adapter_module_names(self) -> List[str]: + module_names = super().adapter_module_names # "Default" adapter module: '' + module_names.extend( + ['encoder', 'decoder', 'duration_predictor', 'pitch_predictor', 'aligner'] + ) # Add support for `encoder` and `decoder` modules + return module_names diff --git a/tests/collections/tts/modules/test_submodules.py b/tests/collections/tts/modules/test_submodules.py new file mode 100644 index 000000000000..5ee894398739 --- /dev/null +++ b/tests/collections/tts/modules/test_submodules.py @@ -0,0 +1,48 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch + +from nemo.collections.tts.modules import submodules + + +@pytest.mark.unit +def test_conditional_layer_norm(): + + # NLP Example + batch, sentence_length, embedding_dim = 20, 5, 10 + embedding = torch.randn(batch, sentence_length, embedding_dim) + ln = torch.nn.LayerNorm(embedding_dim) + cln = submodules.ConditionalLayerNorm(embedding_dim) + assert torch.all(ln(embedding) == cln(embedding)) + + weight = torch.nn.Parameter(torch.randn(embedding_dim)) + bias = torch.nn.Parameter(torch.randn(embedding_dim)) + ln.weight, ln.bias = weight, bias + cln.weight, cln.bias = weight, bias + assert torch.all(ln(embedding) == cln(embedding)) # Simulate trained weights + + # Image Example + N, C, H, W = 20, 5, 10, 10 + image = torch.randn(N, C, H, W) + ln = torch.nn.LayerNorm([C, H, W]) + cln = submodules.ConditionalLayerNorm([C, H, W]) + assert torch.all(ln(image) == cln(image)) + + weight = torch.nn.Parameter(torch.randn(C, H, W)) + bias = torch.nn.Parameter(torch.randn(C, H, W)) + ln.weight, ln.bias = weight, bias + cln.weight, cln.bias = weight, bias + assert torch.all(ln(image) == cln(image)) # Simulate trained weights From 14e9668bed65df34cc762d2f7322b38d8ae0d919 Mon Sep 17 00:00:00 2001 From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Date: Mon, 17 Apr 2023 16:12:45 -0700 Subject: [PATCH 02/23] [TTS] whitelist broken path fix. (#6412) * [TTS] whitelist broken path fix. Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- scripts/dataset_processing/tts/ljspeech/get_data.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/dataset_processing/tts/ljspeech/get_data.py b/scripts/dataset_processing/tts/ljspeech/get_data.py index 733f9b76b354..7c28fb8ef903 100644 --- a/scripts/dataset_processing/tts/ljspeech/get_data.py +++ b/scripts/dataset_processing/tts/ljspeech/get_data.py @@ -27,7 +27,11 @@ def get_args(): parser = argparse.ArgumentParser(description='Download LJSpeech and create manifests with predefined split') parser.add_argument("--data-root", required=True, type=Path) - parser.add_argument('--whitelist-path', type=str, default="lj_speech.tsv") + parser.add_argument( + '--whitelist-path', + type=str, + default="lj_speech.tsv extracted from the readme file in the dataset. 
You can also download the file from https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv", + ) args = parser.parse_args() return args @@ -56,7 +60,7 @@ def __extract_file(filepath, data_dir): def __process_data(data_root, whitelist_path): if whitelist_path is None: wget.download( - "https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/en/whitelist_lj_speech.tsv", + "https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv", out=str(data_root), ) whitelist_path = data_root / "lj_speech.tsv" From 536ee625a50aa657b4f32e9f460a56ea4c79c9c6 Mon Sep 17 00:00:00 2001 From: Cheng-Ping Hsieh <37269846+hsiehjackson@users.noreply.github.com> Date: Tue, 18 Apr 2023 10:19:15 +0800 Subject: [PATCH 03/23] [TTS] FastPitch speaker encoder (#6417) * Add initial codes Signed-off-by: hsiehjackson * Remove wemb Signed-off-by: hsiehjackson * Fix import Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Restore aligner loss Signed-off-by: hsiehjackson * Add ConditionalInput Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix error and support pre-trained config Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Follow comments Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Rename config Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Change copyright and random weight test Signed-off-by: hsiehjackson * Add initial codes Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Fix import error Signed-off-by: hsiehjackson * Add initial codes Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Fix dataset error Signed-off-by: hsiehjackson * Remove reference speaker embedding Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Remove SV encoder Signed-off-by: hsiehjackson * Follow comments Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Fix length type Signed-off-by: hsiehjackson * Fix append Signed-off-by: hsiehjackson * Move error msg Signed-off-by: hsiehjackson * Add look-up into speaker encoder Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Add valueerror msg Signed-off-by: hsiehjackson * Move lookup Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Remove unused Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: hsiehjackson * Fix error Signed-off-by: hsiehjackson * Rebase and Fix error 
Signed-off-by: hsiehjackson * Fix spk encoder Signed-off-by: hsiehjackson * Rename n_speakers Signed-off-by: hsiehjackson * Follow comments Signed-off-by: hsiehjackson * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix n_speakers None error Signed-off-by: hsiehjackson --------- Signed-off-by: hsiehjackson Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conf/fastpitch_align_44100_adapter.yaml | 26 +- nemo/collections/tts/data/dataset.py | 47 ++++ nemo/collections/tts/models/fastpitch.py | 78 +++++- nemo/collections/tts/modules/fastpitch.py | 53 +++- nemo/collections/tts/modules/submodules.py | 251 +++++++++++++++++- nemo/collections/tts/torch/tts_data_types.py | 5 + 6 files changed, 432 insertions(+), 28 deletions(-) diff --git a/examples/tts/conf/fastpitch_align_44100_adapter.yaml b/examples/tts/conf/fastpitch_align_44100_adapter.yaml index 032ab1da501f..bac6a64b06e9 100644 --- a/examples/tts/conf/fastpitch_align_44100_adapter.yaml +++ b/examples/tts/conf/fastpitch_align_44100_adapter.yaml @@ -7,7 +7,7 @@ name: FastPitch train_dataset: ??? validation_datasets: ??? sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id"] +sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id", "reference_audio"] # Default values from librosa.pyin pitch_fmin: 65.40639132514966 @@ -35,10 +35,8 @@ model: learn_alignment: true bin_loss_warmup_epochs: 100 - n_speakers: 1 max_token_duration: 75 symbols_embedding_dim: 384 - speaker_embedding_dim: 384 pitch_embedding_kernel_size: 3 pitch_fmin: ${pitch_fmin} @@ -248,6 +246,28 @@ model: n_layers: 2 condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] + speaker_encoder: + _target_: nemo.collections.tts.modules.submodules.SpeakerEncoder + lookup_module: + _target_: nemo.collections.tts.modules.submodules.SpeakerLookupTable + n_speakers: ??? + embedding_dim: ${model.symbols_embedding_dim} + gst_module: + _target_: nemo.collections.tts.modules.submodules.GlobalStyleToken + gst_size: ${model.symbols_embedding_dim} + n_style_token: 10 + n_style_attn_head: 4 + reference_encoder: + _target_: nemo.collections.tts.modules.submodules.ReferenceEncoder + n_mels: ${model.n_mel_channels} + cnn_filters: [32, 32, 64, 64, 128, 128] + dropout: 0.2 + gru_hidden: ${model.symbols_embedding_dim} + kernel_size: 3 + stride: 2 + padding: 1 + bias: true + optim: name: adamw lr: 1e-3 diff --git a/nemo/collections/tts/data/dataset.py b/nemo/collections/tts/data/dataset.py index af4df1e58668..6bb41d341b31 100644 --- a/nemo/collections/tts/data/dataset.py +++ b/nemo/collections/tts/data/dataset.py @@ -50,6 +50,7 @@ LogMel, P_voiced, Pitch, + ReferenceAudio, SpeakerID, TTSDataType, Voiced_mask, @@ -483,6 +484,13 @@ def add_energy(self, **kwargs): def add_speaker_id(self, **kwargs): pass + def add_reference_audio(self, **kwargs): + assert SpeakerID in self.sup_data_types, "Please add speaker_id in sup_data_types." 
+ """Add a mapping for each speaker to their manifest indexes""" + self.speaker_to_index_map = defaultdict(set) + for i, d in enumerate(self.data): + self.speaker_to_index_map[d['speaker_id']].add(i) + def get_spec(self, audio): with torch.cuda.amp.autocast(enabled=False): spec = self.stft(audio) @@ -522,6 +530,12 @@ def _pad_wav_to_multiple(self, wav): ) return wav + # Random sample a reference index from the same speaker + def sample_reference_index(self, speaker_id): + reference_pool = self.speaker_to_index_map[speaker_id] + reference_index = random.sample(reference_pool, 1)[0] + return reference_index + def __getitem__(self, index): sample = self.data[index] @@ -683,6 +697,19 @@ def __getitem__(self, index): if SpeakerID in self.sup_data_types_set: speaker_id = torch.tensor(sample["speaker_id"]).long() + reference_audio, reference_audio_length = None, None + if ReferenceAudio in self.sup_data_types_set: + reference_index = self.sample_reference_index(sample["speaker_id"]) + reference_audio = self.featurizer.process( + self.data[reference_index]["audio_filepath"], + trim=self.trim, + trim_ref=self.trim_ref, + trim_top_db=self.trim_top_db, + trim_frame_length=self.trim_frame_length, + trim_hop_length=self.trim_hop_length, + ) + reference_audio_length = torch.tensor(reference_audio.shape[0]).long() + return ( audio, audio_length, @@ -700,6 +727,8 @@ def __getitem__(self, index): voiced_mask, p_voiced, audio_shifted, + reference_audio, + reference_audio_length, ) def __len__(self): @@ -733,6 +762,8 @@ def general_collate_fn(self, batch): voiced_masks, p_voiceds, _, + _, + reference_audio_lengths, ) = zip(*batch) max_audio_len = max(audio_lengths).item() @@ -741,6 +772,9 @@ def general_collate_fn(self, batch): max_durations_len = max([len(i) for i in durations_list]) if Durations in self.sup_data_types_set else None max_pitches_len = max(pitches_lengths).item() if Pitch in self.sup_data_types_set else None max_energies_len = max(energies_lengths).item() if Energy in self.sup_data_types_set else None + max_reference_audio_len = ( + max(reference_audio_lengths).item() if ReferenceAudio in self.sup_data_types_set else None + ) if LogMel in self.sup_data_types_set: log_mel_pad = torch.finfo(batch[0][4].dtype).tiny @@ -765,6 +799,7 @@ def general_collate_fn(self, batch): voiced_masks, p_voiceds, audios_shifted, + reference_audios, ) = ( [], [], @@ -776,6 +811,7 @@ def general_collate_fn(self, batch): [], [], [], + [], ) for i, sample_tuple in enumerate(batch): @@ -796,6 +832,8 @@ def general_collate_fn(self, batch): voiced_mask, p_voiced, audio_shifted, + reference_audio, + reference_audios_length, ) = sample_tuple audio = general_padding(audio, audio_len.item(), max_audio_len) @@ -834,6 +872,11 @@ def general_collate_fn(self, batch): if SpeakerID in self.sup_data_types_set: speaker_ids.append(speaker_id) + if ReferenceAudio in self.sup_data_types_set: + reference_audios.append( + general_padding(reference_audio, reference_audios_length.item(), max_reference_audio_len) + ) + data_dict = { "audio": torch.stack(audios), "audio_lens": torch.stack(audio_lengths), @@ -851,6 +894,10 @@ def general_collate_fn(self, batch): "voiced_mask": torch.stack(voiced_masks) if Voiced_mask in self.sup_data_types_set else None, "p_voiced": torch.stack(p_voiceds) if P_voiced in self.sup_data_types_set else None, "audio_shifted": torch.stack(audios_shifted) if audio_shifted is not None else None, + "reference_audio": torch.stack(reference_audios) if ReferenceAudio in self.sup_data_types_set else None, + 
"reference_audio_lens": torch.stack(reference_audio_lengths) + if ReferenceAudio in self.sup_data_types_set + else None, } return data_dict diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 76eaae2f9ba2..5502e69a3111 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -139,14 +139,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): output_fft = instantiate(self._cfg.output_fft) duration_predictor = instantiate(self._cfg.duration_predictor) pitch_predictor = instantiate(self._cfg.pitch_predictor) + speaker_encoder = instantiate(self._cfg.get("speaker_encoder", None)) energy_embedding_kernel_size = cfg.get("energy_embedding_kernel_size", 0) energy_predictor = instantiate(self._cfg.get("energy_predictor", None)) # [TODO] may remove if we change the pre-trained config + # cfg: condition_types = [ "add" ] + n_speakers = cfg.get("n_speakers", 0) speaker_emb_condition_prosody = cfg.get("speaker_emb_condition_prosody", False) speaker_emb_condition_decoder = cfg.get("speaker_emb_condition_decoder", False) speaker_emb_condition_aligner = cfg.get("speaker_emb_condition_aligner", False) - if cfg.n_speakers > 1: + if n_speakers > 1 and "add" not in input_fft.cond_input.condition_types: input_fft.cond_input.condition_types.append("add") if speaker_emb_condition_prosody: duration_predictor.cond_input.condition_types.append("add") @@ -163,7 +166,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): pitch_predictor, energy_predictor, self.aligner, - cfg.n_speakers, + speaker_encoder, + n_speakers, cfg.symbols_embedding_dim, cfg.pitch_embedding_kernel_size, energy_embedding_kernel_size, @@ -305,6 +309,9 @@ def parse(self, str_input: str, normalize=True) -> torch.tensor: "attn_prior": NeuralType(('B', 'T_spec', 'T_text'), ProbsType(), optional=True), "mel_lens": NeuralType(('B'), LengthsType(), optional=True), "input_lens": NeuralType(('B'), LengthsType(), optional=True), + # reference_* data is used for multi-speaker FastPitch training + "reference_spec": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType(), optional=True), + "reference_spec_lens": NeuralType(('B'), LengthsType(), optional=True), } ) def forward( @@ -320,6 +327,8 @@ def forward( attn_prior=None, mel_lens=None, input_lens=None, + reference_spec=None, + reference_spec_lens=None, ): return self.fastpitch( text=text, @@ -332,21 +341,43 @@ def forward( attn_prior=attn_prior, mel_lens=mel_lens, input_lens=input_lens, + reference_spec=reference_spec, + reference_spec_lens=reference_spec_lens, ) @typecheck(output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType())}) def generate_spectrogram( - self, tokens: 'torch.tensor', speaker: Optional[int] = None, pace: float = 1.0 + self, + tokens: 'torch.tensor', + speaker: Optional[int] = None, + pace: float = 1.0, + reference_spec: Optional['torch.tensor'] = None, + reference_spec_lens: Optional['torch.tensor'] = None, ) -> torch.tensor: if self.training: logging.warning("generate_spectrogram() is meant to be called in eval mode.") if isinstance(speaker, int): speaker = torch.tensor([speaker]).to(self.device) - spect, *_ = self(text=tokens, durs=None, pitch=None, speaker=speaker, pace=pace) + spect, *_ = self( + text=tokens, + durs=None, + pitch=None, + speaker=speaker, + pace=pace, + reference_spec=reference_spec, + reference_spec_lens=reference_spec_lens, + ) return spect def training_step(self, batch, batch_idx): - attn_prior, durs, speaker, energy = 
None, None, None, None + attn_prior, durs, speaker, energy, reference_audio, reference_audio_len = ( + None, + None, + None, + None, + None, + None, + ) if self.learn_alignment: assert self.ds_class_name == "TTSDataset", f"Unknown dataset class: {self.ds_class_name}" batch_dict = process_batch(batch, self._train_dl.dataset.sup_data_types_set) @@ -358,10 +389,17 @@ def training_step(self, batch, batch_idx): pitch = batch_dict.get("pitch", None) energy = batch_dict.get("energy", None) speaker = batch_dict.get("speaker_id", None) + reference_audio = batch_dict.get("reference_audio", None) + reference_audio_len = batch_dict.get("reference_audio_lens", None) else: audio, audio_lens, text, text_lens, durs, pitch, speaker = batch mels, spec_len = self.preprocessor(input_signal=audio, length=audio_lens) + reference_spec, reference_spec_len = None, None + if reference_audio is not None: + reference_spec, reference_spec_len = self.preprocessor( + input_signal=reference_audio, length=reference_audio_len + ) ( mels_pred, @@ -384,6 +422,8 @@ def training_step(self, batch, batch_idx): speaker=speaker, pace=1.0, spec=mels if self.learn_alignment else None, + reference_spec=reference_spec, + reference_spec_lens=reference_spec_len, attn_prior=attn_prior, mel_lens=spec_len, input_lens=text_lens, @@ -441,7 +481,14 @@ def training_step(self, batch, batch_idx): return loss def validation_step(self, batch, batch_idx): - attn_prior, durs, speaker, energy = None, None, None, None + attn_prior, durs, speaker, energy, reference_audio, reference_audio_len = ( + None, + None, + None, + None, + None, + None, + ) if self.learn_alignment: assert self.ds_class_name == "TTSDataset", f"Unknown dataset class: {self.ds_class_name}" batch_dict = process_batch(batch, self._train_dl.dataset.sup_data_types_set) @@ -453,10 +500,17 @@ def validation_step(self, batch, batch_idx): pitch = batch_dict.get("pitch", None) energy = batch_dict.get("energy", None) speaker = batch_dict.get("speaker_id", None) + reference_audio = batch_dict.get("reference_audio", None) + reference_audio_len = batch_dict.get("reference_audio_lens", None) else: audio, audio_lens, text, text_lens, durs, pitch, speaker = batch mels, mel_lens = self.preprocessor(input_signal=audio, length=audio_lens) + reference_spec, reference_spec_len = None, None + if reference_audio is not None: + reference_spec, reference_spec_len = self.preprocessor( + input_signal=reference_audio, length=reference_audio_len + ) # Calculate val loss on ground truth durations to better align L2 loss in time (mels_pred, _, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch, energy_pred, energy_tgt,) = self( @@ -467,6 +521,8 @@ def validation_step(self, batch, batch_idx): speaker=speaker, pace=1.0, spec=mels if self.learn_alignment else None, + reference_spec=reference_spec, + reference_spec_lens=reference_spec_len, attn_prior=attn_prior, mel_lens=mel_lens, input_lens=text_lens, @@ -496,13 +552,13 @@ def validation_epoch_end(self, outputs): mel_loss = collect("mel_loss") dur_loss = collect("dur_loss") pitch_loss = collect("pitch_loss") - self.log("val_loss", val_loss) - self.log("val_mel_loss", mel_loss) - self.log("val_dur_loss", dur_loss) - self.log("val_pitch_loss", pitch_loss) + self.log("val_loss", val_loss, sync_dist=True) + self.log("val_mel_loss", mel_loss, sync_dist=True) + self.log("val_dur_loss", dur_loss, sync_dist=True) + self.log("val_pitch_loss", pitch_loss, sync_dist=True) if outputs[0]["energy_loss"] is not None: energy_loss = collect("energy_loss") - 
self.log("val_energy_loss", energy_loss) + self.log("val_energy_loss", energy_loss, sync_dist=True) _, _, _, _, _, spec_target, spec_predict = outputs[0].values() diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py index 83ec35d58693..e2da672cf9c7 100644 --- a/nemo/collections/tts/modules/fastpitch.py +++ b/nemo/collections/tts/modules/fastpitch.py @@ -103,7 +103,6 @@ class TemporalPredictor(NeuralModule): def __init__(self, input_size, filter_size, kernel_size, dropout, n_layers=2, condition_types=[]): super(TemporalPredictor, self).__init__() - self.cond_input = ConditionalInput(input_size, input_size, condition_types) self.layers = torch.nn.ModuleList() for i in range(n_layers): @@ -158,6 +157,7 @@ def __init__( pitch_predictor: NeuralModule, energy_predictor: NeuralModule, aligner: NeuralModule, + speaker_encoder: NeuralModule, n_speakers: int, symbols_embedding_dim: int, pitch_embedding_kernel_size: int, @@ -173,11 +173,15 @@ def __init__( self.pitch_predictor = pitch_predictor self.energy_predictor = energy_predictor self.aligner = aligner + self.speaker_encoder = speaker_encoder self.learn_alignment = aligner is not None self.use_duration_predictor = True self.binarize = False - if n_speakers > 1: + # TODO: combine self.speaker_emb with self.speaker_encoder + # cfg: remove `n_speakers`, create `speaker_encoder.lookup_module` + # state_dict: move `speaker_emb.weight` to `speaker_encoder.lookup_module.table.weight` + if n_speakers > 1 and speaker_encoder is None: self.speaker_emb = torch.nn.Embedding(n_speakers, symbols_embedding_dim) else: self.speaker_emb = None @@ -219,6 +223,8 @@ def input_types(self): "attn_prior": NeuralType(('B', 'T_spec', 'T_text'), ProbsType(), optional=True), "mel_lens": NeuralType(('B'), LengthsType(), optional=True), "input_lens": NeuralType(('B'), LengthsType(), optional=True), + "reference_spec": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType(), optional=True), + "reference_spec_lens": NeuralType(('B'), LengthsType(), optional=True), } @property @@ -238,6 +244,19 @@ def output_types(self): "energy_tgt": NeuralType(('B', 'T_audio'), RegressionValuesType()), } + def get_speaker_embedding(self, speaker, reference_spec, reference_spec_lens): + """spk_emb: Bx1xD""" + if self.speaker_encoder is not None: + spk_emb = self.speaker_encoder(speaker, reference_spec, reference_spec_lens).unsqueeze(1) + elif self.speaker_emb is not None: + if speaker is None: + raise ValueError('Please give speaker id to get lookup speaker embedding.') + spk_emb = self.speaker_emb(speaker).unsqueeze(1) + else: + spk_emb = None + + return spk_emb + @typecheck() def forward( self, @@ -252,6 +271,8 @@ def forward( attn_prior=None, mel_lens=None, input_lens=None, + reference_spec=None, + reference_spec_lens=None, ): if not self.learn_alignment and self.training: @@ -259,10 +280,9 @@ def forward( assert pitch is not None # Calculate speaker embedding - if self.speaker_emb is None or speaker is None: - spk_emb = None - else: - spk_emb = self.speaker_emb(speaker).unsqueeze(1) + spk_emb = self.get_speaker_embedding( + speaker=speaker, reference_spec=reference_spec, reference_spec_lens=reference_spec_lens, + ) # Input FFT enc_out, enc_mask = self.encoder(input=text, conditioning=spk_emb) @@ -347,12 +367,23 @@ def forward( energy_tgt, ) - def infer(self, *, text, pitch=None, speaker=None, energy=None, pace=1.0, volume=None): + def infer( + self, + *, + text, + pitch=None, + speaker=None, + energy=None, + pace=1.0, + volume=None, + 
reference_spec=None, + reference_spec_lens=None, + ): + # Calculate speaker embedding - if self.speaker_emb is None or speaker is None: - spk_emb = 0 - else: - spk_emb = self.speaker_emb(speaker).unsqueeze(1) + spk_emb = self.get_speaker_embedding( + speaker=speaker, reference_spec=reference_spec, reference_spec_lens=reference_spec_lens, + ) # Input FFT enc_out, enc_mask = self.encoder(input=text, conditioning=spk_emb) diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index 44ed0a92d776..dbf26f1ceeee 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -18,8 +18,12 @@ from torch import Tensor from torch.autograd import Variable from torch.nn import functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence -from nemo.core.classes import adapter_mixins +from nemo.core.classes import NeuralModule, adapter_mixins +from nemo.core.neural_types.elements import EncodedRepresentation, Index, LengthsType, MelSpectrogramType +from nemo.core.neural_types.neural_type import NeuralType +from nemo.utils import logging SUPPORTED_CONDITION_TYPES = ["add", "concat", "layernorm"] @@ -456,8 +460,10 @@ def forward(self, inputs, conditioning=None): if self.condition: if conditioning is None: raise ValueError( - 'You should add additional data types as conditions e.g. speaker id or reference audio' + """You should add additional data types as conditions (e.g. speaker id or reference audio) + and define speaker_encoder in your config.""" ) + inputs = inputs * self.cond_weight(conditioning) inputs = inputs + self.cond_bias(conditioning) @@ -493,7 +499,8 @@ def forward(self, inputs, conditioning=None): if len(self.condition_types) > 0: if conditioning is None: raise ValueError( - 'You should add additional data types as conditions e.g. speaker id or reference audio' + """You should add additional data types as conditions (e.g. 
speaker id or reference audio) + and define speaker_encoder in your config.""" ) if "add" in self.condition_types: @@ -507,3 +514,241 @@ def forward(self, inputs, conditioning=None): inputs = self.concat_proj(inputs) return inputs + + +class StyleAttention(NeuralModule): + def __init__(self, gst_size=128, n_style_token=10, n_style_attn_head=4): + super(StyleAttention, self).__init__() + + token_size = gst_size // n_style_attn_head + self.tokens = torch.nn.Parameter(torch.FloatTensor(n_style_token, token_size)) + self.mha = torch.nn.MultiheadAttention( + embed_dim=gst_size, + num_heads=n_style_attn_head, + dropout=0.0, + bias=True, + kdim=token_size, + vdim=token_size, + batch_first=True, + ) + torch.nn.init.normal_(self.tokens) + + @property + def input_types(self): + return { + "inputs": NeuralType(('B', 'D'), EncodedRepresentation()), + "token_id": NeuralType(('B'), Index(), optional=True), + } + + @property + def output_types(self): + return { + "style_emb": NeuralType(('B', 'D'), EncodedRepresentation()), + } + + def forward(self, inputs): + batch_size = inputs.size(0) + query = inputs.unsqueeze(1) + tokens = F.tanh(self.tokens).unsqueeze(0).expand(batch_size, -1, -1) + + style_emb, _ = self.mha(query=query, key=tokens, value=tokens) + style_emb = style_emb.squeeze(1) + return style_emb + + +class Conv2DReLUNorm(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=1, bias=True, dropout=0.0): + super(Conv2DReLUNorm, self).__init__() + self.conv = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias + ) + self.norm = torch.nn.LayerNorm(out_channels) + self.dropout = torch.nn.Dropout(dropout) + + def forward(self, x, x_mask=None): + if x_mask is not None: + x = x * x_mask + + # bhwc -> bchw + x = x.contiguous().permute(0, 3, 1, 2) + x = F.relu(self.conv(x)) + # bchw -> bhwc + x = x.contiguous().permute(0, 2, 3, 1) + x = self.norm(x) + x = self.dropout(x) + return x + + +class ReferenceEncoder(NeuralModule): + """ + Encode mel-spectrograms to an utterance level feature + """ + + def __init__(self, n_mels, cnn_filters, dropout, gru_hidden, kernel_size, stride, padding, bias): + super(ReferenceEncoder, self).__init__() + self.filter_size = [1] + list(cnn_filters) + self.layers = torch.nn.ModuleList( + [ + Conv2DReLUNorm( + in_channels=int(self.filter_size[i]), + out_channels=int(self.filter_size[i + 1]), + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias, + dropout=dropout, + ) + for i in range(len(cnn_filters)) + ] + ) + post_conv_height = self.calculate_post_conv_lengths(n_mels, n_convs=len(cnn_filters)) + self.gru = torch.nn.GRU( + input_size=cnn_filters[-1] * post_conv_height, hidden_size=gru_hidden, batch_first=True, + ) + + @property + def input_types(self): + return { + "inputs": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + "inputs_lengths": NeuralType(('B'), LengthsType()), + } + + @property + def output_types(self): + return { + "out": NeuralType(('B', 'D'), EncodedRepresentation()), + } + + def forward(self, inputs, inputs_lengths): + # BMW -> BWMC (M: mels) + x = inputs.transpose(1, 2).unsqueeze(3) + x_lens = inputs_lengths + x_masks = self.lengths_to_masks(x_lens).unsqueeze(2).unsqueeze(3) + + for layer in self.layers: + x = layer(x, x_masks) + x_lens = self.calculate_post_conv_lengths(x_lens) + x_masks = self.lengths_to_masks(x_lens).unsqueeze(2).unsqueeze(3) + + # BWMC -> BWC + x = x.contiguous().view(x.shape[0], x.shape[1], -1) + + 
self.gru.flatten_parameters() + packed_x = pack_padded_sequence(x, x_lens.cpu(), batch_first=True, enforce_sorted=False) + packed_x, _ = self.gru(packed_x) + x, x_lens = pad_packed_sequence(packed_x, batch_first=True) + x = x[torch.arange(len(x_lens)), (x_lens - 1), :] + return x + + @staticmethod + def calculate_post_conv_lengths(lengths, n_convs=1, kernel_size=3, stride=2, pad=1): + """Batch lengths after n convolution with fixed kernel/stride/pad.""" + for _ in range(n_convs): + lengths = (lengths - kernel_size + 2 * pad) // stride + 1 + return lengths + + @staticmethod + def lengths_to_masks(lengths): + """Batch of lengths to batch of masks""" + # B -> BxT + masks = torch.arange(lengths.max()).to(lengths.device).expand( + lengths.shape[0], lengths.max() + ) < lengths.unsqueeze(1) + return masks + + +class GlobalStyleToken(NeuralModule): + """ + Global Style Token based Speaker Embedding + """ + + def __init__( + self, reference_encoder, gst_size=128, n_style_token=10, n_style_attn_head=4, + ): + super(GlobalStyleToken, self).__init__() + self.reference_encoder = reference_encoder + self.style_attention = StyleAttention( + gst_size=gst_size, n_style_token=n_style_token, n_style_attn_head=n_style_attn_head + ) + + @property + def input_types(self): + return { + "inp": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), + "inp_lengths": NeuralType(('B'), LengthsType()), + } + + @property + def output_types(self): + return { + "gst": NeuralType(('B', 'D'), EncodedRepresentation()), + } + + def forward(self, inp, inp_lengths): + style_embedding = self.reference_encoder(inp, inp_lengths) + gst = self.style_attention(style_embedding) + return gst + + +class SpeakerLookupTable(torch.nn.Module): + """ + LookupTable based Speaker Embedding + """ + + def __init__(self, n_speakers, embedding_dim): + super(SpeakerLookupTable, self).__init__() + self.table = torch.nn.Embedding(n_speakers, embedding_dim) + + def forward(self, speaker): + return self.table(speaker) + + +class SpeakerEncoder(NeuralModule): + """ + class SpeakerEncoder represents speakers representation. + This module can combine GST (global style token) based speaker embeddings and lookup table speaker embeddings. + """ + + def __init__(self, lookup_module=None, gst_module=None): + """ + lookup_module: Torch module to get lookup based speaker embedding + gst_module: Neural module to get GST based speaker embedding + """ + super(SpeakerEncoder, self).__init__() + self.lookup_module = lookup_module + self.gst_module = gst_module + + @property + def input_types(self): + return { + "speaker": NeuralType(('B'), Index(), optional=True), + "reference_spec": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType(), optional=True), + "reference_spec_lens": NeuralType(('B'), LengthsType(), optional=True), + } + + @property + def output_types(self): + return { + "embs": NeuralType(('B', 'D'), EncodedRepresentation()), + } + + def forward(self, speaker=None, reference_spec=None, reference_spec_lens=None): + embs = None + + # Get Lookup table speaker embedding + if self.lookup_module is not None and speaker is not None: + embs = self.lookup_module(speaker) + + # Get GST based speaker embedding + if self.gst_module is not None: + if reference_spec is None or reference_spec_lens is None: + raise ValueError( + "You should add `reference_audio` in sup_data_types or remove `speaker_encoder`in config." 
+ ) + out = self.gst_module(reference_spec, reference_spec_lens) + embs = out if embs is None else embs + out + + elif self.gst_module is None and reference_spec is not None and reference_spec_lens is not None: + logging.warning("You may add `gst_module` in speaker_encoder to use reference_audio.") + + return embs diff --git a/nemo/collections/tts/torch/tts_data_types.py b/nemo/collections/tts/torch/tts_data_types.py index 899e5da7d801..ae7516009cd9 100644 --- a/nemo/collections/tts/torch/tts_data_types.py +++ b/nemo/collections/tts/torch/tts_data_types.py @@ -67,6 +67,10 @@ class LMTokens(TTSDataType): name = "lm_tokens" +class ReferenceAudio(TTSDataType, WithLens): + name = "reference_audio" + + MAIN_DATA_TYPES = [Audio, Text] VALID_SUPPLEMENTARY_DATA_TYPES = [ LogMel, @@ -78,5 +82,6 @@ class LMTokens(TTSDataType): LMTokens, Voiced_mask, P_voiced, + ReferenceAudio, ] DATA_STR2DATA_CLASS = {d.name: d for d in MAIN_DATA_TYPES + VALID_SUPPLEMENTARY_DATA_TYPES} From ceb539fff25c2bffea744459fd2b23aab1a62300 Mon Sep 17 00:00:00 2001 From: Dima Rekesh Date: Tue, 18 Apr 2023 10:03:22 -0700 Subject: [PATCH 04/23] Sharded manifests for tarred datasets (#6395) * testing sharded manifests Signed-off-by: Dima Rekesh * compatibility Signed-off-by: Dima Rekesh * proper fixes Signed-off-by: Dima Rekesh * adding flag tot convert_to_tarred_audio_dataset Signed-off-by: Dima Rekesh * shard_manifests conf param Signed-off-by: Dima Rekesh * propagating the shard_manifests param Signed-off-by: Dima Rekesh * propagating the shard_manifests param Signed-off-by: Dima Rekesh * distributed checks Signed-off-by: Dima Rekesh * typo Signed-off-by: Dima Rekesh * typo Signed-off-by: Dima Rekesh * fixes Signed-off-by: Dima Rekesh * fixes Signed-off-by: Dima Rekesh * fixes Signed-off-by: Dima Rekesh * fixes Signed-off-by: Dima Rekesh * fixes Signed-off-by: Dima Rekesh * fixes Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes based on PR comments and tests Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes to convert_to_tarred_audio_dataset.py Signed-off-by: Dima Rekesh * reversing manifest shards flag Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tests Signed-off-by: Dima Rekesh * excluding manifests from webdataset url expansion Signed-off-by: Dima Rekesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * expand manifest paths before attempting to cache from datastore Signed-off-by: Dima Rekesh * explicit use of UTF-8 for manifest i/o Signed-off-by: Dima Rekesh --------- Signed-off-by: Dima Rekesh Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/asr/data/audio_to_label.py | 6 +- nemo/collections/asr/data/audio_to_text.py | 110 +++++++++++++++--- .../asr/data/audio_to_text_dali.py | 6 +- .../asr/data/audio_to_text_dataset.py | 5 + .../asr/models/configs/asr_models_config.py | 1 + nemo/utils/data_utils.py | 6 + .../convert_to_tarred_audio_dataset.py | 43 ++++++- .../asr/test_asr_ctc_encoder_model_bpe.py | 1 + .../asr/test_asr_ctcencdec_model.py | 1 + 9 files changed, 151 insertions(+), 28 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_label.py b/nemo/collections/asr/data/audio_to_label.py index fe12be42be94..4317642a8fff 100644 --- 
a/nemo/collections/asr/data/audio_to_label.py +++ b/nemo/collections/asr/data/audio_to_label.py @@ -19,7 +19,7 @@ import torch import webdataset as wd -from nemo.collections.asr.data.audio_to_text import cache_datastore_manifests, expand_audio_filepaths +from nemo.collections.asr.data.audio_to_text import cache_datastore_manifests, expand_sharded_filepaths from nemo.collections.asr.parts.preprocessing.segment import available_formats as valid_sf_formats from nemo.collections.common.parts.preprocessing import collections from nemo.core.classes import Dataset, IterableDataset @@ -560,8 +560,8 @@ def __init__( for idx in range(len(self.labels[:5])): logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) - audio_tar_filepaths = expand_audio_filepaths( - audio_tar_filepaths=audio_tar_filepaths, + audio_tar_filepaths = expand_sharded_filepaths( + sharded_filepaths=audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank, diff --git a/nemo/collections/asr/data/audio_to_text.py b/nemo/collections/asr/data/audio_to_text.py index 2f5b3add9fcc..756c05631627 100644 --- a/nemo/collections/asr/data/audio_to_text.py +++ b/nemo/collections/asr/data/audio_to_text.py @@ -171,31 +171,32 @@ def process_text_by_sample(self, sample: collections.ASRAudioText.OUTPUT_TYPE) - return t, tl -def expand_audio_filepaths(audio_tar_filepaths, shard_strategy: str, world_size: int, global_rank: int): +def expand_sharded_filepaths(sharded_filepaths, shard_strategy: str, world_size: int, global_rank: int): valid_shard_strategies = ['scatter', 'replicate'] if shard_strategy not in valid_shard_strategies: raise ValueError(f"`shard_strategy` must be one of {valid_shard_strategies}") - if isinstance(audio_tar_filepaths, str): + if isinstance(sharded_filepaths, str): # Replace '(' and '[' with '{' brace_keys_open = ['(', '[', '<', '_OP_'] for bkey in brace_keys_open: - if bkey in audio_tar_filepaths: - audio_tar_filepaths = audio_tar_filepaths.replace(bkey, "{") + if bkey in sharded_filepaths: + sharded_filepaths = sharded_filepaths.replace(bkey, "{") # Replace ')' and ']' with '}' brace_keys_close = [')', ']', '>', '_CL_'] for bkey in brace_keys_close: - if bkey in audio_tar_filepaths: - audio_tar_filepaths = audio_tar_filepaths.replace(bkey, "}") + if bkey in sharded_filepaths: + sharded_filepaths = sharded_filepaths.replace(bkey, "}") - if isinstance(audio_tar_filepaths, str): + if isinstance(sharded_filepaths, str): # Brace expand - audio_tar_filepaths = list(braceexpand.braceexpand(audio_tar_filepaths)) + sharded_filepaths = list(braceexpand.braceexpand(sharded_filepaths)) # Expand store paths into WebDataset URLs - audio_tar_filepaths = [ - datastore_path_to_webdataset_url(p) if is_datastore_path(p) else p for p in audio_tar_filepaths + sharded_filepaths = [ + datastore_path_to_webdataset_url(p) if is_datastore_path(p) and is_tarred_path(p) else p + for p in sharded_filepaths ] # Check for distributed and partition shards accordingly @@ -203,15 +204,15 @@ def expand_audio_filepaths(audio_tar_filepaths, shard_strategy: str, world_size: if shard_strategy == 'scatter': logging.info("All tarred dataset shards will be scattered evenly across all nodes.") - if len(audio_tar_filepaths) % world_size != 0: + if len(sharded_filepaths) % world_size != 0: logging.warning( - f"Number of shards in tarred dataset ({len(audio_tar_filepaths)}) is not divisible " + f"Number of shards in tarred dataset ({len(sharded_filepaths)}) is not divisible " f"by number of 
distributed workers ({world_size})." ) - begin_idx = (len(audio_tar_filepaths) // world_size) * global_rank - end_idx = begin_idx + len(audio_tar_filepaths) // world_size - audio_tar_filepaths = audio_tar_filepaths[begin_idx:end_idx] + begin_idx = (len(sharded_filepaths) // world_size) * global_rank + end_idx = begin_idx + len(sharded_filepaths) // world_size + sharded_filepaths = sharded_filepaths[begin_idx:end_idx] logging.info( "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx ) @@ -221,7 +222,7 @@ def expand_audio_filepaths(audio_tar_filepaths, shard_strategy: str, world_size: else: raise ValueError(f"Invalid shard strategy ! Allowed values are : {valid_shard_strategies}") - return audio_tar_filepaths + return sharded_filepaths def cache_datastore_manifests( @@ -345,6 +346,47 @@ def cache_data(manifest_filepaths, cache_audio, num_workers, max_num_workers): ) +"""Optionally expand / shard the list of manifests + This is made to use the same notation as the sharded audio files + + Args: + manifest_filepaths: list of manifest files (the sharded notation) + shard_strategy: scatter or replicate (scatter by default) + shard_manifests: bool, if False, no sharding / manifest filepath expansion will be attempted + global_rank: int, the rank of this worker + world_size: int, total number of workers +""" + + +def shard_manifests_if_needed( + manifest_filepaths: Union[str, List[str]], + shard_strategy: str, + shard_manifests: bool, + global_rank: int, + world_size: int, +): + if shard_manifests: + if not torch.distributed.is_available(): + logging.warning("Not running in torch.distributed mode. Manifest sharding not available") + return manifest_filepaths + + if not torch.distributed.is_initialized(): + logging.warning( + 'Manifest sharding was requested but torch.distributed is not initialized ' + 'Did you intend to set the defer_setup flag?' + ) + return manifest_filepaths + + manifest_filepaths = expand_sharded_filepaths( + sharded_filepaths=manifest_filepaths, + shard_strategy=shard_strategy, + world_size=world_size, + global_rank=global_rank, + ) + + return manifest_filepaths + + class _AudioTextDataset(Dataset): """ Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). @@ -748,6 +790,7 @@ class _TarredAudioToTextDataset(IterableDataset): occasions (when the number of shards is not divisible with ``world_size``), will not sample the entire dataset. For these reasons it is not advisable to use tarred datasets as validation or test datasets. + shard_manifests (bool): Whether or not to try / shard manifests. Defaults to False. global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. 
return_sample_id (bool): whether to return the sample_id as a part of each sample @@ -769,10 +812,22 @@ def __init__( eos_id: Optional[int] = None, pad_id: int = 0, shard_strategy: str = "scatter", + shard_manifests: bool = False, global_rank: int = 0, world_size: int = 0, return_sample_id: bool = False, ): + self.shard_manifests = shard_manifests + + # Shard manifests if necessary and possible and then expand the paths + manifest_filepath = shard_manifests_if_needed( + shard_manifests=shard_manifests, + shard_strategy=shard_strategy, + manifest_filepaths=manifest_filepath, + world_size=world_size, + global_rank=global_rank, + ) + # If necessary, cache manifests from object store cache_datastore_manifests(manifest_filepaths=manifest_filepath) @@ -788,6 +843,8 @@ def __init__( index_by_file_id=True, # Must set this so the manifest lines can be indexed by file ID ) + self.len = self._compute_len() + self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor) self.trim = trim self.eos_id = eos_id @@ -795,8 +852,8 @@ def __init__( self.pad_id = pad_id self.return_sample_id = return_sample_id - audio_tar_filepaths = expand_audio_filepaths( - audio_tar_filepaths=audio_tar_filepaths, + audio_tar_filepaths = expand_sharded_filepaths( + sharded_filepaths=audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank, @@ -928,8 +985,19 @@ def get_manifest_sample(self, sample_id): def __iter__(self): return self._dataset.__iter__() + def _compute_len(self): + if self.shard_manifests and torch.distributed.is_available() and torch.distributed.is_initialized(): + my_len = torch.tensor(len(self.manifest_processor.collection), dtype=torch.int32).cuda() + torch.distributed.all_reduce(my_len) + my_len = my_len.int() + logging.info(f'Sharded manifests: Total length: {my_len}') + else: + my_len = len(self.manifest_processor.collection) + + return my_len + def __len__(self): - return len(self.manifest_processor.collection) + return self.len class TarredAudioToCharDataset(_TarredAudioToTextDataset): @@ -1042,6 +1110,7 @@ def __init__( parser: Optional[str] = 'en', pad_id: int = 0, shard_strategy: str = "scatter", + shard_manifests: bool = False, global_rank: int = 0, world_size: int = 0, return_sample_id: bool = False, @@ -1067,6 +1136,7 @@ def __init__( eos_id=eos_id, pad_id=pad_id, shard_strategy=shard_strategy, + shard_manifests=shard_manifests, global_rank=global_rank, world_size=world_size, return_sample_id=return_sample_id, @@ -1167,6 +1237,7 @@ def __init__( trim: bool = False, use_start_end_token: bool = True, shard_strategy: str = "scatter", + shard_manifests: bool = False, global_rank: int = 0, world_size: int = 0, return_sample_id: bool = False, @@ -1219,6 +1290,7 @@ def __call__(self, *args): eos_id=eos_id, pad_id=pad_id, shard_strategy=shard_strategy, + shard_manifests=shard_manifests, global_rank=global_rank, world_size=world_size, return_sample_id=return_sample_id, diff --git a/nemo/collections/asr/data/audio_to_text_dali.py b/nemo/collections/asr/data/audio_to_text_dali.py index b65823f94c97..77bd71129cc2 100644 --- a/nemo/collections/asr/data/audio_to_text_dali.py +++ b/nemo/collections/asr/data/audio_to_text_dali.py @@ -22,7 +22,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.asr.data.audio_to_text import ASRManifestProcessor, expand_audio_filepaths +from nemo.collections.asr.data.audio_to_text import ASRManifestProcessor, expand_sharded_filepaths from 
nemo.collections.common.parts.preprocessing import parsers from nemo.utils import logging, model_utils @@ -345,10 +345,10 @@ def __init__( self.is_tarred_dataset = False elif audio_tar_filepaths is not None and audio_tar_index_filepaths is not None: - audio_tar_filepaths = expand_audio_filepaths( + audio_tar_filepaths = expand_sharded_filepaths( audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank ) - audio_tar_index_filepaths = expand_audio_filepaths( + audio_tar_index_filepaths = expand_sharded_filepaths( audio_tar_index_filepaths, shard_strategy=shard_strategy, world_size=world_size, diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index 1cb0b880aa69..325857e81323 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -346,6 +346,9 @@ def get_tarred_dataset( ): if len(tarred_audio_filepath) == 1: tarred_audio_filepath = tarred_audio_filepath[0] + if len(manifest_filepath) == 1: + manifest_filepath = manifest_filepath[0] + if tokenizer is None: dataset = audio_to_text.TarredAudioToCharDataset( audio_tar_filepaths=tarred_audio_filepath, @@ -363,6 +366,7 @@ def get_tarred_dataset( trim=config.get('trim_silence', False), parser=config.get('parser', 'en'), shard_strategy=config.get('tarred_shard_strategy', 'scatter'), + shard_manifests=config.get('shard_manifests', False), global_rank=global_rank, world_size=world_size, return_sample_id=config.get('return_sample_id', False), @@ -381,6 +385,7 @@ def get_tarred_dataset( trim=config.get('trim_silence', False), use_start_end_token=config.get('use_start_end_token', True), shard_strategy=config.get('tarred_shard_strategy', 'scatter'), + shard_manifests=config.get('shard_manifests', False), global_rank=global_rank, world_size=world_size, return_sample_id=config.get('return_sample_id', False), diff --git a/nemo/collections/asr/models/configs/asr_models_config.py b/nemo/collections/asr/models/configs/asr_models_config.py index e0ceeff6b186..609d42216659 100644 --- a/nemo/collections/asr/models/configs/asr_models_config.py +++ b/nemo/collections/asr/models/configs/asr_models_config.py @@ -38,6 +38,7 @@ class ASRDatasetConfig(nemo.core.classes.dataset.DatasetConfig): is_tarred: bool = False tarred_audio_filepaths: Optional[Any] = None tarred_shard_strategy: str = "scatter" + shard_manifests: bool = False shuffle_n: int = 0 # Optional diff --git a/nemo/utils/data_utils.py b/nemo/utils/data_utils.py index 09da7ba93512..6479a65f1128 100644 --- a/nemo/utils/data_utils.py +++ b/nemo/utils/data_utils.py @@ -49,6 +49,12 @@ def is_datastore_path(path) -> bool: return path.startswith('ais://') +def is_tarred_path(path) -> bool: + """Check if a path is for a tarred file. + """ + return path.endswith('.tar') + + def is_datastore_cache_shared() -> bool: """Check if store cache is shared. """ diff --git a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py index f227fbcd538c..64c086997ef0 100644 --- a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py +++ b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py @@ -174,6 +174,11 @@ "and it must be filled out by the user." 
), ) +parser.add_argument( + "--no_shard_manifests", + action='store_true', + help="Do not write sharded manifests along with the aggregated manifest.", +) parser.add_argument('--workers', type=int, default=1, help='Number of worker processes') args = parser.parse_args() @@ -186,6 +191,7 @@ class ASRTarredDatasetConfig: min_duration: Optional[float] = None shuffle_seed: Optional[int] = None sort_in_shards: bool = True + shard_manifests: bool = True keep_files_together: bool = False @@ -322,6 +328,19 @@ def create_new_dataset(self, manifest_path: str, target_dir: str = "./tarred/", for i, (start_idx, end_idx) in enumerate(zip(start_indices, end_indices)) ) + if config.shard_manifests: + sharded_manifests_dir = target_dir + '/sharded_manifests' + if not os.path.exists(sharded_manifests_dir): + os.makedirs(sharded_manifests_dir) + + for manifest in new_entries_list: + shard_id = manifest[0]['shard_id'] + new_manifest_shard_path = os.path.join(sharded_manifests_dir, f'manifest_{shard_id}.json') + with open(new_manifest_shard_path, 'w', encoding='utf-8') as m2: + for entry in manifest: + json.dump(entry, m2) + m2.write('\n') + # Flatten the list of list of entries to a list of entries new_entries = [sample for manifest in new_entries_list for sample in manifest] del new_entries_list @@ -330,7 +349,7 @@ def create_new_dataset(self, manifest_path: str, target_dir: str = "./tarred/", # Write manifest new_manifest_path = os.path.join(target_dir, 'tarred_audio_manifest.json') - with open(new_manifest_path, 'w') as m2: + with open(new_manifest_path, 'w', encoding='utf-8') as m2: for entry in new_entries: json.dump(entry, m2) m2.write('\n') @@ -467,6 +486,19 @@ def create_concatenated_dataset( for i, (start_idx, end_idx, shard_idx) in enumerate(zip(start_indices, end_indices, shard_indices)) ) + if config.shard_manifests: + sharded_manifests_dir = target_dir + '/sharded_manifests' + if not os.path.exists(sharded_manifests_dir): + os.makedirs(sharded_manifests_dir) + + for manifest in new_entries_list: + shard_id = manifest[0]['shard_id'] + new_manifest_shard_path = os.path.join(sharded_manifests_dir, f'manifest_{shard_id}.json') + with open(new_manifest_shard_path, 'w', encoding='utf-8') as m2: + for entry in manifest: + json.dump(entry, m2) + m2.write('\n') + # Flatten the list of list of entries to a list of entries new_entries = [sample for manifest in new_entries_list for sample in manifest] del new_entries_list @@ -480,7 +512,7 @@ def create_concatenated_dataset( print("Total number of entries in manifest :", len(base_entries) + len(new_entries)) new_manifest_path = os.path.join(target_dir, f'tarred_audio_manifest_version_{new_version}.json') - with open(new_manifest_path, 'w') as m2: + with open(new_manifest_path, 'w', encoding='utf-8') as m2: # First write all the entries of base manifest for entry in base_entries: json.dump(entry, m2) @@ -523,7 +555,7 @@ def _read_manifest(self, manifest_path: str, config: ASRTarredDatasetConfig): total_duration = 0.0 filtered_entries = [] filtered_duration = 0.0 - with open(manifest_path, 'r') as m: + with open(manifest_path, 'r', encoding='utf-8') as m: for line in m: entry = json.loads(line) if (config.max_duration is None or entry['duration'] < config.max_duration) and ( @@ -626,6 +658,8 @@ def main(): def create_tar_datasets(min_duration: float, max_duration: float, target_dir: str): builder = ASRTarredDatasetBuilder() + shard_manifests = False if args.no_shard_manifests else True + if args.write_metadata: metadata = ASRTarredDatasetMetadata() dataset_cfg 
= ASRTarredDatasetConfig( @@ -635,6 +669,7 @@ def create_tar_datasets(min_duration: float, max_duration: float, target_dir: st min_duration=min_duration, shuffle_seed=args.shuffle_seed, sort_in_shards=args.sort_in_shards, + shard_manifests=shard_manifests, keep_files_together=args.keep_files_together, ) metadata.dataset_config = dataset_cfg @@ -655,6 +690,7 @@ def create_tar_datasets(min_duration: float, max_duration: float, target_dir: st min_duration=min_duration, shuffle_seed=args.shuffle_seed, sort_in_shards=args.sort_in_shards, + shard_manifests=shard_manifests, keep_files_together=args.keep_files_together, ) builder.configure(config) @@ -682,6 +718,7 @@ def create_tar_datasets(min_duration: float, max_duration: float, target_dir: st metadata.dataset_config.shuffle = args.shuffle metadata.dataset_config.shuffle_seed = args.shuffle_seed metadata.dataset_config.sort_in_shards = args.sort_in_shards + metadata.dataset_config.shard_manifests = shard_manifests builder.configure(metadata.dataset_config) diff --git a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py index f671fd925c38..13c31ef36a9c 100644 --- a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py @@ -291,6 +291,7 @@ def test_ASRDatasetConfig_for_AudioToBPEDataset(self): 'pin_memory', 'drop_last', 'tarred_shard_strategy', + 'shard_manifests', 'shuffle_n', 'parser', 'normalize', diff --git a/tests/collections/asr/test_asr_ctcencdec_model.py b/tests/collections/asr/test_asr_ctcencdec_model.py index 8dfa9ce62cd6..8d90079d0c51 100644 --- a/tests/collections/asr/test_asr_ctcencdec_model.py +++ b/tests/collections/asr/test_asr_ctcencdec_model.py @@ -266,6 +266,7 @@ def test_ASRDatasetConfig_for_AudioToCharDataset(self): 'pin_memory', 'drop_last', 'tarred_shard_strategy', + 'shard_manifests', 'shuffle_n', 'use_start_end_token', 'use_start_end_token', From 499a3b2f66449cc9691885bc52aa2122f9e4596a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jim=20O=E2=80=99Regan?= Date: Wed, 19 Apr 2023 00:58:57 +0200 Subject: [PATCH 05/23] Update wfst_text_normalization.rst (#6374) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Hungarian (incoming in NeMo-text-processing) Signed-off-by: Jim O’Regan --- .../nlp/text_normalization/wfst/wfst_text_normalization.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst b/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst index 632ec8768bb0..3f19872f1f8d 100644 --- a/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst +++ b/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst @@ -174,6 +174,8 @@ Language Support Matrix +------------------+----------+----------+----------+--------------------+----------------------+ | Chinese | zh | x | | | | +------------------+----------+----------+----------+--------------------+----------------------+ +| Hungarian | hu | x | | | | ++------------------+----------+----------+----------+--------------------+----------------------+ See :doc:`Grammar customization ` for grammar customization details. 
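
The tarred-dataset change above makes per-shard manifests the default: one JSON-lines file per tar shard is written under `sharded_manifests/` next to the aggregated manifest, and `--no_shard_manifests` turns this off. A minimal standalone sketch of that output layout, with hypothetical entries rather than the real builder's bookkeeping:

```python
import json
import os

# Hypothetical per-shard entries; the real builder groups samples by the
# 'shard_id' it assigned while tarring.
new_entries_list = [
    [{"audio_filepath": "audio_0.wav", "duration": 1.2, "text": "hello", "shard_id": 0}],
    [{"audio_filepath": "audio_1.wav", "duration": 0.9, "text": "world", "shard_id": 1}],
]

target_dir = "./tarred"
sharded_manifests_dir = os.path.join(target_dir, "sharded_manifests")
os.makedirs(sharded_manifests_dir, exist_ok=True)

# One JSON-lines manifest per shard: sharded_manifests/manifest_<shard_id>.json
for manifest in new_entries_list:
    shard_id = manifest[0]["shard_id"]
    shard_path = os.path.join(sharded_manifests_dir, f"manifest_{shard_id}.json")
    with open(shard_path, "w", encoding="utf-8") as fout:
        for entry in manifest:
            json.dump(entry, fout)
            fout.write("\n")
```
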
From a365879110646801bbcd8166b890f986d7d7793d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 18 Apr 2023 20:07:31 -0700 Subject: [PATCH 06/23] Support Swiglu in TP PP Conversion (#6437) (#6451) * Support Swiglu in TP PP Conversion * Guard activation * Guard activation --------- Signed-off-by: smajumdar Co-authored-by: Somshubra Majumdar --- .../megatron_change_num_partitions.py | 73 ++++++++++++++++--- 1 file changed, 61 insertions(+), 12 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index 0949d90f4b96..944565d8bd43 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -54,7 +54,7 @@ --target_pipeline_model_parallel_size=1 \ --target_pipeline_model_parallel_split_rank=0 \ --precision=bf16 - + ### Only Tensor Parallelism conversion ### To the above commands, add the following argument: `--tp_conversion_only` @@ -99,13 +99,14 @@ """ + ################# ### Utilities ### ################# def compute_tp_splits( - param_name, param, partitions, global_idx, tp_size, pp_size, pp_rank, pp_split_rank, megatron_legacy + param_name, param, partitions, global_idx, tp_size, pp_size, pp_rank, pp_split_rank, megatron_legacy, model_cfg ): """ Function to compute the splits required for tensor-parallelism. @@ -120,6 +121,7 @@ def compute_tp_splits( pp_rank: Int, pipeline-parallelism rank. pp_split_rank: Int, pipeline-parallelism split rank. This should be > 1 if TP is being used with EncDec models (T5) megatron_legacy: Bool, whether the model is a legacy Megatron model or not. + model_cfg: The model config as a OmegaConf DictConfig. Returns: List of torch tensors, each of which is a split of the current parameter. @@ -127,6 +129,8 @@ def compute_tp_splits( # alias the global index to idx idx = global_idx + swiglu_activation = 'swiglu' in str(model_cfg.get('activation', '')).lower() + if param.shape == partitions[0][idx].shape: split = [partitions[0][idx].data] * tp_size logging.debug(">> Perfect match, no splitting needed") @@ -156,6 +160,15 @@ def compute_tp_splits( for i in range(tp_size): tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 2, tp_size)]) split.append(tp_qkv) + elif 'dense_h_to_4h.weight' in param_name and swiglu_activation: + # For Megatron GPT model with Swiglu activation + # Handle gated linear units + # concat all the first halves ('W's) and all the second halves ('V's) + w_split, k_split = torch.chunk(partitions[0][idx].data, 2, dim=0) + w_split = torch.chunk(w_split, tp_size, dim=0) + k_split = torch.chunk(k_split, tp_size, dim=0) + split = [torch.cat(weights, dim=0) for weights in zip(w_split, k_split)] # split per tp rank + # Regular split for Megatron and NeMo-Megatron models. else: split = torch.split(partitions[0][idx].data, param.shape[0], dim=0) @@ -163,7 +176,7 @@ def compute_tp_splits( return split -def compute_tp_merge(idx, name, param, partitions_pp): +def compute_tp_merge(idx, name, param, partitions_pp, model_cfg): """ Function to compute the partition merge required for tensor-parallelism. @@ -173,10 +186,13 @@ def compute_tp_merge(idx, name, param, partitions_pp): param: The parameter to be merged under TP 1 PP 1. partitions_pp: List of all TP partitions of the flattened parameter of the current model for a given PP rank (TP X PP Y). 
Indexed as partitions_pp[tp_rank][idx]. + model_cfg: The model config as an OmegaConf DictConfig. Returns: The concatenated parameter for TP 1 PP 1. """ + swiglu_activation = 'swiglu' in str(model_cfg.get('activation', '')).lower() + # Logic from original TP rank change if param.shape == partitions_pp[0][idx].shape: concated = partitions_pp[0][idx].data @@ -184,6 +200,19 @@ def compute_tp_merge(idx, name, param, partitions_pp): concated = torch.cat([partitions_pp[i][idx].data for i in range(len(partitions_pp))], dim=-1) else: concated = torch.cat([partitions_pp[i][idx].data for i in range(len(partitions_pp))], dim=0) + + # Logic for Swiglu activation + if 'dense_h_to_4h.weight' in name and swiglu_activation: + # concat all the first halves ('W's) and all the second halves ('V's) + wk_splits = [] + for tpr in range(len(partitions_pp)): + wk_splits.append(torch.chunk(partitions_pp[tpr][idx].data, 2, dim=0)) + + w_split = torch.cat([w[0] for w in wk_splits], dim=0) + k_split = torch.cat([w[1] for w in wk_splits], dim=0) + concated = torch.cat([w_split, k_split], dim=0) + + # Trim padding if concated.shape != param.shape: logging.info( f"Warning: Shape mismatch for parameter {name} required shape: {param.shape}, merged shape: {concated.shape}. Narrowing to match required size." @@ -301,7 +330,16 @@ def compute_splits(self, model, partitions, idx, tp_rank, pp_rank, pp_split_rank # Tensor Parallel Splitting split = compute_tp_splits( - param_name, param, partitions, idx, tp_size, pp_size, pp_rank, pp_split_rank, self.megatron_legacy + param_name, + param, + partitions, + idx, + tp_size, + pp_size, + pp_rank, + pp_split_rank, + self.megatron_legacy, + model.cfg, ) splits.append(split) @@ -419,7 +457,16 @@ def compute_splits(self, model, partitions, idx, tp_rank, pp_rank, pp_split_rank # Tensor Parallel Splitting split = compute_tp_splits( - param_name, param, partitions, idx, tp_size, pp_size, pp_rank, pp_split_rank, self.megatron_legacy + param_name, + param, + partitions, + idx, + tp_size, + pp_size, + pp_rank, + pp_split_rank, + self.megatron_legacy, + model.cfg, ) splits.append(split) @@ -445,12 +492,13 @@ def compute_splits(self, model, partitions, idx, tp_rank, pp_rank, pp_split_rank param_name, param, partitions, - 0, - tp_size, - pp_size, - pp_rank, - pp_split_rank, - self.megatron_legacy, + global_idx=0, + tp_size=tp_size, + pp_size=pp_size, + pp_rank=pp_rank, + pp_split_rank=pp_split_rank, + megatron_legacy=self.megatron_legacy, + model_cfg=model.cfg, ) splits.insert(self.intermediate_shared_embedding_location, split) break @@ -534,7 +582,7 @@ def merge_partition(model, partitions: Dict[int, List[List[torch.Tensor]]], writ ) # Original TP rank change logic - concated = compute_tp_merge(idx, name, param, partitions_pp) + concated = compute_tp_merge(idx, name, param, partitions_pp, model.cfg) # Update the model parameter with the merged tensor param.data = concated @@ -656,6 +704,7 @@ def split_tp_partition_only(model, partitions, tp_size, write_path=None, megatro pp_rank=0, pp_split_rank=0, megatron_legacy=megatron_legacy, + model_cfg=model.cfg, ) splits.append(split) idx += 1 From be711c948f1d1e3f600a9a693d478f4fb40cb153 Mon Sep 17 00:00:00 2001 From: Mostafa Ghorbandoost Date: Wed, 19 Apr 2023 11:49:21 -0700 Subject: [PATCH 07/23] Update NeMo_TTS_Primer.ipynb (#6436) * Update NeMo_TTS_Primer.ipynb Changed a mistake in line 782. Instead of frequency band (ie. pitch) we should write frequency bin. Note that frequency bins in FFT are not related to pitch. 
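
Stepping back to the Swiglu TP/PP conversion patch above: for gated-linear-unit layers the `dense_h_to_4h.weight` matrix stacks the gate ('W') and value ('V') halves along dim 0, so each tensor-parallel rank must receive a slice of both halves rather than a plain row split. A self-contained round-trip sketch of that chunk-and-zip logic in plain PyTorch, with made-up shapes (not the NeMo helper itself):

```python
import torch

tp_size = 4
hidden, ffn_per_rank = 16, 8
# Swiglu FFN weight: gate ('W') rows stacked on top of value ('V') rows.
full_weight = torch.randn(2 * ffn_per_rank * tp_size, hidden)

# Split for TP: chunk each half into tp_size pieces, then zip them back together
# so every rank gets its own [W_i; V_i] block.
w_half, v_half = torch.chunk(full_weight, 2, dim=0)
w_shards = torch.chunk(w_half, tp_size, dim=0)
v_shards = torch.chunk(v_half, tp_size, dim=0)
per_rank = [torch.cat(parts, dim=0) for parts in zip(w_shards, v_shards)]

# Merge back to TP=1: re-chunk each rank's shard, concat all W parts, then all V parts.
wk_splits = [torch.chunk(shard, 2, dim=0) for shard in per_rank]
merged = torch.cat(
    [torch.cat([wk[0] for wk in wk_splits], dim=0), torch.cat([wk[1] for wk in wk_splits], dim=0)],
    dim=0,
)

assert torch.equal(merged, full_weight)  # the round trip is lossless
```

The merge path simply undoes the zip, which is why a plain `torch.cat` along dim 0 would scramble the gate and value rows for Swiglu models.
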
Signed-off-by: Mostafa Ghorbandoost * Update NeMo_TTS_Primer.ipynb Corrected the description of spectrogram and mel spectrogram calculations in lines 782 & 783 and added a fourth point to the description and added a reference for more mathematical details at the end of this point. Signed-off-by: Mostafa Ghorbandoost --------- Signed-off-by: Mostafa Ghorbandoost --- tutorials/tts/NeMo_TTS_Primer.ipynb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tutorials/tts/NeMo_TTS_Primer.ipynb b/tutorials/tts/NeMo_TTS_Primer.ipynb index b6df5b91ba89..99306744dd05 100644 --- a/tutorials/tts/NeMo_TTS_Primer.ipynb +++ b/tutorials/tts/NeMo_TTS_Primer.ipynb @@ -777,10 +777,11 @@ "While raw audio shows amplitude versus time and is useful for easily recording and listening, it is not optimal when it comes to processing.\n", "\n", "For processing, it is usually preferable to represent the audio as a **spectrogram** which shows frequency versus time. Specifically, we:\n", - "\n", + "\n", "1. Group together audio samples into a much smaller set of time buckets, called **audio frames**. An audio frame will usually bucket around 50ms of audio.\n", - "2. For each audio frame, use the [Fast Fourier transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) (**FFT**) to calculate the magnitude (ie. energy, amplitude or \"loudness\") and phase (which we don't use) of each frequency band (ie. pitch).\n", - "3. Translate the original frequency bands, measured in units of hertz (Hz), into units of [mel frequency](https://en.wikipedia.org/wiki/Mel_scale). The output is called a **mel spectrogram**.\n", + "2. For each audio frame, use the [Fast Fourier transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) (**FFT**) to calculate the magnitude (ie. energy, amplitude or \"loudness\") and phase (which we don't use) of each frequency bin. We refer to the magnitudes of the frequency bins as a spectrogram\n", + "3. Map the original frequency bins onto the [mel scale](https://en.wikipedia.org/wiki/Mel_scale), using overlapped [triangular filters](https://en.wikipedia.org/wiki/Window_function#Triangular_window) to create mel filterbanks.\n", + "4. Multiply the original spectrogram by the mel filterbanks to produce a mel spectrogram (for more details see [here](https://www.mathworks.com/help/audio/ref/melspectrogram.html)).\n", "\n", "We then use the mel spectrogram as our final audio representation. 
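
The corrected primer text above walks through the spectrogram-to-mel pipeline in words; the same four steps on a synthetic signal look roughly like this (librosa used purely for illustration, this is not the code the notebook runs):

```python
import numpy as np
import librosa

sr = 22050
audio = np.random.randn(sr).astype(np.float32)  # 1 s of noise, just for shape checks

# 1-2. Frame the audio and take FFT magnitudes per frequency bin -> linear spectrogram.
spec = np.abs(librosa.stft(audio, n_fft=1024, hop_length=256))   # (513, n_frames)

# 3. Triangular filters mapping FFT bins onto the mel scale (mel filterbanks).
mel_fb = librosa.filters.mel(sr=sr, n_fft=1024, n_mels=80)       # (80, 513)

# 4. Multiply the spectrogram by the filterbanks -> mel spectrogram.
mel_spec = mel_fb @ spec                                         # (80, n_frames)
print(spec.shape, mel_spec.shape)
```
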
The only thing we lose during this process is the phase information, the implications of which we will discuss more later on.\n", "\n", From 9e723265a52436f4bbc0b1fd0119989150033c58 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Thu, 20 Apr 2023 20:30:55 +0300 Subject: [PATCH 08/23] add rampup batch size support for Megatron GPT (#6424) * added rampup batch size support Signed-off-by: Dmytro Pykhtar * added tests for rampup batch size Signed-off-by: Dmytro Pykhtar * fixed the typos Signed-off-by: Dmytro Pykhtar * added assertions Signed-off-by: Dmytro Pykhtar * changed assertion rules Signed-off-by: Dmytro Pykhtar * deleted unused imports Signed-off-by: Dmytro Pykhtar * changed tests for rampup batch size Signed-off-by: Dmytro Pykhtar * updated rampup batch size tests Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed styling Signed-off-by: Dmytro Pykhtar * rampup batch size tests changes Signed-off-by: Dmytro Pykhtar --------- Signed-off-by: Dmytro Pykhtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- .../conf/megatron_gpt_config.yaml | 1 + .../language_modeling/megatron_base_model.py | 1 + .../language_modeling/megatron_gpt_model.py | 42 +++- .../modules/common/megatron/megatron_init.py | 3 +- .../collections/nlp/test_rampup_batch_size.py | 195 ++++++++++++++++++ 5 files changed, 236 insertions(+), 6 deletions(-) create mode 100644 tests/collections/nlp/test_rampup_batch_size.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 27cb3af3ce91..09b30c08dd47 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -45,6 +45,7 @@ model: # gradient accumulation will be done automatically based on data_parallel_size micro_batch_size: 4 # limited by GPU memory global_batch_size: 8 # will use more micro batches to reach global batch size + rampup_batch_size: null # Should be a list of 3 values: [, , ] tensor_model_parallel_size: 1 # intra-layer model parallelism pipeline_model_parallel_size: 1 # inter-layer model parallelism virtual_pipeline_model_parallel_size: null # interleaved pipeline diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 3b223a5744af..5e5c177737fa 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -120,6 +120,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), + rampup_batch_size=cfg.get('rampup_batch_size'), use_fp8=cfg.get('fp8', False), seed=self.cfg.get('seed', 1234), apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index bf5799ea53c2..52a640a9efd1 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -55,6 +55,7 @@ from nemo.utils import logging try: + import apex.transformer.pipeline_parallel.utils from apex.transformer.pipeline_parallel.utils import get_num_microbatches HAVE_APEX = True @@ -427,15 +428,23 @@ def training_step(self, dataloader_iter, batch_idx): 'global_step', self.trainer.global_step, prog_bar=True, rank_zero_only=True, batch_size=1, ) + consumed_samples = self.compute_consumed_samples(self.trainer.global_step - self.init_global_step) # TODO: make sure compute_consumed_samples works for pipeline parallelism self.log( - 'consumed_samples', - self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), - prog_bar=True, - rank_zero_only=True, - batch_size=1, + 'consumed_samples', consumed_samples, prog_bar=True, rank_zero_only=True, batch_size=1, ) + if self.cfg.get('rampup_batch_size', None): + micro_batch_size = self.cfg.get('micro_batch_size', 1) + total_gpus_number = self.trainer.num_devices * self.trainer.num_nodes + current_global_batch_size = get_num_microbatches() * micro_batch_size * total_gpus_number + self.log('global_batch_size', current_global_batch_size, prog_bar=True, rank_zero_only=True, batch_size=1) + + num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR + num_microbatch_calculator.update( + consumed_samples=consumed_samples, consistency_check=True, + ) + return loss_mean def backward(self, *args, **kwargs): @@ -815,6 +824,29 @@ def setup(self, stage=None): self.init_consumed_samples = init_consumed_samples self.init_global_step = self.trainer.global_step + rampup_batch_size = self.cfg.get('rampup_batch_size', None) + if rampup_batch_size: + start_batch_size = rampup_batch_size[0] + batch_size_increment = rampup_batch_size[1] + total_gpus_number = self.trainer.num_devices * self.trainer.num_nodes + + assert start_batch_size % (total_gpus_number) == 0, ( + 'expected' + ' start batch size ({}) to be divisible by total number of GPUs' + ' ({})'.format(start_batch_size, total_gpus_number) + ) + + micro_batch_size = self.cfg.get('micro_batch_size', 1) + tensor_model_parallel_size = self.cfg.get('tensor_model_parallel_size', 1) + pipeline_model_parallel_size = self.cfg.get('pipeline_model_parallel_size', 1) + total_data_parallel_size = total_gpus_number // (tensor_model_parallel_size * pipeline_model_parallel_size) + + assert batch_size_increment % (micro_batch_size * total_data_parallel_size) == 0, ( + 'expected' + ' batch size increment ({}) to be divisible by micro_batch_size ({}) times total data parallel size' + ' ({})'.format(batch_size_increment, micro_batch_size, total_data_parallel_size) + ) + if stage == 'predict': return else: diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 65a788de438c..e0551fad5d16 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -65,6 +65,7 @@ def initialize_model_parallel_for_nemo( pipeline_model_parallel_split_rank=None, micro_batch_size=None, global_batch_size=None, + rampup_batch_size=None, use_fp8=False, seed=1234, apex_transformer_log_level=30, @@ -121,7 +122,7 @@ def initialize_model_parallel_for_nemo( global_batch_size=global_batch_size, micro_batch_size=micro_batch_size, 
data_parallel_size=app_state.data_parallel_size, - rampup_batch_size=None, + rampup_batch_size=rampup_batch_size, ) else: if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatches): diff --git a/tests/collections/nlp/test_rampup_batch_size.py b/tests/collections/nlp/test_rampup_batch_size.py new file mode 100644 index 000000000000..86af6bf51e1d --- /dev/null +++ b/tests/collections/nlp/test_rampup_batch_size.py @@ -0,0 +1,195 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest +import torch +from omegaconf import DictConfig +from pytorch_lightning import Trainer + + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + +try: + import apex.transformer.pipeline_parallel.utils + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + + HAVE_APEX = True + +except (ImportError, ModuleNotFoundError): + + HAVE_APEX = False + +DEVICE_CAPABILITY = None +if torch.cuda.is_available(): + DEVICE_CAPABILITY = torch.cuda.get_device_capability() + + +def reset_microbatch_calculator(): + apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + + +@pytest.fixture() +def model_cfg(test_data_dir): + + model_cfg = { + 'precision': 16, + 'micro_batch_size': 4, + 'global_batch_size': 16, + 'rampup_batch_size': [4, 4, 100], + 'tensor_model_parallel_size': 1, + 'pipeline_model_parallel_size': 1, + 'resume_from_checkpoint': None, + 'encoder_seq_length': 512, + 'max_position_embeddings': 512, + 'num_layers': 1, + 'hidden_size': 128, + 'ffn_hidden_size': 512, + 'num_attention_heads': 2, + 'init_method_std': 0.02, + 'hidden_dropout': 0.1, + 'kv_channels': None, + 'apply_query_key_layer_scaling': True, + 'layernorm_epsilon': 1e-5, + 'make_vocab_size_divisible_by': 128, + 'pre_process': True, + 'post_process': True, + 'persist_layer_norm': True, + 'gradient_as_bucket_view': True, + 'tokenizer': { + 'library': 'megatron', + 'type': 'GPT2BPETokenizer', + 'model': None, + 'vocab_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/vocab.json'), + 'merge_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/merges.txt'), + 'delimiter': None, + }, + 'native_amp_init_scale': 4294967296, + 'native_amp_growth_interval': 1000, + 'hysteresis': 2, + 'fp32_residual_connection': False, + 'fp16_lm_cross_entropy': False, + 'megatron_amp_O2': False, + 'seed': 1234, + 'use_cpu_initialization': False, + 'onnx_safe': False, + 'apex_transformer_log_level': 30, + 'activations_checkpoint_method': None, + 'activations_checkpoint_num_layers': 1, + 'data': { + 'data_prefix': '???', + 'index_mapping_dir': None, + 'data_impl': 'mmap', + 'splits_string': '900,50,50', + 'seq_length': 512, + 'skip_warmup': True, + 'num_workers': 2, + 'dataloader_type': 'single', + 'reset_position_ids': False, + 'reset_attention_mask': False, + 'eod_mask_loss': False, + }, + 'optim': { + 'name': 
'fused_adam', + 'lr': 2e-4, + 'weight_decay': 0.01, + 'betas': [0.9, 0.98], + 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 50000, 'min_lr': '2e-5'}, + }, + } + + return model_cfg + + +@pytest.fixture() +def trainer_cfg(): + + trainer_cfg = { + 'devices': 1, + 'num_nodes': 1, + 'accelerator': 'gpu', + 'precision': 16, + 'logger': False, + 'enable_checkpointing': False, + 'replace_sampler_ddp': False, + 'max_epochs': 1, + 'max_steps': 150, + 'log_every_n_steps': 10, + 'val_check_interval': 100, + 'limit_val_batches': 50, + 'limit_test_batches': 500, + 'accumulate_grad_batches': 1, + 'gradient_clip_val': 1.0, + } + + return trainer_cfg + + +@pytest.fixture() +def gpt_model(model_cfg, trainer_cfg): + + strategy = NLPDDPStrategy() + trainer = Trainer(strategy=strategy, **trainer_cfg) + cfg = DictConfig(model_cfg) + + reset_microbatch_calculator() + model = MegatronGPTModel(cfg, trainer) + + return model + + +@pytest.fixture() +def rampup_batch_size(): + + return [4, 4, 100] + + +@pytest.fixture() +def rampup_batch_size_schedule(): + + return [4, 8, 12, 16] + + +@pytest.mark.run_only_on('GPU') +class TestRampupBatchSize: + @pytest.mark.unit + def test_rampup_bs(self, gpt_model, rampup_batch_size): + + assert gpt_model.cfg.rampup_batch_size == rampup_batch_size + + @pytest.mark.unit + def test_rampup_bs_schedule(self, gpt_model, trainer_cfg, rampup_batch_size_schedule): + + num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR + micro_batch_size = gpt_model.cfg.micro_batch_size + num_devices = trainer_cfg["devices"] + num_nodes = trainer_cfg["num_nodes"] + max_steps = trainer_cfg["max_steps"] + + global_batch_size_schedule = [] + step, consumed_samples = 0, 0 + while step <= max_steps: + step += 1 + current_global_batch_size = get_num_microbatches() * micro_batch_size * num_devices * num_nodes + consumed_samples += current_global_batch_size + num_microbatch_calculator.update(consumed_samples=consumed_samples, consistency_check=True) + + if current_global_batch_size not in global_batch_size_schedule: + global_batch_size_schedule.append(current_global_batch_size) + + reset_microbatch_calculator() + + assert global_batch_size_schedule == rampup_batch_size_schedule From 41fcf4daccee9aa5082431f4ae89c76fc685eac9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 20 Apr 2023 15:20:49 -0400 Subject: [PATCH 09/23] Meagtron encoder decoder fix for empty validation outputs (#6459) (#6461) * 1. Meagtron encoder decoder fix for empty validation outputs. * 1. Debugging. 
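
As an aside on the ramp-up batch-size support added a couple of patches above: the new test expects the schedule [4, 8, 12, 16] for `rampup_batch_size: [4, 4, 100]` and `global_batch_size: 16`. The linear ramp is easy to sanity-check by hand; a sketch of the arithmetic only, not the Apex microbatch calculator:

```python
def rampup_schedule(start_batch_size: int, increment: int, target_global_batch_size: int):
    """Global batch sizes visited by a linear ramp-up (pacing by consumed
    samples, the third rampup_batch_size value, is ignored here)."""
    sizes, gbs = [], start_batch_size
    while gbs < target_global_batch_size:
        sizes.append(gbs)
        gbs += increment
    sizes.append(target_global_batch_size)
    return sizes

# rampup_batch_size: [4, 4, 100] with global_batch_size: 16 -> [4, 8, 12, 16]
print(rampup_schedule(4, 4, 16))
```
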
--------- Signed-off-by: Micha Livne Co-authored-by: Micha Livne Co-authored-by: Micha Livne --- .../language_modeling/megatron_lm_encoder_decoder_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index dc89165fc2af..94b2d348a61d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -659,6 +659,10 @@ def validation_step(self, dataloader_iter, batch_idx, dataloader_idx=0): return self.fwd_bwd_step(dataloader_iter, batch_idx, True) def validation_epoch_end(self, outputs): + # NOTE: we need to make sure outputs is not empty (this is a workaround for a bug in pytorch lightning (?)) + if len(outputs) == 0: + logging.warning("validation_epoch_end: outputs is empty") + return if parallel_state.is_pipeline_last_stage(): # only the last pipeline parallel stages return loss averaged_loss = torch.stack(outputs).mean() From 77f095903316d74ca28d24411ad9762bf16ad1e2 Mon Sep 17 00:00:00 2001 From: Kunal Dhawan Date: Fri, 21 Apr 2023 12:03:08 -0700 Subject: [PATCH 10/23] Code-Switching dataset creation - upgrading to aggregate tokenizer manifest format (#6448) * added functionality to create agg tokenizer compatible manifest for CS, flag to use this mode by default Signed-off-by: Kunal Dhawan * updated README with the new agg_tokenizer_manifest flag Signed-off-by: Kunal Dhawan * fixed typo in scripts/speech_recognition/code_switching/README.md Signed-off-by: Kunal Dhawan * changed agg_tokenizer_manifest to is_lid_manifest Signed-off-by: Kunal Dhawan --------- Signed-off-by: Kunal Dhawan Co-authored-by: Dima Rekesh --- .../code_switching/README.md | 4 ++- .../code_switching_audio_data_creation.py | 26 +++++++++++++++++-- .../code_switching_manifest_creation.py | 2 +- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/scripts/speech_recognition/code_switching/README.md b/scripts/speech_recognition/code_switching/README.md index 82f3772008f4..42f1601b21b4 100644 --- a/scripts/speech_recognition/code_switching/README.md +++ b/scripts/speech_recognition/code_switching/README.md @@ -10,6 +10,8 @@ Follow the 2 steps listed below in order - 2. 
Create the synthetic audio data and the corresponding manifest file using `code_switching_audio_data_creation.py` It's usage is as follows: - `python code_switching_audio_data_creation.py --manifest_path --audio_save_folder_path --manifest_save_path --audio_normalized_amplitude --cs_data_sampling_rate --sample_beginning_pause_msec --sample_joining_pause_msec --sample_end_pause_msec --workers ` + `python code_switching_audio_data_creation.py --manifest_path --audio_save_folder_path --manifest_save_path --audio_normalized_amplitude --cs_data_sampling_rate --sample_beginning_pause_msec --sample_joining_pause_msec --sample_end_pause_msec --is_lid_manifest --workers ` + Example of the multi-sample LID format: ```[{“str”:“esta muestra ” “lang”:”es”},{“str”:“was generated synthetically”: “lang”:”en”}]``` + Estimated runtime for generating a 10,000 hour corpus is ~40 hrs with a single worker \ No newline at end of file diff --git a/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py b/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py index 6c378b7cdf99..c53b3eeaac36 100644 --- a/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py +++ b/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py @@ -60,6 +60,12 @@ parser.add_argument( "--sample_end_pause_msec", default=20, type=int, help='Pause to be added at the end of the sample (msec)' ) +parser.add_argument( + "--is_lid_manifest", + default=True, + type=bool, + help='If true, generate manifest in the multi-sample lid format, else the standard manifest format', +) parser.add_argument("--workers", default=1, type=int, help='Number of worker processes') args = parser.parse_args() @@ -116,6 +122,7 @@ def create_cs_data( pause_join_msec: int, pause_end_msec: int, cs_data_sampling_rate: int, + is_lid_manifest: bool, ): """ @@ -128,6 +135,7 @@ def create_cs_data( pause_join_msec: Pause to be added between different phrases of the sample (msec) pause_end_msec: Pause to be added at the end of the sample (msec) cs_data_sampling_rate: Desired sampling rate of the generated samples + is_lid_manifest: If true, generate manifest in the multi-sample lid format, else the standard manifest format Returns: @@ -144,8 +152,12 @@ def create_cs_data( staring_pause = np.zeros(int(pause_beg_msec * fs / 1000)) combined_audio += list(staring_pause) + text_entry_list = [] for index in range(len(data['lang_ids'])): + phrase_entry = {} + # dictionary to store the phrase information which will be added to the complete sentence + data_sample, fs_sample = librosa.load(data['paths'][index], sr=fs) # Alternative- fs_sample, data_sample = wavfile.read(data['paths'][index]) @@ -170,7 +182,12 @@ def create_cs_data( combined_audio += list(data_sample_norm) - # adding small pause between gemgments + phrase_entry['str'] = data['texts'][index] + phrase_entry['lang'] = data['lang_ids'][index] + + text_entry_list.append(phrase_entry) + + # adding small pause between semgments if index != (len(data['lang_ids']) - 1): pause = np.zeros(int(pause_join_msec * fs / 1000)) combined_audio += list(pause) @@ -192,7 +209,10 @@ def create_cs_data( metadata_json = {} metadata_json['audio_filepath'] = audio_file_path metadata_json['duration'] = float(len(combined_audio) / fs) - metadata_json['text'] = ' '.join(data['texts']) + if is_lid_manifest: + metadata_json['text'] = text_entry_list + else: + metadata_json['text'] = ' '.join(data['texts']) metadata_json['language_ids'] = data['lang_ids'] 
metadata_json['original_texts'] = data['texts'] @@ -213,6 +233,7 @@ def main(): pause_join_msec = args.sample_joining_pause_msec pause_end_msec = args.sample_end_pause_msec cs_data_sampling_rate = args.cs_data_sampling_rate + is_lid_manifest = args.is_lid_manifest num_process = args.workers # Sanity Checks @@ -249,6 +270,7 @@ def main(): pause_join_msec, pause_end_msec, cs_data_sampling_rate, + is_lid_manifest, ) for idx, split_manifest in enumerate(data_split) ) diff --git a/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py b/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py index 9eca4bb6977a..c783f803a74d 100644 --- a/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py +++ b/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py @@ -20,7 +20,7 @@ # Checks - # (Recommendation) Please normalize the text for each language (avoid numbers, special characters, punctuation) -# Please ensure that the audio_fielpaths are absolute locations +# Please ensure that the audio_filepaths are absolute locations parser = argparse.ArgumentParser(description='Create synthetic code-switching data manifest from monolingual data') From 2822ff330ce9ce6394f13f46d8cd8b6379fd0fd6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 21 Apr 2023 14:41:37 -0700 Subject: [PATCH 11/23] Added/updated new Conformer configs (#6426) (#6467) --- README.rst | 32 ++- docs/source/asr/models.rst | 6 + .../conformer_ctc_bpe_streaming.yaml | 36 ++- .../conformer_transducer_bpe_streaming.yaml | 45 ++- .../asr/conf/conformer/conformer_ctc_bpe.yaml | 35 ++- .../conf/conformer/conformer_ctc_char.yaml | 7 +- .../conformer/conformer_transducer_bpe.yaml | 31 +- .../conformer/conformer_transducer_char.yaml | 27 +- .../conformer_hybrid_transducer_ctc_bpe.yaml | 41 +-- .../conformer_hybrid_transducer_ctc_char.yaml | 40 +-- .../conformer_multiblank_transducer_bpe.yaml | 15 +- .../conformer_ctc_bpe_multilang.yaml | 8 +- .../conformer_transducer_bpe_multilang.yaml | 14 +- .../conf/contextnet_rnnt/contextnet_rnnt.yaml | 2 +- .../contextnet_rnnt/contextnet_rnnt_char.yaml | 2 +- .../contextnet_rnnt_multilang.yaml | 2 +- .../fastconformer_ctc_bpe_streaming.yaml | 198 +++++++++++++ .../fastconformer_ctc_char_streaming.yaml | 205 +++++++++++++ ...astconformer_transducer_bpe_streaming.yaml | 248 ++++++++++++++++ ...stconformer_transducer_char_streaming.yaml | 256 +++++++++++++++++ .../fastconformer/fast-conformer_ctc_bpe.yaml | 35 ++- .../fast-conformer_transducer_bpe.yaml | 42 +-- ...r_hybrid_transducer_ctc_bpe_streaming.yaml | 263 +++++++++++++++++ ..._hybrid_transducer_ctc_char_streaming.yaml | 271 ++++++++++++++++++ ...stconformer_hybrid_transducer_ctc_bpe.yaml | 251 ++++++++++++++++ ...tconformer_hybrid_transducer_ctc_char.yaml | 259 +++++++++++++++++ .../fast-conformer-long_ctc_bpe.yaml | 16 +- .../fast-conformer-long_transducer_bpe.yaml | 23 +- examples/asr/conf/lstm/lstm_ctc_bpe.yaml | 6 +- .../asr/conf/lstm/lstm_transducer_bpe.yaml | 13 +- .../squeezeformer/squeezeformer_ctc_bpe.yaml | 2 +- .../squeezeformer/squeezeformer_ctc_char.yaml | 2 +- .../conf/ssl/citrinet/citrinet_ssl_1024.yaml | 2 +- .../asr/conf/ssl/conformer/conformer_ssl.yaml | 2 +- .../conf/ssl/contextnet/contextnet_ssl.yaml | 2 +- examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml | 2 +- .../conf/ssl/wav2vec/wav2vec_pretrain.yaml | 2 +- examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml | 2 +- 38 files changed, 2182 
insertions(+), 263 deletions(-) create mode 100644 examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml create mode 100644 examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml create mode 100644 examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml create mode 100644 examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml create mode 100644 examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml create mode 100644 examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml create mode 100644 examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml create mode 100644 examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml rename examples/asr/conf/fastconformer/{ => long_fastconformer}/fast-conformer-long_ctc_bpe.yaml (93%) rename examples/asr/conf/fastconformer/{ => long_fastconformer}/fast-conformer-long_transducer_bpe.yaml (93%) diff --git a/README.rst b/README.rst index a79d675abf64..fbf803b4e289 100644 --- a/README.rst +++ b/README.rst @@ -75,12 +75,20 @@ Key Features * Speech processing * `HuggingFace Space for Audio Transcription (File, Microphone and YouTube) `_ * `Automatic Speech Recognition (ASR) `_ - * Supported models: Jasper, QuartzNet, CitriNet, Conformer-CTC, Conformer-Transducer, Squeezeformer-CTC, Squeezeformer-Transducer, ContextNet, LSTM-Transducer (RNNT), LSTM-CTC, FastConformer-CTC, FastConformer-Transducer, Conformer-HAT... - * Supports CTC, Transducer/RNNT and Hybrid losses/decoders + * Supported ASR models: ``_ + * Jasper, QuartzNet, CitriNet, ContextNet + * Conformer-CTC, Conformer-Transducer, FastConformer-CTC, FastConformer-Transducer + * Squeezeformer-CTC and Squeezeformer-Transducer + * LSTM-Transducer (RNNT) and LSTM-CTC + * Supports the following decoders/losses: + * CTC + * Transducer/RNNT + * Hybrid Transducer/CTC * NeMo Original `Multi-blank Transducers `_ + * Streaming/Buffered ASR (CTC/Transducer) - `Chunked Inference Examples `_ + * Cache-aware Streaming Conformer - ``_ * Beam Search decoding * `Language Modelling for ASR `_: N-gram LM in fusion with Beam Search decoding, Neural Rescoring with Transformer - * Streaming and Buffered ASR (CTC/Transducer) - `Chunked Inference Examples `_ * `Support of long audios for Conformer with memory efficient local attention `_ * `Speech Classification, Speech Command Recognition and Language Identification `_: MatchboxNet (Command Recognition), AmberNet (LangID) * `Voice activity Detection (VAD) `_: MarbleNet @@ -98,12 +106,12 @@ Key Features * `Punctuation and Capitalization `_ * `Token classification (named entity recognition) `_ * `Text classification `_ - * `Joint Intent and Slot Classification `_ + * `Joint Intent and Slot Classification `_ * `Question answering `_ * `GLUE benchmark `_ * `Information retrieval `_ * `Entity Linking `_ - * `Dialogue State Tracking `_ + * `Dialogue State Tracking `_ * `Prompt Learning `_ * `NGC collection of pre-trained NLP models. `_ * `Synthetic Tabular Data Generation `_ @@ -170,7 +178,7 @@ We recommend installing NeMo in a fresh Conda environment. conda create --name nemo python==3.8.10 conda activate nemo -Install PyTorch using their `configurator `_. +Install PyTorch using their `configurator `_. .. 
code-block:: bash @@ -237,7 +245,7 @@ Install it manually if not using the NVIDIA PyTorch container. git checkout 57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2 pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./ -It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies. +It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies. While installing Apex, it may raise an error if the CUDA version on your system does not match the CUDA version torch was compiled with. This raise can be avoided by commenting it here: https://github.com/NVIDIA/apex/blob/master/setup.py#L32 @@ -251,13 +259,13 @@ cuda-nvprof is needed to install Apex. The version should match the CUDA version packaging is also needed: .. code-block:: bash - + pip install -y packaging Transformer Engine ~~~~~~~~~~~~~~~~~~ -NeMo Megatron GPT has been integrated with `NVIDIA Transformer Engine `_ +NeMo Megatron GPT has been integrated with `NVIDIA Transformer Engine `_ Transformer Engine enables FP8 training on NVIDIA Hopper GPUs. `Install `_ it manually if not using the NVIDIA PyTorch container. @@ -265,7 +273,7 @@ Transformer Engine enables FP8 training on NVIDIA Hopper GPUs. pip install --upgrade git+https://github.com/NVIDIA/TransformerEngine.git@stable -It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Transformer Engine or any other dependencies. +It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Transformer Engine or any other dependencies. Transformer Engine requires PyTorch to be built with CUDA 11.8. @@ -275,7 +283,7 @@ NeMo Text Processing, specifically (Inverse) Text Normalization, is now a separa Docker containers: ~~~~~~~~~~~~~~~~~~ -We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.16.0`` comes with container ``nemo:23.01``, you may find more details about released containers in `releases page `_. +We release NeMo containers alongside NeMo releases. For example, NeMo ``r1.16.0`` comes with container ``nemo:23.01``, you may find more details about released containers in `releases page `_. To use built container, please run @@ -283,7 +291,7 @@ To use built container, please run docker pull nvcr.io/nvidia/nemo:23.01 -To build a nemo container with Dockerfile from a branch, please run +To build a nemo container with Dockerfile from a branch, please run .. code-block:: bash diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst index 799ea0647bc9..b08dd0cf0a8a 100644 --- a/docs/source/asr/models.rst +++ b/docs/source/asr/models.rst @@ -154,6 +154,7 @@ This allows using the model on longer audio (up to 70 minutes with Fast Conforme can be used with limited context attention even if trained with full context. However, if you also want to use global tokens, which help aggregate information from outside the limited context, then training is required. +You may find more examples under ``/examples/asr/conf/fastconformer/``. 
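
The documentation hunk above notes that a Conformer trained with full-context attention can still run with limited-context (local) attention on long audio. A usage sketch in the spirit of those docs; the pretrained-model id, the helper name `change_attention_model`, and its arguments are assumptions to verify against the NeMo version you have installed:

```python
import nemo.collections.asr as nemo_asr

# Assumed checkpoint id and helper name; both should be checked against your NeMo install.
asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_conformer_ctc_large")

# Switch self-attention to limited-context (local) attention for long-audio inference.
asr_model.change_attention_model(
    self_attention_model="rel_pos_local_attn",  # relative-position local attention
    att_context_size=[128, 128],                # left/right context in encoder frames
)
```
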
Cache-aware Streaming Conformer ------------------------------- @@ -212,6 +213,8 @@ To simulate cache-aware streaming, you may use the script at ``/e This script can be used for models trained offline with full-context but the accuracy would not be great unless the chunk size is large enough which would result in high latency. It is recommended to train a model in streaming model with limited context for this script. More info can be found in the script. +You may find FastConformer variants of cache-aware streaming models under ``/examples/asr/conf/fastconformer/``. + .. _LSTM-Transducer_model: LSTM-Transducer @@ -284,6 +287,9 @@ You may find the example config files of Conformer variant of such hybrid models ``/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_char.yaml`` and with sub-word encoding at ``/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_bpe.yaml``. +Similar example configs for FastConformer variants of Hybrid models can be found here: +``/examples/asr/conf/fastconformer/hybrid_transducer_ctc/`` +``/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/`` .. _Conformer-HAT_model: diff --git a/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml b/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml index 0ad49f4c7261..98f23458cd86 100644 --- a/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml @@ -1,14 +1,14 @@ -# It contains the default values for training a streaming Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. +# It contains the default values for training a streaming cache-aware Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. +# Models trained with this config have limited right context which make them efficient for streaming ASR. -# Architecture and training config: -# You may find more detail on the architecture and training config at NeMo/examples/asr/comf/offline/conformer_ctc_bpe.yaml +# You may find more detail: +# Conformer's architecture config: NeMo/examples/asr/conf/conformer/conformer_ctc_bpe.yaml +# Cache-aware Streaming Conformer: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer -# Models trained with this config have limited right context which make them efficient for streaming ASR -# You may use NeMo/examples/asr/speech_to_text_streaming_infer.py to simulate/evaluate this model in cache-aware streaming mode +# You may use NeMo/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py to simulate/evaluate this model in cache-aware streaming mode +# Pre-trained ASR models can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html -# if loss does not go down properly or gives NAN, you may try the followings: -# + using gradient clipping of 1.0 -# + increase the warmup steps from 10K to 20K +# Note: if loss does not go down properly or diverges, you may try increasing the warmup steps from 10K to 20K. 
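
Before the streaming configs below, two quick sanity checks that follow directly from the comments in these YAMLs: the look-ahead implied by chunked-limited attention is `att_context_size[1] * subsampling_factor * window_stride`, and the left context must be divisible by the right context plus one:

```python
# Values used by the cache-aware streaming configs in this patch.
att_context_size = [140, 27]  # [left, right] context at the encoder frame rate
subsampling_factor = 4
window_stride = 0.01          # seconds per feature frame

# look-ahead(secs) = att_context_size[1] * subsampling_factor * window_stride
lookahead_sec = att_context_size[1] * subsampling_factor * window_stride
left_sec = att_context_size[0] * subsampling_factor * window_stride
print(f"look-ahead: {lookahead_sec:.2f}s, left context: {left_sec:.2f}s")  # 1.08s, 5.60s

# chunked_limited requires the left context to be divisible by (right context + 1).
assert att_context_size[0] % (att_context_size[1] + 1) == 0
```
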
name: "Conformer-CTC-BPE-Streaming" @@ -24,8 +24,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -41,18 +39,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # recommend small vocab size of 128 or 256 when using 4x sub-sampling # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py @@ -63,7 +61,7 @@ model: preprocessor: _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor sample_rate: ${model.sample_rate} - normalize: "per_feature" + normalize: "NA" # No normalization for mel-spectogram makes streaming easier window_size: 0.025 window_stride: 0.01 window: "hann" @@ -89,9 +87,8 @@ model: d_model: 512 # Sub-sampling params - # stacking and stacking_norm would result in significant accuracy degradation and training instability with CTC models, recommend to use striding - # stacking_norm is more stable and robust for CTC models and can be around 25% faster during inference - subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding + # stacking_norm, stacking and dw_striding can be around 25% faster than striding during inference, while they may give similar or slightly worse results in terms of accuracy for Transducer models + subsampling: striding # vggnet, striding, stacking, stacking_norm, or dw_striding subsampling_factor: 4 # must be power of 2 for striding and vggnet subsampling_conv_channels: -1 # -1 sets it to d_model causal_downsampling: true # enables causal convolutions during downsampling @@ -106,7 +103,9 @@ model: # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - att_context_size: [102, 33] # -1 means unlimited context + # for chunked_limited you may calculate the look-ahead or right context by the following formula: + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 27*4*0.01=1.08s + att_context_size: [140, 27] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited xscaling: true # scales up the input embeddings by sqrt(d_model) @@ -161,7 +160,6 @@ model: d_model: ${model.encoder.d_model} # scheduler config override warmup_steps: 10000 # you may try larger warmup like 20K is training is not stable - warmup_ratio: null min_lr: 1e-6 trainer: @@ -174,7 +172,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 1.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. 
enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml b/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml index cd6b7d4a6d45..9d6e3a54d9fe 100644 --- a/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml @@ -1,15 +1,14 @@ -# It contains the default values for training a streaming Conformer-Transducer ASR model, large size (~120M) with Transducer loss and sub-word encoding. +# It contains the default values for training a streaming cache-aware Conformer-Transducer ASR model, large size (~120M) with Transducer loss and sub-word encoding. +# Models trained with this config have limited right context which make them efficient for streaming ASR. -# Architecture and training config: -# You may find more detail on the architecture and training config at NeMo/examples/asr/comf/offline/conformer_transducer_bpe.yaml +# You may find more detail: +# Conformer's architecture config: NeMo/examples/asr/conf/conformer/conformer_transducer_bpe.yaml +# Cache-aware Streaming Conformer: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer -# Models trained with this config have limited right context which make them efficient for streaming ASR -# You may use NeMo/examples/asr/speech_to_text_streaming_infer.py to simulate/evaluate this model in cache-aware streaming mode +# You may use NeMo/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py to simulate/evaluate this model in cache-aware streaming mode +# Pre-trained ASR models can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html -# if loss does not go down properly or gives NAN, you may try the followings by order: -# + using gradient clipping of 1.0 -# + increase the warmup steps from 10K to 20K -# + use striding instead of stacking for downsampling +# Note: if loss does not go down properly or diverges, you may try increasing the warmup steps from 10K to 20K. 
name: "Conformer-Transducer-BPE-Streaming" @@ -30,8 +29,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -47,18 +44,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py tokenizer: @@ -68,7 +65,7 @@ model: preprocessor: _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor sample_rate: ${model.sample_rate} - normalize: "per_feature" + normalize: "NA" # No normalization for mel-spectogram makes streaming easier window_size: 0.025 window_stride: 0.01 window: "hann" @@ -93,8 +90,8 @@ model: d_model: 512 # Sub-sampling params - # stacking_norm and stacking can be around 25% faster than striding during inference, while they may give similar or slightly worse results in terms of accuracy for Transducer models - subsampling: striding # vggnet, striding, stacking or stacking_norm + # stacking_norm, stacking and dw_striding can be around 25% faster than striding during inference, while they may give similar or slightly worse results in terms of accuracy for Transducer models + subsampling: striding # vggnet, striding, stacking, stacking_norm, or dw_striding subsampling_factor: 4 # must be power of 2 for striding and vggnet subsampling_conv_channels: -1 # -1 sets it to d_model causal_downsampling: true # enables causal convolutions during downsampling @@ -116,7 +113,9 @@ model: # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - att_context_size: [102, 33] # -1 means unlimited context + # for chunked_limited you may calculate the look-ahead or right context by the following formula: + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 27*4*0.01=1.08s + att_context_size: [140, 27] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited xscaling: true # scales up the input embeddings by sqrt(d_model) @@ -169,7 +168,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -201,11 +200,6 @@ model: fastemit_lambda: 1e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. 
- # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - optim: name: adamw lr: 5.0 @@ -219,7 +213,6 @@ model: d_model: ${model.encoder.d_model} # scheduler config override warmup_steps: 10000 # you may try larger warmup like 20K is training is not stable - warmup_ratio: null min_lr: 1e-6 trainer: @@ -231,8 +224,8 @@ trainer: accelerator: auto strategy: ddp accumulate_grad_batches: 1 - gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/conformer/conformer_ctc_bpe.yaml b/examples/asr/conf/conformer/conformer_ctc_bpe.yaml index f80d6c87c707..d605b9c25903 100644 --- a/examples/asr/conf/conformer/conformer_ctc_bpe.yaml +++ b/examples/asr/conf/conformer/conformer_ctc_bpe.yaml @@ -1,8 +1,6 @@ # It contains the default values for training a Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. # Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. # Here are the recommended configs for different variants of Conformer-CTC, other parameters are the same as in this config file. # One extra layer (compared to original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one. # @@ -18,14 +16,31 @@ # | XLarge (635M)| 1024 | 8 | 24 | 5 | 10 | 6.4 | # +--------------+---------+--------+-----------+------------------+------------+-----+ # -# If you do not want to train with AMP, you may use weight decay of 0.0 or reduce the number of time maskings to 2 -# with time_width=100. It may help when you want to train for fewer epochs and need faster convergence. -# With weight_decay=0.0, learning rate may need to get reduced to 2.0. +# Default learning parameters in this config are set for global batch size of 2K while you may use lower values. +# To increase the global batch size with limited number of GPUs, you may use higher accumulate_grad_batches. +# However accumulate_grad_batches is better to be avoided as long as the global batch size is large enough and training is stable. # You may find more info about Conformer-CTC here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-ctc # Pre-trained models of Conformer-CTC can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html # The checkpoint of the large model trained on LibriSpeech with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls +# We suggest to use trainer.precision=bf16 for GPUs which support it otherwise trainer.precision=16 is recommended. +# Using bf16 or 16 would make it possible to double the batch size and speedup training/inference. If fp16 is not stable and model diverges after some epochs, you may use fp32. 
+# Here are the suggested batch size per GPU for each precision and memory sizes: +# +-----------+------------+------------+ +# | Precision | GPU Memory | Batch Size | +# +===========+============+============+ +# | 32 | 16GB | 8 | +# | | 32GB | 16 | +# | | 80GB | 32 | +# +-----------+------------+------------+ +# | 16 or | 16GB | 16 | +# | bf16 | 32GB | 32 | +# | | 80GB | 64 | +# +-----------+------------+------------+ +# Note: They are based on the assumption of max_duration of 20. If you have longer or shorter max_duration, then batch sizes may need to get updated accordingly. + + name: "Conformer-CTC-BPE" model: @@ -41,8 +56,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -58,20 +71,20 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false - # recommend small vocab size of 128 or 256 when using 4x sub-sampling + # recommend to SPE Unigram tokenizer with small vocab size of 128 or 256 when using 4x sub-sampling # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py tokenizer: dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe) @@ -188,7 +201,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/conformer/conformer_ctc_char.yaml b/examples/asr/conf/conformer/conformer_ctc_char.yaml index e56057bd5588..4a52c5157d6d 100644 --- a/examples/asr/conf/conformer/conformer_ctc_char.yaml +++ b/examples/asr/conf/conformer/conformer_ctc_char.yaml @@ -1,6 +1,5 @@ # It contains the default values for training a Conformer-CTC ASR model, large size (~120M) with CTC loss and char-based vocabulary. -# Char-based encoding may give lower accuracy than sub-word encoding (conformer_ctc_bpe.yaml). - +# Char-based encoding may give lower accuracy than sub-word encoding for some languages (conformer_ctc_bpe.yaml). # You may find more detail on Conformer-CTC at `examples/asr/conf/conformer/conformer_ctc_bpe.yaml` name: "Conformer-CTC-Char" @@ -38,6 +37,7 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true @@ -47,6 +47,7 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true @@ -161,7 +162,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. 
+ precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/conformer/conformer_transducer_bpe.yaml b/examples/asr/conf/conformer/conformer_transducer_bpe.yaml index 76cfbdde69ac..0c7a3e05bd9e 100644 --- a/examples/asr/conf/conformer/conformer_transducer_bpe.yaml +++ b/examples/asr/conf/conformer/conformer_transducer_bpe.yaml @@ -17,10 +17,30 @@ # | XLarge (644M)| 1024 | 8 | 24 | 5 | 1e-3 | 640 | 2 | # +--------------+---------+--------+-----------+------------------+--------------+--------------------------+-----------------+ +# Default learning parameters in this config are set for global batch size of 2K while you may use lower values. +# To increase the global batch size with limited number of GPUs, you may use higher accumulate_grad_batches. +# However accumulate_grad_batches is better to be avoided as long as the global batch size is large enough and training is stable. + # You may find more info about Conformer-Transducer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-transducer # Pre-trained models of Conformer-Transducer can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html # The checkpoint of the large model trained on NeMo ASRSET with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large +# We suggest to use trainer.precision=bf16 for GPUs which support it otherwise trainer.precision=16 is recommended. +# Using bf16 or 16 would make it possible to double the batch size and speedup training/inference. If fp16 is not stable and model diverges after some epochs, you may use fp32. +# Here are the suggested batch size per GPU for each precision and memory sizes: +# +-----------+------------+------------+ +# | Precision | GPU Memory | Batch Size | +# +===========+============+============+ +# | 32 | 16GB | 8 | +# | | 32GB | 16 | +# | | 80GB | 32 | +# +-----------+------------+------------+ +# | 16 or | 16GB | 16 | +# | bf16 | 32GB | 32 | +# | | 80GB | 64 | +# +-----------+------------+------------+ +# Note: They are based on the assumption of max_duration of 20. If you have longer or shorter max_duration, then batch sizes may need to get updated accordingly. + name: "Conformer-Transducer-BPE" model: @@ -41,8 +61,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -58,20 +76,21 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # recommend to use SPE Unigram tokenizer with vocab size of 1K to 4k when using 4x sub-sampling tokenizer: dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) @@ -173,7 +192,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -235,7 +254,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/conformer/conformer_transducer_char.yaml b/examples/asr/conf/conformer/conformer_transducer_char.yaml index 977287fd7890..63a462cf026c 100644 --- a/examples/asr/conf/conformer/conformer_transducer_char.yaml +++ b/examples/asr/conf/conformer/conformer_transducer_char.yaml @@ -1,24 +1,5 @@ # It contains the default values for training a Conformer-Transducer ASR model, large size (~120M) with Transducer loss and char-based vocabulary. - -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. -# Here are the recommended configs for different variants of Conformer-Transducer, other parameters are the same as in this config file. -# -# +-------------+---------+---------+----------+--------------+--------------------------+ -# | Model | d_model | n_heads | n_layers | weight_decay | pred_hidden/joint_hidden | -# +=============+=========+========+===========+==============+==========================+ -# | Small (14M)| 176 | 4 | 16 | 0.0 | 320 | -# +-------------+---------+--------+-----------+--------------+--------------------------+ -# | Medium (32M)| 256 | 4 | 16 | 1e-3 | 640 | -# +-------------+---------+--------+-----------+--------------+--------------------------+ -# | Large (120M)| 512 | 8 | 17 | 1e-3 | 640 | -# +-----------------------------------------------------------+--------------------------+ -# - -# You may find more info about Conformer-Transducer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-transducer -# Pre-trained models of Conformer-Transducer can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html -# The checkpoint of the large model trained on NeMo ASRSET with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large +# You may find more detail on Conformer-Transducer at `examples/asr/conf/conformer/conformer_transducer_bpe.yaml` name: "Conformer-Transducer-Char" @@ -59,6 +40,7 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true @@ -67,6 +49,7 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true @@ -167,7 +150,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. 
# Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -229,7 +212,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_bpe.yaml b/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_bpe.yaml index 1eab770a3090..90f56c7fddb1 100644 --- a/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_bpe.yaml +++ b/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_bpe.yaml @@ -1,25 +1,11 @@ -# It contains the default values for training a Conformer-Hybrid-Transducer-CTC ASR model, large size (~120M) with Transducer loss and sub-word encoding. +# It contains the default values for training a Conformer-Hybrid-Transducer-CTC ASR model, large size (~120M) with sub-word encoding. # The model would have two decoders: RNNT (Transducer) and CTC -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. -# Here are the recommended configs for different variants of Conformer-Transducer, other parameters are the same as in this config file. 
-# -# +-------------+---------+---------+----------+--------------+--------------------------+ -# | Model | d_model | n_heads | n_layers | weight_decay | pred_hidden/joint_hidden | -# +=============+=========+========+===========+==============+==========================+ -# | Small (14M)| 176 | 4 | 16 | 0.0 | 320 | -# +-------------+---------+--------+-----------+--------------+--------------------------+ -# | Medium (32M)| 256 | 4 | 16 | 1e-3 | 640 | -# +-------------+---------+--------+-----------+--------------+--------------------------+ -# | Large (120M)| 512 | 8 | 17 | 1e-3 | 640 | -# +-----------------------------------------------------------+--------------------------+ -# - -# You may find more info about Conformer-Transducer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-transducer -# Pre-trained models of Conformer-Transducer can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html -# The checkpoint of the large model trained on NeMo ASRSET with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large +# You may find more detail: +# Conformer's architecture config: NeMo/examples/asr/conf/conformer/conformer_ctc_bpe.yaml +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc + +# Pre-trained ASR models can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html name: "Conformer-Hybrid-Transducer-CTC-BPE" @@ -41,8 +27,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -58,18 +42,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py tokenizer: @@ -173,7 +157,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -226,11 +210,6 @@ model: fastemit_lambda: 0.0 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - optim: name: adamw lr: 5.0 @@ -257,7 +236,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
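The fused_batch_size change above (and in the other transducer configs in this patch) follows the in-file comments about running the prediction net, joint net and loss on sub-batches of the encoder batch. The snippet below is only a minimal sketch of that sub-batching arithmetic, with toy stand-ins for the joint and loss (all names here are hypothetical); it is not NeMo's RNNTJoint implementation.

    # Minimal sketch: evaluate the joint/loss on slices of size fused_batch_size so the large
    # (batch, T, U, hidden) joint tensor is never materialized for the whole batch at once.
    import torch

    def fused_joint_loss(enc, dec, joint_fn, loss_fn, fused_batch_size=4):
        losses = []
        for start in range(0, enc.size(0), fused_batch_size):
            sl = slice(start, start + fused_batch_size)
            logits = joint_fn(enc[sl], dec[sl])  # joint net on a small sub-batch
            losses.append(loss_fn(logits))       # loss on the same sub-batch
        return torch.stack(losses).mean()

    enc = torch.randn(16, 100, 640)  # (batch, T, enc_hidden); batch matches *_ds.batch_size
    dec = torch.randn(16, 25, 640)   # (batch, U, pred_hidden)
    toy_joint = lambda e, d: e.unsqueeze(2) + d.unsqueeze(1)  # toy joint -> (sub_batch, T, U, hidden)
    toy_loss = lambda logits: logits.pow(2).mean()            # placeholder for the real transducer loss
    print(fused_joint_loss(enc, dec, toy_joint, toy_loss))    # batch 16 with fused_batch_size 4 -> 1:4 ratio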
diff --git a/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_char.yaml b/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_char.yaml index 4870a709142d..a7d0ef5dfa56 100644 --- a/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_char.yaml +++ b/examples/asr/conf/conformer/hybrid_transducer_ctc/conformer_hybrid_transducer_ctc_char.yaml @@ -1,24 +1,11 @@ -# It contains the default values for training a Conformer-Hybrid-Transducer-CTC ASR model, large size (~120M) with Transducer loss and char-based vocabulary. - -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. -# Here are the recommended configs for different variants of Conformer-Transducer, other parameters are the same as in this config file. -# -# +-------------+---------+---------+----------+--------------+--------------------------+ -# | Model | d_model | n_heads | n_layers | weight_decay | pred_hidden/joint_hidden | -# +=============+=========+========+===========+==============+==========================+ -# | Small (14M)| 176 | 4 | 16 | 0.0 | 320 | -# +-------------+---------+--------+-----------+--------------+--------------------------+ -# | Medium (32M)| 256 | 4 | 16 | 1e-3 | 640 | -# +-------------+---------+--------+-----------+--------------+--------------------------+ -# | Large (120M)| 512 | 8 | 17 | 1e-3 | 640 | -# +-----------------------------------------------------------+--------------------------+ -# - -# You may find more info about Conformer-Transducer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-transducer -# Pre-trained models of Conformer-Transducer can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html -# The checkpoint of the large model trained on NeMo ASRSET with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large +# It contains the default values for training a Conformer-Hybrid-Transducer-CTC ASR model, large size (~120M) with char-based vocabulary. +# The model would have two decoders: RNNT (Transducer) and CTC + +# You may find more detail: +# Conformer's architecture config: NeMo/examples/asr/conf/conformer/conformer_ctc_char.yaml +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc + +# Pre-trained ASR models can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html name: "Conformer-Hybrid-Transducer-CTC-Char" @@ -59,6 +46,7 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true @@ -67,6 +55,7 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true @@ -167,7 +156,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. 
fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -220,11 +209,6 @@ model: fastemit_lambda: 0.0 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - optim: name: adamw lr: 5.0 @@ -251,7 +235,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/conformer/multiblank/conformer_multiblank_transducer_bpe.yaml b/examples/asr/conf/conformer/multiblank/conformer_multiblank_transducer_bpe.yaml index bf30db4d17fb..84d767e4a3b5 100644 --- a/examples/asr/conf/conformer/multiblank/conformer_multiblank_transducer_bpe.yaml +++ b/examples/asr/conf/conformer/multiblank/conformer_multiblank_transducer_bpe.yaml @@ -52,8 +52,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -69,18 +67,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py tokenizer: @@ -171,7 +169,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -199,11 +197,6 @@ model: tsd_max_sym_exp: 50 # for Time Synchronous Decoding alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - optim: name: adamw lr: 5.0 @@ -230,7 +223,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
diff --git a/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml b/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml index 4bf20bef3013..05a35b941611 100644 --- a/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml +++ b/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml @@ -39,8 +39,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -56,18 +54,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # recommend small vocab size of 128 or 256 when using 4x sub-sampling # you may find more detail on how to train a monolingual tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py @@ -173,7 +171,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml b/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml index 0f827799b892..afacd1d2f7c3 100644 --- a/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml +++ b/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml @@ -40,7 +40,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 @@ -57,18 +56,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # You may find more detail on how to train a monolingual tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py tokenizer: @@ -172,7 +171,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -203,11 +202,6 @@ model: fastemit_lambda: 0.0 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. 
- # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - optim: name: adamw lr: 5.0 @@ -234,7 +228,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml index 596434cd8a79..501f11b9d53f 100644 --- a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml +++ b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml @@ -416,7 +416,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} diff --git a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml index b190d7159529..08ea55f59179 100644 --- a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml +++ b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml @@ -418,7 +418,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} diff --git a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml index 34ffdc923efb..57ba09aabb62 100644 --- a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml +++ b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml @@ -423,7 +423,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml new file mode 100644 index 000000000000..c68b30a33d5a --- /dev/null +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_bpe_streaming.yaml @@ -0,0 +1,198 @@ +# It contains the default values for training a cache-aware streaming FastConformer-CTC ASR model, large size (~115M) with sub-word encoding. 
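+
+# As a rough worked example of the streaming latency implied by the defaults in this config (this just restates the
+# look-ahead formula given in the encoder section below; the numbers are illustrative, not a tuned recommendation):
+# with att_context_size: [70, 13], subsampling_factor: 8 and window_stride: 0.01,
+# look-ahead = 13 * 8 * 0.01 = 1.04s and left (history) context = 70 * 8 * 0.01 = 5.6s,
+# and the left context (70) is divisible by the right context plus one (13 + 1 = 14), as required for chunked_limited.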
+ +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-CTC's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml + +name: "FastConformer-CTC-BPE-Streaming" + +model: + sample_rate: 16000 + log_prediction: true # enables logging sample predictions in the output during training + ctc_reduction: 'mean_batch' + skip_nan_grad: false + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + max_duration: 20 # you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the 
right context plus one + # for chunked_limited you may calculate the look-ahead or right context by the following formula: + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + att_context_size: [70, 13] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: [] + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + optim: + name: adamw + lr: 2.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 1000 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
+ num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml new file mode 100644 index 000000000000..654895ec065d --- /dev/null +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_ctc_char_streaming.yaml @@ -0,0 +1,205 @@ +# It contains the default values for training a cache-aware streaming FastConformer-CTC ASR model, large size (~115M) with char-based vocabulary. + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-CTC's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml + +name: "FastConformer-CTC-Char-Streaming" + +model: + sample_rate: 16000 + log_prediction: true # enables logging sample predictions in the output during training + ctc_reduction: 'mean_batch' + skip_nan_grad: false + + labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + trim_silence: false + max_duration: 20 # you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? 
+ sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectrogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need a different output size than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: dw_striding # vggnet, striding, stacking, stacking_norm, or dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Reduction parameters: Can be used to add another subsampling layer at a given position. + # Having a 2x reduction will speed up training and inference while keeping a similar WER. + # Adding it at the end will give the best WER while adding it at the beginning will give the best speedup. + reduction: null # pooling, striding, or null + reduction_position: null # Encoder block index or -1 for subsampling at the end of encoder + reduction_factor: 1 + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 2, as multiple layers may make the effective right context too large + # for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + att_context_size: [70, 13] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Causal convolutions are recommended, as non-causal convolutions would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings +
dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: [] + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + optim: + name: adamw + lr: 2.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 1000 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + + # you need to set these two to True to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml new file mode 100644 index 000000000000..94e8d56d6e5b --- /dev/null +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_bpe_streaming.yaml @@ -0,0 +1,248 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. 
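+
+# If you need a different look-ahead, att_context_size in the encoder section below can be recomputed from its
+# formula look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride. As an illustrative example
+# (not a tuned recommendation): for about 0.5s of look-ahead with subsampling_factor: 8 and window_stride: 0.01,
+# att_context_size[1] = 0.5 / (8 * 0.01) = 6.25, rounded down to 6; the left context must then stay divisible by
+# 6 + 1 = 7, e.g. att_context_size: [70, 6] (which gives 6 * 8 * 0.01 = 0.48s of look-ahead).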
+ +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Transducer-BPE-Streaming" + +model: + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + max_duration: 20 # you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # for chunked_limited you may calculate the look-ahead or right context by the following formula: + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + att_context_size: [70, 13] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. 
+ random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. + + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 1000 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
+ num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml new file mode 100644 index 000000000000..71fbad88aeb2 --- /dev/null +++ b/examples/asr/conf/fastconformer/cache_aware_streaming/fastconformer_transducer_char_streaming.yaml @@ -0,0 +1,256 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with char-based vocabulary. + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Transducer-Char-Streaming" + +model: + sample_rate: &sample_rate 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + trim_silence: false + max_duration: 20 # you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? 
+ sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: *sample_rate + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Reduction parameters: Can be used to add another subsampling layer at a given position. + # Having a 2x reduction will speedup the training and inference speech while keeping similar WER. + # Adding it at the end will give the best WER while adding it at the beginning will give the best speedup. + reduction: null # pooling, striding, or null + reduction_position: null # Encoder block index or -1 for subsampling at the end of encoder + reduction_factor: 1 + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 2 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + att_context_size: [70, 13] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 
0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. + + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 1000 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
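The encoder comments in the streaming config above define the look-ahead as att_context_size[1] * subsampling_factor * window_stride, and require the left context to be divisible by the right context plus one when att_context_style is chunked_limited. A minimal sketch (hypothetical helper names, not part of NeMo) that checks both for the values in this config:

def lookahead_seconds(att_context_size, subsampling_factor, window_stride):
    # seconds of future audio each encoder frame is allowed to attend to
    return att_context_size[1] * subsampling_factor * window_stride

def valid_chunked_context(att_context_size):
    # chunked_limited: left context must be divisible by (right context + 1)
    left, right = att_context_size
    return left % (right + 1) == 0

print(lookahead_seconds([70, 13], 8, 0.01))  # 1.04 s, matching the comment above
print(valid_chunked_context([70, 13]))       # True: 70 % 14 == 0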
+ num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + + # you need to set these two to True to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml index ff9b0bc44e21..8c7561381299 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml @@ -1,10 +1,27 @@ # It contains the default values for training a Fast Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. - -# You may find more info about Fast Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# You may find more info about FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer + +# We suggest to use trainer.precision=bf16 for GPUs which support it otherwise trainer.precision=16 is recommended. +# Using bf16 or 16 would make it possible to double the batch size and speedup training/inference. If fp16 is not stable and model diverges after some epochs, you may use fp32. +# Here are the suggested batch size per GPU for each precision and memory sizes: + +# +-----------+------------+------------+ +# | Precision | GPU Memory | Batch Size | +# +===========+============+============+ +# | 32 | 16GB | 16 | +# | | 32GB | 32 | +# | | 80GB | 64 | +# +-----------+------------+------------+ +# | fp16 or | 16GB | 32 | +# | bf16 | 32GB | 64 | +# | | 80GB | 128 | +# +-----------+------------+------------+ +# Note: They are based on the assumption of max_duration of 20. If you have longer or shorter max_duration, then batch sizes may need to get updated accordingly. + +# Default learning parameters in this config are set for global batch size of 2K while you may use lower values. +# To increase the global batch size with limited number of GPUs, you may use higher accumulate_grad_batches. +# However accumulate_grad_batches is better to be avoided as long as the global batch size is large enough and training is stable. 
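The header comments above reason in terms of a global batch size of about 2K. As a quick sanity check (plain arithmetic, not a NeMo API), the global batch size is the per-GPU batch size times the number of GPUs times accumulate_grad_batches:

def global_batch_size(per_gpu_batch_size, num_gpus, accumulate_grad_batches=1):
    # effective batch size seen by the optimizer per update step
    return per_gpu_batch_size * num_gpus * accumulate_grad_batches

# e.g. bf16 on 32 GB GPUs (batch 64 per GPU) with 32 GPUs already reaches 2048
print(global_batch_size(64, 32))       # 2048
# with only 8 GPUs the same global batch can be recovered via accumulation,
# at the cost discussed in the comment above
print(global_batch_size(64, 8, 4))     # 2048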
name: "FastConformer-CTC-BPE" @@ -21,8 +38,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -38,18 +53,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # recommend vocab size of 128 or 256 when training on ~1k hr datasets and 1k vocab size on 10+k hr datasets # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py @@ -167,7 +182,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml index a85b9c29b82f..0b0ec78e077d 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml @@ -1,10 +1,27 @@ # It contains the default values for training a Fast Conformer-Transducer ASR model, large size (~120M) with Transducer loss and sub-word encoding. -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. - -# You may find more info about Fast Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# You may find more info about FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer + +# We suggest to use trainer.precision=bf16 for GPUs which support it otherwise trainer.precision=16 is recommended. +# Using bf16 or 16 would make it possible to double the batch size and speedup training/inference. If fp16 is not stable and model diverges after some epochs, you may use fp32. +# Here are the suggested batch size per GPU for each precision and memory sizes: + +# +-----------+------------+------------+ +# | Precision | GPU Memory | Batch Size | +# +===========+============+============+ +# | 32 | 16GB | 16 | +# | | 32GB | 32 | +# | | 80GB | 64 | +# +-----------+------------+------------+ +# | fp16 or | 16GB | 32 | +# | bf16 | 32GB | 64 | +# | | 80GB | 128 | +# +-----------+------------+------------+ +# Note: They are based on the assumption of max_duration of 20. If you have longer or shorter max_duration, then batch sizes may need to get updated accordingly. + +# Default learning parameters in this config are set for global batch size of 2K while you may use lower values. 
+# To increase the global batch size with limited number of GPUs, you may use higher accumulate_grad_batches. +# However accumulate_grad_batches is better to be avoided as long as the global batch size is large enough and training is stable. name: "FastConformer-Transducer-BPE" @@ -27,8 +44,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -44,18 +59,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py tokenizer: @@ -164,7 +179,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -195,11 +210,6 @@ model: fastemit_lambda: 0.0 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - optim: name: adamw lr: 5e-3 @@ -225,7 +235,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml new file mode 100644 index 000000000000..619a608fa86f --- /dev/null +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -0,0 +1,263 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. 
+# The model would have two decoders: RNNT (Transducer) and CTC + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Hybrid-Transducer-CTC-BPE-Streaming" + +model: + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + max_duration: 20 # you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + att_context_size: [70, 13] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. 
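The conv_context_size comment above encodes a simple invariant: conv_context_size[0] + conv_context_size[1] + 1 == conv_kernel_size, with 'causal' putting all of the context on the left and null splitting it symmetrically. A small illustrative helper (assumed names, not NeMo code):

def conv_context(kernel_size, mode="causal"):
    # null -> symmetric padding; 'causal' -> all context on the left
    if mode == "causal":
        left, right = kernel_size - 1, 0
    else:
        left = right = (kernel_size - 1) // 2
    assert left + right + 1 == kernel_size
    return [left, right]

print(conv_context(9, "causal"))  # [8, 0] for conv_kernel_size: 9
print(conv_context(9, None))      # [4, 4]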
+ + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. + + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder + aux_ctc: + ctc_loss_weight: 0.3 # the weight used to combine the CTC loss with the RNNT loss + use_cer: false + ctc_reduction: 'mean_batch' + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: [] + decoding: + strategy: "greedy" + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 1000 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
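The scheduler block above (NoamAnnealing with lr: 5.0, d_model: 512, warmup_steps: 10000) follows the familiar Noam shape: warm up linearly, then decay with the inverse square root of the step. A rough sketch of that curve, assuming the classic formula with a min_lr floor; this is illustrative, not the exact NeMo implementation:

def noam_lr(step, lr=5.0, d_model=512, warmup_steps=10000, min_lr=1e-6):
    # lr * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5), floored at min_lr
    step = max(step, 1)
    value = lr * (d_model ** -0.5) * min(step ** -0.5, step * (warmup_steps ** -1.5))
    return max(value, min_lr)

for s in (100, 10_000, 100_000):
    print(s, noam_lr(s))  # rises during warmup, peaks near step 10k, then decays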
+ num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml new file mode 100644 index 000000000000..fc3176485b34 --- /dev/null +++ b/examples/asr/conf/fastconformer/hybrid_cache_aware_streaming/fastconformer_hybrid_transducer_ctc_char_streaming.yaml @@ -0,0 +1,271 @@ +# It contains the default values for training a Conformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with char-based vocabulary. +# The model would have two decoders: RNNT (Transducer) and CTC + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Hybrid-Transducer-CTC-Char-Streaming" + +model: + sample_rate: &sample_rate 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + trim_silence: false + max_duration: 20 # you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? 
+ sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: *sample_rate + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Reduction parameters: Can be used to add another subsampling layer at a given position. + # Having a 2x reduction will speedup the training and inference speech while keeping similar WER. + # Adding it at the end will give the best WER while adding it at the beginning will give the best speedup. + reduction: null # pooling, striding, or null + reduction_position: null # Encoder block index or -1 for subsampling at the end of encoder + reduction_factor: 1 + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + att_context_size: [70, 13] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 
0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. + + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder + aux_ctc: + ctc_loss_weight: 0.3 # the weight used to combine the CTC loss with the RNNT loss + use_cer: false + ctc_reduction: 'mean_batch' + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: ${model.labels} + decoding: + strategy: "greedy" + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). 
Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 1000 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + + # you need to set these two to True to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml new file mode 100644 index 000000000000..3b7d37a39bc6 --- /dev/null +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_bpe.yaml @@ -0,0 +1,251 @@ +# It contains the default values for training a FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. +# The model would have two decoders: RNNT (Transducer) and CTC + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Hybrid-Transducer-CTC-BPE" + +model: + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. 
Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + max_duration: 20 # you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "per_feature" + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + att_context_size: [-1, -1] # -1 means unlimited context + att_context_style: regular # regular or chunked_limited + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + 
stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. + + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder + aux_ctc: + ctc_loss_weight: 0.3 # the weight used to combine the CTC loss with the RNNT loss + use_cer: false + ctc_reduction: 'mean_batch' + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: [] + decoding: + strategy: "greedy" + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 1000 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. 
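The interctc comment above states that the final-layer loss coefficient is adjusted automatically to 1 minus the sum of the intermediate weights (0.7 when loss_weights is [0.3]). A toy illustration of that weighting (hypothetical function name, not the NeMo API):

def combine_ctc_losses(final_loss, intermediate_losses, loss_weights):
    # final-layer coefficient is 1 - sum(loss_weights), e.g. [0.3] -> 0.7
    final_coeff = 1.0 - sum(loss_weights)
    return final_coeff * final_loss + sum(
        w * l for w, l in zip(loss_weights, intermediate_losses)
    )

print(combine_ctc_losses(2.0, [3.0], [0.3]))  # 0.7 * 2.0 + 0.3 * 3.0 = 2.3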
+ enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml new file mode 100644 index 000000000000..c87a66227043 --- /dev/null +++ b/examples/asr/conf/fastconformer/hybrid_transducer_ctc/fastconformer_hybrid_transducer_ctc_char.yaml @@ -0,0 +1,259 @@ +# It contains the default values for training a Conformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with char-based vocabulary. +# The model would have two decoders: RNNT (Transducer) and CTC + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Hybrid-Transducer-CTC-Char" + +model: + sample_rate: &sample_rate 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: true + num_workers: 8 + pin_memory: true + trim_silence: false + max_duration: 20 # you may need to update it for your dataset + min_duration: 0.1 + # tarred datasets + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + # bucketing params + bucketing_strategy: "synced_randomized" + bucketing_batch_size: null + + validation_ds: + manifest_filepath: ??? 
+ sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + test_ds: + manifest_filepath: null + sample_rate: ${model.sample_rate} + batch_size: 16 + shuffle: false + use_start_end_token: false + num_workers: 8 + pin_memory: true + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: *sample_rate + normalize: "per_feature" + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + + # Reduction parameters: Can be used to add another subsampling layer at a given position. + # Having a 2x reduction will speedup the training and inference speech while keeping similar WER. + # Adding it at the end will give the best WER while adding it at the beginning will give the best speedup. + reduction: null # pooling, striding, or null + reduction_position: null # Encoder block index or -1 for subsampling at the end of encoder + reduction_factor: 1 + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + att_context_size: [-1, -1] # -1 means unlimited context + att_context_style: regular # regular or chunked_limited + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. 
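With window_stride: 0.01 in the preprocessor and subsampling_factor: 8 in the encoder above, each encoder output frame covers roughly 80 ms of audio. A back-of-the-envelope sketch (plain arithmetic, ignoring padding and edge effects, not a NeMo call) for the number of encoder frames per utterance:

def encoder_frames(duration_s, window_stride=0.01, subsampling_factor=8):
    # 10 ms mel frames downsampled 8x -> about one encoder frame every 80 ms
    frame_stride_s = window_stride * subsampling_factor
    return int(duration_s / frame_stride_s)

print(encoder_frames(20.0))  # ~250 frames for the 20 s max_duration above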
+ + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. + + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder + aux_ctc: + ctc_loss_weight: 0.3 # the weight used to combine the CTC loss with the RNNT loss + use_cer: false + ctc_reduction: 'mean_batch' + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: ${model.labels} + decoding: + strategy: "greedy" + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 1000 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
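The aux_ctc block above introduces ctc_loss_weight: 0.3 to blend the auxiliary CTC loss into the Transducer objective. The usual convention, and to our understanding what the hybrid model does, is a convex combination of the two losses; the sketch below is illustrative only:

def hybrid_loss(rnnt_loss, ctc_loss, ctc_loss_weight=0.3):
    # convex combination: weight on CTC, remainder on the Transducer loss
    return (1.0 - ctc_loss_weight) * rnnt_loss + ctc_loss_weight * ctc_loss

print(hybrid_loss(rnnt_loss=10.0, ctc_loss=20.0))  # 0.7 * 10 + 0.3 * 20 = 13.0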
+ num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + + # you need to set these two to True to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml similarity index 93% rename from examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml rename to examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml index e83ef931cf5c..e098e8523df1 100644 --- a/examples/asr/conf/fastconformer/fast-conformer-long_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_ctc_bpe.yaml @@ -1,11 +1,9 @@ # It contains the default values for training a Fast Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. # This version uses Longformer-style attention in order to handle longer audio -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. 
- -# You may find more info about Fast Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml # Differences from baseline config are in # model.encoder.global_tokens @@ -27,8 +25,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -44,18 +40,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 # you may increase batch_size if your memory allows shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # recommend vocab size of 128 or 256 when training on ~1k hr datasets and 1k vocab size on 10+k hr datasets # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py @@ -174,7 +170,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml similarity index 93% rename from examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml rename to examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml index 50ad302eef45..5570bb710760 100644 --- a/examples/asr/conf/fastconformer/fast-conformer-long_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/long_fastconformer/fast-conformer-long_transducer_bpe.yaml @@ -1,11 +1,9 @@ # It contains the default values for training a Fast Conformer-Transducer ASR model, large size (~120M) with Transducer loss and sub-word encoding. # This version uses Longformer-style attention in order to handle longer audio -# Architecture and training config: -# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective -# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. 
- -# You may find more info about Fast Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# FastConformer-Transducer's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml # Differences from baseline config are in # model.encoder.global_tokens @@ -33,8 +31,6 @@ model: shuffle: true num_workers: 8 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -50,18 +46,18 @@ model: sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null sample_rate: ${model.sample_rate} batch_size: 16 shuffle: false + use_start_end_token: false num_workers: 8 pin_memory: true - use_start_end_token: false # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py tokenizer: @@ -172,7 +168,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -203,11 +199,6 @@ model: fastemit_lambda: 0.0 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - optim: name: adamw lr: 2.5e-3 @@ -233,7 +224,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/lstm/lstm_ctc_bpe.yaml b/examples/asr/conf/lstm/lstm_ctc_bpe.yaml index e899f44f97ef..121e8ac53751 100644 --- a/examples/asr/conf/lstm/lstm_ctc_bpe.yaml +++ b/examples/asr/conf/lstm/lstm_ctc_bpe.yaml @@ -25,8 +25,6 @@ model: shuffle: true num_workers: 4 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -44,7 +42,6 @@ model: shuffle: false num_workers: 4 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null @@ -53,7 +50,6 @@ model: shuffle: false num_workers: 4 pin_memory: true - use_start_end_token: false # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py tokenizer: @@ -129,7 +125,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.3 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. 
enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/lstm/lstm_transducer_bpe.yaml b/examples/asr/conf/lstm/lstm_transducer_bpe.yaml index e35bb95d291f..d694fb4b2049 100644 --- a/examples/asr/conf/lstm/lstm_transducer_bpe.yaml +++ b/examples/asr/conf/lstm/lstm_transducer_bpe.yaml @@ -31,8 +31,6 @@ model: shuffle: true num_workers: 4 pin_memory: true - use_start_end_token: false - trim_silence: false max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset min_duration: 0.1 # tarred datasets @@ -50,7 +48,6 @@ model: shuffle: false num_workers: 4 pin_memory: true - use_start_end_token: false test_ds: manifest_filepath: null @@ -59,7 +56,6 @@ model: shuffle: false num_workers: 4 pin_memory: true - use_start_end_token: false # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py tokenizer: @@ -130,7 +126,7 @@ model: # However, to preserve memory, this ratio can be 1:8 or even 1:16. # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. fuse_loss_wer: true - fused_batch_size: 16 + fused_batch_size: 4 jointnet: joint_hidden: ${model.model_defaults.joint_hidden} @@ -161,11 +157,6 @@ model: # using fastemit_lambda=1e-3 can help the accuracy of the model when it is unidirectional fastemit_lambda: 0.0 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - # Adds Gaussian noise to the gradients of the decoder to avoid overfitting - variational_noise: - start_step: 0 - std: 0.0 - optim: name: adamw lr: 5.0 @@ -192,7 +183,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.3 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml b/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml index a045bd206fc0..ca56d62ebe6a 100644 --- a/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml +++ b/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml @@ -176,7 +176,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml b/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml index 5230e6d151a6..55faaf69d5e6 100644 --- a/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml +++ b/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml @@ -161,7 +161,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. 
enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml b/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml index 2579b9777199..d77deb021bb9 100644 --- a/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml +++ b/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml @@ -478,7 +478,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 1.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/ssl/conformer/conformer_ssl.yaml b/examples/asr/conf/ssl/conformer/conformer_ssl.yaml index 37df0b131085..c0d8d7b317f9 100644 --- a/examples/asr/conf/ssl/conformer/conformer_ssl.yaml +++ b/examples/asr/conf/ssl/conformer/conformer_ssl.yaml @@ -188,7 +188,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 1.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml b/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml index 54e73213ae45..d525e1c5d1b4 100644 --- a/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml +++ b/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml @@ -442,7 +442,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 1.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 10 # Interval of logging. enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. diff --git a/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml b/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml index 7746b9a17e59..4d1d5b657d32 100644 --- a/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml +++ b/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml @@ -136,7 +136,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 100 # Interval of logging. resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it diff --git a/examples/asr/conf/ssl/wav2vec/wav2vec_pretrain.yaml b/examples/asr/conf/ssl/wav2vec/wav2vec_pretrain.yaml index 5678c3374921..f12204f7d882 100644 --- a/examples/asr/conf/ssl/wav2vec/wav2vec_pretrain.yaml +++ b/examples/asr/conf/ssl/wav2vec/wav2vec_pretrain.yaml @@ -132,7 +132,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 100 # Interval of logging. resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it diff --git a/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml b/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml index 02c6eb2a6273..10acaa9b88f3 100644 --- a/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml +++ b/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml @@ -131,7 +131,7 @@ trainer: strategy: ddp accumulate_grad_batches: 1 gradient_clip_val: 0.0 - precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + precision: 32 # 16, 32, or bf16 log_every_n_steps: 100 # Interval of logging. resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it From 244ba8deeb15f10f02ff8be749276bf84a29be13 Mon Sep 17 00:00:00 2001 From: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Date: Sat, 22 Apr 2023 01:44:03 +0400 Subject: [PATCH 12/23] Update script for ngram rnnt and hat beam search decoding (#6370) * add rnnt ngram beamsearch script Signed-off-by: andrusenkoau * add return encoding embedding option Signed-off-by: andrusenkoau * update script Signed-off-by: andrusenkoau * add rnnt and hat ngram decoding script Signed-off-by: andrusenkoau * add some parameters Signed-off-by: andrusenkoau * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add return_encoder_embeddings parameter to RNNTDecodingConfig Signed-off-by: andrusenkoau * replace return_encoder_embeddings parameter Signed-off-by: andrusenkoau * generalization of scipt behavior Signed-off-by: andrusenkoau * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove return_encoder_embeddings parameter Signed-off-by: andrusenkoau * remove return_encoder_embeddings parameter Signed-off-by: andrusenkoau * add manual encoder_embeddings calculation Signed-off-by: andrusenkoau * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix beam_width value to 8 Signed-off-by: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> * fix rescoring description Signed-off-by: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> --------- Signed-off-by: andrusenkoau Signed-off-by: Andrei Andrusenko <52885736+andrusenkoau@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Somshubra Majumdar --- docs/source/asr/asr_language_modeling.rst | 23 + nemo/collections/asr/models/rnnt_models.py | 1 + .../eval_beamsearch_ngram_transducer.py | 534 ++++++++++++------ 3 files changed, 384 insertions(+), 174 deletions(-) diff --git a/docs/source/asr/asr_language_modeling.rst b/docs/source/asr/asr_language_modeling.rst index 308df41b28f6..a0e578092f50 100644 --- a/docs/source/asr/asr_language_modeling.rst +++ b/docs/source/asr/asr_language_modeling.rst @@ -281,6 +281,29 @@ For instance, the following set of parameters would results in 2*1*2=4 beam sear beam_beta=[1.0,0.5] +Beam search ngram decoding for Transducer models (RNNT and HAT) +=============================================================== + +The similar script to evaluate an RNNT/HAT model with beam search decoding and N-gram models can be found at +`scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py `_ + +.. code-block:: + + python eval_beamsearch_ngram_transducer.py nemo_model_file= \ + input_manifest= \ + beam_width=[] \ + beam_alpha=[] \ + preds_output_folder= \ + probs_cache_file=null \ + decoding_strategy= + maes_prefix_alpha=[] \ + maes_expansion_gamma=[] \ + hat_subtract_ilm= \ + hat_ilm_weight=[] \ + + + .. _neural_rescoring: **************** diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index a3e36dbc1522..f4e227f510af 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -242,6 +242,7 @@ def transcribe( """ if paths2audio_files is None or len(paths2audio_files) == 0: return {} + # We will store transcriptions here hypotheses = [] all_hypotheses = [] diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py index 4e3c342b9da1..bbc33d214636 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py @@ -13,257 +13,443 @@ # limitations under the License. # +""" # This script would evaluate an N-gram language model trained with KenLM library (https://github.com/kpu/kenlm) in -# fusion with beam search decoders on top of a trained ASR model. NeMo's beam search decoders are capable of using the +# fusion with beam search decoders on top of a trained ASR Transducer model. NeMo's beam search decoders are capable of using the # KenLM's N-gram models to find the best candidates. This script supports both character level and BPE level # encodings and models which is detected automatically from the type of the model. # You may train the LM model with 'scripts/ngram_lm/train_kenlm.py'. -# -# USAGE: python eval_beamsearch_ngram.py --nemo_model_file= \ -# --input_manifest= \ -# --beam_width= \ -# --beam_alpha= \ -# --preds_output_folder= \ -# --decoding_mode=maes -# ... -# + +# Config Help + +To discover all arguments of the script, please run : +python eval_beamsearch_ngram.py --help +python eval_beamsearch_ngram.py --cfg job + +# USAGE + +python eval_beamsearch_ngram_transducer.py nemo_model_file= \ + input_manifest= \ + beam_width=[] \ + beam_alpha=[] \ + preds_output_folder= \ + probs_cache_file=null \ + decoding_strategy= + maes_prefix_alpha=[] \ + maes_expansion_gamma=[] \ + hat_subtract_ilm= \ + hat_ilm_weight=[] \ + ... 
+ + +# Grid Search for Hyper parameters + +For grid search, you can provide a list of arguments as follows - + + beam_width=[4,8,16,....] \ + beam_alpha=[-2.0,-1.0,...,1.0,2.0] \ + # You may find more info on how to use this script at: # https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html +""" + -import argparse import contextlib import json import os import pickle +import tempfile +from dataclasses import dataclass, field, is_dataclass +from pathlib import Path +from typing import List, Optional import editdistance +import numpy as np import torch -from omegaconf import OmegaConf +from omegaconf import MISSING, OmegaConf +from sklearn.model_selection import ParameterGrid from tqdm.auto import tqdm import nemo.collections.asr as nemo_asr -from nemo.collections.asr.metrics.rnnt_wer_bpe import RNNTBPEDecodingConfig +from nemo.collections.asr.parts.submodules import rnnt_beam_decoding +from nemo.core.config import hydra_runner from nemo.utils import logging +# fmt: off + + +@dataclass +class EvalBeamSearchNGramConfig: + """ + Evaluate an ASR model with beam search decoding and n-gram KenLM language model. + """ + # # The path of the '.nemo' file of the ASR model or the name of a pretrained model (ngc / huggingface) + nemo_model_file: str = MISSING + + # File paths + input_manifest: str = MISSING # The manifest file of the evaluation set + kenlm_model_file: Optional[str] = None # The path of the KenLM binary model file + preds_output_folder: Optional[str] = None # The optional folder where the predictions are stored + probs_cache_file: Optional[str] = None # The cache file for storing the logprobs of the model + + # Parameters for inference + acoustic_batch_size: int = 128 # The batch size to calculate log probabilities + beam_batch_size: int = 128 # The batch size to be used for beam search decoding + device: str = "cuda" # The device to load the model onto to calculate log probabilities + use_amp: bool = False # Whether to use AMP if available to calculate log probabilities + num_workers: int = 1 # Number of workers for DataLoader + + # The decoding scheme to be used for evaluation + decoding_strategy: str = "greedy_batch" # ["greedy_batch", "beam", "tsd", "alsd", "maes"] + + # Beam Search hyperparameters + beam_width: List[int] = field(default_factory=lambda: [8]) # The width or list of the widths for the beam search decoding + beam_alpha: List[float] = field(default_factory=lambda: [0.2]) # The alpha parameter or list of the alphas for the beam search decoding + + maes_prefix_alpha: List[int] = field(default_factory=lambda: [2]) # The maes_prefix_alpha or list of the maes_prefix_alpha for the maes decoding + maes_expansion_gamma: List[float] = field(default_factory=lambda: [2.3]) # The maes_expansion_gamma or list of the maes_expansion_gamma for the maes decoding + + # HAT related parameters (only for internal lm subtraction) + hat_subtract_ilm: bool = False + hat_ilm_weight: List[float] = field(default_factory=lambda: [0.0]) + + decoding: rnnt_beam_decoding.BeamRNNTInferConfig = rnnt_beam_decoding.BeamRNNTInferConfig(beam_size=128) + + +# fmt: on + + +def decoding_step( + model: nemo_asr.models.ASRModel, + cfg: EvalBeamSearchNGramConfig, + all_probs: List[torch.Tensor], + target_transcripts: List[str], + preds_output_file: str = None, + beam_batch_size: int = 128, + progress_bar: bool = True, +): + level = logging.getEffectiveLevel() + logging.setLevel(logging.CRITICAL) + # Reset config + model.change_decoding_strategy(None) + + 
cfg.decoding.hat_ilm_weight = cfg.decoding.hat_ilm_weight * cfg.hat_subtract_ilm + # Override the beam search config with current search candidate configuration + cfg.decoding.return_best_hypothesis = False + cfg.decoding.ngram_lm_model = cfg.kenlm_model_file + cfg.decoding.hat_subtract_ilm = cfg.hat_subtract_ilm + + # Update model's decoding strategy config + model.cfg.decoding.strategy = cfg.decoding_strategy + model.cfg.decoding.beam = cfg.decoding + + # Update model's decoding strategy + model.change_decoding_strategy(model.cfg.decoding) + logging.setLevel(level) -def beam_search_eval(all_hypotheses, target_transcripts, preds_output_file=None): wer_dist_first = cer_dist_first = 0 wer_dist_best = cer_dist_best = 0 words_count = 0 chars_count = 0 + sample_idx = 0 if preds_output_file: - out_file = open(preds_output_file, 'w') - - it = tqdm(range(len(all_hypotheses)), desc=f"Beam search decoding...", ncols=120,) - - for sample_idx in it: - hypotheses = all_hypotheses[sample_idx] - target = target_transcripts[sample_idx] - target_split_w = target.split() - target_split_c = list(target) - words_count += len(target_split_w) - chars_count += len(target_split_c) - wer_dist_min = cer_dist_min = 10000 - if not isinstance(hypotheses, list): - hypotheses = [hypotheses] - for candidate_idx, candidate in enumerate(hypotheses): - pred_text = candidate.text - pred_split_w = pred_text.split() - wer_dist = editdistance.eval(target_split_w, pred_split_w) - pred_split_c = list(pred_text) - cer_dist = editdistance.eval(target_split_c, pred_split_c) - - wer_dist_min = min(wer_dist_min, wer_dist) - cer_dist_min = min(cer_dist_min, cer_dist) - - if candidate_idx == 0: - # first candidate - wer_dist_first += wer_dist - cer_dist_first += cer_dist - - if preds_output_file: - out_file.write('{}\t{}\n'.format(pred_text, candidate.score)) - wer_dist_best += wer_dist_min - cer_dist_best += cer_dist_min + out_file = open(preds_output_file, 'w', encoding='utf_8', newline='\n') + + if progress_bar: + if cfg.decoding_strategy == "greedy_batch": + description = "Greedy_batch decoding.." 
+ else: + description = f"{cfg.decoding_strategy} decoding with bw={cfg.decoding.beam_size}, ba={cfg.decoding.ngram_lm_alpha}, ma={cfg.decoding.maes_prefix_alpha}, mg={cfg.decoding.maes_expansion_gamma}, hat_ilmw={cfg.decoding.hat_ilm_weight}" + it = tqdm(range(int(np.ceil(len(all_probs) / beam_batch_size))), desc=description, ncols=120) + else: + it = range(int(np.ceil(len(all_probs) / beam_batch_size))) + for batch_idx in it: + # disabling type checking + probs_batch = all_probs[batch_idx * beam_batch_size : (batch_idx + 1) * beam_batch_size] + probs_lens = torch.tensor([prob.shape[-1] for prob in probs_batch]) + with torch.no_grad(): + packed_batch = torch.zeros(len(probs_batch), probs_batch[0].shape[0], max(probs_lens), device='cpu') + + for prob_index in range(len(probs_batch)): + packed_batch[prob_index, :, : probs_lens[prob_index]] = torch.tensor( + probs_batch[prob_index].unsqueeze(0), device=packed_batch.device, dtype=packed_batch.dtype + ) + best_hyp_batch, beams_batch = model.decoding.rnnt_decoder_predictions_tensor( + packed_batch, probs_lens, return_hypotheses=True, + ) + if cfg.decoding_strategy == "greedy_batch": + beams_batch = [[x] for x in best_hyp_batch] + + for beams_idx, beams in enumerate(beams_batch): + target = target_transcripts[sample_idx + beams_idx] + target_split_w = target.split() + target_split_c = list(target) + words_count += len(target_split_w) + chars_count += len(target_split_c) + wer_dist_min = cer_dist_min = 10000 + for candidate_idx, candidate in enumerate(beams): # type: (int, rnnt_beam_decoding.rnnt_utils.Hypothesis) + pred_text = candidate.text + pred_split_w = pred_text.split() + wer_dist = editdistance.eval(target_split_w, pred_split_w) + pred_split_c = list(pred_text) + cer_dist = editdistance.eval(target_split_c, pred_split_c) + + wer_dist_min = min(wer_dist_min, wer_dist) + cer_dist_min = min(cer_dist_min, cer_dist) + + if candidate_idx == 0: + # first candidate + wer_dist_first += wer_dist + cer_dist_first += cer_dist + + score = candidate.score + if preds_output_file: + out_file.write('{}\t{}\n'.format(pred_text, score)) + wer_dist_best += wer_dist_min + cer_dist_best += cer_dist_min + sample_idx += len(probs_batch) + + if cfg.decoding_strategy == "greedy_batch": + return wer_dist_first / words_count, cer_dist_first / chars_count if preds_output_file: out_file.close() - logging.info(f"Stored the predictions of beam search decoding at '{preds_output_file}'.") + logging.info(f"Stored the predictions of {cfg.decoding_strategy} decoding at '{preds_output_file}'.") - logging.info( - 'WER/CER with the provided decoding strategy = {:.2%}/{:.2%}'.format( - wer_dist_first / words_count, cer_dist_first / chars_count + if cfg.decoding.ngram_lm_model: + logging.info( + f"WER/CER with {cfg.decoding_strategy} decoding and N-gram model = {wer_dist_first / words_count:.2%}/{cer_dist_first / chars_count:.2%}" + ) + else: + logging.info( + f"WER/CER with {cfg.decoding_strategy} decoding = {wer_dist_first / words_count:.2%}/{cer_dist_first / chars_count:.2%}" ) - ) - logging.info( - 'Oracle WER/CER in candidates = {:.2%}/{:.2%}'.format(wer_dist_best / words_count, cer_dist_best / chars_count) + f"Oracle WER/CER in candidates with perfect LM= {wer_dist_best / words_count:.2%}/{cer_dist_best / chars_count:.2%}" ) logging.info(f"=================================================================================") + return wer_dist_first / words_count, cer_dist_first / chars_count -def main(): - parser = argparse.ArgumentParser( - description='Evaluate an ASR model 
with beam search decoding and n-gram KenLM language model.' - ) - parser.add_argument( - "--nemo_model_file", required=True, type=str, help="The path of the '.nemo' file of the ASR model" - ) - parser.add_argument( - "--kenlm_model_file", required=False, default=None, type=str, help="The path of the KenLM binary model file" - ) - parser.add_argument("--input_manifest", required=True, type=str, help="The manifest file of the evaluation set") - parser.add_argument( - "--preds_output_folder", default=None, type=str, help="The optional folder where the predictions are stored" - ) - parser.add_argument( - "--probs_cache_file", default=None, type=str, help="The cache file for storing the outputs of the model" - ) - parser.add_argument( - "--acoustic_batch_size", default=16, type=int, help="The batch size to calculate log probabilities" - ) - parser.add_argument( - "--device", default="cuda", type=str, help="The device to load the model onto to calculate log probabilities" - ) - parser.add_argument( - "--use_amp", action="store_true", help="Whether to use AMP if available to calculate log probabilities" - ) - parser.add_argument( - "--decoding_mode", - choices=["greedy", "greedy_batch", "beam", "tsd", "alsd", "maes"], - default="beam", - type=str, - help="The decoding scheme to be used for evaluation.", - ) - parser.add_argument( - "--beam_width", required=True, type=int, help="The width for the beam search decoding", - ) - parser.add_argument( - "--beam_alpha", required=True, type=float, help="The alpha parameter for the beam search decoding", - ) - parser.add_argument( - "--beam_batch_size", default=128, type=int, help="The batch size to be used for beam search decoding" - ) - parser.add_argument( - "--maes_prefix_alpha", - default=1, - type=int, - help="Float pruning threshold used in the prune-by-value step when computing the expansions.", - ) - parser.add_argument( - "--maes_expansion_gamma", default=2.3, type=float, help="Maximum prefix length in prefix search" - ) - parser.add_argument( - "--hat_subtract_ilm", action="store_true", help="Subtract internal LM from the final HAT logprobs" - ) - parser.add_argument("--hat_ilm_weight", default=0.0, type=float, help="lamda2 weight for HAT ILM subsrtact") - args = parser.parse_args() +@hydra_runner(config_path=None, config_name='EvalBeamSearchNGramConfig', schema=EvalBeamSearchNGramConfig) +def main(cfg: EvalBeamSearchNGramConfig): + if is_dataclass(cfg): + cfg = OmegaConf.structured(cfg) # type: EvalBeamSearchNGramConfig - if args.kenlm_model_file and args.decoding_mode != "maes": - raise ValueError("External n-gram LM fusion is available only for 'maes' decoding mode.") + valid_decoding_strategis = ["greedy_batch", "beam", "tsd", "alsd", "maes"] + if cfg.decoding_strategy not in valid_decoding_strategis: + raise ValueError( + f"Given decoding_strategy={cfg.decoding_strategy} is invalid. Available options are :\n" + f"{valid_decoding_strategis}" + ) - if args.nemo_model_file.endswith('.nemo'): - asr_model = nemo_asr.models.ASRModel.restore_from(args.nemo_model_file, map_location=torch.device(args.device)) + if cfg.nemo_model_file.endswith('.nemo'): + asr_model = nemo_asr.models.ASRModel.restore_from(cfg.nemo_model_file, map_location=torch.device(cfg.device)) else: logging.warning( "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name." 
) asr_model = nemo_asr.models.ASRModel.from_pretrained( - args.nemo_model_file, map_location=torch.device(args.device) + cfg.nemo_model_file, map_location=torch.device(cfg.device) ) + if cfg.kenlm_model_file: + if not os.path.exists(cfg.kenlm_model_file): + raise FileNotFoundError(f"Could not find the KenLM model file '{cfg.kenlm_model_file}'.") + if cfg.decoding_strategy != "maes": + raise ValueError(f"Decoding with kenlm model is supported only for maes decoding algorithm.") + lm_path = cfg.kenlm_model_file + else: + lm_path = None + cfg.beam_alpha = [0.0] + if cfg.hat_subtract_ilm: + assert lm_path, "kenlm must be set for hat internal lm subtraction" + + if cfg.decoding_strategy != "maes": + cfg.maes_prefix_alpha, cfg.maes_expansion_gamma, cfg.hat_ilm_weight = [0], [0], [0] + target_transcripts = [] - with open(args.input_manifest, 'r') as manifest_file: + manifest_dir = Path(cfg.input_manifest).parent + with open(cfg.input_manifest, 'r', encoding='utf_8') as manifest_file: audio_file_paths = [] - durations = [] - for line in tqdm(manifest_file, desc=f"Reading Manifest {args.input_manifest} ...", ncols=120): + for line in tqdm(manifest_file, desc=f"Reading Manifest {cfg.input_manifest} ...", ncols=120): data = json.loads(line) + audio_file = Path(data['audio_filepath']) + if not audio_file.is_file() and not audio_file.is_absolute(): + audio_file = manifest_dir / audio_file target_transcripts.append(data['text']) - audio_file_paths.append(data['audio_filepath']) - durations.append(data['duration']) + audio_file_paths.append(str(audio_file.absolute())) - if args.probs_cache_file and os.path.exists(args.probs_cache_file): - logging.info(f"Found a pickle file of probabilities at '{args.probs_cache_file}'.") - logging.info(f"Loading the cached pickle file of probabilities from '{args.probs_cache_file}' ...") - with open(args.probs_cache_file, 'rb') as probs_file: + if cfg.probs_cache_file and os.path.exists(cfg.probs_cache_file): + logging.info(f"Found a pickle file of probabilities at '{cfg.probs_cache_file}'.") + logging.info(f"Loading the cached pickle file of probabilities from '{cfg.probs_cache_file}' ...") + with open(cfg.probs_cache_file, 'rb') as probs_file: all_probs = pickle.load(probs_file) if len(all_probs) != len(audio_file_paths): raise ValueError( - f"The number of samples in the probabilities file '{args.probs_cache_file}' does not " + f"The number of samples in the probabilities file '{cfg.probs_cache_file}' does not " f"match the manifest file. You may need to delete the probabilities cached file." 
) else: - asr_model = asr_model.eval() - rnnt_cfg = RNNTBPEDecodingConfig() - rnnt_cfg.strategy = args.decoding_mode # beam greedy - rnnt_cfg.beam.beam_size = args.beam_width - rnnt_cfg.beam.ngram_lm_model = args.kenlm_model_file - rnnt_cfg.beam.ngram_lm_alpha = args.beam_alpha # 0.2, 0.3 - rnnt_cfg.compute_hypothesis_token_set = False - rnnt_cfg.beam.return_best_hypothesis = False - rnnt_cfg.beam.maes_prefix_alpha = args.maes_prefix_alpha - rnnt_cfg.beam.maes_expansion_gamma = args.maes_expansion_gamma - rnnt_cfg.beam.hat_subtract_ilm = args.hat_subtract_ilm - rnnt_cfg.beam.hat_ilm_weight = args.hat_ilm_weight - asr_model.change_decoding_strategy(OmegaConf.structured(rnnt_cfg)) @contextlib.contextmanager def default_autocast(): yield - if args.use_amp: + if cfg.use_amp: if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): logging.info("AMP is enabled!\n") autocast = torch.cuda.amp.autocast + else: autocast = default_autocast else: - autocast = default_autocast - - params = {'beam_width': args.beam_width, 'beam_alpha': args.beam_alpha} - logging.info(f"==============================Starting the beam search decoding===============================") - logging.info(f"Beam search params: {params}") - logging.info(f"It may take some time...") - logging.info(f"==============================================================================================") + autocast = default_autocast + # manual calculation of encoder_embeddings with autocast(): with torch.no_grad(): - hypotheses, all_hypotheses = asr_model.transcribe( - audio_file_paths, batch_size=args.acoustic_batch_size, return_hypotheses=True - ) - - # delete the model to free the memory - del asr_model + asr_model.eval() + asr_model.encoder.freeze() + device = next(asr_model.parameters()).device + all_probs = [] + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: + for audio_file in audio_file_paths: + entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''} + fp.write(json.dumps(entry) + '\n') + config = { + 'paths2audio_files': audio_file_paths, + 'batch_size': cfg.acoustic_batch_size, + 'temp_dir': tmpdir, + 'num_workers': cfg.num_workers, + 'channel_selector': None, + 'augmentor': None, + } + temporary_datalayer = asr_model._setup_transcribe_dataloader(config) + for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=True): + encoded, encoded_len = asr_model.forward( + input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) + ) + # dump encoder embeddings per file + for idx in range(encoded.shape[0]): + encoded_no_pad = encoded[idx, :, : encoded_len[idx]] + all_probs.append(encoded_no_pad) + + if cfg.probs_cache_file: + logging.info(f"Writing pickle files of probabilities at '{cfg.probs_cache_file}'...") + with open(cfg.probs_cache_file, 'wb') as f_dump: + pickle.dump(all_probs, f_dump) + + if cfg.decoding_strategy == "greedy_batch": + asr_model = asr_model.to('cpu') + candidate_wer, candidate_cer = decoding_step( + asr_model, + cfg, + all_probs=all_probs, + target_transcripts=target_transcripts, + beam_batch_size=cfg.beam_batch_size, + progress_bar=True, + ) + logging.info(f"Greedy batch WER/CER = {candidate_wer:.2%}/{candidate_cer:.2%}") + + asr_model = asr_model.to('cpu') + + # 'greedy_batch' decoding_strategy would skip the beam search decoding + if cfg.decoding_strategy in ["beam", "tsd", "alsd", "maes"]: + if cfg.beam_width is None or cfg.beam_alpha 
is None: + raise ValueError("beam_width and beam_alpha are needed to perform beam search decoding.") + params = { + 'beam_width': cfg.beam_width, + 'beam_alpha': cfg.beam_alpha, + 'maes_prefix_alpha': cfg.maes_prefix_alpha, + 'maes_expansion_gamma': cfg.maes_expansion_gamma, + 'hat_ilm_weight': cfg.hat_ilm_weight, + } + hp_grid = ParameterGrid(params) + hp_grid = list(hp_grid) + + best_wer_beam_size, best_cer_beam_size = None, None + best_wer_alpha, best_cer_alpha = None, None + best_wer, best_cer = 1e6, 1e6 + + logging.info( + f"==============================Starting the {cfg.decoding_strategy} decoding===============================" + ) + logging.info(f"Grid search size: {len(hp_grid)}") + logging.info(f"It may take some time...") + logging.info(f"==============================================================================================") - if args.preds_output_folder and not os.path.exists(args.preds_output_folder): - os.mkdir(args.preds_output_folder) + if cfg.preds_output_folder and not os.path.exists(cfg.preds_output_folder): + os.mkdir(cfg.preds_output_folder) + for hp in hp_grid: + if cfg.preds_output_folder: + results_file = f"preds_out_{cfg.decoding_strategy}_bw{hp['beam_width']}" + if cfg.decoding_strategy == "maes": + results_file = f"{results_file}_ma{hp['maes_prefix_alpha']}_mg{hp['maes_expansion_gamma']}" + if cfg.kenlm_model_file: + results_file = f"{results_file}_ba{hp['beam_alpha']}" + if cfg.hat_subtract_ilm: + results_file = f"{results_file}_hat_ilmw{hp['hat_ilm_weight']}" + preds_output_file = os.path.join(cfg.preds_output_folder, f"{results_file}.tsv") + else: + preds_output_file = None + + cfg.decoding.beam_size = hp["beam_width"] + cfg.decoding.ngram_lm_alpha = hp["beam_alpha"] + cfg.decoding.maes_prefix_alpha = hp["maes_prefix_alpha"] + cfg.decoding.maes_expansion_gamma = hp["maes_expansion_gamma"] + cfg.decoding.hat_ilm_weight = hp["hat_ilm_weight"] + + candidate_wer, candidate_cer = decoding_step( + asr_model, + cfg, + all_probs=all_probs, + target_transcripts=target_transcripts, + preds_output_file=preds_output_file, + beam_batch_size=cfg.beam_batch_size, + progress_bar=True, + ) - if args.preds_output_folder: - preds_output_file = os.path.join( - args.preds_output_folder, f"preds_out_width{args.beam_width}_alpha{args.beam_alpha}.tsv", + if candidate_cer < best_cer: + best_cer_beam_size = hp["beam_width"] + best_cer_alpha = hp["beam_alpha"] + best_cer_ma = hp["maes_prefix_alpha"] + best_cer_mg = hp["maes_expansion_gamma"] + best_cer_hat_ilm_weight = hp["hat_ilm_weight"] + best_cer = candidate_cer + + if candidate_wer < best_wer: + best_wer_beam_size = hp["beam_width"] + best_wer_alpha = hp["beam_alpha"] + best_wer_ma = hp["maes_prefix_alpha"] + best_wer_ga = hp["maes_expansion_gamma"] + best_wer_hat_ilm_weight = hp["hat_ilm_weight"] + best_wer = candidate_wer + + wer_hat_parameter = "" + if cfg.hat_subtract_ilm: + wer_hat_parameter = f"HAT ilm weight = {best_wer_hat_ilm_weight}, " + logging.info( + f'Best WER Candidate = {best_wer:.2%} :: Beam size = {best_wer_beam_size}, ' + f'Beam alpha = {best_wer_alpha}, {wer_hat_parameter}' + f'maes_prefix_alpha = {best_wer_ma}, maes_expansion_gamma = {best_wer_ga} ' ) - preds_output_manifest = os.path.join(args.preds_output_folder, f"preds_manifest.json",) - with open(preds_output_manifest, 'w') as fn: - for i, file_name in enumerate(audio_file_paths): - item = { - 'audio_filepath': file_name, - 'duration': durations[i], - 'text': target_transcripts[i], - 'pred_text': hypotheses[i].text, - } - 
fn.write(json.dumps(item) + "\n") - else: - preds_output_file = None - - beam_search_eval( - all_hypotheses=all_hypotheses, target_transcripts=target_transcripts, preds_output_file=preds_output_file, - ) + cer_hat_parameter = "" + if cfg.hat_subtract_ilm: + cer_hat_parameter = f"HAT ilm weight = {best_cer_hat_ilm_weight}" + logging.info( + f'Best CER Candidate = {best_cer:.2%} :: Beam size = {best_cer_beam_size}, ' + f'Beam alpha = {best_cer_alpha}, {cer_hat_parameter} ' + f'maes_prefix_alpha = {best_cer_ma}, maes_expansion_gamma = {best_cer_mg}' + ) + logging.info(f"=================================================================================") if __name__ == '__main__': From 094cbae8baeec368477aad35486c77059848de58 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 21 Apr 2023 17:09:26 -0700 Subject: [PATCH 13/23] BERT pre-training mp fork to spawn (#6442) (#6454) * change bert fork to spawn * num_workers=0 fix --------- Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar --- examples/nlp/language_modeling/megatron_bert_pretraining.py | 3 +++ .../nlp/models/language_modeling/megatron_bert_model.py | 6 +++++- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- .../language_modeling/megatron_gpt_prompt_learning_model.py | 4 +++- .../language_modeling/megatron_t5_prompt_learning_model.py | 4 +++- .../nlp/models/machine_translation/megatron_nmt_model.py | 4 ++-- 6 files changed, 17 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index b6cb84481fc7..e6abee295a1a 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch.multiprocessing as mp from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.plugins.environments import TorchElasticEnvironment @@ -28,6 +29,8 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +mp.set_start_method("spawn", force=True) + @hydra_runner(config_path="conf", config_name="megatron_bert_config") def main(cfg) -> None: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 12d121ca2bcd..90053f3052c8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -643,7 +643,11 @@ def build_pretraining_data_loader(self, dataset, consumed_samples): # Torch dataloader. 
return torch.utils.data.DataLoader( - dataset, batch_sampler=batch_sampler, num_workers=self.cfg.data.num_workers, pin_memory=True, + dataset, + batch_sampler=batch_sampler, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + persistent_workers=True if self.cfg.data.num_workers > 0 else False, ) def setup_training_data(self, cfg): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 52a640a9efd1..d4132c4c7e80 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -795,7 +795,7 @@ def build_pretraining_data_loader( batch_sampler=batch_sampler, num_workers=self.cfg.data.num_workers, pin_memory=True, - persistent_workers=True, + persistent_workers=True if self.cfg.data.num_workers > 0 else False, ) def setup(self, stage=None): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index acf96688b33a..331136c64a46 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -604,7 +604,9 @@ def build_virtual_prompt_dataset( drop_last=drop_last, num_workers=num_workers, pin_memory=pin_memory, - persistent_workers=True, # (@adithyare and @eharper) We need this to make spawn=True to work. + persistent_workers=True + if num_workers > 0 + else False, # (@adithyare and @eharper) We need this to make spawn=True to work. ) return dataset, dataloader diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py index bfcc2c43631d..b3c08dff7ae8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py @@ -440,7 +440,9 @@ def build_virtual_prompt_dataset( drop_last=drop_last, num_workers=num_workers, pin_memory=pin_memory, - persistent_workers=True, # (@adithyare and @eharper) We need to set this to True to get around issues with spawn=True + persistent_workers=True + if num_workers > 0 + else False, # (@adithyare and @eharper) We need to set this to True to get around issues with spawn=True ) print('build success', len(dataloader), dataset_paths) return dataset, dataloader diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index fbf4b029fcdc..efa059419eda 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -430,7 +430,7 @@ def _setup_eval_dataloader_from_config(self, cfg: DictConfig, dataset): pin_memory=cfg.get("pin_memory", False), drop_last=cfg.get("drop_last", False), shuffle=False, - persistent_workers=True, + persistent_workers=True if cfg.get("num_workers", 0) > 0 else False, ) ) @@ -592,7 +592,7 @@ def _setup_megatron_dataloader_from_config(self, cfg, dataset, consumed_samples) collate_fn=collate_fn, num_workers=cfg.num_workers, pin_memory=cfg.pin_memory, - persistent_workers=True, + persistent_workers=True if cfg.num_workers > 0 else False, ) def 
process_global_batch_for_text_translation_datasets(self, batch): From daa97446beb934cfc1e2bf11133f5b103409478e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 21 Apr 2023 17:09:38 -0700 Subject: [PATCH 14/23] fix replace_bos_with_pad not found (#6443) (#6450) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar --- .../nlp/models/language_modeling/megatron_finetune_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 452819e1d5c4..7fc48856453f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -331,7 +331,7 @@ def inference_step(self, dataloader_iter, batch_idx: int, mode: str, dataloader_ tokens_enc=batch['text_enc'], enc_mask=batch['enc_mask'], num_tokens_to_generate=30, - bos_id=self.tokenizer.pad_id if data_cfg.replace_bos_with_pad else self.tokenizer.bos_id, + bos_id=self.tokenizer.pad_id if data_cfg.get('replace_bos_with_pad', False) else self.tokenizer.bos_id, ) # Special ids to text function to handle stripping and special tokens with sentencepiece tokenizers. From 557c4b7ae766faf050374e6b9a862e2e67385b10 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 21 Apr 2023 17:10:06 -0700 Subject: [PATCH 15/23] reduce workers on NMT CI (#6472) (#6474) Signed-off-by: Abhinav Khattar Co-authored-by: Abhinav Khattar --- Jenkinsfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index fa184482a22d..3e4895715df4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2561,6 +2561,8 @@ pipeline { model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.train_ds.num_workers=1 \ + model.validation_ds.num_workers=1 \ ~model.test_ds \ model.train_ds.dataset_type=text_memmap \ model.encoder_tokenizer.library=sentencepiece \ @@ -2602,6 +2604,8 @@ pipeline { model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ + model.train_ds.num_workers=1 \ + model.validation_ds.num_workers=1 \ ~model.test_ds \ model.train_ds.dataset_type=text_memmap \ model.encoder_tokenizer.library=sentencepiece \ From 690742bb6b2c9ca33421440432f2b66e8e903c54 Mon Sep 17 00:00:00 2001 From: Micha Livne Date: Sun, 23 Apr 2023 16:20:07 -0700 Subject: [PATCH 16/23] 1. Added KERPLE positional embeddings to encoder-decoder. 
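The DataLoader changes in the commits above all apply the same guard: PyTorch's DataLoader only accepts persistent_workers=True when num_workers > 0, so the flag is now derived from num_workers instead of being hard-coded. A minimal, standalone sketch of the pattern (the dataset and batch size here are illustrative placeholders, not taken from the patch):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    dataset = TensorDataset(torch.arange(8).float())  # placeholder dataset


    def build_loader(num_workers: int) -> DataLoader:
        # Mirrors the guard used in the hunks above: with num_workers == 0,
        # passing persistent_workers=True would raise a ValueError inside DataLoader.
        return DataLoader(
            dataset,
            batch_size=2,
            num_workers=num_workers,
            pin_memory=True,
            persistent_workers=num_workers > 0,
        )


    loader = build_loader(num_workers=0)  # persistent_workers resolves to False here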
Signed-off-by: Micha Livne --- Jenkinsfile | 96 +++++++++++++++++++ .../conf/megatron_model_base_config.yaml | 2 +- .../megatron/token_level_encoder_decoder.py | 25 ++++- 3 files changed, 120 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 880f9bb78745..8007ea10ccd3 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3806,6 +3806,102 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" } } + stage('L2: Megatron T5 with KERPLE Pretraining and Resume Training TP=2') { + when { + anyOf { + branch 'r1.18.0' + changeRequest target: 'r1.18.0' + } + } + failFast true + steps { + sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='swiglu' \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=kerple \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type='pre_ln' \ + model.decoder.transformer_block_type='pre_ln' \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False" + sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.encoder.activation='swiglu' \ + model.encoder.masked_softmax_fusion=False \ + model.encoder.bias_activation_fusion=False \ + model.encoder.activations_checkpoint_method='block' \ + model.encoder.activations_checkpoint_num_layers=1 \ + model.encoder.position_embedding_type=kerple \ + model.decoder.num_layers=2 \ + model.decoder.hidden_size=64 \ + 
model.decoder.num_attention_heads=8 \ + model.decoder.activation='swiglu' \ + model.decoder.masked_softmax_fusion=False \ + model.decoder.bias_activation_fusion=False \ + model.decoder.activations_checkpoint_method='block' \ + model.decoder.activations_checkpoint_num_layers=1 \ + model.encoder.transformer_block_type='pre_ln' \ + model.decoder.transformer_block_type='pre_ln' \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False \ + model.share_token_embeddings=False \ + model.share_decoder_tokens_head_embeddings=False" + sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results" + sh "rm -rf examples/nlp/language_modeling/t5_index_mappings" + } + } stage('L2: Megatron T5 Pretraining and Resume Training PP=2') { when { anyOf { diff --git a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml index b623d08e4e8b..d3feb97ea9b4 100644 --- a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml @@ -6,7 +6,7 @@ init_method_std: 0.02 # Standard deviation of the zero mean normal distribution hidden_dropout: 0.1 # Dropout probability for hidden state transformer. attention_dropout: 0.1 # Dropout probability in the attention layer. ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. -position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'relative', 'alibi'] +position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'relative', 'alibi', 'kerple'] relative_attention_num_buckets: 32 # Relative position number of buckets for computing the bias relative_attention_max_distance: 128 # max_distance to keep relative distance in the attention_num_buckets. relative_position_bias_self_attention_only: True # whether to only use relative position bias for self attention only. 
diff --git a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py index a02fb5300912..dcf41a696b6e 100644 --- a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py +++ b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py @@ -18,6 +18,9 @@ from nemo.collections.nlp.modules.common.megatron.alibi_relative_position_embedding import ( ALiBiRelativePositionEmbedding, ) +from nemo.collections.nlp.modules.common.megatron.kerple_relative_position_embedding import ( + KERPLERelativePositionEmbedding, +) from nemo.collections.nlp.modules.common.megatron.language_model import Embedding from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType from nemo.collections.nlp.modules.common.megatron.megatron_decoders import get_decoder_model @@ -176,7 +179,16 @@ def __init__( num_attention_heads_alibi=None, max_seq_len=max_position_embeddings, ) - self._encoder_relative_position_embedding_key = "encoder_relative_position_embedding" + self._encoder_relative_position_embedding_key = "encoder_alibi_position_embedding" + elif self.encoder_cfg.get('position_embedding_type', 'learned_absolute') == 'kerple': + self.encoder_relative_position_embedding = KERPLERelativePositionEmbedding( + bidirectional=True, + num_attention_heads=encoder_cfg.num_attention_heads, + layer_type=LayerType.encoder, + num_attention_heads_kerple=None, + max_seq_len=max_position_embeddings, + ) + self._encoder_relative_position_embedding_key = "encoder_kerple_position_embedding" else: self.encoder_relative_position_embedding = None @@ -296,7 +308,16 @@ def __init__( num_attention_heads_alibi=None, max_seq_len=max_position_embeddings, ) - self._decoder_relative_position_embedding_key = "decoder_relative_position_embedding" + self._decoder_relative_position_embedding_key = "decoder_alibi_position_embedding" + elif self.decoder_cfg.get('position_embedding_type', 'learned_absolute') == 'kerple': + self.decoder_relative_position_embedding = KERPLERelativePositionEmbedding( + bidirectional=False, + num_attention_heads=decoder_cfg.num_attention_heads, + layer_type=LayerType.decoder, + num_attention_heads_kerple=None, + max_seq_len=max_position_embeddings, + ) + self._decoder_relative_position_embedding_key = "decoder_kerple_position_embedding" else: self.decoder_relative_position_embedding = None From ed4c373b0b87f2f0f50af93fd05bda7244ec0fee Mon Sep 17 00:00:00 2001 From: Micha Livne Date: Sun, 23 Apr 2023 16:33:01 -0700 Subject: [PATCH 17/23] 1. Added a missing file. Signed-off-by: Micha Livne --- .../kerple_relative_position_embedding.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py diff --git a/nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py b/nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py new file mode 100644 index 000000000000..b156429b4377 --- /dev/null +++ b/nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py @@ -0,0 +1,88 @@ +# coding=utf-8 +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import torch
+
+from nemo.collections.nlp.modules.common.megatron.alibi_relative_position_embedding import (
+    build_slopes,
+    build_relative_position,
+)
+
+__all__ = ['KERPLERelativePositionEmbedding']
+
+
+class KERPLERelativePositionEmbedding(torch.nn.Module):
+    """
+    KERPLE (Kernelized Relative Positional Embedding) for the auto-regressive decoder
+    and the joint encoder (symmetric for forward and backward distance).
+    Based on https://arxiv.org/abs/2205.09921; the slope initialization reuses ALiBi
+    (https://arxiv.org/abs/2108.12409).
+    """
+
+    def __init__(
+        self, bidirectional, num_attention_heads, layer_type, num_attention_heads_kerple=None, max_seq_len=512
+    ):
+        """
+        Args:
+            bidirectional: Whether to use bidirectional relative position embedding
+            num_attention_heads: Number of attention heads
+            layer_type: Layer type. Can be one of [LayerType.encoder or LayerType.decoder]. Will determine the bias construction
+            num_attention_heads_kerple: Number of attention heads for which the kerple bias will be used
+            max_seq_len: Maximum sequence length for which relative positions are pre-computed. Longer sequences will have the kerple mask computed on-the-fly, which uses more memory.
+        """
+        super().__init__()
+
+        if (num_attention_heads_kerple is None) or (num_attention_heads_kerple <= 0):
+            num_attention_heads_kerple = num_attention_heads
+
+        if num_attention_heads_kerple > num_attention_heads:
+            raise ValueError(
+                f"num_attention_heads_kerple ({num_attention_heads_kerple}) cannot be larger than num_attention_heads ({num_attention_heads})"
+            )
+
+        self.bidirectional = bidirectional
+        self.num_attention_heads = num_attention_heads
+        # LayerType.encoder or LayerType.decoder. Is only needed to determine the group for the all_reduce
+        self.layer_type = layer_type
+        # define the size of pre-computed relative position slopes.
+        # define the number of attention heads for which kerple mask will be pre-computed (the rest are disabled).
+        self.num_attention_heads_kerple = num_attention_heads_kerple
+        # sequences longer than max_seq_len will have the kerple mask computed on-the-fly, which uses more memory.
+        self.max_seq_len = max_seq_len
+
+        # initialize the slopes
+        self.kerple_b = torch.nn.Parameter(build_slopes(num_attention_heads, num_attention_heads_kerple))
+        self.kerple_a = torch.zeros_like(self.kerple_b)
+        self.kerple_p = torch.ones_like(self.kerple_b)
+
+        # cache the relative position bias.
shape (num_attention_heads, max_seq_len, max_seq_len) + self.relative_position = build_relative_position(max_seq_len, max_seq_len, num_attention_heads) + + def forward(self, query_seq_length, key_seq_length): + # used cached relative position if possible + max_seq_len = max(query_seq_length, key_seq_length) + if max_seq_len > self.max_seq_len: + relative_position = build_relative_position(max_seq_len, max_seq_len, self.num_attention_heads) + else: + relative_position = self.relative_position + # shape (num_attention_heads, query_seq_length, key_seq_length) + relative_position = relative_position[:, :query_seq_length, :key_seq_length] + # if not bidirectional, mask out the future positions + if not self.bidirectional: + relative_position = torch.tril(relative_position) + + # shape (1, num_heads, query_length, key_length) + return - self.kerple_b * torch.log(1 + self.kerple_a * relative_position.unsqueeze(0).pow(self.kerple_p)) From e3ca4381d7789040911b7b1a924699b0ca9c23fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 23 Apr 2023 23:35:51 +0000 Subject: [PATCH 18/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/megatron/kerple_relative_position_embedding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py b/nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py index b156429b4377..54276d6fa21e 100644 --- a/nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py +++ b/nemo/collections/nlp/modules/common/megatron/kerple_relative_position_embedding.py @@ -18,8 +18,8 @@ import torch from nemo.collections.nlp.modules.common.megatron.alibi_relative_position_embedding import ( - build_slopes, build_relative_position, + build_slopes, ) __all__ = ['KERPLERelativePositionEmbedding'] @@ -67,7 +67,7 @@ def __init__( self.kerple_b = torch.nn.Parameter(build_slopes(num_attention_heads, num_attention_heads_kerple)) self.kerple_a = torch.zeros_like(self.kerple_b) self.kerple_p = torch.ones_like(self.kerple_b) - + # cache the relative position bias. shape (num_attention_heads, max_seq_len, max_seq_len) self.relative_position = build_relative_position(max_seq_len, max_seq_len, num_attention_heads) @@ -85,4 +85,4 @@ def forward(self, query_seq_length, key_seq_length): relative_position = torch.tril(relative_position) # shape (1, num_heads, query_length, key_length) - return - self.kerple_b * torch.log(1 + self.kerple_a * relative_position.unsqueeze(0).pow(self.kerple_p)) + return -self.kerple_b * torch.log(1 + self.kerple_a * relative_position.unsqueeze(0).pow(self.kerple_p)) From c6fa1a994e5ad452998e255bccaf0d4603e18274 Mon Sep 17 00:00:00 2001 From: Micha Livne Date: Sun, 23 Apr 2023 16:37:51 -0700 Subject: [PATCH 19/23] 1. Fixing commits. 
Signed-off-by: Micha Livne --- docs/source/asr/asr_language_modeling.rst | 23 ------------------- .../wfst/wfst_text_normalization.rst | 2 -- 2 files changed, 25 deletions(-) diff --git a/docs/source/asr/asr_language_modeling.rst b/docs/source/asr/asr_language_modeling.rst index a0e578092f50..308df41b28f6 100644 --- a/docs/source/asr/asr_language_modeling.rst +++ b/docs/source/asr/asr_language_modeling.rst @@ -281,29 +281,6 @@ For instance, the following set of parameters would results in 2*1*2=4 beam sear beam_beta=[1.0,0.5] -Beam search ngram decoding for Transducer models (RNNT and HAT) -=============================================================== - -The similar script to evaluate an RNNT/HAT model with beam search decoding and N-gram models can be found at -`scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py `_ - -.. code-block:: - - python eval_beamsearch_ngram_transducer.py nemo_model_file= \ - input_manifest= \ - beam_width=[] \ - beam_alpha=[] \ - preds_output_folder= \ - probs_cache_file=null \ - decoding_strategy= - maes_prefix_alpha=[] \ - maes_expansion_gamma=[] \ - hat_subtract_ilm= \ - hat_ilm_weight=[] \ - - - .. _neural_rescoring: **************** diff --git a/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst b/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst index 3f19872f1f8d..632ec8768bb0 100644 --- a/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst +++ b/docs/source/nlp/text_normalization/wfst/wfst_text_normalization.rst @@ -174,8 +174,6 @@ Language Support Matrix +------------------+----------+----------+----------+--------------------+----------------------+ | Chinese | zh | x | | | | +------------------+----------+----------+----------+--------------------+----------------------+ -| Hungarian | hu | x | | | | -+------------------+----------+----------+----------+--------------------+----------------------+ See :doc:`Grammar customization ` for grammar customization details. From f6ed850174addf97e0a5f9935fdce99113d79be1 Mon Sep 17 00:00:00 2001 From: Micha Livne Date: Sun, 23 Apr 2023 16:45:20 -0700 Subject: [PATCH 20/23] 1. Debugging. 
--- .../conf/megatron_gpt_config.yaml | 1 - .../conf/fastpitch_align_44100_adapter.yaml | 306 ------------------ examples/tts/fastpitch_finetune_adapters.py | 141 -------- nemo/collections/asr/data/audio_to_label.py | 6 +- nemo/collections/asr/data/audio_to_text.py | 110 ++----- .../asr/data/audio_to_text_dali.py | 6 +- .../asr/data/audio_to_text_dataset.py | 5 - .../asr/models/configs/asr_models_config.py | 1 - nemo/collections/asr/models/rnnt_models.py | 1 - 9 files changed, 25 insertions(+), 552 deletions(-) delete mode 100644 examples/tts/conf/fastpitch_align_44100_adapter.yaml delete mode 100644 examples/tts/fastpitch_finetune_adapters.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 09b30c08dd47..27cb3af3ce91 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -45,7 +45,6 @@ model: # gradient accumulation will be done automatically based on data_parallel_size micro_batch_size: 4 # limited by GPU memory global_batch_size: 8 # will use more micro batches to reach global batch size - rampup_batch_size: null # Should be a list of 3 values: [, , ] tensor_model_parallel_size: 1 # intra-layer model parallelism pipeline_model_parallel_size: 1 # inter-layer model parallelism virtual_pipeline_model_parallel_size: null # interleaved pipeline diff --git a/examples/tts/conf/fastpitch_align_44100_adapter.yaml b/examples/tts/conf/fastpitch_align_44100_adapter.yaml deleted file mode 100644 index bac6a64b06e9..000000000000 --- a/examples/tts/conf/fastpitch_align_44100_adapter.yaml +++ /dev/null @@ -1,306 +0,0 @@ -# This config contains the default values for training FastPitch speaker adaptation -# If you want to train model on other dataset, you can change config values according to your dataset. -# Most dataset-specific arguments are in the head of the config file, see below. - -name: FastPitch - -train_dataset: ??? -validation_datasets: ??? -sup_data_path: ??? -sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id", "reference_audio"] - -# Default values from librosa.pyin -pitch_fmin: 65.40639132514966 -pitch_fmax: 2093.004522404789 - -# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values -# by running `scripts/dataset_processing/tts/extract_sup_data.py` -pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech -pitch_std: ??? # e.g. 
68.52806091308594 for LJSpeech - -# Default values for dataset with sample_rate=44100 -sample_rate: 44100 -n_mel_channels: 80 -n_window_size: 2048 -n_window_stride: 512 -n_fft: 2048 -lowfreq: 0 -highfreq: 8000 -window: hann - -phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10" -heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" - -model: - learn_alignment: true - bin_loss_warmup_epochs: 100 - - max_token_duration: 75 - symbols_embedding_dim: 384 - pitch_embedding_kernel_size: 3 - - pitch_fmin: ${pitch_fmin} - pitch_fmax: ${pitch_fmax} - - pitch_mean: ${pitch_mean} - pitch_std: ${pitch_std} - - sample_rate: ${sample_rate} - n_mel_channels: ${n_mel_channels} - n_window_size: ${n_window_size} - n_window_stride: ${n_window_stride} - n_fft: ${n_fft} - lowfreq: ${lowfreq} - highfreq: ${highfreq} - window: ${window} - - text_normalizer: - _target_: nemo_text_processing.text_normalization.normalize.Normalizer - lang: en - input_case: cased - - text_normalizer_call_kwargs: - verbose: false - punct_pre_process: true - punct_post_process: true - - text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer - punct: true - stresses: true - chars: true - apostrophe: true - pad_with_space: true - g2p: - _target_: nemo.collections.tts.g2p.modules.EnglishG2p - phoneme_dict: ${phoneme_dict_path} - heteronyms: ${heteronyms_path} - phoneme_probability: 0.5 - - adapter: - # Config of the adapter training/eval script. - adapter_name: "adapter" # Name of the adapter, used by the script - adapter_module_name: "encoder+decoder+duration_predictor+pitch_predictor+aligner" # Name of the adapter module. Combine multiple modules with '+' between module names. - adapter_state_dict_name: "adapters.pt" # If the individual adapters must be saved, a file name can be provided here. null disables this. - - # Config of the adapter module itself - _target_: nemo.collections.common.parts.adapter_modules.LinearAdapter - in_features: ${model.symbols_embedding_dim} # User must provide the output dimension of the layers of the model, which is the input dimension of this adapter. - dim: 256 # The hidden dimension of the adapter, as chosen by user, but small values are preferred to reduce param count. - activation: swish - norm_position: 'pre' # Can be `pre` or `post` - dropout: 0.0 # float, dropout for the adapter - - # Adapter strategy config - adapter_strategy: - _target_: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy - stochastic_depth: 0.0 # float, setting to > 0 will enable stochastic depth for each adapter block. - l2_lambda: 0.0 # float, setting to > 0 will enable l2 norm auxiliary loss for each adapter's output. - - # Optional global config available to all adapters at a global level. - # A global config is shared across every layer of the adapters, defining global properties rather - # than properties local to the adapter (as defined above). - # This can be useful in order to select *which type of adapter* is added, *what adapters to enable*, - # and further global operations that can decide dynamically how to support the requested adapter. 
- global_cfg: - check_encoder_adapter: True # determines whether to check if encoder adapter modules is supported - check_decoder_adapter: True # determines whether to check if decoder adapter modules is supported - check_duration_predictor_adapter: True # determines whether to check if duration_predictor adapter modules is supported - check_pitch_predictor_adapter: True # determines whether to check if pitch_predictor adapter modules is supported - check_aligner_adapter: True # determines whether to check if aligner adapter modules is supported - - train_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${train_dataset} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: true - batch_size: 32 - num_workers: 12 - pin_memory: true - - validation_ds: - dataset: - _target_: nemo.collections.tts.data.dataset.TTSDataset - manifest_filepath: ${validation_datasets} - sample_rate: ${model.sample_rate} - sup_data_path: ${sup_data_path} - sup_data_types: ${sup_data_types} - n_fft: ${model.n_fft} - win_length: ${model.n_window_size} - hop_length: ${model.n_window_stride} - window: ${model.window} - n_mels: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - max_duration: null - min_duration: 0.1 - ignore_file: null - trim: false - pitch_fmin: ${model.pitch_fmin} - pitch_fmax: ${model.pitch_fmax} - pitch_norm: true - pitch_mean: ${model.pitch_mean} - pitch_std: ${model.pitch_std} - use_beta_binomial_interpolator: true - - dataloader_params: - drop_last: false - shuffle: false - batch_size: 32 - num_workers: 8 - pin_memory: true - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - features: ${model.n_mel_channels} - lowfreq: ${model.lowfreq} - highfreq: ${model.highfreq} - n_fft: ${model.n_fft} - n_window_size: ${model.n_window_size} - window_size: false - n_window_stride: ${model.n_window_stride} - window_stride: false - pad_to: 1 - pad_value: 0 - sample_rate: ${model.sample_rate} - window: ${model.window} - normalize: null - preemph: null - dither: 0.0 - frame_splicing: 1 - log: true - log_zero_guard_type: add - log_zero_guard_value: 1e-05 - mag_power: 1.0 - - input_fft: #n_embed and padding_idx are added by the model - _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - d_embed: ${model.symbols_embedding_dim} - condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] - - output_fft: - _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder - n_layer: 6 - n_head: 1 - d_model: ${model.symbols_embedding_dim} - d_head: 64 - d_inner: 1536 - kernel_size: 3 - dropout: 0.1 - dropatt: 0.1 - dropemb: 0.0 - condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] - - 
alignment_module: - _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder - n_text_channels: ${model.symbols_embedding_dim} - condition_types: [ "add" ] # options: [ "add", "cat" ] - - duration_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] - - pitch_predictor: - _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor - input_size: ${model.symbols_embedding_dim} - kernel_size: 3 - filter_size: 256 - dropout: 0.1 - n_layers: 2 - condition_types: [ "add", "layernorm" ] # options: [ "add", "cat", "layernorm" ] - - speaker_encoder: - _target_: nemo.collections.tts.modules.submodules.SpeakerEncoder - lookup_module: - _target_: nemo.collections.tts.modules.submodules.SpeakerLookupTable - n_speakers: ??? - embedding_dim: ${model.symbols_embedding_dim} - gst_module: - _target_: nemo.collections.tts.modules.submodules.GlobalStyleToken - gst_size: ${model.symbols_embedding_dim} - n_style_token: 10 - n_style_attn_head: 4 - reference_encoder: - _target_: nemo.collections.tts.modules.submodules.ReferenceEncoder - n_mels: ${model.n_mel_channels} - cnn_filters: [32, 32, 64, 64, 128, 128] - dropout: 0.2 - gru_hidden: ${model.symbols_embedding_dim} - kernel_size: 3 - stride: 2 - padding: 1 - bias: true - - optim: - name: adamw - lr: 1e-3 - betas: [0.9, 0.999] - weight_decay: 1e-6 - - sched: - name: NoamAnnealing - warmup_steps: 1000 - last_epoch: -1 - d_model: 1 # Disable scaling based on model dim - -trainer: - num_nodes: 1 - devices: 1 - accelerator: gpu - strategy: ddp - precision: 16 - max_epochs: 1000 - accumulate_grad_batches: 1 - gradient_clip_val: 1000.0 - enable_checkpointing: false # Provided by exp_manager - logger: false # Provided by exp_manager - log_every_n_steps: 100 - check_val_every_n_epoch: 1 - benchmark: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - monitor: val_loss - resume_if_exists: false - resume_ignore_no_checkpoint: false diff --git a/examples/tts/fastpitch_finetune_adapters.py b/examples/tts/fastpitch_finetune_adapters.py deleted file mode 100644 index 396552b0f4fd..000000000000 --- a/examples/tts/fastpitch_finetune_adapters.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from dataclasses import is_dataclass - -import pytorch_lightning as pl -from omegaconf import DictConfig, OmegaConf, open_dict - -from nemo.collections.common.callbacks import LogEpochTimeCallback -from nemo.collections.tts.models import FastPitchModel -from nemo.core import adapter_mixins -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager - - -def update_model_config_to_support_adapter(config) -> DictConfig: - with open_dict(config): - enc_adapter_metadata = adapter_mixins.get_registered_adapter(config.input_fft._target_) - if enc_adapter_metadata is not None: - config.input_fft._target_ = enc_adapter_metadata.adapter_class_path - - dec_adapter_metadata = adapter_mixins.get_registered_adapter(config.output_fft._target_) - if dec_adapter_metadata is not None: - config.output_fft._target_ = dec_adapter_metadata.adapter_class_path - - pitch_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.pitch_predictor._target_) - if pitch_predictor_adapter_metadata is not None: - config.pitch_predictor._target_ = pitch_predictor_adapter_metadata.adapter_class_path - - duration_predictor_adapter_metadata = adapter_mixins.get_registered_adapter(config.duration_predictor._target_) - if duration_predictor_adapter_metadata is not None: - config.duration_predictor._target_ = duration_predictor_adapter_metadata.adapter_class_path - - aligner_adapter_metadata = adapter_mixins.get_registered_adapter(config.alignment_module._target_) - if aligner_adapter_metadata is not None: - config.alignment_module._target_ = aligner_adapter_metadata.adapter_class_path - - return config - - -def add_global_adapter_cfg(model, global_adapter_cfg): - # Convert to DictConfig from dict or Dataclass - if is_dataclass(global_adapter_cfg): - global_adapter_cfg = OmegaConf.structured(global_adapter_cfg) - - if not isinstance(global_adapter_cfg, DictConfig): - global_adapter_cfg = DictConfig(global_adapter_cfg) - - # Update the model.cfg with information about the new adapter global cfg - with open_dict(global_adapter_cfg), open_dict(model.cfg): - if 'adapters' not in model.cfg: - model.cfg.adapters = OmegaConf.create({}) - - # Add the global config for adapters to the model's internal config - model.cfg.adapters[model.adapter_global_cfg_key] = global_adapter_cfg - - # Update all adapter modules (that already exist) with this global adapter config - model.update_adapter_cfg(model.cfg.adapters) - - -@hydra_runner(config_path="conf", config_name="fastpitch_align_44100_adapter") -def main(cfg): - if hasattr(cfg.model.optim, 'sched'): - logging.warning("You are using an optimizer scheduler while finetuning. 
Are you sure this is intended?") - if cfg.model.optim.lr > 1e-3 or cfg.model.optim.lr < 1e-5: - logging.warning("The recommended learning rate for finetuning is 2e-4") - - trainer = pl.Trainer(**cfg.trainer) - exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None)) - # Initialize FastPitchModel - model = FastPitchModel(cfg=update_model_config_to_support_adapter(cfg.model), trainer=trainer) - model.maybe_init_from_pretrained_checkpoint(cfg=cfg) - - # Extract adapter parameters - with open_dict(cfg.model.adapter): - # Extract the name of the adapter (must be given for training) - adapter_name = cfg.model.adapter.pop("adapter_name", "adapter") - # Extract the name of the modules where adapters need to be added (must be given for training) - adapter_module_name = cfg.model.adapter.pop("adapter_module_name", None) - # Name of the adapter checkpoint which will be saved after training - adapter_state_dict_name = cfg.model.adapter.pop("adapter_state_dict_name", None) - - # augment adapter name with module name, if not provided by user - if adapter_module_name is not None and ':' not in adapter_name: - adapter_name = f'{adapter_module_name}:{adapter_name}' - - # Extract the global adapter config, if provided - adapter_global_cfg = cfg.model.adapter.pop(model.adapter_global_cfg_key, None) - - # Freeze model - model.freeze() - - # Setup adapters - if adapter_global_cfg is not None: - add_global_adapter_cfg(model, adapter_global_cfg) - - # Add adapters - model.add_adapter(name=adapter_name, cfg=cfg.model.adapter) - assert model.is_adapter_available() - # enable adapters - model.set_enabled_adapters(enabled=False) - model.set_enabled_adapters(adapter_name, enabled=True) - - # Set model to training mode. - model = model.train() - # Then, Unfreeze just the adapter weights that were enabled above (no part of model) - model.unfreeze_enabled_adapters() - # summarize the model - model.summarize() - - lr_logger = pl.callbacks.LearningRateMonitor() - epoch_time_logger = LogEpochTimeCallback() - trainer.callbacks.extend([lr_logger, epoch_time_logger]) - trainer.fit(model) - - # Save the adapter state dict after training has completed - if adapter_state_dict_name is not None: - state_path = exp_log_dir if exp_log_dir is not None else os.getcwd() - ckpt_path = os.path.join(state_path, "checkpoints") - if os.path.exists(ckpt_path): - state_path = ckpt_path - - # Save the adapter modules in a seperate file - model.save_adapters(os.path.join(state_path, adapter_state_dict_name)) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/asr/data/audio_to_label.py b/nemo/collections/asr/data/audio_to_label.py index 4317642a8fff..fe12be42be94 100644 --- a/nemo/collections/asr/data/audio_to_label.py +++ b/nemo/collections/asr/data/audio_to_label.py @@ -19,7 +19,7 @@ import torch import webdataset as wd -from nemo.collections.asr.data.audio_to_text import cache_datastore_manifests, expand_sharded_filepaths +from nemo.collections.asr.data.audio_to_text import cache_datastore_manifests, expand_audio_filepaths from nemo.collections.asr.parts.preprocessing.segment import available_formats as valid_sf_formats from nemo.collections.common.parts.preprocessing import collections from nemo.core.classes import Dataset, IterableDataset @@ -560,8 +560,8 @@ def __init__( for idx in range(len(self.labels[:5])): logging.debug(" label id {} and its mapped label {}".format(idx, self.id2label[idx])) - audio_tar_filepaths = expand_sharded_filepaths( - 
sharded_filepaths=audio_tar_filepaths, + audio_tar_filepaths = expand_audio_filepaths( + audio_tar_filepaths=audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank, diff --git a/nemo/collections/asr/data/audio_to_text.py b/nemo/collections/asr/data/audio_to_text.py index 756c05631627..2f5b3add9fcc 100644 --- a/nemo/collections/asr/data/audio_to_text.py +++ b/nemo/collections/asr/data/audio_to_text.py @@ -171,32 +171,31 @@ def process_text_by_sample(self, sample: collections.ASRAudioText.OUTPUT_TYPE) - return t, tl -def expand_sharded_filepaths(sharded_filepaths, shard_strategy: str, world_size: int, global_rank: int): +def expand_audio_filepaths(audio_tar_filepaths, shard_strategy: str, world_size: int, global_rank: int): valid_shard_strategies = ['scatter', 'replicate'] if shard_strategy not in valid_shard_strategies: raise ValueError(f"`shard_strategy` must be one of {valid_shard_strategies}") - if isinstance(sharded_filepaths, str): + if isinstance(audio_tar_filepaths, str): # Replace '(' and '[' with '{' brace_keys_open = ['(', '[', '<', '_OP_'] for bkey in brace_keys_open: - if bkey in sharded_filepaths: - sharded_filepaths = sharded_filepaths.replace(bkey, "{") + if bkey in audio_tar_filepaths: + audio_tar_filepaths = audio_tar_filepaths.replace(bkey, "{") # Replace ')' and ']' with '}' brace_keys_close = [')', ']', '>', '_CL_'] for bkey in brace_keys_close: - if bkey in sharded_filepaths: - sharded_filepaths = sharded_filepaths.replace(bkey, "}") + if bkey in audio_tar_filepaths: + audio_tar_filepaths = audio_tar_filepaths.replace(bkey, "}") - if isinstance(sharded_filepaths, str): + if isinstance(audio_tar_filepaths, str): # Brace expand - sharded_filepaths = list(braceexpand.braceexpand(sharded_filepaths)) + audio_tar_filepaths = list(braceexpand.braceexpand(audio_tar_filepaths)) # Expand store paths into WebDataset URLs - sharded_filepaths = [ - datastore_path_to_webdataset_url(p) if is_datastore_path(p) and is_tarred_path(p) else p - for p in sharded_filepaths + audio_tar_filepaths = [ + datastore_path_to_webdataset_url(p) if is_datastore_path(p) else p for p in audio_tar_filepaths ] # Check for distributed and partition shards accordingly @@ -204,15 +203,15 @@ def expand_sharded_filepaths(sharded_filepaths, shard_strategy: str, world_size: if shard_strategy == 'scatter': logging.info("All tarred dataset shards will be scattered evenly across all nodes.") - if len(sharded_filepaths) % world_size != 0: + if len(audio_tar_filepaths) % world_size != 0: logging.warning( - f"Number of shards in tarred dataset ({len(sharded_filepaths)}) is not divisible " + f"Number of shards in tarred dataset ({len(audio_tar_filepaths)}) is not divisible " f"by number of distributed workers ({world_size})." ) - begin_idx = (len(sharded_filepaths) // world_size) * global_rank - end_idx = begin_idx + len(sharded_filepaths) // world_size - sharded_filepaths = sharded_filepaths[begin_idx:end_idx] + begin_idx = (len(audio_tar_filepaths) // world_size) * global_rank + end_idx = begin_idx + len(audio_tar_filepaths) // world_size + audio_tar_filepaths = audio_tar_filepaths[begin_idx:end_idx] logging.info( "Partitioning tarred dataset: process (%d) taking shards [%d, %d)", global_rank, begin_idx, end_idx ) @@ -222,7 +221,7 @@ def expand_sharded_filepaths(sharded_filepaths, shard_strategy: str, world_size: else: raise ValueError(f"Invalid shard strategy ! 
Allowed values are : {valid_shard_strategies}") - return sharded_filepaths + return audio_tar_filepaths def cache_datastore_manifests( @@ -346,47 +345,6 @@ def cache_data(manifest_filepaths, cache_audio, num_workers, max_num_workers): ) -"""Optionally expand / shard the list of manifests - This is made to use the same notation as the sharded audio files - - Args: - manifest_filepaths: list of manifest files (the sharded notation) - shard_strategy: scatter or replicate (scatter by default) - shard_manifests: bool, if False, no sharding / manifest filepath expansion will be attempted - global_rank: int, the rank of this worker - world_size: int, total number of workers -""" - - -def shard_manifests_if_needed( - manifest_filepaths: Union[str, List[str]], - shard_strategy: str, - shard_manifests: bool, - global_rank: int, - world_size: int, -): - if shard_manifests: - if not torch.distributed.is_available(): - logging.warning("Not running in torch.distributed mode. Manifest sharding not available") - return manifest_filepaths - - if not torch.distributed.is_initialized(): - logging.warning( - 'Manifest sharding was requested but torch.distributed is not initialized ' - 'Did you intend to set the defer_setup flag?' - ) - return manifest_filepaths - - manifest_filepaths = expand_sharded_filepaths( - sharded_filepaths=manifest_filepaths, - shard_strategy=shard_strategy, - world_size=world_size, - global_rank=global_rank, - ) - - return manifest_filepaths - - class _AudioTextDataset(Dataset): """ Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). @@ -790,7 +748,6 @@ class _TarredAudioToTextDataset(IterableDataset): occasions (when the number of shards is not divisible with ``world_size``), will not sample the entire dataset. For these reasons it is not advisable to use tarred datasets as validation or test datasets. - shard_manifests (bool): Whether or not to try / shard manifests. Defaults to False. global_rank (int): Worker rank, used for partitioning shards. Defaults to 0. world_size (int): Total number of processes, used for partitioning shards. Defaults to 0. 
return_sample_id (bool): whether to return the sample_id as a part of each sample @@ -812,22 +769,10 @@ def __init__( eos_id: Optional[int] = None, pad_id: int = 0, shard_strategy: str = "scatter", - shard_manifests: bool = False, global_rank: int = 0, world_size: int = 0, return_sample_id: bool = False, ): - self.shard_manifests = shard_manifests - - # Shard manifests if necessary and possible and then expand the paths - manifest_filepath = shard_manifests_if_needed( - shard_manifests=shard_manifests, - shard_strategy=shard_strategy, - manifest_filepaths=manifest_filepath, - world_size=world_size, - global_rank=global_rank, - ) - # If necessary, cache manifests from object store cache_datastore_manifests(manifest_filepaths=manifest_filepath) @@ -843,8 +788,6 @@ def __init__( index_by_file_id=True, # Must set this so the manifest lines can be indexed by file ID ) - self.len = self._compute_len() - self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor) self.trim = trim self.eos_id = eos_id @@ -852,8 +795,8 @@ def __init__( self.pad_id = pad_id self.return_sample_id = return_sample_id - audio_tar_filepaths = expand_sharded_filepaths( - sharded_filepaths=audio_tar_filepaths, + audio_tar_filepaths = expand_audio_filepaths( + audio_tar_filepaths=audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank, @@ -985,19 +928,8 @@ def get_manifest_sample(self, sample_id): def __iter__(self): return self._dataset.__iter__() - def _compute_len(self): - if self.shard_manifests and torch.distributed.is_available() and torch.distributed.is_initialized(): - my_len = torch.tensor(len(self.manifest_processor.collection), dtype=torch.int32).cuda() - torch.distributed.all_reduce(my_len) - my_len = my_len.int() - logging.info(f'Sharded manifests: Total length: {my_len}') - else: - my_len = len(self.manifest_processor.collection) - - return my_len - def __len__(self): - return self.len + return len(self.manifest_processor.collection) class TarredAudioToCharDataset(_TarredAudioToTextDataset): @@ -1110,7 +1042,6 @@ def __init__( parser: Optional[str] = 'en', pad_id: int = 0, shard_strategy: str = "scatter", - shard_manifests: bool = False, global_rank: int = 0, world_size: int = 0, return_sample_id: bool = False, @@ -1136,7 +1067,6 @@ def __init__( eos_id=eos_id, pad_id=pad_id, shard_strategy=shard_strategy, - shard_manifests=shard_manifests, global_rank=global_rank, world_size=world_size, return_sample_id=return_sample_id, @@ -1237,7 +1167,6 @@ def __init__( trim: bool = False, use_start_end_token: bool = True, shard_strategy: str = "scatter", - shard_manifests: bool = False, global_rank: int = 0, world_size: int = 0, return_sample_id: bool = False, @@ -1290,7 +1219,6 @@ def __call__(self, *args): eos_id=eos_id, pad_id=pad_id, shard_strategy=shard_strategy, - shard_manifests=shard_manifests, global_rank=global_rank, world_size=world_size, return_sample_id=return_sample_id, diff --git a/nemo/collections/asr/data/audio_to_text_dali.py b/nemo/collections/asr/data/audio_to_text_dali.py index 77bd71129cc2..b65823f94c97 100644 --- a/nemo/collections/asr/data/audio_to_text_dali.py +++ b/nemo/collections/asr/data/audio_to_text_dali.py @@ -22,7 +22,7 @@ import torch from omegaconf import DictConfig -from nemo.collections.asr.data.audio_to_text import ASRManifestProcessor, expand_sharded_filepaths +from nemo.collections.asr.data.audio_to_text import ASRManifestProcessor, expand_audio_filepaths from 
nemo.collections.common.parts.preprocessing import parsers from nemo.utils import logging, model_utils @@ -345,10 +345,10 @@ def __init__( self.is_tarred_dataset = False elif audio_tar_filepaths is not None and audio_tar_index_filepaths is not None: - audio_tar_filepaths = expand_sharded_filepaths( + audio_tar_filepaths = expand_audio_filepaths( audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank ) - audio_tar_index_filepaths = expand_sharded_filepaths( + audio_tar_index_filepaths = expand_audio_filepaths( audio_tar_index_filepaths, shard_strategy=shard_strategy, world_size=world_size, diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index 325857e81323..1cb0b880aa69 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -346,9 +346,6 @@ def get_tarred_dataset( ): if len(tarred_audio_filepath) == 1: tarred_audio_filepath = tarred_audio_filepath[0] - if len(manifest_filepath) == 1: - manifest_filepath = manifest_filepath[0] - if tokenizer is None: dataset = audio_to_text.TarredAudioToCharDataset( audio_tar_filepaths=tarred_audio_filepath, @@ -366,7 +363,6 @@ def get_tarred_dataset( trim=config.get('trim_silence', False), parser=config.get('parser', 'en'), shard_strategy=config.get('tarred_shard_strategy', 'scatter'), - shard_manifests=config.get('shard_manifests', False), global_rank=global_rank, world_size=world_size, return_sample_id=config.get('return_sample_id', False), @@ -385,7 +381,6 @@ def get_tarred_dataset( trim=config.get('trim_silence', False), use_start_end_token=config.get('use_start_end_token', True), shard_strategy=config.get('tarred_shard_strategy', 'scatter'), - shard_manifests=config.get('shard_manifests', False), global_rank=global_rank, world_size=world_size, return_sample_id=config.get('return_sample_id', False), diff --git a/nemo/collections/asr/models/configs/asr_models_config.py b/nemo/collections/asr/models/configs/asr_models_config.py index 609d42216659..e0ceeff6b186 100644 --- a/nemo/collections/asr/models/configs/asr_models_config.py +++ b/nemo/collections/asr/models/configs/asr_models_config.py @@ -38,7 +38,6 @@ class ASRDatasetConfig(nemo.core.classes.dataset.DatasetConfig): is_tarred: bool = False tarred_audio_filepaths: Optional[Any] = None tarred_shard_strategy: str = "scatter" - shard_manifests: bool = False shuffle_n: int = 0 # Optional diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index f4e227f510af..a3e36dbc1522 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -242,7 +242,6 @@ def transcribe( """ if paths2audio_files is None or len(paths2audio_files) == 0: return {} - # We will store transcriptions here hypotheses = [] all_hypotheses = [] From 27cf8de39a4275333c4183be0fd99779d4d596fe Mon Sep 17 00:00:00 2001 From: Micha Livne Date: Sun, 23 Apr 2023 16:46:51 -0700 Subject: [PATCH 21/23] 1. Debugging. 
--- .../language_modeling/megatron_base_model.py | 1 - .../language_modeling/megatron_gpt_model.py | 42 +- .../modules/common/megatron/megatron_init.py | 3 +- nemo/collections/tts/data/dataset.py | 47 --- nemo/collections/tts/models/fastpitch.py | 97 +---- nemo/collections/tts/modules/__init__.py | 1 - nemo/collections/tts/modules/adapters.py | 147 ------- nemo/collections/tts/modules/aligner.py | 11 +- nemo/collections/tts/modules/fastpitch.py | 151 +++---- nemo/collections/tts/modules/submodules.py | 344 +--------------- nemo/collections/tts/modules/transformer.py | 98 ++--- nemo/collections/tts/parts/mixins/__init__.py | 15 - .../parts/mixins/fastpitch_adapter_mixins.py | 368 ------------------ nemo/collections/tts/torch/tts_data_types.py | 5 - 14 files changed, 117 insertions(+), 1213 deletions(-) delete mode 100644 nemo/collections/tts/modules/adapters.py delete mode 100644 nemo/collections/tts/parts/mixins/__init__.py delete mode 100644 nemo/collections/tts/parts/mixins/fastpitch_adapter_mixins.py diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 5e5c177737fa..3b223a5744af 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -120,7 +120,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), - rampup_batch_size=cfg.get('rampup_batch_size'), use_fp8=cfg.get('fp8', False), seed=self.cfg.get('seed', 1234), apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d8f90c500182..70511288a9e8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -55,7 +55,6 @@ from nemo.utils import logging try: - import apex.transformer.pipeline_parallel.utils from apex.transformer.pipeline_parallel.utils import get_num_microbatches HAVE_APEX = True @@ -429,23 +428,15 @@ def training_step(self, dataloader_iter, batch_idx): 'global_step', self.trainer.global_step, prog_bar=True, rank_zero_only=True, batch_size=1, ) - consumed_samples = self.compute_consumed_samples(self.trainer.global_step - self.init_global_step) # TODO: make sure compute_consumed_samples works for pipeline parallelism self.log( - 'consumed_samples', consumed_samples, prog_bar=True, rank_zero_only=True, batch_size=1, + 'consumed_samples', + self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), + prog_bar=True, + rank_zero_only=True, + batch_size=1, ) - if self.cfg.get('rampup_batch_size', None): - micro_batch_size = self.cfg.get('micro_batch_size', 1) - total_gpus_number = self.trainer.num_devices * self.trainer.num_nodes - current_global_batch_size = get_num_microbatches() * micro_batch_size * total_gpus_number - self.log('global_batch_size', current_global_batch_size, prog_bar=True, rank_zero_only=True, batch_size=1) - - num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR - num_microbatch_calculator.update( - consumed_samples=consumed_samples, consistency_check=True, - ) - 
return loss_mean def backward(self, *args, **kwargs): @@ -826,29 +817,6 @@ def setup(self, stage=None): self.init_consumed_samples = init_consumed_samples self.init_global_step = self.trainer.global_step - rampup_batch_size = self.cfg.get('rampup_batch_size', None) - if rampup_batch_size: - start_batch_size = rampup_batch_size[0] - batch_size_increment = rampup_batch_size[1] - total_gpus_number = self.trainer.num_devices * self.trainer.num_nodes - - assert start_batch_size % (total_gpus_number) == 0, ( - 'expected' - ' start batch size ({}) to be divisible by total number of GPUs' - ' ({})'.format(start_batch_size, total_gpus_number) - ) - - micro_batch_size = self.cfg.get('micro_batch_size', 1) - tensor_model_parallel_size = self.cfg.get('tensor_model_parallel_size', 1) - pipeline_model_parallel_size = self.cfg.get('pipeline_model_parallel_size', 1) - total_data_parallel_size = total_gpus_number // (tensor_model_parallel_size * pipeline_model_parallel_size) - - assert batch_size_increment % (micro_batch_size * total_data_parallel_size) == 0, ( - 'expected' - ' batch size increment ({}) to be divisible by micro_batch_size ({}) times total data parallel size' - ' ({})'.format(batch_size_increment, micro_batch_size, total_data_parallel_size) - ) - if stage == 'predict': return else: diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index e0551fad5d16..65a788de438c 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -65,7 +65,6 @@ def initialize_model_parallel_for_nemo( pipeline_model_parallel_split_rank=None, micro_batch_size=None, global_batch_size=None, - rampup_batch_size=None, use_fp8=False, seed=1234, apex_transformer_log_level=30, @@ -122,7 +121,7 @@ def initialize_model_parallel_for_nemo( global_batch_size=global_batch_size, micro_batch_size=micro_batch_size, data_parallel_size=app_state.data_parallel_size, - rampup_batch_size=rampup_batch_size, + rampup_batch_size=None, ) else: if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatches): diff --git a/nemo/collections/tts/data/dataset.py b/nemo/collections/tts/data/dataset.py index 6bb41d341b31..af4df1e58668 100644 --- a/nemo/collections/tts/data/dataset.py +++ b/nemo/collections/tts/data/dataset.py @@ -50,7 +50,6 @@ LogMel, P_voiced, Pitch, - ReferenceAudio, SpeakerID, TTSDataType, Voiced_mask, @@ -484,13 +483,6 @@ def add_energy(self, **kwargs): def add_speaker_id(self, **kwargs): pass - def add_reference_audio(self, **kwargs): - assert SpeakerID in self.sup_data_types, "Please add speaker_id in sup_data_types." 
- """Add a mapping for each speaker to their manifest indexes""" - self.speaker_to_index_map = defaultdict(set) - for i, d in enumerate(self.data): - self.speaker_to_index_map[d['speaker_id']].add(i) - def get_spec(self, audio): with torch.cuda.amp.autocast(enabled=False): spec = self.stft(audio) @@ -530,12 +522,6 @@ def _pad_wav_to_multiple(self, wav): ) return wav - # Random sample a reference index from the same speaker - def sample_reference_index(self, speaker_id): - reference_pool = self.speaker_to_index_map[speaker_id] - reference_index = random.sample(reference_pool, 1)[0] - return reference_index - def __getitem__(self, index): sample = self.data[index] @@ -697,19 +683,6 @@ def __getitem__(self, index): if SpeakerID in self.sup_data_types_set: speaker_id = torch.tensor(sample["speaker_id"]).long() - reference_audio, reference_audio_length = None, None - if ReferenceAudio in self.sup_data_types_set: - reference_index = self.sample_reference_index(sample["speaker_id"]) - reference_audio = self.featurizer.process( - self.data[reference_index]["audio_filepath"], - trim=self.trim, - trim_ref=self.trim_ref, - trim_top_db=self.trim_top_db, - trim_frame_length=self.trim_frame_length, - trim_hop_length=self.trim_hop_length, - ) - reference_audio_length = torch.tensor(reference_audio.shape[0]).long() - return ( audio, audio_length, @@ -727,8 +700,6 @@ def __getitem__(self, index): voiced_mask, p_voiced, audio_shifted, - reference_audio, - reference_audio_length, ) def __len__(self): @@ -762,8 +733,6 @@ def general_collate_fn(self, batch): voiced_masks, p_voiceds, _, - _, - reference_audio_lengths, ) = zip(*batch) max_audio_len = max(audio_lengths).item() @@ -772,9 +741,6 @@ def general_collate_fn(self, batch): max_durations_len = max([len(i) for i in durations_list]) if Durations in self.sup_data_types_set else None max_pitches_len = max(pitches_lengths).item() if Pitch in self.sup_data_types_set else None max_energies_len = max(energies_lengths).item() if Energy in self.sup_data_types_set else None - max_reference_audio_len = ( - max(reference_audio_lengths).item() if ReferenceAudio in self.sup_data_types_set else None - ) if LogMel in self.sup_data_types_set: log_mel_pad = torch.finfo(batch[0][4].dtype).tiny @@ -799,7 +765,6 @@ def general_collate_fn(self, batch): voiced_masks, p_voiceds, audios_shifted, - reference_audios, ) = ( [], [], @@ -811,7 +776,6 @@ def general_collate_fn(self, batch): [], [], [], - [], ) for i, sample_tuple in enumerate(batch): @@ -832,8 +796,6 @@ def general_collate_fn(self, batch): voiced_mask, p_voiced, audio_shifted, - reference_audio, - reference_audios_length, ) = sample_tuple audio = general_padding(audio, audio_len.item(), max_audio_len) @@ -872,11 +834,6 @@ def general_collate_fn(self, batch): if SpeakerID in self.sup_data_types_set: speaker_ids.append(speaker_id) - if ReferenceAudio in self.sup_data_types_set: - reference_audios.append( - general_padding(reference_audio, reference_audios_length.item(), max_reference_audio_len) - ) - data_dict = { "audio": torch.stack(audios), "audio_lens": torch.stack(audio_lengths), @@ -894,10 +851,6 @@ def general_collate_fn(self, batch): "voiced_mask": torch.stack(voiced_masks) if Voiced_mask in self.sup_data_types_set else None, "p_voiced": torch.stack(p_voiceds) if P_voiced in self.sup_data_types_set else None, "audio_shifted": torch.stack(audios_shifted) if audio_shifted is not None else None, - "reference_audio": torch.stack(reference_audios) if ReferenceAudio in self.sup_data_types_set else None, - 
"reference_audio_lens": torch.stack(reference_audio_lengths) - if ReferenceAudio in self.sup_data_types_set - else None, } return data_dict diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index 5502e69a3111..7d4a110df86f 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -26,7 +26,6 @@ from nemo.collections.tts.losses.fastpitchloss import DurationLoss, EnergyLoss, MelLoss, PitchLoss from nemo.collections.tts.models.base import SpectrogramGenerator from nemo.collections.tts.modules.fastpitch import FastPitchModule -from nemo.collections.tts.parts.mixins import FastPitchAdapterModelMixin from nemo.collections.tts.parts.utils.helpers import ( batch_from_ragged, plot_alignment_to_numpy, @@ -75,7 +74,7 @@ class TextTokenizerConfig: text_tokenizer: TextTokenizer = TextTokenizer() -class FastPitchModel(SpectrogramGenerator, Exportable, FastPitchAdapterModelMixin): +class FastPitchModel(SpectrogramGenerator, Exportable): """FastPitch model (https://arxiv.org/abs/2006.06873) that is used to generate mel spectrogram from text.""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): @@ -139,25 +138,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): output_fft = instantiate(self._cfg.output_fft) duration_predictor = instantiate(self._cfg.duration_predictor) pitch_predictor = instantiate(self._cfg.pitch_predictor) - speaker_encoder = instantiate(self._cfg.get("speaker_encoder", None)) - energy_embedding_kernel_size = cfg.get("energy_embedding_kernel_size", 0) - energy_predictor = instantiate(self._cfg.get("energy_predictor", None)) - - # [TODO] may remove if we change the pre-trained config - # cfg: condition_types = [ "add" ] - n_speakers = cfg.get("n_speakers", 0) speaker_emb_condition_prosody = cfg.get("speaker_emb_condition_prosody", False) speaker_emb_condition_decoder = cfg.get("speaker_emb_condition_decoder", False) speaker_emb_condition_aligner = cfg.get("speaker_emb_condition_aligner", False) - if n_speakers > 1 and "add" not in input_fft.cond_input.condition_types: - input_fft.cond_input.condition_types.append("add") - if speaker_emb_condition_prosody: - duration_predictor.cond_input.condition_types.append("add") - pitch_predictor.cond_input.condition_types.append("add") - if speaker_emb_condition_decoder: - output_fft.cond_input.condition_types.append("add") - if speaker_emb_condition_aligner and self.aligner is not None: - self.aligner.cond_input.condition_types.append("add") + energy_embedding_kernel_size = cfg.get("energy_embedding_kernel_size", 0) + energy_predictor = instantiate(self._cfg.get("energy_predictor", None)) self.fastpitch = FastPitchModule( input_fft, @@ -166,13 +151,15 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): pitch_predictor, energy_predictor, self.aligner, - speaker_encoder, - n_speakers, + cfg.n_speakers, cfg.symbols_embedding_dim, cfg.pitch_embedding_kernel_size, energy_embedding_kernel_size, cfg.n_mel_channels, cfg.max_token_duration, + speaker_emb_condition_prosody, + speaker_emb_condition_decoder, + speaker_emb_condition_aligner, ) self._input_types = self._output_types = None self.export_config = { @@ -309,9 +296,6 @@ def parse(self, str_input: str, normalize=True) -> torch.tensor: "attn_prior": NeuralType(('B', 'T_spec', 'T_text'), ProbsType(), optional=True), "mel_lens": NeuralType(('B'), LengthsType(), optional=True), "input_lens": NeuralType(('B'), LengthsType(), optional=True), - # reference_* data is used for 
multi-speaker FastPitch training - "reference_spec": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType(), optional=True), - "reference_spec_lens": NeuralType(('B'), LengthsType(), optional=True), } ) def forward( @@ -327,8 +311,6 @@ def forward( attn_prior=None, mel_lens=None, input_lens=None, - reference_spec=None, - reference_spec_lens=None, ): return self.fastpitch( text=text, @@ -341,43 +323,21 @@ def forward( attn_prior=attn_prior, mel_lens=mel_lens, input_lens=input_lens, - reference_spec=reference_spec, - reference_spec_lens=reference_spec_lens, ) @typecheck(output_types={"spect": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType())}) def generate_spectrogram( - self, - tokens: 'torch.tensor', - speaker: Optional[int] = None, - pace: float = 1.0, - reference_spec: Optional['torch.tensor'] = None, - reference_spec_lens: Optional['torch.tensor'] = None, + self, tokens: 'torch.tensor', speaker: Optional[int] = None, pace: float = 1.0 ) -> torch.tensor: if self.training: logging.warning("generate_spectrogram() is meant to be called in eval mode.") if isinstance(speaker, int): speaker = torch.tensor([speaker]).to(self.device) - spect, *_ = self( - text=tokens, - durs=None, - pitch=None, - speaker=speaker, - pace=pace, - reference_spec=reference_spec, - reference_spec_lens=reference_spec_lens, - ) + spect, *_ = self(text=tokens, durs=None, pitch=None, speaker=speaker, pace=pace) return spect def training_step(self, batch, batch_idx): - attn_prior, durs, speaker, energy, reference_audio, reference_audio_len = ( - None, - None, - None, - None, - None, - None, - ) + attn_prior, durs, speaker, energy = None, None, None, None if self.learn_alignment: assert self.ds_class_name == "TTSDataset", f"Unknown dataset class: {self.ds_class_name}" batch_dict = process_batch(batch, self._train_dl.dataset.sup_data_types_set) @@ -389,17 +349,10 @@ def training_step(self, batch, batch_idx): pitch = batch_dict.get("pitch", None) energy = batch_dict.get("energy", None) speaker = batch_dict.get("speaker_id", None) - reference_audio = batch_dict.get("reference_audio", None) - reference_audio_len = batch_dict.get("reference_audio_lens", None) else: audio, audio_lens, text, text_lens, durs, pitch, speaker = batch mels, spec_len = self.preprocessor(input_signal=audio, length=audio_lens) - reference_spec, reference_spec_len = None, None - if reference_audio is not None: - reference_spec, reference_spec_len = self.preprocessor( - input_signal=reference_audio, length=reference_audio_len - ) ( mels_pred, @@ -422,8 +375,6 @@ def training_step(self, batch, batch_idx): speaker=speaker, pace=1.0, spec=mels if self.learn_alignment else None, - reference_spec=reference_spec, - reference_spec_lens=reference_spec_len, attn_prior=attn_prior, mel_lens=spec_len, input_lens=text_lens, @@ -481,14 +432,7 @@ def training_step(self, batch, batch_idx): return loss def validation_step(self, batch, batch_idx): - attn_prior, durs, speaker, energy, reference_audio, reference_audio_len = ( - None, - None, - None, - None, - None, - None, - ) + attn_prior, durs, speaker, energy = None, None, None, None if self.learn_alignment: assert self.ds_class_name == "TTSDataset", f"Unknown dataset class: {self.ds_class_name}" batch_dict = process_batch(batch, self._train_dl.dataset.sup_data_types_set) @@ -500,17 +444,10 @@ def validation_step(self, batch, batch_idx): pitch = batch_dict.get("pitch", None) energy = batch_dict.get("energy", None) speaker = batch_dict.get("speaker_id", None) - reference_audio = batch_dict.get("reference_audio", None) 
- reference_audio_len = batch_dict.get("reference_audio_lens", None) else: audio, audio_lens, text, text_lens, durs, pitch, speaker = batch mels, mel_lens = self.preprocessor(input_signal=audio, length=audio_lens) - reference_spec, reference_spec_len = None, None - if reference_audio is not None: - reference_spec, reference_spec_len = self.preprocessor( - input_signal=reference_audio, length=reference_audio_len - ) # Calculate val loss on ground truth durations to better align L2 loss in time (mels_pred, _, _, log_durs_pred, pitch_pred, _, _, _, attn_hard_dur, pitch, energy_pred, energy_tgt,) = self( @@ -521,8 +458,6 @@ def validation_step(self, batch, batch_idx): speaker=speaker, pace=1.0, spec=mels if self.learn_alignment else None, - reference_spec=reference_spec, - reference_spec_lens=reference_spec_len, attn_prior=attn_prior, mel_lens=mel_lens, input_lens=text_lens, @@ -552,13 +487,13 @@ def validation_epoch_end(self, outputs): mel_loss = collect("mel_loss") dur_loss = collect("dur_loss") pitch_loss = collect("pitch_loss") - self.log("val_loss", val_loss, sync_dist=True) - self.log("val_mel_loss", mel_loss, sync_dist=True) - self.log("val_dur_loss", dur_loss, sync_dist=True) - self.log("val_pitch_loss", pitch_loss, sync_dist=True) + self.log("val_loss", val_loss) + self.log("val_mel_loss", mel_loss) + self.log("val_dur_loss", dur_loss) + self.log("val_pitch_loss", pitch_loss) if outputs[0]["energy_loss"] is not None: energy_loss = collect("energy_loss") - self.log("val_energy_loss", energy_loss, sync_dist=True) + self.log("val_energy_loss", energy_loss) _, _, _, _, _, spec_target, spec_predict = outputs[0].values() diff --git a/nemo/collections/tts/modules/__init__.py b/nemo/collections/tts/modules/__init__.py index ec7563d1966b..1354de22339d 100644 --- a/nemo/collections/tts/modules/__init__.py +++ b/nemo/collections/tts/modules/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import nemo.collections.tts.modules.adapters from nemo.collections.tts.modules.tacotron2 import Decoder as Taco2Decoder from nemo.collections.tts.modules.tacotron2 import Encoder as Taco2Encoder from nemo.collections.tts.modules.tacotron2 import Postnet as Taco2Postnet diff --git a/nemo/collections/tts/modules/adapters.py b/nemo/collections/tts/modules/adapters.py deleted file mode 100644 index df5bdff84dc5..000000000000 --- a/nemo/collections/tts/modules/adapters.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import List, Optional - -from omegaconf import DictConfig - -from nemo.collections.asr.parts.utils import adapter_utils -from nemo.collections.tts.modules.aligner import AlignmentEncoder -from nemo.collections.tts.modules.fastpitch import TemporalPredictor -from nemo.collections.tts.modules.transformer import FFTransformerDecoder, FFTransformerEncoder -from nemo.core.classes import adapter_mixins - - -class FFTransformerDecoderAdapter(FFTransformerDecoder, adapter_mixins.AdapterModuleMixin): - """ Inherit from FFTransformerDecoder and add support for adapter""" - - def add_adapter(self, name: str, cfg: dict): - cfg = self._update_adapter_cfg_input_dim(cfg) - for fft_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - fft_layer.add_adapter(name, cfg) - - def is_adapter_available(self) -> bool: - return any([FFT_layer.is_adapter_available() for FFT_layer in self.layers]) - - def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): - for FFT_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - FFT_layer.set_enabled_adapters(name=name, enabled=enabled) - - def get_enabled_adapters(self) -> List[str]: - names = set([]) - for FFT_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - names.update(FFT_layer.get_enabled_adapters()) - - names = sorted(list(names)) - return names - - def _update_adapter_cfg_input_dim(self, cfg: DictConfig): - cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self.d_model) - return cfg - - -class FFTransformerEncoderAdapter( - FFTransformerDecoderAdapter, FFTransformerEncoder, adapter_mixins.AdapterModuleMixin -): - """ Inherit from FFTransformerEncoder and add support for adapter""" - - pass - - -class AlignmentEncoderAdapter(AlignmentEncoder, adapter_mixins.AdapterModuleMixin): - """ Inherit from AlignmentEncoder and add support for adapter""" - - def add_adapter(self, name: str, cfg: dict): - - for i, conv_layer in enumerate(self.key_proj): - if i % 2 == 0: - cfg = self._update_adapter_cfg_input_dim(cfg, conv_layer.conv.out_channels) - conv_layer.add_adapter(name, cfg) - - for i, conv_layer in enumerate(self.query_proj): - if i % 2 == 0: - cfg = self._update_adapter_cfg_input_dim(cfg, conv_layer.conv.out_channels) - conv_layer.add_adapter(name, cfg) - - def is_adapter_available(self) -> bool: - return any( - [conv_layer.is_adapter_available() for i, conv_layer in enumerate(self.key_proj) if i % 2 == 0] - + [conv_layer.is_adapter_available() for i, conv_layer in enumerate(self.query_proj) if i % 2 == 0] - ) - - def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): - for i, conv_layer in enumerate(self.key_proj): - if i % 2 == 0: - conv_layer.set_enabled_adapters(name=name, enabled=enabled) - for i, conv_layer in enumerate(self.query_proj): - if i % 2 == 0: - conv_layer.set_enabled_adapters(name=name, enabled=enabled) - - def get_enabled_adapters(self) -> List[str]: - names = set([]) - for i, conv_layer in enumerate(self.key_proj): - if i % 2 == 0: - names.update(conv_layer.get_enabled_adapters()) - for i, conv_layer in enumerate(self.query_proj): - if i % 2 == 0: - names.update(conv_layer.get_enabled_adapters()) - - names = sorted(list(names)) - return names - - def _update_adapter_cfg_input_dim(self, cfg: DictConfig, module_dim: int): - cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=module_dim) - return cfg - - -class TemporalPredictorAdapter(TemporalPredictor, adapter_mixins.AdapterModuleMixin): - """ Inherit from 
TemporalPredictor and add support for adapter""" - - def add_adapter(self, name: str, cfg: dict): - cfg = self._update_adapter_cfg_input_dim(cfg) - for conv_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - conv_layer.add_adapter(name, cfg) - - def is_adapter_available(self) -> bool: - return any([conv_layer.is_adapter_available() for conv_layer in self.layers]) - - def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): - for conv_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - conv_layer.set_enabled_adapters(name=name, enabled=enabled) - - def get_enabled_adapters(self) -> List[str]: - names = set([]) - for conv_layer in self.layers: # type: adapter_mixins.AdapterModuleMixin - names.update(conv_layer.get_enabled_adapters()) - - names = sorted(list(names)) - return names - - def _update_adapter_cfg_input_dim(self, cfg: DictConfig): - cfg = adapter_utils.update_adapter_cfg_input_dim(self, cfg, module_dim=self.filter_size) - return cfg - - -"""Register any additional information""" -if adapter_mixins.get_registered_adapter(FFTransformerEncoder) is None: - adapter_mixins.register_adapter(base_class=FFTransformerEncoder, adapter_class=FFTransformerEncoderAdapter) - -if adapter_mixins.get_registered_adapter(FFTransformerDecoder) is None: - adapter_mixins.register_adapter(base_class=FFTransformerDecoder, adapter_class=FFTransformerDecoderAdapter) - -if adapter_mixins.get_registered_adapter(AlignmentEncoder) is None: - adapter_mixins.register_adapter(base_class=AlignmentEncoder, adapter_class=AlignmentEncoderAdapter) - -if adapter_mixins.get_registered_adapter(TemporalPredictor) is None: - adapter_mixins.register_adapter(base_class=TemporalPredictor, adapter_class=TemporalPredictorAdapter) diff --git a/nemo/collections/tts/modules/aligner.py b/nemo/collections/tts/modules/aligner.py index bc170742df23..2b03b7aea219 100644 --- a/nemo/collections/tts/modules/aligner.py +++ b/nemo/collections/tts/modules/aligner.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - import torch from torch import nn -from nemo.collections.tts.modules.submodules import ConditionalInput, ConvNorm +from nemo.collections.tts.modules.submodules import ConvNorm from nemo.collections.tts.parts.utils.helpers import binarize_attention_parallel @@ -24,11 +23,10 @@ class AlignmentEncoder(torch.nn.Module): """Module for alignment text and mel spectrogram. """ def __init__( - self, n_mel_channels=80, n_text_channels=512, n_att_channels=80, temperature=0.0005, condition_types=[] + self, n_mel_channels=80, n_text_channels=512, n_att_channels=80, temperature=0.0005, ): super().__init__() self.temperature = temperature - self.cond_input = ConditionalInput(n_text_channels, n_text_channels, condition_types) self.softmax = torch.nn.Softmax(dim=3) self.log_softmax = torch.nn.LogSoftmax(dim=3) @@ -153,12 +151,13 @@ def forward(self, queries, keys, mask=None, attn_prior=None, conditioning=None): keys (torch.tensor): B x C2 x T2 tensor (text data). mask (torch.tensor): B x T2 x 1 tensor, binary mask for variable length entries (True = mask element, False = leave unchanged). attn_prior (torch.tensor): prior for attention matrix. - conditioning (torch.tensor): B x 1 x C2 conditioning embedding + conditioning (torch.tensor): B x T2 x 1 conditioning embedding Output: attn (torch.tensor): B x 1 x T1 x T2 attention mask. Final dim T2 should sum to 1. 
attn_logprob (torch.tensor): B x 1 x T1 x T2 log-prob attention mask. """ - keys = self.cond_input(keys.transpose(1, 2), conditioning).transpose(1, 2) + if conditioning is not None: + keys = keys + conditioning.transpose(1, 2) keys_enc = self.key_proj(keys) # B x n_attn_dims x T2 queries_enc = self.query_proj(queries) # B x n_attn_dims x T1 diff --git a/nemo/collections/tts/modules/fastpitch.py b/nemo/collections/tts/modules/fastpitch.py index e2da672cf9c7..eaee68a23517 100644 --- a/nemo/collections/tts/modules/fastpitch.py +++ b/nemo/collections/tts/modules/fastpitch.py @@ -44,9 +44,8 @@ import torch -from nemo.collections.tts.modules.submodules import ConditionalInput, ConditionalLayerNorm from nemo.collections.tts.parts.utils.helpers import binarize_attention_parallel, regulate_len -from nemo.core.classes import NeuralModule, adapter_mixins, typecheck +from nemo.core.classes import NeuralModule, typecheck from nemo.core.neural_types.elements import ( EncodedRepresentation, Index, @@ -80,53 +79,40 @@ def average_features(pitch, durs): return pitch_avg -class ConvReLUNorm(torch.nn.Module, adapter_mixins.AdapterModuleMixin): - def __init__(self, in_channels, out_channels, kernel_size=1, dropout=0.0, condition_dim=384, condition_types=[]): +class ConvReLUNorm(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, dropout=0.0): super(ConvReLUNorm, self).__init__() self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, padding=(kernel_size // 2)) - self.norm = ConditionalLayerNorm(out_channels, condition_dim=condition_dim, condition_types=condition_types) + self.norm = torch.nn.LayerNorm(out_channels) self.dropout = torch.nn.Dropout(dropout) - def forward(self, signal, conditioning=None): + def forward(self, signal): out = torch.nn.functional.relu(self.conv(signal)) - out = self.norm(out.transpose(1, 2), conditioning).transpose(1, 2) - out = self.dropout(out) - - if self.is_adapter_available(): - out = self.forward_enabled_adapters(out.transpose(1, 2)).transpose(1, 2) - - return out + out = self.norm(out.transpose(1, 2)).transpose(1, 2) + return self.dropout(out) class TemporalPredictor(NeuralModule): """Predicts a single float per each temporal location""" - def __init__(self, input_size, filter_size, kernel_size, dropout, n_layers=2, condition_types=[]): + def __init__(self, input_size, filter_size, kernel_size, dropout, n_layers=2): super(TemporalPredictor, self).__init__() - self.cond_input = ConditionalInput(input_size, input_size, condition_types) - self.layers = torch.nn.ModuleList() - for i in range(n_layers): - self.layers.append( + + self.layers = torch.nn.Sequential( + *[ ConvReLUNorm( - input_size if i == 0 else filter_size, - filter_size, - kernel_size=kernel_size, - dropout=dropout, - condition_dim=input_size, - condition_types=condition_types, + input_size if i == 0 else filter_size, filter_size, kernel_size=kernel_size, dropout=dropout ) - ) + for i in range(n_layers) + ] + ) self.fc = torch.nn.Linear(filter_size, 1, bias=True) - # Use for adapter input dimension - self.filter_size = filter_size - @property def input_types(self): return { "enc": NeuralType(('B', 'T', 'D'), EncodedRepresentation()), "enc_mask": NeuralType(('B', 'T', 1), TokenDurationType()), - "conditioning": NeuralType(('B', 'T', 'D'), EncodedRepresentation(), optional=True), } @property @@ -135,20 +121,14 @@ def output_types(self): "out": NeuralType(('B', 'T'), EncodedRepresentation()), } - def forward(self, enc, enc_mask, conditioning=None): - enc = 
self.cond_input(enc, conditioning) + def forward(self, enc, enc_mask): out = enc * enc_mask - out = out.transpose(1, 2) - - for layer in self.layers: - out = layer(out, conditioning=conditioning) - - out = out.transpose(1, 2) + out = self.layers(out.transpose(1, 2)).transpose(1, 2) out = self.fc(out) * enc_mask return out.squeeze(-1) -class FastPitchModule(NeuralModule, adapter_mixins.AdapterModuleMixin): +class FastPitchModule(NeuralModule): def __init__( self, encoder_module: NeuralModule, @@ -157,13 +137,15 @@ def __init__( pitch_predictor: NeuralModule, energy_predictor: NeuralModule, aligner: NeuralModule, - speaker_encoder: NeuralModule, n_speakers: int, symbols_embedding_dim: int, pitch_embedding_kernel_size: int, energy_embedding_kernel_size: int, n_mel_channels: int = 80, max_token_duration: int = 75, + speaker_emb_condition_prosody: bool = False, + speaker_emb_condition_decoder: bool = False, + speaker_emb_condition_aligner: bool = False, ): super().__init__() @@ -173,15 +155,14 @@ def __init__( self.pitch_predictor = pitch_predictor self.energy_predictor = energy_predictor self.aligner = aligner - self.speaker_encoder = speaker_encoder self.learn_alignment = aligner is not None self.use_duration_predictor = True self.binarize = False + self.speaker_emb_condition_prosody = speaker_emb_condition_prosody + self.speaker_emb_condition_decoder = speaker_emb_condition_decoder + self.speaker_emb_condition_aligner = speaker_emb_condition_aligner - # TODO: combine self.speaker_emb with self.speaker_encoder - # cfg: remove `n_speakers`, create `speaker_encoder.lookup_module` - # state_dict: move `speaker_emb.weight` to `speaker_encoder.lookup_module.table.weight` - if n_speakers > 1 and speaker_encoder is None: + if n_speakers > 1: self.speaker_emb = torch.nn.Embedding(n_speakers, symbols_embedding_dim) else: self.speaker_emb = None @@ -223,8 +204,6 @@ def input_types(self): "attn_prior": NeuralType(('B', 'T_spec', 'T_text'), ProbsType(), optional=True), "mel_lens": NeuralType(('B'), LengthsType(), optional=True), "input_lens": NeuralType(('B'), LengthsType(), optional=True), - "reference_spec": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType(), optional=True), - "reference_spec_lens": NeuralType(('B'), LengthsType(), optional=True), } @property @@ -244,19 +223,6 @@ def output_types(self): "energy_tgt": NeuralType(('B', 'T_audio'), RegressionValuesType()), } - def get_speaker_embedding(self, speaker, reference_spec, reference_spec_lens): - """spk_emb: Bx1xD""" - if self.speaker_encoder is not None: - spk_emb = self.speaker_encoder(speaker, reference_spec, reference_spec_lens).unsqueeze(1) - elif self.speaker_emb is not None: - if speaker is None: - raise ValueError('Please give speaker id to get lookup speaker embedding.') - spk_emb = self.speaker_emb(speaker).unsqueeze(1) - else: - spk_emb = None - - return spk_emb - @typecheck() def forward( self, @@ -271,8 +237,6 @@ def forward( attn_prior=None, mel_lens=None, input_lens=None, - reference_spec=None, - reference_spec_lens=None, ): if not self.learn_alignment and self.training: @@ -280,28 +244,34 @@ def forward( assert pitch is not None # Calculate speaker embedding - spk_emb = self.get_speaker_embedding( - speaker=speaker, reference_spec=reference_spec, reference_spec_lens=reference_spec_lens, - ) + if self.speaker_emb is None or speaker is None: + spk_emb = 0 + else: + spk_emb = self.speaker_emb(speaker).unsqueeze(1) # Input FFT enc_out, enc_mask = self.encoder(input=text, conditioning=spk_emb) - - # Predict duration - 
log_durs_predicted = self.duration_predictor(enc_out, enc_mask, conditioning=spk_emb) + if self.speaker_emb_condition_prosody: + prosody_input = enc_out + spk_emb + else: + prosody_input = enc_out + log_durs_predicted = self.duration_predictor(prosody_input, enc_mask) durs_predicted = torch.clamp(torch.exp(log_durs_predicted) - 1, 0, self.max_token_duration) attn_soft, attn_hard, attn_hard_dur, attn_logprob = None, None, None, None if self.learn_alignment and spec is not None: text_emb = self.encoder.word_emb(text) - attn_soft, attn_logprob = self.aligner( - spec, text_emb.permute(0, 2, 1), enc_mask == 0, attn_prior, conditioning=spk_emb - ) + if self.speaker_emb_condition_aligner and not isinstance(spk_emb, int): + attn_soft, attn_logprob = self.aligner( + spec, text_emb.permute(0, 2, 1), enc_mask == 0, attn_prior, conditioning=spk_emb + ) + else: + attn_soft, attn_logprob = self.aligner(spec, text_emb.permute(0, 2, 1), enc_mask == 0, attn_prior) attn_hard = binarize_attention_parallel(attn_soft, input_lens, mel_lens) attn_hard_dur = attn_hard.sum(2)[:, 0, :] # Predict pitch - pitch_predicted = self.pitch_predictor(enc_out, enc_mask, conditioning=spk_emb) + pitch_predicted = self.pitch_predictor(prosody_input, enc_mask) if pitch is not None: if self.learn_alignment and pitch.shape[-1] != pitch_predicted.shape[-1]: # Pitch during training is per spectrogram frame, but during inference, it should be per character @@ -350,7 +320,10 @@ def forward( ) # Output FFT - dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens, conditioning=spk_emb) + if self.speaker_emb_condition_decoder: + dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens, conditioning=spk_emb) + else: + dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens) spect = self.proj(dec_out).transpose(1, 2) return ( spect, @@ -367,33 +340,26 @@ def forward( energy_tgt, ) - def infer( - self, - *, - text, - pitch=None, - speaker=None, - energy=None, - pace=1.0, - volume=None, - reference_spec=None, - reference_spec_lens=None, - ): - + def infer(self, *, text, pitch=None, speaker=None, energy=None, pace=1.0, volume=None): # Calculate speaker embedding - spk_emb = self.get_speaker_embedding( - speaker=speaker, reference_spec=reference_spec, reference_spec_lens=reference_spec_lens, - ) + if self.speaker_emb is None or speaker is None: + spk_emb = 0 + else: + spk_emb = self.speaker_emb(speaker).unsqueeze(1) # Input FFT enc_out, enc_mask = self.encoder(input=text, conditioning=spk_emb) + if self.speaker_emb_condition_prosody: + prosody_input = enc_out + spk_emb + else: + prosody_input = enc_out # Predict duration and pitch - log_durs_predicted = self.duration_predictor(enc_out, enc_mask, conditioning=spk_emb) + log_durs_predicted = self.duration_predictor(prosody_input, enc_mask) durs_predicted = torch.clamp( torch.exp(log_durs_predicted) - 1.0, self.min_token_duration, self.max_token_duration ) - pitch_predicted = self.pitch_predictor(enc_out, enc_mask, conditioning=spk_emb) + pitch + pitch_predicted = self.pitch_predictor(prosody_input, enc_mask) + pitch pitch_emb = self.pitch_emb(pitch_predicted.unsqueeze(1)) enc_out = enc_out + pitch_emb.transpose(1, 2) @@ -414,7 +380,10 @@ def infer( volume_extended = volume_extended.squeeze(-1).float() # Output FFT - dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens, conditioning=spk_emb) + if self.speaker_emb_condition_decoder: + dec_out, _ = self.decoder(input=len_regulated, seq_lens=dec_lens, conditioning=spk_emb) + else: + dec_out, _ = 
self.decoder(input=len_regulated, seq_lens=dec_lens) spect = self.proj(dec_out).transpose(1, 2) return ( spect.to(torch.float), diff --git a/nemo/collections/tts/modules/submodules.py b/nemo/collections/tts/modules/submodules.py index dbf26f1ceeee..275468d60634 100644 --- a/nemo/collections/tts/modules/submodules.py +++ b/nemo/collections/tts/modules/submodules.py @@ -18,21 +18,6 @@ from torch import Tensor from torch.autograd import Variable from torch.nn import functional as F -from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence - -from nemo.core.classes import NeuralModule, adapter_mixins -from nemo.core.neural_types.elements import EncodedRepresentation, Index, LengthsType, MelSpectrogramType -from nemo.core.neural_types.neural_type import NeuralType -from nemo.utils import logging - - -SUPPORTED_CONDITION_TYPES = ["add", "concat", "layernorm"] - - -def check_support_condition_types(condition_types): - for tp in condition_types: - if tp not in SUPPORTED_CONDITION_TYPES: - raise ValueError(f"Unknown conditioning type {tp}") def masked_instance_norm( @@ -137,7 +122,7 @@ def forward(self, x): return self.linear_layer(x) -class ConvNorm(torch.nn.Module, adapter_mixins.AdapterModuleMixin): +class ConvNorm(torch.nn.Module): __constants__ = ['use_partial_padding'] use_partial_padding: bool @@ -191,10 +176,6 @@ def forward(self, signal, mask=None): ret = self.conv(signal) if self.norm is not None: ret = self.norm(ret) - - if self.is_adapter_available(): - ret = self.forward_enabled_adapters(ret.transpose(1, 2)).transpose(1, 2) - return ret @@ -429,326 +410,3 @@ def forward(self, forward_input: Tuple[torch.Tensor, torch.Tensor]): output = output + res_skip_acts return self.end(output) - - -class ConditionalLayerNorm(torch.nn.LayerNorm): - """ - This module is used to condition torch.nn.LayerNorm. - If we don't have any conditions, this will be a normal LayerNorm. - """ - - def __init__(self, hidden_dim, condition_dim=None, condition_types=[]): - check_support_condition_types(condition_types) - self.condition = "layernorm" in condition_types - super().__init__(hidden_dim, elementwise_affine=not self.condition) - - if self.condition: - self.cond_weight = torch.nn.Linear(condition_dim, hidden_dim) - self.cond_bias = torch.nn.Linear(condition_dim, hidden_dim) - self.init_parameters() - - def init_parameters(self): - torch.nn.init.constant_(self.cond_weight.weight, 0.0) - torch.nn.init.constant_(self.cond_weight.bias, 1.0) - torch.nn.init.constant_(self.cond_bias.weight, 0.0) - torch.nn.init.constant_(self.cond_bias.bias, 0.0) - - def forward(self, inputs, conditioning=None): - inputs = super().forward(inputs) - - # Normalize along channel - if self.condition: - if conditioning is None: - raise ValueError( - """You should add additional data types as conditions (e.g. speaker id or reference audio) - and define speaker_encoder in your config.""" - ) - - inputs = inputs * self.cond_weight(conditioning) - inputs = inputs + self.cond_bias(conditioning) - - return inputs - - -class ConditionalInput(torch.nn.Module): - """ - This module is used to condition any model inputs. - If we don't have any conditions, this will be a normal pass. 
- """ - - def __init__(self, hidden_dim, condition_dim, condition_types=[]): - check_support_condition_types(condition_types) - super().__init__() - self.support_types = ["add", "concat"] - self.condition_types = [tp for tp in condition_types if tp in self.support_types] - self.hidden_dim = hidden_dim - self.condition_dim = condition_dim - - if "add" in self.condition_types and condition_dim != hidden_dim: - self.add_proj = torch.nn.Linear(condition_dim, hidden_dim) - - if "concat" in self.condition_types: - self.concat_proj = torch.nn.Linear(hidden_dim + condition_dim, hidden_dim) - - def forward(self, inputs, conditioning=None): - """ - Args: - inputs (torch.tensor): B x T x C tensor. - conditioning (torch.tensor): B x 1 x C conditioning embedding. - """ - if len(self.condition_types) > 0: - if conditioning is None: - raise ValueError( - """You should add additional data types as conditions (e.g. speaker id or reference audio) - and define speaker_encoder in your config.""" - ) - - if "add" in self.condition_types: - if self.condition_dim != self.hidden_dim: - conditioning = self.add_proj(conditioning) - inputs = inputs + conditioning - - if "concat" in self.condition_types: - conditioning = conditionting.repeat(1, inputs.shape[1], 1) - inputs = torch.cat([inputs, conditioning]) - inputs = self.concat_proj(inputs) - - return inputs - - -class StyleAttention(NeuralModule): - def __init__(self, gst_size=128, n_style_token=10, n_style_attn_head=4): - super(StyleAttention, self).__init__() - - token_size = gst_size // n_style_attn_head - self.tokens = torch.nn.Parameter(torch.FloatTensor(n_style_token, token_size)) - self.mha = torch.nn.MultiheadAttention( - embed_dim=gst_size, - num_heads=n_style_attn_head, - dropout=0.0, - bias=True, - kdim=token_size, - vdim=token_size, - batch_first=True, - ) - torch.nn.init.normal_(self.tokens) - - @property - def input_types(self): - return { - "inputs": NeuralType(('B', 'D'), EncodedRepresentation()), - "token_id": NeuralType(('B'), Index(), optional=True), - } - - @property - def output_types(self): - return { - "style_emb": NeuralType(('B', 'D'), EncodedRepresentation()), - } - - def forward(self, inputs): - batch_size = inputs.size(0) - query = inputs.unsqueeze(1) - tokens = F.tanh(self.tokens).unsqueeze(0).expand(batch_size, -1, -1) - - style_emb, _ = self.mha(query=query, key=tokens, value=tokens) - style_emb = style_emb.squeeze(1) - return style_emb - - -class Conv2DReLUNorm(torch.nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=1, bias=True, dropout=0.0): - super(Conv2DReLUNorm, self).__init__() - self.conv = torch.nn.Conv2d( - in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias - ) - self.norm = torch.nn.LayerNorm(out_channels) - self.dropout = torch.nn.Dropout(dropout) - - def forward(self, x, x_mask=None): - if x_mask is not None: - x = x * x_mask - - # bhwc -> bchw - x = x.contiguous().permute(0, 3, 1, 2) - x = F.relu(self.conv(x)) - # bchw -> bhwc - x = x.contiguous().permute(0, 2, 3, 1) - x = self.norm(x) - x = self.dropout(x) - return x - - -class ReferenceEncoder(NeuralModule): - """ - Encode mel-spectrograms to an utterance level feature - """ - - def __init__(self, n_mels, cnn_filters, dropout, gru_hidden, kernel_size, stride, padding, bias): - super(ReferenceEncoder, self).__init__() - self.filter_size = [1] + list(cnn_filters) - self.layers = torch.nn.ModuleList( - [ - Conv2DReLUNorm( - in_channels=int(self.filter_size[i]), - 
out_channels=int(self.filter_size[i + 1]), - kernel_size=kernel_size, - stride=stride, - padding=padding, - bias=bias, - dropout=dropout, - ) - for i in range(len(cnn_filters)) - ] - ) - post_conv_height = self.calculate_post_conv_lengths(n_mels, n_convs=len(cnn_filters)) - self.gru = torch.nn.GRU( - input_size=cnn_filters[-1] * post_conv_height, hidden_size=gru_hidden, batch_first=True, - ) - - @property - def input_types(self): - return { - "inputs": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), - "inputs_lengths": NeuralType(('B'), LengthsType()), - } - - @property - def output_types(self): - return { - "out": NeuralType(('B', 'D'), EncodedRepresentation()), - } - - def forward(self, inputs, inputs_lengths): - # BMW -> BWMC (M: mels) - x = inputs.transpose(1, 2).unsqueeze(3) - x_lens = inputs_lengths - x_masks = self.lengths_to_masks(x_lens).unsqueeze(2).unsqueeze(3) - - for layer in self.layers: - x = layer(x, x_masks) - x_lens = self.calculate_post_conv_lengths(x_lens) - x_masks = self.lengths_to_masks(x_lens).unsqueeze(2).unsqueeze(3) - - # BWMC -> BWC - x = x.contiguous().view(x.shape[0], x.shape[1], -1) - - self.gru.flatten_parameters() - packed_x = pack_padded_sequence(x, x_lens.cpu(), batch_first=True, enforce_sorted=False) - packed_x, _ = self.gru(packed_x) - x, x_lens = pad_packed_sequence(packed_x, batch_first=True) - x = x[torch.arange(len(x_lens)), (x_lens - 1), :] - return x - - @staticmethod - def calculate_post_conv_lengths(lengths, n_convs=1, kernel_size=3, stride=2, pad=1): - """Batch lengths after n convolution with fixed kernel/stride/pad.""" - for _ in range(n_convs): - lengths = (lengths - kernel_size + 2 * pad) // stride + 1 - return lengths - - @staticmethod - def lengths_to_masks(lengths): - """Batch of lengths to batch of masks""" - # B -> BxT - masks = torch.arange(lengths.max()).to(lengths.device).expand( - lengths.shape[0], lengths.max() - ) < lengths.unsqueeze(1) - return masks - - -class GlobalStyleToken(NeuralModule): - """ - Global Style Token based Speaker Embedding - """ - - def __init__( - self, reference_encoder, gst_size=128, n_style_token=10, n_style_attn_head=4, - ): - super(GlobalStyleToken, self).__init__() - self.reference_encoder = reference_encoder - self.style_attention = StyleAttention( - gst_size=gst_size, n_style_token=n_style_token, n_style_attn_head=n_style_attn_head - ) - - @property - def input_types(self): - return { - "inp": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType()), - "inp_lengths": NeuralType(('B'), LengthsType()), - } - - @property - def output_types(self): - return { - "gst": NeuralType(('B', 'D'), EncodedRepresentation()), - } - - def forward(self, inp, inp_lengths): - style_embedding = self.reference_encoder(inp, inp_lengths) - gst = self.style_attention(style_embedding) - return gst - - -class SpeakerLookupTable(torch.nn.Module): - """ - LookupTable based Speaker Embedding - """ - - def __init__(self, n_speakers, embedding_dim): - super(SpeakerLookupTable, self).__init__() - self.table = torch.nn.Embedding(n_speakers, embedding_dim) - - def forward(self, speaker): - return self.table(speaker) - - -class SpeakerEncoder(NeuralModule): - """ - class SpeakerEncoder represents speakers representation. - This module can combine GST (global style token) based speaker embeddings and lookup table speaker embeddings. 
- """ - - def __init__(self, lookup_module=None, gst_module=None): - """ - lookup_module: Torch module to get lookup based speaker embedding - gst_module: Neural module to get GST based speaker embedding - """ - super(SpeakerEncoder, self).__init__() - self.lookup_module = lookup_module - self.gst_module = gst_module - - @property - def input_types(self): - return { - "speaker": NeuralType(('B'), Index(), optional=True), - "reference_spec": NeuralType(('B', 'D', 'T_spec'), MelSpectrogramType(), optional=True), - "reference_spec_lens": NeuralType(('B'), LengthsType(), optional=True), - } - - @property - def output_types(self): - return { - "embs": NeuralType(('B', 'D'), EncodedRepresentation()), - } - - def forward(self, speaker=None, reference_spec=None, reference_spec_lens=None): - embs = None - - # Get Lookup table speaker embedding - if self.lookup_module is not None and speaker is not None: - embs = self.lookup_module(speaker) - - # Get GST based speaker embedding - if self.gst_module is not None: - if reference_spec is None or reference_spec_lens is None: - raise ValueError( - "You should add `reference_audio` in sup_data_types or remove `speaker_encoder`in config." - ) - out = self.gst_module(reference_spec, reference_spec_lens) - embs = out if embs is None else embs + out - - elif self.gst_module is None and reference_spec is not None and reference_spec_lens is not None: - logging.warning("You may add `gst_module` in speaker_encoder to use reference_audio.") - - return embs diff --git a/nemo/collections/tts/modules/transformer.py b/nemo/collections/tts/modules/transformer.py index 3dda8c522dcc..0d2f8f417f4e 100644 --- a/nemo/collections/tts/modules/transformer.py +++ b/nemo/collections/tts/modules/transformer.py @@ -17,9 +17,9 @@ import torch.nn as nn import torch.nn.functional as F -from nemo.collections.tts.modules.submodules import ConditionalInput, ConditionalLayerNorm, LinearNorm +from nemo.collections.tts.modules.submodules import LinearNorm from nemo.collections.tts.parts.utils.helpers import get_mask_from_lengths -from nemo.core.classes import NeuralModule, adapter_mixins, typecheck +from nemo.core.classes import NeuralModule, typecheck from nemo.core.neural_types.elements import EncodedRepresentation, LengthsType, MaskType, TokenIndex from nemo.core.neural_types.neural_type import NeuralType @@ -51,7 +51,7 @@ def forward(self, pos_seq, bsz=None): class PositionwiseConvFF(nn.Module): - def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False, condition_types=[]): + def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False): super(PositionwiseConvFF, self).__init__() self.d_model = d_model @@ -68,17 +68,17 @@ def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False, cond nn.Conv1d(d_inner, d_model, kernel_size[1], 1, (kernel_size[1] // 2)), nn.Dropout(dropout), ) - self.layer_norm = ConditionalLayerNorm(d_model, condition_dim=d_model, condition_types=condition_types) + self.layer_norm = nn.LayerNorm(d_model) self.pre_lnorm = pre_lnorm - def forward(self, inp, conditioning=None): - return self._forward(inp, conditioning) + def forward(self, inp): + return self._forward(inp) - def _forward(self, inp, conditioning=None): + def _forward(self, inp): if self.pre_lnorm: # layer normalization + positionwise feed-forward core_out = inp.transpose(1, 2) - core_out = self.CoreNet(self.layer_norm(core_out, conditioning).to(inp.dtype)) + core_out = self.CoreNet(self.layer_norm(core_out).to(inp.dtype)) core_out = core_out.transpose(1, 2) 
# residual connection @@ -90,13 +90,13 @@ def _forward(self, inp, conditioning=None): core_out = core_out.transpose(1, 2) # residual connection + layer normalization - output = self.layer_norm(inp + core_out, conditioning).to(inp.dtype) + output = self.layer_norm(inp + core_out).to(inp.dtype) return output class MultiHeadAttn(nn.Module): - def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, pre_lnorm=False, condition_types=[]): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, pre_lnorm=False): super(MultiHeadAttn, self).__init__() self.n_head = n_head @@ -109,17 +109,17 @@ def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, pre_lnorm=Fals self.drop = nn.Dropout(dropout) self.dropatt = nn.Dropout(dropatt) self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) - self.layer_norm = ConditionalLayerNorm(d_model, condition_dim=d_model, condition_types=condition_types) + self.layer_norm = nn.LayerNorm(d_model) - def forward(self, inp, attn_mask=None, conditioning=None): - return self._forward(inp, attn_mask, conditioning) + def forward(self, inp, attn_mask=None): + return self._forward(inp, attn_mask) - def _forward(self, inp, attn_mask=None, conditioning=None): + def _forward(self, inp, attn_mask=None): residual = inp if self.pre_lnorm: # layer normalization - inp = self.layer_norm(inp, conditioning) + inp = self.layer_norm(inp) n_head, d_head = self.n_head, self.d_head @@ -157,47 +157,29 @@ def _forward(self, inp, attn_mask=None, conditioning=None): output = residual + attn_out else: # residual connection + layer normalization - output = self.layer_norm(residual + attn_out, conditioning) + output = self.layer_norm(residual + attn_out) return output -class TransformerLayer(nn.Module, adapter_mixins.AdapterModuleMixin): - def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, condition_types=[], **kwargs): +class TransformerLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, **kwargs): super(TransformerLayer, self).__init__() - self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, condition_types=condition_types, **kwargs) - self.pos_ff = PositionwiseConvFF( - d_model, d_inner, kernel_size, dropout, pre_lnorm=kwargs.get('pre_lnorm'), condition_types=condition_types - ) + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseConvFF(d_model, d_inner, kernel_size, dropout, pre_lnorm=kwargs.get('pre_lnorm')) - def forward(self, dec_inp, mask=None, conditioning=None): - output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2), conditioning=conditioning) + def forward(self, dec_inp, mask=None): + output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2)) output *= mask - output = self.pos_ff(output, conditioning) + output = self.pos_ff(output) output *= mask - - if self.is_adapter_available(): - output = self.forward_enabled_adapters(output) - output *= mask - return output class FFTransformerDecoder(NeuralModule): def __init__( - self, - n_layer, - n_head, - d_model, - d_head, - d_inner, - kernel_size, - dropout, - dropatt, - dropemb=0.0, - pre_lnorm=False, - condition_types=[], + self, n_layer, n_head, d_model, d_head, d_inner, kernel_size, dropout, dropatt, dropemb=0.0, pre_lnorm=False ): super(FFTransformerDecoder, self).__init__() self.d_model = d_model @@ -207,20 +189,11 @@ def __init__( self.pos_emb = PositionalEmbedding(self.d_model) self.drop = nn.Dropout(dropemb) self.layers = nn.ModuleList() - self.cond_input = 
ConditionalInput(d_model, d_model, condition_types) for _ in range(n_layer): self.layers.append( TransformerLayer( - n_head, - d_model, - d_head, - d_inner, - kernel_size, - dropout, - dropatt=dropatt, - pre_lnorm=pre_lnorm, - condition_types=condition_types, + n_head, d_model, d_head, d_inner, kernel_size, dropout, dropatt=dropatt, pre_lnorm=pre_lnorm ) ) @@ -240,18 +213,16 @@ def output_types(self): } @typecheck() - def forward(self, input, seq_lens, conditioning=None): + def forward(self, input, seq_lens, conditioning=0): return self._forward(input, mask_from_lens(seq_lens).unsqueeze(2), conditioning) def _forward(self, inp, mask, conditioning): pos_seq = torch.arange(inp.size(1), device=inp.device).to(inp.dtype) pos_emb = self.pos_emb(pos_seq) * mask - inp += pos_emb - inp = self.cond_input(inp, conditioning) - out = self.drop(inp) + out = self.drop(inp + pos_emb + conditioning) for layer in self.layers: - out = layer(out, mask=mask, conditioning=conditioning) + out = layer(out, mask=mask) # out = self.drop(out) return out, mask @@ -273,20 +244,9 @@ def __init__( n_embed=None, d_embed=None, padding_idx=0, - condition_types=[], ): super(FFTransformerEncoder, self).__init__( - n_layer, - n_head, - d_model, - d_head, - d_inner, - kernel_size, - dropout, - dropatt, - dropemb, - pre_lnorm, - condition_types, + n_layer, n_head, d_model, d_head, d_inner, kernel_size, dropout, dropatt, dropemb, pre_lnorm ) self.padding_idx = padding_idx diff --git a/nemo/collections/tts/parts/mixins/__init__.py b/nemo/collections/tts/parts/mixins/__init__.py deleted file mode 100644 index bca487f8d96c..000000000000 --- a/nemo/collections/tts/parts/mixins/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.tts.parts.mixins.fastpitch_adapter_mixins import FastPitchAdapterModelMixin diff --git a/nemo/collections/tts/parts/mixins/fastpitch_adapter_mixins.py b/nemo/collections/tts/parts/mixins/fastpitch_adapter_mixins.py deleted file mode 100644 index 375cf1fe51ee..000000000000 --- a/nemo/collections/tts/parts/mixins/fastpitch_adapter_mixins.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
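The mixin deleted here (FastPitchAdapterModelMixin) lets one adapter span several FastPitch submodules by joining module names with '+' and separating the adapter name with ':'. A usage sketch under those conventions follows; `model` is assumed to be a FastPitch ModelPT subclass that inherits this mixin, `adapter_cfg` is an adapter DictConfig prepared by the user, and the adapter name "speaker_adapter" is illustrative.

    # Sketch only: `model` and `adapter_cfg` are assumed to exist already.
    # Attach one adapter to every supported submodule in a single call.
    module_scope = "encoder+decoder+duration_predictor+pitch_predictor+aligner"
    model.add_adapter(name=f"{module_scope}:speaker_adapter", cfg=adapter_cfg)

    # Pattern documented in the mixin: disable all adapters, then enable just the new one.
    model.set_enabled_adapters(enabled=False)
    model.set_enabled_adapters(name="speaker_adapter", enabled=True)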
- -from typing import List, Optional - -from omegaconf import DictConfig, open_dict - -from nemo.core.classes.mixins.adapter_mixins import AdapterModelPTMixin, AdapterModuleMixin -from nemo.utils import logging, logging_mode - - -class FastPitchAdapterModelMixin(AdapterModelPTMixin): - """ FastPitch Adapter Mixin that can augment any Encoder module with Adapter module support. - This mixin class should be used only with a top level ModelPT subclass, that includes an `encoder` submodule. - This mixin class adds several utility methods which are propagated to the `encoder`. - An Adapter module is any Pytorch nn.Module that possess a few properties : - - It's input and output dimension are the same, while the hidden dimension need not be the same. - - The final layer of the Adapter module is zero-initialized, so that the residual connection to the adapter - yields the original output. - This mixin adds the following instance variables to the class this inherits it: - - `adapter_layer`: A torch.nn.ModuleDict(), whose keys are the names of the adapter (globally unique), - and values are the Adapter nn.Module(). - - `adapter_cfg`: A OmegaConf DictConfig object that holds the config of the adapters that are initialized. - - `adapter_global_cfg_key`: A str representing a key in the model config that can be provided by the user. - The value resolves to `global_cfg`, and can be overridden via `model.cfg.adapters.global_cfg.*`. - **Note**: This module **is** responsible for maintaining its config. At the ModelPT level, it will access and - write Adapter config information to `self.cfg.adapters`. - """ - - def setup_adapters(self): - """ - Utility method that is called in the ASR ModelPT-implementation constructor, so as to restore any - adapters that were previously added. - This method should be called just once at constructor time. - """ - supports_adapters = False - - # At least the encoder must extend AdapterModuleMixin - if hasattr(self.fastpitch, 'encoder') and isinstance(self.fastpitch.encoder, AdapterModuleMixin): - supports_adapters |= True - - if hasattr(self.fastpitch, 'decoder') and isinstance(self.fastpitch.decoder, AdapterModuleMixin): - supports_adapters |= True - - if hasattr(self.fastpitch, 'duration_predictor') and isinstance( - self.fastpitch.duration_predictor, AdapterModuleMixin - ): - supports_adapters |= True - - if hasattr(self.fastpitch, 'pitch_predictor') and isinstance( - self.fastpitch.pitch_predictor, AdapterModuleMixin - ): - supports_adapters |= True - - if hasattr(self.fastpitch, 'aligner') and isinstance(self.fastpitch.aligner, AdapterModuleMixin): - supports_adapters |= True - - # If adapters are supported, setup the adapter config + any modules (pre-existing adapter modules) - if supports_adapters: - super().setup_adapters() - - def add_adapter(self, name: str, cfg: DictConfig): - """ - Add an Adapter module to this model. - Args: - name: A globally unique name for the adapter. Will be used to access, enable and disable adapters. - cfg: A DictConfig that contains at the bare minimum `__target__` to instantiate a new Adapter module. 
- """ - # setup the config for adapters - super().add_adapter(name=name, cfg=cfg) - - # Resolve module name and adapter name - module_name, _ = self.resolve_adapter_module_name_(name) - - # Use + as a splitter, in order to share one name across multiple modules - if '+' in module_name: - module_names = module_name.split('+') - else: - module_names = [module_name] - - with open_dict(self.cfg): - for module_name in module_names: - # Check if encoder adapters should be added - if module_name == 'encoder': - # Dispatch the call to the encoder. - self.fastpitch.encoder.add_adapter(name=name, cfg=cfg) - - # Check if decoder adapters should be added - if module_name in ('', 'decoder'): - # Dispatch call to the decoder. (default use decoder) - self.fastpitch.decoder.add_adapter(name=name, cfg=cfg) - - # Check if duration_predictor adapters should be added - if module_name in ('', 'duration_predictor'): - # Dispatch call to the duration_predictor. (default use duration_predictor) - self.fastpitch.duration_predictor.add_adapter(name=name, cfg=cfg) - - # Check if pitch_predictor adapters should be added - if module_name in ('', 'pitch_predictor'): - # Dispatch call to the pitch_predictor. (default use pitch_predictor) - self.fastpitch.pitch_predictor.add_adapter(name=name, cfg=cfg) - - # Check if aligner adapters should be added - if module_name in ('', 'aligner'): - # Dispatch call to the aligner. (default use aligner) - self.fastpitch.aligner.add_adapter(name=name, cfg=cfg) - - def is_adapter_available(self) -> bool: - """ - Checks if any Adapter module has been instantiated. - Returns: - bool, determining if any Adapter module has been instantiated. Returns true even if the adapters are - enabled or disabled, false only if no adapters exist. - """ - config_contains_adapter = super().is_adapter_available() - - # Forward the method call to the individual modules - if hasattr(self.fastpitch, 'encoder') and isinstance(self.fastpitch.encoder, AdapterModuleMixin): - config_contains_adapter |= self.fastpitch.encoder.is_adapter_available() - - if hasattr(self.fastpitch, 'decoder') and isinstance(self.fastpitch.decoder, AdapterModuleMixin): - config_contains_adapter |= self.fastpitch.decoder.is_adapter_available() - - if hasattr(self.fastpitch, 'duration_predictor') and isinstance( - self.fastpitch.duration_predictor, AdapterModuleMixin - ): - config_contains_adapter |= self.fastpitch.duration_predictor.is_adapter_available() - - if hasattr(self.fastpitch, 'pitch_predictor') and isinstance( - self.fastpitch.pitch_predictor, AdapterModuleMixin - ): - config_contains_adapter |= self.fastpitch.pitch_predictor.is_adapter_available() - - if hasattr(self.fastpitch, 'aligner') and isinstance(self.fastpitch.aligner, AdapterModuleMixin): - config_contains_adapter |= self.fastpitch.aligner.is_adapter_available() - - return config_contains_adapter - - def set_enabled_adapters(self, name: Optional[str] = None, enabled: bool = True): - """ - Updated the internal adapter config, determining if an adapter (or all adapters) are either - enabled or disabled. - A common user pattern would be to disable all adapters (either after adding them, or restoring a model - with pre-existing adapters) and then simply enable one of the adapters. - .. code:: - model.set_enabled_adapters(enabled=False) - model.set_enabled_adapters(name=, enabled=True) - Args: - name: Optional str. If a str name is given, the config will be updated to the value of `enabled`. - If no name is given, then all adapters will be enabled/disabled. 
- enabled: Bool, determines if the adapter(s) will be enabled/disabled. - """ - super().set_enabled_adapters(name=name, enabled=enabled) - - # Resolve the module name and adapter name - if name is not None: - module_name, _ = self.resolve_adapter_module_name_(name) - else: - module_name = None - - # Use + as a splitter, in order to share one name across multiple modules - if module_name is not None and '+' in module_name: - module_names = module_name.split('+') - else: - module_names = [module_name] - - for module_name in module_names: - # Check if encoder adapters should be used - # Dispatch the call to the encoder. - if name is None or module_name == 'encoder': - if self.fastpitch.encoder.is_adapter_available(): - self.fastpitch.encoder.set_enabled_adapters(name=name, enabled=enabled) - - # Dispatch the call to the decoder. - if name is None or module_name in ('', 'decoder'): - if self.fastpitch.decoder.is_adapter_available(): - self.fastpitch.decoder.set_enabled_adapters(name=name, enabled=enabled) - - # Dispatch the call to the duration_predictor. - if name is None or module_name in ('', 'duration_predictor'): - if self.fastpitch.duration_predictor.is_adapter_available(): - self.fastpitch.duration_predictor.set_enabled_adapters(name=name, enabled=enabled) - - # Dispatch the call to the pitch_predictor. - if name is None or module_name in ('', 'pitch_predictor'): - if self.fastpitch.pitch_predictor.is_adapter_available(): - self.fastpitch.pitch_predictor.set_enabled_adapters(name=name, enabled=enabled) - - # Dispatch the call to the aligner. - if name is None or module_name in ('', 'aligner'): - if self.fastpitch.aligner.is_adapter_available(): - self.fastpitch.aligner.set_enabled_adapters(name=name, enabled=enabled) - - def get_enabled_adapters(self) -> List[str]: - """ - Returns a list of all enabled adapters. - Returns: - A list of str names of each enabled adapter(s). - """ - enabled_adapters = super().get_enabled_adapters() - - # Check if encoder adapters should be used or are enabled - if hasattr(self.fastpitch, 'encoder') and isinstance(self.fastpitch.encoder, AdapterModuleMixin): - enabled_adapters.extend(self.fastpitch.encoder.get_enabled_adapters()) - - if hasattr(self.fastpitch, 'decoder') and isinstance(self.fastpitch.decoder, AdapterModuleMixin): - enabled_adapters.extend(self.fastpitch.decoder.get_enabled_adapters()) - - if hasattr(self.fastpitch, 'duration_predictor') and isinstance( - self.fastpitch.duration_predictor, AdapterModuleMixin - ): - enabled_adapters.extend(self.fastpitch.duration_predictor.get_enabled_adapters()) - - if hasattr(self.fastpitch, 'pitch_predictor') and isinstance( - self.fastpitch.pitch_predictor, AdapterModuleMixin - ): - enabled_adapters.extend(self.fastpitch.pitch_predictor.get_enabled_adapters()) - - if hasattr(self.fastpitch, 'aligner') and isinstance(self.fastpitch.aligner, AdapterModuleMixin): - enabled_adapters.extend(self.fastpitch.aligner.get_enabled_adapters()) - - enabled_adapters = list(sorted(list(set(enabled_adapters)))) - - return enabled_adapters - - def check_valid_model_with_adapter_support_(self): - """ - Utility method to test if the subclass of this mixin is an appropriate subclass of ModelPT itself. - """ - # Obtain the global adapter config if possible, otherwise use sensible defaults. 
- global_cfg = self._get_global_cfg() - - # Test whether the encoder supports adapters - use_encoder_adapter = global_cfg.get('check_encoder_adapter', False) - if use_encoder_adapter: - if not hasattr(self.fastpitch, 'encoder'): - logging.warning( - "Cannot add adapter to this object as it does not have an `fastpitch.encoder` sub-module!", - mode=logging_mode.ONCE, - ) - - if hasattr(self.fastpitch, 'encoder') and not isinstance(self.fastpitch.encoder, AdapterModuleMixin): - logging.warning( - f'{self.fastpitch.encoder.__class__.__name__} does not implement `AdapterModuleMixin`', - mode=logging_mode.ONCE, - ) - - # Test whether the decoder supports adapters - use_decoder_adapter = global_cfg.get('check_decoder_adapter', True) - if use_decoder_adapter: - if not hasattr(self.fastpitch, 'decoder'): - logging.warning( - "Cannot add adapter to this object as it does not have an `fastpitch.decoder` sub-module!", - mode=logging_mode.ONCE, - ) - - if hasattr(self.fastpitch, 'decoder') and not isinstance(self.fastpitch.decoder, AdapterModuleMixin): - logging.warning( - f'{self.fastpitch.decoder.__class__.__name__} does not implement `AdapterModuleMixin`', - mode=logging_mode.ONCE, - ) - - # Test whether the duration_predictor supports adapters - use_duration_predictor_adapter = global_cfg.get('check_duration_predictor_adapter', True) - if use_duration_predictor_adapter: - if not hasattr(self.fastpitch, 'duration_predictor'): - logging.warning( - "Cannot add adapter to this object as it does not have an `fastpitch.duration_predictor` sub-module!", - mode=logging_mode.ONCE, - ) - - if hasattr(self.fastpitch, 'duration_predictor') and not isinstance( - self.fastpitch.duration_predictor, AdapterModuleMixin - ): - logging.warning( - f'{self.fastpitch.duration_predictor.__class__.__name__} does not implement `AdapterModuleMixin`', - mode=logging_mode.ONCE, - ) - - # Test whether the pitch_predictor supports adapters - use_pitch_predictor_adapter = global_cfg.get('check_pitch_predictor_adapter', True) - if use_pitch_predictor_adapter: - if not hasattr(self.fastpitch, 'pitch_predictor'): - logging.warning( - "Cannot add adapter to this object as it does not have an `fastpitch.pitch_predictor` sub-module!", - mode=logging_mode.ONCE, - ) - - if hasattr(self.fastpitch, 'pitch_predictor') and not isinstance( - self.fastpitch.pitch_predictor, AdapterModuleMixin - ): - logging.warning( - f'{self.fastpitch.pitch_predictor.__class__.__name__} does not implement `AdapterModuleMixin`', - mode=logging_mode.ONCE, - ) - - # Test whether the aligner supports adapters - use_aligner_adapter = global_cfg.get('check_aligner_adapter', True) - if use_aligner_adapter: - if not hasattr(self.fastpitch, 'aligner'): - logging.warning( - "Cannot add adapter to this object as it does not have an `fastpitch.aligner` sub-module!", - mode=logging_mode.ONCE, - ) - - if hasattr(self.fastpitch, 'aligner') and not isinstance(self.fastpitch.aligner, AdapterModuleMixin): - logging.warning( - f'{self.fastpitch.aligner.__class__.__name__} does not implement `AdapterModuleMixin`', - mode=logging_mode.ONCE, - ) - - def resolve_adapter_module_name_(self, name: str) -> (str, str): - """ - Utility method to resolve a given global/module adapter name to its components. - Always returns a tuple representing (module_name, adapter_name). ":" is used as the - delimiter for denoting the module name vs the adapter name. - Will attempt to also resolve a given adapter_name alone back to (module_name, adapter_name) - if the metadata config exists for access. 
- Args: - name: A global adapter, or a module adapter name (with structure module_name:adapter_name). - Returns: - A tuple representing (module_name, adapter_name). If a global adapter is provided, - module_name is set to ''. - """ - module_name, adapter_name = super().resolve_adapter_module_name_(name) - - # Use + as a splitter, in order to share one name across multiple modules - if '+' in module_name: - module_names = module_name.split('+') - else: - module_names = [module_name] - - # resolve name and module only for valid modules - valid_module_names = self.adapter_module_names - - for mod_name in module_names: - if mod_name not in valid_module_names: - raise ValueError(f"Provided module name `{mod_name}` is not in valid list : {valid_module_names}") - - return (module_name, adapter_name) - - def _get_global_cfg(self): - """ - Utility method, to either extract or construct the global config inside adapters config. - """ - global_config = DictConfig({}) - if 'adapters' in self.cfg and self.adapter_global_cfg_key in self.cfg.adapters: - global_config = self.adapter_cfg[self.adapter_global_cfg_key] - return global_config - - @property - def adapter_module_names(self) -> List[str]: - module_names = super().adapter_module_names # "Default" adapter module: '' - module_names.extend( - ['encoder', 'decoder', 'duration_predictor', 'pitch_predictor', 'aligner'] - ) # Add support for `encoder` and `decoder` modules - return module_names diff --git a/nemo/collections/tts/torch/tts_data_types.py b/nemo/collections/tts/torch/tts_data_types.py index ae7516009cd9..899e5da7d801 100644 --- a/nemo/collections/tts/torch/tts_data_types.py +++ b/nemo/collections/tts/torch/tts_data_types.py @@ -67,10 +67,6 @@ class LMTokens(TTSDataType): name = "lm_tokens" -class ReferenceAudio(TTSDataType, WithLens): - name = "reference_audio" - - MAIN_DATA_TYPES = [Audio, Text] VALID_SUPPLEMENTARY_DATA_TYPES = [ LogMel, @@ -82,6 +78,5 @@ class ReferenceAudio(TTSDataType, WithLens): LMTokens, Voiced_mask, P_voiced, - ReferenceAudio, ] DATA_STR2DATA_CLASS = {d.name: d for d in MAIN_DATA_TYPES + VALID_SUPPLEMENTARY_DATA_TYPES} From 0f593b863ccab88795047e2968f020cb0a23ea14 Mon Sep 17 00:00:00 2001 From: Micha Livne Date: Sun, 23 Apr 2023 16:47:46 -0700 Subject: [PATCH 22/23] 1. Debugging. --- .../eval_beamsearch_ngram_transducer.py | 534 ++++++------------ .../tts/ljspeech/get_data.py | 8 +- .../code_switching/README.md | 4 +- .../code_switching_audio_data_creation.py | 26 +- .../code_switching_manifest_creation.py | 2 +- .../convert_to_tarred_audio_dataset.py | 43 +- .../asr/test_asr_ctc_encoder_model_bpe.py | 1 - .../asr/test_asr_ctcencdec_model.py | 1 - .../collections/nlp/test_rampup_batch_size.py | 195 ------- .../tts/modules/test_submodules.py | 48 -- 10 files changed, 183 insertions(+), 679 deletions(-) delete mode 100644 tests/collections/nlp/test_rampup_batch_size.py delete mode 100644 tests/collections/tts/modules/test_submodules.py diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py index bbc33d214636..4e3c342b9da1 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py @@ -13,443 +13,257 @@ # limitations under the License. 
# -""" # This script would evaluate an N-gram language model trained with KenLM library (https://github.com/kpu/kenlm) in -# fusion with beam search decoders on top of a trained ASR Transducer model. NeMo's beam search decoders are capable of using the +# fusion with beam search decoders on top of a trained ASR model. NeMo's beam search decoders are capable of using the # KenLM's N-gram models to find the best candidates. This script supports both character level and BPE level # encodings and models which is detected automatically from the type of the model. # You may train the LM model with 'scripts/ngram_lm/train_kenlm.py'. - -# Config Help - -To discover all arguments of the script, please run : -python eval_beamsearch_ngram.py --help -python eval_beamsearch_ngram.py --cfg job - -# USAGE - -python eval_beamsearch_ngram_transducer.py nemo_model_file= \ - input_manifest= \ - beam_width=[] \ - beam_alpha=[] \ - preds_output_folder= \ - probs_cache_file=null \ - decoding_strategy= - maes_prefix_alpha=[] \ - maes_expansion_gamma=[] \ - hat_subtract_ilm= \ - hat_ilm_weight=[] \ - ... - - -# Grid Search for Hyper parameters - -For grid search, you can provide a list of arguments as follows - - - beam_width=[4,8,16,....] \ - beam_alpha=[-2.0,-1.0,...,1.0,2.0] \ - +# +# USAGE: python eval_beamsearch_ngram.py --nemo_model_file= \ +# --input_manifest= \ +# --beam_width= \ +# --beam_alpha= \ +# --preds_output_folder= \ +# --decoding_mode=maes +# ... +# # You may find more info on how to use this script at: # https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html -""" - +import argparse import contextlib import json import os import pickle -import tempfile -from dataclasses import dataclass, field, is_dataclass -from pathlib import Path -from typing import List, Optional import editdistance -import numpy as np import torch -from omegaconf import MISSING, OmegaConf -from sklearn.model_selection import ParameterGrid +from omegaconf import OmegaConf from tqdm.auto import tqdm import nemo.collections.asr as nemo_asr -from nemo.collections.asr.parts.submodules import rnnt_beam_decoding -from nemo.core.config import hydra_runner +from nemo.collections.asr.metrics.rnnt_wer_bpe import RNNTBPEDecodingConfig from nemo.utils import logging -# fmt: off - - -@dataclass -class EvalBeamSearchNGramConfig: - """ - Evaluate an ASR model with beam search decoding and n-gram KenLM language model. 
- """ - # # The path of the '.nemo' file of the ASR model or the name of a pretrained model (ngc / huggingface) - nemo_model_file: str = MISSING - - # File paths - input_manifest: str = MISSING # The manifest file of the evaluation set - kenlm_model_file: Optional[str] = None # The path of the KenLM binary model file - preds_output_folder: Optional[str] = None # The optional folder where the predictions are stored - probs_cache_file: Optional[str] = None # The cache file for storing the logprobs of the model - - # Parameters for inference - acoustic_batch_size: int = 128 # The batch size to calculate log probabilities - beam_batch_size: int = 128 # The batch size to be used for beam search decoding - device: str = "cuda" # The device to load the model onto to calculate log probabilities - use_amp: bool = False # Whether to use AMP if available to calculate log probabilities - num_workers: int = 1 # Number of workers for DataLoader - - # The decoding scheme to be used for evaluation - decoding_strategy: str = "greedy_batch" # ["greedy_batch", "beam", "tsd", "alsd", "maes"] - - # Beam Search hyperparameters - beam_width: List[int] = field(default_factory=lambda: [8]) # The width or list of the widths for the beam search decoding - beam_alpha: List[float] = field(default_factory=lambda: [0.2]) # The alpha parameter or list of the alphas for the beam search decoding - - maes_prefix_alpha: List[int] = field(default_factory=lambda: [2]) # The maes_prefix_alpha or list of the maes_prefix_alpha for the maes decoding - maes_expansion_gamma: List[float] = field(default_factory=lambda: [2.3]) # The maes_expansion_gamma or list of the maes_expansion_gamma for the maes decoding - - # HAT related parameters (only for internal lm subtraction) - hat_subtract_ilm: bool = False - hat_ilm_weight: List[float] = field(default_factory=lambda: [0.0]) - - decoding: rnnt_beam_decoding.BeamRNNTInferConfig = rnnt_beam_decoding.BeamRNNTInferConfig(beam_size=128) - - -# fmt: on - - -def decoding_step( - model: nemo_asr.models.ASRModel, - cfg: EvalBeamSearchNGramConfig, - all_probs: List[torch.Tensor], - target_transcripts: List[str], - preds_output_file: str = None, - beam_batch_size: int = 128, - progress_bar: bool = True, -): - level = logging.getEffectiveLevel() - logging.setLevel(logging.CRITICAL) - # Reset config - model.change_decoding_strategy(None) - - cfg.decoding.hat_ilm_weight = cfg.decoding.hat_ilm_weight * cfg.hat_subtract_ilm - # Override the beam search config with current search candidate configuration - cfg.decoding.return_best_hypothesis = False - cfg.decoding.ngram_lm_model = cfg.kenlm_model_file - cfg.decoding.hat_subtract_ilm = cfg.hat_subtract_ilm - - # Update model's decoding strategy config - model.cfg.decoding.strategy = cfg.decoding_strategy - model.cfg.decoding.beam = cfg.decoding - - # Update model's decoding strategy - model.change_decoding_strategy(model.cfg.decoding) - logging.setLevel(level) +def beam_search_eval(all_hypotheses, target_transcripts, preds_output_file=None): wer_dist_first = cer_dist_first = 0 wer_dist_best = cer_dist_best = 0 words_count = 0 chars_count = 0 - sample_idx = 0 if preds_output_file: - out_file = open(preds_output_file, 'w', encoding='utf_8', newline='\n') - - if progress_bar: - if cfg.decoding_strategy == "greedy_batch": - description = "Greedy_batch decoding.." 
- else: - description = f"{cfg.decoding_strategy} decoding with bw={cfg.decoding.beam_size}, ba={cfg.decoding.ngram_lm_alpha}, ma={cfg.decoding.maes_prefix_alpha}, mg={cfg.decoding.maes_expansion_gamma}, hat_ilmw={cfg.decoding.hat_ilm_weight}" - it = tqdm(range(int(np.ceil(len(all_probs) / beam_batch_size))), desc=description, ncols=120) - else: - it = range(int(np.ceil(len(all_probs) / beam_batch_size))) - for batch_idx in it: - # disabling type checking - probs_batch = all_probs[batch_idx * beam_batch_size : (batch_idx + 1) * beam_batch_size] - probs_lens = torch.tensor([prob.shape[-1] for prob in probs_batch]) - with torch.no_grad(): - packed_batch = torch.zeros(len(probs_batch), probs_batch[0].shape[0], max(probs_lens), device='cpu') - - for prob_index in range(len(probs_batch)): - packed_batch[prob_index, :, : probs_lens[prob_index]] = torch.tensor( - probs_batch[prob_index].unsqueeze(0), device=packed_batch.device, dtype=packed_batch.dtype - ) - best_hyp_batch, beams_batch = model.decoding.rnnt_decoder_predictions_tensor( - packed_batch, probs_lens, return_hypotheses=True, - ) - if cfg.decoding_strategy == "greedy_batch": - beams_batch = [[x] for x in best_hyp_batch] - - for beams_idx, beams in enumerate(beams_batch): - target = target_transcripts[sample_idx + beams_idx] - target_split_w = target.split() - target_split_c = list(target) - words_count += len(target_split_w) - chars_count += len(target_split_c) - wer_dist_min = cer_dist_min = 10000 - for candidate_idx, candidate in enumerate(beams): # type: (int, rnnt_beam_decoding.rnnt_utils.Hypothesis) - pred_text = candidate.text - pred_split_w = pred_text.split() - wer_dist = editdistance.eval(target_split_w, pred_split_w) - pred_split_c = list(pred_text) - cer_dist = editdistance.eval(target_split_c, pred_split_c) - - wer_dist_min = min(wer_dist_min, wer_dist) - cer_dist_min = min(cer_dist_min, cer_dist) - - if candidate_idx == 0: - # first candidate - wer_dist_first += wer_dist - cer_dist_first += cer_dist - - score = candidate.score - if preds_output_file: - out_file.write('{}\t{}\n'.format(pred_text, score)) - wer_dist_best += wer_dist_min - cer_dist_best += cer_dist_min - sample_idx += len(probs_batch) - - if cfg.decoding_strategy == "greedy_batch": - return wer_dist_first / words_count, cer_dist_first / chars_count + out_file = open(preds_output_file, 'w') + + it = tqdm(range(len(all_hypotheses)), desc=f"Beam search decoding...", ncols=120,) + + for sample_idx in it: + hypotheses = all_hypotheses[sample_idx] + target = target_transcripts[sample_idx] + target_split_w = target.split() + target_split_c = list(target) + words_count += len(target_split_w) + chars_count += len(target_split_c) + wer_dist_min = cer_dist_min = 10000 + if not isinstance(hypotheses, list): + hypotheses = [hypotheses] + for candidate_idx, candidate in enumerate(hypotheses): + pred_text = candidate.text + pred_split_w = pred_text.split() + wer_dist = editdistance.eval(target_split_w, pred_split_w) + pred_split_c = list(pred_text) + cer_dist = editdistance.eval(target_split_c, pred_split_c) + + wer_dist_min = min(wer_dist_min, wer_dist) + cer_dist_min = min(cer_dist_min, cer_dist) + + if candidate_idx == 0: + # first candidate + wer_dist_first += wer_dist + cer_dist_first += cer_dist + + if preds_output_file: + out_file.write('{}\t{}\n'.format(pred_text, candidate.score)) + wer_dist_best += wer_dist_min + cer_dist_best += cer_dist_min if preds_output_file: out_file.close() - logging.info(f"Stored the predictions of {cfg.decoding_strategy} decoding at 
'{preds_output_file}'.") + logging.info(f"Stored the predictions of beam search decoding at '{preds_output_file}'.") - if cfg.decoding.ngram_lm_model: - logging.info( - f"WER/CER with {cfg.decoding_strategy} decoding and N-gram model = {wer_dist_first / words_count:.2%}/{cer_dist_first / chars_count:.2%}" - ) - else: - logging.info( - f"WER/CER with {cfg.decoding_strategy} decoding = {wer_dist_first / words_count:.2%}/{cer_dist_first / chars_count:.2%}" + logging.info( + 'WER/CER with the provided decoding strategy = {:.2%}/{:.2%}'.format( + wer_dist_first / words_count, cer_dist_first / chars_count ) + ) + logging.info( - f"Oracle WER/CER in candidates with perfect LM= {wer_dist_best / words_count:.2%}/{cer_dist_best / chars_count:.2%}" + 'Oracle WER/CER in candidates = {:.2%}/{:.2%}'.format(wer_dist_best / words_count, cer_dist_best / chars_count) ) logging.info(f"=================================================================================") - return wer_dist_first / words_count, cer_dist_first / chars_count +def main(): + parser = argparse.ArgumentParser( + description='Evaluate an ASR model with beam search decoding and n-gram KenLM language model.' + ) + parser.add_argument( + "--nemo_model_file", required=True, type=str, help="The path of the '.nemo' file of the ASR model" + ) + parser.add_argument( + "--kenlm_model_file", required=False, default=None, type=str, help="The path of the KenLM binary model file" + ) + parser.add_argument("--input_manifest", required=True, type=str, help="The manifest file of the evaluation set") + parser.add_argument( + "--preds_output_folder", default=None, type=str, help="The optional folder where the predictions are stored" + ) + parser.add_argument( + "--probs_cache_file", default=None, type=str, help="The cache file for storing the outputs of the model" + ) + parser.add_argument( + "--acoustic_batch_size", default=16, type=int, help="The batch size to calculate log probabilities" + ) + parser.add_argument( + "--device", default="cuda", type=str, help="The device to load the model onto to calculate log probabilities" + ) + parser.add_argument( + "--use_amp", action="store_true", help="Whether to use AMP if available to calculate log probabilities" + ) + parser.add_argument( + "--decoding_mode", + choices=["greedy", "greedy_batch", "beam", "tsd", "alsd", "maes"], + default="beam", + type=str, + help="The decoding scheme to be used for evaluation.", + ) + parser.add_argument( + "--beam_width", required=True, type=int, help="The width for the beam search decoding", + ) + parser.add_argument( + "--beam_alpha", required=True, type=float, help="The alpha parameter for the beam search decoding", + ) + parser.add_argument( + "--beam_batch_size", default=128, type=int, help="The batch size to be used for beam search decoding" + ) + parser.add_argument( + "--maes_prefix_alpha", + default=1, + type=int, + help="Float pruning threshold used in the prune-by-value step when computing the expansions.", + ) + parser.add_argument( + "--maes_expansion_gamma", default=2.3, type=float, help="Maximum prefix length in prefix search" + ) + parser.add_argument( + "--hat_subtract_ilm", action="store_true", help="Subtract internal LM from the final HAT logprobs" + ) + parser.add_argument("--hat_ilm_weight", default=0.0, type=float, help="lamda2 weight for HAT ILM subsrtact") -@hydra_runner(config_path=None, config_name='EvalBeamSearchNGramConfig', schema=EvalBeamSearchNGramConfig) -def main(cfg: EvalBeamSearchNGramConfig): - if is_dataclass(cfg): - cfg = 
OmegaConf.structured(cfg) # type: EvalBeamSearchNGramConfig + args = parser.parse_args() - valid_decoding_strategis = ["greedy_batch", "beam", "tsd", "alsd", "maes"] - if cfg.decoding_strategy not in valid_decoding_strategis: - raise ValueError( - f"Given decoding_strategy={cfg.decoding_strategy} is invalid. Available options are :\n" - f"{valid_decoding_strategis}" - ) + if args.kenlm_model_file and args.decoding_mode != "maes": + raise ValueError("External n-gram LM fusion is available only for 'maes' decoding mode.") - if cfg.nemo_model_file.endswith('.nemo'): - asr_model = nemo_asr.models.ASRModel.restore_from(cfg.nemo_model_file, map_location=torch.device(cfg.device)) + if args.nemo_model_file.endswith('.nemo'): + asr_model = nemo_asr.models.ASRModel.restore_from(args.nemo_model_file, map_location=torch.device(args.device)) else: logging.warning( "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name." ) asr_model = nemo_asr.models.ASRModel.from_pretrained( - cfg.nemo_model_file, map_location=torch.device(cfg.device) + args.nemo_model_file, map_location=torch.device(args.device) ) - if cfg.kenlm_model_file: - if not os.path.exists(cfg.kenlm_model_file): - raise FileNotFoundError(f"Could not find the KenLM model file '{cfg.kenlm_model_file}'.") - if cfg.decoding_strategy != "maes": - raise ValueError(f"Decoding with kenlm model is supported only for maes decoding algorithm.") - lm_path = cfg.kenlm_model_file - else: - lm_path = None - cfg.beam_alpha = [0.0] - if cfg.hat_subtract_ilm: - assert lm_path, "kenlm must be set for hat internal lm subtraction" - - if cfg.decoding_strategy != "maes": - cfg.maes_prefix_alpha, cfg.maes_expansion_gamma, cfg.hat_ilm_weight = [0], [0], [0] - target_transcripts = [] - manifest_dir = Path(cfg.input_manifest).parent - with open(cfg.input_manifest, 'r', encoding='utf_8') as manifest_file: + with open(args.input_manifest, 'r') as manifest_file: audio_file_paths = [] - for line in tqdm(manifest_file, desc=f"Reading Manifest {cfg.input_manifest} ...", ncols=120): + durations = [] + for line in tqdm(manifest_file, desc=f"Reading Manifest {args.input_manifest} ...", ncols=120): data = json.loads(line) - audio_file = Path(data['audio_filepath']) - if not audio_file.is_file() and not audio_file.is_absolute(): - audio_file = manifest_dir / audio_file target_transcripts.append(data['text']) - audio_file_paths.append(str(audio_file.absolute())) + audio_file_paths.append(data['audio_filepath']) + durations.append(data['duration']) - if cfg.probs_cache_file and os.path.exists(cfg.probs_cache_file): - logging.info(f"Found a pickle file of probabilities at '{cfg.probs_cache_file}'.") - logging.info(f"Loading the cached pickle file of probabilities from '{cfg.probs_cache_file}' ...") - with open(cfg.probs_cache_file, 'rb') as probs_file: + if args.probs_cache_file and os.path.exists(args.probs_cache_file): + logging.info(f"Found a pickle file of probabilities at '{args.probs_cache_file}'.") + logging.info(f"Loading the cached pickle file of probabilities from '{args.probs_cache_file}' ...") + with open(args.probs_cache_file, 'rb') as probs_file: all_probs = pickle.load(probs_file) if len(all_probs) != len(audio_file_paths): raise ValueError( - f"The number of samples in the probabilities file '{cfg.probs_cache_file}' does not " + f"The number of samples in the probabilities file '{args.probs_cache_file}' does not " f"match the manifest file. You may need to delete the probabilities cached file." 
) else: + asr_model = asr_model.eval() + rnnt_cfg = RNNTBPEDecodingConfig() + rnnt_cfg.strategy = args.decoding_mode # beam greedy + rnnt_cfg.beam.beam_size = args.beam_width + rnnt_cfg.beam.ngram_lm_model = args.kenlm_model_file + rnnt_cfg.beam.ngram_lm_alpha = args.beam_alpha # 0.2, 0.3 + rnnt_cfg.compute_hypothesis_token_set = False + rnnt_cfg.beam.return_best_hypothesis = False + rnnt_cfg.beam.maes_prefix_alpha = args.maes_prefix_alpha + rnnt_cfg.beam.maes_expansion_gamma = args.maes_expansion_gamma + rnnt_cfg.beam.hat_subtract_ilm = args.hat_subtract_ilm + rnnt_cfg.beam.hat_ilm_weight = args.hat_ilm_weight + asr_model.change_decoding_strategy(OmegaConf.structured(rnnt_cfg)) @contextlib.contextmanager def default_autocast(): yield - if cfg.use_amp: + if args.use_amp: if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): logging.info("AMP is enabled!\n") autocast = torch.cuda.amp.autocast - else: autocast = default_autocast else: - autocast = default_autocast - # manual calculation of encoder_embeddings - with autocast(): - with torch.no_grad(): - asr_model.eval() - asr_model.encoder.freeze() - device = next(asr_model.parameters()).device - all_probs = [] - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, 'manifest.json'), 'w', encoding='utf-8') as fp: - for audio_file in audio_file_paths: - entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': ''} - fp.write(json.dumps(entry) + '\n') - config = { - 'paths2audio_files': audio_file_paths, - 'batch_size': cfg.acoustic_batch_size, - 'temp_dir': tmpdir, - 'num_workers': cfg.num_workers, - 'channel_selector': None, - 'augmentor': None, - } - temporary_datalayer = asr_model._setup_transcribe_dataloader(config) - for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=True): - encoded, encoded_len = asr_model.forward( - input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) - ) - # dump encoder embeddings per file - for idx in range(encoded.shape[0]): - encoded_no_pad = encoded[idx, :, : encoded_len[idx]] - all_probs.append(encoded_no_pad) - - if cfg.probs_cache_file: - logging.info(f"Writing pickle files of probabilities at '{cfg.probs_cache_file}'...") - with open(cfg.probs_cache_file, 'wb') as f_dump: - pickle.dump(all_probs, f_dump) - - if cfg.decoding_strategy == "greedy_batch": - asr_model = asr_model.to('cpu') - candidate_wer, candidate_cer = decoding_step( - asr_model, - cfg, - all_probs=all_probs, - target_transcripts=target_transcripts, - beam_batch_size=cfg.beam_batch_size, - progress_bar=True, - ) - logging.info(f"Greedy batch WER/CER = {candidate_wer:.2%}/{candidate_cer:.2%}") - - asr_model = asr_model.to('cpu') - - # 'greedy_batch' decoding_strategy would skip the beam search decoding - if cfg.decoding_strategy in ["beam", "tsd", "alsd", "maes"]: - if cfg.beam_width is None or cfg.beam_alpha is None: - raise ValueError("beam_width and beam_alpha are needed to perform beam search decoding.") - params = { - 'beam_width': cfg.beam_width, - 'beam_alpha': cfg.beam_alpha, - 'maes_prefix_alpha': cfg.maes_prefix_alpha, - 'maes_expansion_gamma': cfg.maes_expansion_gamma, - 'hat_ilm_weight': cfg.hat_ilm_weight, - } - hp_grid = ParameterGrid(params) - hp_grid = list(hp_grid) - - best_wer_beam_size, best_cer_beam_size = None, None - best_wer_alpha, best_cer_alpha = None, None - best_wer, best_cer = 1e6, 1e6 - - logging.info( - f"==============================Starting the {cfg.decoding_strategy} 
decoding===============================" - ) - logging.info(f"Grid search size: {len(hp_grid)}") + params = {'beam_width': args.beam_width, 'beam_alpha': args.beam_alpha} + + logging.info(f"==============================Starting the beam search decoding===============================") + logging.info(f"Beam search params: {params}") logging.info(f"It may take some time...") logging.info(f"==============================================================================================") - if cfg.preds_output_folder and not os.path.exists(cfg.preds_output_folder): - os.mkdir(cfg.preds_output_folder) - for hp in hp_grid: - if cfg.preds_output_folder: - results_file = f"preds_out_{cfg.decoding_strategy}_bw{hp['beam_width']}" - if cfg.decoding_strategy == "maes": - results_file = f"{results_file}_ma{hp['maes_prefix_alpha']}_mg{hp['maes_expansion_gamma']}" - if cfg.kenlm_model_file: - results_file = f"{results_file}_ba{hp['beam_alpha']}" - if cfg.hat_subtract_ilm: - results_file = f"{results_file}_hat_ilmw{hp['hat_ilm_weight']}" - preds_output_file = os.path.join(cfg.preds_output_folder, f"{results_file}.tsv") - else: - preds_output_file = None - - cfg.decoding.beam_size = hp["beam_width"] - cfg.decoding.ngram_lm_alpha = hp["beam_alpha"] - cfg.decoding.maes_prefix_alpha = hp["maes_prefix_alpha"] - cfg.decoding.maes_expansion_gamma = hp["maes_expansion_gamma"] - cfg.decoding.hat_ilm_weight = hp["hat_ilm_weight"] - - candidate_wer, candidate_cer = decoding_step( - asr_model, - cfg, - all_probs=all_probs, - target_transcripts=target_transcripts, - preds_output_file=preds_output_file, - beam_batch_size=cfg.beam_batch_size, - progress_bar=True, - ) + with autocast(): + with torch.no_grad(): + hypotheses, all_hypotheses = asr_model.transcribe( + audio_file_paths, batch_size=args.acoustic_batch_size, return_hypotheses=True + ) - if candidate_cer < best_cer: - best_cer_beam_size = hp["beam_width"] - best_cer_alpha = hp["beam_alpha"] - best_cer_ma = hp["maes_prefix_alpha"] - best_cer_mg = hp["maes_expansion_gamma"] - best_cer_hat_ilm_weight = hp["hat_ilm_weight"] - best_cer = candidate_cer - - if candidate_wer < best_wer: - best_wer_beam_size = hp["beam_width"] - best_wer_alpha = hp["beam_alpha"] - best_wer_ma = hp["maes_prefix_alpha"] - best_wer_ga = hp["maes_expansion_gamma"] - best_wer_hat_ilm_weight = hp["hat_ilm_weight"] - best_wer = candidate_wer - - wer_hat_parameter = "" - if cfg.hat_subtract_ilm: - wer_hat_parameter = f"HAT ilm weight = {best_wer_hat_ilm_weight}, " - logging.info( - f'Best WER Candidate = {best_wer:.2%} :: Beam size = {best_wer_beam_size}, ' - f'Beam alpha = {best_wer_alpha}, {wer_hat_parameter}' - f'maes_prefix_alpha = {best_wer_ma}, maes_expansion_gamma = {best_wer_ga} ' - ) + # delete the model to free the memory + del asr_model + + if args.preds_output_folder and not os.path.exists(args.preds_output_folder): + os.mkdir(args.preds_output_folder) - cer_hat_parameter = "" - if cfg.hat_subtract_ilm: - cer_hat_parameter = f"HAT ilm weight = {best_cer_hat_ilm_weight}" - logging.info( - f'Best CER Candidate = {best_cer:.2%} :: Beam size = {best_cer_beam_size}, ' - f'Beam alpha = {best_cer_alpha}, {cer_hat_parameter} ' - f'maes_prefix_alpha = {best_cer_ma}, maes_expansion_gamma = {best_cer_mg}' + if args.preds_output_folder: + preds_output_file = os.path.join( + args.preds_output_folder, f"preds_out_width{args.beam_width}_alpha{args.beam_alpha}.tsv", ) - logging.info(f"=================================================================================") + preds_output_manifest = 
os.path.join(args.preds_output_folder, f"preds_manifest.json",) + with open(preds_output_manifest, 'w') as fn: + for i, file_name in enumerate(audio_file_paths): + item = { + 'audio_filepath': file_name, + 'duration': durations[i], + 'text': target_transcripts[i], + 'pred_text': hypotheses[i].text, + } + fn.write(json.dumps(item) + "\n") + + else: + preds_output_file = None + + beam_search_eval( + all_hypotheses=all_hypotheses, target_transcripts=target_transcripts, preds_output_file=preds_output_file, + ) if __name__ == '__main__': diff --git a/scripts/dataset_processing/tts/ljspeech/get_data.py b/scripts/dataset_processing/tts/ljspeech/get_data.py index 7c28fb8ef903..733f9b76b354 100644 --- a/scripts/dataset_processing/tts/ljspeech/get_data.py +++ b/scripts/dataset_processing/tts/ljspeech/get_data.py @@ -27,11 +27,7 @@ def get_args(): parser = argparse.ArgumentParser(description='Download LJSpeech and create manifests with predefined split') parser.add_argument("--data-root", required=True, type=Path) - parser.add_argument( - '--whitelist-path', - type=str, - default="lj_speech.tsv extracted from the readme file in the dataset. You can also download the file from https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv", - ) + parser.add_argument('--whitelist-path', type=str, default="lj_speech.tsv") args = parser.parse_args() return args @@ -60,7 +56,7 @@ def __extract_file(filepath, data_dir): def __process_data(data_root, whitelist_path): if whitelist_path is None: wget.download( - "https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv", + "https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tts_dataset_files/en/whitelist_lj_speech.tsv", out=str(data_root), ) whitelist_path = data_root / "lj_speech.tsv" diff --git a/scripts/speech_recognition/code_switching/README.md b/scripts/speech_recognition/code_switching/README.md index 42f1601b21b4..82f3772008f4 100644 --- a/scripts/speech_recognition/code_switching/README.md +++ b/scripts/speech_recognition/code_switching/README.md @@ -10,8 +10,6 @@ Follow the 2 steps listed below in order - 2. 
Create the synthetic audio data and the corresponding manifest file using `code_switching_audio_data_creation.py` It's usage is as follows: - `python code_switching_audio_data_creation.py --manifest_path --audio_save_folder_path --manifest_save_path --audio_normalized_amplitude --cs_data_sampling_rate --sample_beginning_pause_msec --sample_joining_pause_msec --sample_end_pause_msec --is_lid_manifest --workers ` + `python code_switching_audio_data_creation.py --manifest_path --audio_save_folder_path --manifest_save_path --audio_normalized_amplitude --cs_data_sampling_rate --sample_beginning_pause_msec --sample_joining_pause_msec --sample_end_pause_msec --workers ` - Example of the multi-sample LID format: ```[{“str”:“esta muestra ” “lang”:”es”},{“str”:“was generated synthetically”: “lang”:”en”}]``` - Estimated runtime for generating a 10,000 hour corpus is ~40 hrs with a single worker \ No newline at end of file diff --git a/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py b/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py index c53b3eeaac36..6c378b7cdf99 100644 --- a/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py +++ b/scripts/speech_recognition/code_switching/code_switching_audio_data_creation.py @@ -60,12 +60,6 @@ parser.add_argument( "--sample_end_pause_msec", default=20, type=int, help='Pause to be added at the end of the sample (msec)' ) -parser.add_argument( - "--is_lid_manifest", - default=True, - type=bool, - help='If true, generate manifest in the multi-sample lid format, else the standard manifest format', -) parser.add_argument("--workers", default=1, type=int, help='Number of worker processes') args = parser.parse_args() @@ -122,7 +116,6 @@ def create_cs_data( pause_join_msec: int, pause_end_msec: int, cs_data_sampling_rate: int, - is_lid_manifest: bool, ): """ @@ -135,7 +128,6 @@ def create_cs_data( pause_join_msec: Pause to be added between different phrases of the sample (msec) pause_end_msec: Pause to be added at the end of the sample (msec) cs_data_sampling_rate: Desired sampling rate of the generated samples - is_lid_manifest: If true, generate manifest in the multi-sample lid format, else the standard manifest format Returns: @@ -152,12 +144,8 @@ def create_cs_data( staring_pause = np.zeros(int(pause_beg_msec * fs / 1000)) combined_audio += list(staring_pause) - text_entry_list = [] for index in range(len(data['lang_ids'])): - phrase_entry = {} - # dictionary to store the phrase information which will be added to the complete sentence - data_sample, fs_sample = librosa.load(data['paths'][index], sr=fs) # Alternative- fs_sample, data_sample = wavfile.read(data['paths'][index]) @@ -182,12 +170,7 @@ def create_cs_data( combined_audio += list(data_sample_norm) - phrase_entry['str'] = data['texts'][index] - phrase_entry['lang'] = data['lang_ids'][index] - - text_entry_list.append(phrase_entry) - - # adding small pause between semgments + # adding small pause between gemgments if index != (len(data['lang_ids']) - 1): pause = np.zeros(int(pause_join_msec * fs / 1000)) combined_audio += list(pause) @@ -209,10 +192,7 @@ def create_cs_data( metadata_json = {} metadata_json['audio_filepath'] = audio_file_path metadata_json['duration'] = float(len(combined_audio) / fs) - if is_lid_manifest: - metadata_json['text'] = text_entry_list - else: - metadata_json['text'] = ' '.join(data['texts']) + metadata_json['text'] = ' '.join(data['texts']) metadata_json['language_ids'] = data['lang_ids'] 
metadata_json['original_texts'] = data['texts'] @@ -233,7 +213,6 @@ def main(): pause_join_msec = args.sample_joining_pause_msec pause_end_msec = args.sample_end_pause_msec cs_data_sampling_rate = args.cs_data_sampling_rate - is_lid_manifest = args.is_lid_manifest num_process = args.workers # Sanity Checks @@ -270,7 +249,6 @@ def main(): pause_join_msec, pause_end_msec, cs_data_sampling_rate, - is_lid_manifest, ) for idx, split_manifest in enumerate(data_split) ) diff --git a/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py b/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py index c783f803a74d..9eca4bb6977a 100644 --- a/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py +++ b/scripts/speech_recognition/code_switching/code_switching_manifest_creation.py @@ -20,7 +20,7 @@ # Checks - # (Recommendation) Please normalize the text for each language (avoid numbers, special characters, punctuation) -# Please ensure that the audio_filepaths are absolute locations +# Please ensure that the audio_fielpaths are absolute locations parser = argparse.ArgumentParser(description='Create synthetic code-switching data manifest from monolingual data') diff --git a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py index 64c086997ef0..f227fbcd538c 100644 --- a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py +++ b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py @@ -174,11 +174,6 @@ "and it must be filled out by the user." ), ) -parser.add_argument( - "--no_shard_manifests", - action='store_true', - help="Do not write sharded manifests along with the aggregated manifest.", -) parser.add_argument('--workers', type=int, default=1, help='Number of worker processes') args = parser.parse_args() @@ -191,7 +186,6 @@ class ASRTarredDatasetConfig: min_duration: Optional[float] = None shuffle_seed: Optional[int] = None sort_in_shards: bool = True - shard_manifests: bool = True keep_files_together: bool = False @@ -328,19 +322,6 @@ def create_new_dataset(self, manifest_path: str, target_dir: str = "./tarred/", for i, (start_idx, end_idx) in enumerate(zip(start_indices, end_indices)) ) - if config.shard_manifests: - sharded_manifests_dir = target_dir + '/sharded_manifests' - if not os.path.exists(sharded_manifests_dir): - os.makedirs(sharded_manifests_dir) - - for manifest in new_entries_list: - shard_id = manifest[0]['shard_id'] - new_manifest_shard_path = os.path.join(sharded_manifests_dir, f'manifest_{shard_id}.json') - with open(new_manifest_shard_path, 'w', encoding='utf-8') as m2: - for entry in manifest: - json.dump(entry, m2) - m2.write('\n') - # Flatten the list of list of entries to a list of entries new_entries = [sample for manifest in new_entries_list for sample in manifest] del new_entries_list @@ -349,7 +330,7 @@ def create_new_dataset(self, manifest_path: str, target_dir: str = "./tarred/", # Write manifest new_manifest_path = os.path.join(target_dir, 'tarred_audio_manifest.json') - with open(new_manifest_path, 'w', encoding='utf-8') as m2: + with open(new_manifest_path, 'w') as m2: for entry in new_entries: json.dump(entry, m2) m2.write('\n') @@ -486,19 +467,6 @@ def create_concatenated_dataset( for i, (start_idx, end_idx, shard_idx) in enumerate(zip(start_indices, end_indices, shard_indices)) ) - if config.shard_manifests: - sharded_manifests_dir = target_dir + '/sharded_manifests' - if not 
os.path.exists(sharded_manifests_dir): - os.makedirs(sharded_manifests_dir) - - for manifest in new_entries_list: - shard_id = manifest[0]['shard_id'] - new_manifest_shard_path = os.path.join(sharded_manifests_dir, f'manifest_{shard_id}.json') - with open(new_manifest_shard_path, 'w', encoding='utf-8') as m2: - for entry in manifest: - json.dump(entry, m2) - m2.write('\n') - # Flatten the list of list of entries to a list of entries new_entries = [sample for manifest in new_entries_list for sample in manifest] del new_entries_list @@ -512,7 +480,7 @@ def create_concatenated_dataset( print("Total number of entries in manifest :", len(base_entries) + len(new_entries)) new_manifest_path = os.path.join(target_dir, f'tarred_audio_manifest_version_{new_version}.json') - with open(new_manifest_path, 'w', encoding='utf-8') as m2: + with open(new_manifest_path, 'w') as m2: # First write all the entries of base manifest for entry in base_entries: json.dump(entry, m2) @@ -555,7 +523,7 @@ def _read_manifest(self, manifest_path: str, config: ASRTarredDatasetConfig): total_duration = 0.0 filtered_entries = [] filtered_duration = 0.0 - with open(manifest_path, 'r', encoding='utf-8') as m: + with open(manifest_path, 'r') as m: for line in m: entry = json.loads(line) if (config.max_duration is None or entry['duration'] < config.max_duration) and ( @@ -658,8 +626,6 @@ def main(): def create_tar_datasets(min_duration: float, max_duration: float, target_dir: str): builder = ASRTarredDatasetBuilder() - shard_manifests = False if args.no_shard_manifests else True - if args.write_metadata: metadata = ASRTarredDatasetMetadata() dataset_cfg = ASRTarredDatasetConfig( @@ -669,7 +635,6 @@ def create_tar_datasets(min_duration: float, max_duration: float, target_dir: st min_duration=min_duration, shuffle_seed=args.shuffle_seed, sort_in_shards=args.sort_in_shards, - shard_manifests=shard_manifests, keep_files_together=args.keep_files_together, ) metadata.dataset_config = dataset_cfg @@ -690,7 +655,6 @@ def create_tar_datasets(min_duration: float, max_duration: float, target_dir: st min_duration=min_duration, shuffle_seed=args.shuffle_seed, sort_in_shards=args.sort_in_shards, - shard_manifests=shard_manifests, keep_files_together=args.keep_files_together, ) builder.configure(config) @@ -718,7 +682,6 @@ def create_tar_datasets(min_duration: float, max_duration: float, target_dir: st metadata.dataset_config.shuffle = args.shuffle metadata.dataset_config.shuffle_seed = args.shuffle_seed metadata.dataset_config.sort_in_shards = args.sort_in_shards - metadata.dataset_config.shard_manifests = shard_manifests builder.configure(metadata.dataset_config) diff --git a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py index 13c31ef36a9c..f671fd925c38 100644 --- a/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py +++ b/tests/collections/asr/test_asr_ctc_encoder_model_bpe.py @@ -291,7 +291,6 @@ def test_ASRDatasetConfig_for_AudioToBPEDataset(self): 'pin_memory', 'drop_last', 'tarred_shard_strategy', - 'shard_manifests', 'shuffle_n', 'parser', 'normalize', diff --git a/tests/collections/asr/test_asr_ctcencdec_model.py b/tests/collections/asr/test_asr_ctcencdec_model.py index 8d90079d0c51..8dfa9ce62cd6 100644 --- a/tests/collections/asr/test_asr_ctcencdec_model.py +++ b/tests/collections/asr/test_asr_ctcencdec_model.py @@ -266,7 +266,6 @@ def test_ASRDatasetConfig_for_AudioToCharDataset(self): 'pin_memory', 'drop_last', 'tarred_shard_strategy', - 'shard_manifests', 
'shuffle_n', 'use_start_end_token', 'use_start_end_token', diff --git a/tests/collections/nlp/test_rampup_batch_size.py b/tests/collections/nlp/test_rampup_batch_size.py deleted file mode 100644 index 86af6bf51e1d..000000000000 --- a/tests/collections/nlp/test_rampup_batch_size.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytest -import torch -from omegaconf import DictConfig -from pytorch_lightning import Trainer - - -from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy - -try: - import apex.transformer.pipeline_parallel.utils - from apex.transformer.pipeline_parallel.utils import get_num_microbatches - - HAVE_APEX = True - -except (ImportError, ModuleNotFoundError): - - HAVE_APEX = False - -DEVICE_CAPABILITY = None -if torch.cuda.is_available(): - DEVICE_CAPABILITY = torch.cuda.get_device_capability() - - -def reset_microbatch_calculator(): - apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None - - -@pytest.fixture() -def model_cfg(test_data_dir): - - model_cfg = { - 'precision': 16, - 'micro_batch_size': 4, - 'global_batch_size': 16, - 'rampup_batch_size': [4, 4, 100], - 'tensor_model_parallel_size': 1, - 'pipeline_model_parallel_size': 1, - 'resume_from_checkpoint': None, - 'encoder_seq_length': 512, - 'max_position_embeddings': 512, - 'num_layers': 1, - 'hidden_size': 128, - 'ffn_hidden_size': 512, - 'num_attention_heads': 2, - 'init_method_std': 0.02, - 'hidden_dropout': 0.1, - 'kv_channels': None, - 'apply_query_key_layer_scaling': True, - 'layernorm_epsilon': 1e-5, - 'make_vocab_size_divisible_by': 128, - 'pre_process': True, - 'post_process': True, - 'persist_layer_norm': True, - 'gradient_as_bucket_view': True, - 'tokenizer': { - 'library': 'megatron', - 'type': 'GPT2BPETokenizer', - 'model': None, - 'vocab_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/vocab.json'), - 'merge_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/merges.txt'), - 'delimiter': None, - }, - 'native_amp_init_scale': 4294967296, - 'native_amp_growth_interval': 1000, - 'hysteresis': 2, - 'fp32_residual_connection': False, - 'fp16_lm_cross_entropy': False, - 'megatron_amp_O2': False, - 'seed': 1234, - 'use_cpu_initialization': False, - 'onnx_safe': False, - 'apex_transformer_log_level': 30, - 'activations_checkpoint_method': None, - 'activations_checkpoint_num_layers': 1, - 'data': { - 'data_prefix': '???', - 'index_mapping_dir': None, - 'data_impl': 'mmap', - 'splits_string': '900,50,50', - 'seq_length': 512, - 'skip_warmup': True, - 'num_workers': 2, - 'dataloader_type': 'single', - 'reset_position_ids': False, - 'reset_attention_mask': False, - 'eod_mask_loss': False, - }, - 'optim': { - 'name': 'fused_adam', - 'lr': 2e-4, - 'weight_decay': 0.01, - 'betas': [0.9, 0.98], - 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 
'constant_steps': 50000, 'min_lr': '2e-5'}, - }, - } - - return model_cfg - - -@pytest.fixture() -def trainer_cfg(): - - trainer_cfg = { - 'devices': 1, - 'num_nodes': 1, - 'accelerator': 'gpu', - 'precision': 16, - 'logger': False, - 'enable_checkpointing': False, - 'replace_sampler_ddp': False, - 'max_epochs': 1, - 'max_steps': 150, - 'log_every_n_steps': 10, - 'val_check_interval': 100, - 'limit_val_batches': 50, - 'limit_test_batches': 500, - 'accumulate_grad_batches': 1, - 'gradient_clip_val': 1.0, - } - - return trainer_cfg - - -@pytest.fixture() -def gpt_model(model_cfg, trainer_cfg): - - strategy = NLPDDPStrategy() - trainer = Trainer(strategy=strategy, **trainer_cfg) - cfg = DictConfig(model_cfg) - - reset_microbatch_calculator() - model = MegatronGPTModel(cfg, trainer) - - return model - - -@pytest.fixture() -def rampup_batch_size(): - - return [4, 4, 100] - - -@pytest.fixture() -def rampup_batch_size_schedule(): - - return [4, 8, 12, 16] - - -@pytest.mark.run_only_on('GPU') -class TestRampupBatchSize: - @pytest.mark.unit - def test_rampup_bs(self, gpt_model, rampup_batch_size): - - assert gpt_model.cfg.rampup_batch_size == rampup_batch_size - - @pytest.mark.unit - def test_rampup_bs_schedule(self, gpt_model, trainer_cfg, rampup_batch_size_schedule): - - num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR - micro_batch_size = gpt_model.cfg.micro_batch_size - num_devices = trainer_cfg["devices"] - num_nodes = trainer_cfg["num_nodes"] - max_steps = trainer_cfg["max_steps"] - - global_batch_size_schedule = [] - step, consumed_samples = 0, 0 - while step <= max_steps: - step += 1 - current_global_batch_size = get_num_microbatches() * micro_batch_size * num_devices * num_nodes - consumed_samples += current_global_batch_size - num_microbatch_calculator.update(consumed_samples=consumed_samples, consistency_check=True) - - if current_global_batch_size not in global_batch_size_schedule: - global_batch_size_schedule.append(current_global_batch_size) - - reset_microbatch_calculator() - - assert global_batch_size_schedule == rampup_batch_size_schedule diff --git a/tests/collections/tts/modules/test_submodules.py b/tests/collections/tts/modules/test_submodules.py deleted file mode 100644 index 5ee894398739..000000000000 --- a/tests/collections/tts/modules/test_submodules.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -import torch - -from nemo.collections.tts.modules import submodules - - -@pytest.mark.unit -def test_conditional_layer_norm(): - - # NLP Example - batch, sentence_length, embedding_dim = 20, 5, 10 - embedding = torch.randn(batch, sentence_length, embedding_dim) - ln = torch.nn.LayerNorm(embedding_dim) - cln = submodules.ConditionalLayerNorm(embedding_dim) - assert torch.all(ln(embedding) == cln(embedding)) - - weight = torch.nn.Parameter(torch.randn(embedding_dim)) - bias = torch.nn.Parameter(torch.randn(embedding_dim)) - ln.weight, ln.bias = weight, bias - cln.weight, cln.bias = weight, bias - assert torch.all(ln(embedding) == cln(embedding)) # Simulate trained weights - - # Image Example - N, C, H, W = 20, 5, 10, 10 - image = torch.randn(N, C, H, W) - ln = torch.nn.LayerNorm([C, H, W]) - cln = submodules.ConditionalLayerNorm([C, H, W]) - assert torch.all(ln(image) == cln(image)) - - weight = torch.nn.Parameter(torch.randn(C, H, W)) - bias = torch.nn.Parameter(torch.randn(C, H, W)) - ln.weight, ln.bias = weight, bias - cln.weight, cln.bias = weight, bias - assert torch.all(ln(image) == cln(image)) # Simulate trained weights From 9e84e42f38df22bec0d79f8d110bfc477535824b Mon Sep 17 00:00:00 2001 From: Micha Livne Date: Sun, 23 Apr 2023 16:48:30 -0700 Subject: [PATCH 23/23] 1. Debugging. --- nemo/utils/data_utils.py | 6 ------ tutorials/tts/NeMo_TTS_Primer.ipynb | 7 +++---- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/nemo/utils/data_utils.py b/nemo/utils/data_utils.py index 6479a65f1128..09da7ba93512 100644 --- a/nemo/utils/data_utils.py +++ b/nemo/utils/data_utils.py @@ -49,12 +49,6 @@ def is_datastore_path(path) -> bool: return path.startswith('ais://') -def is_tarred_path(path) -> bool: - """Check if a path is for a tarred file. - """ - return path.endswith('.tar') - - def is_datastore_cache_shared() -> bool: """Check if store cache is shared. """ diff --git a/tutorials/tts/NeMo_TTS_Primer.ipynb b/tutorials/tts/NeMo_TTS_Primer.ipynb index bcd30f804958..92d9fb60d848 100644 --- a/tutorials/tts/NeMo_TTS_Primer.ipynb +++ b/tutorials/tts/NeMo_TTS_Primer.ipynb @@ -777,11 +777,10 @@ "While raw audio shows amplitude versus time and is useful for easily recording and listening, it is not optimal when it comes to processing.\n", "\n", "For processing, it is usually preferable to represent the audio as a **spectrogram** which shows frequency versus time. Specifically, we:\n", - "\n", + "\n", "1. Group together audio samples into a much smaller set of time buckets, called **audio frames**. An audio frame will usually bucket around 50ms of audio.\n", - "2. For each audio frame, use the [Fast Fourier transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) (**FFT**) to calculate the magnitude (ie. energy, amplitude or \"loudness\") and phase (which we don't use) of each frequency bin. We refer to the magnitudes of the frequency bins as a spectrogram\n", - "3. Map the original frequency bins onto the [mel scale](https://en.wikipedia.org/wiki/Mel_scale), using overlapped [triangular filters](https://en.wikipedia.org/wiki/Window_function#Triangular_window) to create mel filterbanks.\n", - "4. Multiply the original spectrogram by the mel filterbanks to produce a mel spectrogram (for more details see [here](https://www.mathworks.com/help/audio/ref/melspectrogram.html)).\n", + "2. For each audio frame, use the [Fast Fourier transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) (**FFT**) to calculate the magnitude (ie. 
energy, amplitude or \"loudness\") and phase (which we don't use) of each frequency band (ie. pitch).\n", + "3. Translate the original frequency bands, measured in units of hertz (Hz), into units of [mel frequency](https://en.wikipedia.org/wiki/Mel_scale). The output is called a **mel spectrogram**.\n", "\n", "We then use the mel spectrogram as our final audio representation. The only thing we lose during this process is the phase information, the implications of which we will discuss more later on.\n", "\n",
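Editor's note on the mel-spectrogram steps restored in the notebook cell above: the frame → FFT-magnitude → mel-filterbank pipeline maps onto a few library calls. The snippet below is a minimal illustrative sketch using `librosa` (which scripts elsewhere in this patch series already depend on); the audio path and the `n_fft` / `hop_length` / `n_mels` / `fmax` values are assumptions for the example, not the notebook's exact settings.

```python
# Illustrative sketch of the steps described above; parameter values are assumptions.
import librosa
import numpy as np

y, sr = librosa.load("sample.wav", sr=22050)        # raw audio: amplitude vs. time

# Steps 1-2: overlapping frames + FFT; keep the magnitude, discard the phase
stft = librosa.stft(y, n_fft=1024, hop_length=256)  # complex spectrogram
magnitude = np.abs(stft)                            # shape: (1 + n_fft // 2, n_frames)

# Step 3: map linear-frequency bins onto the mel scale with a mel filterbank
mel_basis = librosa.filters.mel(sr=sr, n_fft=1024, n_mels=80, fmin=0, fmax=8000)
mel_spectrogram = np.log(np.clip(mel_basis @ (magnitude ** 2), 1e-5, None))

print(mel_spectrogram.shape)                        # (n_mels, n_frames)
```

The phase discarded at step 2 is the only information lost along the way, which is why the notebook flags it for later discussion: a vocoder eventually has to reconstruct it when turning mel spectrograms back into audio.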