diff --git a/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml new file mode 100644 index 000000000000..4e480df62e59 --- /dev/null +++ b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml @@ -0,0 +1,218 @@ +# It contains the default values for training an autoregressive FastConformer-Transformer ST model with sub-word encoding. + +# Architecture and training config: +# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective +# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. +# Here are the recommended configs for different variants of FastConformer-Transformer, other parameters are the same as in this config file. +# One extra (linear projection) layer is added between FastConformer encoder and Transformer decoder if they have different hidden sizes +# It is recommended to initialize FastConformer with ASR pre-trained encoder for better accuracy and faster convergence + +name: "FastConformer-Transformer-BPE-st" + +# Initialize model encoder with pre-trained ASR FastConformer encoder for faster convergence and improved accuracy +init_from_nemo_model: + model0: + path: ??? + include: ["preprocessor", "encoder"] + +model: + sample_rate: 16000 + label_smoothing: 0.0 + log_prediction: true # enables logging sample predictions in the output during training + + train_ds: + is_tarred: true + tarred_audio_filepaths: ??? + manifest_filepath: ??? + sample_rate: 16000 + shuffle: false + trim_silence: false + batch_size: 4 + num_workers: 8 + + validation_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: false + num_workers: 4 + pin_memory: true + use_start_end_token: true + + test_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: false + num_workers: 4 + pin_memory: true + use_start_end_token: true + + # recommend small vocab size of 128 or 256 when using 4x sub-sampling + # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "per_feature" + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + pad_value: 0.0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + # you may use lower time_masks for smaller models to have a faster convergence + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: dw_striding # vggnet or striding, vggnet may give better results but needs more memory + subsampling_factor: 8 # must be power of 2 + subsampling_conv_channels: 256 # -1 sets it to d_model + causal_downsampling: false + reduction: null + reduction_position: null + reduction_factor: 1 + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + att_context_size: [-1, -1] # -1 means unlimited context + xscaling: true # scales up the input embeddings by sqrt(d_model) + untie_biases: true # unties the biases of the TransformerXL layers + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: batch_norm + conv_context_size: null + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + transf_encoder: + num_layers: 0 + hidden_size: 512 + inner_size: 2048 + num_attention_heads: 8 + ffn_dropout: 0.1 + attn_score_dropout: 0.1 + attn_layer_dropout: 0.1 + + transf_decoder: + library: nemo + model_name: null + pretrained: false + max_sequence_length: 512 + num_token_types: 0 + embedding_dropout: 0.1 + learn_positional_encodings: false + hidden_size: 512 + inner_size: 2048 + num_layers: 6 + num_attention_heads: 4 + ffn_dropout: 0.1 + attn_score_dropout: 0.1 + attn_layer_dropout: 0.1 + hidden_act: relu + pre_ln: true + pre_ln_final_layer_norm: true + + head: + num_layers: 1 + activation: relu + log_softmax: true + dropout: 0.0 + use_transformer_init: true + + beam_search: + beam_size: 4 + len_pen: 0.0 + max_generation_delta: 50 + + optim: + name: adam + lr: 0.0001 + # optimizer arguments + betas: [0.9, 0.98] + # less necessity for weight_decay as we already have large augmentations with SpecAug + # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used + # weight decay of 0.0 with lr of 2.0 also works fine + #weight_decay: 1e-3 + + # scheduler setup + sched: + name: InverseSquareRootAnnealing + #d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 1000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + gpus: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 100 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.0 + precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 100 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_sacreBLEU" + mode: "max" + save_top_k: 3 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + + # you need to set these two to True to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null \ No newline at end of file diff --git a/examples/asr/speech_translation/speech_to_text_transformer.py b/examples/asr/speech_translation/speech_to_text_transformer.py new file mode 100644 index 000000000000..0c0882859b88 --- /dev/null +++ b/examples/asr/speech_translation/speech_to_text_transformer.py @@ -0,0 +1,70 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +# Training the model +```sh +python speech_to_text_transformer.py \ + # (Optional: --config-path= --config-name=) \ + model.train_ds.audio.tarred_audio_filepaths= \ + model.train_ds.audio_manifest_filepath= \ + model.validation_ds.manifest_filepath= \ + model.test_ds.manifest_filepath= \ + model.tokenizer.dir= \ + model.tokenizer.model_path= \ + model.tokenizer.type= \ + trainer.gpus=-1 \ + trainer.accelerator="ddp" \ + trainer.max_epochs=100 \ + model.optim.name="adamw" \ + model.optim.lr=0.001 \ + model.optim.betas=[0.9,0.999] \ + model.optim.weight_decay=0.0001 \ + model.optim.sched.warmup_steps=2000 + exp_manager.create_wandb_logger=True \ + exp_manager.wandb_logger_kwargs.name="" \ + exp_manager.wandb_logger_kwargs.project="" +``` + + +""" + +import pytorch_lightning as pl +from omegaconf import OmegaConf + +from nemo.collections.asr.models import EncDecTransfModelBPE +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="../conf/speech_translation/", config_name="fast-conformer_transformer") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**cfg.trainer) + exp_manager(trainer, cfg.get("exp_manager", None)) + asr_model = EncDecTransfModelBPE(cfg=cfg.model, trainer=trainer) + + # Initialize the weights of the model from another model, if provided via config + asr_model.maybe_init_from_pretrained_checkpoint(cfg) + trainer.fit(asr_model) + + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if asr_model.prepare_test(trainer): + trainer.test(asr_model) + + +if __name__ == '__main__': + main() diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py new file mode 100644 index 000000000000..203852b52ee9 --- /dev/null +++ b/examples/asr/speech_translation/translate_speech.py @@ -0,0 +1,210 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import json +import os +from dataclasses import dataclass, is_dataclass +from typing import List, Optional, Union + +import pytorch_lightning as pl +import torch +from omegaconf import OmegaConf + +from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig +from nemo.collections.asr.parts.utils.transcribe_utils import compute_output_filename, prepare_audio_data, setup_model +from nemo.core.config import hydra_runner +from nemo.utils import logging + +""" +Translate audio file on a single CPU/GPU. Useful for translations of moderate amounts of audio data. + +# Arguments + model_path: path to .nemo ST checkpoint + pretrained_name: name of pretrained ST model (from NGC registry) + audio_dir: path to directory with audio files + dataset_manifest: path to dataset JSON manifest file (in NeMo format) + + output_filename: Output filename where the translations will be written + batch_size: batch size during inference + + cuda: Optional int to enable or disable execution of model on certain CUDA device. + allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available + amp: Bool to decide if Automatic Mixed Precision should be used during inference + audio_type: Str filetype of the audio. Supported = wav, flac, mp3 + + overwrite_translations: Bool which when set allows repeated translations to overwrite previous results. + +# Usage +ST model can be specified by either "model_path" or "pretrained_name". +Data for translation can be defined with either "audio_dir" or "dataset_manifest". +Results are returned in a JSON manifest file. + +python translate_speech.py \ + model_path=null \ + pretrained_name=null \ + audio_dir="" \ + dataset_manifest="" \ + output_filename="" \ + batch_size=32 \ + cuda=0 \ + amp=True \ +""" + + +@dataclass +class ModelChangeConfig: + + # Sub-config for changes specific to the Conformer Encoder + conformer: ConformerChangeConfig = ConformerChangeConfig() + + +@dataclass +class TranslationConfig: + # Required configs + model_path: Optional[str] = None # Path to a .nemo file + pretrained_name: Optional[str] = None # Name of a pretrained model + audio_dir: Optional[str] = None # Path to a directory which contains audio files + dataset_manifest: Optional[str] = None # Path to dataset's JSON manifest + audio_key: str = 'audio_filepath' # Used to override the default audio key in dataset_manifest + eval_config_yaml: Optional[str] = None # Path to a yaml file of config of evaluation + + # General configs + output_filename: Optional[str] = None + batch_size: int = 32 + random_seed: Optional[int] = None # seed number going to be used in seed_everything() + + # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA + # device anyway, and do inference on CPU only if CUDA device is not found. + # If `cuda` is a negative number, inference will be on CPU only. + cuda: Optional[int] = None + allow_mps: bool = False # allow to select MPS device (Apple Silicon M-series GPU) + amp: bool = False + audio_type: str = "wav" + + # Recompute model translation, even if the output folder exists with scores. + overwrite_translations: bool = True + + # can be set to True to return list of translations instead of the config + # if True, will also skip writing anything to the output file + return_translations: bool = False + + +@hydra_runner(config_name="TranslationConfig", schema=TranslationConfig) +def main(cfg: TranslationConfig) -> Union[TranslationConfig, List[str]]: + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + for key in cfg: + cfg[key] = None if cfg[key] == 'None' else cfg[key] + + if is_dataclass(cfg): + cfg = OmegaConf.structured(cfg) + + if cfg.random_seed: + pl.seed_everything(cfg.random_seed) + + if cfg.model_path is None and cfg.pretrained_name is None: + raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!") + if cfg.audio_dir is None and cfg.dataset_manifest is None: + raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!") + + # Load augmentor from exteranl yaml file which contains eval info, could be extend to other feature such VAD, P&C + augmentor = None + if cfg.eval_config_yaml: + eval_config = OmegaConf.load(cfg.eval_config_yaml) + augmentor = eval_config.test_ds.get("augmentor") + logging.info(f"Will apply on-the-fly augmentation on samples during translation: {augmentor} ") + + # setup GPU + if cfg.cuda is None: + if torch.cuda.is_available(): + device = [0] # use 0th CUDA device + accelerator = 'gpu' + map_location = torch.device('cuda:0') + elif cfg.allow_mps and hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + logging.warning( + "MPS device (Apple Silicon M-series GPU) support is experimental." + " Env variable `PYTORCH_ENABLE_MPS_FALLBACK=1` should be set in most cases to avoid failures." + ) + device = [0] + accelerator = 'mps' + map_location = torch.device('mps') + else: + device = 1 + accelerator = 'cpu' + map_location = torch.device('cpu') + else: + device = [cfg.cuda] + accelerator = 'gpu' + map_location = torch.device(f'cuda:{cfg.cuda}') + + logging.info(f"Inference will be done on device: {map_location}") + + asr_model, model_name = setup_model(cfg, map_location) + trainer = pl.Trainer(devices=device, accelerator=accelerator) + asr_model.set_trainer(trainer) + asr_model = asr_model.eval() + + # collect additional translation information + return_hypotheses = False + + # prepare audio filepaths and decide wether it's partial audio + filepaths, partial_audio = prepare_audio_data(cfg) + + # setup AMP (optional) + if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): + logging.info("AMP enabled!\n") + autocast = torch.cuda.amp.autocast + else: + + @contextlib.contextmanager + def autocast(): + yield + + # Compute output filename + cfg = compute_output_filename(cfg, model_name) + + # if translations should not be overwritten, and already exists, skip re-translation step and return + if not cfg.return_translations and not cfg.overwrite_translations and os.path.exists(cfg.output_filename): + logging.info( + f"Previous translations found at {cfg.output_filename}, and flag `overwrite_translations`" + f"is {cfg.overwrite_translations}. Returning without re-translating text." + ) + return cfg + + # translate audio + with autocast(): + with torch.no_grad(): + translations = asr_model.translate( + paths2audio_files=filepaths, batch_size=cfg.batch_size, return_hypotheses=return_hypotheses, + ) + + logging.info(f"Finished translating {len(filepaths)} files !") + logging.info(f"Writing translations into file: {cfg.output_filename}") + + if cfg.return_translations: + return translations + + # write audio translations + with open(cfg.output_filename, 'w', encoding='utf-8', newline='\n') as f: + for filepath, translation in zip(filepaths, translations): + item = {'audio_filepath': filepath, 'pred_translation': translation} + f.write(json.dumps(item, ensure_ascii=False) + "\n") + logging.info(f"Finished writing predictions to {cfg.output_filename}!") + + return cfg + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py index a7275faea3d0..34f2c4f62e29 100644 --- a/nemo/collections/asr/models/__init__.py +++ b/nemo/collections/asr/models/__init__.py @@ -33,3 +33,4 @@ from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel from nemo.collections.asr.models.slu_models import SLUIntentSlotBPEModel from nemo.collections.asr.models.ssl_models import SpeechEncDecSelfSupervisedModel +from nemo.collections.asr.models.transformer_bpe_models import EncDecTransfModelBPE diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py new file mode 100644 index 000000000000..178746795ae8 --- /dev/null +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -0,0 +1,614 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import json +import os +import tempfile +from math import ceil +from typing import Dict, List, Optional, Union + +import editdistance +import torch +import torch.distributed as dist +from omegaconf import DictConfig, OmegaConf +from pytorch_lightning import Trainer +from tqdm.auto import tqdm + +from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs +from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel +from nemo.collections.asr.parts.mixins import ASRBPEMixin +from nemo.collections.common.losses import SmoothedCrossEntropyLoss +from nemo.collections.common.metrics import GlobalAverageLossMetric +from nemo.collections.common.parts import transformer_weights_init + +from nemo.core.classes.common import typecheck +from nemo.core.neural_types import ( + AudioSignal, + ChannelType, + LabelsType, + LengthsType, + LogprobsType, + MaskType, + NeuralType, + SpectrogramType, +) +from nemo.utils import logging + +try: + from sacrebleu import corpus_bleu + from nemo.collections.nlp.modules.common import TokenClassifier + from nemo.collections.nlp.modules.common.lm_utils import get_transformer + from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + + NLP_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + NLP_AVAILABLE = False + logging.warning("Could not import NeMo NLP collection which is required for speech translation model.") + +__all__ = ['EncDecTransfModelBPE'] + + +def lens_to_mask(lens, max_length): + batch_size = lens.shape[0] + mask = torch.arange(max_length).repeat(batch_size, 1).to(lens.device) < lens[:, None] + return mask + + +class EncDecTransfModelBPE(ASRModel, ExportableEncDecModel, ASRBPEMixin): + """Base class for encoder decoder CTC-based models.""" + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + + if 'tokenizer' not in cfg: + raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") + + # Setup the tokenizer + self._setup_tokenizer(cfg.tokenizer) + + super().__init__(cfg=cfg, trainer=trainer) + + # Setup audio preprocessor + self.preprocessor = EncDecTransfModelBPE.from_config_dict(self.cfg.preprocessor) + + # Setup audio encoder + self.encoder = EncDecTransfModelBPE.from_config_dict(self.cfg.encoder) + + # Add projection layer if encoder and decoder differ in hidden size + if self.cfg.encoder['d_model'] != self.cfg.transf_decoder['hidden_size']: + self.adapter = torch.nn.Linear(self.cfg.encoder['d_model'], self.cfg.transf_decoder['hidden_size']) + else: + self.adapter = torch.nn.Identity() + + transf_encoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_encoder')) + + # Whether to add Transformer Encoder block between Conformer and Transformer Decoder + self.use_transf_encoder = False + if transf_encoder_cfg_dict['num_layers'] > 0: + self.use_transf_encoder = True + + self.transf_encoder = TransformerEncoder( + num_layers=transf_encoder_cfg_dict['num_layers'], + hidden_size=transf_encoder_cfg_dict['hidden_size'], + inner_size=transf_encoder_cfg_dict['inner_size'], + mask_future=False, + num_attention_heads=transf_encoder_cfg_dict['num_attention_heads'], + attn_score_dropout=transf_encoder_cfg_dict['attn_score_dropout'], + attn_layer_dropout=transf_encoder_cfg_dict['attn_layer_dropout'], + ffn_dropout=transf_encoder_cfg_dict['ffn_dropout'], + pre_ln=transf_encoder_cfg_dict.get('pre_ln', True), + pre_ln_final_layer_norm=transf_encoder_cfg_dict.get('pre_ln_final_layer_norm', True), + ) + std_init_range = 1 / transf_encoder_cfg_dict['hidden_size'] ** 0.5 + self.transf_encoder.apply(lambda module: transformer_weights_init(module, std_init_range)) + + transf_decoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_decoder')) + + # Transformer decoder + vocab_size = 8 * ceil(self.tokenizer.vocab_size / 8) + transf_decoder_cfg_dict['vocab_size'] = vocab_size + library = transf_decoder_cfg_dict.pop('library', 'nemo') + model_name = transf_decoder_cfg_dict.pop('model_name', None) + pretrained = transf_decoder_cfg_dict.pop('pretrained', False) + self.transf_decoder = get_transformer( + library=library, + model_name=model_name, + pretrained=pretrained, + config_dict=transf_decoder_cfg_dict, + encoder=False, + pre_ln_final_layer_norm=transf_decoder_cfg_dict.get("pre_ln_final_layer_norm", False), + ) + + self.log_softmax = TokenClassifier( + hidden_size=self.transf_decoder.hidden_size, + num_classes=vocab_size, + activation=self.cfg.head.activation, + log_softmax=self.cfg.head.log_softmax, + dropout=self.cfg.head.dropout, + use_transformer_init=self.cfg.head.use_transformer_init, + ) + self.log_softmax.mlp.layer0.weight = self.transf_decoder.embedding.token_embedding.weight + std_init_range = 1 / self.transf_decoder.hidden_size ** 0.5 + self.transf_decoder.apply(lambda module: transformer_weights_init(module, std_init_range)) + self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range)) + + # Beam Search decoding + self.beam_search = BeamSearchSequenceGenerator( + embedding=self.transf_decoder.embedding, + decoder=self.transf_decoder.decoder, + log_softmax=self.log_softmax, + max_sequence_length=self.transf_decoder.max_sequence_length, + beam_size=self.cfg.beam_search.beam_size, + bos=self.tokenizer.bos_id, + pad=self.tokenizer.pad_id, + eos=self.tokenizer.eos_id, + len_pen=self.cfg.beam_search.len_pen, + max_delta_length=self.cfg.beam_search.max_generation_delta, + ) + + # Define autoregressive CE loss + self.transf_loss = SmoothedCrossEntropyLoss( + pad_id=self.tokenizer.pad_id, label_smoothing=self.cfg.label_smoothing + ) + + if hasattr(self.cfg, 'spec_augment') and self.cfg.spec_augment is not None: + self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self.cfg.spec_augment) + else: + self.spec_augmentation = None + + self.val_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True) + + @torch.no_grad() + def translate( + self, + paths2audio_files: List[str], + batch_size: int = 4, + logprobs: bool = False, + return_hypotheses: bool = False, + ) -> List[str]: + hypotheses = self.transcribe(paths2audio_files, batch_size, logprobs, return_hypotheses) + return hypotheses + + @torch.no_grad() + def transcribe( + self, + paths2audio_files: List[str], + batch_size: int = 4, + logprobs: bool = False, + return_hypotheses: bool = False, + ) -> List[str]: + """ + Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. + Args: + paths2audio_files: (a list) of paths to audio files. \ + Recommended length per file is between 5 and 25 seconds. \ + But it is possible to pass a few hours long file if enough GPU memory is available. + batch_size: (int) batch size to use during inference. + Bigger will result in better throughput performance but would use more memory. + logprobs: (bool) pass True to get log probabilities instead of transcripts. + return_hypotheses: (bool) Either return hypotheses or text + With hypotheses can do some postprocessing like getting timestamp or rescoring + Returns: + A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files + """ + if paths2audio_files is None or len(paths2audio_files) == 0: + return {} + + if return_hypotheses and logprobs: + raise ValueError( + "Either `return_hypotheses` or `logprobs` can be True at any given time." + "Returned hypotheses will contain the logprobs." + ) + + # We will store transcriptions here + hypotheses = [] + + # Model's mode and device + mode = self.training + device = next(self.parameters()).device + dither_value = self.preprocessor.featurizer.dither + pad_to_value = self.preprocessor.featurizer.pad_to + + try: + self.preprocessor.featurizer.dither = 0.0 + self.preprocessor.featurizer.pad_to = 0 + # Switch model to evaluation mode + self.eval() + # Freeze the encoder and decoder modules + self.encoder.freeze() + self.transf_decoder.freeze() + logging_level = logging.get_verbosity() + logging.set_verbosity(logging.WARNING) + # Work in tmp directory - will store manifest file there + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp: + for audio_file in paths2audio_files: + entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'} + fp.write(json.dumps(entry) + '\n') + + config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir} + + temporary_datalayer = self._setup_transcribe_dataloader(config) + for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): + log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) + ) + + beam_hypotheses = ( + self.beam_search( + encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False + ) + .detach() + .cpu() + .numpy() + ) + beam_hypotheses = [self.tokenizer.ids_to_text(hyp) for hyp in beam_hypotheses] + + if return_hypotheses: + # dump log probs per file + for idx in range(logits.shape[0]): + current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] + + hypotheses += beam_hypotheses + + del test_batch, log_probs, encoded_len, enc_states, enc_mask + finally: + # set mode back to its original value + self.train(mode=mode) + self.preprocessor.featurizer.dither = dither_value + self.preprocessor.featurizer.pad_to = pad_to_value + if mode is True: + self.encoder.unfreeze() + self.transf_decoder.unfreeze() + logging.set_verbosity(logging_level) + + return hypotheses + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + + dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( + config=config, + local_rank=self.local_rank, + global_rank=self.global_rank, + world_size=self.world_size, + tokenizer=self.tokenizer, + preprocessor_cfg=self.cfg.get("preprocessor", None), + ) + + if dataset is None: + return None + + shuffle = config['shuffle'] + if config.get('is_tarred', False): + shuffle = False + + if hasattr(dataset, 'collate_fn'): + collate_fn = dataset.collate_fn + else: + collate_fn = dataset.datasets[0].collate_fn + + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=shuffle, + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + + def setup_training_data(self, train_data_config: Optional[DictConfig]): + + # create audio-only data loader + self._update_dataset_config(dataset_name='train', config=train_data_config) + self._train_dl = self._setup_dataloader_from_config(config=train_data_config) + + # Need to set this because if using an IterableDataset, the length of the + # dataloader is the total number of samples rather than the number of batches, + # and this messes up the tqdm progress bar. So we set the number of steps manually + # (to the correct number) to fix this. + if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + # We also need to check if limit_train_batches is already set. + # If it's an int, we assume that the user has set it to something sane, + # i.e. <= # training batches, and don't change it. Otherwise, adjust + # batches accordingly if it's a float (including 1.0). + if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): + self._trainer.limit_train_batches = int( + self._trainer.limit_train_batches + * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) + ) + elif self._trainer is None: + logging.warning( + "Model Trainer was not set before constructing the dataset, incorrect number of " + "training batches will be used. Please set the trainer and rebuild the dataset." + ) + + def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the validation data loader via a Dict-like object. + Args: + val_data_config: A config that contains the information regarding construction + of an ASR Training dataset. + Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` + """ + if 'shuffle' not in val_data_config: + val_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='validation', config=val_data_config) + self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) + + def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the test data loader via a Dict-like object. + Args: + test_data_config: A config that contains the information regarding construction + of an ASR Training dataset. + Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` + """ + if 'shuffle' not in test_data_config: + test_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='test', config=test_data_config) + self._test_dl = self._setup_dataloader_from_config(config=test_data_config) + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + if hasattr(self.preprocessor, '_sample_rate'): + input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) + else: + input_signal_eltype = AudioSignal() + return { + "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), + "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), + "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "transcript": NeuralType(('B', 'T'), LabelsType(), optional=True), + "transcript_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "transf_log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()), + "encoded_lengths": NeuralType(tuple('B'), LengthsType()), + "encoder_states": NeuralType(('B', 'T', 'D'), ChannelType()), + "encoder_mask": NeuralType(('B', 'T'), MaskType()), + } + + @typecheck() + def forward( + self, + input_signal=None, + input_signal_length=None, + processed_signal=None, + processed_signal_length=None, + transcript=None, + transcript_length=None, + ): + """ + Forward pass of the model. + Args: + input_signal: Tensor that represents a batch of raw audio signals, + of shape [B, T]. T here represents timesteps, with 1 second of audio represented as + `self.sample_rate` number of floating point values. + input_signal_length: Vector of length B, that contains the individual lengths of the audio + sequences. + processed_signal: Tensor that represents a batch of processed audio signals, + of shape (B, D, T) that has undergone processing via some DALI preprocessor. + processed_signal_length: Vector of length B, that contains the individual lengths of the + processed audio sequences. + Returns: + A tuple of 3 elements - + 1) The log probabilities tensor of shape [B, T, D]. + 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. + 3) The greedy token predictions of the model of shape [B, T] (via argmax) + """ + has_input_signal = input_signal is not None and input_signal_length is not None + has_processed_signal = processed_signal is not None and processed_signal_length is not None + if (has_input_signal ^ has_processed_signal) == False: + raise ValueError( + f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " + " with ``processed_signal`` and ``processed_signal_len`` arguments." + ) + + if not has_processed_signal: + processed_signal, processed_signal_length = self.preprocessor( + input_signal=input_signal, length=input_signal_length + ) + + if self.spec_augmentation is not None and self.training: + processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) + + encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) + + enc_states = encoded.permute(0, 2, 1) + enc_states = self.adapter(enc_states) + enc_mask = lens_to_mask(encoded_len, enc_states.shape[1]).to(enc_states.dtype) + if self.use_transf_encoder: + enc_states = self.transf_encoder(encoder_states=enc_states, encoder_mask=enc_mask) + + transf_log_probs = None + if transcript is not None: + dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype) + dec_states = self.transf_decoder( + input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask + ) + transf_log_probs = self.log_softmax(hidden_states=dec_states) + + return transf_log_probs, encoded_len, enc_states, enc_mask + + def compute_audio_loss(self, batch): + + if batch is None: + return 0 + + signal, signal_len, transcript, transcript_len = batch + input_ids, labels = transcript[:, :-1], transcript[:, 1:] + + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=signal, + input_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + + transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) + + return transf_loss + + # PTL-specific methods + def training_step(self, batch, batch_nb): + + audio_loss = self.compute_audio_loss(batch) + + tensorboard_logs = { + 'train_loss': audio_loss, + 'learning_rate': self._optimizer.param_groups[0]['lr'], + } + + return {'loss': audio_loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): + signal, signal_len, transcript, transcript_len = batch + input_ids, labels = transcript[:, :-1], transcript[:, 1:] + + if isinstance(batch, DALIOutputs) and batch.has_processed_signal: + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + processed_signal=signal, + processed_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + else: + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=signal, + input_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + + beam_hypotheses = self.beam_search( + encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False + ) + transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) + + ground_truths = [self.tokenizer.ids_to_text(sent) for sent in transcript.detach().cpu().tolist()] + translations = [self.tokenizer.ids_to_text(sent) for sent in beam_hypotheses.detach().cpu().tolist()] + + self.val_loss(loss=transf_loss, num_measurements=transf_log_probs.shape[0] * transf_log_probs.shape[1]) + + return {f'{eval_mode}_loss': transf_loss, 'translations': translations, 'ground_truths': ground_truths} + + def test_step(self, batch, batch_idx, dataloader_idx=0): + return self.validation_step(batch, batch_idx, dataloader_idx, eval_mode="test") + + def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0, eval_mode: str = "val"): + """ + Called at the end of validation to aggregate outputs. + :param outputs: list of individual outputs of each validation step. + """ + if not outputs: + return + + if isinstance(outputs[0], dict): + outputs = [outputs] + + for output in outputs: + eval_loss = getattr(self, 'val_loss').compute() + translations = list(itertools.chain(*[x['translations'] for x in output])) + ground_truths = list(itertools.chain(*[x['ground_truths'] for x in output])) + + # Gather translations and ground truths from all workers + tr_and_gt = [None for _ in range(self.world_size)] + # we also need to drop pairs where ground truth is an empty string + if self.world_size > 1: + dist.all_gather_object( + tr_and_gt, [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] + ) + else: + tr_and_gt[0] = [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] + + if self.global_rank == 0: + _translations = [] + _ground_truths = [] + for rank in range(0, self.world_size): + _translations += [t for (t, g) in tr_and_gt[rank]] + _ground_truths += [g for (t, g) in tr_and_gt[rank]] + + sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="13a") + sb_score = sacre_bleu.score * self.world_size + + wer_scores, wer_words = 0, 0 + for h, r in zip(_translations, _ground_truths): + wer_words += len(r.split()) + wer_scores += editdistance.eval(h.split(), r.split()) + wer_score = 1.0 * wer_scores * self.world_size / wer_words + + else: + sb_score = 0.0 + wer_score = 0.0 + + self.log(f"{eval_mode}_loss", eval_loss, sync_dist=True) + self.log(f"{eval_mode}_sacreBLEU", sb_score, sync_dist=True) + self.log(f"{eval_mode}_WER", wer_score, sync_dist=True) + self.val_loss.reset() + + def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_validation_epoch_end(outputs, dataloader_idx, eval_mode="test") + + def test_dataloader(self): + if self._test_dl is not None: + return self._test_dl + + def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': + """ + Setup function for a temporary data loader which wraps the provided audio file. + Args: + config: A python dictionary which contains the following keys: + paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ + Recommended length per file is between 5 and 25 seconds. + batch_size: (int) batch size to use during inference. \ + Bigger will result in better throughput performance but would use more memory. + temp_dir: (str) A temporary directory where the audio manifest is temporarily + stored. + Returns: + A pytorch DataLoader for the given audio file(s). + """ + batch_size = min(config['batch_size'], len(config['paths2audio_files'])) + dl_config = { + 'manifest_filepath': os.path.join(config['temp_dir'], 'manifest.json'), + 'sample_rate': self.preprocessor._sample_rate, + 'batch_size': batch_size, + 'trim_silence': False, + 'shuffle': False, + 'num_workers': min(batch_size, os.cpu_count() - 1), + 'pin_memory': True, + } + + temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) + return temporary_datalayer diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py index 0ab0cb784273..906154213ea1 100644 --- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py @@ -299,7 +299,7 @@ def create_spt_model( byte_fallback: If , fallback to a byte sequence of the character. split_digits: If true, digits are split into individual tokens. split_by_whitespace: Whether to respect white space while creating subwords. If False, will learn merges across whitespace. - split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ) + split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ) """ if not data_file or not os.path.exists(data_file):