From ed71fbf53e503a4460aec0e128fc10300f7d01b8 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Mon, 3 Jul 2023 16:56:55 -0700 Subject: [PATCH 01/14] st standalone model Signed-off-by: AlexGrinch --- .../speech_translation_transf.yaml | 235 +++++++ .../speech_to_text_transf.py | 78 +++ nemo/collections/asr/models/__init__.py | 1 + .../asr/models/transformer_bpe_models.py | 611 ++++++++++++++++++ nemo/collections/asr/parts/mixins/mixins.py | 36 +- .../tokenizers/sentencepiece_tokenizer.py | 2 +- 6 files changed, 958 insertions(+), 5 deletions(-) create mode 100644 examples/asr/conf/transformer_dec/speech_translation_transf.yaml create mode 100644 examples/asr/speech_translation/speech_to_text_transf.py create mode 100644 nemo/collections/asr/models/transformer_bpe_models.py diff --git a/examples/asr/conf/transformer_dec/speech_translation_transf.yaml b/examples/asr/conf/transformer_dec/speech_translation_transf.yaml new file mode 100644 index 000000000000..c7bfe739b46c --- /dev/null +++ b/examples/asr/conf/transformer_dec/speech_translation_transf.yaml @@ -0,0 +1,235 @@ +# It contains the default values for training a Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. + +# Architecture and training config: +# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective +# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. +# Here are the recommended configs for different variants of Conformer-CTC, other parameters are the same as in this config file. +# One extra layer (compared to original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one. +# +# +-------------+---------+---------+----------+------------+-----+ +# | Model | d_model | n_heads | n_layers | time_masks | lr | +# +=============+=========+========+===========+============+=====+ +# | Small (13M)| 176 | 4 | 16 | 5 | 5.0 | +# +-------------+---------+--------+-----------+------------+-----+ +# | Medium (30M)| 256 | 4 | 18 | 5 | 5.0 | +# +-------------+---------+--------+-----------+------------+-----+ +# | Large (121M)| 512 | 8 | 18 | 10 | 2.0 | +# +---------------------------------------------------------------+ +# +# If you do not want to train with AMP, you may use weight decay of 0.0 or reduce the number of time maskings to 2 +# with time_width=100. It may help when you want to train for fewer epochs and need faster convergence. +# With weight_decay=0.0, learning rate may need to get reduced to 2.0. + +# You may find more info about Conformer-CTC here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-ctc +# Pre-trained models of Conformer-CTC can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html +# The checkpoint of the large model trained on LibriSpeech with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls + +name: "Conformer-Transformer-BPE-st" + +model: + sample_rate: 16000 + label_smoothing: 0.0 + log_prediction: true # enables logging sample predictions in the output during training + + train_ds: + is_tarred: true + tarred_audio_filepaths: ??? + manifest_filepath: ??? + sample_rate: 16000 + shuffle: false + trim_silence: false + batch_size: 4 + num_workers: 8 + + validation_ds: + manifest_filepath: ??? 
+ sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: false + num_workers: 4 + pin_memory: true + use_start_end_token: true + + test_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: false + num_workers: 4 + pin_memory: true + use_start_end_token: true + + # recommend small vocab size of 128 or 256 when using 4x sub-sampling + # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "per_feature" + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + pad_value: 0.0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + # you may use lower time_masks for smaller models to have a faster convergence + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: dw_striding # vggnet or striding, vggnet may give better results but needs more memory + subsampling_factor: 8 # must be power of 2 + subsampling_conv_channels: 256 # -1 sets it to d_model + causal_downsampling: false + reduction: null + reduction_position: null + reduction_factor: 1 + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + att_context_size: [-1, -1] # -1 means unlimited context + xscaling: true # scales up the input embeddings by sqrt(d_model) + untie_biases: true # unties the biases of the TransformerXL layers + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: batch_norm + conv_context_size: null + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + transf_encoder: + num_layers: 0 + hidden_size: 512 + inner_size: 2048 + num_attention_heads: 8 + ffn_dropout: 0.1 + attn_score_dropout: 0.1 + attn_layer_dropout: 0.1 + + transf_decoder: + library: nemo + model_name: null + pretrained: false + max_sequence_length: 512 + num_token_types: 0 + embedding_dropout: 0.1 + learn_positional_encodings: false + hidden_size: 512 + inner_size: 2048 + num_layers: 6 + num_attention_heads: 4 + ffn_dropout: 0.1 + attn_score_dropout: 0.1 + attn_layer_dropout: 0.1 + hidden_act: relu + pre_ln: true + pre_ln_final_layer_norm: true + + head: + num_layers: 1 + activation: relu + log_softmax: true + dropout: 0.0 + use_transformer_init: true + + beam_search: 
+ beam_size: 4 + len_pen: 0.0 + max_generation_delta: 50 + + optim: + name: adam + lr: 0.0001 + # optimizer arguments + betas: [0.9, 0.98] + # less necessity for weight_decay as we already have large augmentations with SpecAug + # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used + # weight decay of 0.0 with lr of 2.0 also works fine + #weight_decay: 1e-3 + + # scheduler setup + sched: + name: InverseSquareRootAnnealing + #d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 1000 + warmup_ratio: null + min_lr: 1e-6 + +# Initialize model encoder with pre-trained ASR FastConformer encoder for faster convergence and improved accuracy +init_from_nemo_model: + model0: + path: ??? + include: ["preprocessor", "encoder"] + +trainer: + gpus: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 100 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.0 + precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 100 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_sacreBLEU" + mode: "max" + save_top_k: 3 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + + # you need to set these two to True to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null \ No newline at end of file diff --git a/examples/asr/speech_translation/speech_to_text_transf.py b/examples/asr/speech_translation/speech_to_text_transf.py new file mode 100644 index 000000000000..ce3e657365a7 --- /dev/null +++ b/examples/asr/speech_translation/speech_to_text_transf.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +# Training the model +```sh +python speech_to_text_transf.py \ + # (Optional: --config-path= --config-name=) \ + model.train_ds.text.tar_files= \ + model.train_ds.text.metadata_file= \ + model.train_ds.audio.tarred_audio_filepaths= \ + model.train_ds.audio_manifest_filepath= \ + model.validation_ds.manifest_filepath= \ + model.test_ds.manifest_filepath= \ + model.tokenizer.dir= \ + model.tokenizer.model_path= \ + model.tokenizer.type= \ + model.encoder_tokenizer.tokenizer_model= \ + model.encoder_tokenizer.vocab_file= \ + model.decoder_tokenizer.tokenizer_model= \ + trainer.gpus=-1 \ + trainer.accelerator="ddp" \ + trainer.max_epochs=100 \ + model.optim.name="adamw" \ + model.optim.lr=0.001 \ + model.optim.betas=[0.9,0.999] \ + model.optim.weight_decay=0.0001 \ + model.optim.sched.warmup_steps=2000 + exp_manager.create_wandb_logger=True \ + exp_manager.wandb_logger_kwargs.name="" \ + exp_manager.wandb_logger_kwargs.project="" +``` + + +""" + +from collections import OrderedDict + +import pytorch_lightning as pl +import torch +from omegaconf import OmegaConf + +from nemo.collections.asr.models import EncDecTransfModelBPE +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="../conf/transformer_dec/", config_name="speech_translation_transf_test") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**cfg.trainer) + exp_manager(trainer, cfg.get("exp_manager", None)) + asr_model = EncDecTransfModelBPE(cfg=cfg.model, trainer=trainer) + + # Initialize the weights of the model from another model, if provided via config + asr_model.maybe_init_from_pretrained_checkpoint(cfg) + trainer.fit(asr_model) + + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if asr_model.prepare_test(trainer): + trainer.test(asr_model) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py index a7275faea3d0..34f2c4f62e29 100644 --- a/nemo/collections/asr/models/__init__.py +++ b/nemo/collections/asr/models/__init__.py @@ -33,3 +33,4 @@ from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel from nemo.collections.asr.models.slu_models import SLUIntentSlotBPEModel from nemo.collections.asr.models.ssl_models import SpeechEncDecSelfSupervisedModel +from nemo.collections.asr.models.transformer_bpe_models import EncDecTransfModelBPE diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py new file mode 100644 index 000000000000..b1a40d937a0c --- /dev/null +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -0,0 +1,611 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import itertools +import json +import os +import random +import tempfile +from math import ceil +from typing import Dict, List, Optional, Union + +import editdistance +import torch +import torch.distributed as dist +from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict +from pytorch_lightning import Trainer +from sacrebleu import corpus_bleu +from torch.utils.data import ChainDataset, DataLoader +from tqdm.auto import tqdm + +from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _speech_collate_fn +from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs +from nemo.collections.asr.losses.ctc import CTCLoss +from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel +from nemo.collections.asr.parts.features import clean_spectrogram_batch, normalize_batch +from nemo.collections.asr.parts.mixins import ASRBPEMixin +from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations +from nemo.collections.common.data import ConcatDataset +from nemo.collections.common.losses import NLLLoss, SmoothedCrossEntropyLoss +from nemo.collections.common.metrics import GlobalAverageLossMetric +from nemo.collections.common.parts import transformer_weights_init +from nemo.collections.nlp.models.machine_translation import MTEncDecModel +from nemo.collections.nlp.modules.common import TokenClassifier +from nemo.collections.nlp.modules.common.lm_utils import get_transformer +from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder +from nemo.collections.tts.models import FastPitchModel, SpectrogramEnhancerModel +from nemo.core.classes.common import PretrainedModelInfo, typecheck +from nemo.core.neural_types import ( + AudioSignal, + ChannelType, + LabelsType, + LengthsType, + LogprobsType, + MaskType, + NeuralType, + SpectrogramType, +) +from nemo.utils import logging + +__all__ = ['EncDecTransfModelBPE'] + + +def lens_to_mask(lens, max_length): + batch_size = lens.shape[0] + mask = torch.arange(max_length).repeat(batch_size, 1).to(lens.device) < lens[:, None] + return mask + + +class EncDecTransfModelBPE(ASRModel, ExportableEncDecModel, ASRBPEMixin): + """Base class for encoder decoder CTC-based models.""" + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + + if 'tokenizer' not in cfg: + raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") + + # Setup the tokenizer + self._setup_tokenizer(cfg.tokenizer) + + super().__init__(cfg=cfg, trainer=trainer) + + # Setup audio preprocessor + self.preprocessor = EncDecTransfModelBPE.from_config_dict(self._cfg.preprocessor) + + # Setup audio encoder + self.encoder = EncDecTransfModelBPE.from_config_dict(self._cfg.encoder) + + # Add projection layer if encoder and decoder differ in hidden size + if self._cfg.encoder['d_model'] != self._cfg.transf_decoder['hidden_size']: + self.adapter = torch.nn.Linear(self._cfg.encoder['d_model'], self._cfg.transf_decoder['hidden_size']) + else: + self.adapter = lambda x: x + + transf_encoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_encoder')) + + # Whether to add Transformer Encoder block between Conformer and Transformer Decoder + self.use_transf_encoder = False + if transf_encoder_cfg_dict['num_layers'] > 0: + self.use_transf_encoder = True + + self.transf_encoder = TransformerEncoder( + 
num_layers=transf_encoder_cfg_dict['num_layers'], + hidden_size=transf_encoder_cfg_dict['hidden_size'], + inner_size=transf_encoder_cfg_dict['inner_size'], + mask_future=False, + num_attention_heads=transf_encoder_cfg_dict['num_attention_heads'], + attn_score_dropout=transf_encoder_cfg_dict['attn_score_dropout'], + attn_layer_dropout=transf_encoder_cfg_dict['attn_layer_dropout'], + ffn_dropout=transf_encoder_cfg_dict['ffn_dropout'], + pre_ln=transf_encoder_cfg_dict.get('pre_ln', True), + pre_ln_final_layer_norm=transf_encoder_cfg_dict.get('pre_ln_final_layer_norm', True), + ) + std_init_range = 1 / transf_encoder_cfg_dict['hidden_size'] ** 0.5 + self.transf_encoder.apply(lambda module: transformer_weights_init(module, std_init_range)) + + transf_decoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_decoder')) + + # Transformer decoder + vocab_size = 8 * ceil(self.tokenizer.vocab_size / 8) + transf_decoder_cfg_dict['vocab_size'] = vocab_size + library = transf_decoder_cfg_dict.pop('library', 'nemo') + model_name = transf_decoder_cfg_dict.pop('model_name', None) + pretrained = transf_decoder_cfg_dict.pop('pretrained', False) + checkpoint_file = transf_decoder_cfg_dict.pop('checkpoint_file', None) + self.transf_decoder = get_transformer( + library=library, + model_name=model_name, + pretrained=pretrained, + config_dict=transf_decoder_cfg_dict, + encoder=False, + pre_ln_final_layer_norm=transf_decoder_cfg_dict.get("pre_ln_final_layer_norm", False), + ) + + self.log_softmax = TokenClassifier( + hidden_size=self.transf_decoder.hidden_size, + num_classes=vocab_size, + activation=self._cfg.head.activation, + log_softmax=self._cfg.head.log_softmax, + dropout=self._cfg.head.dropout, + use_transformer_init=self._cfg.head.use_transformer_init, + ) + self.log_softmax.mlp.layer0.weight = self.transf_decoder.embedding.token_embedding.weight + std_init_range = 1 / self.transf_decoder.hidden_size ** 0.5 + self.transf_decoder.apply(lambda module: transformer_weights_init(module, std_init_range)) + self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range)) + + # Beam Search decoding + self.beam_search = BeamSearchSequenceGenerator( + embedding=self.transf_decoder.embedding, + decoder=self.transf_decoder.decoder, + log_softmax=self.log_softmax, + max_sequence_length=self.transf_decoder.max_sequence_length, + beam_size=self._cfg.beam_search.beam_size, + bos=self.tokenizer.bos_id, + pad=self.tokenizer.pad_id, + eos=self.tokenizer.eos_id, + len_pen=self._cfg.beam_search.len_pen, + max_delta_length=self._cfg.beam_search.max_generation_delta, + ) + + # Define autoregressive CE loss + self.transf_loss = SmoothedCrossEntropyLoss( + pad_id=self.tokenizer.pad_id, label_smoothing=self._cfg.label_smoothing + ) + + if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None: + self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self._cfg.spec_augment) + else: + self.spec_augmentation = None + + self.val_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True) + + @torch.no_grad() + def transcribe( + self, + paths2audio_files: List[str], + batch_size: int = 4, + logprobs: bool = False, + return_hypotheses: bool = False, + ) -> List[str]: + """ + Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. + Args: + paths2audio_files: (a list) of paths to audio files. \ + Recommended length per file is between 5 and 25 seconds. 
\ + But it is possible to pass a few hours long file if enough GPU memory is available. + batch_size: (int) batch size to use during inference. + Bigger will result in better throughput performance but would use more memory. + logprobs: (bool) pass True to get log probabilities instead of transcripts. + return_hypotheses: (bool) Either return hypotheses or text + With hypotheses can do some postprocessing like getting timestamp or rescoring + Returns: + A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files + """ + if paths2audio_files is None or len(paths2audio_files) == 0: + return {} + + if return_hypotheses and logprobs: + raise ValueError( + "Either `return_hypotheses` or `logprobs` can be True at any given time." + "Returned hypotheses will contain the logprobs." + ) + + # We will store transcriptions here + hypotheses = [] + + # Model's mode and device + mode = self.training + device = next(self.parameters()).device + dither_value = self.preprocessor.featurizer.dither + pad_to_value = self.preprocessor.featurizer.pad_to + + try: + self.preprocessor.featurizer.dither = 0.0 + self.preprocessor.featurizer.pad_to = 0 + # Switch model to evaluation mode + self.eval() + # Freeze the encoder and decoder modules + self.encoder.freeze() + self.transf_decoder.freeze() + logging_level = logging.get_verbosity() + logging.set_verbosity(logging.WARNING) + # Work in tmp directory - will store manifest file there + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp: + for audio_file in paths2audio_files: + entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'} + fp.write(json.dumps(entry) + '\n') + + config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir} + + temporary_datalayer = self._setup_transcribe_dataloader(config) + for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): + ctc_lp, _, encoded_len, predictions, enc_states, enc_mask = self.forward( + input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) + ) + + beam_hypotheses = ( + self.beam_search( + encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False + ) + .detach() + .cpu() + .numpy() + ) + beam_hypotheses = [self.tokenizer.ids_to_text(hyp) for hyp in beam_hypotheses] + + if return_hypotheses: + # dump log probs per file + for idx in range(logits.shape[0]): + current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] + + hypotheses += beam_hypotheses + + del test_batch + finally: + # set mode back to its original value + self.train(mode=mode) + self.preprocessor.featurizer.dither = dither_value + self.preprocessor.featurizer.pad_to = pad_to_value + if mode is True: + self.encoder.unfreeze() + self.transf_decoder.unfreeze() + logging.set_verbosity(logging_level) + + return hypotheses + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + + dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( + config=config, + local_rank=self.local_rank, + global_rank=self.global_rank, + world_size=self.world_size, + tokenizer=self.tokenizer, + preprocessor_cfg=self.cfg.get("preprocessor", None), + ) + + if dataset is None: + return None + + shuffle = config['shuffle'] + if config.get('is_tarred', False): + shuffle = False + + if hasattr(dataset, 'collate_fn'): + collate_fn = dataset.collate_fn + else: + collate_fn = dataset.datasets[0].collate_fn + + return 
torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=shuffle, + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + + def setup_training_data(self, train_data_config: Optional[DictConfig]): + + # create audio-only data loader + self._update_dataset_config(dataset_name='train', config=train_data_config) + self._train_dl = self._setup_dataloader_from_config(config=train_data_config) + + # Need to set this because if using an IterableDataset, the length of the + # dataloader is the total number of samples rather than the number of batches, + # and this messes up the tqdm progress bar. So we set the number of steps manually + # (to the correct number) to fix this. + if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + # We also need to check if limit_train_batches is already set. + # If it's an int, we assume that the user has set it to something sane, + # i.e. <= # training batches, and don't change it. Otherwise, adjust + # batches accordingly if it's a float (including 1.0). + if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): + self._trainer.limit_train_batches = int( + self._trainer.limit_train_batches + * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) + ) + elif self._trainer is None: + logging.warning( + "Model Trainer was not set before constructing the dataset, incorrect number of " + "training batches will be used. Please set the trainer and rebuild the dataset." + ) + + def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the validation data loader via a Dict-like object. + Args: + val_data_config: A config that contains the information regarding construction + of an ASR Training dataset. + Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` + """ + if 'shuffle' not in val_data_config: + val_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='validation', config=val_data_config) + self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) + + def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the test data loader via a Dict-like object. + Args: + test_data_config: A config that contains the information regarding construction + of an ASR Training dataset. 
+ Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` + """ + if 'shuffle' not in test_data_config: + test_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='test', config=test_data_config) + self._test_dl = self._setup_dataloader_from_config(config=test_data_config) + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + if hasattr(self.preprocessor, '_sample_rate'): + input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) + else: + input_signal_eltype = AudioSignal() + return { + "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), + "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), + "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "transcript": NeuralType(('B', 'T'), LabelsType(), optional=True), + "transcript_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "transf_log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()), + "encoded_lengths": NeuralType(tuple('B'), LengthsType()), + "encoder_states": NeuralType(('B', 'T', 'D'), ChannelType()), + "encoder_mask": NeuralType(('B', 'T'), MaskType()), + } + + @typecheck() + def forward( + self, + input_signal=None, + input_signal_length=None, + processed_signal=None, + processed_signal_length=None, + transcript=None, + transcript_length=None, + ): + """ + Forward pass of the model. + Args: + input_signal: Tensor that represents a batch of raw audio signals, + of shape [B, T]. T here represents timesteps, with 1 second of audio represented as + `self.sample_rate` number of floating point values. + input_signal_length: Vector of length B, that contains the individual lengths of the audio + sequences. + processed_signal: Tensor that represents a batch of processed audio signals, + of shape (B, D, T) that has undergone processing via some DALI preprocessor. + processed_signal_length: Vector of length B, that contains the individual lengths of the + processed audio sequences. + Returns: + A tuple of 3 elements - + 1) The log probabilities tensor of shape [B, T, D]. + 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. + 3) The greedy token predictions of the model of shape [B, T] (via argmax) + """ + has_input_signal = input_signal is not None and input_signal_length is not None + has_processed_signal = processed_signal is not None and processed_signal_length is not None + if (has_input_signal ^ has_processed_signal) == False: + raise ValueError( + f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " + " with ``processed_signal`` and ``processed_signal_len`` arguments." 
+ ) + + if not has_processed_signal: + processed_signal, processed_signal_length = self.preprocessor( + input_signal=input_signal, length=input_signal_length + ) + + if self.spec_augmentation is not None and self.training: + processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) + + encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) + + enc_states = encoded.permute(0, 2, 1) + enc_states = self.adapter(enc_states) + enc_mask = lens_to_mask(encoded_len, enc_states.shape[1]).to(enc_states.dtype) + if self.use_transf_encoder: + enc_states = self.transf_encoder(encoder_states=enc_states, encoder_mask=enc_mask) + + dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype) + dec_states = self.transf_decoder( + input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask + ) + transf_log_probs = self.log_softmax(hidden_states=dec_states) + + return transf_log_probs, encoded_len, enc_states, enc_mask + + def compute_audio_loss(self, batch): + + if batch is None: + return 0 + + signal, signal_len, transcript, transcript_len = batch + input_ids, labels = transcript[:, :-1], transcript[:, 1:] + batch_size = signal.shape[0] + + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=signal, + input_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + + transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) + + return transf_loss + + # PTL-specific methods + def training_step(self, batch, batch_nb): + + audio_loss = self.compute_audio_loss(batch) + + tensorboard_logs = { + 'train_loss': audio_loss, + 'learning_rate': self._optimizer.param_groups[0]['lr'], + } + + if hasattr(self, '_trainer') and self._trainer is not None: + log_every_n_steps = self._trainer.log_every_n_steps + else: + log_every_n_steps = 1 + + return {'loss': audio_loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): + signal, signal_len, transcript, transcript_len = batch + input_ids, labels = transcript[:, :-1], transcript[:, 1:] + + if isinstance(batch, DALIOutputs) and batch.has_processed_signal: + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + processed_signal=signal, + processed_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + else: + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=signal, + input_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + + beam_hypotheses = self.beam_search( + encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False + ) + transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) + + ground_truths = [self.tokenizer.ids_to_text(sent) for sent in transcript.detach().cpu().tolist()] + translations = [self.tokenizer.ids_to_text(sent) for sent in beam_hypotheses.detach().cpu().tolist()] + + self.val_loss(loss=transf_loss, num_measurements=transf_log_probs.shape[0] * transf_log_probs.shape[1]) + + return {f'{eval_mode}_loss': transf_loss, 'translations': translations, 'ground_truths': ground_truths} + + def test_step(self, batch, batch_idx, dataloader_idx=0): + return self.validation_step(batch, batch_idx, dataloader_idx, eval_mode="test") + + def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0, eval_mode: 
str = "val"): + """ + Called at the end of validation to aggregate outputs. + :param outputs: list of individual outputs of each validation step. + """ + if not outputs: + return + + if isinstance(outputs[0], dict): + outputs = [outputs] + + for output in outputs: + eval_loss = getattr(self, 'val_loss').compute() + translations = list(itertools.chain(*[x['translations'] for x in output])) + ground_truths = list(itertools.chain(*[x['ground_truths'] for x in output])) + + # Gather translations and ground truths from all workers + tr_and_gt = [None for _ in range(self.world_size)] + # we also need to drop pairs where ground truth is an empty string + if self.world_size > 1: + dist.all_gather_object( + tr_and_gt, [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] + ) + else: + tr_and_gt[0] = [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] + + if self.global_rank == 0: + _translations = [] + _ground_truths = [] + for rank in range(0, self.world_size): + _translations += [t for (t, g) in tr_and_gt[rank]] + _ground_truths += [g for (t, g) in tr_and_gt[rank]] + + sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="13a") + sb_score = sacre_bleu.score * self.world_size + + wer_scores, wer_words = 0, 0 + for h, r in zip(_translations, _ground_truths): + wer_words += len(r.split()) + wer_scores += editdistance.eval(h.split(), r.split()) + wer_score = 1.0 * wer_scores * self.world_size / wer_words + + else: + sb_score = 0.0 + wer_score = 0.0 + + self.log(f"{eval_mode}_loss", eval_loss, sync_dist=True) + self.log(f"{eval_mode}_sacreBLEU", sb_score, sync_dist=True) + self.log(f"{eval_mode}_WER", wer_score, sync_dist=True) + self.val_loss.reset() + + def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_validation_epoch_end(outputs, dataloader_idx, eval_mode="test") + + def test_dataloader(self): + if self._test_dl is not None: + return self._test_dl + + def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': + """ + Setup function for a temporary data loader which wraps the provided audio file. + Args: + config: A python dictionary which contains the following keys: + paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ + Recommended length per file is between 5 and 25 seconds. + batch_size: (int) batch size to use during inference. \ + Bigger will result in better throughput performance but would use more memory. + temp_dir: (str) A temporary directory where the audio manifest is temporarily + stored. + Returns: + A pytorch DataLoader for the given audio file(s). 
+ """ + batch_size = min(config['batch_size'], len(config['paths2audio_files'])) + dl_config = { + 'manifest_filepath': os.path.join(config['temp_dir'], 'manifest.json'), + 'sample_rate': self.preprocessor._sample_rate, + 'batch_size': batch_size, + 'trim_silence': False, + 'shuffle': False, + 'num_workers': min(batch_size, os.cpu_count() - 1), + 'pin_memory': True, + } + + temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) + return temporary_datalayer diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index 4c43960ac9d2..bcb62dc409b7 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -83,7 +83,7 @@ def _setup_monolingual_tokenizer(self, tokenizer_cfg: DictConfig): with open_dict(self.cfg.tokenizer): self.cfg.tokenizer.hf_kwargs = tokenizer_cfg.get('hf_kwargs') - if self.tokenizer_type not in ['bpe', 'wpe']: + if self.tokenizer_type not in ['bpe', 'wpe', 'yttm']: raise ValueError( "`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or " "`wpe` for BERT based tokenizer" @@ -140,7 +140,7 @@ def get_vocab(): self.tokenizer.tokenizer.get_vocab = get_vocab self.tokenizer.tokenizer.all_special_tokens = self.tokenizer.special_token_to_id - else: + elif self.tokenizer_type == 'wpe': # This is a WPE Tokenizer # If path from previous registration exists, remove it if 'vocab_path' in self.tokenizer_cfg: @@ -166,6 +166,20 @@ def get_vocab(): unk_token=self.hf_tokenizer_kwargs.get('unk_token', None), use_fast=self.hf_tokenizer_kwargs.get('use_fast', False), ) + else: + # This is a YouTokenToMe BPE Tokenizer + self.tokenizer = tokenizers.YouTokenToMeTokenizer(model_path=self.tokenizer_cfg.get('model_path')) + + vocabulary = {} + for i, piece in enumerate(self.tokenizer.tokenizer.vocab()): + vocabulary[piece] = i + + # wrapper method to get vocabulary conveniently + def get_vocab(): + return vocabulary + + self.tokenizer.tokenizer.vocab_size = len(vocabulary) + self.tokenizer.tokenizer.get_vocab = get_vocab logging.info( "Tokenizer {} initialized with {} tokens".format( @@ -221,7 +235,7 @@ def _make_tokenizer(self, tokenizer_cfg: DictConfig, lang=None): tokenizer_type = tokenizer_cfg.get('type').lower() tokenizer_dir = tokenizer_cfg.get('dir') - if tokenizer_type not in ['bpe', 'wpe']: + if tokenizer_type not in ['bpe', 'wpe', 'yttm']: raise ValueError( '`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or' '`wpe` for BERT based tokenizer' ) @@ -289,7 +303,7 @@ def get_vocab(): tokenizer.tokenizer.get_vocab = get_vocab tokenizer.tokenizer.all_special_tokens = tokenizer.special_token_to_id - else: + elif tokenizer_type == 'wpe': # This is a WPE Tokenizer # If path from previous registration exists, remove it if 'vocab_path' in tokenizer_cfg: @@ -318,6 +332,20 @@ def get_vocab(): unk_token=hf_tokenizer_kwargs.get('unk_token', None), use_fast=hf_tokenizer_kwargs.get('use_fast', False), ) + else: + # This is a YouTokenToMe BPE Tokenizer + self.tokenizer = tokenizers.YouTokenToMeTokenizer(model_path=self.tokenizer_cfg.get('model_path')) + + vocabulary = {} + for i, piece in enumerate(self.tokenizer.tokenizer.vocab()): + vocabulary[piece] = i + + # wrapper method to get vocabulary conveniently + def get_vocab(): + return vocabulary + + self.tokenizer.tokenizer.vocab_size = len(vocabulary) + self.tokenizer.tokenizer.get_vocab = get_vocab logging.info( 'Tokenizer {} initialized with {} tokens'.format(tokenizer.__class__.__name__, 
tokenizer.vocab_size) diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py index 0ab0cb784273..906154213ea1 100644 --- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py @@ -299,7 +299,7 @@ def create_spt_model( byte_fallback: If , fallback to a byte sequence of the character. split_digits: If true, digits are split into individual tokens. split_by_whitespace: Whether to respect white space while creating subwords. If False, will learn merges across whitespace. - split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ) + split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ) """ if not data_file or not os.path.exists(data_file): From 842f2e988f287a717467a75d0438c492ba53f53b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 4 Jul 2023 00:03:55 +0000 Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/asr/models/transformer_bpe_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index b1a40d937a0c..6d86fa221d7b 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -82,10 +82,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self._setup_tokenizer(cfg.tokenizer) super().__init__(cfg=cfg, trainer=trainer) - + # Setup audio preprocessor self.preprocessor = EncDecTransfModelBPE.from_config_dict(self._cfg.preprocessor) - + # Setup audio encoder self.encoder = EncDecTransfModelBPE.from_config_dict(self._cfg.encoder) From 10019f8ded09838ca00135018fa2ec1f1e9e0f52 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Mon, 3 Jul 2023 17:04:27 -0700 Subject: [PATCH 03/14] style fix Signed-off-by: AlexGrinch --- nemo/collections/asr/models/transformer_bpe_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index b1a40d937a0c..6d86fa221d7b 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -82,10 +82,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self._setup_tokenizer(cfg.tokenizer) super().__init__(cfg=cfg, trainer=trainer) - + # Setup audio preprocessor self.preprocessor = EncDecTransfModelBPE.from_config_dict(self._cfg.preprocessor) - + # Setup audio encoder self.encoder = EncDecTransfModelBPE.from_config_dict(self._cfg.encoder) From 6615136da757c05e32218d3348725547ca762f82 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Wed, 5 Jul 2023 14:44:28 -0700 Subject: [PATCH 04/14] sacrebleu import fix, unused imports removed Signed-off-by: AlexGrinch --- .../asr/models/transformer_bpe_models.py | 17 +++-------------- requirements/requirements_asr.txt | 1 + 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 6d86fa221d7b..0954b98d8423 
100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -12,11 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import itertools import json import os -import random import tempfile from math import ceil from typing import Dict, List, Optional, Union @@ -24,31 +22,22 @@ import editdistance import torch import torch.distributed as dist -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict +from omegaconf import DictConfig, OmegaConf from pytorch_lightning import Trainer from sacrebleu import corpus_bleu -from torch.utils.data import ChainDataset, DataLoader from tqdm.auto import tqdm from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text import _speech_collate_fn from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel -from nemo.collections.asr.parts.features import clean_spectrogram_batch, normalize_batch from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.collections.common.data import ConcatDataset -from nemo.collections.common.losses import NLLLoss, SmoothedCrossEntropyLoss +from nemo.collections.common.losses import SmoothedCrossEntropyLoss from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init -from nemo.collections.nlp.models.machine_translation import MTEncDecModel from nemo.collections.nlp.modules.common import TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder -from nemo.collections.tts.models import FastPitchModel, SpectrogramEnhancerModel -from nemo.core.classes.common import PretrainedModelInfo, typecheck +from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( AudioSignal, ChannelType, diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index 011862ad723b..a9576d894e22 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -13,6 +13,7 @@ pyannote.core pyannote.metrics pydub ruamel.yaml +sacrebleu scipy>=0.14 soundfile sox From ced657bb899db499dfd9ce4fcb9340039ac8b65e Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Wed, 5 Jul 2023 15:20:49 -0700 Subject: [PATCH 05/14] import guard for nlp inside asr transformer bpe model Signed-off-by: AlexGrinch --- .../asr/models/transformer_bpe_models.py | 14 ++++++++++---- requirements/requirements_asr.txt | 1 - 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 0954b98d8423..bb835a53b39a 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -24,7 +24,6 @@ import torch.distributed as dist from omegaconf import DictConfig, OmegaConf from pytorch_lightning import Trainer -from sacrebleu import corpus_bleu from tqdm.auto import tqdm from nemo.collections.asr.data import 
audio_to_text_dataset @@ -34,9 +33,16 @@ from nemo.collections.common.losses import SmoothedCrossEntropyLoss from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init -from nemo.collections.nlp.modules.common import TokenClassifier -from nemo.collections.nlp.modules.common.lm_utils import get_transformer -from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + +try: + from sacrebleu import corpus_bleu + from nemo.collections.nlp.modules.common import TokenClassifier + from nemo.collections.nlp.modules.common.lm_utils import get_transformer + from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + ASR_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + ASR_AVAILABLE = False + from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( AudioSignal, diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index a9576d894e22..011862ad723b 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -13,7 +13,6 @@ pyannote.core pyannote.metrics pydub ruamel.yaml -sacrebleu scipy>=0.14 soundfile sox From e91980a8a66964b3dc75895bc78d998272824e62 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Jul 2023 22:22:01 +0000 Subject: [PATCH 06/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/asr/models/transformer_bpe_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index bb835a53b39a..5656a98da16d 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -39,6 +39,7 @@ from nemo.collections.nlp.modules.common import TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + ASR_AVAILABLE = True except (ImportError, ModuleNotFoundError): ASR_AVAILABLE = False From 1a13c14d16f06e3568604c70cfd2ddeddca3e1c4 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Thu, 6 Jul 2023 13:34:49 -0700 Subject: [PATCH 07/14] codeql fixes Signed-off-by: AlexGrinch --- .../asr/speech_translation/speech_to_text_transf.py | 3 --- nemo/collections/asr/models/transformer_bpe_models.py | 11 ++--------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/examples/asr/speech_translation/speech_to_text_transf.py b/examples/asr/speech_translation/speech_to_text_transf.py index ce3e657365a7..2e349e81de4a 100644 --- a/examples/asr/speech_translation/speech_to_text_transf.py +++ b/examples/asr/speech_translation/speech_to_text_transf.py @@ -45,10 +45,7 @@ """ -from collections import OrderedDict - import pytorch_lightning as pl -import torch from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecTransfModelBPE diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index bb835a53b39a..5c74a848f088 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -39,9 +39,9 @@ from nemo.collections.nlp.modules.common import 
TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder - ASR_AVAILABLE = True + NLP_AVAILABLE = True except (ImportError, ModuleNotFoundError): - ASR_AVAILABLE = False + NLP_AVAILABLE = False from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( @@ -120,7 +120,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): library = transf_decoder_cfg_dict.pop('library', 'nemo') model_name = transf_decoder_cfg_dict.pop('model_name', None) pretrained = transf_decoder_cfg_dict.pop('pretrained', False) - checkpoint_file = transf_decoder_cfg_dict.pop('checkpoint_file', None) self.transf_decoder = get_transformer( library=library, model_name=model_name, @@ -455,7 +454,6 @@ def compute_audio_loss(self, batch): signal, signal_len, transcript, transcript_len = batch input_ids, labels = transcript[:, :-1], transcript[:, 1:] - batch_size = signal.shape[0] transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( input_signal=signal, @@ -478,11 +476,6 @@ def training_step(self, batch, batch_nb): 'learning_rate': self._optimizer.param_groups[0]['lr'], } - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - else: - log_every_n_steps = 1 - return {'loss': audio_loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): From 67a3d96d4d2ba46fd8fb936bb93cc75a9c3e030b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Jul 2023 20:39:11 +0000 Subject: [PATCH 08/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/asr/models/transformer_bpe_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 5c74a848f088..8538bd253dee 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -39,6 +39,7 @@ from nemo.collections.nlp.modules.common import TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + NLP_AVAILABLE = True except (ImportError, ModuleNotFoundError): NLP_AVAILABLE = False From 069626304d44deb9acbf48f3985ff5b4ef84ae26 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Wed, 12 Jul 2023 14:53:48 -0700 Subject: [PATCH 09/14] comments answered Signed-off-by: AlexGrinch --- .../fast-conformer_transformer.yaml} | 41 ++++++------------- ...ransf.py => speech_to_text_transformer.py} | 9 +--- .../asr/models/transformer_bpe_models.py | 35 ++++++++-------- 3 files changed, 32 insertions(+), 53 deletions(-) rename examples/asr/conf/{transformer_dec/speech_translation_transf.yaml => speech_translation/fast-conformer_transformer.yaml} (79%) rename examples/asr/speech_translation/{speech_to_text_transf.py => speech_to_text_transformer.py} (84%) diff --git a/examples/asr/conf/transformer_dec/speech_translation_transf.yaml b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml similarity index 79% rename from examples/asr/conf/transformer_dec/speech_translation_transf.yaml rename to 
examples/asr/conf/speech_translation/fast-conformer_transformer.yaml index c7bfe739b46c..4e480df62e59 100644 --- a/examples/asr/conf/transformer_dec/speech_translation_transf.yaml +++ b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml @@ -1,30 +1,19 @@ -# It contains the default values for training a Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. +# It contains the default values for training an autoregressive FastConformer-Transformer ST model with sub-word encoding. # Architecture and training config: # Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective # batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. -# Here are the recommended configs for different variants of Conformer-CTC, other parameters are the same as in this config file. -# One extra layer (compared to original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one. -# -# +-------------+---------+---------+----------+------------+-----+ -# | Model | d_model | n_heads | n_layers | time_masks | lr | -# +=============+=========+========+===========+============+=====+ -# | Small (13M)| 176 | 4 | 16 | 5 | 5.0 | -# +-------------+---------+--------+-----------+------------+-----+ -# | Medium (30M)| 256 | 4 | 18 | 5 | 5.0 | -# +-------------+---------+--------+-----------+------------+-----+ -# | Large (121M)| 512 | 8 | 18 | 10 | 2.0 | -# +---------------------------------------------------------------+ -# -# If you do not want to train with AMP, you may use weight decay of 0.0 or reduce the number of time maskings to 2 -# with time_width=100. It may help when you want to train for fewer epochs and need faster convergence. -# With weight_decay=0.0, learning rate may need to get reduced to 2.0. - -# You may find more info about Conformer-CTC here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-ctc -# Pre-trained models of Conformer-CTC can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html -# The checkpoint of the large model trained on LibriSpeech with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls - -name: "Conformer-Transformer-BPE-st" +# Here are the recommended configs for different variants of FastConformer-Transformer, other parameters are the same as in this config file. +# One extra (linear projection) layer is added between FastConformer encoder and Transformer decoder if they have different hidden sizes +# It is recommended to initialize FastConformer with ASR pre-trained encoder for better accuracy and faster convergence + +name: "FastConformer-Transformer-BPE-st" + +# Initialize model encoder with pre-trained ASR FastConformer encoder for faster convergence and improved accuracy +init_from_nemo_model: + model0: + path: ??? + include: ["preprocessor", "encoder"] model: sample_rate: 16000 @@ -186,12 +175,6 @@ model: warmup_ratio: null min_lr: 1e-6 -# Initialize model encoder with pre-trained ASR FastConformer encoder for faster convergence and improved accuracy -init_from_nemo_model: - model0: - path: ??? 
- include: ["preprocessor", "encoder"] - trainer: gpus: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 diff --git a/examples/asr/speech_translation/speech_to_text_transf.py b/examples/asr/speech_translation/speech_to_text_transformer.py similarity index 84% rename from examples/asr/speech_translation/speech_to_text_transf.py rename to examples/asr/speech_translation/speech_to_text_transformer.py index 2e349e81de4a..0c0882859b88 100644 --- a/examples/asr/speech_translation/speech_to_text_transf.py +++ b/examples/asr/speech_translation/speech_to_text_transformer.py @@ -15,10 +15,8 @@ """ # Training the model ```sh -python speech_to_text_transf.py \ +python speech_to_text_transformer.py \ # (Optional: --config-path= --config-name=) \ - model.train_ds.text.tar_files= \ - model.train_ds.text.metadata_file= \ model.train_ds.audio.tarred_audio_filepaths= \ model.train_ds.audio_manifest_filepath= \ model.validation_ds.manifest_filepath= \ @@ -26,9 +24,6 @@ model.tokenizer.dir= \ model.tokenizer.model_path= \ model.tokenizer.type= \ - model.encoder_tokenizer.tokenizer_model= \ - model.encoder_tokenizer.vocab_file= \ - model.decoder_tokenizer.tokenizer_model= \ trainer.gpus=-1 \ trainer.accelerator="ddp" \ trainer.max_epochs=100 \ @@ -54,7 +49,7 @@ from nemo.utils.exp_manager import exp_manager -@hydra_runner(config_path="../conf/transformer_dec/", config_name="speech_translation_transf_test") +@hydra_runner(config_path="../conf/speech_translation/", config_name="fast-conformer_transformer") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 8538bd253dee..79890d62e111 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -43,6 +43,7 @@ NLP_AVAILABLE = True except (ImportError, ModuleNotFoundError): NLP_AVAILABLE = False + logging.warning("Could not import NeMo NLP collection which is required for speech translation model.") from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( @@ -80,16 +81,16 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): super().__init__(cfg=cfg, trainer=trainer) # Setup audio preprocessor - self.preprocessor = EncDecTransfModelBPE.from_config_dict(self._cfg.preprocessor) + self.preprocessor = EncDecTransfModelBPE.from_config_dict(self.cfg.preprocessor) # Setup audio encoder - self.encoder = EncDecTransfModelBPE.from_config_dict(self._cfg.encoder) + self.encoder = EncDecTransfModelBPE.from_config_dict(self.cfg.encoder) # Add projection layer if encoder and decoder differ in hidden size - if self._cfg.encoder['d_model'] != self._cfg.transf_decoder['hidden_size']: - self.adapter = torch.nn.Linear(self._cfg.encoder['d_model'], self._cfg.transf_decoder['hidden_size']) + if self.cfg.encoder['d_model'] != self.cfg.transf_decoder['hidden_size']: + self.adapter = torch.nn.Linear(self.cfg.encoder['d_model'], self.cfg.transf_decoder['hidden_size']) else: - self.adapter = lambda x: x + self.adapter = torch.nn.Identity() transf_encoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_encoder')) @@ -133,10 +134,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.log_softmax = TokenClassifier( hidden_size=self.transf_decoder.hidden_size, num_classes=vocab_size, - activation=self._cfg.head.activation, - log_softmax=self._cfg.head.log_softmax, - dropout=self._cfg.head.dropout, - 
use_transformer_init=self._cfg.head.use_transformer_init, + activation=self.cfg.head.activation, + log_softmax=self.cfg.head.log_softmax, + dropout=self.cfg.head.dropout, + use_transformer_init=self.cfg.head.use_transformer_init, ) self.log_softmax.mlp.layer0.weight = self.transf_decoder.embedding.token_embedding.weight std_init_range = 1 / self.transf_decoder.hidden_size ** 0.5 @@ -149,21 +150,21 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): decoder=self.transf_decoder.decoder, log_softmax=self.log_softmax, max_sequence_length=self.transf_decoder.max_sequence_length, - beam_size=self._cfg.beam_search.beam_size, + beam_size=self.cfg.beam_search.beam_size, bos=self.tokenizer.bos_id, pad=self.tokenizer.pad_id, eos=self.tokenizer.eos_id, - len_pen=self._cfg.beam_search.len_pen, - max_delta_length=self._cfg.beam_search.max_generation_delta, + len_pen=self.cfg.beam_search.len_pen, + max_delta_length=self.cfg.beam_search.max_generation_delta, ) # Define autoregressive CE loss self.transf_loss = SmoothedCrossEntropyLoss( - pad_id=self.tokenizer.pad_id, label_smoothing=self._cfg.label_smoothing + pad_id=self.tokenizer.pad_id, label_smoothing=self.cfg.label_smoothing ) - if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None: - self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self._cfg.spec_augment) + if hasattr(self.cfg, 'spec_augment') and self.cfg.spec_augment is not None: + self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self.cfg.spec_augment) else: self.spec_augmentation = None @@ -230,7 +231,7 @@ def transcribe( temporary_datalayer = self._setup_transcribe_dataloader(config) for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): - ctc_lp, _, encoded_len, predictions, enc_states, enc_mask = self.forward( + log_probs, encoded_len, enc_states, enc_mask = self.forward( input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) ) @@ -251,7 +252,7 @@ def transcribe( hypotheses += beam_hypotheses - del test_batch + del test_batch, log_probs, encoded_len, enc_states, enc_mask finally: # set mode back to its original value self.train(mode=mode) From 249f312bafe422ef6379c3d3f93b7c1f768807de Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Wed, 12 Jul 2023 15:02:19 -0700 Subject: [PATCH 10/14] import ordering fix Signed-off-by: AlexGrinch --- .../asr/models/transformer_bpe_models.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 79890d62e111..840a86d47150 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -34,17 +34,6 @@ from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init -try: - from sacrebleu import corpus_bleu - from nemo.collections.nlp.modules.common import TokenClassifier - from nemo.collections.nlp.modules.common.lm_utils import get_transformer - from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder - - NLP_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - NLP_AVAILABLE = False - logging.warning("Could not import NeMo NLP collection which is required for speech translation model.") - from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( AudioSignal, @@ -58,6 +47,17 @@ ) from nemo.utils 
import logging +try: + from sacrebleu import corpus_bleu + from nemo.collections.nlp.modules.common import TokenClassifier + from nemo.collections.nlp.modules.common.lm_utils import get_transformer + from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + + NLP_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + NLP_AVAILABLE = False + logging.warning("Could not import NeMo NLP collection which is required for speech translation model.") + __all__ = ['EncDecTransfModelBPE'] From c2c00d4eb0ba9fe0e3c716ae79bc99b93d7e3d73 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Thu, 13 Jul 2023 15:19:41 -0700 Subject: [PATCH 11/14] yttm for asr removed Signed-off-by: AlexGrinch --- nemo/collections/asr/parts/mixins/mixins.py | 40 +++------------------ 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index bcb62dc409b7..1fa591c61abc 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -83,7 +83,7 @@ def _setup_monolingual_tokenizer(self, tokenizer_cfg: DictConfig): with open_dict(self.cfg.tokenizer): self.cfg.tokenizer.hf_kwargs = tokenizer_cfg.get('hf_kwargs') - if self.tokenizer_type not in ['bpe', 'wpe', 'yttm']: + if self.tokenizer_type not in ['bpe', 'wpe']: raise ValueError( "`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or " "`wpe` for BERT based tokenizer" @@ -140,7 +140,7 @@ def get_vocab(): self.tokenizer.tokenizer.get_vocab = get_vocab self.tokenizer.tokenizer.all_special_tokens = self.tokenizer.special_token_to_id - elif self.tokenizer_type == 'wpe': + else: # This is a WPE Tokenizer # If path from previous registration exists, remove it if 'vocab_path' in self.tokenizer_cfg: @@ -166,20 +166,6 @@ def get_vocab(): unk_token=self.hf_tokenizer_kwargs.get('unk_token', None), use_fast=self.hf_tokenizer_kwargs.get('use_fast', False), ) - else: - # This is a YouTokenToMe BPE Tokenizer - self.tokenizer = tokenizers.YouTokenToMeTokenizer(model_path=self.tokenizer_cfg.get('model_path')) - - vocabulary = {} - for i, piece in enumerate(self.tokenizer.tokenizer.vocab()): - vocabulary[piece] = i - - # wrapper method to get vocabulary conveniently - def get_vocab(): - return vocabulary - - self.tokenizer.tokenizer.vocab_size = len(vocabulary) - self.tokenizer.tokenizer.get_vocab = get_vocab logging.info( "Tokenizer {} initialized with {} tokens".format( @@ -235,7 +221,7 @@ def _make_tokenizer(self, tokenizer_cfg: DictConfig, lang=None): tokenizer_type = tokenizer_cfg.get('type').lower() tokenizer_dir = tokenizer_cfg.get('dir') - if tokenizer_type not in ['bpe', 'wpe', 'yttm']: + if tokenizer_type not in ['bpe', 'wpe']: raise ValueError( '`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or' '`wpe` for BERT based tokenizer' ) @@ -303,7 +289,7 @@ def get_vocab(): tokenizer.tokenizer.get_vocab = get_vocab tokenizer.tokenizer.all_special_tokens = tokenizer.special_token_to_id - elif tokenizer_type == 'wpe': + else: # This is a WPE Tokenizer # If path from previous registration exists, remove it if 'vocab_path' in tokenizer_cfg: @@ -332,24 +318,6 @@ def get_vocab(): unk_token=hf_tokenizer_kwargs.get('unk_token', None), use_fast=hf_tokenizer_kwargs.get('use_fast', False), ) - else: - # This is a YouTokenToMe BPE Tokenizer - self.tokenizer = tokenizers.YouTokenToMeTokenizer(model_path=self.tokenizer_cfg.get('model_path')) - - vocabulary = {} - for i, piece 
in enumerate(self.tokenizer.tokenizer.vocab()): - vocabulary[piece] = i - - # wrapper method to get vocabulary conveniently - def get_vocab(): - return vocabulary - - self.tokenizer.tokenizer.vocab_size = len(vocabulary) - self.tokenizer.tokenizer.get_vocab = get_vocab - - logging.info( - 'Tokenizer {} initialized with {} tokens'.format(tokenizer.__class__.__name__, tokenizer.vocab_size) - ) return tokenizer, model_path, vocab_path, spe_vocab_path From 66d428f404cef704402d7cdb0868ce8cfcb84937 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Thu, 13 Jul 2023 16:06:33 -0700 Subject: [PATCH 12/14] logging added Signed-off-by: AlexGrinch --- nemo/collections/asr/parts/mixins/mixins.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index 1fa591c61abc..4c43960ac9d2 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -319,6 +319,10 @@ def get_vocab(): use_fast=hf_tokenizer_kwargs.get('use_fast', False), ) + logging.info( + 'Tokenizer {} initialized with {} tokens'.format(tokenizer.__class__.__name__, tokenizer.vocab_size) + ) + return tokenizer, model_path, vocab_path, spe_vocab_path def _cleanup_monolingual_and_aggregate_config_and_artifacts_if_needed(self): From 5fed6fd9c6d919d6682f52df45ccaf21dc07449a Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Fri, 14 Jul 2023 15:31:56 -0700 Subject: [PATCH 13/14] added inference and translate method Signed-off-by: AlexGrinch --- .../speech_translation/translate_speech.py | 214 ++++++++++++++++++ .../asr/models/transformer_bpe_models.py | 23 +- 2 files changed, 232 insertions(+), 5 deletions(-) create mode 100644 examples/asr/speech_translation/translate_speech.py diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py new file mode 100644 index 000000000000..64dfe7dcf321 --- /dev/null +++ b/examples/asr/speech_translation/translate_speech.py @@ -0,0 +1,214 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import os +import json +from dataclasses import dataclass, is_dataclass +from typing import List, Optional, Union + +import pytorch_lightning as pl +import torch +from omegaconf import OmegaConf + +from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig +from nemo.collections.asr.parts.utils.transcribe_utils import ( + compute_output_filename, + prepare_audio_data, + setup_model, +) +from nemo.core.config import hydra_runner +from nemo.utils import logging + +""" +Translate audio file on a single CPU/GPU. Useful for translations of moderate amounts of audio data. 
+ +# Arguments + model_path: path to .nemo ST checkpoint + pretrained_name: name of pretrained ST model (from NGC registry) + audio_dir: path to directory with audio files + dataset_manifest: path to dataset JSON manifest file (in NeMo format) + + output_filename: Output filename where the translations will be written + batch_size: batch size during inference + + cuda: Optional int to enable or disable execution of model on certain CUDA device. + allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available + amp: Bool to decide if Automatic Mixed Precision should be used during inference + audio_type: Str filetype of the audio. Supported = wav, flac, mp3 + + overwrite_translations: Bool which when set allows repeated translations to overwrite previous results. + +# Usage +ST model can be specified by either "model_path" or "pretrained_name". +Data for translation can be defined with either "audio_dir" or "dataset_manifest". +Results are returned in a JSON manifest file. + +python translate_speech.py \ + model_path=null \ + pretrained_name=null \ + audio_dir="" \ + dataset_manifest="" \ + output_filename="" \ + batch_size=32 \ + cuda=0 \ + amp=True \ +""" + + +@dataclass +class ModelChangeConfig: + + # Sub-config for changes specific to the Conformer Encoder + conformer: ConformerChangeConfig = ConformerChangeConfig() + + +@dataclass +class TranslationConfig: + # Required configs + model_path: Optional[str] = None # Path to a .nemo file + pretrained_name: Optional[str] = None # Name of a pretrained model + audio_dir: Optional[str] = None # Path to a directory which contains audio files + dataset_manifest: Optional[str] = None # Path to dataset's JSON manifest + audio_key: str = 'audio_filepath' # Used to override the default audio key in dataset_manifest + eval_config_yaml: Optional[str] = None # Path to a yaml file of config of evaluation + + # General configs + output_filename: Optional[str] = None + batch_size: int = 32 + random_seed: Optional[int] = None # seed number going to be used in seed_everything() + + # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA + # device anyway, and do inference on CPU only if CUDA device is not found. + # If `cuda` is a negative number, inference will be on CPU only. + cuda: Optional[int] = None + allow_mps: bool = False # allow to select MPS device (Apple Silicon M-series GPU) + amp: bool = False + audio_type: str = "wav" + + # Recompute model translation, even if the output folder exists with scores. 
+    overwrite_translations: bool = True
+
+    # can be set to True to return list of translations instead of the config
+    # if True, will also skip writing anything to the output file
+    return_translations: bool = False
+
+
+@hydra_runner(config_name="TranslationConfig", schema=TranslationConfig)
+def main(cfg: TranslationConfig) -> Union[TranslationConfig, List[str]]:
+    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
+
+    for key in cfg:
+        cfg[key] = None if cfg[key] == 'None' else cfg[key]
+
+    if is_dataclass(cfg):
+        cfg = OmegaConf.structured(cfg)
+
+    if cfg.random_seed:
+        pl.seed_everything(cfg.random_seed)
+
+    if cfg.model_path is None and cfg.pretrained_name is None:
+        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
+    if cfg.audio_dir is None and cfg.dataset_manifest is None:
+        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")
+
+    # Load augmentor from external yaml file which contains eval info; could be extended to other features such as VAD, P&C
+    augmentor = None
+    if cfg.eval_config_yaml:
+        eval_config = OmegaConf.load(cfg.eval_config_yaml)
+        augmentor = eval_config.test_ds.get("augmentor")
+        logging.info(f"Will apply on-the-fly augmentation on samples during translation: {augmentor} ")
+
+    # setup GPU
+    if cfg.cuda is None:
+        if torch.cuda.is_available():
+            device = [0]  # use 0th CUDA device
+            accelerator = 'gpu'
+            map_location = torch.device('cuda:0')
+        elif cfg.allow_mps and hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            logging.warning(
+                "MPS device (Apple Silicon M-series GPU) support is experimental."
+                " Env variable `PYTORCH_ENABLE_MPS_FALLBACK=1` should be set in most cases to avoid failures."
+            )
+            device = [0]
+            accelerator = 'mps'
+            map_location = torch.device('mps')
+        else:
+            device = 1
+            accelerator = 'cpu'
+            map_location = torch.device('cpu')
+    else:
+        device = [cfg.cuda]
+        accelerator = 'gpu'
+        map_location = torch.device(f'cuda:{cfg.cuda}')
+
+    logging.info(f"Inference will be done on device: {map_location}")
+
+    asr_model, model_name = setup_model(cfg, map_location)
+    trainer = pl.Trainer(devices=device, accelerator=accelerator)
+    asr_model.set_trainer(trainer)
+    asr_model = asr_model.eval()
+
+    # collect additional translation information
+    return_hypotheses = False
+
+    # prepare audio filepaths and decide whether it's partial audio
+    filepaths, partial_audio = prepare_audio_data(cfg)
+
+    # setup AMP (optional)
+    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
+        logging.info("AMP enabled!\n")
+        autocast = torch.cuda.amp.autocast
+    else:
+
+        @contextlib.contextmanager
+        def autocast():
+            yield
+
+    # Compute output filename
+    cfg = compute_output_filename(cfg, model_name)
+
+    # if translations should not be overwritten, and already exists, skip re-translation step and return
+    if not cfg.return_translations and not cfg.overwrite_translations and os.path.exists(cfg.output_filename):
+        logging.info(
+            f"Previous translations found at {cfg.output_filename}, and flag `overwrite_translations`"
+            f" is {cfg.overwrite_translations}. Returning without re-translating text."
+ ) + return cfg + + # translate audio + with autocast(): + with torch.no_grad(): + translations = asr_model.translate( + paths2audio_files=filepaths, batch_size=cfg.batch_size, return_hypotheses=return_hypotheses, + ) + + logging.info(f"Finished translating {len(filepaths)} files !") + logging.info(f"Writing translations into file: {cfg.output_filename}") + + if cfg.return_translations: + return translations + + # write audio translations + with open(cfg.output_filename, 'w', encoding='utf-8', newline='\n') as f: + for filepath, translation in zip(filepaths, translations): + item = {'audio_filepath': filepath, 'pred_translation': translation} + f.write(json.dumps(item, ensure_ascii=False) + "\n") + logging.info(f"Finished writing predictions to {cfg.output_filename}!") + + return cfg + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 840a86d47150..178746795ae8 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -170,6 +170,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.val_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True) + @torch.no_grad() + def translate( + self, + paths2audio_files: List[str], + batch_size: int = 4, + logprobs: bool = False, + return_hypotheses: bool = False, + ) -> List[str]: + hypotheses = self.transcribe(paths2audio_files, batch_size, logprobs, return_hypotheses) + return hypotheses + @torch.no_grad() def transcribe( self, @@ -441,11 +452,13 @@ def forward( if self.use_transf_encoder: enc_states = self.transf_encoder(encoder_states=enc_states, encoder_mask=enc_mask) - dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype) - dec_states = self.transf_decoder( - input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask - ) - transf_log_probs = self.log_softmax(hidden_states=dec_states) + transf_log_probs = None + if transcript is not None: + dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype) + dec_states = self.transf_decoder( + input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask + ) + transf_log_probs = self.log_softmax(hidden_states=dec_states) return transf_log_probs, encoded_len, enc_states, enc_mask From 66fdfcbd8491d754176aba9affd08456eb72a729 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jul 2023 22:33:24 +0000 Subject: [PATCH 14/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/asr/speech_translation/translate_speech.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py index 64dfe7dcf321..203852b52ee9 100644 --- a/examples/asr/speech_translation/translate_speech.py +++ b/examples/asr/speech_translation/translate_speech.py @@ -13,8 +13,8 @@ # limitations under the License. 
import contextlib -import os import json +import os from dataclasses import dataclass, is_dataclass from typing import List, Optional, Union @@ -23,11 +23,7 @@ from omegaconf import OmegaConf from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig -from nemo.collections.asr.parts.utils.transcribe_utils import ( - compute_output_filename, - prepare_audio_data, - setup_model, -) +from nemo.collections.asr.parts.utils.transcribe_utils import compute_output_filename, prepare_audio_data, setup_model from nemo.core.config import hydra_runner from nemo.utils import logging
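
A note on the `init_from_nemo_model` move in fast-conformer_transformer.yaml: the block now sits at the top level of the config because NeMo's `ModelPT.maybe_init_from_pretrained_checkpoint` reads that key from the root of the Hydra config rather than from under `model`. The sketch below shows how a training script typically consumes it; it is illustrative only — the checkpoint path is a placeholder and the actual body of speech_to_text_transformer.py may differ.

    import pytorch_lightning as pl

    from nemo.collections.asr.models import EncDecTransfModelBPE
    from nemo.core.config import hydra_runner
    from nemo.utils.exp_manager import exp_manager


    @hydra_runner(config_path="../conf/speech_translation/", config_name="fast-conformer_transformer")
    def main(cfg):
        trainer = pl.Trainer(**cfg.trainer)
        exp_manager(trainer, cfg.get("exp_manager", None))

        st_model = EncDecTransfModelBPE(cfg=cfg.model, trainer=trainer)

        # Copies only the "preprocessor" and "encoder" weights from the .nemo file given in
        # cfg.init_from_nemo_model.model0.path; the Transformer decoder stays randomly initialized.
        st_model.maybe_init_from_pretrained_checkpoint(cfg)

        trainer.fit(st_model)


    if __name__ == '__main__':
        main()  # noqa pylint: disable=no-value-for-parameter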
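
The `translate()` method added to `EncDecTransfModelBPE` is a thin `torch.no_grad()` wrapper around `transcribe()`: the FastConformer encoder runs once per batch and the Transformer decoder then beam-search decodes the encoder states. A minimal inference sketch, assuming a trained checkpoint and two 16 kHz mono wav files exist at the placeholder paths shown:

    import torch

    from nemo.collections.asr.models import EncDecTransfModelBPE

    # Restore a trained speech translation checkpoint (placeholder filename).
    model = EncDecTransfModelBPE.restore_from("st_fastconformer_transformer.nemo")
    model = model.eval().to("cuda" if torch.cuda.is_available() else "cpu")

    # translate() forwards its arguments to transcribe() and returns one translation string per input file.
    translations = model.translate(
        paths2audio_files=["sample_0.wav", "sample_1.wav"], batch_size=2,
    )
    print(translations)

A related design note: replacing the pass-through adapter lambda with `torch.nn.Identity()` registers the adapter in the module tree like any other submodule, so the model remains picklable and module traversal (e.g. `.to(device)`, scripting, checkpoint save/restore) treats the no-projection case uniformly, which a lambda stored as an attribute would not allow.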