From ed71fbf53e503a4460aec0e128fc10300f7d01b8 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Mon, 3 Jul 2023 16:56:55 -0700 Subject: [PATCH 01/14] st standalone model Signed-off-by: AlexGrinch --- .../speech_translation_transf.yaml | 235 +++++++ .../speech_to_text_transf.py | 78 +++ nemo/collections/asr/models/__init__.py | 1 + .../asr/models/transformer_bpe_models.py | 611 ++++++++++++++++++ nemo/collections/asr/parts/mixins/mixins.py | 36 +- .../tokenizers/sentencepiece_tokenizer.py | 2 +- 6 files changed, 958 insertions(+), 5 deletions(-) create mode 100644 examples/asr/conf/transformer_dec/speech_translation_transf.yaml create mode 100644 examples/asr/speech_translation/speech_to_text_transf.py create mode 100644 nemo/collections/asr/models/transformer_bpe_models.py diff --git a/examples/asr/conf/transformer_dec/speech_translation_transf.yaml b/examples/asr/conf/transformer_dec/speech_translation_transf.yaml new file mode 100644 index 000000000000..c7bfe739b46c --- /dev/null +++ b/examples/asr/conf/transformer_dec/speech_translation_transf.yaml @@ -0,0 +1,235 @@ +# It contains the default values for training a Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. + +# Architecture and training config: +# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective +# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. +# Here are the recommended configs for different variants of Conformer-CTC, other parameters are the same as in this config file. +# One extra layer (compared to original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one. +# +# +-------------+---------+---------+----------+------------+-----+ +# | Model | d_model | n_heads | n_layers | time_masks | lr | +# +=============+=========+========+===========+============+=====+ +# | Small (13M)| 176 | 4 | 16 | 5 | 5.0 | +# +-------------+---------+--------+-----------+------------+-----+ +# | Medium (30M)| 256 | 4 | 18 | 5 | 5.0 | +# +-------------+---------+--------+-----------+------------+-----+ +# | Large (121M)| 512 | 8 | 18 | 10 | 2.0 | +# +---------------------------------------------------------------+ +# +# If you do not want to train with AMP, you may use weight decay of 0.0 or reduce the number of time maskings to 2 +# with time_width=100. It may help when you want to train for fewer epochs and need faster convergence. +# With weight_decay=0.0, learning rate may need to get reduced to 2.0. + +# You may find more info about Conformer-CTC here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-ctc +# Pre-trained models of Conformer-CTC can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html +# The checkpoint of the large model trained on LibriSpeech with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls + +name: "Conformer-Transformer-BPE-st" + +model: + sample_rate: 16000 + label_smoothing: 0.0 + log_prediction: true # enables logging sample predictions in the output during training + + train_ds: + is_tarred: true + tarred_audio_filepaths: ??? + manifest_filepath: ??? + sample_rate: 16000 + shuffle: false + trim_silence: false + batch_size: 4 + num_workers: 8 + + validation_ds: + manifest_filepath: ??? 
+ sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: false + num_workers: 4 + pin_memory: true + use_start_end_token: true + + test_ds: + manifest_filepath: ??? + sample_rate: ${model.sample_rate} + batch_size: 16 # you may increase batch_size if your memory allows + shuffle: false + num_workers: 4 + pin_memory: true + use_start_end_token: true + + # recommend small vocab size of 128 or 256 when using 4x sub-sampling + # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "per_feature" + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + pad_value: 0.0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + # you may use lower time_masks for smaller models to have a faster convergence + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + + # Sub-sampling params + subsampling: dw_striding # vggnet or striding, vggnet may give better results but needs more memory + subsampling_factor: 8 # must be power of 2 + subsampling_conv_channels: 256 # -1 sets it to d_model + causal_downsampling: false + reduction: null + reduction_position: null + reduction_factor: 1 + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + att_context_size: [-1, -1] # -1 means unlimited context + xscaling: true # scales up the input embeddings by sqrt(d_model) + untie_biases: true # unties the biases of the TransformerXL layers + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: batch_norm + conv_context_size: null + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + transf_encoder: + num_layers: 0 + hidden_size: 512 + inner_size: 2048 + num_attention_heads: 8 + ffn_dropout: 0.1 + attn_score_dropout: 0.1 + attn_layer_dropout: 0.1 + + transf_decoder: + library: nemo + model_name: null + pretrained: false + max_sequence_length: 512 + num_token_types: 0 + embedding_dropout: 0.1 + learn_positional_encodings: false + hidden_size: 512 + inner_size: 2048 + num_layers: 6 + num_attention_heads: 4 + ffn_dropout: 0.1 + attn_score_dropout: 0.1 + attn_layer_dropout: 0.1 + hidden_act: relu + pre_ln: true + pre_ln_final_layer_norm: true + + head: + num_layers: 1 + activation: relu + log_softmax: true + dropout: 0.0 + use_transformer_init: true + + beam_search: 
+ beam_size: 4 + len_pen: 0.0 + max_generation_delta: 50 + + optim: + name: adam + lr: 0.0001 + # optimizer arguments + betas: [0.9, 0.98] + # less necessity for weight_decay as we already have large augmentations with SpecAug + # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used + # weight decay of 0.0 with lr of 2.0 also works fine + #weight_decay: 1e-3 + + # scheduler setup + sched: + name: InverseSquareRootAnnealing + #d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 1000 + warmup_ratio: null + min_lr: 1e-6 + +# Initialize model encoder with pre-trained ASR FastConformer encoder for faster convergence and improved accuracy +init_from_nemo_model: + model0: + path: ??? + include: ["preprocessor", "encoder"] + +trainer: + gpus: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: 100 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.0 + precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 100 # Interval of logging. + enable_progress_bar: True + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_sacreBLEU" + mode: "max" + save_top_k: 3 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + + # you need to set these two to True to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null \ No newline at end of file diff --git a/examples/asr/speech_translation/speech_to_text_transf.py b/examples/asr/speech_translation/speech_to_text_transf.py new file mode 100644 index 000000000000..ce3e657365a7 --- /dev/null +++ b/examples/asr/speech_translation/speech_to_text_transf.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +# Training the model +```sh +python speech_to_text_transf.py \ + # (Optional: --config-path= --config-name=) \ + model.train_ds.text.tar_files= \ + model.train_ds.text.metadata_file= \ + model.train_ds.audio.tarred_audio_filepaths= \ + model.train_ds.audio_manifest_filepath= \ + model.validation_ds.manifest_filepath= \ + model.test_ds.manifest_filepath= \ + model.tokenizer.dir= \ + model.tokenizer.model_path= \ + model.tokenizer.type= \ + model.encoder_tokenizer.tokenizer_model= \ + model.encoder_tokenizer.vocab_file= \ + model.decoder_tokenizer.tokenizer_model= \ + trainer.gpus=-1 \ + trainer.accelerator="ddp" \ + trainer.max_epochs=100 \ + model.optim.name="adamw" \ + model.optim.lr=0.001 \ + model.optim.betas=[0.9,0.999] \ + model.optim.weight_decay=0.0001 \ + model.optim.sched.warmup_steps=2000 + exp_manager.create_wandb_logger=True \ + exp_manager.wandb_logger_kwargs.name="" \ + exp_manager.wandb_logger_kwargs.project="" +``` + + +""" + +from collections import OrderedDict + +import pytorch_lightning as pl +import torch +from omegaconf import OmegaConf + +from nemo.collections.asr.models import EncDecTransfModelBPE +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="../conf/transformer_dec/", config_name="speech_translation_transf_test") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**cfg.trainer) + exp_manager(trainer, cfg.get("exp_manager", None)) + asr_model = EncDecTransfModelBPE(cfg=cfg.model, trainer=trainer) + + # Initialize the weights of the model from another model, if provided via config + asr_model.maybe_init_from_pretrained_checkpoint(cfg) + trainer.fit(asr_model) + + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if asr_model.prepare_test(trainer): + trainer.test(asr_model) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py index a7275faea3d0..34f2c4f62e29 100644 --- a/nemo/collections/asr/models/__init__.py +++ b/nemo/collections/asr/models/__init__.py @@ -33,3 +33,4 @@ from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel from nemo.collections.asr.models.slu_models import SLUIntentSlotBPEModel from nemo.collections.asr.models.ssl_models import SpeechEncDecSelfSupervisedModel +from nemo.collections.asr.models.transformer_bpe_models import EncDecTransfModelBPE diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py new file mode 100644 index 000000000000..b1a40d937a0c --- /dev/null +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -0,0 +1,611 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import itertools +import json +import os +import random +import tempfile +from math import ceil +from typing import Dict, List, Optional, Union + +import editdistance +import torch +import torch.distributed as dist +from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict +from pytorch_lightning import Trainer +from sacrebleu import corpus_bleu +from torch.utils.data import ChainDataset, DataLoader +from tqdm.auto import tqdm + +from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _speech_collate_fn +from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs +from nemo.collections.asr.losses.ctc import CTCLoss +from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel +from nemo.collections.asr.parts.features import clean_spectrogram_batch, normalize_batch +from nemo.collections.asr.parts.mixins import ASRBPEMixin +from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations +from nemo.collections.common.data import ConcatDataset +from nemo.collections.common.losses import NLLLoss, SmoothedCrossEntropyLoss +from nemo.collections.common.metrics import GlobalAverageLossMetric +from nemo.collections.common.parts import transformer_weights_init +from nemo.collections.nlp.models.machine_translation import MTEncDecModel +from nemo.collections.nlp.modules.common import TokenClassifier +from nemo.collections.nlp.modules.common.lm_utils import get_transformer +from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder +from nemo.collections.tts.models import FastPitchModel, SpectrogramEnhancerModel +from nemo.core.classes.common import PretrainedModelInfo, typecheck +from nemo.core.neural_types import ( + AudioSignal, + ChannelType, + LabelsType, + LengthsType, + LogprobsType, + MaskType, + NeuralType, + SpectrogramType, +) +from nemo.utils import logging + +__all__ = ['EncDecTransfModelBPE'] + + +def lens_to_mask(lens, max_length): + batch_size = lens.shape[0] + mask = torch.arange(max_length).repeat(batch_size, 1).to(lens.device) < lens[:, None] + return mask + + +class EncDecTransfModelBPE(ASRModel, ExportableEncDecModel, ASRBPEMixin): + """Base class for encoder decoder CTC-based models.""" + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + + if 'tokenizer' not in cfg: + raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") + + # Setup the tokenizer + self._setup_tokenizer(cfg.tokenizer) + + super().__init__(cfg=cfg, trainer=trainer) + + # Setup audio preprocessor + self.preprocessor = EncDecTransfModelBPE.from_config_dict(self._cfg.preprocessor) + + # Setup audio encoder + self.encoder = EncDecTransfModelBPE.from_config_dict(self._cfg.encoder) + + # Add projection layer if encoder and decoder differ in hidden size + if self._cfg.encoder['d_model'] != self._cfg.transf_decoder['hidden_size']: + self.adapter = torch.nn.Linear(self._cfg.encoder['d_model'], self._cfg.transf_decoder['hidden_size']) + else: + self.adapter = lambda x: x + + transf_encoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_encoder')) + + # Whether to add Transformer Encoder block between Conformer and Transformer Decoder + self.use_transf_encoder = False + if transf_encoder_cfg_dict['num_layers'] > 0: + self.use_transf_encoder = True + + self.transf_encoder = TransformerEncoder( + 
num_layers=transf_encoder_cfg_dict['num_layers'], + hidden_size=transf_encoder_cfg_dict['hidden_size'], + inner_size=transf_encoder_cfg_dict['inner_size'], + mask_future=False, + num_attention_heads=transf_encoder_cfg_dict['num_attention_heads'], + attn_score_dropout=transf_encoder_cfg_dict['attn_score_dropout'], + attn_layer_dropout=transf_encoder_cfg_dict['attn_layer_dropout'], + ffn_dropout=transf_encoder_cfg_dict['ffn_dropout'], + pre_ln=transf_encoder_cfg_dict.get('pre_ln', True), + pre_ln_final_layer_norm=transf_encoder_cfg_dict.get('pre_ln_final_layer_norm', True), + ) + std_init_range = 1 / transf_encoder_cfg_dict['hidden_size'] ** 0.5 + self.transf_encoder.apply(lambda module: transformer_weights_init(module, std_init_range)) + + transf_decoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_decoder')) + + # Transformer decoder + vocab_size = 8 * ceil(self.tokenizer.vocab_size / 8) + transf_decoder_cfg_dict['vocab_size'] = vocab_size + library = transf_decoder_cfg_dict.pop('library', 'nemo') + model_name = transf_decoder_cfg_dict.pop('model_name', None) + pretrained = transf_decoder_cfg_dict.pop('pretrained', False) + checkpoint_file = transf_decoder_cfg_dict.pop('checkpoint_file', None) + self.transf_decoder = get_transformer( + library=library, + model_name=model_name, + pretrained=pretrained, + config_dict=transf_decoder_cfg_dict, + encoder=False, + pre_ln_final_layer_norm=transf_decoder_cfg_dict.get("pre_ln_final_layer_norm", False), + ) + + self.log_softmax = TokenClassifier( + hidden_size=self.transf_decoder.hidden_size, + num_classes=vocab_size, + activation=self._cfg.head.activation, + log_softmax=self._cfg.head.log_softmax, + dropout=self._cfg.head.dropout, + use_transformer_init=self._cfg.head.use_transformer_init, + ) + self.log_softmax.mlp.layer0.weight = self.transf_decoder.embedding.token_embedding.weight + std_init_range = 1 / self.transf_decoder.hidden_size ** 0.5 + self.transf_decoder.apply(lambda module: transformer_weights_init(module, std_init_range)) + self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range)) + + # Beam Search decoding + self.beam_search = BeamSearchSequenceGenerator( + embedding=self.transf_decoder.embedding, + decoder=self.transf_decoder.decoder, + log_softmax=self.log_softmax, + max_sequence_length=self.transf_decoder.max_sequence_length, + beam_size=self._cfg.beam_search.beam_size, + bos=self.tokenizer.bos_id, + pad=self.tokenizer.pad_id, + eos=self.tokenizer.eos_id, + len_pen=self._cfg.beam_search.len_pen, + max_delta_length=self._cfg.beam_search.max_generation_delta, + ) + + # Define autoregressive CE loss + self.transf_loss = SmoothedCrossEntropyLoss( + pad_id=self.tokenizer.pad_id, label_smoothing=self._cfg.label_smoothing + ) + + if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None: + self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self._cfg.spec_augment) + else: + self.spec_augmentation = None + + self.val_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True) + + @torch.no_grad() + def transcribe( + self, + paths2audio_files: List[str], + batch_size: int = 4, + logprobs: bool = False, + return_hypotheses: bool = False, + ) -> List[str]: + """ + Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping. + Args: + paths2audio_files: (a list) of paths to audio files. \ + Recommended length per file is between 5 and 25 seconds. 
\ + But it is possible to pass a few hours long file if enough GPU memory is available. + batch_size: (int) batch size to use during inference. + Bigger will result in better throughput performance but would use more memory. + logprobs: (bool) pass True to get log probabilities instead of transcripts. + return_hypotheses: (bool) Either return hypotheses or text + With hypotheses can do some postprocessing like getting timestamp or rescoring + Returns: + A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files + """ + if paths2audio_files is None or len(paths2audio_files) == 0: + return {} + + if return_hypotheses and logprobs: + raise ValueError( + "Either `return_hypotheses` or `logprobs` can be True at any given time." + "Returned hypotheses will contain the logprobs." + ) + + # We will store transcriptions here + hypotheses = [] + + # Model's mode and device + mode = self.training + device = next(self.parameters()).device + dither_value = self.preprocessor.featurizer.dither + pad_to_value = self.preprocessor.featurizer.pad_to + + try: + self.preprocessor.featurizer.dither = 0.0 + self.preprocessor.featurizer.pad_to = 0 + # Switch model to evaluation mode + self.eval() + # Freeze the encoder and decoder modules + self.encoder.freeze() + self.transf_decoder.freeze() + logging_level = logging.get_verbosity() + logging.set_verbosity(logging.WARNING) + # Work in tmp directory - will store manifest file there + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp: + for audio_file in paths2audio_files: + entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'} + fp.write(json.dumps(entry) + '\n') + + config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir} + + temporary_datalayer = self._setup_transcribe_dataloader(config) + for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): + ctc_lp, _, encoded_len, predictions, enc_states, enc_mask = self.forward( + input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) + ) + + beam_hypotheses = ( + self.beam_search( + encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False + ) + .detach() + .cpu() + .numpy() + ) + beam_hypotheses = [self.tokenizer.ids_to_text(hyp) for hyp in beam_hypotheses] + + if return_hypotheses: + # dump log probs per file + for idx in range(logits.shape[0]): + current_hypotheses[idx].y_sequence = logits[idx][: logits_len[idx]] + + hypotheses += beam_hypotheses + + del test_batch + finally: + # set mode back to its original value + self.train(mode=mode) + self.preprocessor.featurizer.dither = dither_value + self.preprocessor.featurizer.pad_to = pad_to_value + if mode is True: + self.encoder.unfreeze() + self.transf_decoder.unfreeze() + logging.set_verbosity(logging_level) + + return hypotheses + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + + dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( + config=config, + local_rank=self.local_rank, + global_rank=self.global_rank, + world_size=self.world_size, + tokenizer=self.tokenizer, + preprocessor_cfg=self.cfg.get("preprocessor", None), + ) + + if dataset is None: + return None + + shuffle = config['shuffle'] + if config.get('is_tarred', False): + shuffle = False + + if hasattr(dataset, 'collate_fn'): + collate_fn = dataset.collate_fn + else: + collate_fn = dataset.datasets[0].collate_fn + + return 
torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=shuffle, + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + + def setup_training_data(self, train_data_config: Optional[DictConfig]): + + # create audio-only data loader + self._update_dataset_config(dataset_name='train', config=train_data_config) + self._train_dl = self._setup_dataloader_from_config(config=train_data_config) + + # Need to set this because if using an IterableDataset, the length of the + # dataloader is the total number of samples rather than the number of batches, + # and this messes up the tqdm progress bar. So we set the number of steps manually + # (to the correct number) to fix this. + if 'is_tarred' in train_data_config and train_data_config['is_tarred']: + # We also need to check if limit_train_batches is already set. + # If it's an int, we assume that the user has set it to something sane, + # i.e. <= # training batches, and don't change it. Otherwise, adjust + # batches accordingly if it's a float (including 1.0). + if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float): + self._trainer.limit_train_batches = int( + self._trainer.limit_train_batches + * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size']) + ) + elif self._trainer is None: + logging.warning( + "Model Trainer was not set before constructing the dataset, incorrect number of " + "training batches will be used. Please set the trainer and rebuild the dataset." + ) + + def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the validation data loader via a Dict-like object. + Args: + val_data_config: A config that contains the information regarding construction + of an ASR Training dataset. + Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` + """ + if 'shuffle' not in val_data_config: + val_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='validation', config=val_data_config) + self._validation_dl = self._setup_dataloader_from_config(config=val_data_config) + + def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): + """ + Sets up the test data loader via a Dict-like object. + Args: + test_data_config: A config that contains the information regarding construction + of an ASR Training dataset. 
+ Supported Datasets: + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset` + - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset` + - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset` + """ + if 'shuffle' not in test_data_config: + test_data_config['shuffle'] = False + + # preserve config + self._update_dataset_config(dataset_name='test', config=test_data_config) + self._test_dl = self._setup_dataloader_from_config(config=test_data_config) + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + if hasattr(self.preprocessor, '_sample_rate'): + input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) + else: + input_signal_eltype = AudioSignal() + return { + "input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), + "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), + "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "transcript": NeuralType(('B', 'T'), LabelsType(), optional=True), + "transcript_length": NeuralType(tuple('B'), LengthsType(), optional=True), + "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "transf_log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()), + "encoded_lengths": NeuralType(tuple('B'), LengthsType()), + "encoder_states": NeuralType(('B', 'T', 'D'), ChannelType()), + "encoder_mask": NeuralType(('B', 'T'), MaskType()), + } + + @typecheck() + def forward( + self, + input_signal=None, + input_signal_length=None, + processed_signal=None, + processed_signal_length=None, + transcript=None, + transcript_length=None, + ): + """ + Forward pass of the model. + Args: + input_signal: Tensor that represents a batch of raw audio signals, + of shape [B, T]. T here represents timesteps, with 1 second of audio represented as + `self.sample_rate` number of floating point values. + input_signal_length: Vector of length B, that contains the individual lengths of the audio + sequences. + processed_signal: Tensor that represents a batch of processed audio signals, + of shape (B, D, T) that has undergone processing via some DALI preprocessor. + processed_signal_length: Vector of length B, that contains the individual lengths of the + processed audio sequences. + Returns: + A tuple of 3 elements - + 1) The log probabilities tensor of shape [B, T, D]. + 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. + 3) The greedy token predictions of the model of shape [B, T] (via argmax) + """ + has_input_signal = input_signal is not None and input_signal_length is not None + has_processed_signal = processed_signal is not None and processed_signal_length is not None + if (has_input_signal ^ has_processed_signal) == False: + raise ValueError( + f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " + " with ``processed_signal`` and ``processed_signal_len`` arguments." 
+ ) + + if not has_processed_signal: + processed_signal, processed_signal_length = self.preprocessor( + input_signal=input_signal, length=input_signal_length + ) + + if self.spec_augmentation is not None and self.training: + processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) + + encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) + + enc_states = encoded.permute(0, 2, 1) + enc_states = self.adapter(enc_states) + enc_mask = lens_to_mask(encoded_len, enc_states.shape[1]).to(enc_states.dtype) + if self.use_transf_encoder: + enc_states = self.transf_encoder(encoder_states=enc_states, encoder_mask=enc_mask) + + dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype) + dec_states = self.transf_decoder( + input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask + ) + transf_log_probs = self.log_softmax(hidden_states=dec_states) + + return transf_log_probs, encoded_len, enc_states, enc_mask + + def compute_audio_loss(self, batch): + + if batch is None: + return 0 + + signal, signal_len, transcript, transcript_len = batch + input_ids, labels = transcript[:, :-1], transcript[:, 1:] + batch_size = signal.shape[0] + + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=signal, + input_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + + transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) + + return transf_loss + + # PTL-specific methods + def training_step(self, batch, batch_nb): + + audio_loss = self.compute_audio_loss(batch) + + tensorboard_logs = { + 'train_loss': audio_loss, + 'learning_rate': self._optimizer.param_groups[0]['lr'], + } + + if hasattr(self, '_trainer') and self._trainer is not None: + log_every_n_steps = self._trainer.log_every_n_steps + else: + log_every_n_steps = 1 + + return {'loss': audio_loss, 'log': tensorboard_logs} + + def validation_step(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): + signal, signal_len, transcript, transcript_len = batch + input_ids, labels = transcript[:, :-1], transcript[:, 1:] + + if isinstance(batch, DALIOutputs) and batch.has_processed_signal: + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + processed_signal=signal, + processed_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + else: + transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( + input_signal=signal, + input_signal_length=signal_len, + transcript=input_ids, + transcript_length=transcript_len, + ) + + beam_hypotheses = self.beam_search( + encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False + ) + transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels) + + ground_truths = [self.tokenizer.ids_to_text(sent) for sent in transcript.detach().cpu().tolist()] + translations = [self.tokenizer.ids_to_text(sent) for sent in beam_hypotheses.detach().cpu().tolist()] + + self.val_loss(loss=transf_loss, num_measurements=transf_log_probs.shape[0] * transf_log_probs.shape[1]) + + return {f'{eval_mode}_loss': transf_loss, 'translations': translations, 'ground_truths': ground_truths} + + def test_step(self, batch, batch_idx, dataloader_idx=0): + return self.validation_step(batch, batch_idx, dataloader_idx, eval_mode="test") + + def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0, eval_mode: 
str = "val"): + """ + Called at the end of validation to aggregate outputs. + :param outputs: list of individual outputs of each validation step. + """ + if not outputs: + return + + if isinstance(outputs[0], dict): + outputs = [outputs] + + for output in outputs: + eval_loss = getattr(self, 'val_loss').compute() + translations = list(itertools.chain(*[x['translations'] for x in output])) + ground_truths = list(itertools.chain(*[x['ground_truths'] for x in output])) + + # Gather translations and ground truths from all workers + tr_and_gt = [None for _ in range(self.world_size)] + # we also need to drop pairs where ground truth is an empty string + if self.world_size > 1: + dist.all_gather_object( + tr_and_gt, [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] + ) + else: + tr_and_gt[0] = [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != ''] + + if self.global_rank == 0: + _translations = [] + _ground_truths = [] + for rank in range(0, self.world_size): + _translations += [t for (t, g) in tr_and_gt[rank]] + _ground_truths += [g for (t, g) in tr_and_gt[rank]] + + sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="13a") + sb_score = sacre_bleu.score * self.world_size + + wer_scores, wer_words = 0, 0 + for h, r in zip(_translations, _ground_truths): + wer_words += len(r.split()) + wer_scores += editdistance.eval(h.split(), r.split()) + wer_score = 1.0 * wer_scores * self.world_size / wer_words + + else: + sb_score = 0.0 + wer_score = 0.0 + + self.log(f"{eval_mode}_loss", eval_loss, sync_dist=True) + self.log(f"{eval_mode}_sacreBLEU", sb_score, sync_dist=True) + self.log(f"{eval_mode}_WER", wer_score, sync_dist=True) + self.val_loss.reset() + + def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_validation_epoch_end(outputs, dataloader_idx, eval_mode="test") + + def test_dataloader(self): + if self._test_dl is not None: + return self._test_dl + + def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': + """ + Setup function for a temporary data loader which wraps the provided audio file. + Args: + config: A python dictionary which contains the following keys: + paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ + Recommended length per file is between 5 and 25 seconds. + batch_size: (int) batch size to use during inference. \ + Bigger will result in better throughput performance but would use more memory. + temp_dir: (str) A temporary directory where the audio manifest is temporarily + stored. + Returns: + A pytorch DataLoader for the given audio file(s). 
+ """ + batch_size = min(config['batch_size'], len(config['paths2audio_files'])) + dl_config = { + 'manifest_filepath': os.path.join(config['temp_dir'], 'manifest.json'), + 'sample_rate': self.preprocessor._sample_rate, + 'batch_size': batch_size, + 'trim_silence': False, + 'shuffle': False, + 'num_workers': min(batch_size, os.cpu_count() - 1), + 'pin_memory': True, + } + + temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) + return temporary_datalayer diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index 4c43960ac9d2..bcb62dc409b7 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -83,7 +83,7 @@ def _setup_monolingual_tokenizer(self, tokenizer_cfg: DictConfig): with open_dict(self.cfg.tokenizer): self.cfg.tokenizer.hf_kwargs = tokenizer_cfg.get('hf_kwargs') - if self.tokenizer_type not in ['bpe', 'wpe']: + if self.tokenizer_type not in ['bpe', 'wpe', 'yttm']: raise ValueError( "`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or " "`wpe` for BERT based tokenizer" @@ -140,7 +140,7 @@ def get_vocab(): self.tokenizer.tokenizer.get_vocab = get_vocab self.tokenizer.tokenizer.all_special_tokens = self.tokenizer.special_token_to_id - else: + elif self.tokenizer_type == 'wpe': # This is a WPE Tokenizer # If path from previous registration exists, remove it if 'vocab_path' in self.tokenizer_cfg: @@ -166,6 +166,20 @@ def get_vocab(): unk_token=self.hf_tokenizer_kwargs.get('unk_token', None), use_fast=self.hf_tokenizer_kwargs.get('use_fast', False), ) + else: + # This is a YouTokenToMe BPE Tokenizer + self.tokenizer = tokenizers.YouTokenToMeTokenizer(model_path=self.tokenizer_cfg.get('model_path')) + + vocabulary = {} + for i, piece in enumerate(self.tokenizer.tokenizer.vocab()): + vocabulary[piece] = i + + # wrapper method to get vocabulary conveniently + def get_vocab(): + return vocabulary + + self.tokenizer.tokenizer.vocab_size = len(vocabulary) + self.tokenizer.tokenizer.get_vocab = get_vocab logging.info( "Tokenizer {} initialized with {} tokens".format( @@ -221,7 +235,7 @@ def _make_tokenizer(self, tokenizer_cfg: DictConfig, lang=None): tokenizer_type = tokenizer_cfg.get('type').lower() tokenizer_dir = tokenizer_cfg.get('dir') - if tokenizer_type not in ['bpe', 'wpe']: + if tokenizer_type not in ['bpe', 'wpe', 'yttm']: raise ValueError( '`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or' '`wpe` for BERT based tokenizer' ) @@ -289,7 +303,7 @@ def get_vocab(): tokenizer.tokenizer.get_vocab = get_vocab tokenizer.tokenizer.all_special_tokens = tokenizer.special_token_to_id - else: + elif tokenizer_type == 'wpe': # This is a WPE Tokenizer # If path from previous registration exists, remove it if 'vocab_path' in tokenizer_cfg: @@ -318,6 +332,20 @@ def get_vocab(): unk_token=hf_tokenizer_kwargs.get('unk_token', None), use_fast=hf_tokenizer_kwargs.get('use_fast', False), ) + else: + # This is a YouTokenToMe BPE Tokenizer + self.tokenizer = tokenizers.YouTokenToMeTokenizer(model_path=self.tokenizer_cfg.get('model_path')) + + vocabulary = {} + for i, piece in enumerate(self.tokenizer.tokenizer.vocab()): + vocabulary[piece] = i + + # wrapper method to get vocabulary conveniently + def get_vocab(): + return vocabulary + + self.tokenizer.tokenizer.vocab_size = len(vocabulary) + self.tokenizer.tokenizer.get_vocab = get_vocab logging.info( 'Tokenizer {} initialized with {} tokens'.format(tokenizer.__class__.__name__, 
tokenizer.vocab_size) diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py index 0ab0cb784273..906154213ea1 100644 --- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py @@ -299,7 +299,7 @@ def create_spt_model( byte_fallback: If , fallback to a byte sequence of the character. split_digits: If true, digits are split into individual tokens. split_by_whitespace: Whether to respect white space while creating subwords. If False, will learn merges across whitespace. - split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ) + split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ) """ if not data_file or not os.path.exists(data_file): From 842f2e988f287a717467a75d0438c492ba53f53b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 4 Jul 2023 00:03:55 +0000 Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/asr/models/transformer_bpe_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index b1a40d937a0c..6d86fa221d7b 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -82,10 +82,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self._setup_tokenizer(cfg.tokenizer) super().__init__(cfg=cfg, trainer=trainer) - + # Setup audio preprocessor self.preprocessor = EncDecTransfModelBPE.from_config_dict(self._cfg.preprocessor) - + # Setup audio encoder self.encoder = EncDecTransfModelBPE.from_config_dict(self._cfg.encoder) From 10019f8ded09838ca00135018fa2ec1f1e9e0f52 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Mon, 3 Jul 2023 17:04:27 -0700 Subject: [PATCH 03/14] style fix Signed-off-by: AlexGrinch --- nemo/collections/asr/models/transformer_bpe_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index b1a40d937a0c..6d86fa221d7b 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -82,10 +82,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self._setup_tokenizer(cfg.tokenizer) super().__init__(cfg=cfg, trainer=trainer) - + # Setup audio preprocessor self.preprocessor = EncDecTransfModelBPE.from_config_dict(self._cfg.preprocessor) - + # Setup audio encoder self.encoder = EncDecTransfModelBPE.from_config_dict(self._cfg.encoder) From 6615136da757c05e32218d3348725547ca762f82 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Wed, 5 Jul 2023 14:44:28 -0700 Subject: [PATCH 04/14] sacrebleu import fix, unused imports removed Signed-off-by: AlexGrinch --- .../asr/models/transformer_bpe_models.py | 17 +++-------------- requirements/requirements_asr.txt | 1 + 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 6d86fa221d7b..0954b98d8423 
100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -12,11 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import itertools import json import os -import random import tempfile from math import ceil from typing import Dict, List, Optional, Union @@ -24,31 +22,22 @@ import editdistance import torch import torch.distributed as dist -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict +from omegaconf import DictConfig, OmegaConf from pytorch_lightning import Trainer from sacrebleu import corpus_bleu -from torch.utils.data import ChainDataset, DataLoader from tqdm.auto import tqdm from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text import _speech_collate_fn from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs -from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer_bpe import WERBPE, CTCBPEDecoding, CTCBPEDecodingConfig from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel -from nemo.collections.asr.parts.features import clean_spectrogram_batch, normalize_batch from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations -from nemo.collections.common.data import ConcatDataset -from nemo.collections.common.losses import NLLLoss, SmoothedCrossEntropyLoss +from nemo.collections.common.losses import SmoothedCrossEntropyLoss from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init -from nemo.collections.nlp.models.machine_translation import MTEncDecModel from nemo.collections.nlp.modules.common import TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder -from nemo.collections.tts.models import FastPitchModel, SpectrogramEnhancerModel -from nemo.core.classes.common import PretrainedModelInfo, typecheck +from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( AudioSignal, ChannelType, diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index 011862ad723b..a9576d894e22 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -13,6 +13,7 @@ pyannote.core pyannote.metrics pydub ruamel.yaml +sacrebleu scipy>=0.14 soundfile sox From ced657bb899db499dfd9ce4fcb9340039ac8b65e Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Wed, 5 Jul 2023 15:20:49 -0700 Subject: [PATCH 05/14] import guard for nlp inside asr transformer bpe model Signed-off-by: AlexGrinch --- .../asr/models/transformer_bpe_models.py | 14 ++++++++++---- requirements/requirements_asr.txt | 1 - 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 0954b98d8423..bb835a53b39a 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -24,7 +24,6 @@ import torch.distributed as dist from omegaconf import DictConfig, OmegaConf from pytorch_lightning import Trainer -from sacrebleu import corpus_bleu from tqdm.auto import tqdm from nemo.collections.asr.data import 
audio_to_text_dataset @@ -34,9 +33,16 @@ from nemo.collections.common.losses import SmoothedCrossEntropyLoss from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init -from nemo.collections.nlp.modules.common import TokenClassifier -from nemo.collections.nlp.modules.common.lm_utils import get_transformer -from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + +try: + from sacrebleu import corpus_bleu + from nemo.collections.nlp.modules.common import TokenClassifier + from nemo.collections.nlp.modules.common.lm_utils import get_transformer + from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + ASR_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + ASR_AVAILABLE = False + from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( AudioSignal, diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index a9576d894e22..011862ad723b 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -13,7 +13,6 @@ pyannote.core pyannote.metrics pydub ruamel.yaml -sacrebleu scipy>=0.14 soundfile sox From e91980a8a66964b3dc75895bc78d998272824e62 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Jul 2023 22:22:01 +0000 Subject: [PATCH 06/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/asr/models/transformer_bpe_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index bb835a53b39a..5656a98da16d 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -39,6 +39,7 @@ from nemo.collections.nlp.modules.common import TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + ASR_AVAILABLE = True except (ImportError, ModuleNotFoundError): ASR_AVAILABLE = False From 1a13c14d16f06e3568604c70cfd2ddeddca3e1c4 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Thu, 6 Jul 2023 13:34:49 -0700 Subject: [PATCH 07/14] codeql fixes Signed-off-by: AlexGrinch --- .../asr/speech_translation/speech_to_text_transf.py | 3 --- nemo/collections/asr/models/transformer_bpe_models.py | 11 ++--------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/examples/asr/speech_translation/speech_to_text_transf.py b/examples/asr/speech_translation/speech_to_text_transf.py index ce3e657365a7..2e349e81de4a 100644 --- a/examples/asr/speech_translation/speech_to_text_transf.py +++ b/examples/asr/speech_translation/speech_to_text_transf.py @@ -45,10 +45,7 @@ """ -from collections import OrderedDict - import pytorch_lightning as pl -import torch from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecTransfModelBPE diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index bb835a53b39a..5c74a848f088 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -39,9 +39,9 @@ from nemo.collections.nlp.modules.common import 
TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder - ASR_AVAILABLE = True + NLP_AVAILABLE = True except (ImportError, ModuleNotFoundError): - ASR_AVAILABLE = False + NLP_AVAILABLE = False from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( @@ -120,7 +120,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): library = transf_decoder_cfg_dict.pop('library', 'nemo') model_name = transf_decoder_cfg_dict.pop('model_name', None) pretrained = transf_decoder_cfg_dict.pop('pretrained', False) - checkpoint_file = transf_decoder_cfg_dict.pop('checkpoint_file', None) self.transf_decoder = get_transformer( library=library, model_name=model_name, @@ -455,7 +454,6 @@ def compute_audio_loss(self, batch): signal, signal_len, transcript, transcript_len = batch input_ids, labels = transcript[:, :-1], transcript[:, 1:] - batch_size = signal.shape[0] transf_log_probs, encoded_len, enc_states, enc_mask = self.forward( input_signal=signal, @@ -478,11 +476,6 @@ def training_step(self, batch, batch_nb): 'learning_rate': self._optimizer.param_groups[0]['lr'], } - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - else: - log_every_n_steps = 1 - return {'loss': audio_loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"): From 67a3d96d4d2ba46fd8fb936bb93cc75a9c3e030b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Jul 2023 20:39:11 +0000 Subject: [PATCH 08/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo/collections/asr/models/transformer_bpe_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 5c74a848f088..8538bd253dee 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -39,6 +39,7 @@ from nemo.collections.nlp.modules.common import TokenClassifier from nemo.collections.nlp.modules.common.lm_utils import get_transformer from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + NLP_AVAILABLE = True except (ImportError, ModuleNotFoundError): NLP_AVAILABLE = False From 069626304d44deb9acbf48f3985ff5b4ef84ae26 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Wed, 12 Jul 2023 14:53:48 -0700 Subject: [PATCH 09/14] comments answered Signed-off-by: AlexGrinch --- .../fast-conformer_transformer.yaml} | 41 ++++++------------- ...ransf.py => speech_to_text_transformer.py} | 9 +--- .../asr/models/transformer_bpe_models.py | 35 ++++++++-------- 3 files changed, 32 insertions(+), 53 deletions(-) rename examples/asr/conf/{transformer_dec/speech_translation_transf.yaml => speech_translation/fast-conformer_transformer.yaml} (79%) rename examples/asr/speech_translation/{speech_to_text_transf.py => speech_to_text_transformer.py} (84%) diff --git a/examples/asr/conf/transformer_dec/speech_translation_transf.yaml b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml similarity index 79% rename from examples/asr/conf/transformer_dec/speech_translation_transf.yaml rename to 
examples/asr/conf/speech_translation/fast-conformer_transformer.yaml index c7bfe739b46c..4e480df62e59 100644 --- a/examples/asr/conf/transformer_dec/speech_translation_transf.yaml +++ b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml @@ -1,30 +1,19 @@ -# It contains the default values for training a Conformer-CTC ASR model, large size (~120M) with CTC loss and sub-word encoding. +# It contains the default values for training an autoregressive FastConformer-Transformer ST model with sub-word encoding. # Architecture and training config: # Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective # batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches. -# Here are the recommended configs for different variants of Conformer-CTC, other parameters are the same as in this config file. -# One extra layer (compared to original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one. -# -# +-------------+---------+---------+----------+------------+-----+ -# | Model | d_model | n_heads | n_layers | time_masks | lr | -# +=============+=========+========+===========+============+=====+ -# | Small (13M)| 176 | 4 | 16 | 5 | 5.0 | -# +-------------+---------+--------+-----------+------------+-----+ -# | Medium (30M)| 256 | 4 | 18 | 5 | 5.0 | -# +-------------+---------+--------+-----------+------------+-----+ -# | Large (121M)| 512 | 8 | 18 | 10 | 2.0 | -# +---------------------------------------------------------------+ -# -# If you do not want to train with AMP, you may use weight decay of 0.0 or reduce the number of time maskings to 2 -# with time_width=100. It may help when you want to train for fewer epochs and need faster convergence. -# With weight_decay=0.0, learning rate may need to get reduced to 2.0. - -# You may find more info about Conformer-CTC here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-ctc -# Pre-trained models of Conformer-CTC can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html -# The checkpoint of the large model trained on LibriSpeech with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls - -name: "Conformer-Transformer-BPE-st" +# Here are the recommended configs for different variants of FastConformer-Transformer, other parameters are the same as in this config file. +# One extra (linear projection) layer is added between FastConformer encoder and Transformer decoder if they have different hidden sizes +# It is recommended to initialize FastConformer with ASR pre-trained encoder for better accuracy and faster convergence + +name: "FastConformer-Transformer-BPE-st" + +# Initialize model encoder with pre-trained ASR FastConformer encoder for faster convergence and improved accuracy +init_from_nemo_model: + model0: + path: ??? + include: ["preprocessor", "encoder"] model: sample_rate: 16000 @@ -186,12 +175,6 @@ model: warmup_ratio: null min_lr: 1e-6 -# Initialize model encoder with pre-trained ASR FastConformer encoder for faster convergence and improved accuracy -init_from_nemo_model: - model0: - path: ??? 
- include: ["preprocessor", "encoder"] - trainer: gpus: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 diff --git a/examples/asr/speech_translation/speech_to_text_transf.py b/examples/asr/speech_translation/speech_to_text_transformer.py similarity index 84% rename from examples/asr/speech_translation/speech_to_text_transf.py rename to examples/asr/speech_translation/speech_to_text_transformer.py index 2e349e81de4a..0c0882859b88 100644 --- a/examples/asr/speech_translation/speech_to_text_transf.py +++ b/examples/asr/speech_translation/speech_to_text_transformer.py @@ -15,10 +15,8 @@ """ # Training the model ```sh -python speech_to_text_transf.py \ +python speech_to_text_transformer.py \ # (Optional: --config-path= --config-name=) \ - model.train_ds.text.tar_files= \ - model.train_ds.text.metadata_file= \ model.train_ds.audio.tarred_audio_filepaths= \ model.train_ds.audio_manifest_filepath= \ model.validation_ds.manifest_filepath= \ @@ -26,9 +24,6 @@ model.tokenizer.dir= \ model.tokenizer.model_path= \ model.tokenizer.type= \ - model.encoder_tokenizer.tokenizer_model= \ - model.encoder_tokenizer.vocab_file= \ - model.decoder_tokenizer.tokenizer_model= \ trainer.gpus=-1 \ trainer.accelerator="ddp" \ trainer.max_epochs=100 \ @@ -54,7 +49,7 @@ from nemo.utils.exp_manager import exp_manager -@hydra_runner(config_path="../conf/transformer_dec/", config_name="speech_translation_transf_test") +@hydra_runner(config_path="../conf/speech_translation/", config_name="fast-conformer_transformer") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 8538bd253dee..79890d62e111 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -43,6 +43,7 @@ NLP_AVAILABLE = True except (ImportError, ModuleNotFoundError): NLP_AVAILABLE = False + logging.warning("Could not import NeMo NLP collection which is required for speech translation model.") from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( @@ -80,16 +81,16 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): super().__init__(cfg=cfg, trainer=trainer) # Setup audio preprocessor - self.preprocessor = EncDecTransfModelBPE.from_config_dict(self._cfg.preprocessor) + self.preprocessor = EncDecTransfModelBPE.from_config_dict(self.cfg.preprocessor) # Setup audio encoder - self.encoder = EncDecTransfModelBPE.from_config_dict(self._cfg.encoder) + self.encoder = EncDecTransfModelBPE.from_config_dict(self.cfg.encoder) # Add projection layer if encoder and decoder differ in hidden size - if self._cfg.encoder['d_model'] != self._cfg.transf_decoder['hidden_size']: - self.adapter = torch.nn.Linear(self._cfg.encoder['d_model'], self._cfg.transf_decoder['hidden_size']) + if self.cfg.encoder['d_model'] != self.cfg.transf_decoder['hidden_size']: + self.adapter = torch.nn.Linear(self.cfg.encoder['d_model'], self.cfg.transf_decoder['hidden_size']) else: - self.adapter = lambda x: x + self.adapter = torch.nn.Identity() transf_encoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_encoder')) @@ -133,10 +134,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.log_softmax = TokenClassifier( hidden_size=self.transf_decoder.hidden_size, num_classes=vocab_size, - activation=self._cfg.head.activation, - log_softmax=self._cfg.head.log_softmax, - dropout=self._cfg.head.dropout, - 
use_transformer_init=self._cfg.head.use_transformer_init, + activation=self.cfg.head.activation, + log_softmax=self.cfg.head.log_softmax, + dropout=self.cfg.head.dropout, + use_transformer_init=self.cfg.head.use_transformer_init, ) self.log_softmax.mlp.layer0.weight = self.transf_decoder.embedding.token_embedding.weight std_init_range = 1 / self.transf_decoder.hidden_size ** 0.5 @@ -149,21 +150,21 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): decoder=self.transf_decoder.decoder, log_softmax=self.log_softmax, max_sequence_length=self.transf_decoder.max_sequence_length, - beam_size=self._cfg.beam_search.beam_size, + beam_size=self.cfg.beam_search.beam_size, bos=self.tokenizer.bos_id, pad=self.tokenizer.pad_id, eos=self.tokenizer.eos_id, - len_pen=self._cfg.beam_search.len_pen, - max_delta_length=self._cfg.beam_search.max_generation_delta, + len_pen=self.cfg.beam_search.len_pen, + max_delta_length=self.cfg.beam_search.max_generation_delta, ) # Define autoregressive CE loss self.transf_loss = SmoothedCrossEntropyLoss( - pad_id=self.tokenizer.pad_id, label_smoothing=self._cfg.label_smoothing + pad_id=self.tokenizer.pad_id, label_smoothing=self.cfg.label_smoothing ) - if hasattr(self._cfg, 'spec_augment') and self._cfg.spec_augment is not None: - self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self._cfg.spec_augment) + if hasattr(self.cfg, 'spec_augment') and self.cfg.spec_augment is not None: + self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self.cfg.spec_augment) else: self.spec_augmentation = None @@ -230,7 +231,7 @@ def transcribe( temporary_datalayer = self._setup_transcribe_dataloader(config) for test_batch in tqdm(temporary_datalayer, desc="Transcribing"): - ctc_lp, _, encoded_len, predictions, enc_states, enc_mask = self.forward( + log_probs, encoded_len, enc_states, enc_mask = self.forward( input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device) ) @@ -251,7 +252,7 @@ def transcribe( hypotheses += beam_hypotheses - del test_batch + del test_batch, log_probs, encoded_len, enc_states, enc_mask finally: # set mode back to its original value self.train(mode=mode) From 249f312bafe422ef6379c3d3f93b7c1f768807de Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Wed, 12 Jul 2023 15:02:19 -0700 Subject: [PATCH 10/14] import ordering fix Signed-off-by: AlexGrinch --- .../asr/models/transformer_bpe_models.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 79890d62e111..840a86d47150 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -34,17 +34,6 @@ from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init -try: - from sacrebleu import corpus_bleu - from nemo.collections.nlp.modules.common import TokenClassifier - from nemo.collections.nlp.modules.common.lm_utils import get_transformer - from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder - - NLP_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - NLP_AVAILABLE = False - logging.warning("Could not import NeMo NLP collection which is required for speech translation model.") - from nemo.core.classes.common import typecheck from nemo.core.neural_types import ( AudioSignal, @@ -58,6 +47,17 @@ ) from nemo.utils 
import logging +try: + from sacrebleu import corpus_bleu + from nemo.collections.nlp.modules.common import TokenClassifier + from nemo.collections.nlp.modules.common.lm_utils import get_transformer + from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder + + NLP_AVAILABLE = True +except (ImportError, ModuleNotFoundError): + NLP_AVAILABLE = False + logging.warning("Could not import NeMo NLP collection which is required for speech translation model.") + __all__ = ['EncDecTransfModelBPE'] From c2c00d4eb0ba9fe0e3c716ae79bc99b93d7e3d73 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Thu, 13 Jul 2023 15:19:41 -0700 Subject: [PATCH 11/14] yttm for asr removed Signed-off-by: AlexGrinch --- nemo/collections/asr/parts/mixins/mixins.py | 40 +++------------------ 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index bcb62dc409b7..1fa591c61abc 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -83,7 +83,7 @@ def _setup_monolingual_tokenizer(self, tokenizer_cfg: DictConfig): with open_dict(self.cfg.tokenizer): self.cfg.tokenizer.hf_kwargs = tokenizer_cfg.get('hf_kwargs') - if self.tokenizer_type not in ['bpe', 'wpe', 'yttm']: + if self.tokenizer_type not in ['bpe', 'wpe']: raise ValueError( "`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or " "`wpe` for BERT based tokenizer" @@ -140,7 +140,7 @@ def get_vocab(): self.tokenizer.tokenizer.get_vocab = get_vocab self.tokenizer.tokenizer.all_special_tokens = self.tokenizer.special_token_to_id - elif self.tokenizer_type == 'wpe': + else: # This is a WPE Tokenizer # If path from previous registration exists, remove it if 'vocab_path' in self.tokenizer_cfg: @@ -166,20 +166,6 @@ def get_vocab(): unk_token=self.hf_tokenizer_kwargs.get('unk_token', None), use_fast=self.hf_tokenizer_kwargs.get('use_fast', False), ) - else: - # This is a YouTokenToMe BPE Tokenizer - self.tokenizer = tokenizers.YouTokenToMeTokenizer(model_path=self.tokenizer_cfg.get('model_path')) - - vocabulary = {} - for i, piece in enumerate(self.tokenizer.tokenizer.vocab()): - vocabulary[piece] = i - - # wrapper method to get vocabulary conveniently - def get_vocab(): - return vocabulary - - self.tokenizer.tokenizer.vocab_size = len(vocabulary) - self.tokenizer.tokenizer.get_vocab = get_vocab logging.info( "Tokenizer {} initialized with {} tokens".format( @@ -235,7 +221,7 @@ def _make_tokenizer(self, tokenizer_cfg: DictConfig, lang=None): tokenizer_type = tokenizer_cfg.get('type').lower() tokenizer_dir = tokenizer_cfg.get('dir') - if tokenizer_type not in ['bpe', 'wpe', 'yttm']: + if tokenizer_type not in ['bpe', 'wpe']: raise ValueError( '`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or' '`wpe` for BERT based tokenizer' ) @@ -303,7 +289,7 @@ def get_vocab(): tokenizer.tokenizer.get_vocab = get_vocab tokenizer.tokenizer.all_special_tokens = tokenizer.special_token_to_id - elif tokenizer_type == 'wpe': + else: # This is a WPE Tokenizer # If path from previous registration exists, remove it if 'vocab_path' in tokenizer_cfg: @@ -332,24 +318,6 @@ def get_vocab(): unk_token=hf_tokenizer_kwargs.get('unk_token', None), use_fast=hf_tokenizer_kwargs.get('use_fast', False), ) - else: - # This is a YouTokenToMe BPE Tokenizer - self.tokenizer = tokenizers.YouTokenToMeTokenizer(model_path=self.tokenizer_cfg.get('model_path')) - - vocabulary = {} - for i, piece 
in enumerate(self.tokenizer.tokenizer.vocab()): - vocabulary[piece] = i - - # wrapper method to get vocabulary conveniently - def get_vocab(): - return vocabulary - - self.tokenizer.tokenizer.vocab_size = len(vocabulary) - self.tokenizer.tokenizer.get_vocab = get_vocab - - logging.info( - 'Tokenizer {} initialized with {} tokens'.format(tokenizer.__class__.__name__, tokenizer.vocab_size) - ) return tokenizer, model_path, vocab_path, spe_vocab_path From 66d428f404cef704402d7cdb0868ce8cfcb84937 Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Thu, 13 Jul 2023 16:06:33 -0700 Subject: [PATCH 12/14] logging added Signed-off-by: AlexGrinch --- nemo/collections/asr/parts/mixins/mixins.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index 1fa591c61abc..4c43960ac9d2 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -319,6 +319,10 @@ def get_vocab(): use_fast=hf_tokenizer_kwargs.get('use_fast', False), ) + logging.info( + 'Tokenizer {} initialized with {} tokens'.format(tokenizer.__class__.__name__, tokenizer.vocab_size) + ) + return tokenizer, model_path, vocab_path, spe_vocab_path def _cleanup_monolingual_and_aggregate_config_and_artifacts_if_needed(self): From 5fed6fd9c6d919d6682f52df45ccaf21dc07449a Mon Sep 17 00:00:00 2001 From: AlexGrinch Date: Fri, 14 Jul 2023 15:31:56 -0700 Subject: [PATCH 13/14] added inference and translate method Signed-off-by: AlexGrinch --- .../speech_translation/translate_speech.py | 214 ++++++++++++++++++ .../asr/models/transformer_bpe_models.py | 23 +- 2 files changed, 232 insertions(+), 5 deletions(-) create mode 100644 examples/asr/speech_translation/translate_speech.py diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py new file mode 100644 index 000000000000..64dfe7dcf321 --- /dev/null +++ b/examples/asr/speech_translation/translate_speech.py @@ -0,0 +1,214 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import os +import json +from dataclasses import dataclass, is_dataclass +from typing import List, Optional, Union + +import pytorch_lightning as pl +import torch +from omegaconf import OmegaConf + +from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig +from nemo.collections.asr.parts.utils.transcribe_utils import ( + compute_output_filename, + prepare_audio_data, + setup_model, +) +from nemo.core.config import hydra_runner +from nemo.utils import logging + +""" +Translate audio file on a single CPU/GPU. Useful for translations of moderate amounts of audio data. 
+ +# Arguments + model_path: path to .nemo ST checkpoint + pretrained_name: name of pretrained ST model (from NGC registry) + audio_dir: path to directory with audio files + dataset_manifest: path to dataset JSON manifest file (in NeMo format) + + output_filename: Output filename where the translations will be written + batch_size: batch size during inference + + cuda: Optional int to enable or disable execution of model on certain CUDA device. + allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available + amp: Bool to decide if Automatic Mixed Precision should be used during inference + audio_type: Str filetype of the audio. Supported = wav, flac, mp3 + + overwrite_translations: Bool which when set allows repeated translations to overwrite previous results. + +# Usage +ST model can be specified by either "model_path" or "pretrained_name". +Data for translation can be defined with either "audio_dir" or "dataset_manifest". +Results are returned in a JSON manifest file. + +python translate_speech.py \ + model_path=null \ + pretrained_name=null \ + audio_dir="" \ + dataset_manifest="" \ + output_filename="" \ + batch_size=32 \ + cuda=0 \ + amp=True \ +""" + + +@dataclass +class ModelChangeConfig: + + # Sub-config for changes specific to the Conformer Encoder + conformer: ConformerChangeConfig = ConformerChangeConfig() + + +@dataclass +class TranslationConfig: + # Required configs + model_path: Optional[str] = None # Path to a .nemo file + pretrained_name: Optional[str] = None # Name of a pretrained model + audio_dir: Optional[str] = None # Path to a directory which contains audio files + dataset_manifest: Optional[str] = None # Path to dataset's JSON manifest + audio_key: str = 'audio_filepath' # Used to override the default audio key in dataset_manifest + eval_config_yaml: Optional[str] = None # Path to a yaml file of config of evaluation + + # General configs + output_filename: Optional[str] = None + batch_size: int = 32 + random_seed: Optional[int] = None # seed number going to be used in seed_everything() + + # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA + # device anyway, and do inference on CPU only if CUDA device is not found. + # If `cuda` is a negative number, inference will be on CPU only. + cuda: Optional[int] = None + allow_mps: bool = False # allow to select MPS device (Apple Silicon M-series GPU) + amp: bool = False + audio_type: str = "wav" + + # Recompute model translation, even if the output folder exists with scores. 
+    overwrite_translations: bool = True
+
+    # can be set to True to return list of translations instead of the config
+    # if True, will also skip writing anything to the output file
+    return_translations: bool = False
+
+
+@hydra_runner(config_name="TranslationConfig", schema=TranslationConfig)
+def main(cfg: TranslationConfig) -> Union[TranslationConfig, List[str]]:
+    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
+
+    for key in cfg:
+        cfg[key] = None if cfg[key] == 'None' else cfg[key]
+
+    if is_dataclass(cfg):
+        cfg = OmegaConf.structured(cfg)
+
+    if cfg.random_seed:
+        pl.seed_everything(cfg.random_seed)
+
+    if cfg.model_path is None and cfg.pretrained_name is None:
+        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
+    if cfg.audio_dir is None and cfg.dataset_manifest is None:
+        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")
+
+    # Load augmentor from external yaml file which contains eval info; could be extended to other features such as VAD, P&C
+    augmentor = None
+    if cfg.eval_config_yaml:
+        eval_config = OmegaConf.load(cfg.eval_config_yaml)
+        augmentor = eval_config.test_ds.get("augmentor")
+        logging.info(f"Will apply on-the-fly augmentation on samples during translation: {augmentor} ")
+
+    # setup GPU
+    if cfg.cuda is None:
+        if torch.cuda.is_available():
+            device = [0]  # use 0th CUDA device
+            accelerator = 'gpu'
+            map_location = torch.device('cuda:0')
+        elif cfg.allow_mps and hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            logging.warning(
+                "MPS device (Apple Silicon M-series GPU) support is experimental."
+                " Env variable `PYTORCH_ENABLE_MPS_FALLBACK=1` should be set in most cases to avoid failures."
+            )
+            device = [0]
+            accelerator = 'mps'
+            map_location = torch.device('mps')
+        else:
+            device = 1
+            accelerator = 'cpu'
+            map_location = torch.device('cpu')
+    else:
+        device = [cfg.cuda]
+        accelerator = 'gpu'
+        map_location = torch.device(f'cuda:{cfg.cuda}')
+
+    logging.info(f"Inference will be done on device: {map_location}")
+
+    asr_model, model_name = setup_model(cfg, map_location)
+    trainer = pl.Trainer(devices=device, accelerator=accelerator)
+    asr_model.set_trainer(trainer)
+    asr_model = asr_model.eval()
+
+    # collect additional translation information
+    return_hypotheses = False
+
+    # prepare audio filepaths and decide whether it's partial audio
+    filepaths, partial_audio = prepare_audio_data(cfg)
+
+    # setup AMP (optional)
+    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
+        logging.info("AMP enabled!\n")
+        autocast = torch.cuda.amp.autocast
+    else:
+
+        @contextlib.contextmanager
+        def autocast():
+            yield
+
+    # Compute output filename
+    cfg = compute_output_filename(cfg, model_name)
+
+    # if translations should not be overwritten, and already exists, skip re-translation step and return
+    if not cfg.return_translations and not cfg.overwrite_translations and os.path.exists(cfg.output_filename):
+        logging.info(
+            f"Previous translations found at {cfg.output_filename}, and flag `overwrite_translations`"
+            f" is {cfg.overwrite_translations}. Returning without re-translating text."
+ ) + return cfg + + # translate audio + with autocast(): + with torch.no_grad(): + translations = asr_model.translate( + paths2audio_files=filepaths, batch_size=cfg.batch_size, return_hypotheses=return_hypotheses, + ) + + logging.info(f"Finished translating {len(filepaths)} files !") + logging.info(f"Writing translations into file: {cfg.output_filename}") + + if cfg.return_translations: + return translations + + # write audio translations + with open(cfg.output_filename, 'w', encoding='utf-8', newline='\n') as f: + for filepath, translation in zip(filepaths, translations): + item = {'audio_filepath': filepath, 'pred_translation': translation} + f.write(json.dumps(item, ensure_ascii=False) + "\n") + logging.info(f"Finished writing predictions to {cfg.output_filename}!") + + return cfg + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py index 840a86d47150..178746795ae8 100644 --- a/nemo/collections/asr/models/transformer_bpe_models.py +++ b/nemo/collections/asr/models/transformer_bpe_models.py @@ -170,6 +170,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.val_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True) + @torch.no_grad() + def translate( + self, + paths2audio_files: List[str], + batch_size: int = 4, + logprobs: bool = False, + return_hypotheses: bool = False, + ) -> List[str]: + hypotheses = self.transcribe(paths2audio_files, batch_size, logprobs, return_hypotheses) + return hypotheses + @torch.no_grad() def transcribe( self, @@ -441,11 +452,13 @@ def forward( if self.use_transf_encoder: enc_states = self.transf_encoder(encoder_states=enc_states, encoder_mask=enc_mask) - dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype) - dec_states = self.transf_decoder( - input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask - ) - transf_log_probs = self.log_softmax(hidden_states=dec_states) + transf_log_probs = None + if transcript is not None: + dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype) + dec_states = self.transf_decoder( + input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask + ) + transf_log_probs = self.log_softmax(hidden_states=dec_states) return transf_log_probs, encoded_len, enc_states, enc_mask From 66fdfcbd8491d754176aba9affd08456eb72a729 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jul 2023 22:33:24 +0000 Subject: [PATCH 14/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/asr/speech_translation/translate_speech.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py index 64dfe7dcf321..203852b52ee9 100644 --- a/examples/asr/speech_translation/translate_speech.py +++ b/examples/asr/speech_translation/translate_speech.py @@ -13,8 +13,8 @@ # limitations under the License. 
import contextlib -import os import json +import os from dataclasses import dataclass, is_dataclass from typing import List, Optional, Union @@ -23,11 +23,7 @@ from omegaconf import OmegaConf from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig -from nemo.collections.asr.parts.utils.transcribe_utils import ( - compute_output_filename, - prepare_audio_data, - setup_model, -) +from nemo.collections.asr.parts.utils.transcribe_utils import compute_output_filename, prepare_audio_data, setup_model from nemo.core.config import hydra_runner from nemo.utils import logging
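
A note on the `init_from_nemo_model` move in fast-conformer_transformer.yaml: the block now sits at the top level of the config because NeMo's `ModelPT.maybe_init_from_pretrained_checkpoint` reads that key from the root of the Hydra config rather than from under `model`. The sketch below shows how a training script typically consumes it; it is illustrative only — the checkpoint path is a placeholder and the actual body of speech_to_text_transformer.py may differ.

    import pytorch_lightning as pl

    from nemo.collections.asr.models import EncDecTransfModelBPE
    from nemo.core.config import hydra_runner
    from nemo.utils.exp_manager import exp_manager


    @hydra_runner(config_path="../conf/speech_translation/", config_name="fast-conformer_transformer")
    def main(cfg):
        trainer = pl.Trainer(**cfg.trainer)
        exp_manager(trainer, cfg.get("exp_manager", None))

        st_model = EncDecTransfModelBPE(cfg=cfg.model, trainer=trainer)

        # Copies only the "preprocessor" and "encoder" weights from the .nemo file given in
        # cfg.init_from_nemo_model.model0.path; the Transformer decoder stays randomly initialized.
        st_model.maybe_init_from_pretrained_checkpoint(cfg)

        trainer.fit(st_model)


    if __name__ == '__main__':
        main()  # noqa pylint: disable=no-value-for-parameter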
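
The `translate()` method added to `EncDecTransfModelBPE` is a thin `torch.no_grad()` wrapper around `transcribe()`: the FastConformer encoder runs once per batch and the Transformer decoder then beam-search decodes the encoder states. A minimal inference sketch, assuming a trained checkpoint and two 16 kHz mono wav files exist at the placeholder paths shown:

    import torch

    from nemo.collections.asr.models import EncDecTransfModelBPE

    # Restore a trained speech translation checkpoint (placeholder filename).
    model = EncDecTransfModelBPE.restore_from("st_fastconformer_transformer.nemo")
    model = model.eval().to("cuda" if torch.cuda.is_available() else "cpu")

    # translate() forwards its arguments to transcribe() and returns one translation string per input file.
    translations = model.translate(
        paths2audio_files=["sample_0.wav", "sample_1.wav"], batch_size=2,
    )
    print(translations)

A related design note: replacing the pass-through adapter lambda with `torch.nn.Identity()` registers the adapter in the module tree like any other submodule, so the model remains picklable and module traversal (e.g. `.to(device)`, scripting, checkpoint save/restore) treats the no-projection case uniformly, which a lambda stored as an attribute would not allow.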