diff --git a/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml
new file mode 100644
index 000000000000..4e480df62e59
--- /dev/null
+++ b/examples/asr/conf/speech_translation/fast-conformer_transformer.yaml
@@ -0,0 +1,218 @@
+# This config contains the default values for training an autoregressive FastConformer-Transformer speech translation (ST) model with sub-word encoding.
+
+# Architecture and training config:
+# Default learning parameters in this config are set for effective batch size of 2K. To train it with smaller effective
+# batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
+# Here are the recommended configs for different variants of FastConformer-Transformer, other parameters are the same as in this config file.
+# One extra (linear projection) layer is added between FastConformer encoder and Transformer decoder if they have different hidden sizes
+# It is recommended to initialize FastConformer with ASR pre-trained encoder for better accuracy and faster convergence
+
+name: "FastConformer-Transformer-BPE-st"
+
+# Initialize model encoder with pre-trained ASR FastConformer encoder for faster convergence and improved accuracy
+init_from_nemo_model:
+  model0:
+    path: ???
+    include: ["preprocessor", "encoder"]
+
+model:
+  sample_rate: 16000
+  label_smoothing: 0.0
+  log_prediction: true # enables logging sample predictions in the output during training
+
+  train_ds:
+    is_tarred: true
+    tarred_audio_filepaths: ???
+    manifest_filepath: ???
+    sample_rate: 16000
+    shuffle: false
+    trim_silence: false
+    batch_size: 4
+    num_workers: 8
+
+  validation_ds:
+    manifest_filepath: ???
+    sample_rate: ${model.sample_rate}
+    batch_size: 16 # you may increase batch_size if your memory allows
+    shuffle: false
+    num_workers: 4
+    pin_memory: true
+    use_start_end_token: true
+
+  test_ds:
+    manifest_filepath: ???
+    sample_rate: ${model.sample_rate}
+    batch_size: 16 # you may increase batch_size if your memory allows
+    shuffle: false
+    num_workers: 4
+    pin_memory: true
+    use_start_end_token: true
+
+  # recommend small vocab size of 128 or 256 when using 4x sub-sampling
+  # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
+  tokenizer:
+    dir: ???  # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe)
+    type: bpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
+
+  preprocessor:
+    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
+    sample_rate: ${model.sample_rate}
+    normalize: "per_feature"
+    window_size: 0.025
+    window_stride: 0.01
+    window: "hann"
+    features: 80
+    n_fft: 512
+    log: true
+    frame_splicing: 1
+    dither: 0.00001
+    pad_to: 0
+    pad_value: 0.0
+
+  spec_augment:
+    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
+    freq_masks: 2 # set to zero to disable it
+    # you may use lower time_masks for smaller models to have a faster convergence
+    time_masks: 10 # set to zero to disable it
+    freq_width: 27
+    time_width: 0.05
+
+  encoder:
+    _target_: nemo.collections.asr.modules.ConformerEncoder
+    feat_in: ${model.preprocessor.features}
+    feat_out: -1 # you may set it if you need different output size other than the default d_model
+    n_layers: 17
+    d_model: 512
+
+    # Sub-sampling params
+    subsampling: dw_striding # dw_striding (used here), vggnet or striding; vggnet may give better results but needs more memory
+    subsampling_factor: 8 # must be power of 2
+    subsampling_conv_channels: 256 # -1 sets it to d_model
+    causal_downsampling: false
+    reduction: null
+    reduction_position: null
+    reduction_factor: 1
+
+    # Feed forward module's params
+    ff_expansion_factor: 4
+
+    # Multi-headed Attention Module's params
+    self_attention_model: rel_pos # rel_pos or abs_pos
+    n_heads: 8 # may need to be lower for smaller d_models
+    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
+    att_context_size: [-1, -1] # -1 means unlimited context
+    xscaling: true # scales up the input embeddings by sqrt(d_model)
+    untie_biases: true # unties the biases of the TransformerXL layers
+    pos_emb_max_len: 5000
+
+    # Convolution module's params
+    conv_kernel_size: 9
+    conv_norm_type: batch_norm
+    conv_context_size: null
+
+    ### regularization
+    dropout: 0.1 # The dropout used in most of the Conformer Modules
+    dropout_pre_encoder: 0.1
+    dropout_emb: 0.0 # The dropout used for embeddings
+    dropout_att: 0.1 # The dropout for multi-headed attention modules
+
+  transf_encoder:
+    num_layers: 0
+    hidden_size: 512
+    inner_size: 2048
+    num_attention_heads: 8
+    ffn_dropout: 0.1
+    attn_score_dropout: 0.1
+    attn_layer_dropout: 0.1
+
+  transf_decoder:
+    library: nemo
+    model_name: null
+    pretrained: false
+    max_sequence_length: 512
+    num_token_types: 0
+    embedding_dropout: 0.1
+    learn_positional_encodings: false
+    hidden_size: 512
+    inner_size: 2048
+    num_layers: 6
+    num_attention_heads: 4
+    ffn_dropout: 0.1
+    attn_score_dropout: 0.1
+    attn_layer_dropout: 0.1
+    hidden_act: relu
+    pre_ln: true
+    pre_ln_final_layer_norm: true
+
+  head:
+    num_layers: 1
+    activation: relu
+    log_softmax: true
+    dropout: 0.0
+    use_transformer_init: true
+    
+  beam_search:
+    beam_size: 4
+    len_pen: 0.0
+    max_generation_delta: 50
+
+  optim:
+    name: adam
+    lr: 0.0001
+    # optimizer arguments
+    betas: [0.9, 0.98]
+    # less necessity for weight_decay as we already have large augmentations with SpecAug
+    # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used
+    # weight decay of 0.0 with lr of 2.0 also works fine
+    #weight_decay: 1e-3
+
+    # scheduler setup
+    sched:
+      name: InverseSquareRootAnnealing
+      #d_model: ${model.encoder.d_model}
+      # scheduler config override
+      warmup_steps: 1000
+      warmup_ratio: null
+      min_lr: 1e-6
+
+trainer:
+  gpus: -1 # number of GPUs, -1 would use all available GPUs
+  num_nodes: 1
+  max_epochs: 100
+  max_steps: -1 # computed at runtime if not set
+  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
+  accelerator: auto
+  strategy: ddp
+  accumulate_grad_batches: 1
+  gradient_clip_val: 0.0
+  precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP.
+  log_every_n_steps: 100  # Interval of logging.
+  enable_progress_bar: True
+  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
+  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; setting it to 0 disables the check
+  check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs
+  sync_batchnorm: true
+  enable_checkpointing: False  # Provided by exp_manager
+  logger: false  # Provided by exp_manager
+
+exp_manager:
+  exp_dir: null
+  name: ${name}
+  create_tensorboard_logger: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    # in case of multiple validation sets, first one is used
+    monitor: "val_sacreBLEU"
+    mode: "max"
+    save_top_k: 3
+    always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints
+
+  # you need to set these two to True to continue the training
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: false
+
+  # You may use this section to create a W&B logger
+  create_wandb_logger: false
+  wandb_logger_kwargs:
+    name: null
+    project: null
\ No newline at end of file
diff --git a/examples/asr/speech_translation/speech_to_text_transformer.py b/examples/asr/speech_translation/speech_to_text_transformer.py
new file mode 100644
index 000000000000..0c0882859b88
--- /dev/null
+++ b/examples/asr/speech_translation/speech_to_text_transformer.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+# Training the model
+```sh
+python speech_to_text_transformer.py \
+    # (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
+    model.train_ds.tarred_audio_filepaths=<path to tar files with audio> \
+    model.train_ds.manifest_filepath=<path to audio data manifest> \
+    model.validation_ds.manifest_filepath=<path to validation manifest> \
+    model.test_ds.manifest_filepath=<path to test manifest> \
+    model.tokenizer.dir=<path to directory of tokenizer (not full path to the vocab file!)> \
+    model.tokenizer.model_path=<path to speech tokenizer model> \
+    model.tokenizer.type=<either bpe or wpe> \
+    trainer.gpus=-1 \
+    trainer.strategy="ddp" \
+    trainer.max_epochs=100 \
+    model.optim.name="adamw" \
+    model.optim.lr=0.001 \
+    model.optim.betas=[0.9,0.999] \
+    model.optim.weight_decay=0.0001 \
+    model.optim.sched.warmup_steps=2000 \
+    exp_manager.create_wandb_logger=True \
+    exp_manager.wandb_logger_kwargs.name="<Name of experiment>" \
+    exp_manager.wandb_logger_kwargs.project="<Name of project>"
+```
+
+
+"""
+
+import pytorch_lightning as pl
+from omegaconf import OmegaConf
+
+from nemo.collections.asr.models import EncDecTransfModelBPE
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.exp_manager import exp_manager
+
+
+@hydra_runner(config_path="../conf/speech_translation/", config_name="fast-conformer_transformer")
+def main(cfg):
+    """Train a speech translation model from the Hydra config and test it when a test manifest is set."""
+    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
+    trainer = pl.Trainer(**cfg.trainer)
+    exp_manager(trainer, cfg.get("exp_manager", None))
+    st_model = EncDecTransfModelBPE(cfg=cfg.model, trainer=trainer)
+    # Initialize the weights from another model first, if the config provides one, then train
+    st_model.maybe_init_from_pretrained_checkpoint(cfg)
+    trainer.fit(st_model)
+
+    # Evaluate only when a test manifest is configured and prepare_test() returns a truthy value
+    test_requested = hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None
+    if test_requested and st_model.prepare_test(trainer):
+        trainer.test(st_model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py
new file mode 100644
index 000000000000..203852b52ee9
--- /dev/null
+++ b/examples/asr/speech_translation/translate_speech.py
@@ -0,0 +1,210 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import json
+import os
+from dataclasses import dataclass, field, is_dataclass
+from typing import List, Optional, Union
+
+import pytorch_lightning as pl
+import torch
+from omegaconf import OmegaConf
+
+from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig
+from nemo.collections.asr.parts.utils.transcribe_utils import compute_output_filename, prepare_audio_data, setup_model
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+
+"""
+Translate audio file on a single CPU/GPU. Useful for translations of moderate amounts of audio data.
+
+# Arguments
+  model_path: path to .nemo ST checkpoint
+  pretrained_name: name of pretrained ST model (from NGC registry)
+  audio_dir: path to directory with audio files
+  dataset_manifest: path to dataset JSON manifest file (in NeMo format)
+
+  output_filename: Output filename where the translations will be written
+  batch_size: batch size during inference
+
+  cuda: Optional int to enable or disable execution of model on certain CUDA device.
+  allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available
+  amp: Bool to decide if Automatic Mixed Precision should be used during inference
+  audio_type: Str filetype of the audio. Supported = wav, flac, mp3
+
+  overwrite_translations: Bool which when set allows repeated translations to overwrite previous results.
+
+# Usage
+ST model can be specified by either "model_path" or "pretrained_name".
+Data for translation can be defined with either "audio_dir" or "dataset_manifest".
+Results are returned in a JSON manifest file.
+
+python translate_speech.py \
+    model_path=null \
+    pretrained_name=null \
+    audio_dir="<remove or path to folder of audio files>" \
+    dataset_manifest="<remove or path to manifest>" \
+    output_filename="<remove or specify output filename>" \
+    batch_size=32 \
+    cuda=0 \
+    amp=True
+"""
+
+
+@dataclass
+class ModelChangeConfig:
+    """Container for model-change sub-configs; currently holds only the Conformer encoder one."""
+    # Sub-config for changes specific to the Conformer Encoder (default_factory builds one per instance)
+    conformer: ConformerChangeConfig = field(default_factory=ConformerChangeConfig)
+
+
+@dataclass
+class TranslationConfig:
+    """Options for the speech translation script: model/input selection, device and AMP setup, output handling."""
+    model_path: Optional[str] = None  # Path to a .nemo file (main requires this or pretrained_name)
+    pretrained_name: Optional[str] = None  # Name of a pretrained model (main requires this or model_path)
+    audio_dir: Optional[str] = None  # Path to a directory which contains audio files
+    dataset_manifest: Optional[str] = None  # Path to dataset's JSON manifest (main requires this or audio_dir)
+    audio_key: str = 'audio_filepath'  # Used to override the default audio key in dataset_manifest
+    eval_config_yaml: Optional[str] = None  # Path to an evaluation YAML config; main reads test_ds.augmentor from it
+
+    # General configs
+    output_filename: Optional[str] = None  # File the JSON-lines results are written to
+    batch_size: int = 32  # Batch size passed to the model's translate()
+    random_seed: Optional[int] = None  # seed number going to be used in seed_everything()
+
+    # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA
+    # device anyway, and do inference on CPU only if CUDA device is not found.
+    # If `cuda` is a negative number, inference will be on CPU only.
+    cuda: Optional[int] = None
+    allow_mps: bool = False  # allow to select MPS device (Apple Silicon M-series GPU)
+    amp: bool = False  # Use torch.cuda.amp autocast for inference when CUDA is available
+    audio_type: str = "wav"  # Filetype of the audio files
+
+    # Recompute model translation, even if the output folder exists with scores.
+    overwrite_translations: bool = True
+
+    # can be set to True to return list of translations instead of the config
+    # if True, will also skip writing anything to the output file
+    return_translations: bool = False
+
+
+@hydra_runner(config_name="TranslationConfig", schema=TranslationConfig)
+def main(cfg: TranslationConfig) -> Union[TranslationConfig, List[str]]:
+    """Translate the configured audio with an ST model and write the results as a JSON-lines manifest.
+
+    Returns the translations when `cfg.return_translations` is set (nothing is written), otherwise `cfg`.
+    Raises ValueError when no model (model_path/pretrained_name) or no audio (audio_dir/dataset_manifest) is given.
+    """
+    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
+    for key in cfg:
+        cfg[key] = None if cfg[key] == 'None' else cfg[key]
+
+    if is_dataclass(cfg):
+        cfg = OmegaConf.structured(cfg)
+
+    if cfg.random_seed:
+        pl.seed_everything(cfg.random_seed)
+
+    if cfg.model_path is None and cfg.pretrained_name is None:
+        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
+    if cfg.audio_dir is None and cfg.dataset_manifest is None:
+        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")
+
+    # Load augmentor from external yaml file which contains eval info, could be extended to other features such as
+    # VAD, P&C. NOTE(review): the augmentor is only logged below; it is not passed on to `translate`.
+    if cfg.eval_config_yaml:
+        eval_config = OmegaConf.load(cfg.eval_config_yaml)
+        augmentor = eval_config.test_ds.get("augmentor")
+        logging.info(f"Will apply on-the-fly augmentation on samples during translation: {augmentor} ")
+
+    # setup device: cuda=None auto-detects, a negative cuda forces CPU, any other value is a CUDA device index
+    if cfg.cuda is None:
+        if torch.cuda.is_available():
+            device = [0]  # use 0th CUDA device
+            accelerator = 'gpu'
+            map_location = torch.device('cuda:0')
+        elif cfg.allow_mps and hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            logging.warning(
+                "MPS device (Apple Silicon M-series GPU) support is experimental."
+                " Env variable `PYTORCH_ENABLE_MPS_FALLBACK=1` should be set in most cases to avoid failures."
+            )
+            device = [0]
+            accelerator = 'mps'
+            map_location = torch.device('mps')
+        else:
+            device = 1
+            accelerator = 'cpu'
+            map_location = torch.device('cpu')
+    elif cfg.cuda < 0:
+        device = 1
+        accelerator = 'cpu'
+        map_location = torch.device('cpu')
+    else:
+        device = [cfg.cuda]
+        accelerator = 'gpu'
+        map_location = torch.device(f'cuda:{cfg.cuda}')
+
+    logging.info(f"Inference will be done on device: {map_location}")
+    asr_model, model_name = setup_model(cfg, map_location)
+    trainer = pl.Trainer(devices=device, accelerator=accelerator)
+    asr_model.set_trainer(trainer)
+    asr_model = asr_model.eval()
+
+    # prepare audio filepaths (the second value returned by prepare_audio_data is not used by this script)
+    filepaths, _ = prepare_audio_data(cfg)
+
+    # setup AMP (optional); fall back to a no-op context manager when AMP is off or unavailable
+    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
+        logging.info("AMP enabled!\n")
+        autocast = torch.cuda.amp.autocast
+    else:
+        autocast = contextlib.nullcontext
+
+    # Compute output filename
+    cfg = compute_output_filename(cfg, model_name)
+
+    # if translations should not be overwritten, and already exists, skip re-translation step and return
+    if not cfg.return_translations and not cfg.overwrite_translations and os.path.exists(cfg.output_filename):
+        logging.info(
+            f"Previous translations found at {cfg.output_filename}, and flag `overwrite_translations` "
+            f"is {cfg.overwrite_translations}. Returning without re-translating text."
+        )
+        return cfg
+
+    # translate audio
+    with autocast(), torch.no_grad():
+        translations = asr_model.translate(
+            paths2audio_files=filepaths, batch_size=cfg.batch_size, return_hypotheses=False,
+        )
+
+    logging.info(f"Finished translating {len(filepaths)} files !")
+
+    if cfg.return_translations:
+        return translations
+
+    # write audio translations
+    logging.info(f"Writing translations into file: {cfg.output_filename}")
+    with open(cfg.output_filename, 'w', encoding='utf-8', newline='\n') as f:
+        for filepath, translation in zip(filepaths, translations):
+            item = {'audio_filepath': filepath, 'pred_translation': translation}
+            f.write(json.dumps(item, ensure_ascii=False) + "\n")
+    logging.info(f"Finished writing predictions to {cfg.output_filename}!")
+
+    return cfg
+
+
+if __name__ == '__main__':
+    main()  # noqa pylint: disable=no-value-for-parameter
diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py
index a7275faea3d0..34f2c4f62e29 100644
--- a/nemo/collections/asr/models/__init__.py
+++ b/nemo/collections/asr/models/__init__.py
@@ -33,3 +33,4 @@
 from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel
 from nemo.collections.asr.models.slu_models import SLUIntentSlotBPEModel
 from nemo.collections.asr.models.ssl_models import SpeechEncDecSelfSupervisedModel
+from nemo.collections.asr.models.transformer_bpe_models import EncDecTransfModelBPE
diff --git a/nemo/collections/asr/models/transformer_bpe_models.py b/nemo/collections/asr/models/transformer_bpe_models.py
new file mode 100644
index 000000000000..178746795ae8
--- /dev/null
+++ b/nemo/collections/asr/models/transformer_bpe_models.py
@@ -0,0 +1,614 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import json
+import os
+import tempfile
+from math import ceil
+from typing import Dict, List, Optional, Union
+
+import editdistance
+import torch
+import torch.distributed as dist
+from omegaconf import DictConfig, OmegaConf
+from pytorch_lightning import Trainer
+from tqdm.auto import tqdm
+
+from nemo.collections.asr.data import audio_to_text_dataset
+from nemo.collections.asr.data.audio_to_text_dali import DALIOutputs
+from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel
+from nemo.collections.asr.parts.mixins import ASRBPEMixin
+from nemo.collections.common.losses import SmoothedCrossEntropyLoss
+from nemo.collections.common.metrics import GlobalAverageLossMetric
+from nemo.collections.common.parts import transformer_weights_init
+
+from nemo.core.classes.common import typecheck
+from nemo.core.neural_types import (
+    AudioSignal,
+    ChannelType,
+    LabelsType,
+    LengthsType,
+    LogprobsType,
+    MaskType,
+    NeuralType,
+    SpectrogramType,
+)
+from nemo.utils import logging
+
+try:
+    from sacrebleu import corpus_bleu
+    from nemo.collections.nlp.modules.common import TokenClassifier
+    from nemo.collections.nlp.modules.common.lm_utils import get_transformer
+    from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TransformerEncoder
+
+    NLP_AVAILABLE = True
+except (ImportError, ModuleNotFoundError):
+    NLP_AVAILABLE = False
+    logging.warning("Could not import NeMo NLP collection which is required for speech translation model.")
+
+__all__ = ['EncDecTransfModelBPE']
+
+
+def lens_to_mask(lens, max_length):
+    """Return a bool mask of shape (len(lens), max_length) that is True where the position index < lens[i]."""
+    positions = torch.arange(max_length, device=lens.device)  # built directly on the device of `lens`
+    return positions[None, :] < lens[:, None]
+
+
+class EncDecTransfModelBPE(ASRModel, ExportableEncDecModel, ASRBPEMixin):
+    """Speech translation model: audio encoder followed by an autoregressive Transformer decoder over BPE tokens."""
+
+    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
+        """Build the tokenizer, audio front end, Transformer decoder, beam search and loss from `cfg`."""
+        if 'tokenizer' not in cfg:
+            raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !")
+
+        # Setup the tokenizer; NOTE(review): presumably this has to happen before super().__init__() - confirm
+        self._setup_tokenizer(cfg.tokenizer)
+
+        super().__init__(cfg=cfg, trainer=trainer)
+
+        # Setup audio preprocessor
+        self.preprocessor = EncDecTransfModelBPE.from_config_dict(self.cfg.preprocessor)
+
+        # Setup audio encoder
+        self.encoder = EncDecTransfModelBPE.from_config_dict(self.cfg.encoder)
+
+        # Add projection layer if encoder and decoder differ in hidden size, otherwise pass features through unchanged
+        if self.cfg.encoder['d_model'] != self.cfg.transf_decoder['hidden_size']:
+            self.adapter = torch.nn.Linear(self.cfg.encoder['d_model'], self.cfg.transf_decoder['hidden_size'])
+        else:
+            self.adapter = torch.nn.Identity()
+
+        transf_encoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_encoder'))
+
+        # Whether to add Transformer Encoder block between Conformer and Transformer Decoder (only if num_layers > 0)
+        self.use_transf_encoder = False
+        if transf_encoder_cfg_dict['num_layers'] > 0:
+            self.use_transf_encoder = True
+            # NOTE(review): TransformerEncoder comes from the guarded NLP import; NLP_AVAILABLE is not checked here
+            self.transf_encoder = TransformerEncoder(
+                num_layers=transf_encoder_cfg_dict['num_layers'],
+                hidden_size=transf_encoder_cfg_dict['hidden_size'],
+                inner_size=transf_encoder_cfg_dict['inner_size'],
+                mask_future=False,
+                num_attention_heads=transf_encoder_cfg_dict['num_attention_heads'],
+                attn_score_dropout=transf_encoder_cfg_dict['attn_score_dropout'],
+                attn_layer_dropout=transf_encoder_cfg_dict['attn_layer_dropout'],
+                ffn_dropout=transf_encoder_cfg_dict['ffn_dropout'],
+                pre_ln=transf_encoder_cfg_dict.get('pre_ln', True),
+                pre_ln_final_layer_norm=transf_encoder_cfg_dict.get('pre_ln_final_layer_norm', True),
+            )
+            std_init_range = 1 / transf_encoder_cfg_dict['hidden_size'] ** 0.5
+            self.transf_encoder.apply(lambda module: transformer_weights_init(module, std_init_range))
+
+        transf_decoder_cfg_dict = OmegaConf.to_container(cfg.get('transf_decoder'))
+
+        # Transformer decoder; the vocabulary size is rounded up to the next multiple of 8
+        vocab_size = 8 * ceil(self.tokenizer.vocab_size / 8)
+        transf_decoder_cfg_dict['vocab_size'] = vocab_size
+        library = transf_decoder_cfg_dict.pop('library', 'nemo')
+        model_name = transf_decoder_cfg_dict.pop('model_name', None)
+        pretrained = transf_decoder_cfg_dict.pop('pretrained', False)
+        self.transf_decoder = get_transformer(
+            library=library,
+            model_name=model_name,
+            pretrained=pretrained,
+            config_dict=transf_decoder_cfg_dict,
+            encoder=False,
+            pre_ln_final_layer_norm=transf_decoder_cfg_dict.get("pre_ln_final_layer_norm", False),
+        )
+        # Output classifier over the padded vocabulary; its `mlp.layer0` weight is set to the token embedding weight
+        self.log_softmax = TokenClassifier(
+            hidden_size=self.transf_decoder.hidden_size,
+            num_classes=vocab_size,
+            activation=self.cfg.head.activation,
+            log_softmax=self.cfg.head.log_softmax,
+            dropout=self.cfg.head.dropout,
+            use_transformer_init=self.cfg.head.use_transformer_init,
+        )
+        self.log_softmax.mlp.layer0.weight = self.transf_decoder.embedding.token_embedding.weight
+        std_init_range = 1 / self.transf_decoder.hidden_size ** 0.5
+        self.transf_decoder.apply(lambda module: transformer_weights_init(module, std_init_range))
+        self.log_softmax.apply(lambda module: transformer_weights_init(module, std_init_range))
+
+        # Beam Search decoding
+        self.beam_search = BeamSearchSequenceGenerator(
+            embedding=self.transf_decoder.embedding,
+            decoder=self.transf_decoder.decoder,
+            log_softmax=self.log_softmax,
+            max_sequence_length=self.transf_decoder.max_sequence_length,
+            beam_size=self.cfg.beam_search.beam_size,
+            bos=self.tokenizer.bos_id,
+            pad=self.tokenizer.pad_id,
+            eos=self.tokenizer.eos_id,
+            len_pen=self.cfg.beam_search.len_pen,
+            max_delta_length=self.cfg.beam_search.max_generation_delta,
+        )
+
+        # Define autoregressive CE loss
+        self.transf_loss = SmoothedCrossEntropyLoss(
+            pad_id=self.tokenizer.pad_id, label_smoothing=self.cfg.label_smoothing
+        )
+        # Optional spectrogram augmentation, created only when `spec_augment` is present and not None
+        if hasattr(self.cfg, 'spec_augment') and self.cfg.spec_augment is not None:
+            self.spec_augmentation = EncDecTransfModelBPE.from_config_dict(self.cfg.spec_augment)
+        else:
+            self.spec_augmentation = None
+
+        self.val_loss = GlobalAverageLossMetric(dist_sync_on_step=False, take_avg_loss=True)
+
+    @torch.no_grad()
+    def translate(
+        self,
+        paths2audio_files: List[str],
+        batch_size: int = 4,
+        logprobs: bool = False,
+        return_hypotheses: bool = False,
+    ) -> List[str]:
+        """Translate the given audio files by forwarding all arguments, unchanged, to `transcribe`."""
+        return self.transcribe(paths2audio_files, batch_size, logprobs, return_hypotheses)
+
+    @torch.no_grad()
+    def transcribe(
+        self,
+        paths2audio_files: List[str],
+        batch_size: int = 4,
+        logprobs: bool = False,
+        return_hypotheses: bool = False,
+    ) -> List[str]:
+        """
+        Uses beam search decoding to transcribe audio files. Use this method for debugging and prototyping.
+        Args:
+            paths2audio_files: (a list) of paths to audio files. \
+                Recommended length per file is between 5 and 25 seconds. \
+                But it is possible to pass a few hours long file if enough GPU memory is available.
+            batch_size: (int) batch size to use during inference.
+                Bigger will result in better throughput performance but would use more memory.
+            logprobs: (bool) currently has no effect on the output; decoded text is always returned.
+            return_hypotheses: (bool) not supported by this model yet; must be False.
+        Returns:
+            A list of decoded texts in the same order as paths2audio_files (empty list for empty input)
+        Raises:
+            ValueError: if both `return_hypotheses` and `logprobs` are True.
+            NotImplementedError: if `return_hypotheses` is True.
+        """
+        if paths2audio_files is None or len(paths2audio_files) == 0:
+            return []
+
+        if return_hypotheses and logprobs:
+            raise ValueError(
+                "Either `return_hypotheses` or `logprobs` can be True at any given time."
+                "Returned hypotheses will contain the logprobs."
+            )
+
+        # Only decoded text is produced below (beam search ids -> tokenizer text), so hypothesis objects are unavailable
+        if return_hypotheses:
+            raise NotImplementedError("`return_hypotheses=True` is not supported by EncDecTransfModelBPE.transcribe().")
+
+        # We will store transcriptions here
+        hypotheses = []
+
+        # Model's mode and device
+        mode = self.training
+        device = next(self.parameters()).device
+        dither_value = self.preprocessor.featurizer.dither
+        pad_to_value = self.preprocessor.featurizer.pad_to
+        # Read before `try` so that the `finally` block can always restore it
+        logging_level = logging.get_verbosity()
+
+        try:
+            self.preprocessor.featurizer.dither = 0.0
+            self.preprocessor.featurizer.pad_to = 0
+            # Switch model to evaluation mode
+            self.eval()
+            # Freeze the encoder and decoder modules
+            self.encoder.freeze()
+            self.transf_decoder.freeze()
+            logging.set_verbosity(logging.WARNING)
+            # Work in tmp directory - will store manifest file there
+            with tempfile.TemporaryDirectory() as tmpdir:
+                with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp:
+                    for audio_file in paths2audio_files:
+                        entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'}
+                        fp.write(json.dumps(entry) + '\n')
+
+                config = {'paths2audio_files': paths2audio_files, 'batch_size': batch_size, 'temp_dir': tmpdir}
+
+                temporary_datalayer = self._setup_transcribe_dataloader(config)
+                for test_batch in tqdm(temporary_datalayer, desc="Transcribing"):
+                    log_probs, encoded_len, enc_states, enc_mask = self.forward(
+                        input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
+                    )
+
+                    beam_hypotheses = (
+                        self.beam_search(
+                            encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False
+                        )
+                        .detach()
+                        .cpu()
+                        .numpy()
+                    )
+                    hypotheses += [self.tokenizer.ids_to_text(hyp) for hyp in beam_hypotheses]
+
+                    del test_batch, log_probs, encoded_len, enc_states, enc_mask
+        finally:
+            # set mode back to its original value
+            self.train(mode=mode)
+            self.preprocessor.featurizer.dither = dither_value
+            self.preprocessor.featurizer.pad_to = pad_to_value
+            if mode is True:
+                self.encoder.unfreeze()
+                self.transf_decoder.unfreeze()
+            logging.set_verbosity(logging_level)
+
+        return hypotheses
+
+    def _setup_dataloader_from_config(self, config: Optional[Dict]):
+        """Build a ``torch.utils.data.DataLoader`` over the BPE audio-to-text dataset described by ``config``.
+
+        Returns ``None`` when the dataset factory returns ``None``. Otherwise ``config`` must provide
+        ``shuffle`` and ``batch_size``; ``drop_last``, ``num_workers`` and ``pin_memory`` are optional.
+        """
+        ds = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config(
+            config=config,
+            local_rank=self.local_rank,
+            global_rank=self.global_rank,
+            world_size=self.world_size,
+            tokenizer=self.tokenizer,
+            preprocessor_cfg=self.cfg.get("preprocessor", None),
+        )
+        if ds is None:
+            return None
+
+        # 'shuffle' is always read from the config, but the loader never shuffles a tarred dataset.
+        requested_shuffle = config['shuffle']
+        shuffle = False if config.get('is_tarred', False) else requested_shuffle
+
+        # A dataset without its own collate_fn is expected to expose `datasets`; the first one's is used.
+        collate = ds.collate_fn if hasattr(ds, 'collate_fn') else ds.datasets[0].collate_fn
+        return torch.utils.data.DataLoader(
+            dataset=ds,
+            batch_size=config['batch_size'],
+            collate_fn=collate,
+            drop_last=config.get('drop_last', False),
+            shuffle=shuffle,
+            num_workers=config.get('num_workers', 0),
+            pin_memory=config.get('pin_memory', False),
+        )
+
+    def setup_training_data(self, train_data_config: Optional[DictConfig]):
+        """Register the training dataset config, build ``self._train_dl`` and, for tarred data, fix the batch limit."""
+        self._update_dataset_config(dataset_name='train', config=train_data_config)
+        self._train_dl = self._setup_dataloader_from_config(config=train_data_config)
+
+        if not ('is_tarred' in train_data_config and train_data_config['is_tarred']):
+            return
+
+        # With an IterableDataset the dataloader length is the total number of samples rather than the
+        # number of batches, which messes up the tqdm progress bar; the step count is therefore set by hand.
+        trainer = self._trainer
+        if trainer is None:
+            logging.warning(
+                "Model Trainer was not set before constructing the dataset, incorrect number of "
+                "training batches will be used. Please set the trainer and rebuild the dataset."
+            )
+            return
+
+        # An int limit_train_batches is assumed to be a sane user choice (<= # training batches) and is
+        # left alone; a float (including 1.0) is turned into an absolute number of batches.
+        fraction = trainer.limit_train_batches
+        if isinstance(fraction, float):
+            samples_per_rank = len(self._train_dl.dataset) / self.world_size
+            batches_per_rank = ceil(samples_per_rank / train_data_config['batch_size'])
+            trainer.limit_train_batches = int(fraction * batches_per_rank)
+
+    def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]):
+        """
+        Sets up the validation data loader via a Dict-like object.
+
+        The config is handed to ``_update_dataset_config`` and the loader built from it by
+        ``_setup_dataloader_from_config`` is stored in ``self._validation_dl``.
+
+        Args:
+            val_data_config: A Dict-like config describing the validation dataset. A ``shuffle`` entry
+                set to False is added to it in place when it has none.
+        """
+        shuffle_given = 'shuffle' in val_data_config
+        if not shuffle_given:
+            # no explicit choice was made: default to no shuffling
+            val_data_config['shuffle'] = False
+
+        # preserve config, then build the loader from it
+        self._update_dataset_config(dataset_name='validation', config=val_data_config)
+        self._validation_dl = self._setup_dataloader_from_config(config=val_data_config)
+
+    def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]):
+        """
+        Sets up the test data loader via a Dict-like object.
+
+        The config is handed to ``_update_dataset_config`` and the loader built from it by
+        ``_setup_dataloader_from_config`` is stored in ``self._test_dl``.
+
+        Args:
+            test_data_config: A Dict-like config describing the test dataset. A ``shuffle`` entry
+                set to False is added to it in place when it has none.
+        """
+        shuffle_given = 'shuffle' in test_data_config
+        if not shuffle_given:
+            # no explicit choice was made: default to no shuffling
+            test_data_config['shuffle'] = False
+
+        # preserve config, then build the loader from it
+        self._update_dataset_config(dataset_name='test', config=test_data_config)
+        self._test_dl = self._setup_dataloader_from_config(config=test_data_config)
+
+    @property
+    def input_types(self) -> Optional[Dict[str, NeuralType]]:
+        """Neural types of the model inputs; every entry is declared optional."""
+        # The audio element type carries a sample rate only when the preprocessor exposes `_sample_rate`.
+        freq_kwargs = {'freq': self.preprocessor._sample_rate} if hasattr(self.preprocessor, '_sample_rate') else {}
+        audio_eltype = AudioSignal(**freq_kwargs)
+        return {
+            "input_signal": NeuralType(('B', 'T'), audio_eltype, optional=True),
+            "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True),
+            "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True),
+            "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True),
+            "transcript": NeuralType(('B', 'T'), LabelsType(), optional=True),
+            "transcript_length": NeuralType(tuple('B'), LengthsType(), optional=True),
+            "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True),
+        }
+
+    @property
+    def output_types(self) -> Optional[Dict[str, NeuralType]]:
+        """Neural types of the four outputs of ``forward``, listed in the order they are returned."""
+        types = {"transf_log_probs": NeuralType(('B', 'T', 'D'), LogprobsType())}
+        types["encoded_lengths"] = NeuralType(tuple('B'), LengthsType())
+        types["encoder_states"] = NeuralType(('B', 'T', 'D'), ChannelType())
+        types["encoder_mask"] = NeuralType(('B', 'T'), MaskType())
+        return types
+
+    @typecheck()
+    def forward(
+        self,
+        input_signal=None,
+        input_signal_length=None,
+        processed_signal=None,
+        processed_signal_length=None,
+        transcript=None,
+        transcript_length=None,
+    ):
+        """
+        Forward pass of the model.
+        Args:
+            input_signal: Tensor that represents a batch of raw audio signals,
+                of shape [B, T]. T here represents timesteps, with 1 second of audio represented as
+                `self.sample_rate` number of floating point values.
+            input_signal_length: Vector of length B, that contains the individual lengths of the audio sequences.
+            processed_signal: Tensor that represents a batch of processed audio signals,
+                of shape (B, D, T) that has undergone processing via some DALI preprocessor.
+            processed_signal_length: Vector of length B with the individual lengths of the processed audio sequences.
+            transcript: Optional decoder input token ids of shape [B, T]; when None the decoder is not run.
+            transcript_length: Vector of length B with the transcript lengths, used to build the decoder mask.
+        Returns:
+            A tuple of 4 elements -
+            1) The decoder log probabilities of shape [B, T, D], or None when no transcript was given.
+            2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
+            3) The encoder states of shape [B, T, D] that are passed to the decoder.
+            4) The encoder mask of shape [B, T], built from the encoded lengths.
+        """
+        has_input_signal = input_signal is not None and input_signal_length is not None
+        has_processed_signal = processed_signal is not None and processed_signal_length is not None
+        # Exactly one of the two input pairs must be provided; both or neither is an error.
+        if has_input_signal == has_processed_signal:
+            raise ValueError(
+                f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive "
+                " with ``processed_signal`` and ``processed_signal_len`` arguments."
+            )
+
+        if not has_processed_signal:
+            processed_signal, processed_signal_length = self.preprocessor(
+                input_signal=input_signal, length=input_signal_length
+            )
+        if self.spec_augmentation is not None and self.training:
+            processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length)
+
+        encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length)
+        # Swap the last two axes before the adapter so the states follow the [B, T, D] layout of `output_types`.
+        enc_states = self.adapter(encoded.permute(0, 2, 1))
+        enc_mask = lens_to_mask(encoded_len, enc_states.shape[1]).to(enc_states.dtype)
+        if self.use_transf_encoder:
+            enc_states = self.transf_encoder(encoder_states=enc_states, encoder_mask=enc_mask)
+
+        transf_log_probs = None
+        if transcript is not None:
+            dec_mask = lens_to_mask(transcript_length, transcript.shape[1]).to(transcript.dtype)
+            dec_states = self.transf_decoder(
+                input_ids=transcript, decoder_mask=dec_mask, encoder_embeddings=enc_states, encoder_mask=enc_mask
+            )
+            transf_log_probs = self.log_softmax(hidden_states=dec_states)
+
+        return transf_log_probs, encoded_len, enc_states, enc_mask
+
+    def compute_audio_loss(self, batch):
+        """Return the decoder loss for one (signal, signal_len, transcript, transcript_len) batch, or 0 for None."""
+        if batch is None:
+            return 0
+
+        signal, signal_len, transcript, transcript_len = batch
+        # Decoder inputs drop the last token and the targets drop the first one (shift by one position).
+        decoder_inputs = transcript[:, :-1]
+        targets = transcript[:, 1:]
+
+        # Only the log probabilities are needed here; the other three forward outputs are discarded.
+        log_probs, _, _, _ = self.forward(
+            input_signal=signal,
+            input_signal_length=signal_len,
+            transcript=decoder_inputs,
+            transcript_length=transcript_len,
+        )
+        return self.transf_loss(log_probs=log_probs, labels=targets)
+
+    # PTL-specific methods
+    def training_step(self, batch, batch_nb):
+        """Compute the loss for ``batch`` and return it with the values to log (loss and current learning rate)."""
+        loss = self.compute_audio_loss(batch)
+        # the learning rate is read from the first parameter group of the optimizer
+        current_lr = self._optimizer.param_groups[0]['lr']
+
+        return {
+            'loss': loss,
+            'log': {'train_loss': loss, 'learning_rate': current_lr},
+        }
+
+    def validation_step(self, batch, batch_idx, dataloader_idx=0, eval_mode="val"):
+        """
+        Run the model and beam search on one batch.
+        Returns a dict with ``{eval_mode}_loss`` plus the decoded ``translations`` and ``ground_truths``.
+        """
+        signal, signal_len, transcript, transcript_len = batch
+        input_ids, labels = transcript[:, :-1], transcript[:, 1:]
+
+        # DALI batches that are already processed go in as processed_signal, everything else as raw audio.
+        if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
+            audio_kwargs = {'processed_signal': signal, 'processed_signal_length': signal_len}
+        else:
+            audio_kwargs = {'input_signal': signal, 'input_signal_length': signal_len}
+        transf_log_probs, encoded_len, enc_states, enc_mask = self.forward(
+            transcript=input_ids, transcript_length=transcript_len, **audio_kwargs
+        )
+
+        beam_hypotheses = self.beam_search(
+            encoder_hidden_states=enc_states, encoder_input_mask=enc_mask, return_beam_scores=False
+        )
+        transf_loss = self.transf_loss(log_probs=transf_log_probs, labels=labels)
+
+        to_text = self.tokenizer.ids_to_text
+        ground_truths = [to_text(ids) for ids in transcript.detach().cpu().tolist()]
+        translations = [to_text(ids) for ids in beam_hypotheses.detach().cpu().tolist()]
+
+        # The loss is passed to `val_loss` together with the product of the first two log-prob dimensions.
+        num_positions = transf_log_probs.shape[0] * transf_log_probs.shape[1]
+        self.val_loss(loss=transf_loss, num_measurements=num_positions)
+        return {f'{eval_mode}_loss': transf_loss, 'translations': translations, 'ground_truths': ground_truths}
+
+    def test_step(self, batch, batch_idx, dataloader_idx=0):  # validation_step with "test" result keys
+        return self.validation_step(batch, batch_idx, dataloader_idx=dataloader_idx, eval_mode="test")
+
+    def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0, eval_mode: str = "val"):
+        """
+        Called at the end of validation to aggregate outputs and log the loss, sacreBLEU and WER.
+        :param outputs: list of individual outputs of each validation step.
+        :param dataloader_idx: index of the dataloader that produced ``outputs`` (not used here).
+        :param eval_mode: prefix of the logged metric names, e.g. "val" or "test".
+        """
+        if not outputs:
+            return
+
+        if isinstance(outputs[0], dict):
+            outputs = [outputs]
+
+        for output in outputs:
+            eval_loss = self.val_loss.compute()
+            translations = list(itertools.chain(*[x['translations'] for x in output]))
+            ground_truths = list(itertools.chain(*[x['ground_truths'] for x in output]))
+            # we also need to drop pairs where ground truth is an empty string
+            local_pairs = [(t, g) for (t, g) in zip(translations, ground_truths) if g.strip() != '']
+
+            # Gather translations and ground truths from all workers
+            tr_and_gt = [None for _ in range(self.world_size)]
+            if self.world_size > 1:
+                dist.all_gather_object(tr_and_gt, local_pairs)
+            else:
+                tr_and_gt[0] = local_pairs
+
+            # Only rank 0 computes the scores; every other rank logs 0.0.
+            # NOTE(review): the `* self.world_size` factors presumably undo the mean over ranks taken by
+            # `sync_dist=True` - confirm.
+            sb_score, wer_score = 0.0, 0.0
+            if self.global_rank == 0:
+                all_pairs = list(itertools.chain.from_iterable(tr_and_gt))
+                _translations = [t for (t, g) in all_pairs]
+                _ground_truths = [g for (t, g) in all_pairs]
+                wer_words = sum(len(r.split()) for r in _ground_truths)
+                # Without any reference words there is nothing to score and the WER below would divide
+                # by zero, so both metrics keep their 0.0 default in that case.
+                if wer_words > 0:
+                    sacre_bleu = corpus_bleu(_translations, [_ground_truths], tokenize="13a")
+                    sb_score = sacre_bleu.score * self.world_size
+                    wer_scores = sum(
+                        editdistance.eval(h.split(), r.split()) for h, r in zip(_translations, _ground_truths)
+                    )
+                    wer_score = 1.0 * wer_scores * self.world_size / wer_words
+
+            self.log(f"{eval_mode}_loss", eval_loss, sync_dist=True)
+            self.log(f"{eval_mode}_sacreBLEU", sb_score, sync_dist=True)
+            self.log(f"{eval_mode}_WER", wer_score, sync_dist=True)
+            self.val_loss.reset()
+
+    def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0):  # logs under the "test" prefix
+        return self.multi_validation_epoch_end(outputs, dataloader_idx=dataloader_idx, eval_mode="test")
+
+    def test_dataloader(self):
+        """Return the test data loader held in ``self._test_dl`` (``None`` when it is ``None``)."""
+        return self._test_dl
+
+    def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader':
+        """
+        Setup function for a temporary data loader which wraps the provided audio file(s).
+        Args:
+            config: A python dictionary which contains the following keys:
+            paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \
+                Recommended length per file is between 5 and 25 seconds.
+            batch_size: (int) batch size to use during inference. \
+                Bigger will result in better throughput performance but would use more memory.
+            temp_dir: (str) A temporary directory where the audio manifest (`manifest.json`) is
+                temporarily stored.
+        Returns:
+            A pytorch DataLoader for the given audio file(s).
+        """
+        batch_size = min(config['batch_size'], len(config['paths2audio_files']))
+        # os.cpu_count() returns None when the CPU count is undetermined; fall back to one CPU (zero workers).
+        cpu_count = os.cpu_count() or 1
+        dl_config = {
+            'manifest_filepath': os.path.join(config['temp_dir'], 'manifest.json'),
+            'sample_rate': self.preprocessor._sample_rate,
+            'batch_size': batch_size,
+            'trim_silence': False,
+            'shuffle': False,
+            'num_workers': min(batch_size, cpu_count - 1),
+            'pin_memory': True,
+        }
+        return self._setup_dataloader_from_config(config=DictConfig(dl_config))
diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py
index 0ab0cb784273..906154213ea1 100644
--- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py
+++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py
@@ -299,7 +299,7 @@ def create_spt_model(
         byte_fallback: If <unk>, fallback to a byte sequence of the character.
         split_digits: If true, digits are split into individual tokens.
         split_by_whitespace: Whether to respect white space while creating subwords. If False, will learn merges across whitespace.
-        split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ)
+        split_by_unicode_script: Whether to include multiple Unicode scripts. Ex. is Arabic diacritics which are considered part of the letter (عِدَّةُ)
     """
 
     if not data_file or not os.path.exists(data_file):