diff --git a/Jenkinsfile b/Jenkinsfile index ded314e8bc02..d20a71dca2a5 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3146,8 +3146,10 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' inference.add_BOS=False \ trainer.devices=2 \ tensor_model_parallel_size=2 \ + pred_file_path=/home/TestData/nlp/prompt_learning/p_tuning_test_tp_preds.txt \ data_paths=['/home/TestData/nlp/prompt_learning/rte_CI_test.jsonl']" sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_tp.nemo" + sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_tp_preds.txt" } } stage('GPT Prompt Learning TP=1 PP=2') { @@ -3173,8 +3175,10 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' inference.add_BOS=False \ trainer.devices=2 \ pipeline_model_parallel_size=2 \ + pred_file_path=/home/TestData/nlp/prompt_learning/p_tuning_test_pp_preds.txt \ data_paths=['/home/TestData/nlp/prompt_learning/boolq_CI_test.jsonl']" sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_pp.nemo" + sh "rm -rf /home/TestData/nlp/prompt_learning/p_tuning_test_pp_preds.txt" } } } @@ -3433,7 +3437,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.max_steps=6 \ trainer.max_epochs=null \ model.tensor_model_parallel_size=1 \ - model.pretrained_language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \ + model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \ model.existing_tasks=[] \ model.new_tasks=['squad'] \ model.data.train_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ @@ -3443,11 +3447,13 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test" sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py \ virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/t5_p_tuning_test.nemo' \ - pretrained_language_model_file='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \ + language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \ data.test_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ + pred_file_path='/home/TestData/nlp/prompt_learning/t5_p_tuning_test_preds.txt' \ data.global_batch_size=4 \ data.micro_batch_size=4" sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test.nemo" + sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test_preds.txt" } } stage('T5 Prompt Learning TP=2 PP=1') { @@ -3459,7 +3465,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.max_steps=6 \ trainer.max_epochs=null \ model.tensor_model_parallel_size=2 \ - model.pretrained_language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \ + model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \ model.existing_tasks=[] \ model.new_tasks=['squad'] \ model.data.train_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ @@ -3469,13 +3475,15 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test_tp2" sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py \ virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/t5_p_tuning_test_tp2.nemo' \ - pretrained_language_model_file='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \ + language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \ 
data.test_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \ + pred_file_path='/home/TestData/nlp/prompt_learning/t5_p_tuning_test_tp2_preds.txt' \ tensor_model_parallel_size=2 \ trainer.devices=2 \ data.global_batch_size=8 \ data.micro_batch_size=8" sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test_tp2.nemo" + sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test_tp2_preds.txt" } } } diff --git a/README.rst b/README.rst index 272dd6cef024..853bca41d510 100644 --- a/README.rst +++ b/README.rst @@ -200,16 +200,10 @@ Megatron GPT training requires NVIDIA Apex to be installed. .. code-block:: bash - git clone https://github.com/NVIDIA/apex + git clone https://github.com/ericharper/apex.git cd apex - git checkout 3c19f1061879394f28272a99a7ea26d58f72dace - pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./ - -.. note:: - - You may need to modify [setup.py](https://github.com/NVIDIA/apex/blob/3c19f1061879394f28272a99a7ea26d58f72dace/setup.py) if - your version of CUDA does not match the version used to compile Pytorch binaries, comment lines 33-41 in the above link - before installing. + git checkout nm_v1.11.0 + pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./ Docker containers: ~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/asr/data/benchmark_hr.csv b/docs/source/asr/data/benchmark_hr.csv new file mode 100644 index 000000000000..ea506eed3432 --- /dev/null +++ b/docs/source/asr/data/benchmark_hr.csv @@ -0,0 +1,3 @@ +Model,Model Base Class,Model Card +stt_hr_conformer_ctc_large,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_ctc_large" +stt_hr_conformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_transducer_large" diff --git a/docs/source/asr/data/scores/hr/conformer_hr.csv b/docs/source/asr/data/scores/hr/conformer_hr.csv new file mode 100644 index 000000000000..9c8128534b2f --- /dev/null +++ b/docs/source/asr/data/scores/hr/conformer_hr.csv @@ -0,0 +1,3 @@ +Model Name,Language,ParlaSpeech-HR v1.0 (dev),ParlaSpeech-HR v1.0 (test) +stt_hr_conformer_ctc_large,hr,4.43,4.70 +stt_hr_conformer_transducer_large,hr,4.56,4.69 diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index 8878d66ae739..42a472d61fa9 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -171,7 +171,7 @@ The audio files can be of any format supported by `Pydub /examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml`` for Transducer variant and at ``/examples/asr/conf/conformer/streaming/conformer_ctc_bpe.yaml`` for CTC variant. +To simulate cache-aware stremaing, you may use the script at ``/examples/asr/asr_streaming/speech_to_text_streaming_infer.py``. It can simulate streaming in single stream or multi-stream mode (in batches) for an ASR model. +This script can be used for models trained offline with full-context but the accuracy would not be great unless the chunk size is large enough which would result in high latency. +It is recommended to train a model in streaming model with limited context for this script. More info can be found in the script. .. 
_LSTM-Transducer_model: diff --git a/docs/source/asr/scores.rst b/docs/source/asr/scores.rst index 77e9e5b09531..2eb11e3ab38e 100644 --- a/docs/source/asr/scores.rst +++ b/docs/source/asr/scores.rst @@ -169,6 +169,16 @@ FR -------------------- +HR +^^ + +.. csv-table:: + :header-rows: 1 + :align: left + :file: data/scores/hr/conformer_hr.csv + +-------------------- + IT ^^ diff --git a/docs/source/conf.py b/docs/source/conf.py index 8164f231a5d6..33acabfd7acb 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -120,6 +120,7 @@ 'nlp/text_normalization/tn_itn_all.bib', 'tools/tools_all.bib', 'tts_all.bib', + 'text_processing/text_processing_all.bib', 'core/adapters/adapter_bib.bib', ] diff --git a/docs/source/index.rst b/docs/source/index.rst index 4bafabee526d..326142830efb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -44,6 +44,7 @@ NVIDIA NeMo User Guide nlp/machine_translation/machine_translation nlp/text_normalization/intro nlp/api + nlp/models .. toctree:: @@ -60,6 +61,14 @@ NVIDIA NeMo User Guide :caption: Common :name: Common + text_processing/intro + +.. toctree:: + :maxdepth: 2 + :caption: Text Processing + :name: Text Processing + + text_processing/g2p/g2p common/intro diff --git a/docs/source/text_processing/g2p/g2p.rst b/docs/source/text_processing/g2p/g2p.rst new file mode 100644 index 000000000000..5f8b01ac0152 --- /dev/null +++ b/docs/source/text_processing/g2p/g2p.rst @@ -0,0 +1,209 @@ +.. _g2p: + +Grapheme-to-Phoneme Models +========================== + +Grapheme-to-phoneme conversion (G2P) is the task of transducing graphemes (i.e., orthographic symbols) to phonemes (i.e., units of the sound system of a language). +For example, for `International_Phonetic_Alphabet (IPA): `__ ``"Swifts, flushed from chimneys …" → "ˈswɪfts, ˈfɫəʃt ˈfɹəm ˈtʃɪmniz …"``. + +Modern text-to-speech (TTS) models can learn pronunciations from raw text input and its corresponding audio data, +but by relying on grapheme input during training, such models fail to provide a reliable way of correcting wrong pronunciations. As a result, many TTS systems use phonetic input +during training to directly access and correct pronunciations at inference time. G2P systems allow users to enforce the desired pronunciation by providing a phonetic transcript of the input. + +G2P models convert out-of-vocabulary words (OOV), e.g. proper names and loaner words, as well as heteronyms in their phonetic form to improve the quality of the syntesized text. + +*Heteronyms* represent words that have the same spelling but different pronunciations, e.g., “read” in “I will read the book.” vs. “She read her project last week.” A single model that can handle OOVs and heteronyms and replace dictionary lookups can significantly simplify and improve the quality of synthesized speech. + +We support the following G2P models: + +* **ByT5 G2P** a text-to-text model that is based on ByT5 :cite:`g2p--xue2021byt5` neural network model that was originally proposed in :cite:`g2p--vrezavckova2021t5g2p` and :cite:`g2p--zhu2022byt5`. + +* **G2P-Conformer** CTC model - uses a Conformer encoder :cite:`g2p--ggulati2020conformer` followed by a linear decoder; the model is trained with CTC-loss. G2P-Conformer model has about 20 times fewer parameters than the ByT5 model and is a non-autoregressive model that makes it faster during inference. + +The models can be trained using words or sentences as input. 
+If trained with sentence-level input, the models can handle out-of-vocabulary (OOV) and heteronyms along with unambiguous words in a single pass. +See :ref:`Sentence-level Dataset Preparation Pipeline ` on how to label data for G2P model training. + +Additionally, we support a purpose-built BERT-based classification model for heteronym disambiguation, see :ref:`this ` for details. + +Model Training, Evaluation and Inference +---------------------------------------- + +The section covers both ByT5 and G2P-Conformer models. + +The models take input data in `.json` manifest format, and there should be separate training and validation manifests. +Each line of the manifest should be in the following format: + +.. code:: + + {"text_graphemes": "Swifts, flushed from chimneys.", "text": "ˈswɪfts, ˈfɫəʃt ˈfɹəm ˈtʃɪmniz."} + +Manifest fields: + +* ``text`` - name of the field in manifest_filepath for ground truth phonemes + +* ``text_graphemes`` - name of the field in manifest_filepath for input grapheme text + +The models can handle input with and without punctuation marks. + +To train ByT5 G2P model and evaluate it after at the end of the training, run: + +.. code:: + + python examples/text_processing/g2p/g2p_train_and_evaluate.py \ + # (Optional: --config-path= --config-name=) \ + model.train_ds.manifest_filepath="" \ + model.validation_ds.manifest_filepath="" \ + model.test_ds.manifest_filepath="" \ + trainer.devices=1 \ + do_training=True \ + do_testing=True + +Example of the config file: ``NeMo/examples/text_processing/g2p/conf/t5_g2p.yaml``. + + +To train G2P-Conformer model and evaluate it after at the end of the training, run: + +.. code:: + + python examples/text_processing/g2p/g2p_train_and_evaluate.py \ + # (Optional: --config-path= --config-name=) \ + model.train_ds.manifest_filepath="" \ + model.validation_ds.manifest_filepath="" \ + model.test_ds.manifest_filepath="" \ + model.tokenizer.dir= \ + model.tokenizer_grapheme.do_lower=False \ + model.tokenizer_grapheme.add_punctuation=True \ + trainer.devices=1 \ + + do_training=True \ + do_testing=True + +Example of the config file: ``NeMo/examples/text_processing/g2p/conf/g2p_conformer_ctc.yaml``. + + +To evaluate a pretrained G2P model, run: + +.. code:: + + python examples/text_processing/g2p/g2p_train_and_evaluate.py \ + # (Optional: --config-path= --config-name=) \ + pretrained_model="" \ + model.test_ds.manifest_filepath="" \ + trainer.devices=1 \ + do_training=False \ + do_testing=True + +To run inference with a pretrained G2P model, run: + +.. code-block:: + + python g2p_inference.py \ + pretrained_model=" \ + manifest_filepath="" \ + output_file="" \ + batch_size=32 \ + num_workers=4 \ + pred_field="pred_text" + +Model's predictions will be saved in `pred_field` of the `output_file`. + +.. _sentence_level_dataset_pipeline: + +Sentence-level Dataset Preparation Pipeline +------------------------------------------- + +Here is the overall overview of the data labeling pipeline for sentence-level G2P model training: + + .. image:: images/data_labeling_pipeline.png + :align: center + :alt: Data labeling pipeline for sentence-level G2P model training + :scale: 70% + +Here we describe the automatic phoneme-labeling process for generating augmented data. The figure below shows the phoneme-labeling steps to prepare data for sentence-level G2P model training. We first convert known unambiguous words to their phonetic pronunciations with dictionary lookups, e.g. CMU dictionary. 
+Next, we automatically label heteronyms using a RAD-TTS Aligner :cite:`g2p--badlani2022one`. More details on how to disambiguate heteronyms with a pretrained Aligner model could be found in `NeMo/tutorials/tts/Aligner_Inference_Examples.ipynb `__ in `Google's Colab `_. +Finally, we mask-out OOV words with a special masking token, “” in the figure below (note, we use `model.tokenizer_grapheme.unk_token="҂"` symbol during G2P model training.) +Using this unknown token forces a G2P model to produce the same masking token as a phonetic representation during training. During inference, the model generates phoneme predictions for OOV words without emitting the masking token as long as this token is not included in the grapheme input. + + + +.. _bert_heteronym_cl: + +Purpose-built BERT-based classification model for heteronym disambiguation +-------------------------------------------------------------------------- + +HeteronymClassificationModel is a BERT-based :cite:`g2p--ddevlin2018bert` model represents a token classification model and can handle multiple heteronyms at once. The model takes a sentence as an input, and then for every word, it selects a heteronym option out of the available forms. +We mask irrelevant forms to disregard the model’s predictions for non-ambiguous words. E.g., given the input “The Poems are simple to read and easy to comprehend.” the model scores possible {READ_PRESENT and READ_PAST} options for the word “read”. +Possible heteronym forms are extracted from the WikipediaHomographData :cite:`g2p--gorman2018improving`. + +The model expects input to be in `.json` manifest format, where is line contains at least the following fields: + +.. code:: + + {"text_graphemes": "Oxygen is less able to diffuse into the blood, leading to hypoxia.", "start_end": [23, 30], "homograph_span": "diffuse", "word_id": "diffuse_vrb"} + +Manifest fields: + +* `text_graphemes` - input sentence + +* `start_end` - beginning and end of the heteronym span in the input sentence + +* `homograph_span` - heteronym word in the sentence + +* `word_id` - heteronym label, e.g., word `diffuse` has the following possible labels: `diffuse_vrb` and `diffuse_adj`. See `https://github.com/google-research-datasets/WikipediaHomographData/blob/master/data/wordids.tsv `__ for more details. + +To convert the WikipediaHomographData to `.json` format suitable for the HeteronymClassificationModel training, run: + +.. code-block:: + + # WikipediaHomographData could be downloaded from `https://github.com/google-research-datasets/WikipediaHomographData `__. + + python NeMo/scripts/dataset_processing/g2p/export_wikihomograph_data_to_manifest.py \ + --data_folder=/WikipediaHomographData-master/data/eval/ + --output=eval.json + python NeMo/scripts/dataset_processing/g2p/export_wikihomograph_data_to_manifest.py \ + --data_folder=/WikipediaHomographData-master/data/train/ + --output=train.json + +To train and evaluate the model, run: + +.. code-block:: + + python heteronym_classification_train_and_evaluate.py \ + train_manifest=" \ + validation_manifest=" \ + model.encoder.pretrained="" \ + model.wordids= \ + do_training=True \ + do_testing=True + + +To run inference with a pretrained HeteronymClassificationModel, run: + +.. code-block:: + + python heteronym_classification_inference.py \ + manifest="" \ + pretrained_model="" \ + output_file="" + +Note, if the input manifest contains target "word_id", evaluation will be also performed. 
During inference, the model predicts heteronym `word_id` and saves predictions in `"pred_text"` field of the `output_file`: + +.. code:: + + {"text_graphemes": "Oxygen is less able to diffuse into the blood, leading to hypoxia.", "pred_text": "diffuse_vrb", "start_end": [23, 30], "homograph_span": "diffuse", "word_id": "diffuse_vrb"} + + +Requirements +------------ + +G2P requires NeMo NLP and ASR collections installed. See `Installation instructions `__ for more details. + + +References +---------- + +.. bibliography:: ../text_processing_all.bib + :style: plain + :labelprefix: g2p- + :keyprefix: g2p-- diff --git a/docs/source/text_processing/g2p/images/data_labeling_pipeline.png b/docs/source/text_processing/g2p/images/data_labeling_pipeline.png new file mode 100644 index 000000000000..002b63246c95 Binary files /dev/null and b/docs/source/text_processing/g2p/images/data_labeling_pipeline.png differ diff --git a/docs/source/text_processing/intro.rst b/docs/source/text_processing/intro.rst new file mode 100644 index 000000000000..40f78ee0650b --- /dev/null +++ b/docs/source/text_processing/intro.rst @@ -0,0 +1,12 @@ +NeMo Text Processing +==================== + +NeMo provides a set of models for text processing input and/or output of Automatic Speech Recognitions (ASR) and Text-to-Speech (TTS) models: \ +`https://github.com/NVIDIA/NeMo/tree/main/nemo_text_processing `__ . + +.. toctree:: + :maxdepth: 1 + + g2p + + diff --git a/docs/source/text_processing/text_processing_all.bib b/docs/source/text_processing/text_processing_all.bib new file mode 100644 index 000000000000..412d3ee21be6 --- /dev/null +++ b/docs/source/text_processing/text_processing_all.bib @@ -0,0 +1,53 @@ +@article{xue2021byt5, + title={ByT5: Towards a token-free future with pre-trained byte-to-byte models 2021}, + author={Xue, Linting and Barua, Aditya and Constant, Noah and Al-Rfou, Rami and Narang, Sharan and Kale, Mihir and Roberts, Adam and Raffel, Colin}, + journal={arXiv preprint arXiv:2105.13626}, + year={2021} +} + +@article{vrezavckova2021t5g2p, + title={T5g2p: Using text-to-text transfer transformer for grapheme-to-phoneme conversion}, + author={{\v{R}}ez{\'a}{\v{c}}kov{\'a}, Mark{\'e}ta and {\v{S}}vec, Jan and Tihelka, Daniel}, + year={2021}, + journal={International Speech Communication Association} +} + +@article{zhu2022byt5, + title={ByT5 model for massively multilingual grapheme-to-phoneme conversion}, + author={Zhu, Jian and Zhang, Cong and Jurgens, David}, + journal={arXiv preprint arXiv:2204.03067}, + year={2022} +} + +@article{ggulati2020conformer, + title={Conformer: Convolution-augmented transformer for speech recognition}, + author={Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and others}, + journal={arXiv preprint arXiv:2005.08100}, + year={2020} +} + +@article{ddevlin2018bert, + title={Bert: Pre-training of deep bidirectional transformers for language understanding}, + author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, + journal={arXiv preprint arXiv:1810.04805}, + year={2018} +} + +@inproceedings{gorman2018improving, + title={Improving homograph disambiguation with supervised machine learning}, + author={Gorman, Kyle and Mazovetskiy, Gleb and Nikolaev, Vitaly}, + booktitle={Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, + year={2018} +} + + +@inproceedings{badlani2022one, + title={One TTS 
alignment to rule them all}, + author={Badlani, Rohan and {\L}a{\'n}cucki, Adrian and Shih, Kevin J and Valle, Rafael and Ping, Wei and Catanzaro, Bryan}, + booktitle={ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + pages={6092--6096}, + year={2022}, + organization={IEEE} +} + + diff --git a/examples/asr/asr_streaming/speech_to_text_streaming_infer.py b/examples/asr/asr_streaming/speech_to_text_streaming_infer.py index 9e24a56a7e69..1eccb6079534 100644 --- a/examples/asr/asr_streaming/speech_to_text_streaming_infer.py +++ b/examples/asr/asr_streaming/speech_to_text_streaming_infer.py @@ -13,7 +13,7 @@ # limitations under the License. """ -This script can be used to simulate frame-wise streaming for ASR models. The ASR model to be used with this script need to get trained in streaming mode. Currently only Conformer models supports this streaming mode. +This script can be used to simulate cache-aware streaming for ASR models. The ASR model to be used with this script need to get trained in streaming mode. Currently only Conformer models supports this streaming mode. You may find examples of streaming models under 'NeMo/example/asr/conf/conformer/streaming/'. It works both on a manifest of audio files or a single audio file. It can perform streaming for a single stream (audio) or perform the evalution in multi-stream model (batch_size>1). @@ -21,7 +21,7 @@ # Usage -## To evaluate a model in frame-wise streaming mode on a single audio file: +## To evaluate a model in cache-aware streaming mode on a single audio file: python speech_to_text_streaming_infer.py \ --asr_model=asr_model.nemo \ @@ -30,7 +30,7 @@ --use_amp \ --debug_mode -## To evaluate a model in frame-wise streaming mode on a manifest file: +## To evaluate a model in cache-aware streaming mode on a manifest file: python speech_to_text_streaming_infer.py \ --asr_model=asr_model.nemo \ @@ -40,7 +40,32 @@ --use_amp \ --debug_mode -You may drop the '--debug_mode' and '--compare_vs_offline' to speedup the streaming evaluation. If compare_vs_offline is not used, then significantly larger batch_size can be used. +You may drop the '--debug_mode' and '--compare_vs_offline' to speedup the streaming evaluation. +If compare_vs_offline is not used, then significantly larger batch_size can be used. + +## Evaluate a model trained with full context for offline mode + +You may try the cache-aware streaming with a model trained with full context in offline mode. +But the accuracy would not be very good with small chunks as there is inconsistency between how the model is trained and how the streaming inference is done. +The accuracy of the model on the borders of chunks would not be very good. + +To use a model trained with full context, you need to pass the chunk_size and shift_size arguments. +If shift_size is not passed, chunk_size would be use as the shift_size too. +Also argument online_normalization should be enabled to simulate a realistic streaming. +The following command would simulate cache-aware streaming on a pretrained model from NGC with chunk_size of 100, shift_size of 50 and 2 left chunks as left context. +The chunk_size of 100 would be 100*4*10=4000ms for a model with 4x downsampling and 10ms shift in feature extraction. 
+ +python speech_to_text_streaming_infer.py \ + --asr_model=stt_en_conformer_ctc_large \ + --chunk_size=100 \ + --shift_size=50 \ + --left_chunks=2 \ + --online_normalization \ + --manifest_file=manifest_file.json \ + --batch_size=16 \ + --compare_vs_offline \ + --use_amp \ + --debug_mode """ @@ -57,7 +82,7 @@ import nemo.collections.asr as nemo_asr from nemo.collections.asr.metrics.wer import word_error_rate from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis -from nemo.collections.asr.parts.utils.streaming_utils import FramewiseStreamingAudioBuffer +from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer from nemo.utils import logging @@ -118,6 +143,7 @@ def perform_streaming(asr_model, streaming_buffer, compare_vs_offline=False, deb with autocast(): # keep_all_outputs needs to be True for the last step of streaming when model is trained with att_context_style=regular # otherwise the last outputs would get dropped + with torch.no_grad(): ( pred_out_stream, @@ -183,7 +209,31 @@ def main(): action="store_true", help="Whether to compare the output of the model with the offline mode.", ) - parser.add_argument("--batch_size", type=int, default=32) + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="The batch size to be used to perform streaming in batch mode with multiple streams", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=-1, + help="The chunk_size to be used for models trained with full context and offline models", + ) + parser.add_argument( + "--shift_size", + type=int, + default=-1, + help="The shift_size to be used for models trained with full context and offline models", + ) + parser.add_argument( + "--left_chunks", + type=int, + default=2, + help="The number of left chunks to be used as left context via caching for offline models", + ) + parser.add_argument( "--online_normalization", default=False, @@ -237,6 +287,16 @@ def autocast(): asr_model = asr_model.to(args.device) asr_model.eval() + # chunk_size is set automatically for models trained for streaming. For models trained for offline mode with full context, we need to pass the chunk_size explicitly. + if args.chunk_size > 0: + if args.shift_size < 0: + shift_size = args.chunk_size + else: + shift_size = args.shift_size + asr_model.encoder.setup_streaming_params( + chunk_size=args.chunk_size, left_chunks=args.left_chunks, shift_size=shift_size + ) + # In streaming, offline normalization is not feasible as we don't have access to the whole audio at the beginning # When online_normalization is enabled, the normalization of the input features (mel-spectrograms) are done per step # It is suggested to train the streaming models without any normalization in the input features. 
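# Illustrative sketch, not part of the patch: a programmatic equivalent of the CLI
# example documented above. The model name and chunk settings are placeholders; the
# calls themselves (ASRModel.from_pretrained, encoder.setup_streaming_params, and
# CacheAwareStreamingAudioBuffer) are the ones introduced/used in this script.
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer

asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="stt_en_conformer_ctc_large")
asr_model.eval()
# For a model trained offline with full context, the chunk/shift sizes must be set
# explicitly; shift_size falls back to chunk_size when it is not provided.
asr_model.encoder.setup_streaming_params(chunk_size=100, shift_size=50, left_chunks=2)
streaming_buffer = CacheAwareStreamingAudioBuffer(model=asr_model, online_normalization=True)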
@@ -252,7 +312,7 @@ def autocast(): else: online_normalization = False - streaming_buffer = FramewiseStreamingAudioBuffer(model=asr_model, online_normalization=online_normalization) + streaming_buffer = CacheAwareStreamingAudioBuffer(model=asr_model, online_normalization=online_normalization) if args.audio_file is not None: # stream a single audio file processed_signal, processed_signal_length, stream_id = streaming_buffer.append_audio_file( @@ -282,7 +342,7 @@ def autocast(): ) if "text" in sample: all_refs_text.append(sample["text"]) - print(f'Added sample to the buffer: {sample["audio_filepath"]}') + logging.info(f'Added this sample to the buffer: {sample["audio_filepath"]}') if (sample_idx + 1) % args.batch_size == 0 or sample_idx == len(samples) - 1: logging.info(f"Starting to stream samples {sample_idx - len(streaming_buffer) + 1} to {sample_idx}...") diff --git a/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml b/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml index f6563787ca55..c1763c5fa789 100644 --- a/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml @@ -4,7 +4,7 @@ # You may find more detail on the architecture and training config at NeMo/examples/asr/comf/offline/conformer_ctc_bpe.yaml # Models trained with this config have limited right context which make them efficient for streaming ASR -# You may use NeMo/examples/asr/speech_to_text_streaming_infer.py to simulate/evaluate this model in frame-wise streaming mode +# You may use NeMo/examples/asr/speech_to_text_streaming_infer.py to simulate/evaluate this model in cache-aware streaming mode # if loss does not go down properly or gives NAN, you may try the followings: # + using gradient clipping of 1.0 diff --git a/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml b/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml index 1f75ddb265e2..4087b26adfd5 100644 --- a/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml @@ -4,7 +4,7 @@ # You may find more detail on the architecture and training config at NeMo/examples/asr/comf/offline/conformer_transducer_bpe.yaml # Models trained with this config have limited right context which make them efficient for streaming ASR -# You may use NeMo/examples/asr/speech_to_text_streaming_infer.py to simulate/evaluate this model in frame-wise streaming mode +# You may use NeMo/examples/asr/speech_to_text_streaming_infer.py to simulate/evaluate this model in cache-aware streaming mode # if loss does not go down properly or gives NAN, you may try the followings by order: # + using gradient clipping of 1.0 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml index be820552fdfc..4bbb20fc3aae 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml @@ -41,7 +41,6 @@ model: seed: 1234 nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved virtual_prompt_style: 'p-tuning' # one of 'prompt-tuning', 'p-tuning', or 'inference' - encoder_seq_length: 2048 tensor_model_parallel_size: 1 # intra-layer model parallelism 
pipeline_model_parallel_size: 1 # inter-layer model parallelism global_batch_size: 8 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml index 7098f07fa8b1..fef8c26759cd 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_inference.yaml @@ -21,6 +21,7 @@ trainer: tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 gpt_model_file: null # GPT nemo file path -virtual_prompt_model_file: null # path to a MegatronGPTPromptLearningModel model if you want to use soft prompts +virtual_prompt_model_file: ??? # path to a MegatronGPTPromptLearningModel model if you want to use soft prompts +pred_file_path: ??? # Path will model predictions will be written data_paths: # paths to .jsonl files you want to perform inference on \ No newline at end of file diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml new file mode 100644 index 000000000000..8be471a78dde --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml @@ -0,0 +1,44 @@ +name: megatron_t5_finetune_eval + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + replace_sampler_ddp: False + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_t5_finetune_eval + create_checkpoint_callback: False + +model: + restore_from_path: ??? # Path to a finetuned T5 .nemo file + gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + megatron_amp_O2: False # Enable O2 optimization for megatron amp + + data: + validation_ds: + src_file_name: null # Path to the txt file corresponding to the source data. + tgt_file_name: null # Path to the txt file corresponding to the target data. + names: null # If src/tgt file names are ListConfigs, the corresponding label is used to log metrics. + global_batch_size: 64 + micro_batch_size: 64 + shuffle: False + num_workers: 0 + pin_memory: True + max_src_seq_length: 512 + max_tgt_seq_length: 128 + drop_last: False # TODO: Figure out if there is a way to avoid dropping last. + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + metric: + name: "exact_string_match" # Name of the evaluation metric to use. + average: micro # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null # Number of classes for the metric. Works only for 'F1', 'accuracy' and 'average_precision' etc. Refer to torchmetrics for metrics where this is supported. + class_labels: null # If the targets in your dataset are strings and not integers/float, you need to provide a list of class labels (size = num_classes) so we can convert from strings to integer categories to compute the metric. + labels_are_strings: True # NOTE: This is only required to properly handle metrics like f1, accuracy, average_precision etc. This does not affect extract_string_match. 
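# Illustrative sketch, not part of the patch: one way to inspect and override the new
# eval config above from Python. The file paths are placeholders; in practice the same
# overrides would presumably be passed on the command line via Hydra to
# megatron_t5_seq2seq_eval.py.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml")
cfg.model.restore_from_path = "/path/to/finetuned_t5.nemo"              # fills the required (???) field
cfg.model.data.validation_ds.src_file_name = "/path/to/val.source.txt"  # placeholder source file
cfg.model.data.validation_ds.tgt_file_name = "/path/to/val.target.txt"  # placeholder target file
print(OmegaConf.to_yaml(cfg.model.data.validation_ds.metric))           # shows the exact_string_match metric settings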
diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml index 11bad4dc639a..87ce5ac03eb5 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml @@ -33,4 +33,8 @@ model: max_seq_length: 512 drop_last: False write_predictions_to_file: False - prediction_file_path_prefix: null # Prefix of the file to write predictions to. + output_file_path_prefix: null # Prefix of the file to write predictions to. + metric: + name: "exact_string_match" # Name of the evaluation metric to use. + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null diff --git a/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml b/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml index bd2b12d1a706..17be91ffd8ba 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml @@ -30,23 +30,22 @@ exp_manager: monitor: val_loss save_top_k: 2 mode: min - save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.virtual_prompt_save_path set below + save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below filename: "megatron_t5_prompt_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}" model_parallel_size: ${model.tensor_model_parallel_size} save_best_model: True model: seed: 1234 - virtual_prompt_save_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference' tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 - encoder_seq_length: 2048 - global_batch_size: 8 - micro_batch_size: 8 + global_batch_size: 8 + micro_batch_size: 8 # micro batch size should equal global batch size when pipeline parallel = 1 restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with - pretrained_language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required + language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required existing_tasks: [] new_tasks: ["squad"] diff --git a/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning_inference.yaml b/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning_inference.yaml index d696c5120c8a..334edccfb252 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning_inference.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning_inference.yaml @@ -14,7 +14,8 @@ data: tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: 0 # used for encoder and decoder model -pretrained_language_model_file: ??? # path to a pretrained T5 nemo file +language_model_path: ??? # path to a pretrained T5 nemo file virtual_prompt_model_file: ??? # path to a MegatronT5PromptLearningModel nemo file +pred_file_path: ??? 
# Path were all model predicitons will be written to a text file diff --git a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py index b615c73c6256..2e4987f2e18c 100644 --- a/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py @@ -72,6 +72,13 @@ def get_args(): parser.add_argument("--gpus_per_node", type=int, required=True, default=None) parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None) parser.add_argument("--pipeline_model_parallel_size", type=int, required=True, default=None) + parser.add_argument( + "--pipeline_model_parallel_split_rank", + type=int, + required=False, + default=None, + help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", + ) parser.add_argument( "--model_type", type=str, required=True, default="gpt", choices=["gpt", "t5", "bert", "nmt", "bart", "retro"] ) @@ -96,11 +103,27 @@ def convert(local_rank, rank, world_size, args): app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size app_state.tensor_model_parallel_size = args.tensor_model_parallel_size + # Auto set split rank for T5, BART, NMT if split rank is None. + if args.pipeline_model_parallel_size > 1 and args.model_type in ['t5', 'bart', 'nmt']: + if args.pipeline_model_parallel_split_rank is not None: + app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_split_rank + else: + if args.pipeline_model_parallel_size % 2 != 0: + raise ValueError( + f"Pipeline model parallel size {args.pipeline_model_parallel_size} must be even if split rank is not specified." + ) + else: + # If split rank is not set, then we set it to be pipeline_model_parallel_size // 2 - this is because in most cases we have the same number of enc/dec layers. + app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_size // 2 + else: + app_state.pipeline_model_parallel_split_rank = None + app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size parallel_state.initialize_model_parallel( tensor_model_parallel_size_=app_state.tensor_model_parallel_size, pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=app_state.pipeline_model_parallel_split_rank, ) app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank() diff --git a/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py b/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py index 826ed730536e..2a07001b29b3 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py @@ -13,6 +13,7 @@ # limitations under the License. 
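# Quick illustration, not part of the patch, of the split-rank default added to
# megatron_ckpt_to_nemo.py above: for encoder-decoder models (t5/bart/nmt) with pipeline
# parallelism, an unspecified --pipeline_model_parallel_split_rank falls back to half the
# pipeline size, and odd pipeline sizes without an explicit split rank are rejected.
pipeline_model_parallel_size = 4
pipeline_model_parallel_split_rank = pipeline_model_parallel_size // 2  # == 2: encoder stages before rank 2, decoder stages from rank 2 on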
import torch +from apex.transformer import parallel_state from omegaconf import OmegaConf from omegaconf.omegaconf import open_dict from pytorch_lightning.trainer.trainer import Trainer @@ -37,6 +38,7 @@ trainer.num_nodes=1 \ tensor_model_parallel_size=1 \ pipeline_model_parallel_size=1 \ + pred_file_path=PATH_WHERE_PRED_TEXT_FILE_WILL_BE_SAVED \ data_paths=[path/to/dataset1.jsonl, path/to/dataset2.jsonl] virtual_prompt_model_file should be a path to a .nemo file saved after p-tuning/prompt tuning and model file @@ -81,19 +83,19 @@ def main(cfg) -> None: == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" - # Load prompt tuned model, virtual_prompt_model_file must be provided in config - # Update frozen GPT model path in case it has changed + # Update frozen GPT model path if it is given in case it has changed prompt_learning_cfg = MegatronGPTPromptLearningModel.restore_from( cfg.virtual_prompt_model_file, trainer=trainer, return_config=True ) - with open_dict(prompt_learning_cfg): - prompt_learning_cfg.language_model_path = cfg.gpt_model_file + if cfg.get("gpt_model_file"): + with open_dict(prompt_learning_cfg): + prompt_learning_cfg.language_model_path = cfg.gpt_model_file + # Load prompt tuned model, virtual_prompt_model_file must be provided in config # Now load prompt learning model with frozen gpt model base model = MegatronGPTPromptLearningModel.restore_from( restore_path=cfg.virtual_prompt_model_file, trainer=trainer, override_config_path=prompt_learning_cfg ) - model.freeze() # Have to turn off activations_checkpoint_method for inference @@ -102,6 +104,16 @@ def main(cfg) -> None: except AttributeError: pass + # Check whether the DDP is initialized + if parallel_state.is_unitialized(): + + def placeholder(): + return + + if model.trainer.strategy.launcher is not None: + model.trainer.strategy.launcher.launch(placeholder, trainer=model.trainer) + model.trainer.strategy.setup_environment() + length_params: LengthParam = { "max_length": cfg.inference.tokens_to_generate, "min_length": cfg.inference.min_tokens_to_generate, @@ -118,19 +130,6 @@ def main(cfg) -> None: "compute_logprob": cfg.inference.compute_logprob, } - # First method of running text generation, call model.generate method - # Input into generate method should be either list of string prompts or list of dicts - datapaths_dict = [{"data_path": path} for path in cfg.data_paths] - - # Use for inference on a few examples - response = model.generate(inputs=datapaths_dict, length_params=length_params, sampling_params=sampling_params) - - print("***************************") - print(response) - print("***************************") - - # Second method of running text generation, call trainer.predict - # Use for batched inference on larger test sets max_input_length = model.frozen_model.cfg.encoder_seq_length - length_params["max_length"] _, dataloader = model.build_virtual_prompt_dataset( @@ -151,7 +150,13 @@ def main(cfg) -> None: response = trainer.predict(model, dataloader) print("***************************") - print(response) + with open(cfg.pred_file_path, "w", encoding="utf-8") as pred_file: + for i in range(len(response)): + for sent in response[i]["sentences"]: + sent = sent.strip() + sent = sent.replace("\n", " ") + pred_file.write(sent + "\n") + print(f"Inference Complete, prediction file saved at {cfg.pred_file_path}") print("***************************") diff --git 
a/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py b/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py index b1d39141d742..812eb51975d3 100644 --- a/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py @@ -64,18 +64,20 @@ def main(cfg) -> None: pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, ) - # Load prompt tuned model, virtual_prompt_model_file and pretrained_language_model_file must be provided in config - if ( - cfg.get('virtual_prompt_model_file', None) is not None - and cfg.get('pretrained_language_model_file', None) is not None - ): + # Load prompt tuned model, virtual_prompt_model_file and language_model_path must be provided in config + if cfg.get('virtual_prompt_model_file', None) is not None and cfg.get('language_model_path', None) is not None: # Update frozen T5 model path in case it has changed prompt_learning_cfg = MegatronT5PromptLearningModel.restore_from( cfg.virtual_prompt_model_file, trainer=trainer, return_config=True ) with open_dict(prompt_learning_cfg): - prompt_learning_cfg.pretrained_language_model_path = cfg.pretrained_language_model_file + if cfg.get("language_model_path"): + # This is for backward compatibility with old checkpoints that used `pretrained_language_model_path` instead of `language_model_path`. + if hasattr(prompt_learning_cfg, 'pretrained_language_model_path'): + prompt_learning_cfg.pretrained_language_model_path = cfg.language_model_path + else: + prompt_learning_cfg.language_model_path = cfg.language_model_path prompt_learning_cfg.micro_batch_size = cfg.data.get('micro_batch_size', 4) prompt_learning_cfg.global_batch_size = cfg.data.get('global_batch_size', 4) @@ -99,7 +101,7 @@ def dummy(): model.freeze() - test_ds, test_dl = model.build_virtual_prompt_dataset( + _, test_dl = model.build_virtual_prompt_dataset( dataset_paths=cfg.data.test_ds, batch_size=cfg.data.global_batch_size, for_train=False, @@ -109,8 +111,13 @@ def dummy(): pin_memory=True, ) - trainer.predict(model, test_dl) - + outputs = trainer.predict(model, test_dl) + with open(cfg.pred_file_path, "w", encoding="utf-8") as pred_file: + for batch in outputs: + preds = batch["predicted_token_ids"] + for pred in preds: + pred = pred.strip().replace("\n", " ") + pred_file.write(pred + "\n") print('test finish---------------------------------') diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index f51a809654ad..01cdd44d7976 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -20,7 +20,13 @@ from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy +from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, +) from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager @@ -63,36 +69,52 @@ def main(cfg) -> None: if isinstance(callback, Timer): trainer.callbacks[idx] = 
StatelessTimer(cfg.trainer.max_time,) - # Get the T5 Base configuration. - if hasattr(t5_cfg.data.validation_ds, 'task_name'): - t5_cfg = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - else: - t5_cfg = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) + t5_cfg = MegatronT5GLUEModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + ) # Override the T5 configuration with the one from the config file. # NOTE: Only data can be overriden here since this the file being restored here should already correspond to a GLUE/XNLI finetuned model. OmegaConf.set_struct(t5_cfg, True) with open_dict(t5_cfg): - t5_cfg.masked_softmax_fusion = False t5_cfg.precision = cfg.trainer.precision # Overwrite data configs - t5_cfg.data = cfg.model.data - # XNLI has eval languages in the yaml config. - if hasattr(cfg.model, 'eval_languages'): - t5_cfg.eval_languages = cfg.model.eval_languages - - if hasattr(t5_cfg.data.validation_ds, 'task_name'): - model = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg - ) - else: - model = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg - ) + if cfg.model.data.validation_ds.get('src_file_name', None) is not None: + logging.info( + 'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' + ) + t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name + if cfg.model.data.validation_ds.get('tgt_file_name', None) is not None: + logging.info( + 'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' 
+ ) + t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name + + t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size + t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size + + if hasattr(cfg.model.data.validation_ds, 'task_name'): + model = MegatronT5GLUEModel.restore_from( + restore_path=cfg.model.restore_from_path, + trainer=trainer, + override_config_path=t5_cfg, + save_restore_connector=NLPSaveRestoreConnector(), + ) + elif hasattr(cfg.model.data.validation_ds, 'file_names'): + model = MegatronT0Model.restore_from( + restore_path=cfg.model.restore_from_path, + trainer=trainer, + override_config_path=t5_cfg, + save_restore_connector=NLPSaveRestoreConnector(), + ) + else: + model = MegatronT5FinetuneModel.restore_from( + restore_path=cfg.model.restore_from_path, + trainer=trainer, + override_config_path=t5_cfg, + save_restore_connector=NLPSaveRestoreConnector(), + ) + model.freeze() trainer.validate(model) if hasattr(cfg.model.data, 'test_ds'): diff --git a/examples/nlp/machine_translation/enc_dec_nmt_finetune.py b/examples/nlp/machine_translation/enc_dec_nmt_finetune.py index 21858c30405f..16a635d09dee 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt_finetune.py +++ b/examples/nlp/machine_translation/enc_dec_nmt_finetune.py @@ -29,6 +29,7 @@ from nemo.utils.config_utils import update_model_config from nemo.utils.exp_manager import ExpManagerConfig, exp_manager + """ Usage: python enc_dec_nmt_finetune.py \ diff --git a/examples/text_processing/g2p/conf/g2p_conformer_ctc.yaml b/examples/text_processing/g2p/conf/g2p_conformer_ctc.yaml index bc28f621ca99..72d069b2ba70 100644 --- a/examples/text_processing/g2p/conf/g2p_conformer_ctc.yaml +++ b/examples/text_processing/g2p/conf/g2p_conformer_ctc.yaml @@ -27,7 +27,7 @@ model: feat_in: ${model.embedding.d_model} feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 16 - d_model: 256 + d_model: 176 # Sub-sampling params subsampling: null # vggnet or striding, vggnet may give better results but needs more memory @@ -39,7 +39,7 @@ model: # Multi-headed Attention Module's params self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models + n_heads: 4 # may need to be lower for smaller d_models # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention att_context_size: [ -1, -1 ] # -1 means unlimited context xscaling: true # scales up the input embeddings by sqrt(d_model) diff --git a/examples/text_processing/g2p/g2p_train_and_evaluate.py b/examples/text_processing/g2p/g2p_train_and_evaluate.py index 4f7e1102200b..395b82c3f073 100644 --- a/examples/text_processing/g2p/g2p_train_and_evaluate.py +++ b/examples/text_processing/g2p/g2p_train_and_evaluate.py @@ -38,6 +38,8 @@ trainer.devices=1 \ do_training=True \ do_testing=True + + Example of the config file: NeMo/examples/text_processing/g2p/conf/t5_g2p.yaml # Training Conformer-G2P Model and evaluation at the end of training: python examples/text_processing/g2p/g2p_train_and_evaluate.py \ @@ -50,6 +52,8 @@ do_training=True \ do_testing=True + Example of the config file: NeMo/examples/text_processing/g2p/conf/g2p_conformer_ctc.yaml + # Run evaluation of the pretrained model: python examples/text_processing/g2p/g2p_train_and_evaluate.py \ # (Optional: --config-path= --config-name=) \ diff --git 
a/examples/text_processing/g2p/heteronym_classification_inference.py b/examples/text_processing/g2p/heteronym_classification_inference.py index b0047093eb52..5e9afbb08fd1 100644 --- a/examples/text_processing/g2p/heteronym_classification_inference.py +++ b/examples/text_processing/g2p/heteronym_classification_inference.py @@ -31,6 +31,8 @@ This script runs inference with HeteronymClassificationModel If the input manifest contains target "word_id", evaluation will be also performed. +To prepare dataset, see NeMo/scripts/dataset_processing/g2p/export_wikihomograph_data_to_manifest.py + python heteronym_classification_inference.py \ manifest="" \ pretrained_model="" \ diff --git a/examples/text_processing/g2p/heteronym_classification_train_and_evaluate.py b/examples/text_processing/g2p/heteronym_classification_train_and_evaluate.py index a28021fe8b83..456d5ed32d5c 100644 --- a/examples/text_processing/g2p/heteronym_classification_train_and_evaluate.py +++ b/examples/text_processing/g2p/heteronym_classification_train_and_evaluate.py @@ -27,11 +27,14 @@ """ This script runs training and evaluation of HeteronymClassificationModel +To prepare dataset, see NeMo/scripts/dataset_processing/g2p/export_wikihomograph_data_to_manifest.py + To run training and testing: python heteronym_classification_train_and_evaluate.py \ train_manifest=" \ validation_manifest=" \ model.encoder.pretrained="" \ + model.wordids= \ do_training=True \ do_testing=True """ diff --git a/examples/tts/conf/de/fastpitch_align_44100.yaml b/examples/tts/conf/de/fastpitch_align_44100.yaml index 36ca8fb74632..9060a859f4b9 100644 --- a/examples/tts/conf/de/fastpitch_align_44100.yaml +++ b/examples/tts/conf/de/fastpitch_align_44100.yaml @@ -67,7 +67,7 @@ model: punct_post_process: true text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanCharsTokenizer + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanPhonemesTokenizer punct: true apostrophe: true pad_with_space: true diff --git a/nemo/collections/asr/models/configs/__init__.py b/nemo/collections/asr/models/configs/__init__.py index 01a98d3743a4..4ccc90c97cde 100644 --- a/nemo/collections/asr/models/configs/__init__.py +++ b/nemo/collections/asr/models/configs/__init__.py @@ -14,9 +14,9 @@ from nemo.collections.asr.models.configs.asr_models_config import ( ASRDatasetConfig, + CacheAwareStreamingConfig, EncDecCTCConfig, EncDecCTCModelConfig, - FramewiseStreamingConfig, ) from nemo.collections.asr.models.configs.classification_models_config import ( EncDecClassificationConfig, diff --git a/nemo/collections/asr/models/configs/asr_models_config.py b/nemo/collections/asr/models/configs/asr_models_config.py index 385d0f47ab27..e0ceeff6b186 100644 --- a/nemo/collections/asr/models/configs/asr_models_config.py +++ b/nemo/collections/asr/models/configs/asr_models_config.py @@ -94,7 +94,7 @@ class EncDecCTCModelConfig(model_cfg.NemoConfig): @dataclass -class FramewiseStreamingConfig: +class CacheAwareStreamingConfig: chunk_size: int = 0 # the size of each chunk at each step, it can be a list of two integers to specify different chunk sizes for the first step and others shift_size: int = 0 # the size of the shift in each step, it can be a list of two integers to specify different shift sizes for the first step and others diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index 125fa1dd40b3..6e6a2cb6250e 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py 
+++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -549,4 +549,11 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: ) results.append(model) + model = PretrainedModelInfo( + pretrained_model_name="stt_hr_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hr_conformer_ctc_large/versions/1.11.0/files/stt_hr_conformer_ctc_large.nemo", + ) + results.append(model) + return results diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 14578563d643..ed208b7f3b90 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -218,6 +218,13 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) results.append(model) + model = PretrainedModelInfo( + pretrained_model_name="stt_hr_conformer_transducer_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_transducer_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hr_conformer_transducer_large/versions/1.11.0/files/stt_hr_conformer_transducer_large.nemo", + ) + results.append(model) + return results def __init__(self, cfg: DictConfig, trainer: Trainer = None): diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index b94ab4137563..54ecba770c25 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -21,7 +21,7 @@ import torch.nn as nn from omegaconf import DictConfig, ListConfig -from nemo.collections.asr.models.configs import FramewiseStreamingConfig +from nemo.collections.asr.models.configs import CacheAwareStreamingConfig from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder from nemo.collections.asr.parts.submodules.causal_convs import CausalConv1D from nemo.collections.asr.parts.submodules.conformer_modules import ConformerLayer @@ -312,6 +312,19 @@ def __init__( self.setup_streaming_params() self.export_cache_support = False + def update_max_seq_length(self, seq_length: int, device): + # Find global max audio length across all nodes + if torch.distributed.is_initialized(): + global_max_len = torch.tensor([seq_length], dtype=torch.float32, device=device) + + # Update across all ranks in the distributed system + torch.distributed.all_reduce(global_max_len, op=torch.distributed.ReduceOp.MAX) + + seq_length = global_max_len.int().item() + + if seq_length > self.max_audio_length: + self.set_max_audio_length(seq_length) + def set_max_audio_length(self, max_audio_length): """ Sets maximum input length. 
@@ -319,11 +332,6 @@ def set_max_audio_length(self, max_audio_length): """ self.max_audio_length = max_audio_length device = next(self.parameters()).device - seq_range = torch.arange(0, self.max_audio_length, device=device) - if hasattr(self, 'seq_range'): - self.seq_range = seq_range - else: - self.register_buffer('seq_range', seq_range, persistent=False) self.pos_enc.extend_pe(max_audio_length, device) att_mask = torch.ones(1, max_audio_length, max_audio_length, dtype=torch.bool, device=device) @@ -353,7 +361,7 @@ def forward(self, audio_signal, length, cache_last_channel=None, cache_last_time if length is None: length = audio_signal.new_full( - audio_signal.size(0), max_audio_length, dtype=torch.int32, device=self.seq_range.device + audio_signal.size(0), max_audio_length, dtype=torch.int32, device=audio_signal.device ) if cache_last_channel is not None: @@ -393,7 +401,9 @@ def forward(self, audio_signal, length, cache_last_channel=None, cache_last_time audio_signal, pos_emb = self.pos_enc(x=audio_signal) # pad_mask is the masking to be used to ignore paddings - pad_mask = self.make_pad_mask(max_audio_length=max_audio_length, seq_lens=padding_length) + pad_mask = torch.arange(0, max_audio_length, device=audio_signal.device).expand( + padding_length.size(0), -1 + ) < padding_length.unsqueeze(-1) # pad_mask_for_att_mask is the mask which helps to ignore paddings pad_mask_for_att_mask = pad_mask.unsqueeze(1).repeat([1, max_audio_length, 1]) @@ -432,57 +442,49 @@ def forward(self, audio_signal, length, cache_last_channel=None, cache_last_time else: return audio_signal, length - def update_max_seq_length(self, seq_length: int, device): - # Find global max audio length across all nodes - if torch.distributed.is_initialized(): - global_max_len = torch.tensor([seq_length], dtype=torch.float32, device=device) - - # Update across all ranks in the distributed system - torch.distributed.all_reduce(global_max_len, op=torch.distributed.ReduceOp.MAX) - - seq_length = global_max_len.int().item() - - if seq_length > self.max_audio_length: - self.set_max_audio_length(seq_length) - - def make_pad_mask(self, max_audio_length, seq_lens): - """Make masking for padding.""" - mask = self.seq_range[:max_audio_length].expand(seq_lens.size(0), -1) < seq_lens.unsqueeze(-1) - return mask - def enable_pad_mask(self, on=True): - # On inference, user may chose to disable pad mask + # On inference, user may choose to disable pad mask mask = self.use_pad_mask self.use_pad_mask = on return mask def setup_streaming_params( - self, max_context: int = 10000, + self, chunk_size: int = None, shift_size: int = None, left_chunks: int = None, max_context: int = 10000 ): """ This function sets the needed values and parameters to perform streaming. The configuration would be stored in self.streaming_cfg. The streaming configuration is needed to simulate streaming inference. 
+ Args: + chunk_size (int): overrides the chunk size + shift_size (int): overrides the shift size for chunks + left_chunks (int): overrides the number of left chunks visible to each chunk + max_context (int): the value used for the cache size of last_channel layers if left context is set to infinity (-1) + Defaults to -1 (means feat_out is d_model) """ - streaming_cfg = FramewiseStreamingConfig() - if self.att_context_style == "chunked_limited": + streaming_cfg = CacheAwareStreamingConfig() + if chunk_size is not None: + if chunk_size <= 1: + raise ValueError("chunk_size needs to be a number larger or equal to one.") + lookahead_steps = chunk_size - 1 + streaming_cfg.cache_drop_size = chunk_size - shift_size + elif self.att_context_style == "chunked_limited": lookahead_steps = self.att_context_size[1] streaming_cfg.cache_drop_size = 0 elif self.att_context_style == "regular": - lookahead_steps_att = ( - self.att_context_size[1] * self.n_layers if self.att_context_size[1] >= 0 else max_context - ) - lookahead_steps_conv = ( - self.conv_context_size[1] * self.n_layers if self.conv_context_size[1] >= 0 else max_context - ) - lookahead_steps = max(lookahead_steps_att, lookahead_steps_conv) + lookahead_steps = self.att_context_size[1] * self.n_layers + self.conv_context_size[1] * self.n_layers streaming_cfg.cache_drop_size = lookahead_steps else: - streaming_cfg.cache_drop_size = cache_drop_size + streaming_cfg.cache_drop_size = 0 lookahead_steps = None - streaming_cfg.last_channel_cache_size = ( - self.att_context_size[0] if self.att_context_size[0] >= 0 else max_context - ) + if chunk_size is None: + streaming_cfg.last_channel_cache_size = ( + self.att_context_size[0] if self.att_context_size[0] >= 0 else max_context + ) + else: + if left_chunks is None: + raise ValueError("left_chunks can not be None when chunk_size is set.") + streaming_cfg.last_channel_cache_size = left_chunks * chunk_size if hasattr(self.pre_encode, "get_sampling_frames"): sampling_frames = self.pre_encode.get_sampling_frames() diff --git a/nemo/collections/asr/parts/mixins/streaming.py b/nemo/collections/asr/parts/mixins/streaming.py index 25c0aeda9103..1159295c650e 100644 --- a/nemo/collections/asr/parts/mixins/streaming.py +++ b/nemo/collections/asr/parts/mixins/streaming.py @@ -23,7 +23,7 @@ def setup_streaming_params( self, max_look_ahead: int = 10000, ): """ - This function sets the needed values and parameters to perform streaming. The configuration (FramewiseStreamingConfig) need to be stored in self.streaming_cfg. + This function sets the needed values and parameters to perform streaming. The configuration (CacheAwareStreamingConfig) need to be stored in self.streaming_cfg. The streaming configuration is needed to simulate streaming inference. 
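A usage sketch for the new overrides (assuming `asr_model` holds a cache-aware Conformer-based NeMo ASR model restored in the usual way); when chunk_size is given, the cache sizes are derived from the call rather than from the training-time attention context:

# chunk of 16 steps, shift of 16 (no overlap), two chunks of visible left context
asr_model.encoder.setup_streaming_params(chunk_size=16, shift_size=16, left_chunks=2)
cfg = asr_model.encoder.streaming_cfg
# per the logic above: cache_drop_size = 16 - 16 = 0, last_channel_cache_size = 2 * 16 = 32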
It would set the following """ pass @@ -72,7 +72,12 @@ def cache_aware_stream_step( encoded, encoded_len, cache_last_channel_next, cache_last_time_next = encoder_output if cache_last_channel_next is not None and self.streaming_cfg.last_channel_cache_size >= 0: - cache_last_channel_next = cache_last_channel_next[:, :, -self.streaming_cfg.last_channel_cache_size :, :] + if self.streaming_cfg.last_channel_cache_size > 0: + cache_last_channel_next = cache_last_channel_next[ + :, :, -self.streaming_cfg.last_channel_cache_size :, : + ] + else: + cache_last_channel_next = cache_last_channel_next[:, :, 0:0, :] if not keep_all_outputs: encoded = encoded[:, :, : self.streaming_cfg.valid_out_len] encoded_len = torch.clamp(encoded_len, max=self.streaming_cfg.valid_out_len) diff --git a/nemo/collections/asr/parts/submodules/causal_convs.py b/nemo/collections/asr/parts/submodules/causal_convs.py index 4e44c9bd9923..39e03419cfac 100644 --- a/nemo/collections/asr/parts/submodules/causal_convs.py +++ b/nemo/collections/asr/parts/submodules/causal_convs.py @@ -141,10 +141,10 @@ def update_cache(self, x, cache=None, cache_next=None): if cache_next is not None: x_keep_size = input_x.size(-1) - self.cache_drop_size cache_keep_size = torch.tensor(x_keep_size, dtype=torch.int64) - cache_keep_size = cache_keep_size.clip(max=cache_next.size(-1)) + cache_keep_size = cache_keep_size.clip(min=1, max=cache_next.size(-1)) - cache_next[self._cache_id, :, :, :-x_keep_size] = cache[self._cache_id, :, :, cache_keep_size:] - input_x_kept = input_x[:, :, :x_keep_size] + cache_next[self._cache_id, :, :, :-cache_keep_size] = cache[self._cache_id, :, :, cache_keep_size:] + input_x_kept = input_x[:, :, :cache_keep_size] cache_next[self._cache_id, :, :, -cache_keep_size:] = input_x_kept[:, :, -cache_keep_size:] return x diff --git a/nemo/collections/asr/parts/submodules/conformer_modules.py b/nemo/collections/asr/parts/submodules/conformer_modules.py index 7948c691afe5..ca3052c31d2a 100644 --- a/nemo/collections/asr/parts/submodules/conformer_modules.py +++ b/nemo/collections/asr/parts/submodules/conformer_modules.py @@ -207,28 +207,15 @@ def __init__( in_channels=d_model, out_channels=d_model * 2, kernel_size=1, stride=1, padding=0, bias=True ) - if conv_context_size is None or ( - conv_context_size[0] == (kernel_size - 1) // 2 and conv_context_size[1] == (kernel_size - 1) // 2 - ): - self.depthwise_conv = nn.Conv1d( - in_channels=dw_conv_input_dim, - out_channels=dw_conv_input_dim, - kernel_size=kernel_size, - stride=1, - padding=(kernel_size - 1) // 2, - groups=d_model, - bias=True, - ) - else: - self.depthwise_conv = CausalConv1D( - in_channels=dw_conv_input_dim, - out_channels=dw_conv_input_dim, - kernel_size=kernel_size, - stride=1, - padding=conv_context_size, - groups=d_model, - bias=True, - ) + self.depthwise_conv = CausalConv1D( + in_channels=dw_conv_input_dim, + out_channels=dw_conv_input_dim, + kernel_size=kernel_size, + stride=1, + padding=conv_context_size, + groups=d_model, + bias=True, + ) if norm_type == 'batch_norm': self.batch_norm = nn.BatchNorm1d(dw_conv_input_dim) diff --git a/nemo/collections/asr/parts/submodules/multi_head_attention.py b/nemo/collections/asr/parts/submodules/multi_head_attention.py index f2c7700357a2..3beacaffeec8 100644 --- a/nemo/collections/asr/parts/submodules/multi_head_attention.py +++ b/nemo/collections/asr/parts/submodules/multi_head_attention.py @@ -148,10 +148,11 @@ def update_cache(self, key, value, query, cache, cache_next): if cache_next is not None: cache_next_length = 
cache_next.size(2) q_keep_size = q_length - self.cache_drop_size + q_keep_size = torch.tensor(q_keep_size, dtype=torch.int64).clip(min=1) cache_next[self._cache_id, :, :-q_keep_size, :] = cache[ self._cache_id, :, -(cache_next_length - q_keep_size) :, : - ].clone() + ] cache_next[self._cache_id, :, -q_keep_size:, :] = q_input[:, :q_keep_size, :] return key, value, query diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index 9b3d87431e8c..4a490ca3fcf4 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -1172,9 +1172,9 @@ def transcribe( return output -class FramewiseStreamingAudioBuffer: +class CacheAwareStreamingAudioBuffer: """ - A buffer to be used for frame-wise streaming. It can load a single or multiple audio files/processed signals, split them in chunks and return one on one. + A buffer to be used for cache-aware streaming. It can load a single or multiple audio files/processed signals, split them in chunks and return one on one. It can be used to simulate streaming audio or audios. """ @@ -1203,6 +1203,11 @@ def __init__(self, model, online_normalization=None): self.preprocessor = self.extract_preprocessor() + if hasattr(model.encoder, "pre_encode") and hasattr(model.encoder.pre_encode, "get_sampling_frames"): + self.sampling_frames = model.encoder.pre_encode.get_sampling_frames() + else: + self.sampling_frames = None + def __iter__(self): while True: if self.buffer_idx >= self.buffer.size(-1): @@ -1228,6 +1233,17 @@ def __iter__(self): audio_chunk = self.buffer[:, :, self.buffer_idx : self.buffer_idx + chunk_size] + if self.sampling_frames is not None: + # checking to make sure the audio chunk has enough frames to produce at least one output after downsampling + if self.buffer_idx == 0 and isinstance(self.sampling_frames, list): + cur_sampling_frames = self.sampling_frames[0] + else: + cur_sampling_frames = ( + self.sampling_frames[1] if isinstance(self.sampling_frames, list) else self.sampling_frames + ) + if audio_chunk.size(-1) < cur_sampling_frames: + return + # Adding the cache needed for the pre-encoder part of the model to the chunk # if there is not enough frames to be used as the pre-encoding cache, zeros would be added zeros_pads = None diff --git a/nemo/collections/nlp/data/common/sequence_to_sequence_dataset.py b/nemo/collections/nlp/data/common/sequence_to_sequence_dataset.py index fd4843846fb0..9aaa8456ccf1 100644 --- a/nemo/collections/nlp/data/common/sequence_to_sequence_dataset.py +++ b/nemo/collections/nlp/data/common/sequence_to_sequence_dataset.py @@ -86,8 +86,12 @@ def _get_examples(self): + self.tgt_tokenizer.text_to_ids(tgt.strip()) + [self.tgt_tokenizer.eos_id] ) - if len(src) <= self.max_src_seq_length and len(tgt) < self.max_tgt_seq_length: - self.examples.append({'src': src, 'tgt': tgt}) + # Truncate to max sequence length. 
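# Over-length examples were previously skipped entirely; trimming from the head keeps
# the tail of the sequence, so the appended EOS id survives while the prepended BOS id
# (and the earliest tokens) may be dropped for very long inputs.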
+ if len(src) > self.max_src_seq_length: + src = src[-self.max_src_seq_length + 1 :] + if len(tgt) > self.max_tgt_seq_length: + tgt = tgt[-self.max_tgt_seq_length + 1 :] + self.examples.append({'src': src, 'tgt': tgt}) logging.info(f'Dataset Length : {len(self.examples)}') diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index 32092ed3e5bb..28aebdbda769 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -21,6 +21,7 @@ from pytorch_lightning.trainer.trainer import Trainer from torch import Tensor +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common import ( PromptEncoder, @@ -65,7 +66,14 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.tokenizer = self.frozen_model.tokenizer - self.hidden_size = self.frozen_model.cfg.hidden_size + if hasattr(self.frozen_model.cfg, "encoder") and hasattr(self.frozen_model.cfg, "decoder"): + self.hidden_size = ( + self.frozen_model.cfg.encoder.hidden_size + ) # Encoder and decoder need to have the same hidden size and we check for this in the frozen enc-dec model. + else: + self.hidden_size = self.frozen_model.cfg.hidden_size + + # TODO: Handle this when moving GPT prompt learning to the base class. self.word_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.word_embeddings self.existing_tasks = list(self.cfg.get('existing_tasks', [])) self.new_tasks = list(self.cfg.get('new_tasks', [])) @@ -101,7 +109,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): # Prepare pseudo token ids for virtual/virtual prompt tokens self.pseudo_tokens = get_pseudo_tokens(self.max_virtual_tokens) - self.tokenizer.add_special_tokens({'additional_special_tokens': self.pseudo_tokens}) + if isinstance(self.tokenizer, SentencePieceTokenizer): + self.tokenizer.add_special_tokens(self.pseudo_tokens) + else: + self.tokenizer.add_special_tokens({'additional_special_tokens': self.pseudo_tokens}) self.pseudo_token_ids = self.tokenizer.tokens_to_ids(self.pseudo_tokens) self.pseudo_token_ids_start = self.pseudo_token_ids[0] self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id @@ -396,8 +407,8 @@ def on_train_end(self): self.cfg.virtual_prompt_style = VirtualPromptStyle.INFERENCE.value # Save the best nemo model - self.save_to(save_path=self.cfg.virtual_prompt_save_path) - logging.info(f"The final model was saved to {self.cfg.virtual_prompt_save_path}") + self.save_to(save_path=self.cfg.nemo_path) + logging.info(f"The final model was saved to {self.cfg.nemo_path}") def setup(self, stage=None): if stage == 'predict' or self.virtual_prompt_style == VirtualPromptStyle.INFERENCE: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index a23375c64ecb..021108345d09 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -439,12 +439,11 @@ def inference_epoch_end(self, outputs, mode, data_cfg): for pred, label, input, category in zip( 
batch['preds'], batch['labels'], batch['inputs'], batch['categories'] ): - if input + label not in gt_inp_set: - gt_inp_set.add(input + label) - deduplicated_outputs['preds'].append(pred) - deduplicated_outputs['labels'].append(label) - deduplicated_outputs['categories'].append(category) - deduplicated_outputs['inputs'].append(input) + gt_inp_set.add(input + label) + deduplicated_outputs['preds'].append(pred) + deduplicated_outputs['labels'].append(label) + deduplicated_outputs['categories'].append(category) + deduplicated_outputs['inputs'].append(input) self.write_predictions_to_file( deduplicated_outputs, f"{data_cfg.output_file_path_prefix}_{filename_log_key}" ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 9ed0265a234a..9212dcda34fc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -92,9 +92,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): # Need to overwrite some params in frozen model's config before restoring with open_dict(frozen_model_cfg): frozen_model_cfg.megatron_amp_O2 = False + frozen_model_cfg.optim.name = "fused_adam" frozen_model_cfg.micro_batch_size = self.cfg.micro_batch_size frozen_model_cfg.global_batch_size = self.cfg.global_batch_size frozen_model_cfg.precision = trainer.precision + frozen_model_cfg.sequence_parallel = False # Load pretrained GPT model and tokenizer, frozen model will have lr=0.0 if cfg.get('language_model_path', None): @@ -138,6 +140,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): hidden_size=self.hidden_size, ) + self.padded_vocab_size = self.frozen_model.padded_vocab_size self._prompt_table_key = VirtualPromptSource.PROMPT_TABLE.value self._prompt_encoder_key = VirtualPromptSource.PROMPT_ENCODER.value diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 6a87fd0f0052..2d72380af5e4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -471,7 +471,7 @@ def loss_func(output_tensor): return fwd_output_and_loss_func - @functools.cached_property + @functools.lru_cache(maxsize=None) def _kwargs_to_arg_idx(self): """ Returns a dict {kwarg name: arg index} to be used when mapping @@ -500,7 +500,7 @@ def _build_forward_args_from_kwargs(self, args_name, args, **kwargs): raise ValueError(f"args_name = {args_name} cannot overlap kwargs = {list(kwargs.keys())}") # get mapping of kwarg names to arg index - kwargs_to_arg_idx = self._kwargs_to_arg_idx + kwargs_to_arg_idx = self._kwargs_to_arg_idx() # collect all arguments all_args_name = args_name[:] @@ -790,7 +790,7 @@ def process_global_batch(self, global_batch): def build_train_valid_test_datasets(self): raise NotImplementedError("Please implement this method in child-class") - def build_pretraining_data_loader(self, dataset, consumed_samples): + def build_pretraining_data_loader(self, dataset, consumed_samples, num_workers): """Buld dataloader given an input dataset.""" if dataset is None: @@ -826,7 +826,7 @@ def build_pretraining_data_loader(self, dataset, consumed_samples): # Torch dataloader. 
return torch.utils.data.DataLoader( - dataset, batch_sampler=batch_sampler, num_workers=self._cfg.data.num_workers, pin_memory=True, + dataset, batch_sampler=batch_sampler, num_workers=num_workers, pin_memory=True, ) def setup(self, stage=None): @@ -881,7 +881,9 @@ def setup(self, stage=None): def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): consumed_samples = self.compute_consumed_samples(0) - self._train_dl = self.build_pretraining_data_loader(self._train_ds, consumed_samples) + self._train_dl = self.build_pretraining_data_loader( + self._train_ds, consumed_samples, num_workers=self._cfg.data.num_workers + ) def on_pretrain_routine_start(self) -> None: # keep a copy of init_global_step @@ -891,12 +893,14 @@ def on_pretrain_routine_start(self) -> None: def setup_validation_data(self, cfg): if hasattr(self, '_validation_ds'): consumed_samples = 0 - self._validation_dl = self.build_pretraining_data_loader(self._validation_ds, consumed_samples) + self._validation_dl = self.build_pretraining_data_loader( + self._validation_ds, consumed_samples, num_workers=0 + ) def setup_test_data(self, cfg): if hasattr(self, '_test_ds'): consumed_samples = 0 - self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) + self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples, num_workers=0) def compute_consumed_samples(self, steps_since_resume=0): app_state = AppState() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py index ceb2d8cd54ba..182c44bd4642 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py @@ -83,8 +83,12 @@ def forward( else: input_embeds = self.embed_input_train(input_ids, taskname_ids) - position_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.position_embeddings(position_ids) - encoder_input = input_embeds + position_embeddings + # TODO: This check needs to be revisited with PP support. + if hasattr(self.frozen_model.enc_dec_model.encoder_embedding, 'position_embeddings'): + position_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.position_embeddings(position_ids) + encoder_input = input_embeds + position_embeddings + else: + encoder_input = input_embeds # Call forward on T5 model with preprocessed embeddings if self.autocast_dtype == torch.float32: @@ -118,12 +122,14 @@ def load_frozen_model(self, cfg, trainer): # TODO: Fix this once apex patches FusedScaledMaskedSoftmax. # This is a workaround for the fact that `masked_softmax_fusion` has issues with certain input sizes that may be present while finetuning. 
- t5_cfg = MegatronT5Model.restore_from( - cfg.get('pretrained_language_model_path'), trainer=trainer, return_config=True - ) + t5_cfg = MegatronT5Model.restore_from(cfg.get('language_model_path'), trainer=trainer, return_config=True) OmegaConf.set_struct(t5_cfg, True) with open_dict(t5_cfg): - t5_cfg.masked_softmax_fusion = False + if hasattr(t5_cfg, 'encoder') and hasattr(t5_cfg, 'decoder'): + t5_cfg.encoder.masked_softmax_fusion = False + t5_cfg.decoder.masked_softmax_fusion = False + else: + t5_cfg.masked_softmax_fusion = False t5_cfg.megatron_amp_O2 = self.megatron_amp_o2 # hack to make the _GLOBAL_NUM_MICROBATCHES_CALCULATOR initialize t5_cfg.micro_batch_size = cfg.get('micro_batch_size', 4) @@ -131,7 +137,7 @@ def load_frozen_model(self, cfg, trainer): t5_cfg.precision = trainer.precision self.frozen_model = MegatronT5Model.restore_from( - cfg.get('pretrained_language_model_path'), + cfg.get('language_model_path'), trainer=trainer, override_config_path=t5_cfg, save_restore_connector=NLPSaveRestoreConnector(), @@ -237,8 +243,12 @@ def inference_step(self, batch, batch_idx, inference=False): input_embeds = self.embed_input_train(enc_input, taskname_ids) - position_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.position_embeddings(position_ids) - encoder_input = input_embeds + position_embeddings + # TODO: This check needs to be revisited with PP support. + if hasattr(self.frozen_model.enc_dec_model.encoder_embedding, 'position_embeddings'): + position_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.position_embeddings(position_ids) + encoder_input = input_embeds + position_embeddings + else: + encoder_input = input_embeds loss_mean = self.fwd_bwd_step(batch, batch_idx, forward_only=True) @@ -259,9 +269,18 @@ def inference_step(self, batch, batch_idx, inference=False): idx = pred.index(self.tokenizer.eos_id) pred = pred[:idx] - pred = [id for id in pred if id not in self.tokenizer.tokenizer.additional_special_tokens_ids] - label = [id for id in label if id not in self.tokenizer.tokenizer.additional_special_tokens_ids] - enc_input = [id for id in enc_input if id not in self.tokenizer.tokenizer.additional_special_tokens_ids] + # Sentencepiece case + if hasattr(self.tokenizer, 'special_token_to_id'): + pred = [id for id in pred if id not in self.tokenizer.special_token_to_id.values()] + label = [id for id in label if id not in self.tokenizer.special_token_to_id.values()] + enc_input = [id for id in enc_input if id not in self.tokenizer.special_token_to_id.values()] + # HF Autotokenizer case. + else: + pred = [id for id in pred if id not in self.tokenizer.tokenizer.additional_special_tokens_ids] + label = [id for id in label if id not in self.tokenizer.tokenizer.additional_special_tokens_ids] + enc_input = [ + id for id in enc_input if id not in self.tokenizer.tokenizer.additional_special_tokens_ids + ] pred = self.tokenizer.ids_to_text(pred) label = self.tokenizer.ids_to_text(label) @@ -374,8 +393,12 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> A input_embeds = self.embed_input_inference(enc_input, taskname_ids) - position_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.position_embeddings(position_ids) - encoder_input = input_embeds + position_embeddings + # TODO: This check needs to be revisited with PP support. 
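# Position embeddings are added only when the frozen model's embedding module exposes
# absolute position embeddings; otherwise (e.g. a model trained with relative position
# embeddings) the token embeddings are passed through unchanged.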
+ if hasattr(self.frozen_model.enc_dec_model.encoder_embedding, 'position_embeddings'): + position_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.position_embeddings(position_ids) + encoder_input = input_embeds + position_embeddings + else: + encoder_input = input_embeds predicted_token_ids, log_probs = self.frozen_model.decode( tokens_enc=enc_input, diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 4be5d2dd838a..5f0775afb6f6 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -39,6 +39,7 @@ ) from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel +from nemo.collections.nlp.modules.common.megatron.megatron_export import DecEmb, EncEmb, TokensHeadEmb from nemo.collections.nlp.parts.nlp_overrides import GlobalBatchDataFetcher from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging, timers @@ -332,13 +333,6 @@ def eval_epoch_end(self, outputs, mode): if isinstance(outputs[0], dict): outputs = [outputs] - self.log( - 'consumed_samples', - self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), - rank_zero_only=True, - ) - self.log('global_step', self.trainer.global_step, prog_bar=True, rank_zero_only=True) - loss_list = [] bleu_score_list = [] for dataloader_idx, output in enumerate(outputs): @@ -844,3 +838,15 @@ def on_validation_start(self) -> None: def on_test_start(self) -> None: self.trainer.test_loop._data_fetcher = GlobalBatchDataFetcher() + + @property + def encoder(self): + return EncEmb(self.enc_dec_model.encoder_embedding, self.enc_dec_model.enc_dec_model.encoder, self.device) + + @property + def decoder(self): + return DecEmb(self.enc_dec_model.decoder_embedding, self.enc_dec_model.enc_dec_model.decoder, self.device) + + @property + def classifier(self): + return TokensHeadEmb(self.enc_dec_model.decoder_embedding, self.enc_dec_model.tokens_head, self.device) diff --git a/nemo/collections/nlp/modules/common/megatron/fused_bias_gelu.py b/nemo/collections/nlp/modules/common/megatron/fused_bias_gelu.py index b7103dfa6e9d..2ca47fd92fcd 100644 --- a/nemo/collections/nlp/modules/common/megatron/fused_bias_gelu.py +++ b/nemo/collections/nlp/modules/common/megatron/fused_bias_gelu.py @@ -62,6 +62,10 @@ def backward(ctx, grad_output): tmp = bias_gelu_back(grad_output, bias, input) return tmp, tmp + @staticmethod + def symbolic(g, input, bias): + return g.op("com.microsoft::BiasGelu", input, bias) + def fused_bias_gelu(input, bias): args = _cast_if_autocast_enabled(input, bias) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_decoder_module.py b/nemo/collections/nlp/modules/common/megatron/megatron_decoder_module.py new file mode 100644 index 000000000000..25abb4c35b0d --- /dev/null +++ b/nemo/collections/nlp/modules/common/megatron/megatron_decoder_module.py @@ -0,0 +1,46 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC +from typing import Dict, List, Optional + +from nemo.core.classes import NeuralModule +from nemo.core.neural_types import ChannelType, MaskType, NeuralType + +__all__ = ['MegatronDecoderModule'] + + +class MegatronDecoderModule(NeuralModule, ABC): + """ Base class for encoder neural module to be used in NLP models. """ + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "dec_input": NeuralType(('B', 'T', 'D'), ChannelType()), + "dec_attn_mask": NeuralType(('B', 'T'), MaskType()), + "enc_output": NeuralType(('B', 'T', 'D'), ChannelType()), + "enc_attn_mask": NeuralType(('B', 'T'), MaskType()), + } + + @property + def input_names(self) -> List[str]: + return ['dec_input', 'dec_attn_mask', 'enc_output', 'enc_attn_mask'] + + @property + def output_names(self) -> List[str]: + return ['decoder_output'] + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"dec_output": NeuralType(('B', 'T', 'D'), ChannelType())} diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_encoder_module.py b/nemo/collections/nlp/modules/common/megatron/megatron_encoder_module.py new file mode 100644 index 000000000000..1f2314e12383 --- /dev/null +++ b/nemo/collections/nlp/modules/common/megatron/megatron_encoder_module.py @@ -0,0 +1,46 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from abc import ABC +from re import L +from typing import Dict, List, Optional + +from nemo.core.classes import NeuralModule +from nemo.core.neural_types import ChannelType, MaskType, NeuralType + +__all__ = ['MegatronEncoderModule'] + + +class MegatronEncoderModule(NeuralModule, ABC): + """ Base class for encoder neural module to be used in NLP models. 
""" + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "input_ids": NeuralType(('B', 'T'), ChannelType()), + "encoder_mask": NeuralType(('B', 'T'), MaskType()), + } + + @property + def input_names(self) -> List[str]: + return ['input_ids', 'encoder_mask'] + + @property + def output_names(self) -> List[str]: + return ['encoder_output'] + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"encoder_output": NeuralType(('B', 'T', 'D'), ChannelType())} diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_export.py b/nemo/collections/nlp/modules/common/megatron/megatron_export.py new file mode 100644 index 000000000000..6fd9a239380c --- /dev/null +++ b/nemo/collections/nlp/modules/common/megatron/megatron_export.py @@ -0,0 +1,201 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import random +from typing import Dict, List, Optional + +import torch +import torch.nn.functional as F + +from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids +from nemo.core.classes.exportable import Exportable +from nemo.core.neural_types import ChannelType, MaskType, NeuralType + +__all__ = ["TokensHeadEmb", "DecEmb", "EncEmb"] + + +class TokensHeadEmb(torch.nn.Module, Exportable): + """ + Combines decoder_embedding with the tokens_head layer to simulate the classifier in NemoNMT + """ + + def __init__(self, decoder_embedding, tokens_head, device): + super(TokensHeadEmb, self).__init__() + + self.decoder_embedding = decoder_embedding + self.tokens_head_bias = tokens_head.bias + self.device = device + + # properties needed for export + self.training = False + + def train(self, dummy_input): + return None + + def modules(self): + return [] + + def forward(self, dec_output): + if isinstance(dec_output, list): + dec_output = dec_output[0] + + dec_output = torch.permute(dec_output, (1, 0, 2)) + + if self.tokens_head_bias is not None: + return F.linear(dec_output, self.decoder_embedding.word_embeddings.weight, self.tokens_head_bias) + return F.linear(dec_output, self.decoder_embedding.word_embeddings.weight) + + def input_example(self, max_batch=1, max_dim=1024, seq_len=6): + return [ + torch.randint(low=-3, high=3, size=(seq_len, max_batch, max_dim), device=self.device, dtype=torch.float32) + ] + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "hidden_states": NeuralType(('T', 'B', 'D'), ChannelType()), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"log_probs": NeuralType(('B', 'T', 'D'), ChannelType())} + + @property + def input_names(self) -> List[str]: + return ['hidden_states'] + + @property + def output_names(self) -> List[str]: + return ['log_probs'] + + +class DecEmb(torch.nn.Module, Exportable): + """ + Combines decoder_embedding with the decoder component + """ + + def __init__(self, decoder_embedding, decoder, device): + super(DecEmb, self).__init__() + + 
self.decoder_embedding = decoder_embedding + self.decoder = decoder + self.device = device + + # properties needed for export + self.training = False + + def train(self, dummy_input): + return None + + def modules(self): + return (self.decoder_embedding, self.decoder) + + def forward(self, input_ids, decoder_mask, encoder_mask, encoder_embeddings, dec_mems): + position_ids = build_position_ids(input_ids) + dec_input = self.decoder_embedding(input_ids, position_ids, token_type_ids=None) + + # dec_input, dec_attn_mask, enc_output, enc_attn_mask | dec_input, dec_attn_mask, enc_output, enc_attn_mask + _ = dec_mems + + return self.decoder(dec_input, decoder_mask, encoder_embeddings, encoder_mask).float() + + def input_example(self, max_batch=1, max_dim=1024, seq_len=6): + enc_output = torch.randint( + low=-3, high=3, size=(seq_len, max_batch, max_dim), device=self.device, dtype=torch.float32 + ) + enc_attn_mask = torch.tensor([[1 for _ in range(seq_len)]]).to(self.device) + + dec_len = random.randint(10, 128) + dec_input = torch.randint(low=0, high=1000, size=(max_batch, dec_len), device=self.device) + dec_attn_mask = torch.tensor([[1 for _ in range(dec_len)]]).to(self.device) + decoder_mems = torch.zeros([8, 6, 1024], dtype=torch.float32).to(self.device) + + # input_ids, decoder_mask, encoder_mask, encoder_embeddings + return (dec_input, dec_attn_mask, enc_attn_mask, enc_output, decoder_mems) + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "input_ids": NeuralType(('B', 'T', 'D'), ChannelType()), + "decoder_mask": NeuralType(('B', 'T'), MaskType()), + "encoder_mask": NeuralType(('T', 'B', 'D'), ChannelType()), + "encoder_embeddings": NeuralType(('B', 'T'), MaskType()), + "decoder_mems": NeuralType(('T', 'B', 'D'), ChannelType()), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"last_hidden_states": NeuralType(('T', 'B', 'D'), ChannelType())} + + @property + def input_names(self) -> List[str]: + return ['input_ids', 'decoder_mask', 'encoder_mask', 'encoder_embeddings', 'decoder_memes'] + + @property + def output_names(self) -> List[str]: + return ['last_hidden_states'] + + +class EncEmb(torch.nn.Module, Exportable): + """ + Combines encoder_embedding with the encoder component + """ + + def __init__(self, encoder_embedding, encoder, device): + super(EncEmb, self).__init__() + + self.encoder_embedding = encoder_embedding + self.encoder = encoder + self.device = device + + # properties needed for export + self.training = False + + def train(self, dummy_input): + return None + + def modules(self): + return (self.encoder_embedding, self.encoder) + + def forward(self, input_ids, encoder_mask): + position_ids = build_position_ids(input_ids) + enc_input = self.encoder_embedding(input_ids, position_ids, token_type_ids=None) + + # pass input through the encoder + return self.encoder(enc_input=enc_input, enc_attn_mask=encoder_mask,).type(torch.float32) + + def input_example(self): + seq_len = random.randint(0, 128) + return ( + torch.randint(0, 30000, (1, seq_len)).to(self.device), + torch.ones((1, seq_len), dtype=int).to(self.device), + ) + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "input_ids": NeuralType(('B', 'T'), ChannelType()), + "encoder_mask": NeuralType(('B', 'T'), MaskType()), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"last_hidden_states": NeuralType(('T', 'B', 'D'), ChannelType())} + + @property + def input_names(self) -> 
List[str]: + return ['input_ids', 'encoder_mask'] + + @property + def output_names(self) -> List[str]: + return ['last_hidden_states'] diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_tokens_head_module.py b/nemo/collections/nlp/modules/common/megatron/megatron_tokens_head_module.py new file mode 100644 index 000000000000..acc8eb2bba1f --- /dev/null +++ b/nemo/collections/nlp/modules/common/megatron/megatron_tokens_head_module.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC +from typing import Dict, List, Optional + +from nemo.core.classes import NeuralModule +from nemo.core.neural_types import ChannelType, MaskType, NeuralType + +__all__ = ['MegatronTokensHeadModule'] + + +class MegatronTokensHeadModule(NeuralModule, ABC): + """ Base class for encoder neural module to be used in NLP models. """ + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "dec_output": NeuralType(('B', 'T', 'D'), ChannelType()), + "embeddings_weights": NeuralType(('T', 'D'), MaskType()), + } + + @property + def input_names(self) -> List[str]: + return ['dec_output', 'embeddings_weights'] + + @property + def output_names(self) -> List[str]: + return ['logits'] + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"logits": NeuralType(('B', 'T', 'D'), ChannelType())} diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py index 00681c2ba57d..d6724fc371e3 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py @@ -15,6 +15,7 @@ """Transformer based language model.""" from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType +from nemo.collections.nlp.modules.common.megatron.megatron_decoder_module import MegatronDecoderModule from nemo.collections.nlp.modules.common.megatron.module import MegatronModule from nemo.collections.nlp.modules.common.megatron.transformer import ParallelTransformer from nemo.collections.nlp.modules.common.megatron.utils import ( @@ -22,6 +23,7 @@ attn_mask_postprocess, build_attention_mask_3d, ) +from nemo.core.classes.exportable import Exportable try: from apex.transformer.enums import AttnMaskType, ModelType @@ -38,7 +40,7 @@ __all__ = ["MegatronTransformerDecoderModule"] -class MegatronTransformerDecoderModule(MegatronModule): +class MegatronTransformerDecoderModule(MegatronModule, Exportable, MegatronDecoderModule): """Transformer decoder model. 
""" diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py index 3df813b8b0cc..1acb84068e2f 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py @@ -15,6 +15,7 @@ """Transformer based language model.""" from nemo.collections.nlp.modules.common.megatron.layer_type import LayerType +from nemo.collections.nlp.modules.common.megatron.megatron_encoder_module import MegatronEncoderModule from nemo.collections.nlp.modules.common.megatron.module import MegatronModule from nemo.collections.nlp.modules.common.megatron.transformer import ParallelTransformer from nemo.collections.nlp.modules.common.megatron.utils import ( @@ -22,6 +23,7 @@ attn_mask_postprocess, build_attention_mask_3d, ) +from nemo.core.classes.exportable import Exportable try: from apex.transformer.enums import AttnMaskType, ModelType @@ -36,7 +38,7 @@ __all__ = ["MegatronTransformerEncoderModule"] -class MegatronTransformerEncoderModule(MegatronModule): +class MegatronTransformerEncoderModule(MegatronModule, Exportable, MegatronEncoderModule): """Transformer encoder model.""" def __init__( diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index b827a785ff32..df1bb902c707 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -616,8 +616,12 @@ def sample_sequence_batch( # Generate enough tokens for the longest sequence maxlen = tokens_to_generate + context_lengths.max().item() - if maxlen > model.cfg.encoder_seq_length + 1: - maxlen = model.cfg.encoder_seq_length + 1 + if isinstance(model, MegatronGPTPromptLearningModel): + if maxlen > model.frozen_model.cfg.encoder_seq_length + 1: + maxlen = model.frozen_model.cfg.encoder_seq_length + 1 + else: + if maxlen > model.cfg.encoder_seq_length + 1: + maxlen = model.cfg.encoder_seq_length + 1 lengths = torch.ones([batch_size]).long().cuda() * maxlen diff --git a/nemo/collections/tts/torch/data.py b/nemo/collections/tts/torch/data.py index 55b71360d49c..4108b1bb0a89 100644 --- a/nemo/collections/tts/torch/data.py +++ b/nemo/collections/tts/torch/data.py @@ -798,6 +798,7 @@ def __getitem__(self, index): if LMTokens in self.sup_data_types_set: lm_tokens = torch.tensor(self.id2lm_tokens[index]).long() + # Note: Please change the indices in _collate_fn if any items are added/removed. 
return ( audio, audio_length, @@ -819,8 +820,8 @@ def __getitem__(self, index): def _collate_fn(self, batch): batch = list(zip(*batch)) - data_dict = self.general_collate_fn(list(zip(*batch[:13]))) - lm_tokens_list = batch[13] + data_dict = self.general_collate_fn(list(zip(*batch[:15]))) + lm_tokens_list = batch[15] if LMTokens in self.sup_data_types_set: lm_tokens = torch.full( diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 0b4e29cca324..8a4f9493900b 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -948,7 +948,7 @@ def get_test_dataloader_prefix(self, dataloader_idx: int = 0) -> str: """ return self._test_names[dataloader_idx] - def load_part_of_state_dict(self, state_dict, include, exclude, load_from_string): + def load_part_of_state_dict(self, state_dict, include, exclude, load_from_string=None): excluded_param_names = [] # create dict @@ -971,12 +971,18 @@ def load_part_of_state_dict(self, state_dict, include, exclude, load_from_string # Restore checkpoint part into current model self.load_state_dict(dict_to_load, strict=False) - logging.info(f'Model checkpoint partially restored from {load_from_string}') - if len(excluded_param_names) > 0: - logging.info( - f'The following parameters were excluded from loading from {load_from_string} : {excluded_param_names}' - ) - logging.info(f'Make sure that this is what you wanted!') + if load_from_string is not None: + logging.info(f'Model checkpoint partially restored from {load_from_string}') + if len(excluded_param_names) > 0: + logging.info( + f'The following parameters were excluded when loading from {load_from_string} : {excluded_param_names}' + ) + logging.info(f'Make sure that this is what you wanted!') + else: + if len(excluded_param_names) > 0: + logging.info( + f'The following parameters were excluded when loading checkpoint : {excluded_param_names}' + ) @rank_zero_only def maybe_init_from_pretrained_checkpoint(self, cfg: OmegaConf, map_location: str = 'cpu'): @@ -1149,7 +1155,7 @@ def maybe_init_from_pretrained_checkpoint(self, cfg: OmegaConf, map_location: st exclude = model_load_cfg.pop('exclude', []) self.load_part_of_state_dict( - ckpt['state_dict'], include, exclude, f'nemo file with path `{model_path}`' + ckpt['state_dict'], include, exclude, f'nemo file with path `{ckpt_path}`' ) del ckpt diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index 82cc5bcf2fca..28e50864ecc0 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -19,6 +19,7 @@ import onnx import torch import torch.nn as nn +import torch.nn.functional as F from nemo.utils import logging @@ -74,6 +75,36 @@ def forward(self, x): return ret +class LinearWithBiasSkip(nn.Module): + def __init__(self, weight, bias, skip_bias_add): + super(LinearWithBiasSkip, self).__init__() + self.bias = bias + self.weight = weight + self.skip_bias_add = skip_bias_add + + def forward(self, x): + if self.skip_bias_add: + return F.linear(x, self.weight), self.bias + return F.linear(x, self.weight, self.bias), None + + +# ScaledMaskedSoftmax replacement +def mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +def exportable_ScaledMaskedSoftmax(input, mask, scale): + if scale is not None: + input = input * scale + + mask_output = mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + probs = probs.half() + return probs + + def get_export_format(filename: str): 
_, ext = os.path.splitext(filename) try: @@ -179,6 +210,8 @@ def run_ort_and_compare(sess, ort_input, output_example, check_tolerance=0.01): try: from apex.normalization.fused_layer_norm import FusedLayerNorm, MixedFusedLayerNorm from apex.contrib.layer_norm.layer_norm import FastLayerNorm + from apex.transformer.tensor_parallel.layers import RowParallelLinear + from apex.transformer.functional.fused_softmax import FusedScaleMaskSoftmax def replace_FusedLayerNorm(n: nn.Module) -> Optional[nn.BatchNorm2d]: """ @@ -196,16 +229,57 @@ def replace_FusedLayerNorm(n: nn.Module) -> Optional[nn.BatchNorm2d]: return None dev = next(n.parameters()).device - mod = nn.LayerNorm(n.normalized_shape, eps=n.eps, elementwise_affine=n.elementwise_affine,).to(dev) + if isinstance(n, FusedLayerNorm) or isinstance(n, MixedFusedLayerNorm): + mod = nn.LayerNorm(n.normalized_shape, eps=n.eps, elementwise_affine=n.elementwise_affine,).to(dev) + elif isinstance(n, FastLayerNorm): + mod = nn.LayerNorm(n.weight.shape, eps=n.epsilon, elementwise_affine=True, dtype=torch.float16,).to(dev) n_state = n.state_dict() mod.load_state_dict(n_state) return mod + def replace_RowParallelLinear(n: nn.Module) -> Optional[nn.Linear]: + """ + Replaces Apex's FusedLayerNorm with nn.LayerNorm. This is required for ONNX export. + Args: + n: the FusedLayerNorm pytorch module to replace + Returns: + Equivalent LayerNorm module + """ + if not isinstance(n, RowParallelLinear): + raise ValueError("This function can only change the RowParallelLinear module.") + + dev = next(n.parameters()).device + mod = LinearWithBiasSkip(n.weight, n.bias, n.skip_bias_add).to(dev) + + n_state = n.state_dict() + mod.load_state_dict(n_state) + return mod + + def replace_FusedScaleMaskSoftmax(n: nn.Module) -> Optional[nn.Linear]: + """ + Replaces Apex's FusedScaleMaskSoftmax with nn.LayerNorm. This is required for ONNX export. 
+ Args: + n: the FusedScaleMaskSoftmax module to replace + Returns: + Equivalent LayerNorm module + """ + if not isinstance(n, FusedScaleMaskSoftmax): + raise ValueError("This function can only change the FusedScaleMaskSoftmax module.") + + # disable the fusion only + mod = FusedScaleMaskSoftmax( + n.input_in_fp16, n.input_in_bf16, n.attn_mask_type, False, n.mask_func, n.softmax_in_fp32, n.scale + ) + + return mod + default_Apex_replacements = { "FusedLayerNorm": replace_FusedLayerNorm, "MixedFusedLayerNorm": replace_FusedLayerNorm, "FastLayerNorm": replace_FusedLayerNorm, + "RowParallelLinear": replace_RowParallelLinear, + "FusedScaleMaskSoftmax": replace_FusedScaleMaskSoftmax, } except Exception as e: diff --git a/scripts/checkpoint_averaging/checkpoint_averaging.py b/scripts/checkpoint_averaging/checkpoint_averaging.py index bbaa64db17bd..aaf890d77191 100755 --- a/scripts/checkpoint_averaging/checkpoint_averaging.py +++ b/scripts/checkpoint_averaging/checkpoint_averaging.py @@ -35,6 +35,7 @@ import sys import torch +from tqdm.auto import tqdm from nemo.core import ModelPT from nemo.utils import logging, model_utils @@ -44,13 +45,14 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument( 'model_fname_list', - metavar='N', + metavar='NEMO_FILE_OR_FOLDER', type=str, nargs='+', help='Input .nemo files (or folders who contains them) to parse', ) parser.add_argument( '--import_fname_list', + metavar='FILE', type=str, nargs='+', default=[], @@ -59,7 +61,7 @@ def main(): args = parser.parse_args() logging.info( - f"\n\nIMPORTANT: Use --import_fname_list for all files that contain missing classes (AttributeError: Can't get attribute '???' on )\n\n" + f"\n\nIMPORTANT:\nIf you get the following error:\n\t(AttributeError: Can't get attribute '???' 
on )\nuse:\n\t--import_fname_list\nfor all files that contain missing classes.\n\n" ) for fn in args.import_fname_list: @@ -77,7 +79,7 @@ def main(): filter(lambda fn: not fn.endswith("-averaged.nemo"), glob.glob(os.path.join(model_fname, "*.nemo"))) ) if len(nemo_files) != 1: - raise RuntimeError(f"Expected only a single .nemo files but discovered {len(nemo_files)} .nemo files") + raise RuntimeError(f"Expected exactly one .nemo file but discovered {len(nemo_files)} .nemo files") model_fname = nemo_files[0] @@ -107,23 +109,25 @@ def main(): logging.info(f"Averaging {n} checkpoints ...") - for ix, path in enumerate(checkpoint_paths): + for ix, path in enumerate(tqdm(checkpoint_paths, total=n, desc='Averaging checkpoints')): checkpoint = torch.load(path, map_location=device) if 'state_dict' in checkpoint: checkpoint = checkpoint['state_dict'] + else: + raise RuntimeError(f"Checkpoint from {path} does not include a state_dict.") if ix == 0: # Initial state avg_state = checkpoint - logging.info(f"Initialized average state dict with checkpoint : {path}") + logging.info(f"Initialized average state dict with checkpoint:\n\t{path}") else: # Accumulated state for k in avg_state: avg_state[k] = avg_state[k] + checkpoint[k] - logging.info(f"Updated average state dict with state from checkpoint : {path}") + logging.info(f"Updated average state dict with state from checkpoint:\n\t{path}") for k in avg_state: if str(avg_state[k].dtype).startswith("torch.int"): @@ -136,7 +140,7 @@ def main(): # restore merged weights into model nemo_model.load_state_dict(avg_state, strict=True) # Save model - logging.info(f"Saving average mdel to: {avg_model_fname}") + logging.info(f"Saving average model to:\n\t{avg_model_fname}") nemo_model.save_to(avg_model_fname) diff --git a/scripts/dataset_processing/nlp/squad/prompt_learning_squad_preprocessing.py b/scripts/dataset_processing/nlp/squad/prompt_learning_squad_preprocessing.py index 17fb60826f81..785f8d59a80f 100644 --- a/scripts/dataset_processing/nlp/squad/prompt_learning_squad_preprocessing.py +++ b/scripts/dataset_processing/nlp/squad/prompt_learning_squad_preprocessing.py @@ -14,7 +14,6 @@ import argparse import json -import random from tqdm import tqdm @@ -25,23 +24,16 @@ Inputs: data-dir: (str) The directory where the squad dataset was downloaded, files will be saved here - file-name: (str) Name of the input file you want to process + train-file: (str) Name of train set file, either train-v1.1.json or train-v2.0.json + dev-file: (str) Name of dev set file, either dev-v1.1.json or dev-v2.0.json save-name-base: (str) The base name for each of the train, val, and test files. If save-name-base were 'squad' for example, the files would be saved as squad_train.jsonl, squad_val.jsonl, and squad_test.jsonl - make-ground-truth: (bool) If true, test files will include answers, if false, test files will not include answers. include-topic-name: Whether to include the topic name for the paragraph in the data json. See the squad explaination below for more context on what is ment by 'topic name'. random-seed: (int) Random seed for repeatable shuffling of train/val/test splits. - train-percent: (float) Precentage of data that should be used for the train split. The val and test splits will be made - by splitting the remaining data evenly. -Saves train, val, and test files for the SQuAD dataset. - -The SQuAD dataset consists of various topics like Beyoncé, IPod, and Symbiosis. 
Each topic has several paragraphs -associated with it, and each paragraph has several questions and answers related to it. When we separated the -train/validation/test splits, we separated them on the topic level. For example, if the training set contains paragraphs -and questions about the topic Beyoncé, neither the validation nor test sets will contain any questions on this topic. -All questions about a certain topic are isolated to one split of the data. +Saves train, val, and test files for the SQuAD dataset. The val and test splits are the same data, because the given test +split lacks ground truth answers. An example of the processed output written to file: @@ -58,54 +50,36 @@ def main(): parser = argparse.ArgumentParser() - parser.add_argument("--data-dir", type=str, default="data/SQuAD") - parser.add_argument("--file-name", type=str, default="train-v2.0.json") + parser.add_argument("--data-dir", type=str, default=".") + parser.add_argument("--train-file", type=str, default="train-v1.1.json") + parser.add_argument("--dev-file", type=str, default="dev-v1.1.json") parser.add_argument("--save-name-base", type=str, default="squad") - parser.add_argument("--make-ground-truth", action='store_true') parser.add_argument("--include-topic-name", action='store_true') parser.add_argument("--random-seed", type=int, default=1234) - parser.add_argument("--train-percent", type=float, default=0.8) args = parser.parse_args() - data_dict = json.load(open(f"{args.data_dir}/{args.file_name}")) - data = data_dict['data'] - save_name_base = f"{args.data_dir}/{args.save_name_base}" - - process_data( - data, save_name_base, args.train_percent, args.random_seed, args.include_topic_name, args.make_ground_truth - ) - - -def process_data(data, save_name_base, train_percent, random_seed, include_topic, make_ground_truth=False): - data = extract_questions(data, include_topic) - - # Data examples are currently grouped by topic, shuffle topic groups - random.seed(random_seed) - random.shuffle(data) + train_data_dict = json.load(open(f"{args.data_dir}/{args.train_file}")) + dev_data_dict = json.load(open(f"{args.data_dir}/{args.dev_file}")) + train_data = train_data_dict['data'] + val_data = dev_data_dict['data'] - # Decide train/val/test splits on the topic level - data_total = len(data) - train_total = int(data_total * train_percent) - val_total = (data_total - train_total) // 2 + save_name_base = f"{args.data_dir}/{args.save_name_base}" - train_set = data[0:train_total] - val_set = data[train_total : train_total + val_total] - test_set = data[train_total + val_total :] + process_data(train_data, val_data, save_name_base, args.include_topic_name) - # Flatten data for each split now that topics have been confined to one split - train_set = [question for topic in train_set for question in topic] - val_set = [question for topic in val_set for question in topic] - test_set = [question for topic in test_set for question in topic] - # Shuffle train set questions - random.shuffle(train_set) +def process_data(train_data, val_data, save_name_base, include_topic): + train_set = extract_questions(train_data, include_topic, split="train") + val_set = extract_questions(val_data, include_topic, split="val") + test_set = extract_questions(val_data, include_topic, split="test") gen_file(train_set, save_name_base, 'train') gen_file(val_set, save_name_base, 'val') - gen_file(test_set, save_name_base, 'test', make_ground_truth) + gen_file(test_set, save_name_base, 'test', make_ground_truth=True) + gen_file(test_set, save_name_base, 
'test', make_ground_truth=False) -def extract_questions(data, include_topic): +def extract_questions(data, include_topic, split): processed_data = [] # Iterate over topics, want to keep them separate in train/val/test splits @@ -124,23 +98,34 @@ def extract_questions(data, include_topic): question = qa['question'] try: - answer = qa['answers'][0]['text'] + # Dev set has multiple right answers. Want all possible answers in test split ground truth + if split == "test": + answers = [qa['answers'][i]['text'] for i in range(len(qa['answers']))] + + # Choose one answer from dev set if making validation split, train set only has one answer + else: + answers = qa['answers'][0]["text"] + except IndexError: continue - example_json = {"taskname": "squad", "context": context, "question": question, "answer": " " + answer} + example_json = {"taskname": "squad", "context": context, "question": question, "answer": answers} if include_topic: example_json["topic"] = topic processed_topic_data.append(example_json) - processed_data.append(processed_topic_data) + processed_data.extend(processed_topic_data) return processed_data def gen_file(data, save_name_base, split_type, make_ground_truth=False): save_path = f"{save_name_base}_{split_type}.jsonl" + + if make_ground_truth: + save_path = f"{save_name_base}_{split_type}_ground_truth.jsonl" + print(f"Saving {split_type} split to {save_path}") with open(save_path, 'w') as save_file: diff --git a/scripts/dataset_processing/tts/hui_acg/ds_conf/ds_for_fastpitch_align.yaml b/scripts/dataset_processing/tts/hui_acg/ds_conf/ds_for_fastpitch_align.yaml index e8c0b58bb418..ab3717edf926 100644 --- a/scripts/dataset_processing/tts/hui_acg/ds_conf/ds_for_fastpitch_align.yaml +++ b/scripts/dataset_processing/tts/hui_acg/ds_conf/ds_for_fastpitch_align.yaml @@ -38,7 +38,7 @@ dataset: punct_post_process: true text_tokenizer: - _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanCharsTokenizer + _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanPhonemesTokenizer punct: true apostrophe: true pad_with_space: true diff --git a/scripts/metric_calculation/compute_rouge.py b/scripts/metric_calculation/compute_rouge.py new file mode 100644 index 000000000000..1324f3bd04f7 --- /dev/null +++ b/scripts/metric_calculation/compute_rouge.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
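As a rough illustration of what the new compute_rouge.py script below expects (file names and contents here are hypothetical, not taken from this change): the ground-truth file is a .jsonl with one JSON object per line whose --answer-field entry holds one or more reference summaries, and the predictions file has one generated line per example in the same order. The script's calculate_rouge() helper can also be called directly; a minimal sketch, assuming the script's directory is the working directory:

```python
# Minimal sketch (assumption: scripts/metric_calculation is the working directory).
from compute_rouge import calculate_rouge  # defined in the script added below

# One prediction string and a *list* of references per example; the reference
# scoring best on ROUGE-1 F-measure is the one that gets aggregated.
preds = ["the cat sat on the mat"]
refs = [["a cat was sitting on the mat", "the cat is on the mat"]]

print(calculate_rouge(output_lns=preds, reference_lns=refs))
# -> {'rouge1': ..., 'rouge2': ..., 'rougeL': ..., 'rougeLsum': ...} as F-measure percentages
```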
+ +import argparse +import json + +import numpy as np +from rouge_score import rouge_scorer, scoring + +""" +Example command for T5 Preds + + ``` + python compute_rouge.py \ + --ground-truth dialogsum_test_gt.jsonl \ + --preds dialogsum_preds_t5.txt \ + --answer-field "answer" + ``` + +Example command for GPT Preds + + ``` + python compute_rouge.py \ + --ground-truth dialogsum_test_gt.jsonl \ + --preds dialogsum_preds_gpt.txt \ + --answer-field "answer" \ + --split-string "summary:" + ``` +""" + + +ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"] + + +def calculate_rouge(output_lns, reference_lns, use_stemmer=True): + scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer) + aggregator = scoring.BootstrapAggregator() + + for reference_ln, output_ln in zip(reference_lns, output_lns): + ln_scores = [] + for possible_ln in reference_ln: + scores = scorer.score(possible_ln, output_ln) + ln_scores.append(scores) + + best_index = np.argmax([score_dict["rouge1"][-1] for score_dict in ln_scores]) + aggregator.add_scores(ln_scores[best_index]) + + result = aggregator.aggregate() + return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()} + + +def load_ref(filename, answer_field): + lines = open(filename).readlines() + all_answers = [] + for line in lines: + line = line.strip() + line = json.loads(line) + answers = line[answer_field] + + if isinstance(answers, str): + answers = [answers] + + all_answers.append(answers) + + return all_answers + + +def load_preds(filename, split_string): + with open(filename) as f: + lines = [line.split(split_string)[-1].strip() for line in f.readlines()] + + return lines + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Compute ROUGE scores for model predictions against reference answers.') + parser.add_argument('--ground-truth', type=str, help="ground truth .jsonl") + parser.add_argument('--preds', type=str, help="Text file with test set prompts + model predictions.") + parser.add_argument( + '--answer-field', + type=str, + help="The key in the ground truth json object containing the correct answer.", + default="answer", + ) + parser.add_argument( + '--split-string', + type=str, + help="The text at the end of the prompt, written before the predicted answer. This will be used to find the model's predictions in pred files when the pred file contains both the prompt and prediction.", + default=None, + ) # If the pred file only has predictions, just pass None + + args = parser.parse_args() + + pred_file = args.preds + ref_filename = args.ground_truth + answer_field = args.answer_field # The field in the ground truth json that contains the answer + split_string = args.split_string # The final few tokens of the prompt right before the generated answer + + output_lns = load_preds(pred_file, split_string) + reference_lns = load_ref(ref_filename, answer_field) + assert len(output_lns) == len(reference_lns) + print("Calculating Rouge") + + scores = calculate_rouge(output_lns=output_lns, reference_lns=reference_lns) + print(scores) diff --git a/scripts/metric_calculation/squad_metric_calc.py b/scripts/metric_calculation/squad_metric_calc.py new file mode 100755 index 000000000000..2c9e68669e6a --- /dev/null +++ b/scripts/metric_calculation/squad_metric_calc.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import re +import string +from collections import Counter + + +""" +This script can be used to calculate exact match and F1 scores for many different tasks, not just squad. + +Example command for T5 Preds + + ``` + python squad_metric_calc.py \ + --ground-truth squad_test_gt.jsonl \ + --preds squad_preds_t5.txt + ``` + +Example command for GPT Preds + + ``` + python squad_metric_calc.py \ + --ground-truth squad_test_gt.jsonl \ + --preds squad_preds_gpt.txt \ + --split-string "answer:" + ``` + + In this case, the prediction file will be split on "answer: " when looking for the LM's predicted answer. + +""" + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def main(): + parser = argparse.ArgumentParser(description='Calculate exact match and F1 scores for model predictions against ground truth answers.') + parser.add_argument( + '--ground-truth', + type=str, + help="ground truth .jsonl file made from /NeMo/scripts/dataset_processing/nlp/squad/prompt_learning_squad_preprocessing.py", + ) + parser.add_argument( + '--preds', + type=str, + help="Text file with test set prompts + model predictions. Prediction file can be made by running NeMo/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py", + ) + parser.add_argument( + '--split-string', + type=str, + help="The text at the end of the prompt, written before the predicted answer.
This will be used to find the model's predictions in pred files when the pred file contains both the prompt and prediction.", + default=None, + ) # If the pred file only has predictions, just pass None + + args = parser.parse_args() + + ground_truth_file = args.ground_truth + pred_file = args.preds + + preds = open(pred_file, encoding="utf-8").readlines() + ground_truth = open(ground_truth_file).readlines() + f1 = exact_match = total = 0 + + for i in range(len(preds)): + truth = json.loads(ground_truth[i]) + pred_answer = preds[i] + + # Need to separate out predictions from prompt, splitting on the provided "split string" + if args.split_string is not None: + pred_answer = pred_answer.split(args.split_string)[-1].strip() + + true_answers = truth["answer"] + if not isinstance(true_answers, list): + true_answers = [true_answers] + + exact_match += metric_max_over_ground_truths(exact_match_score, pred_answer, true_answers) + f1 += metric_max_over_ground_truths(f1_score, pred_answer, true_answers) + total += 1 + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + + print({'exact_match': exact_match, 'f1': f1, 'total': total}) + + +if __name__ == "__main__": + main() diff --git a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb index 10bd5afd2b50..e01c2f6a681a 100644 --- a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb @@ -1186,4 +1186,4 @@ } } ] -} \ No newline at end of file +} diff --git a/tutorials/nlp/Question_Answering.ipynb b/tutorials/nlp/Question_Answering.ipynb index 8ab8edc64747..d9f677348306 100644 --- a/tutorials/nlp/Question_Answering.ipynb +++ b/tutorials/nlp/Question_Answering.ipynb @@ -74,7 +74,7 @@ }, "outputs": [], "source": [ - "BRANCH = \"main\"" + "BRANCH = 'main'" ] }, { @@ -155,7 +155,7 @@ "- **trainer**: Any argument to be passed to PyTorch Lightning\n", "- **exp_manager**: All arguments used for setting up the experiment manager - target directory, name, logger information\n", "\n", - "We will download the default config file provided at `NeMo/examples/nlp/question_answering/conf/qa_conf.yaml` and edit necassary values for training different models" + "We will download the default config file provided at `NeMo/examples/nlp/question_answering/conf/qa_conf.yaml` and edit necessary values for training different models" ] }, { @@ -1122,11 +1122,8 @@ "provenance": [] }, "gpuClass": "standard", - "interpreter": { - "hash": "bae55a3b24aa341f5a622f5db75f4176411c7613841f1c6ed3fbb75fb0be63d2" - }, "kernelspec": { - "display_name": "Python 3.8.0 ('nemo': conda)", + "display_name": "Python 3.8.0 ('test_ptl_1.7')", "language": "python", "name": "python3" }, @@ -1142,7 +1139,12 @@ "pygments_lexer": "ipython3", "version": "3.8.0" }, - "orig_nbformat": 4 + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "e987a19b1bc60996a600adb5d563aa4a4c022e7b31abb2e65c324714934e8ea9" + } + } }, "nbformat": 4, "nbformat_minor": 0 diff --git a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb index 6b1b33439b39..42fdec747bfa 100644 --- a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb @@ -1292,4 +1292,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb index 627ded57d315..aecd7aad22ff 100644 ---
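For a concrete sense of the metrics squad_metric_calc.py reports, here is a small hand-checked example using the helper functions defined above; the import assumes the script's directory is the working directory:

```python
# Hand-checked illustration of the normalization + token-overlap scoring above.
from squad_metric_calc import exact_match_score, f1_score, metric_max_over_ground_truths

pred = "the Eiffel Tower"
truths = ["Eiffel Tower", "It is the Eiffel Tower"]

# normalize_answer() lowercases and strips punctuation/articles, so the prediction
# normalizes to "eiffel tower" and matches the first reference exactly.
print(metric_max_over_ground_truths(exact_match_score, pred, truths))  # True (counted as 1)
print(metric_max_over_ground_truths(f1_score, pred, truths))           # 1.0 (best F1 over both references)
```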
a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb @@ -38,8 +38,6 @@ { "cell_type": "code", "execution_count": null, - "id": "261df0a0", - "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", @@ -54,14 +52,14 @@ "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", - "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + "# !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/install_pynini.sh\n", + "# !bash install_pynini.sh" ] }, { "cell_type": "code", "execution_count": null, - "id": "9e0c0d38", - "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -76,16 +74,12 @@ }, { "cell_type": "markdown", - "id": "efa2c292", - "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "markdown", - "id": "95884fcd", - "metadata": {}, "source": [ "### FastPitch\n", "\n", @@ -103,8 +97,6 @@ { "cell_type": "code", "execution_count": null, - "id": "9be422ee", - "metadata": {}, "outputs": [], "source": [ "from nemo.collections.tts.models.base import SpectrogramGenerator\n", @@ -118,8 +110,6 @@ { "cell_type": "code", "execution_count": null, - "id": "cdf4aee7", - "metadata": {}, "outputs": [], "source": [ "# Let's see what pretrained models are available for FastPitch and Mixer-TTS\n", @@ -133,8 +123,6 @@ { "cell_type": "code", "execution_count": null, - "id": "298704c4", - "metadata": {}, "outputs": [], "source": [ "# We can load the pre-trained FastModel as follows\n", @@ -146,10 +134,6 @@ { "cell_type": "code", "execution_count": null, - "id": "c18181ff", - "metadata": { - "scrolled": true - }, "outputs": [], "source": [ "# In the same way, we can load the pre-trained Mixer-TTS model as follows\n", @@ -161,8 +145,6 @@ { "cell_type": "code", "execution_count": null, - "id": "fb41b646", - "metadata": {}, "outputs": [], "source": [ "assert isinstance(spec_gen, SpectrogramGenerator)\n", @@ -183,8 +165,6 @@ }, { "cell_type": "markdown", - "id": "54ec3c5e", - "metadata": {}, "source": [ "# Preprocessing" ] @@ -192,23 +172,16 @@ { "cell_type": "code", "execution_count": null, - "id": "7ef87e31", - "metadata": {}, "outputs": [], "source": [ + "from nemo_text_processing.g2p.modules import EnglishG2p\n", "from nemo.collections.tts.torch.data import TTSDataset\n", "from nemo_text_processing.text_normalization.normalize import Normalizer\n", - "from nemo_text_processing.g2p.modules import EnglishG2p\n", - "from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import (\n", - " EnglishPhonemesTokenizer,\n", - " EnglishCharsTokenizer,\n", - ")" + "from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import EnglishPhonemesTokenizer, EnglishCharsTokenizer" ] }, { "cell_type": "markdown", - "id": "9fd5dec0", - "metadata": {}, "source": [ "We will show example of preprocessing and training using small part of AN4 dataset. It consists of recordings of people spelling out addresses, names, telephone numbers, etc., one letter or number at a time, as well as their corresponding transcripts. 
Let's download data, prepared manifests and supplementary files.\n", "\n", @@ -220,8 +193,6 @@ { "cell_type": "code", "execution_count": null, - "id": "6b621b1c", - "metadata": {}, "outputs": [], "source": [ "# download data and manifests\n", @@ -237,8 +208,6 @@ }, { "cell_type": "markdown", - "id": "45f19be7", - "metadata": {}, "source": [ "### FastPitch\n", "\n", @@ -250,8 +219,6 @@ { "cell_type": "code", "execution_count": null, - "id": "4e76d950", - "metadata": {}, "outputs": [], "source": [ "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/fastpitch.py\n", @@ -263,8 +230,6 @@ }, { "cell_type": "markdown", - "id": "82a2eacb", - "metadata": {}, "source": [ "TTS text preprocessing pipeline consists of two stages: text normalization and text tokenization. Both of them can be handled by `nemo.collections.tts.torch.data.TTSDataset` for training. \n", "\n", @@ -274,8 +239,6 @@ { "cell_type": "code", "execution_count": null, - "id": "a46da66d", - "metadata": {}, "outputs": [], "source": [ "# Text normalizer\n", @@ -296,8 +259,6 @@ }, { "cell_type": "markdown", - "id": "884d8d82", - "metadata": {}, "source": [ "To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean and std) and pre-calculate alignment prior matrices for alignment framework. To do this, all we need to do is iterate over our data one time.\n", "\n", @@ -316,8 +277,6 @@ { "cell_type": "code", "execution_count": null, - "id": "7108f748", - "metadata": {}, "outputs": [], "source": [ "def pre_calculate_supplementary_data(sup_data_path, sup_data_types, text_tokenizer, text_normalizer, text_normalizer_call_kwargs):\n", @@ -364,8 +323,6 @@ { "cell_type": "code", "execution_count": null, - "id": "f1affe50", - "metadata": {}, "outputs": [], "source": [ "fastpitch_sup_data_path = \"fastpitch_sup_data_folder\"\n", @@ -378,8 +335,6 @@ }, { "cell_type": "markdown", - "id": "d868bb48", - "metadata": {}, "source": [ "### Mixer-TTS\n", "\n", @@ -391,8 +346,6 @@ { "cell_type": "code", "execution_count": null, - "id": "1c7c0cfc", - "metadata": {}, "outputs": [], "source": [ "!wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/mixer_tts.py\n", @@ -404,8 +357,6 @@ }, { "cell_type": "markdown", - "id": "e2f10886", - "metadata": {}, "source": [ "In the FastPitch pipeline we used a char-based tokenizer, but in the Mixer-TTS training pipeline we would like to demonstrate a phoneme-based tokenizer `nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer`. Unlike char-based tokenizer, `EnglishPhonemesTokenizer` needs a phoneme dictionary and a heteronym dictionary. We will be using the same `nemo_text_processing.text_normalization.normalize.Normalizer` for normalizing the text as used in the FastPitch example." ] @@ -413,8 +364,6 @@ { "cell_type": "code", "execution_count": null, - "id": "c6ba0f9a", - "metadata": {}, "outputs": [], "source": [ "# Text normalizer\n", @@ -448,8 +397,6 @@ }, { "cell_type": "markdown", - "id": "9fc55415", - "metadata": {}, "source": [ "Just like in FastPitch we will need to extract pitch for every audio, estimate pitch statistics (mean and std) and pre-calculate alignment prior matrices for alignment framework." 
] @@ -457,8 +404,6 @@ { "cell_type": "code", "execution_count": null, - "id": "aabc1f0f", - "metadata": {}, "outputs": [], "source": [ "mixer_tts_sup_data_path = \"mixer_tts_sup_data_folder\"\n", @@ -471,16 +416,12 @@ }, { "cell_type": "markdown", - "id": "c0711ec6", - "metadata": {}, "source": [ "# Training" ] }, { "cell_type": "markdown", - "id": "0a95848c", - "metadata": {}, "source": [ "### FastPitch\n", "\n", @@ -492,8 +433,6 @@ { "cell_type": "code", "execution_count": null, - "id": "cc1a9107", - "metadata": {}, "outputs": [], "source": [ "!(python fastpitch.py --config-name=fastpitch_align_v1.05.yaml \\\n", @@ -521,8 +460,6 @@ }, { "cell_type": "markdown", - "id": "d6bce3ce", - "metadata": {}, "source": [ "Let's look at some of the options in the training command:\n", "\n", @@ -590,7 +527,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -604,7 +541,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.7" } }, "nbformat": 4,
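The Mixer-TTS and FastPitch tutorial cells above describe a two-stage text front end (normalization followed by tokenization) that TTSDataset drives during preprocessing. A minimal standalone sketch of that idea, using classes imported in the notebook; the constructor and call arguments shown are illustrative assumptions rather than the tutorial's exact settings:

```python
# Sketch of the two-stage TTS text front end (arguments are illustrative, not the tutorial's exact config).
from nemo_text_processing.text_normalization.normalize import Normalizer
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import EnglishCharsTokenizer

normalizer = Normalizer(input_case="cased", lang="en")  # stage 1: expand numbers, abbreviations, etc.
tokenizer = EnglishCharsTokenizer()                     # stage 2: map normalized text to token ids

text = "Mr. Smith paid $111 on Dec. 17th."
normalized = normalizer.normalize(text, punct_post_process=True)
tokens = tokenizer.encode(normalized)

print(normalized)  # spoken-form text, e.g. numbers and abbreviations written out
print(tokens)      # list of integer ids consumed by FastPitch / Mixer-TTS
```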