From da140eb408b69501c2b6b57a59d68ba6af82ab4f Mon Sep 17 00:00:00 2001 From: Taejin Park Date: Sun, 7 Jul 2024 19:38:57 -0700 Subject: [PATCH 01/13] Fix the arguments of forward_for_export function in msdd_models (#9624) * Fix the arguments of forward_for_export function Signed-off-by: Taejin Park * Apply isort and black reformatting Signed-off-by: tango4j --------- Signed-off-by: Taejin Park Signed-off-by: tango4j Co-authored-by: tango4j --- nemo/collections/asr/models/msdd_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/models/msdd_models.py b/nemo/collections/asr/models/msdd_models.py index 60aae8d1a4b1..c88275dcacd3 100644 --- a/nemo/collections/asr/models/msdd_models.py +++ b/nemo/collections/asr/models/msdd_models.py @@ -565,7 +565,7 @@ def forward( self.msdd._speaker_model.train() if len(detach_ids[0]) > 1: logits, embs_a = self.msdd._speaker_model.forward_for_export( - processed_signal=audio_signal[detach_ids[0]], processed_signal_len=audio_signal_len[detach_ids[0]] + audio_signal=audio_signal[detach_ids[0]], length=audio_signal_len[detach_ids[0]] ) embs[detach_ids[0], :] = embs_a From ab1d72235fb5c3add05169719e0572c8ac186aaa Mon Sep 17 00:00:00 2001 From: mikolajblaz Date: Mon, 8 Jul 2024 12:16:26 +0200 Subject: [PATCH 02/13] Change default parallel_save to False (#9632) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mikołaj Błaż --- nemo/utils/callbacks/dist_ckpt_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/utils/callbacks/dist_ckpt_io.py b/nemo/utils/callbacks/dist_ckpt_io.py index 65eea827e851..144c07addaa8 100644 --- a/nemo/utils/callbacks/dist_ckpt_io.py +++ b/nemo/utils/callbacks/dist_ckpt_io.py @@ -205,7 +205,7 @@ def __init__( async_save: bool = False, torch_dist_multiproc: Optional[int] = None, assume_constant_structure: bool = False, - parallel_save: bool = True, + parallel_save: bool = False, parallel_load: bool = False, ): super().__init__() @@ -238,7 +238,7 @@ def from_config(cls, model_cfg: dict, async_save: bool = False): load_directly_on_device=model_cfg.get('dist_ckpt_load_on_device', True), async_save=async_save, torch_dist_multiproc=model_cfg.get('dist_ckpt_torch_dist_multiproc', None), - parallel_save=model_cfg.get('dist_ckpt_parallel_save', True), + parallel_save=model_cfg.get('dist_ckpt_parallel_save', False), parallel_load=model_cfg.get('dist_ckpt_parallel_load', False), ) From c0cd8d4567a6360b28f51751eabedd4bd1a76177 Mon Sep 17 00:00:00 2001 From: mikolajblaz Date: Mon, 8 Jul 2024 12:16:54 +0200 Subject: [PATCH 03/13] Unwrap ckpt_io for model opt (async save) (#9622) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mikołaj Błaż --- nemo/collections/nlp/parts/nlp_overrides.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 0b89bfda8dbd..e251690831cb 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -395,7 +395,7 @@ def save_checkpoint( save_sharded_modelopt_state( self.lightning_module.get_model_module_list(), ckpt_to_dir(filepath), - self.checkpoint_io.save_sharded_strategy, + self.unwrapped_checkpoint_io.save_sharded_strategy, prefix="model.", ) else: @@ -595,10 +595,7 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None: @property def 
use_distributed_checkpointing(self): - checkpoint_io = self.checkpoint_io - while isinstance(checkpoint_io, _WrappingCheckpointIO): - checkpoint_io = checkpoint_io.checkpoint_io - has_dist_ckpt_io = HAVE_MEGATRON_CORE and isinstance(checkpoint_io, DistributedCheckpointIO) + has_dist_ckpt_io = HAVE_MEGATRON_CORE and isinstance(self.unwrapped_checkpoint_io, DistributedCheckpointIO) has_sharded_state_dict = ( hasattr(self.lightning_module, 'sharded_state_dict') and self.lightning_module.sharded_state_dict() is not None @@ -638,6 +635,14 @@ def restore_checkpoint_after_setup(self) -> bool: """ return True + @property + def unwrapped_checkpoint_io(self) -> CheckpointIO: + """Returns CheckpointIO unwrapped from any _WrappedCheckpointIO wrappers.""" + checkpoint_io = self.checkpoint_io + while isinstance(checkpoint_io, _WrappingCheckpointIO): + checkpoint_io = checkpoint_io.checkpoint_io + return checkpoint_io + class NLPDDPStrategyNotebook(NLPDDPStrategy): """Version of NLPDDPStrategy to be used in a Jupyter Notebook @@ -1011,6 +1016,8 @@ def dummy(): checkpoint_io.save_checkpoint(sharded_state_dict, dist_ckpt_dir) if HAVE_MODELOPT and hasattr(model, "get_model_module_list"): + while isinstance(checkpoint_io, _WrappingCheckpointIO): + checkpoint_io = checkpoint_io.checkpoint_io save_sharded_modelopt_state( model.get_model_module_list(), dist_ckpt_dir, From 575283a9d60037bab88baf675c27f21361bec933 Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:06:32 -0400 Subject: [PATCH 04/13] MCore T5 support for NeMo - Training (#9432) * huvu/mcore_t5 first commit from local * removing DEBUGGING prints * cleaning megatron_lm_encoder_decoder_model.py code * cleaning code * adding Github action test * only run mcore T5 test * only run mcore T5 test * only run mcore T5 test * only run mcore T5 test * reset .github/workflows/cicd-main.yml * reset .github/workflows/cicd-main.yml * adding condition self.mcore_t5 when running self.build_transformer_config() * refractor megatron_lm_encoder_decoder_model.py to not use self.model * only run T5-related tests * remove all self.model * reset cicd file * reset cicd file * updating codes remove duplicate if/else; adding mcore/transformer_engine to config file * adjust +model.mcore_t5=True * Apply isort and black reformatting Signed-off-by: huvunvidia --------- Signed-off-by: huvunvidia Co-authored-by: Huy Vu2 Co-authored-by: huvunvidia --- .github/workflows/cicd-main.yml | 75 ++++ .../conf/megatron_t5_config.yaml | 4 + .../language_modeling/megatron_base_model.py | 34 +- .../megatron_lm_encoder_decoder_model.py | 369 +++++++++++++++--- 4 files changed, 425 insertions(+), 57 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 44ecb03acc7b..d225ee3ab429 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -3488,6 +3488,80 @@ jobs: rm -rf examples/nlp/language_modeling/t5_pretrain_results rm -rf examples/nlp/language_modeling/t5_index_mappings + L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=null \ + trainer.max_steps=10 \ + trainer.val_check_interval=10 \ + trainer.accumulate_grad_batches=1 \ + trainer.precision=bf16 
\ + model.megatron_amp_O2=True \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + model.mcore_t5=True \ + model.transformer_engine=True \ + model.tensor_model_parallel_size=2 \ + model.micro_batch_size=4 \ + model.global_batch_size=4 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.encoder.transformer_block_type='pre_ln' \ + model.decoder.transformer_block_type='pre_ln' \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ + trainer.devices=2 \ + trainer.log_every_n_steps=1 \ + trainer.max_epochs=null \ + trainer.max_steps=10 \ + trainer.val_check_interval=10 \ + trainer.accumulate_grad_batches=1 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_t5=True \ + model.transformer_engine=True \ + model.tensor_model_parallel_size=2 \ + model.micro_batch_size=4 \ + model.global_batch_size=4 \ + model.seq_length=128 \ + model.encoder.num_layers=4 \ + model.encoder.hidden_size=64 \ + model.encoder.num_attention_heads=8 \ + model.decoder.num_layers=4 \ + model.decoder.hidden_size=64 \ + model.decoder.num_attention_heads=8 \ + model.encoder.transformer_block_type='pre_ln' \ + model.decoder.transformer_block_type='pre_ln' \ + model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ + model.data.data_impl=text_mmap \ + +model.data.data_impl_kwargs.newline_int=10 \ + +model.data.data_impl_kwargs.header_lines=0 \ + +model.data.data_impl_kwargs.workers=null \ + +model.data.data_impl_kwargs.sort_dataset_paths=False + AFTER_SCRIPT: | + rm -rf examples/nlp/language_modeling/t5_pretrain_results + rm -rf examples/nlp/language_modeling/t5_index_mappings + L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4433,6 +4507,7 @@ jobs: - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2 - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2 - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2 + - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2 - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml index e51cfff420a3..439a0f1533bd 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml @@ -43,6 
+43,10 @@ exp_manager: model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} model: + # use T5 model from megatron.core + mcore_t5: False + transformer_engine: False + # model parallelism micro_batch_size: 4 global_batch_size: 8 # will use more micro batches to reach global batch size diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index f7b53a95c19a..7308d3db3f91 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -290,7 +290,11 @@ def _wrap_model_for_O2(self): Returns: The wrapped model. Returns a list of wrapped modules or a single wrapped module. """ - is_mcore_model = self.__dict__.get('mcore_gpt', False) or self.__dict__.get('mcore_bert', False) + is_mcore_model = ( + self.__dict__.get('mcore_gpt', False) + or self.__dict__.get('mcore_bert', False) + or self.__dict__.get('mcore_t5', False) + ) Float16Wrapper = MCoreFloat16Module if is_mcore_model else Float16Module @@ -305,15 +309,21 @@ def _wrap_model_for_O2(self): args = mcore_args if is_mcore_model else nemo_args # Model wrapper to convert both model and inputs to half precision - if isinstance(self.model, list): + if isinstance((self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model), list): converted_model = [] - for module in self.model: + for module in self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model: args['module'] = module converted_model.append(Float16Wrapper(**args)) - self.model = converted_model + if hasattr(self, "enc_dec_model"): + self.enc_dec_model = converted_model + else: + self.model = converted_model else: - args['module'] = self.model - self.model = Float16Wrapper(**args) + args['module'] = self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model + if hasattr(self, "enc_dec_model"): + self.enc_dec_model = Float16Wrapper(**args) + else: + self.model = Float16Wrapper(**args) args.pop('module') def get_model_module_list(self): @@ -323,10 +333,10 @@ def extract_module(model): else: return model - if isinstance(self.model, list): - return list(map(extract_module, self.model)) + if isinstance((self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model), list): + return list(map(extract_module, (self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model))) else: - return [extract_module(self.model)] + return [extract_module(self.enc_dec_model if hasattr(self, "enc_dec_model") else self.model)] def _reconfigure_limit_batches(self, limit_batches, dataloader, mode): """ @@ -1022,7 +1032,11 @@ def is_data_parallel_rank_zero(self): def _get_total_params_across_model_parallel_groups_gpt_bert(self): """Returns the total number of parameters across all model parallel groups.""" - is_mcore_model = self.__dict__.get('mcore_gpt', False) or self.__dict__.get('mcore_bert', False) + is_mcore_model = ( + self.__dict__.get('mcore_gpt', False) + or self.__dict__.get('mcore_bert', False) + or self.__dict__.get('mcore_t5', False) + ) # log number of parameters model = self.get_model_module_list() if isinstance(model, list): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 8fe215bcc9af..6609b1aff303 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -32,11 +32,13 @@ from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import ( + AttnMaskType, MegatronTokenLevelEncoderDecoderModule, ) from nemo.collections.nlp.modules.common.megatron.utils import ( ApexGuardDefaults, average_losses_across_data_parallel_group, + build_attention_mask_3d, get_params_for_weight_decay_optimization, ) from nemo.collections.nlp.modules.common.text_generation_utils import ( @@ -62,7 +64,16 @@ try: from megatron.core import parallel_state, tensor_parallel from megatron.core.enums import ModelType + from megatron.core.models.T5 import T5Model as MCoreT5Model + from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_local_block_spec, + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_local_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, + ) from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.module import Float16Module as MCoreFloat16Module + from megatron.core.transformer.transformer_config import TransformerConfig HAVE_MEGATRON_CORE = True @@ -96,6 +107,13 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): # Make sure trainer.accumulate_grad_batches is 1. self._validate_trainer() + self.mcore_t5 = cfg.get('mcore_t5', False) + + if self.mcore_t5: + self.transformer_config = self.build_transformer_config() + + self.megatron_amp_O2 = cfg.get('megatron_amp_O2', False) + # TODO: Currently does not support interleaved pipeline parallelism. # This means we can only use pipeline parallelism without the interleaved schedule. if isinstance(self.trainer.accelerator, CPUAccelerator): @@ -116,18 +134,18 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): # We don't need to call it explicitly? 
Since it is a pytorch lightning hook function # self.setup_optimizer_param_groups() - self.megatron_amp_O2 = cfg.get('megatron_amp_O2', False) - if self.megatron_amp_O2: if not self.with_distributed_adam: # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type - self.enc_dec_model.cuda(torch.cuda.current_device()) + if isinstance(self.enc_dec_model, list): + for module in self.enc_dec_model: + module.cuda(torch.cuda.current_device()) + else: + self.enc_dec_model.cuda(torch.cuda.current_device()) # Model wrapper to convert both model and inputs to half precision - self.enc_dec_model = Float16Module( - config=self.model_parallel_config, module=self.enc_dec_model, precision=self.cfg.precision - ) + self._wrap_model_for_O2() self.enable_autocast = ( True if (not self.megatron_amp_O2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False @@ -250,38 +268,74 @@ def model_provider_func(self, pre_process, post_process, add_encoder, add_decode if parallel_state.get_pipeline_model_parallel_world_size() > 1 and self.cfg.encoder.arch == 'perceiver': raise ValueError(f"Perceivers with pipeline parallel > 1 is not supported yet.") - if not hasattr(self.cfg, 'embedding_init_method_std'): - embedding_init_method_std = self.cfg.encoder.init_method_std - else: - embedding_init_method_std = self.cfg.embedding_init_method_std + if hasattr(self, 'mcore_t5') and self.mcore_t5: + assert HAVE_MEGATRON_CORE, "Cannot use MCore T5 since Megatron Core is not found" + assert self.cfg.get( + 'share_token_embeddings', True + ), "share_token_embeddings must be True if using MCore T5 model" + if self.cfg.get('transformer_engine', False): + enc_dec_spec_fns = ( + get_t5_encoder_with_transformer_engine_block_spec, + get_t5_decoder_with_transformer_engine_block_spec, + ) + else: + enc_dec_spec_fns = ( + get_t5_encoder_with_local_block_spec, + get_t5_decoder_with_local_block_spec, + ) + + en_block_spec = enc_dec_spec_fns[0](self.cfg.encoder.num_layers) + de_block_spec = enc_dec_spec_fns[1](self.cfg.decoder.num_layers) + model = MCoreT5Model( + config=self.transformer_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.padded_vocab_size, + max_sequence_length=self.cfg.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), + parallel_output=True, + share_embeddings_and_output_weights=self.cfg.get('share_decoder_tokens_head_embeddings', True), + position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), + rotary_percent=self.cfg.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), + ) - if not hasattr(self.cfg, 'embedding_dropout'): - embedding_dropout = self.cfg.encoder.hidden_dropout else: - embedding_dropout = self.cfg.embedding_dropout - - model = MegatronTokenLevelEncoderDecoderModule( - config=self.model_parallel_config, - encoder_cfg=self.cfg.encoder, - decoder_cfg=self.cfg.decoder, - vocab_size=self.padded_vocab_size, - max_position_embeddings=self.cfg.max_position_embeddings, - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - fp16_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), - precision=self.cfg.get('precision', 16), - embedding_init_method_std=embedding_init_method_std, - embedding_dropout=embedding_dropout, - 
label_smoothing=self.cfg.get('label_smoothing', 0.0), - add_encoder=add_encoder, - add_decoder=add_decoder, - share_token_embeddings=self.cfg.get('share_token_embeddings', True), - share_decoder_tokens_head_embeddings=self.cfg.get('share_decoder_tokens_head_embeddings', True), - tokens_head_bias=self.cfg.get('tokens_head_bias', True), - hiddens_cfg=self.cfg.get('hiddens', None), - ) + if not hasattr(self.cfg, 'embedding_init_method_std'): + embedding_init_method_std = self.cfg.encoder.init_method_std + else: + embedding_init_method_std = self.cfg.embedding_init_method_std + + if not hasattr(self.cfg, 'embedding_dropout'): + embedding_dropout = self.cfg.encoder.hidden_dropout + else: + embedding_dropout = self.cfg.embedding_dropout + + model = MegatronTokenLevelEncoderDecoderModule( + config=self.model_parallel_config, + encoder_cfg=self.cfg.encoder, + decoder_cfg=self.cfg.decoder, + vocab_size=self.padded_vocab_size, + max_position_embeddings=self.cfg.max_position_embeddings, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + fp16_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), + precision=self.cfg.get('precision', 16), + embedding_init_method_std=embedding_init_method_std, + embedding_dropout=embedding_dropout, + label_smoothing=self.cfg.get('label_smoothing', 0.0), + add_encoder=add_encoder, + add_decoder=add_decoder, + share_token_embeddings=self.cfg.get('share_token_embeddings', True), + share_decoder_tokens_head_embeddings=self.cfg.get('share_decoder_tokens_head_embeddings', True), + tokens_head_bias=self.cfg.get('tokens_head_bias', True), + hiddens_cfg=self.cfg.get('hiddens', None), + ) + return model def forward( @@ -372,6 +426,25 @@ def training_step(self, dataloader_iter): # we zero grads here because we also call backward in the megatron fwd/bwd functions self._optimizer.zero_grad() + if self.with_distributed_adam: + # hack to enable overlapping param sync and forward compute + # note: the distributed optimizer monkey-patches each + # parameter's __getattribute__ function so that it can + # launch parameter all-gathers the first time the + # parameter is accessed after the optimizer step. However, + # PyTorch directly passes embedding parameters into a C++, + # bypassing this process. A quick-and-dirty hack is to + # manually interact with the parameter. 
+ modules = self.enc_dec_model if isinstance(self.enc_dec_model, list) else [self.enc_dec_model] + for module in modules: + if isinstance(module, (Float16Module, MCoreFloat16Module)): + module = module.module + if not self.mcore_t5: + module = module.language_model + if hasattr(module, 'embedding'): + for param in module.embedding.parameters(): + param.data_ptr() + loss_dict = self.fwd_bwd_step(dataloader_iter, False) if self.with_distributed_adam: @@ -380,8 +453,12 @@ def training_step(self, dataloader_iter): # from multiple simultaneous NCCL calls self._optimizer._finish_bucket_grad_sync() elif self.megatron_amp_O2: - # when using pipeline parallelism grads must be reduced after the pipeline (not asynchronously) - if self.cfg.get('pipeline_model_parallel_size', 1) > 1: + # when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously) + if ( + self.cfg.get('pipeline_model_parallel_size', 1) > 1 + or self.cfg.get('sequence_parallel', False) + or not self.cfg.get('async_grad_allreduce', True) + ): # main grads are stored in the MainParamsOptimizer wrapper self._optimizer.allreduce_main_grads() else: @@ -596,15 +673,37 @@ def fwd_output_and_loss_func(dataloader_iter, model): batch_data, ) = batch - output = model( - encoder_input_ids, # enc_input_ids - encoder_attn_mask, # enc_attn_mask - decoder_input_ids, # dec_input_ids - decoder_attn_mask, # dec_attn_mask - None, # token_type_ids - lm_labels, # labels - batch_data, # batch_data - ) + if self.mcore_t5: + # attn mask logic follows megatron.data.t5_dataset.py in Megatron-LM + encoder_attn_mask_3d = build_attention_mask_3d( + encoder_attn_mask, encoder_attn_mask, AttnMaskType.padding + ) + decoder_attn_mask_3d = build_attention_mask_3d( + decoder_attn_mask, decoder_attn_mask, AttnMaskType.causal + ) + enc_dec_attn_mask_3d = build_attention_mask_3d( + decoder_attn_mask, encoder_attn_mask, AttnMaskType.padding + ) + + output = model( # model is MCoreT5Model + encoder_input_ids, # encoder_input_ids + decoder_input_ids, # decoder_input_ids + encoder_attn_mask_3d, # encoder_attn_mask + decoder_attn_mask_3d, # decoder_attn_mask + enc_dec_attn_mask_3d, # encoder_decoder_attn_mask + lm_labels, # lm_labels + ) + + else: + output = model( + encoder_input_ids, # enc_input_ids + encoder_attn_mask, # enc_attn_mask + decoder_input_ids, # dec_input_ids + decoder_attn_mask, # dec_attn_mask + None, # token_type_ids + lm_labels, # labels + batch_data, # batch_data + ) def loss_func(output_tensor): if isinstance(output_tensor, dict): @@ -983,6 +1082,36 @@ def setup(self, stage=None): ) == 'relative' and not self.cfg.decoder.get('relative_position_bias_self_attention_only', True): self.enc_dec_model.sync_initial_decoder_cross_attention_relative_position_embeddings() + if self.cfg.get('transformer_engine', False) or self.cfg.get('mcore_t5', False): + self.setup_transformer_engine_tp_groups() + + def setup_transformer_engine_tp_groups(self): + """This should be called after model parallel groups have been initialized + and only needs to be called when using Transformer Engine. + """ + for module in self.get_t5_module_list(): + """Set TP group + Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 + """ + # Deep iterate but skip self to avoid infinite recursion. 
+ for index, child in enumerate(module.modules()): + if index == 0: + continue + if hasattr(child, "set_tensor_parallel_group"): + tp_group = parallel_state.get_tensor_model_parallel_group() + child.set_tensor_parallel_group(tp_group) + + def get_t5_module_list(self): + if isinstance(self.enc_dec_model, list): + return [ + model.module if isinstance(model, (Float16Module, MCoreFloat16Module)) else model + for model in self.enc_dec_model + ] + elif isinstance(self.enc_dec_model, (Float16Module, MCoreFloat16Module)): + return [self.enc_dec_model.module] + else: + return [self.enc_dec_model] + def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): consumed_samples = self.compute_consumed_samples(0) @@ -1536,3 +1665,149 @@ def build_model_parallel_config(self): f'encoder.hidden_size not found in {self.cfg}. Set this in model_parallel_config if using pipeline parallelism.' ) return model_parallel_config + + def sharded_state_dict(self, prefix: str = '') -> Dict[str, Any]: + """ + Creates the sharded state dict which is used by dist_checkpoint to save the sharded tensors to disk. + When given the sharded_stated_dict, dist_checkpoint.load will load the tensors corresponding to + self.state_dict(). + The sharded tensor mapping is defined in the GPTModel class from mcore. + """ + if self.mcore_t5: + module_prefix = f'{prefix}model.' + sharded_state_dict = {} + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # virtual pipline rank must be set so that GPTModel returns the correct sharded state dict + parallel_state.set_virtual_pipeline_model_parallel_rank(index) + module_sharded_state_dict = module.sharded_state_dict(prefix=module_prefix) + sharded_state_dict[f'model_{index}'] = module_sharded_state_dict + else: + module_sharded_state_dict = module.sharded_state_dict(prefix=module_prefix) + sharded_state_dict.update(module_sharded_state_dict) + + # reset vp rank + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + + return sharded_state_dict + + def on_save_checkpoint(self, checkpoint) -> None: + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint + """ + if self.mcore_t5: + checkpoint['sharded_state_dict'] = self.sharded_state_dict() + else: + if isinstance(self.enc_dec_model, list): + for i in range(len(self.enc_dec_model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + checkpoint[f'model{i}'] = self.enc_dec_model[i].module.state_dict_for_save_checkpoint() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + + def on_load_checkpoint(self, checkpoint) -> None: + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-load-checkpoint + """ + if self.mcore_t5: + if 'state_dict' in checkpoint and checkpoint['state_dict']: + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + checkpoint_state_dict = checkpoint['state_dict'][f'model_{index}'] + else: + checkpoint_state_dict = checkpoint['state_dict'] + # checkpoint_state_dict has "model." 
but module does not so we need to remove it when loading + checkpoint_state_dict = { + key.replace('model.', ''): checkpoint_state_dict.pop(key) + for key in list(checkpoint_state_dict.keys()) + } + + # addressing the current T5 mcore version's implementation of sharded_state_dict + checkpoint_state_dict['lm_head.output_layer.bias'] = checkpoint_state_dict['output_layer.bias'] + + module.load_state_dict(checkpoint_state_dict, strict=True) + else: + checkpoint['state_dict'] = {} + else: + if isinstance(self.enc_dec_model, list): + for i in range(len(self.enc_dec_model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + self.enc_dec_model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + + def build_transformer_config(self) -> TransformerConfig: + """Builds the megatron core gpt transformer config for the model. + For attributes in the nemo model config that are the same + as the megatron core TransformerConfig, we will use the value from the nemo model config. + For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. + """ + + # for T5 model, transformers hyperparameters are stored in self.cfg.encoder/self.cfg.decoder + with open_dict(self.cfg): + for key in self.cfg.encoder: + print("{}: {}".format(key, self.cfg.encoder.get(key))) + OmegaConf.update(self.cfg, key, self.cfg.encoder.get(key)) + + normalization = self.cfg.get('normalization', 'layernorm') + + layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' + if normalization == 'layernorm': + normalization = 'LayerNorm' + elif normalization == 'rmsnorm': + normalization = 'RMSNorm' + elif normalization == 'layernorm1p': + normalization = 'LayerNorm' + layernorm_zero_centered_gamma = True + else: + logging.warning( + f"The normalization type: {normalization} might not be supported in megatron core." + f"Supported types are LayerNorm and RMSNorm." 
+ ) + + # any configs that are not in the nemo model config will be added here + model_specific_configs = { + 'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma, + 'normalization': normalization, + } + + transformer_config = super().build_transformer_config() + + for key, value in model_specific_configs.items(): + setattr(transformer_config, key, value) + + # pass mcore customization configs directly to mcore + mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {}) + for key, value in mcore_customization_config_dict.items(): + setattr(transformer_config, key, value) + + return transformer_config + + def setup_mcore_distributed_parallel(self): + """Set up mcore distributed data parallel""" + if self.with_distributed_adam and self.use_mcore_dist_optim: + config = get_model_config(self.enc_dec_model[0]) + ddp_config = DistributedDataParallelConfig( + grad_reduce_in_fp32=(self.cfg.optim.get('grad_sync_dtype', 'fp32') == 'fp32'), + overlap_grad_reduce=self.cfg.optim.get('overlap_grad_sync', False), + use_distributed_optimizer=True, + check_for_nan_in_grad=self.cfg.optim.get('check_for_nan_in_grad', False), + # mcore bucket_size is based on num of parameters, therefore not + # using bucket_cap_mb to configure bucket_size here + bucket_size=self.cfg.optim.get('ddp_bucket_size', None), + ) + self.enc_dec_model = [ + McoreDDP( + config, + ddp_config, + model_chunk, + data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. + disable_bucketing=(model_chunk_idx > 0), + ) + for (model_chunk_idx, model_chunk) in enumerate(self.enc_dec_model) + ] + + # (TODO) Broadcast params from data parallel src rank to other data parallel ranks. + # by calling model_module.broadcast_params() if the model is randomly initialized. 
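
For reference, the MCore T5 forward path added in this patch expands the 2D padding masks into 3D masks before calling the core model: padding-style masks for encoder self-attention and for encoder-decoder cross-attention, and a causal mask for decoder self-attention. The standalone PyTorch sketch below illustrates that construction under the assumption that True marks blocked positions; the helper names (make_padding_mask_3d, make_causal_mask_3d) are illustrative only and are not the NeMo/Megatron build_attention_mask_3d API used in the diff.

import torch


def make_padding_mask_3d(source_mask: torch.Tensor, target_mask: torch.Tensor) -> torch.Tensor:
    # source_mask: [batch, src_len], target_mask: [batch, tgt_len]; 1 = real token, 0 = padding.
    # Returns [batch, tgt_len, src_len] with True where attention should be blocked.
    visible = target_mask.unsqueeze(2) * source_mask.unsqueeze(1)
    return visible == 0


def make_causal_mask_3d(mask: torch.Tensor) -> torch.Tensor:
    # Decoder self-attention: combine the padding mask with a strictly upper-triangular
    # causal constraint so position i cannot attend to positions after i.
    padding = make_padding_mask_3d(mask, mask)
    seq_len = mask.size(1)
    causal = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=mask.device), diagonal=1)
    return padding | causal.unsqueeze(0)


if __name__ == "__main__":
    enc_mask = torch.tensor([[1, 1, 1, 0]])  # encoder tokens, last position is padding
    dec_mask = torch.tensor([[1, 1, 0]])     # decoder tokens, last position is padding
    enc_self_attn = make_padding_mask_3d(enc_mask, enc_mask)   # [1, 4, 4]
    dec_self_attn = make_causal_mask_3d(dec_mask)              # [1, 3, 3]
    cross_attn = make_padding_mask_3d(enc_mask, dec_mask)      # [1, 3, 4]
    print(enc_self_attn.shape, dec_self_attn.shape, cross_attn.shape)

In the patch itself the equivalent masks are produced by build_attention_mask_3d with AttnMaskType.padding for the encoder and cross-attention masks and AttnMaskType.causal for the decoder mask, so they match what megatron.core's T5Model expects.
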
From 17f295beb207a31c3f4dea40e311ccef3cbc08ff Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Mon, 8 Jul 2024 17:50:37 +0200 Subject: [PATCH 05/13] [Nemo-UX] Expose transformer_layer_spec inside GPTConfig (#9592) * Expose transformer_layer_spec inside GPTConfig * Apply isort and black reformatting Signed-off-by: marcromeyn * Expose layer-specs * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: marcromeyn Co-authored-by: marcromeyn --- nemo/collections/llm/gpt/model/__init__.py | 4 +++ nemo/collections/llm/gpt/model/base.py | 33 +++++++++++++++++++--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 1dac811f91ef..4391a41293ee 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -4,6 +4,8 @@ MaskedTokenLossReduction, gpt_data_step, gpt_forward_step, + local_layer_spec, + transformer_engine_layer_spec, ) from nemo.collections.llm.gpt.model.gemma import ( CodeGemmaConfig2B, @@ -56,4 +58,6 @@ "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", + "transformer_engine_layer_spec", + "local_layer_spec", ] diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 28a0eed52a5f..4c1f425d7f99 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -1,10 +1,12 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional +from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union import pytorch_lightning as L import torch import torch.distributed +from megatron.core.models.gpt import gpt_layer_specs from megatron.core.optimizer import OptimizerConfig +from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_config import TransformerConfig from torch import nn @@ -63,6 +65,18 @@ def gpt_forward_step(model, batch) -> torch.Tensor: return model(**forward_args) +def transformer_engine_layer_spec(config: "GPTConfig") -> ModuleSpec: + return gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec( + num_experts=config.num_moe_experts, moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm + ) + + +def local_layer_spec(config: "GPTConfig") -> ModuleSpec: + return gpt_layer_specs.get_gpt_layer_local_spec( + num_experts=config.num_moe_experts, moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm + ) + + @dataclass class GPTConfig(TransformerConfig, io.IOMixin): # From megatron.core.models.gpt.gpt_model.GPTModel @@ -79,6 +93,7 @@ class GPTConfig(TransformerConfig, io.IOMixin): # TODO: Move this to better places? get_attention_mask_from_fusion: bool = False + transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = transformer_engine_layer_spec forward_step_fn: Callable = gpt_forward_step data_step_fn: Callable = gpt_data_step @@ -91,12 +106,15 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages." 
from megatron.core import parallel_state - from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel + transformer_layer_spec = self.transformer_layer_spec + if not isinstance(transformer_layer_spec, ModuleSpec): + transformer_layer_spec = transformer_layer_spec(self) + return MCoreGPTModel( self, - transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(self.num_moe_experts), + transformer_layer_spec=transformer_layer_spec, vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by), max_sequence_length=self.seq_length, fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, @@ -225,4 +243,11 @@ def get_packed_seq_params(batch): ) -__all__ = ["GPTModel", "GPTConfig", "gpt_data_step", "gpt_forward_step"] +__all__ = [ + "GPTModel", + "GPTConfig", + "gpt_data_step", + "gpt_forward_step", + "transformer_engine_layer_spec", + "local_layer_spec", +] From a70349316552f1e5ee975fd03010152a17e1982e Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:20:33 -0700 Subject: [PATCH 06/13] Update NeMo Clip to Use MCore Modules (#9594) * update clip model and config file Signed-off-by: yaoyu-33 * update clip for mcore Signed-off-by: yaoyu-33 * MCore CLIP Fix Signed-off-by: yaoyu-33 * fix no mask Signed-off-by: yaoyu-33 * few neva fixes Signed-off-by: yaoyu-33 * update siglip module Signed-off-by: yaoyu-33 * add siglip loss Signed-off-by: yaoyu-33 * fix Signed-off-by: yaoyu-33 * fix collate fn Signed-off-by: yaoyu-33 * update siglip conversion script Signed-off-by: yaoyu-33 * update siglip convert Signed-off-by: yaoyu-33 * clip fixes Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * clean up script Signed-off-by: yaoyu-33 * clip fixes Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix code styles Signed-off-by: yaoyu-33 * Update siglip_loss.py Signed-off-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: yaoyu-33 --- examples/multimodal/convert_ckpt_to_nemo.py | 8 - .../clip/conf/megatron_clip_VIT-L-14.yaml | 51 +- .../clip/conf/megatron_clip_config.yaml | 3 +- .../clip/conf/megatron_clip_infer.yaml | 2 +- .../conf/megatron_siglip_so400m_14_384.yaml | 251 +++++ .../clip/convert_external_clip_to_nemo.py | 1 + .../clip/megatron_clip_pretrain.py | 7 +- .../multimodal/data/clip/clip_dataset.py | 33 +- .../multimodal/losses/siglip_loss.py | 220 +++++ .../clip/megatron_clip_models.py | 921 +++++++++++++++--- .../language_modeling/megatron_base_model.py | 2 +- nemo/collections/nlp/parts/utils_funcs.py | 15 +- .../convert_clip_hf_to_nemo.py | 248 +++++ .../convert_siglip_hf_to_nemo.py | 380 ++++++++ 14 files changed, 1996 insertions(+), 146 deletions(-) create mode 100644 examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml create mode 100644 nemo/collections/multimodal/losses/siglip_loss.py create mode 100644 scripts/checkpoint_converters/convert_clip_hf_to_nemo.py create mode 100644 scripts/checkpoint_converters/convert_siglip_hf_to_nemo.py diff --git a/examples/multimodal/convert_ckpt_to_nemo.py b/examples/multimodal/convert_ckpt_to_nemo.py index 2bc0f5d7ab62..573bdc0bc040 100644 --- a/examples/multimodal/convert_ckpt_to_nemo.py +++ 
b/examples/multimodal/convert_ckpt_to_nemo.py @@ -165,14 +165,6 @@ def convert(local_rank, rank, world_size, args): model = MegatronControlNet.load_from_checkpoint( checkpoint_path, hparams_file=args.hparams_file, trainer=trainer ) - elif args.model_type == 'kosmos': - model = MegatronKosmosModel.load_from_checkpoint( - checkpoint_path, hparams_file=args.hparams_file, trainer=trainer - ) - elif args.model_type == 'neva': - model = MegatronNevaModel.load_from_checkpoint( - checkpoint_path, hparams_file=args.hparams_file, trainer=trainer - ) else: raise ValueError(f"Unrecognized model_type {args.model_type}.") diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml index d8740bb98eb2..bfee36b6c099 100644 --- a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml @@ -1,3 +1,50 @@ +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 32 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. + max_steps: 375000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + check_val_every_n_epoch: null + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_clip + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + ema: + enable: False + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + model: precision: 32 # specify micro_batch_size, global_batch_size, and model parallelism @@ -19,6 +66,9 @@ model: local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix) gather_with_grad: True # enable full distributed gradient for feature gather, set this to False may cause convergence issue + mcore_gpt: False + transformer_engine: False + vision: precision: 32 # vision configs @@ -135,7 +185,6 @@ model: bias_activation_fusion: False megatron_legacy: True - transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_config.yaml 
b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_config.yaml index a6b1928ef13f..f75a163a5ed2 100644 --- a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_config.yaml +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_config.yaml @@ -68,6 +68,8 @@ model: # numerical results as the naïve method. local_loss: False # calculate loss w/ local features @ global (instead of realizing full global @ global matrix) gather_with_grad: True # enable full distributed gradient for feature gather, set this to False may cause convergence issue + mcore_gpt: True + transformer_engine: True vision: precision: ${trainer.precision} @@ -183,7 +185,6 @@ model: bias_activation_fusion: False megatron_legacy: False - transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_infer.yaml b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_infer.yaml index 215cd17841ae..3e127aa6d86a 100755 --- a/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_infer.yaml +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_infer.yaml @@ -6,7 +6,7 @@ trainer: num_nodes: 1 accelerator: gpu logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 + precision: 32 # 16, 32, or bf16 model: restore_from_path: null # Path to a trained ViT .nemo file diff --git a/examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml b/examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml new file mode 100644 index 000000000000..6c5be3a2bcd6 --- /dev/null +++ b/examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml @@ -0,0 +1,251 @@ +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: 32 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
+ max_steps: 375000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + check_val_every_n_epoch: null + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_clip + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + ema: + enable: False + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + +model: + precision: 32 + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 1 # will use more micro batches to reach global batch size + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + restore_from_pretrained: null # used in fine-tuning + # multimodal configs + output_dim: 1152 + # As the number of devices used to train increases, so does the space complexity of + # the logit matrix. Using a naïve all-gather scheme, space complexity will be + # `O(n^2)`. Instead, complexity may become effectively linear if the flags + # `--gather-with-grad` and `--local-loss` are used. This alteration results in one-to-one + # numerical results as the naïve method. + + use_siglip: True + mcore_gpt: True + transformer_engine: True + + vision: + precision: 32 + # vision configs + patch_dim: 14 + img_h: 378 + img_w: 378 + image_mean: null + image_std: null + num_channels: 3 + drop_patch_rate: 0.0 + drop_path_rate: 0.0 + global_average_pool: False + output_dim: ${model.output_dim} + class_token_length: 0 + preprocess_layernorm: True # apply layer norm to embedded tokens + + # model architecture + encoder_seq_length: 196 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: learned_absolute + num_layers: 27 + hidden_size: 1152 + ffn_hidden_size: 4304 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 16 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0. # Dropout probability for hidden state transformer. + attention_dropout: 0. + kv_channels: null # Projection weights dimension in multi-head attention. 
Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: layernorm # Type of normalization layers + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_num_layers: null # not used with 'selective' + sequence_parallel: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # model fusions + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. + openai_gelu: True + bias_activation_fusion: False + megatron_legacy: True + activation: approx-gelu + + + + text: + precision: 32 + # text configs + output_dim: ${model.output_dim} + + # model architecture + encoder_seq_length: 64 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: learned_absolute + num_layers: 27 + hidden_size: 1152 + ffn_hidden_size: 4304 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 16 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0. # Dropout probability for hidden state transformer. + attention_dropout: 0. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: layernorm # Type of normalization layers + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. 
+ + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_num_layers: null # not used with 'selective' + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # model fusions + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. + openai_gelu: True + bias_activation_fusion: False + megatron_legacy: True + + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + activation: approx-gelu + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce + + # miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + tokenizer: + library: 'huggingface' + type: 'google/siglip-so400m-patch14-384' + model: null + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + + data: + num_workers: 8 + train: + dataset_path: # List of paths to pkl files or tar files + - /datasets/coyo/test.pkl + validation: # List of paths to pkl files or tar files + dataset_path: + - /datasets/coyo/test.pkl + webdataset: + infinite_sampler: False + local_root_path: /datasets/coyo + + imagenet_val: null # Path to imagenet val set for conducting zero shot evaluation. 
+ + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [ 0 ] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: fused_adam + lr: 1e-3 + weight_decay: 0.2 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 0 + min_lr: 1e-5 \ No newline at end of file diff --git a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py index b9b9ab917173..9af25181d07e 100644 --- a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py +++ b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py @@ -283,6 +283,7 @@ def convert(local_rank, rank, world_size, args): if __name__ == '__main__': + logging.warning("This script is going to be deprecated soon. Please use ") args = get_args() local_rank, rank, world_size = initialize_distributed(args) convert(local_rank, rank, world_size, args) diff --git a/examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py b/examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py index 4462649a5861..abca470e5843 100644 --- a/examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py +++ b/examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py @@ -22,8 +22,6 @@ from nemo.utils import logging from nemo.utils.exp_manager import exp_manager -mp.set_start_method("spawn", force=True) - @hydra_runner(config_path="conf", config_name="megatron_clip_config") def main(cfg) -> None: @@ -31,7 +29,10 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') assert ( - cfg.trainer.devices * cfg.trainer.num_nodes + cfg.trainer.devices + * cfg.trainer.num_nodes + // cfg.model.tensor_model_parallel_size + // cfg.model.pipeline_model_parallel_size ) * cfg.model.micro_batch_size == cfg.model.global_batch_size, ( "Gradient accumulation is not supported in CLIP yet." 
) diff --git a/nemo/collections/multimodal/data/clip/clip_dataset.py b/nemo/collections/multimodal/data/clip/clip_dataset.py index 7e263e19dcc9..6b63d546194a 100644 --- a/nemo/collections/multimodal/data/clip/clip_dataset.py +++ b/nemo/collections/multimodal/data/clip/clip_dataset.py @@ -76,11 +76,18 @@ def get_preprocess_fns(model_cfg, tokenizer=None, is_train=True): img_size = (model_cfg.vision.get("img_h"), model_cfg.vision.get("img_w")) img_mean = model_cfg.vision.get("img_mean") img_std = model_cfg.vision.get("img_std") - img_transform = image_transform(img_size, is_train=is_train, mean=img_mean, std=img_std,) + img_transform = image_transform( + img_size, + is_train=is_train, + mean=img_mean, + std=img_std, + ) text_transform = lambda x: x if tokenizer is not None: text_transform = partial( - tokenize, tokenizer=tokenizer, context_length=model_cfg.text.get("max_position_embeddings"), + tokenize, + tokenizer=tokenizer, + context_length=model_cfg.text.get("max_position_embeddings"), ) return img_transform, text_transform @@ -100,7 +107,9 @@ def transform_fn(sample, img_transform, text_transform): def build_train_valid_datasets( - model_cfg, consumed_samples, tokenizer=None, + model_cfg, + consumed_samples, + tokenizer=None, ): data_cfg = model_cfg.data @@ -127,6 +136,13 @@ def build_train_valid_datasets( return train_data, val_data +def custom_collate(batch): + if len(batch) == 0: + return None, None + else: + return default_collate(batch) + + # For zero-shot imagenet validation def build_imagenet_validation_dataloader(model_cfg, tokenizer=None): val_image_transform, text_transform = get_preprocess_fns(model_cfg, tokenizer, is_train=False) @@ -138,7 +154,10 @@ def build_imagenet_validation_dataloader(model_cfg, tokenizer=None): if imagenet_path is None: return None - image_dataset = ImageFolder(root=imagenet_path, transform=val_image_transform,) + image_dataset = ImageFolder( + root=imagenet_path, + transform=val_image_transform, + ) image_batch_sampler = MegatronPretrainingSampler( total_samples=len(image_dataset), @@ -150,12 +169,6 @@ def build_imagenet_validation_dataloader(model_cfg, tokenizer=None): drop_last=False, ) - def custom_collate(batch): - if len(batch) == 0: - return None, None - else: - return default_collate(batch) - imagenet_val["images"] = torch.utils.data.DataLoader( image_dataset, batch_sampler=image_batch_sampler, diff --git a/nemo/collections/multimodal/losses/siglip_loss.py b/nemo/collections/multimodal/losses/siglip_loss.py new file mode 100644 index 000000000000..a7d2ec9b46ce --- /dev/null +++ b/nemo/collections/multimodal/losses/siglip_loss.py @@ -0,0 +1,220 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
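
For concreteness, the batch-size assertion rewritten in megatron_clip_pretrain.py above now divides the world size by tensor and pipeline parallelism before checking against the global batch size. A small worked example with made-up numbers (gradient accumulation remains unsupported in CLIP, so the relation must hold exactly):

    # Hypothetical launch: 2 nodes x 8 GPUs, TP=4, PP=1.
    devices, num_nodes = 8, 2
    tensor_model_parallel_size, pipeline_model_parallel_size = 4, 1
    micro_batch_size = 32

    data_parallel_size = devices * num_nodes // tensor_model_parallel_size // pipeline_model_parallel_size
    global_batch_size = micro_batch_size * data_parallel_size  # 32 * 4 = 128

    # Mirrors the updated assert: (devices * num_nodes // TP // PP) * micro_batch_size == global_batch_size
    assert data_parallel_size * micro_batch_size == global_batch_size
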
+ +# This file contains code artifacts adapted from the original implementation: +# https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/loss.py + +import torch +import torch.nn.functional as F + + +def neighbour_exchange(from_rank, to_rank, tensor, group=None): + tensor_recv = torch.zeros_like(tensor) + send_op = torch.distributed.P2POp( + torch.distributed.isend, + tensor, + to_rank, + group=group, + ) + recv_op = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_recv, + from_rank, + group=group, + ) + reqs = torch.distributed.batch_isend_irecv([send_op, recv_op]) + for req in reqs: + req.wait() + return tensor_recv + + +def neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None): + tensor_from_left = torch.zeros_like(tensor_to_right) + tensor_from_right = torch.zeros_like(tensor_to_left) + send_op_left = torch.distributed.P2POp( + torch.distributed.isend, + tensor_to_left, + left_rank, + group=group, + ) + send_op_right = torch.distributed.P2POp( + torch.distributed.isend, + tensor_to_right, + right_rank, + group=group, + ) + recv_op_left = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_from_left, + left_rank, + group=group, + ) + recv_op_right = torch.distributed.P2POp( + torch.distributed.irecv, + tensor_from_right, + right_rank, + group=group, + ) + reqs = torch.distributed.batch_isend_irecv([send_op_right, send_op_left, recv_op_right, recv_op_left]) + for req in reqs: + req.wait() + return tensor_from_right, tensor_from_left + + +class NeighbourExchange(torch.autograd.Function): + @staticmethod + def forward(ctx, from_rank, to_rank, group, tensor): + ctx.group = group + ctx.from_rank = from_rank + ctx.to_rank = to_rank + return neighbour_exchange(from_rank, to_rank, tensor, group=group) + + @staticmethod + def backward(ctx, grad_output): + return (None, None, None) + (NeighbourExchange.apply(ctx.to_rank, ctx.from_rank, ctx.group, grad_output),) + + +def neighbour_exchange_with_grad(from_rank, to_rank, tensor, group=None): + return NeighbourExchange.apply(from_rank, to_rank, group, tensor) + + +class NeighbourExchangeBidir(torch.autograd.Function): + @staticmethod + def forward(ctx, left_rank, right_rank, group, tensor_to_left, tensor_to_right): + ctx.group = group + ctx.left_rank = left_rank + ctx.right_rank = right_rank + return neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=group) + + @staticmethod + def backward(ctx, *grad_outputs): + return (None, None, None) + NeighbourExchangeBidir.apply( + ctx.right_rank, ctx.left_rank, ctx.group, *grad_outputs + ) + + +def neighbour_exchange_bidir_with_grad(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None): + return NeighbourExchangeBidir.apply(left_rank, right_rank, group, tensor_to_left, tensor_to_right) + + +class SigLipLoss(torch.nn.Module): + """Sigmoid Loss for Language Image Pre-Training (SigLIP) - https://arxiv.org/abs/2303.15343 + + @article{zhai2023sigmoid, + title={Sigmoid loss for language image pre-training}, + author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas}, + journal={arXiv preprint arXiv:2303.15343}, + year={2023} + } + """ + + def __init__( + self, + cache_labels=False, + rank=0, + world_size=1, + group=None, + bidir=True, + ): + super().__init__() + self.cache_labels = cache_labels + self.rank = rank + self.world_size = world_size + self.group = group + self.bidir = bidir + + def get_ground_truth(self, device, dtype, num_logits, negative_only=False) -> 
torch.Tensor: + labels = -torch.ones((num_logits, num_logits), device=device, dtype=dtype) + if not negative_only: + labels = 2 * torch.eye(num_logits, device=device, dtype=dtype) + labels + return labels + + def get_logits(self, image_features, text_features, logit_scale, logit_bias=None): + logits = logit_scale * image_features @ text_features.T + if logit_bias is not None: + logits += logit_bias + return logits + + def _loss(self, image_features, text_features, logit_scale, logit_bias=None, negative_only=False): + logits = self.get_logits(image_features, text_features, logit_scale, logit_bias) + labels = self.get_ground_truth( + image_features.device, + image_features.dtype, + image_features.shape[0], + negative_only=negative_only, + ) + loss = -F.logsigmoid(labels * logits).sum() / image_features.shape[0] + return loss + + def forward( + self, + output_tensor, + ): + image_features, text_features, logit_scale, logit_bias = output_tensor + loss = self._loss(image_features, text_features, logit_scale, logit_bias) + + if self.world_size > 1: + # exchange text features w/ neighbour world_size - 1 times + right_rank = (self.rank + 1) % self.world_size + left_rank = (self.rank - 1 + self.world_size) % self.world_size + if self.bidir: + text_features_to_right = text_features_to_left = text_features + num_bidir, remainder = divmod(self.world_size - 1, 2) + for i in range(num_bidir): + text_features_recv = neighbour_exchange_bidir_with_grad( + left_rank, + right_rank, + text_features_to_left, + text_features_to_right, + group=self.group, + ) + + for f in text_features_recv: + loss += self._loss( + image_features, + f, + logit_scale, + logit_bias, + negative_only=True, + ) + text_features_to_left, text_features_to_right = text_features_recv + + if remainder: + text_features_recv = neighbour_exchange_with_grad( + left_rank, right_rank, text_features_to_right, group=self.group + ) + + loss += self._loss( + image_features, + text_features_recv, + logit_scale, + logit_bias, + negative_only=True, + ) + else: + text_features_to_right = text_features + for i in range(self.world_size - 1): + text_features_from_left = neighbour_exchange_with_grad( + left_rank, right_rank, text_features_to_right, group=self.group + ) + + loss += self._loss( + image_features, + text_features_from_left, + logit_scale, + logit_bias, + negative_only=True, + ) + text_features_to_right = text_features_from_left + return loss, {"loss": loss} diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index 7be7407b98ae..a83960307672 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -13,12 +13,17 @@ # limitations under the License. 
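
To make the objective implemented in SigLipLoss above easier to follow, here is a minimal single-process sketch of the pairwise sigmoid loss; the function and tensor names are illustrative, the distributed neighbour-exchange path is deliberately omitted, and features are assumed to be L2-normalized as in the loss class:

    import torch
    import torch.nn.functional as F

    def siglip_loss_reference(image_features, text_features, logit_scale, logit_bias):
        # image_features, text_features: [N, D], already L2-normalized.
        logits = logit_scale * image_features @ text_features.T + logit_bias
        # +1 on matched image/text pairs (the diagonal), -1 everywhere else.
        labels = 2 * torch.eye(logits.shape[0], device=logits.device, dtype=logits.dtype) - 1
        return -F.logsigmoid(labels * logits).sum() / logits.shape[0]

    img = F.normalize(torch.randn(8, 64), dim=-1)
    txt = F.normalize(torch.randn(8, 64), dim=-1)
    # The scale/bias values roughly match the exp(log(10)) and -10 initializations used later in this patch.
    loss = siglip_loss_reference(img, txt, torch.tensor(10.0), torch.tensor(-10.0))

In the multi-rank version above, each rank additionally accumulates negative-only terms against text features exchanged with neighbouring data-parallel ranks, which is what the neighbour_exchange helpers implement.
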
import itertools -from functools import partial +import os +import warnings +from contextlib import nullcontext +from dataclasses import fields +from functools import cache, partial from typing import Any, Optional import numpy as np import torch import torch.nn.functional as F +from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.trainer.trainer import Trainer @@ -29,7 +34,9 @@ build_train_valid_datasets, ) from nemo.collections.multimodal.losses.clip_loss import ClipLoss +from nemo.collections.multimodal.losses.siglip_loss import SigLipLoss from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import get_specs, mcore_supports_moe from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.language_model import get_language_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module, MegatronModule @@ -40,7 +47,7 @@ init_method_normal, scaled_init_method_normal, ) -from nemo.collections.nlp.parts.utils_funcs import get_last_rank, torch_dtype_from_precision +from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank from nemo.collections.vision.modules.vit.vit_backbone import VitBackbone from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -55,7 +62,33 @@ try: from megatron.core import parallel_state + from megatron.core.distributed import DistributedDataParallel as McoreDDP + from megatron.core.distributed import DistributedDataParallelConfig + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.models.gpt import GPTModel as MCoreGPTModel + from megatron.core.models.vision.clip_vit_model import CLIPViTModel from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules + from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelLinear, + ) + from megatron.core.transformer.enums import AttnMaskType as MCoreAttnMaskType + from megatron.core.transformer.identity_op import IdentityOp + from megatron.core.transformer.mlp import MLP, MLPSubmodules + from megatron.core.transformer.module import Float16Module as MCoreFloat16Module + from megatron.core.transformer.spec_utils import ModuleSpec + from megatron.core.transformer.transformer_config import TransformerConfig + from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + from megatron.core.utils import ( + drain_embedding_wgrad_compute, + get_model_config, + init_method_normal, + scaled_init_method_normal, + ) HAVE_MEGATRON_CORE = True @@ -63,6 +96,28 @@ HAVE_MEGATRON_CORE = False +try: + import transformer_engine + from transformer_engine.pytorch import module as te_module + + HAVE_TE = True + +except (ImportError, ModuleNotFoundError): + HAVE_TE = False + + +@cache +def mcore_supports_moe() -> bool: + global HAVE_MEGATRON_CORE + if not HAVE_MEGATRON_CORE: + return False + try: + from megatron.core.transformer.moe.router import TopKRouter + + return True + except ImportError: + return False + class CLIPVisionTransformer(MegatronModule): """Vision 
Transformer Model.""" @@ -100,7 +155,11 @@ def __init__(self, model_cfg, model_parallel_config, pre_process=True, post_proc if self.post_process and not skip_head: self.output_dim = model_cfg.output_dim - self.head = torch.nn.Linear(self.hidden_size, self.output_dim, bias=False,) + self.head = torch.nn.Linear( + self.hidden_size, + self.output_dim, + bias=False, + ) def set_input_tensor(self, input_tensor): """See megatron.model.transformer.set_input_tensor()""" @@ -129,7 +188,6 @@ def __init__(self, model_cfg, model_parallel_config, padded_vocab_size, pre_proc self.pre_process = pre_process self.post_process = post_process self.fp16_lm_cross_entropy = model_cfg.fp16_lm_cross_entropy - self.sequence_parallel = model_cfg.sequence_parallel self.gradient_accumulation_fusion = model_cfg.gradient_accumulation_fusion scaled_init_method = ( @@ -173,7 +231,7 @@ def __init__(self, model_cfg, model_parallel_config, padded_vocab_size, pre_proc openai_gelu=model_cfg.openai_gelu, onnx_safe=model_cfg.onnx_safe, megatron_legacy=model_cfg.megatron_legacy, - transformer_engine=model_cfg.transformer_engine, + transformer_engine=False, fp8=model_cfg.fp8, fp8_e4m3=model_cfg.fp8_e4m3, fp8_hybrid=model_cfg.fp8_hybrid, @@ -193,14 +251,17 @@ def __init__(self, model_cfg, model_parallel_config, padded_vocab_size, pre_proc hidden_size=model_cfg.hidden_size, ) - # TODO (yuya): check this position id self.position_ids = None if self.pre_process: self.position_ids = torch.arange(model_cfg.max_position_embeddings).expand(1, -1).cuda() if self.post_process: self.output_dim = model_cfg.output_dim - self.head = torch.nn.Linear(model_cfg.hidden_size, self.output_dim, bias=False,) + self.head = torch.nn.Linear( + model_cfg.hidden_size, + self.output_dim, + bias=False, + ) self.attn_mask = self.build_attention_mask(model_cfg.max_position_embeddings) @@ -217,7 +278,8 @@ def build_attention_mask(self, max_position_embeddings): return mask def forward( - self, input_ids, + self, + input_ids, ): # input_ids: [b, s] # position_ids: [b, s] @@ -245,27 +307,263 @@ def forward( return hidden_states +class SiglipMHAPoolingHead(TransformerLayer): + """Multihead Attention Pooling.""" + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + ): + super().__init__(config, submodules) + + self.probe = torch.nn.Parameter(torch.randn(1, 1, config.hidden_size)) + + def forward(self, hidden_state): + batch_size = hidden_state.shape[0] + # [s, b, h] + probe = self.probe.repeat(1, batch_size, 1) + hidden_state = hidden_state.transpose(0, 1) + hidden_state, context = super().forward( + probe, + attention_mask=None, + context=hidden_state, + ) + + return hidden_state[0] + + +class MCoreSiglipViTModel(CLIPViTModel): + def __init__(self, *args, **kwargs): + # TODO (yuya): need to handle post_process correctly in order to enable PP + self.output_dim = kwargs.pop('output_dim') + kwargs['ln_pre_impl'] = IdentityOp + super().__init__(*args, **kwargs) + assert self.output_dim == self.config.hidden_size, "Siglip output_dim needs to be the same as hidden_size." 
+ self.conv1 = torch.nn.Conv2d( + in_channels=3, + out_channels=self.visual_hidden_size, + kernel_size=self.patch_dim, + stride=self.patch_dim, + bias=True, + ) + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + self.head = SiglipMHAPoolingHead( + self.config, + submodules=TransformerLayerSubmodules( + cross_attention=ModuleSpec( + module=CrossAttention, + params={"attn_mask_type": MCoreAttnMaskType.no_mask}, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + cross_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, + linear_fc2=TERowParallelLinear, + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ) + + def forward(self, x): + x = super().forward( + x, + ) + x = self.final_layernorm(x) + x = self.head(x) + return x + + +class MCoreSiglipTextModel(MCoreGPTModel): + def __init__(self, *args, **kwargs): + # TODO (yuya): need to handle post_process correctly in order to enable PP + self.output_dim = kwargs.pop('output_dim') + kwargs['transformer_layer_spec'].submodules.self_attention.params['attn_mask_type'] = MCoreAttnMaskType.no_mask + + super().__init__(*args, **kwargs) + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=True, + ) + + self.position_ids = None + if self.pre_process: + self.position_ids = torch.arange(kwargs['max_sequence_length']).expand(1, -1).cuda() + + def forward(self, input_ids): + + x = super().forward(input_ids, position_ids=self.position_ids, attention_mask=None) + x = self.final_layernorm(x) + x = x[-1] + x = self.head(x) + return x + + +class MCoreCLIPViTModel(CLIPViTModel): + def __init__(self, *args, **kwargs): + # TODO (yuya): need to handle post_process correctly in order to enable PP + self.output_dim = kwargs.pop('output_dim') + super().__init__(*args, **kwargs) + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=False, + ) + + def forward(self, x): + x = super().forward( + x, + ) + x = self.final_layernorm(x) + x = x[:, 0] + x = self.head(x) + return x + + +class MCoreCLIPTextModel(MCoreGPTModel): + def __init__(self, *args, **kwargs): + # TODO (yuya): need to handle post_process correctly in order to enable PP + self.output_dim = kwargs.pop('output_dim') + + super().__init__(*args, **kwargs) + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=False, + ) + self.position_ids = None + if self.pre_process: + self.position_ids = torch.arange(kwargs['max_sequence_length']).expand(1, -1).cuda() + + def forward(self, input_ids): + x = super().forward(input_ids, position_ids=self.position_ids, attention_mask=None) + x = self.final_layernorm(x) + x = x[input_ids.argmax(dim=-1), torch.arange(x.shape[1])] + x = self.head(x) + return x + + class CLIPModel(MegatronModule): """CLIP Model""" - def __init__(self, model_cfg, model_parallel_config, padded_vocab_size, 
pre_process=True, post_process=True): + def __init__( + self, + model_cfg, + model_parallel_config, + vision_transformer_config, + text_transformer_config, + padded_vocab_size, + pre_process=True, + post_process=True, + ): super(CLIPModel, self).__init__() self.config = model_parallel_config + self.use_siglip = model_cfg.get("use_siglip", False) self.pre_process = pre_process self.post_process = post_process - self.vision_encoder = CLIPVisionTransformer( - model_cfg.vision, model_parallel_config, pre_process=self.pre_process, post_process=self.post_process, - ) - self.text_encoder = CLIPTextTransformer( - model_cfg.text, - model_parallel_config, - padded_vocab_size, - pre_process=self.pre_process, - post_process=self.post_process, - ) + self.output_dim = model_cfg.output_dim + self.get_attention_mask_from_fusion = model_cfg.get('get_attention_mask_from_fusion', True) - self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + if model_cfg.get("mcore_gpt", False): + if model_cfg.vision.get("class_token_length") is None or model_cfg.vision.get("class_token_length") <= 0: + add_class_token = False + else: + add_class_token = True + vision_layer_spec = get_specs( + model_cfg.text.get('name', ''), + vision_transformer_config.num_moe_experts, + vision_transformer_config.moe_grouped_gemm, + model_cfg.get('transformer_engine', True), + ) + vision_layer_spec.submodules.self_attention.params['attn_mask_type'] = MCoreAttnMaskType.no_mask + + if model_cfg.get("use_siglip", False): + vision_module = MCoreSiglipViTModel + text_module = MCoreSiglipTextModel + else: + vision_module = MCoreCLIPViTModel + text_module = MCoreCLIPTextModel + self.vision_encoder = vision_module( + transformer_config=vision_transformer_config, + transformer_layer_spec=vision_layer_spec, + patch_dim=model_cfg.vision.get('patch_dim', 16), + img_h=model_cfg.vision.get('img_h', 224), + img_w=model_cfg.vision.get('img_w', 224), + add_class_token=add_class_token, + class_token_len=model_cfg.vision.get('class_token_length'), + output_dim=model_cfg.output_dim, + ) + self.text_encoder = text_module( + config=text_transformer_config, + transformer_layer_spec=get_specs( + model_cfg.text.get('name', ''), + text_transformer_config.num_moe_experts, + text_transformer_config.moe_grouped_gemm, + model_cfg.get('transformer_engine', True), + ), + vocab_size=model_cfg.text.get('override_vocab_size', padded_vocab_size), + max_sequence_length=model_cfg.text.get('encoder_seq_length', 512), + pre_process=pre_process, + post_process=False, + parallel_output=True, + share_embeddings_and_output_weights=False, + position_embedding_type=model_cfg.text.get('position_embedding_type', 'learned_absolute'), + rotary_percent=model_cfg.text.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=model_cfg.text.get('seq_len_interpolation_factor', None), + rotary_base=model_cfg.text.get('rotary_base', 10000), + output_dim=model_cfg.output_dim, + ) + + else: + self.vision_encoder = CLIPVisionTransformer( + model_cfg.vision, + model_parallel_config, + pre_process=self.pre_process, + post_process=self.post_process, + ) + self.text_encoder = CLIPTextTransformer( + model_cfg.text, + model_parallel_config, + padded_vocab_size, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + if self.use_siglip: + self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(10)) + self.logit_bias = torch.nn.Parameter(torch.ones([]) * (-10)) + else: + self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) def 
set_input_tensor(self, input_tensor): """See megatron.model.transformer.set_input_tensor()""" @@ -277,10 +575,89 @@ def forward(self, images, captions): text_features = self.text_encoder(captions) if self.post_process: + if self.use_siglip: + return ( + F.normalize(image_features, dim=-1), + F.normalize(text_features, dim=-1), + self.logit_scale.exp(), + self.logit_bias, + ) return F.normalize(image_features, dim=-1), F.normalize(text_features, dim=-1), self.logit_scale.exp() return image_features, text_features + def build_transformer_config(self) -> TransformerConfig: + """Builds the megatron core gpt transformer config for the model. + For attributes in the nemo model config that are the same + as the megatron core TransformerConfig, we will use the value from the nemo model config. + For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. + """ + + normalization = self.cfg.get('normalization', 'layernorm').lower() + layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p' + if normalization == 'layernorm': + normalization = 'LayerNorm' + elif normalization == 'rmsnorm': + normalization = 'RMSNorm' + elif normalization == 'layernorm1p': + normalization = 'LayerNorm' + layernorm_zero_centered_gamma = True + else: + logging.warning( + f"The normalization type: {normalization} might not be supported in megatron core." + f"Supported types are LayerNorm and RMSNorm." + ) + + ub_tp_comm_overlap = self.cfg.get('ub_tp_comm_overlap', False) + + if not self.cfg.get('fp8', False): + fp8 = None + elif self.cfg.get('fp8_e4m3', False): + fp8 = 'e4m3' + elif self.cfg.get('fp8_hybrid', False): + fp8 = 'hybrid' + else: + raise ValueError(f"fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.") + + # any configs that are not in the nemo model config will be added here + model_specific_configs = { + 'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma, + 'normalization': normalization, + 'fp8': fp8, + 'tp_comm_overlap': ub_tp_comm_overlap, + # MoE related + 'num_moe_experts': self.cfg.get('num_moe_experts', None), + 'moe_router_load_balancing_type': self.cfg.get('moe_router_load_balancing_type', 'aux_loss'), + 'moe_router_topk': self.cfg.get('moe_router_topk', 2), + 'moe_grouped_gemm': self.cfg.get('moe_grouped_gemm', False), + 'moe_aux_loss_coeff': self.cfg.get( + 'moe_aux_loss_coeff', 0 + ), # 1e-2 would be a good start value for load balance loss. + 'moe_z_loss_coeff': self.cfg.get('moe_z_loss_coeff', None), # 1e-3 would be a good start value for z-loss + 'moe_input_jitter_eps': self.cfg.get('moe_input_jitter_eps', None), + 'moe_token_dropping': self.cfg.get('moe_token_dropping', False), # TODO: Support token dropping. 
+ } + if model_specific_configs['num_moe_experts'] is not None: + assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE' + elif not mcore_supports_moe(): + if 'num_moe_experts' in model_specific_configs: + del model_specific_configs['num_moe_experts'] + moe_keys = list(filter(lambda x: x.startswith('moe_'), model_specific_configs.keys())) + for k in moe_keys: + del model_specific_configs[k] + + transformer_config = super().build_transformer_config() + + for key, value in model_specific_configs.items(): + setattr(transformer_config, key, value) + + # pass mcore customization configs directly to mcore + mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {}) + for key, value in mcore_customization_config_dict.items(): + setattr(transformer_config, key, value) + + return transformer_config + class MegatronCLIPModel(MegatronBaseModel): """Megatron CLIP Model.""" @@ -302,11 +679,21 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self._validate_trainer() + # placeholder for O2 wrapper + self.transformer_config = self.build_transformer_config(self.cfg.text) + self.megatron_amp_O2 = cfg.get('megatron_amp_O2', False) + self.mcore_gpt = cfg.get('mcore_gpt', False) + if cfg.get('fp8', False): + self.prev_step_training = True if not self.megatron_amp_O2 and self.cfg.get('virtual_pipeline_model_parallel_size', None): raise ValueError('Virtual pipeline model parallel is only supported when using megatron_amp_O2') + self.transformer_engine = cfg.get('transformer_engine', False) + if self.megatron_amp_O2 and not self.transformer_engine: + logging.warning('megatron_amp_O2 is enabled but transformer-engine is not.') + # build_model returns a list of modules which are used for interleaved pipeline parallelism if isinstance(self.trainer.accelerator, CPUAccelerator): self.model = build_model( @@ -316,19 +703,24 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None), ) else: - self.model = build_model( - model_provider_func=self.model_provider_func, - wrap_with_ddp=False, - virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None), - ) + build_model_context = nullcontext + if HAVE_TE and self.cfg.get('fp8', False) and self.cfg.get('fp8_params', False): + build_model_context = transformer_engine.pytorch.fp8_model_init + with build_model_context(): + self.model = build_model( + model_provider_func=self.model_provider_func, + wrap_with_ddp=False, + virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None), + on_cpu=cfg.get('fsdp', False) and cfg.get('use_cpu_initialization', False), + ) # if we're not using interleaved, then self.model is a module. 
- if self.cfg.get('virtual_pipeline_model_parallel_size', None) is None: + if self.cfg.get('virtual_pipeline_model_parallel_size', None) is None and (not self.use_mcore_dist_optim): self.model = self.model[0] if self.megatron_amp_O2: - if not self.with_distributed_adam: + if not self.with_distributed_adam and not self.cfg.get("use_cpu_initialization", False): # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type if isinstance(self.model, list): for module in self.model: @@ -336,31 +728,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): else: self.model.cuda(torch.cuda.current_device()) - # Model wrapper to convert both model and inputs to half precision - # TODO (yuya): check this; FP16 Module might not work; when self.model is a list? - if isinstance(self.model, list): - converted_model = [] - for module in self.model: - converted_model.append( - Float16Module(config=self.model_parallel_config, module=module, precision=cfg.precision) - ) - self.model = converted_model - else: - self.model = Float16Module( - config=self.model_parallel_config, module=self.model, precision=cfg.precision - ) + self._wrap_model_for_O2() - self.autocast_dtype = torch_dtype_from_precision(self.trainer.precision) self.enable_autocast = ( True if (not self.megatron_amp_O2) and (self.autocast_dtype in [torch.float16, torch.bfloat16]) else False ) - self.transformer_engine = cfg.get('transformer_engine', False) - # Convert the global-batch-based profile index to micro-batch index if hasattr(self, '_nsys_profile_enabled') or hasattr(self, '_memory_profile_enabled'): mp_size = cfg.get('tensor_model_parallel_size', 1) * cfg.get('pipeline_model_parallel_size', 1) - data_parallel_world_size = trainer.world_size // mp_size + cp_size = cfg.get('context_parallel_size', 1) + data_parallel_world_size = trainer.world_size // (mp_size * cp_size) grad_accum_steps = cfg.get('global_batch_size') // (cfg.get('micro_batch_size') * data_parallel_world_size) if hasattr(self, '_nsys_profile_enabled'): self._nsys_profile_start_step *= grad_accum_steps @@ -368,22 +746,36 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if hasattr(self, '_memory_profile_enabled'): self._memory_profile_start_step *= grad_accum_steps self._memory_profile_end_step *= grad_accum_steps - self.get_attention_mask_from_fusion = self.cfg.get('get_attention_mask_from_fusion', True) - self.initialize_ub = self.cfg.get('ub_tp_comm_overlap', False) - def get_module_list(self): - if isinstance(self.model, list): - return [model.module if isinstance(model, Float16Module) else model for model in self.model] - elif isinstance(self.model, Float16Module): - return [self.model.module] - else: - return [self.model] + self.initialize_ub = self.cfg.get('ub_tp_comm_overlap', False) + self.log_train_loss = bool(int(os.getenv("NEMO_LOG_TRAIN_LOSS", 1))) + self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0))) + self.loss_broadcast_src_rank = None + data_cfg = cfg.get('data', {}) + self.return_output_tensors = data_cfg.get('return_output_tensors', False) + self.validation_drop_last = data_cfg.get('validation_drop_last', True) + self.sample_weight = data_cfg.get('sample_weight', 'token') + self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) def model_provider_func(self, pre_process, post_process): """Model depends on pipeline paralellism.""" + vision_transformer_config = self.build_transformer_config(self.cfg.vision) if self.mcore_gpt else None + 
text_transformer_config = self.build_transformer_config(self.cfg.text) if self.mcore_gpt else None + + if self.mcore_gpt and not parallel_state.is_initialized(): + + def dummy(): + return + + if self.trainer.strategy.launcher is not None: + self.trainer.strategy.launcher.launch(dummy, trainer=self.trainer) + self.trainer.strategy.setup_environment() + model = CLIPModel( model_cfg=self.cfg, model_parallel_config=self.model_parallel_config, + vision_transformer_config=vision_transformer_config, + text_transformer_config=text_transformer_config, padded_vocab_size=self.padded_vocab_size, pre_process=pre_process, post_process=post_process, @@ -401,9 +793,40 @@ def setup_optimizer_param_groups(self): else: self._optimizer_param_groups = get_params_for_weight_decay_optimization(self.model) + def setup_mcore_distributed_parallel(self): + """Set up mcore distributed data parallel""" + if self.with_distributed_adam and self.use_mcore_dist_optim: + config = get_model_config(self.model[0]) + ddp_config = DistributedDataParallelConfig( + grad_reduce_in_fp32=(self.cfg.optim.get('grad_sync_dtype', 'fp32') == 'fp32'), + overlap_grad_reduce=self.cfg.optim.get('overlap_grad_sync', False), + use_distributed_optimizer=True, + check_for_nan_in_grad=self.cfg.optim.get('check_for_nan_in_grad', False), + # mcore bucket_size is based on num of parameters, therefore not + # using bucket_cap_mb to configure bucket_size here + bucket_size=self.cfg.optim.get('ddp_bucket_size', None), + ) + + self.model = [ + McoreDDP( + config, + ddp_config, + model_chunk, + data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. + disable_bucketing=(model_chunk_idx > 0), + ) + for (model_chunk_idx, model_chunk) in enumerate(self.model) + ] + + # (TODO) Broadcast params from data parallel src rank to other data parallel ranks. + # by calling model_module.broadcast_params() if the model is randomly initialized. 
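
For readers less familiar with Megatron-Core, the McoreDDP wrapping above plays roughly the same role as wrapping each model chunk in PyTorch's own DistributedDataParallel against an explicit data-parallel process group. The sketch below is an orientation-only analogue (names are illustrative and this is not what NeMo actually calls), assuming torch.distributed has already been initialized:

    import torch
    from torch.nn.parallel import DistributedDataParallel as DDP

    def wrap_chunks_plain_ddp(model_chunks, data_parallel_group):
        # Roughly analogous to the McoreDDP loop above: one wrapper per model chunk,
        # reducing gradients only over the data-parallel group.
        return [
            DDP(chunk, process_group=data_parallel_group, gradient_as_bucket_view=True)
            for chunk in model_chunks
        ]

The real path additionally threads a DistributedDataParallelConfig through, so that grad-reduce dtype, overlap, and bucket sizing come from the optim section of the model config as shown above.
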
+ def configure_optimizers(self): - if self.with_distributed_adam: + if self.with_distributed_adam and not self.use_mcore_dist_optim: # Disable overlapped grad sync for layer norm grads when # sequence parallelism is enabled @@ -462,13 +885,16 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): no_sync_func = None grad_sync_func = None param_sync_func = None - if not forward_only and self.with_distributed_adam: - no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + if not forward_only and self.with_distributed_adam and not self.use_mcore_dist_optim: + no_sync_func = partial( + self._optimizer.no_sync, + greedy_grad_copy=self.megatron_amp_O2, + ) grad_sync_func = self.reduce_overlap_gradients param_sync_func = self.sync_overlap_parameters # pipeline schedules will get these from self.model.config - for module in self.get_module_list(): + for module in self.get_model_module_list(): module.config.no_sync_func = no_sync_func module.config.grad_sync_func = grad_sync_func module.config.param_sync_func = param_sync_func @@ -515,7 +941,9 @@ def initialize_ub_func(self): ) input_shape = [ - self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'), + self.cfg.get('encoder_seq_length') + * self.cfg.get('micro_batch_size') + // self.cfg.get('context_parallel_size', 1), self.cfg.get('hidden_size'), ] @@ -529,12 +957,12 @@ def initialize_ub_func(self): def training_step(self, dataloader_iter): """ - Our dataloaders produce a micro-batch and then we fetch - a number of microbatches depending on the global batch size and model parallel size - from the dataloader to produce a list of microbatches. - Batch should be a list of microbatches and those microbatches should on CPU. - Microbatches are then moved to GPU during the pipeline. - The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + Batch should be a list of microbatches and those microbatches should on CPU. + Microbatches are then moved to GPU during the pipeline. + The list of microbatches is then piped through the pipeline using Apex fwd/bwd functions. """ # Initialize userbuffer communicators. if self.initialize_ub: @@ -543,7 +971,7 @@ def training_step(self, dataloader_iter): # we zero grads here because we also call backward in the megatron-core fwd/bwd functions self._optimizer.zero_grad() - if self.with_distributed_adam: + if self.with_distributed_adam and not self.use_mcore_dist_optim: # hack to enable overlapping param sync and forward compute # note: the distributed optimizer monkey-patches each # parameter's __getattribute__ function so that it can @@ -554,9 +982,10 @@ def training_step(self, dataloader_iter): # manually interact with the parameter. 
modules = self.model if isinstance(self.model, list) else [self.model] for module in modules: - if isinstance(module, Float16Module): + if isinstance(module, (Float16Module, MCoreFloat16Module)): module = module.module - module = module.text_encoder.language_model + if not self.mcore_gpt: + module = module.language_model if hasattr(module, 'embedding'): for param in module.embedding.parameters(): param.data_ptr() @@ -567,38 +996,115 @@ def training_step(self, dataloader_iter): if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): self.allreduce_sequence_parallel_gradients() - if self.with_distributed_adam: - # synchronize asynchronous grad reductions - # note: not necessary, but reduces performance degradation - # from multiple simultaneous NCCL calls - self._optimizer._finish_bucket_grad_sync() + if self.cfg.get('fp8', False): + self.prev_step_training = self.training + + # Optimization: Defer the embedding GEMM Wgrads of the last PP stage to pipeline flush waiting time + if self.cfg.get('pipeline_model_parallel_size', 1) > 1 and parallel_state.is_pipeline_last_stage( + ignore_virtual=True + ): + if ( + self.cfg.get('defer_embedding_wgrad_compute', False) and self.mcore_gpt + ): # Silently ignore the optimization if MCORE is not used + module_list = self.get_model_module_list() + if len(module_list) > 1: + embedding_module = module_list[-1] + else: + embedding_module = module_list[0] + + embedding_activation_buffer = embedding_module.embedding_activation_buffer + grad_output_buffer = embedding_module.grad_output_buffer + weight = embedding_module.output_layer.weight + + drain_embedding_wgrad_compute( + embedding_module.config, embedding_activation_buffer, grad_output_buffer, weight + ) + + # when using sequence parallelism, the sequence parallel layernorm grads must be all-reduced + if self.cfg.get('tensor_model_parallel_size', 1) > 1 and self.cfg.get('sequence_parallel', False): + self.megatron_timer_start('allreduce_sequence_parallel_gradients', log_level=1) + self.allreduce_sequence_parallel_gradients() + self.megatron_timer_stop('allreduce_sequence_parallel_gradients') + + self.megatron_timer_start('gradient_allreduce', log_level=1) + if self.use_fsdp: + # Reduce the gradients omitted from FSDP-sharding + self.allreduce_fsdp_sharding_omitted_gradients() + elif self.with_distributed_adam: + if not self.use_mcore_dist_optim: + # synchronize asynchronous grad reductions + # note: not necessary, but reduces performance degradation + # from multiple simultaneous NCCL calls + self._optimizer._finish_bucket_grad_sync() + # else: Mcore distributed optim calls finalize_model_grads to finish grad sync elif self.megatron_amp_O2: # when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously) - # if self.cfg.get('pipeline_model_parallel_size', 1) > 1 or self.cfg.get('sequence_parallel', False): - # # main grads are stored in the MainParamsOptimizer wrapper - self._optimizer.allreduce_main_grads() + if ( + self.cfg.get('pipeline_model_parallel_size', 1) > 1 + or self.cfg.get('sequence_parallel', False) + or not self.cfg.get('async_grad_allreduce', True) + ): + # main grads are stored in the MainParamsOptimizer wrapper + self._optimizer.allreduce_main_grads() else: # async grad allreduce is not currently implemented for O1/autocasting mixed precision training # so we all-reduce gradients after the pipeline self.allreduce_gradients() # @sangkug we think this is causing memory to blow up (hurts perf) + 
self.megatron_timer_stop('gradient_allreduce') + + if ( + not self.use_mcore_dist_optim + and self.cfg.get('pipeline_model_parallel_size', 1) > 1 + and self.cfg.get('share_embeddings_and_output_weights', True) + ): + self.megatron_timer_start('allreduce_first_last_embeddings', log_level=1) + # when using pipeline parallelism the first and last stage must keep embeddings in sync + self.allreduce_first_last_embeddings() + self.megatron_timer_stop('allreduce_first_last_embeddings') + + if self.log_memory_usage: + mem_reserved = torch.cuda.max_memory_reserved() + self.log( + 'peak_memory_usage', + mem_reserved, + prog_bar=True, + rank_zero_only=True, + batch_size=1, + ) ## logging - # we can only log on one rank if it is rank zero so we broadcast from last rank - # we can avoid this broadcast by updating the PTL log function to accept specific ranks - torch.distributed.broadcast(loss_mean, get_last_rank()) - - if self.cfg.precision in [16, '16', '16-mixed']: - loss_scale = self.trainer.precision_plugin.scaler._scale - if loss_scale is not None: - self.log('loss_scale', loss_scale, batch_size=1) + if self.log_train_loss: + # When using pipeline parallelism, loss is calculated only in the last pipeline stage and + # it should be casted to other pipeline stages for logging. + # we can avoid this broadcast by updating the PTL log function to accept specific ranks + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + if torch.distributed.get_rank() == get_last_rank(): + torch.distributed.send(loss_mean, 0) + elif torch.distributed.get_rank() == 0: + torch.distributed.recv(loss_mean, get_last_rank()) + self.log('reduced_train_loss', loss_mean, prog_bar=True, rank_zero_only=True, batch_size=1) + + # (@adithyare) we need to check for the _scaler attribute to enable pp>1 for adapter training + if self.cfg.precision == 16 and hasattr(self.trainer.precision_plugin.scaler, "_scale"): + loss_scale = self.trainer.precision_plugin.scaler._scale + if loss_scale is not None: + self.log('loss_scale', loss_scale, batch_size=1) - self.log('reduced_train_loss', loss_mean, prog_bar=True, rank_zero_only=True, batch_size=1) lr = self._optimizer.param_groups[0]['lr'] self.log('lr', lr, rank_zero_only=True, batch_size=1) - self.log('global_step', self.trainer.global_step + 1, prog_bar=True, rank_zero_only=True, batch_size=1) + self.log( + 'global_step', + self.trainer.global_step + 1, + prog_bar=True, + rank_zero_only=True, + batch_size=1, + ) + + consumed_samples = self._compute_consumed_samples_after_training_step() + # TODO: make sure compute_consumed_samples works for pipeline parallelism self.log( 'consumed_samples', - self.compute_consumed_samples(self.trainer.global_step + 1 - self.init_global_step), + consumed_samples, prog_bar=True, rank_zero_only=True, batch_size=1, @@ -607,20 +1113,20 @@ def training_step(self, dataloader_iter): return loss_mean def backward(self, *args, **kwargs): - """ LightningModule hook to do backward. - We want this to do nothing since we run backward in the fwd/bwd functions from apex. - No need to call it here. + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from apex. + No need to call it here. """ pass def optimizer_zero_grad(self, *args, **kwargs): - """ LightningModule hook to zero grad. - We want this to do nothing as we are zeroing grads during the training_step. + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. 
""" pass def _append_sequence_parallel_module_grads(self, module, grads): - """ Helper method for allreduce_sequence_parallel_gradients""" + """Helper method for allreduce_sequence_parallel_gradients""" for param in module.parameters(): sequence_parallel_param = getattr(param, 'sequence_parallel', False) @@ -632,9 +1138,9 @@ def _append_sequence_parallel_module_grads(self, module, grads): grads.append(grad.data) def allreduce_sequence_parallel_gradients(self): - """ All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. - Modified from megatron-lm: - https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 + """All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. + Modified from megatron-lm: + https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 """ grads = [] @@ -650,7 +1156,18 @@ def allreduce_sequence_parallel_gradients(self): buf.copy_(synced) def get_forward_output_and_loss_func(self): - loss_func = ClipLoss(local_loss=self.cfg.local_loss, gather_with_grad=self.cfg.gather_with_grad,) + if self.cfg.get("use_siglip", False): + # TODO(yuya): fix rank + loss_func = SigLipLoss( + rank=parallel_state.get_data_parallel_rank(), + world_size=parallel_state.get_data_parallel_world_size(), + group=parallel_state.get_data_parallel_group(), + ) + else: + loss_func = ClipLoss( + local_loss=self.cfg.local_loss, + gather_with_grad=self.cfg.gather_with_grad, + ) def fwd_output_and_loss_func(dataloader_iter, model): batch, _, _ = next(dataloader_iter) @@ -690,7 +1207,8 @@ def zero_shot_classifier(self): texts = texts.cuda(non_blocking=True) # TODO (yuya): distributed not working with torch.cuda.amp.autocast( - enabled=self.autocast_dtype in (torch.half, torch.bfloat16), dtype=self.autocast_dtype, + enabled=self.autocast_dtype in (torch.half, torch.bfloat16), + dtype=self.autocast_dtype, ): class_embeddings = text_encoder(texts) class_embedding = F.normalize(class_embeddings, dim=-1).mean(dim=0) @@ -726,7 +1244,8 @@ def accuracy(output, target, topk=(1,)): target = target.cuda(non_blocking=True) # predict with torch.cuda.amp.autocast( - enabled=self.autocast_dtype in (torch.half, torch.bfloat16), dtype=self.autocast_dtype, + enabled=self.autocast_dtype in (torch.half, torch.bfloat16), + dtype=self.autocast_dtype, ): image_features = vision_encoder(images) image_features = F.normalize(image_features, dim=-1) @@ -745,10 +1264,10 @@ def accuracy(output, target, topk=(1,)): def validation_step(self, dataloader_iter): """ - Our dataloaders produce a micro-batch and then we fetch - a number of microbatches depending on the global batch size and model parallel size - from the dataloader to produce a list of microbatches. - The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. """ + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions.""" # Initialize userbuffer communicators. 
if self.initialize_ub: self.initialize_ub_func() @@ -801,7 +1320,9 @@ def build_train_valid_test_datasets(self): raise ValueError("limit_val_batches must be an integer or float less than or equal to 1.0.") self._train_ds, self._validation_ds = build_train_valid_datasets( - model_cfg=self.cfg, consumed_samples=self.compute_consumed_samples(0), tokenizer=self.tokenizer, + model_cfg=self.cfg, + consumed_samples=self.compute_consumed_samples(0), + tokenizer=self.tokenizer, ) self._test_ds = None @@ -816,7 +1337,7 @@ def build_train_valid_test_datasets(self): return self._train_ds, self._validation_ds, self._test_ds def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. + """PTL hook that is executed after DDP spawns. We setup datasets here as megatron datasets require DDP to instantiate. See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. Args: @@ -909,23 +1430,18 @@ def setup_test_data(self, cfg): f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' ) self._test_dl = torch.utils.data.DataLoader( - self._test_ds, batch_size=self._micro_batch_size, num_workers=cfg.num_workers, pin_memory=True, + self._test_ds, + batch_size=self._micro_batch_size, + num_workers=cfg.num_workers, + pin_memory=True, ) def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any: raise NotImplementedError - def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: - """ PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device - When using pipeline parallelism, we need the global batch to remain on the CPU, - since the memory overhead will be too high when using a large number of microbatches. - Microbatches are transferred from CPU to GPU inside the pipeline. - """ - return batch - def _validate_trainer(self): - """ Certain trainer configurations can break training. - Here we try to catch them and raise an error. + """Certain trainer configurations can break training. + Here we try to catch them and raise an error. """ if self.trainer.accumulate_grad_batches > 1: raise ValueError( @@ -961,3 +1477,178 @@ def parameters(self): return itertools.chain.from_iterable(module.parameters() for module in self.model) else: return self.model.parameters() + + def build_transformer_config(self, model_cfg=None) -> TransformerConfig: + """Builds the megatron core gpt transformer config for the model. + For attributes in the nemo model config that are the same + as the megatron core TransformerConfig, we will use the value from the nemo model config. + For attributes in TransformerConfig that are not in the nemo model config, we add custom logic. + """ + if model_cfg is None: + model_cfg = self.cfg + normalization = model_cfg.get('normalization', 'layernorm').lower() + layernorm_zero_centered_gamma = model_cfg.get('normalization', 'layernorm') == 'layernorm1p' + if normalization == 'layernorm': + normalization = 'LayerNorm' + elif normalization == 'rmsnorm': + normalization = 'RMSNorm' + elif normalization == 'layernorm1p': + normalization = 'LayerNorm' + layernorm_zero_centered_gamma = True + else: + logging.warning( + f"The normalization type: {normalization} might not be supported in megatron core." + f"Supported types are LayerNorm and RMSNorm." 
+ ) + + ub_tp_comm_overlap = model_cfg.get('ub_tp_comm_overlap', False) + + if not model_cfg.get('fp8', False): + fp8 = None + elif model_cfg.get('fp8_e4m3', False): + fp8 = 'e4m3' + elif model_cfg.get('fp8_hybrid', False): + fp8 = 'hybrid' + else: + raise ValueError(f"fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.") + + # any configs that are not in the nemo model config will be added here + model_specific_configs = { + 'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma, + 'normalization': normalization, + 'fp8': fp8, + 'tp_comm_overlap': ub_tp_comm_overlap, + # MoE related + 'num_moe_experts': model_cfg.get('num_moe_experts', None), + 'moe_router_load_balancing_type': model_cfg.get('moe_router_load_balancing_type', 'aux_loss'), + 'moe_router_topk': model_cfg.get('moe_router_topk', 2), + 'moe_grouped_gemm': model_cfg.get('moe_grouped_gemm', False), + 'moe_aux_loss_coeff': model_cfg.get( + 'moe_aux_loss_coeff', 0 + ), # 1e-2 would be a good start value for load balance loss. + 'moe_z_loss_coeff': model_cfg.get('moe_z_loss_coeff', None), # 1e-3 would be a good start value for z-loss + 'moe_input_jitter_eps': model_cfg.get('moe_input_jitter_eps', None), + 'moe_token_dropping': model_cfg.get('moe_token_dropping', False), # TODO: Support token dropping. + } + if model_specific_configs['num_moe_experts'] is not None: + assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE' + elif not mcore_supports_moe(): + if 'num_moe_experts' in model_specific_configs: + del model_specific_configs['num_moe_experts'] + moe_keys = list(filter(lambda x: x.startswith('moe_'), model_specific_configs.keys())) + for k in moe_keys: + del model_specific_configs[k] + + # create a dictionary copy of the model config + cfg = OmegaConf.to_container(model_cfg, resolve=True) + + # create a dict to store the transformer config arguments + transformer_config_dict = {} + + # get model parallel configs from the base class + model_parallel_config = self.build_model_parallel_config() + + add_bias_linear = model_cfg.get('bias', True) + add_qkv_bias = model_cfg.get('qkv_bias', False) + + activation = model_cfg.get('activation', 'gelu') + gated_linear_unit = activation.endswith('glu') + # TODO: need to check which activation functions are supported in mcore + activation_func = activation_to_func(activation, openai_gelu=model_cfg.get("openai_gelu", False)) + + normalization = model_cfg.get('normalization', 'LayerNorm') + + init_method_std = model_cfg.get('init_method_std', 0.02) + # default used in mcore + init_method = init_method_normal(init_method_std) + + output_layer_init_method = init_method + num_layers = model_cfg.get('num_layers', 1) + use_scaled_init_method = model_cfg.get('use_scaled_init_method', True) + if use_scaled_init_method: + output_layer_init_method = scaled_init_method_normal(init_method_std, num_layers=num_layers) + + attention_softmax_in_fp32 = False # not currently used in NeMo unless apply_query_key_layer_scaling is True + apply_query_key_layer_scaling = model_cfg.get('apply_query_key_layer_scaling', False) + + rotary_interleaved = model_cfg.get('rotary_interleaved', False) + + fp16_enabled = self.trainer.precision in [16, '16', '16-mixed'] + if apply_query_key_layer_scaling: + if fp16_enabled: + os.environ["NVTE_APPLY_QK_LAYER_SCALING"] = "1" + else: + logging.warning( + "apply_query_key_layer_scaling is only enabled when using FP16, setting it to False " + "and setting NVTE_APPLY_QK_LAYER_SCALING=0" + ) + os.environ["NVTE_APPLY_QK_LAYER_SCALING"] = "0" + 
apply_query_key_layer_scaling = False + + if apply_query_key_layer_scaling: + attention_softmax_in_fp32 = True + + bias_activation_fusion = model_cfg.get('bias_activation_fusion', True) + + bias_dropout_fusion = model_cfg.get('bias_dropout_add_fusion', True) + + apply_rope_fusion = model_cfg.get('apply_rope_fusion', False) + + # TODO: need to check if recompute APIs are matching up properly + recompute_granularity = model_cfg.get('activations_checkpoint_granularity', None) + recompute_method = model_cfg.get('activations_checkpoint_method', None) + recompute_num_layers = model_cfg.get('activations_checkpoint_num_layers', None) + + # any configs that are not in the nemo model config will be added here + config_mapping = { + 'apply_query_key_layer_scaling': apply_query_key_layer_scaling, + 'apply_residual_connection_post_layernorm': False, # we don't use this in NeMo + 'layernorm_zero_centered_gamma': False, + 'add_bias_linear': add_bias_linear, + 'add_qkv_bias': add_qkv_bias, + 'gated_linear_unit': gated_linear_unit, + 'activation_func': activation_func, + 'normalization': normalization, + 'init_method': init_method, + 'output_layer_init_method': output_layer_init_method, + 'attention_softmax_in_fp32': attention_softmax_in_fp32, + 'bias_activation_fusion': bias_activation_fusion, + 'bias_dropout_fusion': bias_dropout_fusion, + 'apply_rope_fusion': apply_rope_fusion, + 'recompute_granularity': recompute_granularity, + 'recompute_method': recompute_method, + 'recompute_num_layers': recompute_num_layers, + 'distribute_saved_activations': False, # not currently used in NeMo + 'fp8': None, + 'rotary_interleaved': rotary_interleaved, + 'deallocate_pipeline_outputs': True, + } + + # populate the transformer config dict + for field in fields(TransformerConfig): + # config mapping has second highest priority + if field.name in config_mapping: + transformer_config_dict[field.name] = config_mapping[field.name] + # then config + elif field.name in cfg: + transformer_config_dict[field.name] = cfg[field.name] + # then model parallel config + elif field in fields(model_parallel_config): + transformer_config_dict[field.name] = getattr(model_parallel_config, field.name) + else: + logging.warning( + f"The model: {self} does not have field.name: {field.name} in its cfg. " + f"Add this key to cfg or config_mapping to make to make it configurable." 
+ ) + + transformer_config = TransformerConfig(**transformer_config_dict) + + for key, value in model_specific_configs.items(): + setattr(transformer_config, key, value) + + # pass mcore customization configs directly to mcore + mcore_customization_config_dict = model_cfg.get('mcore_customization_config', {}) + for key, value in mcore_customization_config_dict.items(): + setattr(transformer_config, key, value) + + return transformer_config diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 7308d3db3f91..4ded9a42db4f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -484,7 +484,7 @@ def build_transformer_config(self) -> TransformerConfig: activation = self.cfg.get('activation', 'gelu') gated_linear_unit = activation.endswith('glu') # TODO: need to check which activation functions are supported in mcore - activation_func = activation_to_func(activation) + activation_func = activation_to_func(activation, openai_gelu=self.cfg.get("openai_gelu", False)) normalization = self.cfg.get('normalization', 'LayerNorm') diff --git a/nemo/collections/nlp/parts/utils_funcs.py b/nemo/collections/nlp/parts/utils_funcs.py index c00df5de1a98..a989ff3f606c 100644 --- a/nemo/collections/nlp/parts/utils_funcs.py +++ b/nemo/collections/nlp/parts/utils_funcs.py @@ -34,14 +34,14 @@ from sklearn.metrics import classification_report, confusion_matrix from torch import Tensor -from nemo.collections.nlp.modules.common.megatron.utils import erf_gelu +from nemo.collections.nlp.modules.common.megatron.utils import ApproxGELUActivation, erf_gelu from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu as openai_gelu_func from nemo.collections.nlp.modules.common.megatron.utils import squared_relu from nemo.utils import logging def torch_dtype_from_precision(precision: Union[int, str], megatron_amp_O2: Optional[bool] = None) -> torch.dtype: - """ Mapping from PTL precision types to corresponding PyTorch parameter datatype.""" + """Mapping from PTL precision types to corresponding PyTorch parameter datatype.""" if megatron_amp_O2 is not None and megatron_amp_O2 is False: return torch.float32 @@ -56,12 +56,12 @@ def torch_dtype_from_precision(precision: Union[int, str], megatron_amp_O2: Opti def list2str(l: List[int]) -> str: - """ Converts list to a string""" + """Converts list to a string""" return ' '.join([str(x) for x in l]) def tensor2list(tensor: Tensor) -> List[Union[int, float]]: - """ Converts tensor to a list """ + """Converts tensor to a list""" return tensor.detach().cpu().tolist() @@ -168,13 +168,13 @@ def get_last_rank(): def activation_to_func(activation: str, openai_gelu: bool = False, onnx_safe: bool = False) -> Callable: - """ Converts an activation function represented as a string to a function. + """Converts an activation function represented as a string to a function. Args: activation (str): string representation of an activation function, typically gotten from the model config. openai_gelu (bool): whether to use the OpenAI GELU implementation. Used with HF compatibility. onnx_safe (bool): whether to use the ONNX-compatible implementation of GELU. - + Returns: Callable: the activation function. 
""" @@ -188,6 +188,7 @@ def activation_to_func(activation: str, openai_gelu: bool = False, onnx_safe: bo 'fast-geglu', 'fast-swiglu', 'fast-reglu', + 'approx-gelu', ] if activation not in supported_activations: @@ -208,6 +209,8 @@ def activation_to_func(activation: str, openai_gelu: bool = False, onnx_safe: bo activation_func = F.silu elif activation == 'squared-relu': activation_func = squared_relu + elif activation == 'approx-gelu': + activation_func = ApproxGELUActivation return activation_func diff --git a/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py b/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py new file mode 100644 index 000000000000..690fa74abccd --- /dev/null +++ b/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py @@ -0,0 +1,248 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Usage example: + torchrun --nproc-per-node=1 /opt/NeMo/scripts/checkpoint_converters/convert_clip_hf_to_nemo.py \ + --input_name_or_path=openai/clip-vit-large-patch14 \ + --output_path=openai_clip.nemo \ + --hparams_file=/opt/NeMo/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml + +Additionally, provide a NeMo hparams file with the correct model architecture arguments. Refer to examples/multimodal/foundation/clip/conf/megatron_clip_config.yaml. + +After conversion, you can verify with the following command: + + wget https://upload.wikimedia.org/wikipedia/commons/0/0f/1665_Girl_with_a_Pearl_Earring.jpg + torchrun --nproc-per-node=1 /opt/NeMo/examples/multimodal/vision_language_foundation/clip/megatron_clip_infer.py \ + model.restore_from_path=./openai_clip.nemo \ + image_path=./1665_Girl_with_a_Pearl_Earring.jpg \ + texts='["a dog", "a boy", "a girl"]' + +It should generate a high probability for "a girl" tag, e.g. +Given image's CLIP text probability: [('a dog', 0.0049710185), ('a boy', 0.002258187), ('a girl', 0.99277073)] + +""" + +import os +from argparse import ArgumentParser + +import torch +from omegaconf import OmegaConf +from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.trainer.trainer import Trainer +from transformers import CLIPModel + +from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import MegatronCLIPModel +from nemo.utils import AppState, logging +from nemo.utils.distributed import initialize_distributed + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + + +def get_args(): + parser = ArgumentParser() + parser.add_argument("--input_name_or_path", type=str, default="openai/clip-vit-base-patch32") + + parser.add_argument( + "--hparams_file", + type=str, + default=None, + required=True, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /opt/NeMo/examples/multimodal/vision_language_foundation/clip/conf/megatron_clip_VIT-L-14.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + + parser.add_argument("--gpus_per_node", type=int, required=False, default=1) + parser.add_argument("--tensor_model_parallel_size", type=int, required=False, default=1) + parser.add_argument("--pipeline_model_parallel_size", type=int, required=False, default=1) + parser.add_argument( + "--pipeline_model_parallel_split_rank", + type=int, + required=False, + default=None, + help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.", + ) + parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1)) + + args = parser.parse_args() + return args + + +def mapping_hf_state_dict(hf_model): + hf_state_dict = hf_model.state_dict() + hf_config = hf_model.config + key_mapping = { + "text_projection.weight": "text_encoder.head.weight", + "visual_projection.weight": "vision_encoder.head.weight", + } + + layer_mapping = { + ".layer_norm1.weight": ".self_attention.linear_qkv.layer_norm_weight", + ".layer_norm1.bias": ".self_attention.linear_qkv.layer_norm_bias", + ".layer_norm2.weight": ".mlp.linear_fc1.layer_norm_weight", + ".layer_norm2.bias": ".mlp.linear_fc1.layer_norm_bias", + ".self_attn.out_proj.weight": ".self_attention.linear_proj.weight", + ".self_attn.out_proj.bias": ".self_attention.linear_proj.bias", + ".mlp.fc1.weight": ".mlp.linear_fc1.weight", + ".mlp.fc1.bias": ".mlp.linear_fc1.bias", + ".mlp.fc2.weight": ".mlp.linear_fc2.weight", + ".mlp.fc2.bias": ".mlp.linear_fc2.bias", + ".pre_layrnorm.weight": ".ln_pre.weight", + ".pre_layrnorm.bias": ".ln_pre.bias", + ".post_layernorm.weight": ".final_layernorm.weight", + ".post_layernorm.bias": ".final_layernorm.bias", + ".embeddings.patch_embedding.weight": ".conv1.weight", + ".embeddings.class_embedding": ".class_token", + ".final_layer_norm.weight": ".final_layernorm.weight", + ".final_layer_norm.bias": ".final_layernorm.bias", + ".embeddings.token_embedding.weight": ".embedding.word_embeddings.weight", + "vision_encoder.embeddings.position_embedding.weight": "vision_encoder.position_embeddings.weight", + "text_encoder.embeddings.position_embedding.weight": "text_encoder.embedding.position_embeddings.weight", + } + + nemo_state_dict = {} + for key in hf_state_dict.keys(): + if key.startswith("text_model.encoder.layers"): + key_ = key.replace("text_model.encoder.layers", "text_encoder.decoder.layers") + elif key.startswith("vision_model.encoder.layers"): + key_ = key.replace("vision_model.encoder.layers", "vision_encoder.decoder.layers") + elif key.startswith('vision_model.'): + key_ = key.replace("vision_model.", "vision_encoder.") + elif key.startswith('text_model.'): + key_ = key.replace('text_model.', 'text_encoder.') + else: + key_ = key + for pat in key_mapping: + if key_ == pat: + key_ = key_.replace(pat, key_mapping[pat]) + for pat in layer_mapping: + if key_.endswith(pat): + key_ = key_[: -len(pat)] + layer_mapping[pat] + break + if "vision" in key_: + config = hf_config.vision_config + else: + config = hf_config.text_config + head_num = num_query_groups = config.num_attention_heads + hidden_size = config.hidden_size + head_size = hidden_size // head_num + heads_per_group = head_num // num_query_groups + + if 'q_proj.weight' in key_: + key_k = key.replace('q_proj', 'k_proj') + key_v = key.replace('q_proj', 'v_proj') + key_new = 
key_.replace('self_attn.q_proj', 'self_attention.linear_qkv') + q_weight, k_weight, v_weight = hf_state_dict[key], hf_state_dict[key_k], hf_state_dict[key_v] + + q_weight = q_weight.reshape(head_num, head_size, hidden_size) + k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size) + v_weight = v_weight.reshape(num_query_groups, head_size, hidden_size) + qkv_weight = torch.empty((0, head_size, hidden_size), device=q_weight.device) + for i in range(num_query_groups): + qkv_weight = torch.cat((qkv_weight, q_weight[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weight = torch.cat((qkv_weight, k_weight[i : i + 1, :, :])) + qkv_weight = torch.cat((qkv_weight, v_weight[i : i + 1, :, :])) + qkv_weight = qkv_weight.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + nemo_state_dict[key_new] = qkv_weight + + elif 'q_proj.bias' in key_: + key_k = key.replace('q_proj', 'k_proj') + key_v = key.replace('q_proj', 'v_proj') + key_new = key_.replace('self_attn.q_proj', 'self_attention.linear_qkv') + q_bias, k_bias, v_bias = hf_state_dict[key], hf_state_dict[key_k], hf_state_dict[key_v] + + q_bias = q_bias.reshape(head_num, head_size) + k_bias = k_bias.reshape(num_query_groups, head_size) + v_bias = v_bias.reshape(num_query_groups, head_size) + qkv_bias = torch.empty((0, head_size), device=q_bias.device) + for i in range(num_query_groups): + qkv_bias = torch.cat((qkv_bias, q_bias[i * heads_per_group : (i + 1) * heads_per_group, :])) + qkv_bias = torch.cat((qkv_bias, k_bias[i : i + 1, :])) + qkv_bias = torch.cat((qkv_bias, v_bias[i : i + 1, :])) + qkv_bias = qkv_bias.reshape([head_size * (head_num + 2 * num_query_groups)]) + nemo_state_dict[key_new] = qkv_bias + elif not ('k_proj' in key_ or 'v_proj' in key_ or 'position_ids' in key_): + nemo_state_dict[key_] = hf_state_dict[key] + + nemo_state_dict["vision_encoder.class_token"] = nemo_state_dict["vision_encoder.class_token"].reshape(1, 1, -1) + + return nemo_state_dict + + +def convert(local_rank, rank, world_size, args): + app_state = AppState() + app_state.data_parallel_rank = 0 + num_nodes = world_size // args.gpus_per_node + trainer = Trainer( + devices=args.gpus_per_node, num_nodes=num_nodes, accelerator='gpu', plugins=[TorchElasticEnvironment()] + ) + + app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = args.tensor_model_parallel_size + + # no use atm, use to split ranks in encoder/decoder models. + if args.pipeline_model_parallel_size > 1 and args.model_type in []: + if args.pipeline_model_parallel_split_rank is not None: + app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_split_rank + else: + if args.pipeline_model_parallel_size % 2 != 0: + raise ValueError( + f"Pipeline model parallel size {args.pipeline_model_parallel_size} must be even if split rank is not specified." + ) + else: + # If split rank is not set, then we set it to be pipeline_model_parallel_size // 2 - this is because in most cases we have the same number of enc/dec layers. 
+ app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_size // 2 + else: + app_state.pipeline_model_parallel_split_rank = None + + app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size + + parallel_state.initialize_model_parallel( + tensor_model_parallel_size=app_state.tensor_model_parallel_size, + pipeline_model_parallel_size=app_state.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank, + ) + + app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank() + app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank() + + cfg = OmegaConf.load(args.hparams_file) + cfg.model.mcore_gpt = True + cfg.model.transformer_engine = True + cfg.model.text.position_embedding_type = "learned_absolute" + cfg.model.vision.position_embedding_type = "learned_absolute" + + model = MegatronCLIPModel(cfg.model, trainer) + + hf_model = CLIPModel.from_pretrained(args.input_name_or_path) + state_dict = mapping_hf_state_dict(hf_model) + + model.model.load_state_dict(state_dict, strict=False) + + model.save_to(args.output_path) + + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + local_rank, rank, world_size = initialize_distributed(args) + convert(local_rank, rank, world_size, args) diff --git a/scripts/checkpoint_converters/convert_siglip_hf_to_nemo.py b/scripts/checkpoint_converters/convert_siglip_hf_to_nemo.py new file mode 100644 index 000000000000..97a9d557f78b --- /dev/null +++ b/scripts/checkpoint_converters/convert_siglip_hf_to_nemo.py @@ -0,0 +1,380 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Requires HF transformers updated to support Gemma Models + python3 /opt/NeMo/scripts/nlp_language_modeling/convert_gemma_hf_to_nemo.py \ + --input_name_or_path /path/to/gemma/checkpoints/hf/7b \ + --output_path /path/to/gemma-7b.nemo \ + --tokenizer_path /path/to/tokenizer.model +""" + +import os +from argparse import ArgumentParser + +import torch +from omegaconf import OmegaConf +from transformers import AutoModel, AutoProcessor + +from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import MegatronCLIPModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + + +def create_rename_keys(num_hidden_layers): + rename_keys = [] + for i in range(num_hidden_layers): + rename_keys.extend( + [ + ( + f"text_model.encoder.layers.{i}.self_attn.k_proj.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_k.weight", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.k_proj.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_k.bias", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.q_proj.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_q.weight", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.q_proj.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_q.bias", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.v_proj.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_v.weight", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.v_proj.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_v.bias", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.out_proj.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_proj.weight", + ), + ( + f"text_model.encoder.layers.{i}.self_attn.out_proj.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_proj.bias", + ), + ( + f"text_model.encoder.layers.{i}.layer_norm1.weight", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_weight", + ), + ( + f"text_model.encoder.layers.{i}.layer_norm1.bias", + f"model.text_encoder.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_bias", + ), + ( + f"text_model.encoder.layers.{i}.mlp.fc1.weight", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc1.weight", + ), + ( + f"text_model.encoder.layers.{i}.mlp.fc1.bias", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc1.bias", + ), + ( + f"text_model.encoder.layers.{i}.mlp.fc2.weight", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc2.weight", + ), + ( + f"text_model.encoder.layers.{i}.mlp.fc2.bias", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc2.bias", + ), + ( + f"text_model.encoder.layers.{i}.layer_norm2.weight", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc1.layer_norm_weight", + ), + ( + f"text_model.encoder.layers.{i}.layer_norm2.bias", + f"model.text_encoder.decoder.layers.{i}.mlp.linear_fc1.layer_norm_bias", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.k_proj.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_k.weight", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.k_proj.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_k.bias", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.v_proj.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_v.weight", + ), + 
( + f"vision_model.encoder.layers.{i}.self_attn.v_proj.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_v.bias", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.q_proj.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_q.weight", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.q_proj.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_q.bias", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.out_proj.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_proj.weight", + ), + ( + f"vision_model.encoder.layers.{i}.self_attn.out_proj.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_proj.bias", + ), + ( + f"vision_model.encoder.layers.{i}.layer_norm1.weight", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_weight", + ), + ( + f"vision_model.encoder.layers.{i}.layer_norm1.bias", + f"model.vision_encoder.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_bias", + ), + ( + f"vision_model.encoder.layers.{i}.mlp.fc1.weight", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc1.weight", + ), + ( + f"vision_model.encoder.layers.{i}.mlp.fc1.bias", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc1.bias", + ), + ( + f"vision_model.encoder.layers.{i}.mlp.fc2.weight", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc2.weight", + ), + ( + f"vision_model.encoder.layers.{i}.mlp.fc2.bias", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc2.bias", + ), + ( + f"vision_model.encoder.layers.{i}.layer_norm2.weight", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc1.layer_norm_weight", + ), + ( + f"vision_model.encoder.layers.{i}.layer_norm2.bias", + f"model.vision_encoder.decoder.layers.{i}.mlp.linear_fc1.layer_norm_bias", + ), + ] + ) + + rename_keys.extend( + [ + ("logit_scale", "model.logit_scale"), + ("logit_bias", "model.logit_bias"), + ("vision_model.embeddings.patch_embedding.weight", "model.vision_encoder.conv1.weight"), + ("vision_model.embeddings.patch_embedding.bias", "model.vision_encoder.conv1.bias"), + ("vision_model.embeddings.position_embedding.weight", "model.vision_encoder.position_embeddings.weight"), + ("vision_model.post_layernorm.weight", "model.vision_encoder.final_layernorm.weight"), + ("vision_model.post_layernorm.bias", "model.vision_encoder.final_layernorm.bias"), + ("vision_model.head.probe", "model.vision_encoder.head.probe"), + ( + "vision_model.head.attention.in_proj_weight", + "model.vision_encoder.head.cross_attention.linear_qkv.weight", + ), + ("vision_model.head.attention.in_proj_bias", "model.vision_encoder.head.cross_attention.linear_qkv.bias"), + ( + "vision_model.head.attention.out_proj.weight", + "model.vision_encoder.head.cross_attention.linear_proj.weight", + ), + ( + "vision_model.head.attention.out_proj.bias", + "model.vision_encoder.head.cross_attention.linear_proj.bias", + ), + ("vision_model.head.layernorm.weight", "model.vision_encoder.head.mlp.linear_fc1.layer_norm_weight"), + ("vision_model.head.layernorm.bias", "model.vision_encoder.head.mlp.linear_fc1.layer_norm_bias"), + ("vision_model.head.mlp.fc1.weight", "model.vision_encoder.head.mlp.linear_fc1.weight"), + ("vision_model.head.mlp.fc1.bias", "model.vision_encoder.head.mlp.linear_fc1.bias"), + ("vision_model.head.mlp.fc2.weight", "model.vision_encoder.head.mlp.linear_fc2.weight"), + ("vision_model.head.mlp.fc2.bias", "model.vision_encoder.head.mlp.linear_fc2.bias"), + 
("text_model.embeddings.token_embedding.weight", "model.text_encoder.embedding.word_embeddings.weight"), + ( + "text_model.embeddings.position_embedding.weight", + "model.text_encoder.embedding.position_embeddings.weight", + ), + ("text_model.final_layer_norm.weight", "model.text_encoder.final_layernorm.weight"), + ("text_model.final_layer_norm.bias", "model.text_encoder.final_layernorm.bias"), + ("text_model.head.weight", "model.text_encoder.head.weight"), + ("text_model.head.bias", "model.text_encoder.head.bias"), + ] + ) + + return rename_keys + + +def rename_model_keys(model_state_dict, rename_keys): + """ + Rename keys in the model's state dictionary based on the provided mappings. + + Parameters: + model_state_dict (dict): The state dictionary of the model. + rename_keys (list): A list of tuples with the mapping (old_key, new_key). + + Returns: + dict: A new state dictionary with updated key names. + """ + + # Create a new state dictionary with updated key names + new_state_dict = {} + + # Track keys from the original state dict to ensure all are processed + remaining_keys = set(model_state_dict.keys()) + + # Iterate over the rename mappings + for old_key, new_key in rename_keys: + if old_key in model_state_dict: + # Rename the key and remove it from the tracking set + new_state_dict[new_key] = model_state_dict[old_key] + remaining_keys.remove(old_key) + + # Check if any keys were not converted from old to new + for old_key in remaining_keys: + print(f"Warning: Key '{old_key}' was not converted.") + + return new_state_dict + + +def adjust_tensor_shapes(model, nemo_state_dict): + """ + Adapt tensor shapes in the state dictionary to ensure compatibility with a different model structure. + + Parameters: + nemo_state_dict (dict): The state dictionary of the model. + + Returns: + dict: The updated state dictionary with modified tensor shapes for compatibility. + """ + model_config = model.cfg + + # Note: For 'key' and 'value' weight and biases, NeMo uses a consolidated tensor 'query_key_value'. + for key_ in list(nemo_state_dict.keys()): + if "vision" in key_: + config = model_config["vision"] + else: + config = model_config["text"] + num_query_groups = head_num = config["num_attention_heads"] + hidden_size = config["hidden_size"] + head_size = hidden_size // head_num + heads_per_group = head_num // num_query_groups + if "bias" in key_: + hidden_size = 1 + + if 'head.cross_attention.linear_qkv.' in key_: + key_q = key_.replace('linear_qkv', 'linear_q') + key_kv = key_.replace('linear_qkv', 'linear_kv') + q_weight, k_weight, v_weight = nemo_state_dict[key_].chunk(3) + k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size) + v_weight = v_weight.reshape(num_query_groups, head_size, hidden_size) + kv_weight = torch.empty((0, head_size, hidden_size), device=q_weight.device) + for i in range(num_query_groups): + kv_weight = torch.cat((kv_weight, k_weight[i : i + 1, :, :])) + kv_weight = torch.cat((kv_weight, v_weight[i : i + 1, :, :])) + kv_weight = kv_weight.reshape([head_size * 2 * num_query_groups, hidden_size]) + if "bias" in key_: + kv_weight = kv_weight.squeeze(-1) + nemo_state_dict[key_q] = q_weight + nemo_state_dict[key_kv] = kv_weight + del nemo_state_dict[key_] + + if 'self_attention.linear_q.' 
in key_: + key_q = key_ + key_k = key_.replace('linear_q', 'linear_k') + key_v = key_.replace('linear_q', 'linear_v') + key_qkv = key_.replace('linear_q', 'linear_qkv') + + # [(head_num + 2 * num_query_groups) * head_size, hidden_size] + # -> [head_num, head_size, hidden_size], 2 * [num_query_groups, head_size, hidden_size] + q_weight, k_weight, v_weight = nemo_state_dict[key_q], nemo_state_dict[key_k], nemo_state_dict[key_v] + q_weight = q_weight.reshape(head_num, head_size, hidden_size) + k_weight = k_weight.reshape(num_query_groups, head_size, hidden_size) + v_weight = v_weight.reshape(num_query_groups, head_size, hidden_size) + + qkv_weight = torch.empty((0, head_size, hidden_size), device=q_weight.device) + for i in range(num_query_groups): + qkv_weight = torch.cat((qkv_weight, q_weight[i * heads_per_group : (i + 1) * heads_per_group, :, :])) + qkv_weight = torch.cat((qkv_weight, k_weight[i : i + 1, :, :])) + qkv_weight = torch.cat((qkv_weight, v_weight[i : i + 1, :, :])) + qkv_weight = qkv_weight.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + if "bias" in key_: + qkv_weight = qkv_weight.squeeze(-1) + nemo_state_dict[key_qkv] = qkv_weight + del nemo_state_dict[key_q], nemo_state_dict[key_k], nemo_state_dict[key_v] + + return nemo_state_dict + + +def adjust_nemo_config(model_config, ref_config): + model_config["encoder_seq_length"] = ref_config["max_position_embeddings"] + model_config["num_layers"] = ref_config["num_hidden_layers"] + model_config["ffn_hidden_size"] = ref_config["intermediate_size"] + model_config["hidden_size"] = ref_config["hidden_size"] + model_config["num_attention_heads"] = ref_config["num_attention_heads"] + model_config["num_query_groups"] = ref_config["num_key_value_heads"] + model_config["kv_channels"] = ref_config["head_dim"] + model_config["layernorm_epsilon"] = ref_config["rms_norm_eps"] + return model_config + + +def get_args(): + parser = ArgumentParser() + parser.add_argument("--input_name_or_path", type=str) + parser.add_argument("--tokenizer_path", type=str) + parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join( + os.path.dirname(__file__), + '../../examples/multimodal/vision_language_foundation/clip/conf/megatron_siglip_so400m_14_384.yaml', + ), + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, help="Path to output .nemo file.") + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weight saved" + ) + + args = parser.parse_args() + return args + + +def convert(args): + logging.info(f"Loading checkpoint from HF: `{args.input_name_or_path}`") + hf_model = AutoModel.from_pretrained(args.input_name_or_path) + # hf_processor = AutoProcessor.from_pretrained(args.input_name_or_path) + logging.info("HF Model loading done.") + + nemo_config = OmegaConf.load(args.hparams_file) + + nemo_config.trainer["precision"] = args.precision + trainer = MegatronTrainerBuilder(nemo_config).create_trainer() + model = MegatronCLIPModel(nemo_config.model, trainer) + + assert nemo_config.model.text.num_layers == nemo_config.model.vision.num_layers + rename_keys = create_rename_keys(nemo_config.model.text.num_layers) + old_state_dict = hf_model.state_dict() + new_state_dict = rename_model_keys(model_state_dict=old_state_dict, rename_keys=rename_keys) + + nemo_state_dict = adjust_tensor_shapes(model, new_state_dict) + model.load_state_dict(nemo_state_dict, strict=False) + + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) + model.save_to(args.output_path) + logging.info(f'NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) From b4fe4a595575614d8c054ea28cecc02c90f946b6 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:23:28 -0700 Subject: [PATCH 07/13] Add REST API to deploy module (#9539) * Add REST API and FastAPI to deploy module Signed-off-by: Abhishree * Add NemoQuery and requirements Signed-off-by: Abhishree * Edit path for config.json Signed-off-by: Abhishree * Add modifications for REST API for the correct functionality Move service dir under deploy Use NeMoQueryLLM instead of NemoQuery Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply isort and black reformatting Signed-off-by: pre-commit-ci[bot] * Change default port for REST Service Change default port for REST service as Triton server also used the same port as default. 
Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: athitten --------- Signed-off-by: Abhishree Signed-off-by: pre-commit-ci[bot] Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Signed-off-by: athitten Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] Co-authored-by: athitten --- nemo/deploy/service/__init__.py | 14 +++++ nemo/deploy/service/config.json | 5 ++ nemo/deploy/service/rest_model_api.py | 87 +++++++++++++++++++++++++++ requirements/requirements_infer.txt | 4 +- scripts/deploy/nlp/deploy_triton.py | 30 ++++++++- 5 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 nemo/deploy/service/__init__.py create mode 100644 nemo/deploy/service/config.json create mode 100644 nemo/deploy/service/rest_model_api.py diff --git a/nemo/deploy/service/__init__.py b/nemo/deploy/service/__init__.py new file mode 100644 index 000000000000..0349454da9e1 --- /dev/null +++ b/nemo/deploy/service/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .rest_model_api import app diff --git a/nemo/deploy/service/config.json b/nemo/deploy/service/config.json new file mode 100644 index 000000000000..d3b3440dd97b --- /dev/null +++ b/nemo/deploy/service/config.json @@ -0,0 +1,5 @@ +{ + "triton_service_port": 8000, + "triton_service_ip": "0.0.0.0", + "triton_request_timeout": 60 + } \ No newline at end of file diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py new file mode 100644 index 000000000000..5c49370fd45f --- /dev/null +++ b/nemo/deploy/service/rest_model_api.py @@ -0,0 +1,87 @@ +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import os +from pathlib import Path + +from fastapi import FastAPI +from pydantic import BaseModel +from pydantic_settings import BaseSettings + +from nemo.deploy.nlp import NemoQueryLLM + + +class TritonSettings(BaseSettings): + _triton_service_port: int + _triton_service_ip: str + _triton_request_timeout: str + + def __init__(self): + super(TritonSettings, self).__init__() + try: + with open(os.path.join(Path.cwd(), 'nemo/deploy/service/config.json')) as config: + config_json = json.load(config) + self._triton_service_port = config_json["triton_service_port"] + self._triton_service_ip = config_json["triton_service_ip"] + self._triton_request_timeout = config_json["triton_request_timeout"] + except Exception as error: + print("An exception occurred:", error) + return + + @property + def triton_service_port(self): + return self._triton_service_port + + @property + def triton_service_ip(self): + return self._triton_service_ip + + @property + def triton_request_timeout(self): + return self._triton_request_timeout + + +app = FastAPI() +triton_settings = TritonSettings() + + +class CompletionRequest(BaseModel): + model: str + prompt: str + max_tokens: int = 512 + temperature: float = 1.0 + top_p: float = 0.0 + n: int = 1 + stream: bool = False + stop: str | None = None + frequency_penalty: float = 1.0 + + +@app.post("/v1/completions/") +def completions_v1(request: CompletionRequest): + try: + url = triton_settings.triton_service_ip + ":" + str(triton_settings.triton_service_port) + nq = NemoQueryLLM(url=url, model_name=request.model) + output = nq.query_llm( + prompts=[request.prompt], + max_output_len=request.max_tokens, + top_k=request.n, + top_p=request.top_p, + temperature=request.temperature, + init_timeout=triton_settings.triton_request_timeout, + ) + return { + "output": output[0][0], + } + except Exception as error: + print("An exception occurred:", error) + return {"error": "An exception occurred"} diff --git a/requirements/requirements_infer.txt b/requirements/requirements_infer.txt index c18f4e81ade3..5380398c278b 100644 --- a/requirements/requirements_infer.txt +++ b/requirements/requirements_infer.txt @@ -1,4 +1,6 @@ +fastapi nvidia-pytriton +pydantic-settings tensorstore==0.1.45 +uvicorn zarr - diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 7173c64c7438..a306231bcd61 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -18,6 +18,8 @@ import sys from pathlib import Path +import uvicorn + from nemo.deploy import DeployPyTriton LOGGER = logging.getLogger("NeMo") @@ -170,6 +172,17 @@ def get_args(argv): choices=['TensorRT-LLM', 'In-Framework'], help="Different options to deploy nemo model.", ) + parser.add_argument( + "-srs", + "--start_rest_service", + default="False", + type=str, + help="Starts the REST service for OpenAI API support", + ) + parser.add_argument( + "-sha", "--service_http_address", default="0.0.0.0", type=str, help="HTTP address for the REST Service" + ) + parser.add_argument("-sp", "--service_port", default=8080, type=int, help="Port for the REST Service") parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") args = parser.parse_args(argv) return args @@ -224,6 +237,11 @@ def get_trtllm_deployable(args): "There are {0} tables and {1} task ids.".format(len(ptuning_tables_files), len(args.task_ids)) ) + if args.start_rest_service: + if args.service_port == args.triton_port: + logging.error("REST service port and 
Triton server port cannot use the same port.") + return + trt_llm_exporter = TensorRTLLM( model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt, @@ -331,11 +349,21 @@ def nemo_deploy(argv): try: LOGGER.info("Model serving on Triton is will be started.") + if args.start_rest_service == "True": + try: + LOGGER.info("REST service will be started.") + uvicorn.run( + 'nemo.deploy.service.rest_model_api:app', + host=args.service_http_address, + port=args.service_port, + reload=True, + ) + except Exception as error: + logging.error("Error message has occurred during REST service start. Error message: " + str(error)) nm.serve() except Exception as error: LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) return - LOGGER.info("Model serving will be stopped.") nm.stop() From 4dc63e751033b0ce4f0c4b2967bdd2dbb0058d31 Mon Sep 17 00:00:00 2001 From: paul-gibbons <87940629+paul-gibbons@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:37:46 -0700 Subject: [PATCH 08/13] Mistral + Mixtral Support for NeVa (#9459) * mistral template support Signed-off-by: paul-gibbons * get_specs neva fix Signed-off-by: paul-gibbons * mistral update Signed-off-by: paul-gibbons * fixed mistral tokenization Signed-off-by: paul-gibbons * text_gen_strategy add mistral support Signed-off-by: paul-gibbons * mistral text_gen fix Signed-off-by: paul-gibbons * Cleaning up neva config Signed-off-by: paul-gibbons * fix llama_2 default text_gen_strategy Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons * fix forward() to account for new embedding optimization in MCore Signed-off-by: paul-gibbons * Apply isort and black reformatting Signed-off-by: paul-gibbons --------- Signed-off-by: paul-gibbons Signed-off-by: paul-gibbons Co-authored-by: paul-gibbons --- .../multimodal/data/neva/conversation.py | 28 ++++++++++++-- .../multimodal/data/neva/neva_dataset.py | 34 ++++++++++++++--- .../models/multimodal_llm/neva/neva_model.py | 38 ++++++++++++++++--- nemo/collections/multimodal/parts/utils.py | 4 +- .../common/text_generation_strategy.py | 21 ++++++++++ 5 files changed, 109 insertions(+), 16 deletions(-) diff --git a/nemo/collections/multimodal/data/neva/conversation.py b/nemo/collections/multimodal/data/neva/conversation.py index 43b1977aa993..10a6c9e7283d 100644 --- a/nemo/collections/multimodal/data/neva/conversation.py +++ b/nemo/collections/multimodal/data/neva/conversation.py @@ -43,6 +43,7 @@ class SeparatorStyle(Enum): PLAIN = auto() LLAMA_2 = auto() LLAMA_3 = auto() + MISTRAL = auto() NVGPT = auto() @@ -94,11 +95,15 @@ def get_prompt(self): ret += " " else: ret += role + ":" - elif self.sep_style == SeparatorStyle.LLAMA_2: - wrap_sys = lambda msg: f"<>\n{msg}\n<>\n\n" + elif self.sep_style == SeparatorStyle.LLAMA_2 or self.sep_style == SeparatorStyle.MISTRAL: + if self.sep_style == SeparatorStyle.LLAMA_2: + wrap_sys = lambda msg: f"<>\n{msg}\n<>\n\n" + else: + wrap_sys = lambda msg: f"{msg}" + ("\n" if msg else "") wrap_inst = lambda msg: f"[INST] {msg} [/INST]" ret = "" - + if self.sep_style == SeparatorStyle.MISTRAL: + ret += DEFAULT_BOS_TOKEN for i, (role, message) in enumerate(messages): if i == 0: assert message, "first message should not be none" @@ -112,7 +117,10 @@ def get_prompt(self): message = wrap_inst(message) ret += self.sep + " " + message else: - ret += " " + message + " " + self.sep2 + if self.sep_style == SeparatorStyle.LLAMA_2: + ret += " " + message + " " + self.sep2 + else: + ret += message + self.sep2 else: ret += "" ret 
= ret.lstrip(self.sep) @@ -449,6 +457,17 @@ def dict(self): version="v1_mmtag", ) +conv_mistral = Conversation( + system="", + roles=("USER", "ASSISTANT"), + version="mistral", + messages=(), + offset=0, + sep_style=SeparatorStyle.MISTRAL, + sep="", + sep2=DEFAULT_EOS_TOKEN, +) + default_conversation = conv_vicuna_v1 conv_templates = { "default": conv_vicuna_v0, @@ -466,6 +485,7 @@ def dict(self): "nvgpt": conv_nvgpt, "nv_steerlm": conv_nvgpt, "nv_dpo": conv_nv_dpo, + "mistral": conv_mistral, } if __name__ == "__main__": diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 86d45ded54cf..7eef677e13a8 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -426,6 +426,7 @@ def preprocess_llama_2( sources: dict, tokenizer, cfg, + is_mistral: bool = False, ) -> Dict: """ Preprocesses sources for the LLaMA 2 model configuration. @@ -442,7 +443,10 @@ def preprocess_llama_2( - Dict: A dictionary containing tokenized and labeled data suitable for the LLaMA 2 model. This includes tokens, labels, and any special processing as defined in the configuration. """ - conv = conversation_lib.conv_llava_llama_2.copy() + if is_mistral: + conv = conversation_lib.conv_mistral.copy() + else: + conv = conversation_lib.conv_llava_llama_2.copy() roles = {"human": conv.roles[0], "gpt": conv.roles[1]} # Apply prompt templates @@ -477,7 +481,10 @@ def preprocess_llama_2( labels = tokens.clone().detach() # Mask labels - sep = "[/INST] " + if is_mistral: + sep = "[/INST]" + else: + sep = "[/INST] " for conversation, target in zip(conversations, labels): rounds = conversation.split(conv.sep2) cur_len = 0 @@ -492,18 +499,23 @@ def preprocess_llama_2( parts[0] += sep round_len = len(tokenizer.text_to_ids(rou + conv.sep2)) - instruction_len = len(tokenizer.text_to_ids(parts[0])) - 2 + + if is_mistral: + instruction_len = len(tokenizer.text_to_ids(parts[0])) - 1 + else: + instruction_len = len(tokenizer.text_to_ids(parts[0])) - 2 + if i > 0: round_len -= 1 # Remove extra token added by sp tokenizer else: instruction_len += 1 target[cur_len : cur_len + instruction_len] = IGNORE_INDEX - cur_len += round_len target[cur_len:] = IGNORE_INDEX # Check if masking working correctly - # print([x for x in zip(tokens[0].numpy().tolist(), labels[0].numpy().tolist())]) + # masking_test =[x for x in zip(tokens[0].numpy().tolist(), labels[0].numpy().tolist())] + # print(masking_test) if add_extra_token: tokens = tokens[:, :-1].contiguous() @@ -990,7 +1002,10 @@ def expand2square(pil_img, background_color): result.paste(pil_img, ((height - width) // 2, 0)) return result - frames = expand2square(frames, tuple(int(x * 255) for x in self.processor.image_mean)) + frames = [ + expand2square(frame, tuple(int(x * 255) for x in self.processor.image_mean)) + for frame in frames + ] frames = self.processor.preprocess(frames, return_tensors='pt')['pixel_values'] else: frames = self.processor.preprocess(frames, return_tensors='pt')['pixel_values'] @@ -1057,6 +1072,13 @@ def expand2square(pil_img, background_color): self.tokenizer, self.multimodal_cfg, ) + elif self.conv_template == "mistral": + data_dict = preprocess_llama_2( + sources, + self.tokenizer, + self.multimodal_cfg, + is_mistral=True, + ) elif self.conv_template == "plain": data_dict = preprocess_plain( sources, diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py 
b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index cce40da45725..376237e89ecc 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -75,7 +75,7 @@ HAVE_APEX = False try: - from megatron.core import InferenceParams, dist_checkpointing, parallel_state + from megatron.core import InferenceParams, dist_checkpointing, parallel_state, tensor_parallel from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint @@ -154,10 +154,34 @@ def set_media(self, media): self.media = media def forward(self, input_ids, **kwargs): - media = self.media # avoid change the signature of embedding forward function + media = self.media # avoid changing the signature of embedding forward function + + # TODO: Refactor replace_media_embedding to account for MCore's embedding communication optimization + # https://github.com/NVIDIA/Megatron-LM/commit/ee423e7 changes the way we handle embeddings with sequence parallelism + # When using reduce_scatter_embeddings, word_embedding_tensor is now in the following shape: [sequence/tp, batch_size, hidden_size] + # replace_media_embedding currently expects [batch_size, sequence, hidden_size] + + # Check if reduce_scatter_embeddings is enabled in the embedding forward function + apply_reduce_scatter = getattr(self, 'reduce_scatter_embeddings', False) + + # Set reduce_scatter_embeddings to false to keep words_embedding's + # tensor dimesion the same for replace_media_embedding + if apply_reduce_scatter: + self.reduce_scatter_embeddings = False + words_embeddings = super().forward(input_ids, **kwargs) + words_embeddings = self.replace_media_embeddings(input_ids, words_embeddings, media) - return self.replace_media_embeddings(input_ids, words_embeddings, media) + # Scatter embeddings back to each TP rank if reduce_scatter_embeddings is enabled + if apply_reduce_scatter: + words_embeddings = self._apply_reduce_scatter(words_embeddings) + self.reduce_scatter_embeddings = True + + return words_embeddings + + def _apply_reduce_scatter(self, embeddings): + embeddings = embeddings.transpose(0, 1).contiguous() + return tensor_parallel.mappings.scatter_to_sequence_parallel_region(embeddings) def encode_vision_x(self, vision_x: torch.Tensor): """ @@ -193,7 +217,6 @@ def encode_vision_x(self, vision_x: torch.Tensor): def replace_media_embeddings(self, input_ids, inputs_embeds, media): if media is None: return inputs_embeds - batch_size, sequence_length, hidden_size = inputs_embeds.shape # calculate media features without gradients @@ -550,7 +573,12 @@ def dummy(): media_end_id=media_end_id, mcore_gpt=self.mcore_gpt, config=self.transformer_config, - transformer_layer_spec=get_specs(self.spec_name), + transformer_layer_spec=get_specs( + self.spec_name, + self.transformer_config.num_moe_experts, + self.transformer_config.moe_grouped_gemm, + self.transformer_engine, + ), vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), pre_process=pre_process, diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py index b6dee33d24f3..7eb72b38d0f0 100644 --- a/nemo/collections/multimodal/parts/utils.py +++ b/nemo/collections/multimodal/parts/utils.py @@ -135,8 +135,10 @@ def 
load_nemo_model_weights(nemo_path, sharded_state_dict=None): # distributed checkpointing if state_dict is None and sharded_state_dict is not None: + is_dist_ckpt = True checkpoint = dict(state_dict=sharded_state_dict) + tmp_model_weights_ckpt = os.path.join(tmpdir, save_restore_connector.model_weights_ckpt) tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' @@ -501,7 +503,7 @@ def expand2square(pil_img, background_color): result.paste(pil_img, ((height - width) // 2, 0)) return result - frames = expand2square(frames, tuple(int(x * 255) for x in processor.image_mean)) + frames = [expand2square(frame, tuple(int(x * 255) for x in self.processor.image_mean)) for frame in frames] frames = processor.preprocess(frames, return_tensors='pt')['pixel_values'] else: frames = processor.preprocess(frames, return_tensors='pt')['pixel_values'] diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index f51d53ba5944..8f8fe313a5e3 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -508,6 +508,27 @@ def neva_process_prompts(prompt, tokenizer, multimodal_cfg, num_media_latents, c copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents ) # HARDCODED FOR NOW data_dict = preprocess_llama_3(sources, tokenizer, multimodal_cfg) + elif multimodal_cfg["conv_template"] == "mistral": + record = { + 'conversations': [ + { + 'from': 'human', + 'value': prompt, + }, + { + 'from': 'gpt', + 'value': '', + }, + ], + } + for turn in record['conversations']: + if turn.get('value') is not None: + turn['value'] = re.sub('', f'{DEFAULT_IMAGE_TOKEN}\n', turn['value']) + list_data_dict.append(record) + sources = preprocess_multimodal( + copy.deepcopy(list_data_dict), multimodal_cfg, num_media_latents + ) # HARDCODED FOR NOW + data_dict = preprocess_llama_2(sources, tokenizer, multimodal_cfg, is_mistral=True) elif multimodal_cfg["conv_template"] == "v1": record = { 'conversations': [ From 38af139d8f2d3377201815d743c3c0daa05748b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 8 Jul 2024 18:49:09 +0200 Subject: [PATCH 09/13] ci: Timeout per step, not job (#9635) Signed-off-by: Oliver Koenig --- .github/workflows/_test_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 5956a23bdd67..0dbb1d50ee52 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -36,7 +36,6 @@ on: jobs: main: runs-on: ${{ inputs.RUNNER }} - timeout-minutes: ${{ inputs.TIMEOUT }} outputs: conclusion: ${{ steps.main.conclusion }} log: ${{ steps.main.outputs.log }} @@ -54,6 +53,7 @@ jobs: uses: actions/checkout@v4 - id: main name: Run main script + timeout-minutes: ${{ inputs.TIMEOUT }} run: | set +e ( From aa397d7677b164abbd6138b8980b3d5019b399f7 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:52:21 -0700 Subject: [PATCH 10/13] Adding support for mcore generate (#9566) * Adding support for mcore generate * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * adding support * Apply isort and black reformatting Signed-off-by: shanmugamr1992 * adding support --------- Signed-off-by: 
shanmugamr1992 Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: shanmugamr Co-authored-by: shanmugamr1992 --- .../conf/megatron_gpt_inference.yaml | 1 + .../megatron_gpt_inference_batch_mcore.yaml | 29 +++ .../language_modeling/megatron_gpt_eval.py | 3 + .../megatron_gpt_mcore_batch_eval.py | 222 ++++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 examples/nlp/language_modeling/conf/megatron_gpt_inference_batch_mcore.yaml create mode 100644 examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml index ce8311daf95c..056f9eb9c6ec 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml @@ -1,3 +1,4 @@ +# NOTE : This config and megatron_gpt_eval.py will be deprecated soon. Use megatron_gpt_inference_batch_mcore.yaml inference: greedy: False # Whether or not to use sampling ; use greedy decoding otherwise top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_inference_batch_mcore.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_inference_batch_mcore.yaml new file mode 100644 index 000000000000..1b34a8b5abc3 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_gpt_inference_batch_mcore.yaml @@ -0,0 +1,29 @@ +common_inference_params: + top_k: 1 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.0 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + return_log_probs: False # whether return the log prob for the sampled tokens + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 16 # 16, 32, or bf16 + use_distributed_sampler: False + +tensor_model_parallel_size: -1 +pipeline_model_parallel_size: -1 +inference_batch_times_seq_len_threshold: 1000 # If batch_size * sequence-length is smaller than this threshold we will not use pipelining, otherwise we will. +max_batch_size: 4 # Input prompts are batched using max_batch_size and sent to inference + +megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory +gpt_model_file: null # GPT nemo file path +checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the GPT training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading +prompts: # prompts for GPT inference + - "Q: How are you?" + - "Q: How big is the universe?" 
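For reference, a minimal single-GPU invocation of the new batch-inference entry point could look like the sketch below. The `.nemo` path is a placeholder and the parallel sizes are assumptions; every override key comes from the config above and the script added in this commit.

    # hypothetical run; /path/to/model.nemo is a placeholder checkpoint path
    python examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py \
        gpt_model_file=/path/to/model.nemo \
        trainer.devices=1 \
        trainer.num_nodes=1 \
        tensor_model_parallel_size=1 \
        pipeline_model_parallel_size=1 \
        common_inference_params.tokens_to_generate=64 \
        'prompts=["Q: How are you?", "Q: How big is the universe?"]'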
diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index 362a2ae3e298..b9b0d2973094 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -31,6 +31,7 @@ from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy, NLPSaveRestoreConnector from nemo.core.config import hydra_runner +from nemo.utils import logging from nemo.utils.app_state import AppState from nemo.utils.model_utils import inject_model_parallel_rank @@ -168,6 +169,7 @@ def remove_padded_prompts(response, nb_paddings): def load_model_from_config(trainer, cfg): + if cfg.gpt_model_file is not None: if ( cfg.tensor_model_parallel_size < 0 @@ -306,6 +308,7 @@ def round_to_mult(n, mult=8): def main(cfg) -> None: callbacks = [] + logging.warning("This file will be depreacted soon. Use the megatron_gpt_mcore_batch_eval.py file instead.") # enable_progress_bar is True by default. If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: callbacks.append(CustomProgressBar()) diff --git a/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py b/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py new file mode 100644 index 000000000000..988a5f8588ff --- /dev/null +++ b/examples/nlp/language_modeling/megatron_gpt_mcore_batch_eval.py @@ -0,0 +1,222 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import os +from argparse import Namespace + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.mcore_engine import MCoreEngine +from megatron.core.inference.inference_model_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, +) +from omegaconf import OmegaConf, open_dict +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.core.config import hydra_runner +from nemo.utils.app_state import AppState +from nemo.utils.model_utils import inject_model_parallel_rank + +""" +This is the script to run GPT text generation in batch mode using Megatron Core Generate function. +""" + + +@hydra_runner(config_path="conf", config_name="megatron_gpt_inference_batch_mcore") +def main(cfg) -> None: + callbacks = [] + # enable_progress_bar is True by default. 
If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks + if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: + callbacks.append(CustomProgressBar()) + # trainer required for restoring model parallel models + trainer = Trainer( + strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)), + **cfg.trainer, + callbacks=callbacks, + ) + + if cfg.gpt_model_file is not None: + if ( + cfg.tensor_model_parallel_size < 0 + or cfg.pipeline_model_parallel_size < 0 + or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 + ): + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file + model_config = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + + # with dist checkpointing we don't need to set this + if not model_config.get('mcore_gpt', False): + with open_dict(cfg): + cfg.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) + cfg.pipeline_model_parallel_size = model_config.get('pipeline_model_parallel_size', 1) + cfg.pipeline_model_parallel_split_rank = model_config.get('pipeline_model_parallel_split_rank', 0) + + assert ( + cfg.trainer.devices * cfg.trainer.num_nodes + == cfg.tensor_model_parallel_size + * cfg.pipeline_model_parallel_size + * max(1, cfg.get('expert_model_parallel_size', 1)) + ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" + + if cfg.gpt_model_file: + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file + + pretrained_cfg = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + OmegaConf.set_struct(pretrained_cfg, True) + with open_dict(pretrained_cfg): + pretrained_cfg.sequence_parallel = False + pretrained_cfg.activations_checkpoint_granularity = None + pretrained_cfg.activations_checkpoint_method = None + pretrained_cfg.precision = trainer.precision + pretrained_cfg["use_flash_attention"] = cfg.get("use_flash_attention", False) + pretrained_cfg["apply_rope_fusion"] = False + if pretrained_cfg.get('mcore_gpt', False): + # with dist checkpointing we can use the model parallel config specified by the user + pretrained_cfg.tensor_model_parallel_size = cfg.tensor_model_parallel_size + pretrained_cfg.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + pretrained_cfg.expert_model_parallel_size = cfg.get('expert_model_parallel_size', 1) + pretrained_cfg.micro_batch_size = 1 + if trainer.precision == "16": + pretrained_cfg.megatron_amp_O2 = False + elif trainer.precision in ['bf16', 'bf16-mixed'] and cfg.get('megatron_amp_O2', False): + pretrained_cfg.megatron_amp_O2 = True + model = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + override_config_path=pretrained_cfg, + save_restore_connector=save_restore_connector, + map_location=f'cuda:{trainer.local_rank}', # map_location is needed for converted models + ) + elif cfg.checkpoint_dir: + app_state = AppState() + if ( + cfg.tensor_model_parallel_size > 1 + or cfg.pipeline_model_parallel_size > 1 + or cfg.get('expert_model_parallel_size', 1) > 1 + ): + app_state.model_parallel_size = ( + cfg.tensor_model_parallel_size + * 
cfg.pipeline_model_parallel_size + * cfg.get('expert_model_parallel_size', 1) + ) + app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + app_state.expert_model_parallel_size = cfg.get('expert_model_parallel_size', 1) + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.expert_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, + expert_model_parallel_size_=cfg.get('expert_model_parallel_size', 1), + ) + checkpoint_path = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) + model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) + else: + raise ValueError("need at least a nemo file or checkpoint dir") + + model.freeze() + + # Have to turn off activations_checkpoint_method for inference + try: + model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + + args = Namespace + args.inference_batch_times_seq_len_threshold = cfg.inference_batch_times_seq_len_threshold + args.padded_vocab_size = model.padded_vocab_size + args.fp32_residual_connection = model.cfg.fp32_residual_connection + args.hidden_size = model.cfg.hidden_size + args.params_dtype = model.cfg.precision + args.max_batch_size = cfg.max_batch_size + + # We need this wrapper since mcore generate uses tokenizer.detokenize, tokenizer.tokenize to encode and decode prompts + class MCoreTokenizerWrappper: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + self.eod = tokenizer.eod + self.vocab_size = tokenizer.vocab_size + + def detokenize(self, tokens): + return self.tokenizer.ids_to_text(tokens) + + def tokenize(self, prompt): + return self.tokenizer.text_to_ids(prompt) + + tokenizer = MCoreTokenizerWrappper(model.tokenizer) + + inference_wrapped_model = GPTInferenceWrapper(model.model, args) + text_generation_controller = SimpleTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer + ) + mcore_engine = MCoreEngine( + text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size + ) + + common_inference_params = CommonInferenceParams( + temperature=cfg.common_inference_params.temperature, + top_k=cfg.common_inference_params.top_k, + top_p=cfg.common_inference_params.top_p, + return_log_probs=cfg.common_inference_params.return_log_probs, + num_tokens_to_generate=cfg.common_inference_params.tokens_to_generate, + ) + + results = mcore_engine.generate( + prompts=OmegaConf.to_container(cfg.prompts), common_inference_params=common_inference_params + ) + + for idx, result in enumerate(results): + print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ') + result = { + 'id': result.request_id, + 'input_prompt': 
result.prompt, + 'generated_text': result.generated_text, + 'generated_tokens': result.generated_tokens, + } + print(result) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter From 66c960ebdec9d22f40a7d43e9b2d38dc4a34ad25 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Mon, 8 Jul 2024 13:31:24 -0400 Subject: [PATCH 11/13] Improve error messaging during trt-llm export (#9638) * fix minor import bug Signed-off-by: Onur Yilmaz * Raise error when number of query groups cannot be splitted by the tps Signed-off-by: Onur Yilmaz * moved the error message to the utils Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz --- nemo/export/trt_llm/converter/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/nemo/export/trt_llm/converter/utils.py b/nemo/export/trt_llm/converter/utils.py index a4365a281b49..3768ff4b2844 100644 --- a/nemo/export/trt_llm/converter/utils.py +++ b/nemo/export/trt_llm/converter/utils.py @@ -388,6 +388,16 @@ def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, storage_t # Split the QKV to separate variables. qkv = np.split(val, [q_num, q_num + 1], axis=2) + + query_groups_shape = qkv[0].shape + if len(query_groups_shape) > 1: + if (query_groups_shape[1] % split_factor) != 0: + raise Exception( + "Number of query groups of the models is {0}. Please select tensor parallelism size " + "that can split the number of query groups to equal number of query matrices in the " + "each GPU.".format(query_groups_shape[1]) + ) + q_split = np.split(qkv[0], split_factor, axis=1) k_split = np.split(qkv[1], split_factor, axis=1) v_split = np.split(qkv[2], split_factor, axis=1) From f79074146563a38ae2a54a5358b8002c66d6499a Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 8 Jul 2024 19:37:20 -0400 Subject: [PATCH 12/13] support lora when kv_channel != hidden_size / num_heads (#9644) Co-authored-by: Ao Tang --- nemo/collections/nlp/parts/peft_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 50c97e349885..726ca33611d7 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -170,7 +170,7 @@ def __init__(self, cfg): elif module == PEFT_MODULE_MAP["dense_module"]: adapter_cfg = self._create_lora_config( - cfg, lora_cfg, cfg.hidden_size, cfg.hidden_size, LoraDenseAttentionAdapterConfig + cfg, lora_cfg, projection_size, cfg.hidden_size, LoraDenseAttentionAdapterConfig ) name_key_to_cfg[AdapterName.LORA_DENSE_ATTENTION_ADAPTER] = adapter_cfg name_key_to_mcore_mixins[AdapterName.LORA_DENSE_ATTENTION_ADAPTER] = [ From f9c3a8b3a3165c365cc34a6e9d9820414fdb9935 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Tue, 9 Jul 2024 06:16:04 +0200 Subject: [PATCH 13/13] [NeMo-UX] Fix when optimizers are setup for PEFT (#9619) * Fix when optimizers are setup for PEFT * Apply isort and black reformatting Signed-off-by: marcromeyn * Init DDP inside PEFT * Apply isort and black reformatting Signed-off-by: marcromeyn * Some fixes, loss seems to become nan with peft for some reason * Apply isort and black reformatting Signed-off-by: marcromeyn * Loss goes down on fp32 * Apply isort and black reformatting Signed-off-by: marcromeyn * Simplifying FNMixin * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix bug with new checkpoint-io * Apply isort and black reformatting Signed-off-by: marcromeyn * Fix failing 
test: test_peft_on_train_epoch_start_with_adapter * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: marcromeyn Co-authored-by: marcromeyn Co-authored-by: Chen Cui --- nemo/collections/llm/api.py | 2 +- nemo/collections/llm/fn/mixin.py | 17 +- nemo/lightning/_strategy_lib.py | 3 + nemo/lightning/io/connector.py | 8 +- nemo/lightning/megatron_parallel.py | 159 +++++++++++------- .../pytorch/callbacks/model_transform.py | 5 +- nemo/lightning/pytorch/callbacks/peft.py | 18 +- nemo/lightning/pytorch/optim/lr_scheduler.py | 1 - .../pytorch/plugins/mixed_precision.py | 6 +- nemo/lightning/pytorch/strategies.py | 40 +++-- .../lightning/pytorch/callbacks/test_peft.py | 18 +- 11 files changed, 177 insertions(+), 100 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 5c9703497597..0bb8f5fa46af 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -279,7 +279,7 @@ def _setup( model_transform: Optional[Union[PEFT, ModelTransform, Callable]], ) -> Any: # Return type is Any because app_state's type is not specified _log = log or NeMoLogger() - if resume and resume.adapter_path and _log.ckpt: + if resume and isinstance(model_transform, PEFT) and _log.ckpt: logging.info("Disabling try_restore_best_ckpt restoration for adapters") _log.ckpt.try_restore_best_ckpt = False diff --git a/nemo/collections/llm/fn/mixin.py b/nemo/collections/llm/fn/mixin.py index b32f66366bfb..c566c6e9d392 100644 --- a/nemo/collections/llm/fn/mixin.py +++ b/nemo/collections/llm/fn/mixin.py @@ -2,6 +2,7 @@ from typing_extensions import Self from nemo.collections.llm.fn import base as fn +from nemo.utils import logging class FNMixin: @@ -114,8 +115,12 @@ def freeze(self) -> None: """ assert isinstance(self, nn.Module), "self is not a nn.Module" - for param in self.parameters(): - param.requires_grad = False + params = list(self.parameters()) + if not params: + logging.info(f"No parameters found in module {self.__class__.__name__}") + else: + for param in params: + param.requires_grad = False def unfreeze(self) -> None: """ @@ -124,5 +129,9 @@ def unfreeze(self) -> None: """ assert isinstance(self, nn.Module), "self is not a nn.Module" - for param in self.parameters(): - param.requires_grad = True + params = list(self.parameters()) + if not params: + logging.info(f"No parameters found in module {self.__class__.__name__}") + else: + for param in params: + param.requires_grad = True diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index e6452de16512..3bd62ddce24a 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -515,4 +515,7 @@ def load_model_state_dict(megatron_parallel, checkpoint: Mapping[str, Any], stri elif count > n_nesting: to_remove = "module." * (count - n_nesting) _state_dict[key[len(to_remove) :]] = value + else: + _state_dict[key] = value + module.load_state_dict(_state_dict, strict=strict) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 500d0203cfd4..8be630f163e0 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -160,12 +160,8 @@ def nemo_save(self, output_path: Path, trainer: pl.Trainer) -> None: output_path (Path): The path where the model checkpoint will be saved. trainer (pl.Trainer): The trainer with the strategy to save the model. 
""" - _setup_kwargs = {} - setup_signature = inspect.signature(trainer.strategy.setup) - if 'setup_optimizers' in setup_signature.parameters: - _setup_kwargs["setup_optimizers"] = False - - trainer.strategy.setup(trainer, **_setup_kwargs) + trainer.strategy._setup_optimizers = False + trainer.strategy.setup(trainer) trainer.save_checkpoint(output_path) def nemo_load( diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 2f2308717004..ee41455544bb 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -12,7 +12,6 @@ Iterable, Iterator, List, - Mapping, Optional, Protocol, Sequence, @@ -129,7 +128,6 @@ def __init__( cpu: bool = False, convert_module_fn: Optional[Callable[[ModelT], nn.Module]] = None, ) -> None: - from apex.transformer.tensor_parallel.layers import set_defaults_if_not_set_tensor_model_parallel_attributes from megatron.core import parallel_state _pipeline: List[nn.Module] @@ -152,67 +150,15 @@ def __init__( _model.configure_model() _pipeline.append(_model) - if convert_module_fn: - for i in range(len(_pipeline)): - _pipeline[i] = convert_module_fn(_pipeline[i]) - - if isinstance(ddp_config, DistributedDataParallelConfig): - for model_chunk_idx, model_chunk in enumerate(_pipeline): - module = model_chunk.module - - ddp = DDP( - module.config, - ddp_config, - module, - data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), - expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), - # Turn off bucketing for model_chunk 2 onwards, since communication for these - # model chunks is overlapped with compute anyway. - disable_bucketing=(model_chunk_idx > 0), - ) - model_chunk.module = ddp - model_chunk.buffers = ddp.buffers # We need to do this explicitly since this is a attr pytorch uses - model_chunk.__class__.__getattr__ = getattr_proxy # type: ignore - - # param_sync_func is set in nemo.lightning.pytorch.optim.megatron - no_sync_func, grad_sync_func = extract_ddp_funcs(ddp_config, _pipeline) - for module in _pipeline: - module.config.no_sync_func = no_sync_func - module.config.grad_sync_func = grad_sync_func - - for i, model_module in enumerate(_pipeline): - if not cpu: - model_module.cuda(torch.cuda.current_device()) - - for param in model_module.parameters(): - set_defaults_if_not_set_tensor_model_parallel_attributes(param) - - if hasattr(model_module, "configure_model"): - if not hasattr(model_module, "set_input_tensor"): - if hasattr(model_module.module, "set_input_tensor"): - model_module.set_input_tensor = model_module.module.set_input_tensor - else: - # TODO: What to do here? - pass - - # Print number of parameters. 
- if parallel_state.model_parallel_is_initialized() and parallel_state.get_data_parallel_rank() == 0: - from nemo.utils import logging - - msg = ( - f" > number of parameters on (tensor, pipeline) model parallel rank " - f"({parallel_state.get_tensor_model_parallel_rank()}, {parallel_state.get_pipeline_model_parallel_rank()}): " - f"{_calc_number_of_params(_pipeline)}" - ) - logging.info(msg) - super().__init__(_pipeline) self.precision_plugin = precision_plugin + self._cpu = cpu self.callbacks = callbacks or CallbackConnector() self.data_step = data_step or default_data_step self.forward_step = forward_step or default_forward_step self.loss_reduction: MegatronLossReduction = loss_reduction self.ddp_config = ddp_config + self.convert_module_fn = convert_module_fn def forward( self, @@ -475,6 +421,82 @@ def infer_num_microbatches(self, data: Union[DataT, Iterator[DataT], List[Iterat raise ValueError("Cannot infer `num_microbatches` from data, please specify it manually") + def init_model_parallel(self): + from apex.transformer.tensor_parallel.layers import set_defaults_if_not_set_tensor_model_parallel_attributes + from megatron.core import parallel_state + + for model_module in self: + if not self._cpu: + model_module.cuda(torch.cuda.current_device()) + + for param in model_module.parameters(): + set_defaults_if_not_set_tensor_model_parallel_attributes(param) + + if hasattr(model_module, "configure_model"): + if not hasattr(model_module, "set_input_tensor"): + if hasattr(model_module.module, "set_input_tensor"): + model_module.set_input_tensor = model_module.module.set_input_tensor + else: + # TODO: What to do here? + pass + + # Print number of parameters. + if parallel_state.model_parallel_is_initialized() and parallel_state.get_data_parallel_rank() == 0: + from nemo.utils import logging + + num_params = _calc_number_of_params(list(self)) + num_trainable_params = _calc_number_of_trainable_params(list(self)) + + msg = ( + f" > number of parameters on (tensor, pipeline) model parallel rank " + f"({parallel_state.get_tensor_model_parallel_rank()}, {parallel_state.get_pipeline_model_parallel_rank()}): " + f"{num_params}" + ) + logging.info(msg) + + if num_params != num_trainable_params: + logging.info( + f" > number of trainable parameters: {num_trainable_params} ({num_trainable_params / num_params:.2%} of total)" + ) + + if self.convert_module_fn: + self.apply_convert_module_fn() + + self.init_ddp() + + def apply_convert_module_fn(self): + for i in range(len(self)): + self[i] = self.convert_module_fn(self[i]) + + def init_ddp(self): + if not isinstance(self.ddp_config, DistributedDataParallelConfig): + return + + from megatron.core import parallel_state + + for model_chunk_idx, model_chunk in enumerate(self): + module = model_chunk.module + + ddp = DDP( + module.config, + self.ddp_config, + module, + data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. 
+ disable_bucketing=(model_chunk_idx > 0), + ) + model_chunk.module = ddp + model_chunk.buffers = ddp.buffers # We need to do this explicitly since this is a attr pytorch uses + model_chunk.__class__.__getattr__ = getattr_proxy # type: ignore + + # param_sync_func is set in nemo.lightning.pytorch.optim.megatron + no_sync_func, grad_sync_func = extract_ddp_funcs(self.ddp_config, self) + for module in self: + module.config.no_sync_func = no_sync_func + module.config.grad_sync_func = grad_sync_func + def _build_context(self, context: Dict[str, Any]) -> Dict[str, Any]: if "self" in context: del context["self"] @@ -565,18 +587,21 @@ def forward_backward_func(self) -> "MegatronStepProtocol": @override def __getattr__(self, item: Any) -> Any: - if len(self) == 0: - return super().__getattr__(item) - try: - # __getattr__ gets called as a last resort if the attribute does not exist - # call nn.Module's implementation first + # First, try to get the attribute from the superclass (nn.ModuleList) return super().__getattr__(item) except AttributeError: - # If the attribute is not available on the _FabricModule wrapper, redirect to the wrapped nn.Module - attr = getattr(self._modules[self._get_abs_string_index(0)], item) + # If not found in superclass, check if we have any modules + if len(self) == 0: + raise AttributeError( + f"'{self.__class__.__name__}' object has no attribute '{item}' and contains no modules" + ) - return attr + # Try to get it from the first module + try: + return getattr(self._modules[self._get_abs_string_index(0)], item) + except AttributeError: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'") class _ModuleStepFunction: @@ -915,6 +940,12 @@ def _calc_number_of_params(model: List[nn.Module]) -> int: return sum([sum([p.nelement() for p in model_module.parameters()]) for model_module in model]) +def _calc_number_of_trainable_params(model: List[nn.Module]) -> int: + assert isinstance(model, list) + + return sum([sum([p.numel() for p in model_module.parameters() if p.requires_grad]) for model_module in model]) + + def is_list_of_iterators(var) -> bool: if not isinstance(var, list): return False diff --git a/nemo/lightning/pytorch/callbacks/model_transform.py b/nemo/lightning/pytorch/callbacks/model_transform.py index 68b3db16f473..512324940133 100644 --- a/nemo/lightning/pytorch/callbacks/model_transform.py +++ b/nemo/lightning/pytorch/callbacks/model_transform.py @@ -65,7 +65,10 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo def _maybe_apply_transform(self, trainer): if self._needs_to_call: - self.model_transform(trainer.model) + self.apply_transform(trainer) + + def apply_transform(self, trainer): + self.model_transform(trainer.model) @property def _needs_to_call(self) -> bool: diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 26325bf549d0..f8fa76110288 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -84,19 +84,27 @@ def __call__(self, model: nn.Module) -> nn.Module: def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None: super().setup(trainer, pl_module, stage=stage) + trainer.strategy.trainer = trainer self.wrapped_io = WrappedAdapterIO(trainer.strategy.checkpoint_io) trainer.strategy._checkpoint_io = self.wrapped_io + trainer.strategy._init_model_parallel = False + trainer.strategy._setup_optimizers = False - def on_train_epoch_start(self, trainer: "pl.Trainer", 
pl_module: "pl.LightningModule") -> None: - needs_to_call = self._needs_to_call - self._maybe_apply_transform(trainer) + def apply_transform(self, trainer): + super().apply_transform(trainer) - # Check if we need to load the adapters - if needs_to_call and self.wrapped_io.adapter_ckpt_path is not None: + if self.wrapped_io.adapter_ckpt_path is not None: logging.info(f"Loading adapters from {self.wrapped_io.adapter_ckpt_path}") adapter_state = self.wrapped_io.load_checkpoint(self.wrapped_io.adapter_ckpt_path) trainer.strategy.load_model_state_dict(adapter_state, strict=False) + if hasattr(trainer.strategy, "init_model_parallel"): + logging.info("Initializing model parallel") + trainer.strategy.init_model_parallel() + + logging.info("Setting up optimizers") + trainer.strategy.setup_optimizers(trainer) + def on_load_checkpoint( self, trainer: pl.Trainer, pl_module: pl.LightningModule, checkpoint: Dict[str, Any] ) -> None: diff --git a/nemo/lightning/pytorch/optim/lr_scheduler.py b/nemo/lightning/pytorch/optim/lr_scheduler.py index 298a6e7a7f45..9374328190a6 100644 --- a/nemo/lightning/pytorch/optim/lr_scheduler.py +++ b/nemo/lightning/pytorch/optim/lr_scheduler.py @@ -445,7 +445,6 @@ def scheduler(self, model, optimizer): return { "optimizer": optimizer, - "scheduler": lr_scheduler, "lr_scheduler": { # REQUIRED: The scheduler instance "scheduler": lr_scheduler, diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index 751141d8111b..5e43e09c0420 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -61,7 +61,6 @@ def convert_module(self, module: Module) -> Module: This is optional and depends on the precision limitations during optimization. 
""" - from megatron.core.distributed import DistributedDataParallel from megatron.core.transformer.module import Float16Module from megatron.core.utils import get_model_config @@ -69,7 +68,10 @@ def convert_module(self, module: Module) -> Module: config = get_model_config(module.module) config.fp16 = self.precision == "16-mixed" config.bf16 = self.precision == "bf16-mixed" - if not isinstance(module.module, Float16Module): + if isinstance(module.module, Float16Module): + new_float16_module = Float16Module(config, module.module.module) + module.module = new_float16_module + else: module.module = Float16Module(config, module.module) return module diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 6a84319b4fa2..d0e502839f2f 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -110,6 +110,8 @@ def __init__( ckpt_parallel_save=True, ckpt_parallel_load=False, ckpt_parallel_save_optim=True, + setup_optimizers: bool = True, + init_model_parallel: bool = True, **kwargs, ) -> None: super().__init__( @@ -132,6 +134,8 @@ def __init__( self.lazy_init = lazy_init self.ckpt_include_optimizer = ckpt_include_optimizer self.pipeline_dtype = pipeline_dtype + self._setup_optimizers = setup_optimizers + self._init_model_parallel = init_model_parallel self.log_train_loss = bool(int(os.getenv("NEMO_LOG_TRAIN_LOSS", 1))) self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0))) @@ -144,7 +148,7 @@ def __init__( self._ddp = ddp if ddp == "megatron": - self.ddp_config = DistributedDataParallelConfig() + self.ddp_config = DistributedDataParallelConfig(check_for_nan_in_grad=True) elif isinstance(ddp, DistributedDataParallelConfig): self.ddp_config = ddp elif ddp == "pytorch": @@ -180,7 +184,7 @@ def connect(self, model: pl.LightningModule) -> None: ddp_config.use_distributed_optimizer = mcore_opt_config.use_distributed_optimizer @override - def setup(self, trainer: pl.Trainer, setup_optimizers: bool = True) -> None: + def setup(self, trainer: pl.Trainer) -> None: assert self.accelerator is not None self.accelerator.setup(trainer) self.trainer = trainer @@ -204,7 +208,7 @@ def setup(self, trainer: pl.Trainer, setup_optimizers: bool = True) -> None: self.data_sampler.connect(trainer) self._fix_progress_bar(trainer) - self.setup_megatron_parallel(trainer, setup_optimizers=setup_optimizers) + self.setup_megatron_parallel(trainer) self.setup_precision_plugin() if getattr(self.lightning_module, "model_transform", None): @@ -271,7 +275,7 @@ def process_dataloader(self, dataloader: DataLoader) -> DataLoader: return dataloader - def setup_megatron_parallel(self, trainer: pl.Trainer, setup_optimizers: bool = True) -> None: + def setup_megatron_parallel(self, trainer: pl.Trainer) -> None: assert self.model is not None, "Model is not set" convert_module_fn = None @@ -286,6 +290,10 @@ def setup_megatron_parallel(self, trainer: pl.Trainer, setup_optimizers: bool = ddp_config=self.ddp_config, convert_module_fn=convert_module_fn, ) + + if self._init_model_parallel: + self.init_model_parallel() + self.megatron_parallel.trainer = trainer # check signature-def of self.model.configure_optimizers to check if there's an optional arg: megatron_parallel @@ -295,18 +303,9 @@ def setup_megatron_parallel(self, trainer: pl.Trainer, setup_optimizers: bool = self.model.configure_optimizers, megatron_parallel=self.megatron_parallel ) - if setup_optimizers: + if self._setup_optimizers: self.setup_optimizers(trainer) - # TODO: Throw an execption if 
we have a mcore optimizer and no ddp_config - - if hasattr(self.precision_plugin, "convert_optimizer"): - _optimizers = [*self.optimizers] - _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) - self.optimizers = _optimizers - - _optimizers_to_device(self.optimizers, self.root_device) - self.model = self.megatron_parallel self.model.callbacks.add(getattr(trainer, "callbacks")) @@ -317,6 +316,9 @@ def setup_megatron_parallel(self, trainer: pl.Trainer, setup_optimizers: bool = if datamodule: self.model.callbacks.add(datamodule) + def init_model_parallel(self): + self.megatron_parallel.init_model_parallel() + @override def configure_ddp(self) -> None: logging.debug(f"{self.__class__.__name__}: configuring MegatronParallel") @@ -349,6 +351,16 @@ def _setup_model(self, model: nn.Module) -> nn.Module: return model + @override + def setup_optimizers(self, trainer: "pl.Trainer") -> None: + super().setup_optimizers(trainer) + if hasattr(self.precision_plugin, "convert_optimizer"): + _optimizers = [*self.optimizers] + _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) + self.optimizers = _optimizers + + _optimizers_to_device(self.optimizers, self.root_device) + def _setup_parallel_ranks(self) -> None: self.set_world_ranks() env = cast(ClusterEnvironment, self.cluster_environment) diff --git a/tests/lightning/pytorch/callbacks/test_peft.py b/tests/lightning/pytorch/callbacks/test_peft.py index 81dc7f85bc08..e64ee7bd0ba3 100644 --- a/tests/lightning/pytorch/callbacks/test_peft.py +++ b/tests/lightning/pytorch/callbacks/test_peft.py @@ -1,4 +1,4 @@ -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch import torch.nn as nn from nemo.collections.llm import fn @@ -54,8 +54,22 @@ def test_peft_on_train_epoch_start_with_adapter(self, mock_logging): peft.wrapped_io.load_checkpoint.return_value = {"dummy_state": "dummy_value"} peft.on_train_epoch_start(trainer, pl_module) - mock_logging.info.assert_called_once_with("Loading adapters from dummy_path") + # Check for all expected log messages + mock_logging.info.assert_has_calls( + [ + call("Loading adapters from dummy_path"), + call("Initializing model parallel"), + call("Setting up optimizers"), + ], + any_order=True, + ) + + # Verify the number of calls + assert mock_logging.info.call_count == 3 + trainer.strategy.load_model_state_dict.assert_called_once_with({"dummy_state": "dummy_value"}, strict=False) + trainer.strategy.init_model_parallel.assert_called_once() + trainer.strategy.setup_optimizers.assert_called_once_with(trainer) def test_peft_on_load_checkpoint(self): peft = self.DummyPEFT()