From 8a268480777c8dac743b750d75fe556b9dd5251e Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Mon, 1 Jul 2024 07:39:29 -0700 Subject: [PATCH 01/21] adding mamba support --- .../conf/megatron_mamba_config.yaml | 190 +++++ .../megatron_mamba_finetuning_config.yaml | 314 +++++++ .../conf/megatron_mamba_generate_config.yaml | 298 +++++++ .../mamba_change_num_partition.py | 782 ++++++++++++++++++ .../megatron_mamba_finetuning.py | 60 ++ .../megatron_mamba_generate.py | 69 ++ .../language_modeling/megatron_mamba_model.py | 84 ++ .../megatron_mamba_sft_model.py.py | 55 ++ .../common/text_generation_strategy.py | 77 ++ .../nlp/parts/mixins/nlp_adapter_mixins.py | 7 +- .../checkpoint_converters/convert_jamba.py | 248 ++++++ .../convert_mamba2_pyt_to_nemo.py | 159 ++++ 12 files changed, 2340 insertions(+), 3 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_mamba_config.yaml create mode 100644 examples/nlp/language_modeling/conf/megatron_mamba_finetuning_config.yaml create mode 100644 examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml create mode 100644 examples/nlp/language_modeling/mamba_change_num_partition.py create mode 100644 examples/nlp/language_modeling/megatron_mamba_finetuning.py create mode 100644 examples/nlp/language_modeling/megatron_mamba_generate.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py.py create mode 100644 scripts/checkpoint_converters/convert_jamba.py create mode 100644 scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py diff --git a/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml b/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml new file mode 100644 index 000000000000..2c9a64bc5f04 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml @@ -0,0 +1,190 @@ +name: megatron_mamba +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_mamba + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_mamba--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + restore_from_path: null + # model parallelism + mcore_gpt: True + micro_batch_size: 1 + global_batch_size: 8 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + expert_model_parallel_size: 1 # expert model parallelism + hybrid_override_pattern: null + vocab_size: 256000 + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'none' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + num_layers: 56 + gated_linear_unit: False + add_bias_linear: False + num_query_groups: 8 + ngroups_mamba: 8 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-5 + num_moe_experts: 16 + moe_router_topk: 2 + moe_aux_loss_coeff: 0.001 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + persist_layer_norm: True + + tokenizer: + library: 'huggingface' + type: 'EleutherAI/gpt-neox-20b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + + # Distributed checkpoint setup + dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. + dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_parallel_save: False # if true, each worker will write its own part of the dist checkpoint + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. 
Only used with O2 and no pipeline parallelism.
+  gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2.
+  bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
+  bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
+  masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
+  get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+
+
+  # miscellaneous
+  seed: 1234
+  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
+  onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
+  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
+
+  ## Activation Checkpointing
+  # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
+  # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+).
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  # 'full' will checkpoint the entire transformer layer.
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_recurrent: False # If set to True, the checkpointing is only done for rglru and conv1d and not for attention and mlp layers
+  activations_checkpoint_method: null # 'uniform', 'block'
+  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+  # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model.
+  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+  activations_checkpoint_num_layers: null
+  # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory.
+  # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage.
+  num_micro_batches_with_partial_activation_checkpoints: null
+  # This feature is valid only when used with pipeline-model-parallelism.
+  # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed
+  # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is
+  # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint
+  # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'.
+  # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage.
+  activations_checkpoint_layers_per_pipeline: null
+  # This feature is valid only when used with pipeline-model-parallelism.
+ # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + data: + # Path to data must be specified by the user. + # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + data_prefix: [1.0, /path/to/data] + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 0 + dataloader_type: single # cyclic, LDDL + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + masked_lm_prob: 0.15 # Probability of replacing a token with mask. + short_seq_prob: 0.1 # Probability of producing a short sequence. + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + + optim: + name: distributed_fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_mamba_finetuning_config.yaml b/examples/nlp/language_modeling/conf/megatron_mamba_finetuning_config.yaml new file mode 100644 index 000000000000..5b9d3517f44b --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_mamba_finetuning_config.yaml @@ -0,0 +1,314 @@ +name: megatron_mamba +restore_from_path: ${model.restore_from_path} # used when starting from a .nemo file + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + limit_val_batches: 1024 + limit_test_batches: 500 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: + project: griffin + name: sft-test + resume_if_exists: False + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + +model: + restore_from_path: null + # model parallelism + mcore_gpt: True + micro_batch_size: 1 + global_batch_size: 8 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + expert_model_parallel_size: 1 # expert model parallelism + + vocab_size: 65536 + # model architecture + encoder_seq_length: 4096 + hybrid_override_pattern: null + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'none' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + num_layers: 64 + gated_linear_unit: False + add_bias_linear: False + num_query_groups: 8 + ngroups_mamba: 8 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 4096 + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-5 + num_moe_experts: 16 + moe_router_topk: 2 + moe_aux_loss_coeff: 0.001 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + persist_layer_norm: True + + + # mixed-precision + attention_softmax_in_fp32: False + + # Distributed checkpoint setup + dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. 
+ dist_ckpt_load_on_device: True # whether to load checkpoint weights directly on GPU or to CPU + dist_ckpt_parallel_save: False # if true, each worker will write its own part of the dist checkpoint + + + tokenizer: + library: 'huggingface' + type: 'EleutherAI/gpt-neox-20b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope + + # miscellaneous + seed: 1234 + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. 
+ # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + peft: + peft_scheme: "lora" # can be either adapter,ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. 
[1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: null # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: [1.0] # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + validation_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: null # Path to a list of JSONL files corresponding to the source data. 
Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${model.global_batch_size} + micro_batch_size: ${model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: distributed_fused_adam + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml b/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml new file mode 100644 index 000000000000..9b00dff6f32f --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml @@ -0,0 +1,298 @@ +name: megatron_mamba +restore_from_path: ${model.restore_from_path} # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_mamba + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_mamba--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + restore_from_path: null + # model parallelism + mcore_gpt: True + micro_batch_size: 2 + global_batch_size: 2 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + expert_model_parallel_size: 1 # expert model parallelism + hybrid_override_pattern: null + vocab_size: 65536 + # model architecture + encoder_seq_length: 4096 + max_position_embeddings: ${.encoder_seq_length} + position_embedding_type: 'none' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + num_layers: 64 + gated_linear_unit: False + num_query_groups: 8 + ngroups_mamba: 8 + attention_dropout: 0.0 + hidden_dropout: 0.0 + hidden_size: 4096 + bias_activation_fusion: False + ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size. + num_attention_heads: 32 + transformer_block_type: pre_ln + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + normalization: RMSNorm + layernorm_epsilon: 1e-5 + num_moe_experts: 16 + moe_router_topk: 2 + moe_aux_loss_coeff: 0.001 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + megatron_legacy: False + persist_layer_norm: True + add_bias_linear: False + + answer_only_loss: True + + tokenizer: + library: 'huggingface' + type: 'EleutherAI/gpt-neox-20b' + model: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. 
+  bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
+  bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
+  masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
+  get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
+  apply_rope_fusion: True # Use a kernel to add rotary positional embeddings. Only used if position_embedding_type=rope
+
+
+  # miscellaneous
+  seed: 1234
+  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
+  onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
+  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
+
+  ## Activation Checkpointing
+  # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
+  # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+).
+  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
+  # 'full' will checkpoint the entire transformer layer.
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_recurrent: False # If set to True, the checkpointing is only done for rglru and conv1d and not for attention and mlp layers
+  activations_checkpoint_method: null # 'uniform', 'block'
+  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
+  # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model.
+  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+  activations_checkpoint_num_layers: null
+  # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory.
+  # when using 'block' this will checkpoint the first activations_checkpoint_num_layers per pipeline stage.
+  num_micro_batches_with_partial_activation_checkpoints: null
+  # This feature is valid only when used with pipeline-model-parallelism.
+  # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed
+  # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is
+  # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint
+  # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'.
+  # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage.
+  activations_checkpoint_layers_per_pipeline: null
+  # This feature is valid only when used with pipeline-model-parallelism.
+  # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later
+  # pipeline stages.
For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + sequence_parallel: False + + peft: + peft_scheme: null # can be either adapter,ia3, lora, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + target_modules: ['all'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', attention (qkv & dense), mlp (fc1 & fc2) + adapter_dim: 32 + alpha: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + selective_tuning: + tunable_base_param_names: ["self_attention", "word_embeddings"] # TODO: regex support @adithyre + + data: + test_ds: + file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: ??? # Names of the corresponding datasets used to log metrics. + global_batch_size: 1 + micro_batch_size: 1 + shuffle: False + num_workers: 0 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "input" # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. 
+ prompt_template: "{input} {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + ceil_to_power_2: True + get_attention_mask_from_fusion: True + pad_to_max_length: True + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + +inference: + greedy: True # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. + compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + outfile_path: output.txt + compute_attention_mask: True + +# server-related configs +server: False # whether launch the API server +port: 5555 # the port number for the inference server +web_server: False # whether launch the web inference server +share: True # whether create a public URL +username: test # user name for web client +password: test2 # password for web client +web_port: 9889 # the port number of the web server 1058 +chat: False # use the chat interface +chatbot_config: + value: False # whether to inject the value attributes + attributes: + - name: Quality + min: 0 + max: 4 + key: quality + type: int + default: 4 + - name: Toxicity + min: 0 + max: 4 + key: toxcity + type: int + default: 0 + - name: Humor + min: 0 + max: 4 + key: humor + type: int + default: 0 + - name: Creativity + min: 0 + max: 4 + key: creativity + type: int + default: 0 + - name: Violence + min: 0 + max: 4 + key: violence + type: int + default: 0 + - name: Helpfulness + min: 0 + max: 4 + key: helpfulness + type: int + default: 4 + - name: Not_Appropriate + min: 0 + max: 4 + key: not_appropriate + type: int + default: 0 + - name: Language + choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh'] + key: lang + type: list + default: en + + user: User + assistant: Assistant + system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" \ No newline at end of file diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py new file mode 100644 index 000000000000..fd2433d636ab --- /dev/null +++ b/examples/nlp/language_modeling/mamba_change_num_partition.py @@ -0,0 +1,782 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tarfile +import tempfile +from argparse import ArgumentParser +from typing import Dict, List + +import torch +import torch.nn as nn +from omegaconf import OmegaConf, open_dict +from pytorch_lightning import Trainer + +from nemo.collections.nlp.parts.nlp_overrides import ( + NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, + PipelineMixedPrecisionPlugin, +) +from nemo.utils import logging, model_utils +from nemo.utils.app_state import AppState + +""" +Usage: + +### Tensor Parallelism and Pipeline Parallelism conversion ### + +# Megatron Mamba +python /home/ataghibakhsh/NeMo/examples/nlp/language_modeling/mamba_change_num_partition.py \ + --model_file=/home/ataghibakhsh/adlr_mamba2/mamba2-hybrid-8b-3t-4k.nemo \ + --target_file=/home/ataghibakhsh/TP4-ADLR-mamba-hybrid/mamba2-TP4.nemo \ + --tensor_model_parallel_size=1 \ + --target_tensor_model_parallel_size=4 \ + --precision=bf16 + + +""" + + +import argparse +import copy +import os +import re +import shutil +from collections import OrderedDict + +import torch + +tp_split_dim = { + 'word_embeddings.weight': 0, + 'norm.weight': -1, + 'final_norm.weight': -1, + 'output_layer.weight': 0, + # mamba1/2 + 'A_log': 0, + 'D': 0, + 'dt_bias': 0, + 'in_proj.weight': 0, + 'conv1d.weight': 0, + 'conv1d.bias': 0, + 'x_proj.weight': 1, + 'dt_proj.weight': 0, + 'dt_proj.bias': 0, + 'out_proj.weight': 1, + 'mixer.norm.weight': 0, + # mlp + 'linear_fc1.layer_norm_weight': -1, + 'linear_fc1.weight': 0, + 'linear_fc2.weight': 1, + # attention + 'self_attention.linear_proj.weight': 1, + 'self_attention.linear_qkv.layer_norm_weight': -1, + 'self_attention.linear_qkv.weight': 0, +} + + +def get_split_dim(tensor_name): + # norm.weight will match tensor_name of mixer.norm.weight and norm.weight, need to distinguish + if 'norm.weight' in tensor_name: + if 'mixer.norm.weight' in tensor_name: + return tp_split_dim['mixer.norm.weight'] + else: + return tp_split_dim['norm.weight'] + + for key in tp_split_dim.keys(): + if key in tensor_name: + return tp_split_dim[key] + raise Exception("Unknown tensor name {}".format(tensor_name)) + + +def split_tensor_for_tp(params, key, dim, tensor): + + tp_size = params.target_tensor_model_parallel_size + tensor_sliced = [] + if dim == -1: + tensor_sliced = [tensor for i in range(tp_size)] + else: + if 'mixer.in_proj.weight' in key and params.mamba_version == 1: + x, z = torch.split(tensor, [params.mamba_d_inner, params.mamba_d_inner], dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + for x, z in zip(x_sliced, z_sliced): + tensor_sliced.append(torch.cat((x, z), dim=dim)) + + elif 'mixer.in_proj.weight' in key and params.mamba_version == 2: + x, z, B, C, dt = torch.split( + tensor, + [ + params.mamba_d_inner, + params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_heads, + ], + dim=dim, + ) + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-1])) + C = 
torch.reshape(C, (-1, params.mamba_d_state, C.shape[-1])) + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + dt_sliced = torch.chunk(dt, tp_size, dim=dim) + + tensor_sliced = [] + for x, z, B, C, dt in zip(x_sliced, z_sliced, B_sliced, C_sliced, dt_sliced): + tensor_sliced.append(torch.cat((x, z, B.flatten(0, 1), C.flatten(0, 1), dt), dim=dim)) + + elif 'mixer.conv1d' in key and params.mamba_version == 2: + x, B, C = torch.split( + tensor, + [ + params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state, + ], + dim=dim, + ) + if 'weight' in key: + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-2], B.shape[-1])) + C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-2], C.shape[-1])) + elif 'bias' in key: + B = torch.reshape(B, (-1, params.mamba_d_state)) + C = torch.reshape(C, (-1, params.mamba_d_state)) + else: + raise Exception("Unknown key") + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + + tensor_sliced = [] + for x, B, C in zip(x_sliced, B_sliced, C_sliced): + tensor_sliced.append(torch.cat((x, B.flatten(0, 1), C.flatten(0, 1)), dim=dim)) + elif '_extra_state' in key: + pass + else: + tensor_sliced = torch.chunk(tensor, tp_size, dim=dim) + + return tensor_sliced + + +################# +### Utilities ### +################# + + +def force_cpu_model(cfg): + with open_dict(cfg): + # temporarily set to cpu + original_cpu_init = cfg.get('use_cpu_initialization', False) + if 'megatron_amp_O2' in cfg: + amp_o2_key = 'megatron_amp_O2' + original_amp_o2 = cfg.megatron_amp_O2 + elif 'megatron_amp_02' in cfg: + amp_o2_key = 'megatron_amp_02' + original_amp_o2 = cfg.megatron_amp_02 + else: + amp_o2_key, original_amp_o2 = None, None + + # Set new values + cfg.use_cpu_initialization = True + if amp_o2_key is not None: + cfg[amp_o2_key] = False + + # Disable sequence parallelism - Not disabling this gives error when converting the the model to TP=1 + original_sequence_parallel = cfg.get('sequence_parallel', None) + cfg.sequence_parallel = False + + # Setup restore dict + restore_dict = {'use_cpu_initialization': original_cpu_init} # 'megatron_amp_O2': original_amp_o2 + if amp_o2_key is not None: + restore_dict[amp_o2_key] = original_amp_o2 + if original_sequence_parallel is not None: + restore_dict['sequence_parallel'] = original_sequence_parallel + + return cfg, restore_dict + + +def restore_model_config(cfg, original_dict): + with open_dict(cfg): + for key, val in original_dict.items(): + logging.info(f"Restoring model config key ({key}) from {cfg[key]} to original value of {val}") + cfg[key] = val + return cfg + + +################# +### Utilities ### +################# + + +def compute_tp_splits( + param_name, param, partitions, global_idx, tp_size, pp_size, pp_rank, pp_split_rank, megatron_legacy, model_cfg +): + """ + Function to compute the splits required for tensor-parallelism. + + Args: + param_name: Name of the current parameter of the current model (TP X PP Y) + param: Value of the current parameter of the current compute_tp_splitsmodel (TP X PP Y) + partitions: Partitions of the flattened parameter of the current model (TP 1 PP 1) + global_idx: The index used to select the parameter in the global partition. + tp_size: Int, tensor-parallelism size. 
+ pp_size: Int, pipeline-parallelism size. + pp_rank: Int, pipeline-parallelism rank. + pp_split_rank: Int, pipeline-parallelism split rank. This should be > 1 if TP is being used with EncDec models (T5) + megatron_legacy: Bool, whether the model is a legacy Megatron model or not. + model_cfg: The model config as a OmegaConf DictConfig. + + Returns: + List of torch tensors, each of which is a split of the current parameter. + """ + # alias the global index to idx + idx = global_idx + + fast_glu_activation = str(model_cfg.get('activation', '')).lower() in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] + + if param.shape == partitions[0][idx].shape: + split = [partitions[0][idx].data] * tp_size + logging.debug(">> Perfect match, no splitting needed") + elif param.shape[0] == partitions[0][idx].shape[0]: + split = torch.split(partitions[0][idx].data, param.shape[-1], dim=-1) + else: + # For T5-converted weights, the splitting needs to be strided such that q,k,v weights are bunched together on each tensor-parallel rank. + if '.query_key_value.' in param_name and megatron_legacy: # weight or bias + split_dim = partitions[0][idx].data.shape[0] + if split_dim % (tp_size * 3) != 0: + raise ValueError( + f"Can not split Q,K,V parameter {param_name} with shape {param.shape} into tensor parallel size {tp_size}. Not divisible by {tp_size * 3}." + ) + tp_qkv_splits = torch.chunk(partitions[0][idx].data, tp_size * 3, dim=0) + split = [] + for i in range(tp_size): + tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 3, tp_size)]) + split.append(tp_qkv) + elif '.key_value.' in param_name and megatron_legacy: # weight or bias + split_dim = partitions[0][idx].data.shape[0] + if split_dim % (tp_size * 2) != 0: + raise ValueError( + f"Can not split K,V parameter {param_name} with shape {param.shape} into tensor parallel size {tp_size}. Not divisible by {tp_size * 2}." + ) + tp_qkv_splits = torch.chunk(partitions[0][idx].data, tp_size * 2, dim=0) + split = [] + for i in range(tp_size): + tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 2, tp_size)]) + split.append(tp_qkv) + elif ('dense_h_to_4h' in param_name or 'linear_fc1' in param_name) and fast_glu_activation: + # For Megatron GPT model with Fast Glu activation + # Handle gated linear units + # concat all the first halves ('W's) and all the second halves ('V's) + w_split, k_split = torch.chunk(partitions[0][idx].data, 2, dim=0) + w_split = torch.chunk(w_split, tp_size, dim=0) + k_split = torch.chunk(k_split, tp_size, dim=0) + split = [torch.cat(weights, dim=0) for weights in zip(w_split, k_split)] # split per tp rank + + # Regular split for Megatron and NeMo-Megatron models. + else: + split = torch.split(partitions[0][idx].data, param.shape[0], dim=0) + + return split + + +def write_tp_pp_split(model, splits, app_state, tp_size, pp_rank, write_path): + """ + Function to write the given TP PP split to NeMo File. + + Save each of the TP ranks in reverse order + This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved + The final rank will then save a new NeMo file with all other ranks inside. + + Args: + model: The model corresponding to the current TP PP split. Contains partial parameters. + splits: Nested List of tensors containing the TP splits of the current model given current PP rank. + Indexed as splits[idx][tp_rank]. + app_state: AppState object. + tp_size: The global tensor-parallel size of the final model. 
+        pp_rank: The local pipeline parallel rank of the final model.
+        write_path: The path to save the NeMo file.
+    """
+    for tp_rank in range(tp_size - 1, -1, -1):
+        app_state.pipeline_model_parallel_rank = pp_rank
+        app_state.tensor_model_parallel_rank = tp_rank
+
+        idx = 0
+        for name, param in model.named_parameters():
+            split_val = splits[idx][tp_rank].clone()
+
+            if param.shape != split_val.shape:
+                raise RuntimeError(
+                    f"Can not handle parameter {name}, required shape: {param.shape}, split shape: {split_val.shape}."
+                )
+
+            param.data = split_val
+            idx += 1
+
+        if write_path is not None:
+            logging.info(f"Writing pp rank {pp_rank} tp rank {tp_rank} to file {write_path}")
+            model.save_to(write_path)
+
+
+def debug_log_split_param_diff(idx, param, param_name, partitions):
+    # Log some useful comparison of tensors that are being mapped.
+    # Note that the global param index for layers and modules may be different but the shapes
+    # and semantics of the layer should match.
+    logging.debug(f"Index: {idx} Model Params : {param_name} - {param.shape}")
+    logging.debug(f"Index: {idx} Global params: {partitions[1][idx]} - {partitions[0][idx].shape}")
+
+
+##################
+### Converters ###
+##################
+
+
+def split_tp_partition_only(args, model, original_model, tp_size, write_path=None, megatron_legacy=False):
+
+    if tp_size < 1:
+        raise ValueError("TP size must be >= 1.")
+
+    app_state = AppState()
+    app_state.data_parallel_rank = 0
+    app_state.pipeline_model_parallel_size = 1
+    app_state.tensor_model_parallel_size = tp_size
+    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size
+
+    app_state.pipeline_model_parallel_rank = 0
+    app_state.tensor_model_parallel_rank = tp_size - 1
+
+    idx = 0
+    splits = []
+
+    for ii, (key, original_tensor) in enumerate(original_model.model.state_dict().items()):
+        try:
+            layer_num = int(re.findall(r'\d+', key)[0])
+            new_key = key.replace(str(layer_num), str(layer_num), 1)
+        except:
+            new_key = key
+
+        if '_extra_state' not in new_key:
+            split_dim = get_split_dim(new_key)
+            split = split_tensor_for_tp(args, new_key, split_dim, original_tensor)
+
+            splits.append(split)
+            idx += 1
+
+    # Save each of the TP ranks in reverse order
+    # This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved
+    # The final rank will then save a new NeMo file with all other ranks inside.
+ write_tp_pp_split(model, splits, app_state, tp_size, pp_rank=0, write_path=write_path) + + +def main(): + parser = ArgumentParser() + parser.add_argument("--model_file", type=str, default=None, required=False, help="Path to source .nemo file") + parser.add_argument("--target_file", type=str, required=True, help="Path to write target .nemo file") + parser.add_argument( + "--tensor_model_parallel_size", type=int, default=-1, required=False, help="TP size of source model" + ) + parser.add_argument("--target_tensor_model_parallel_size", type=int, required=True, help="TP size of target model") + parser.add_argument( + '--pipeline_model_parallel_size', type=int, default=1, required=False, help='PP size of source model' + ) + parser.add_argument( + '--target_pipeline_model_parallel_size', type=int, required=False, default=1, help='PP size of target model' + ) + parser.add_argument( + '--target_pipeline_model_parallel_split_rank', type=int, default=0, help='PP rank to split for Enc-Dec models' + ) + parser.add_argument( + '--virtual_pipeline_model_parallel_size', type=int, default=None, help='Virtual Pipeline parallelism size' + ) + parser.add_argument( + '--ckpt_name', type=str, default=None, help='Checkpoint name to load from for Virtual Parallel' + ) + parser.add_argument( + "--model_class", + type=str, + default="nemo.collections.nlp.models.language_modeling.megatron_jamba_model.MegatronJambaModel", + help="NeMo model class. This script should support all NeMo megatron models that use Tensor Parallel", + ) + parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag") + parser.add_argument('--num_gpu_per_node', default=8, type=int, help='Number of GPUs per node') + parser.add_argument( + "--megatron_legacy", + action="store_true", + help="Converter for legacy megatron modles that have different q,k,v weight splits", + ) + parser.add_argument( + "--tokenizer_model_path", + type=str, + required=False, + default=None, + help="Path to the tokenizer model path if your model uses a tokenizer model as an artifact. This is needed if your model uses a sentencepiece tokenizer.", + ) + parser.add_argument( + "--tokenizer_vocab_file", + type=str, + required=False, + default=None, + help="Path to the tokenizer model path if your model uses a tokenizer model as an artifact. This is needed if your model uses a sentencepiece tokenizer.", + ) + parser.add_argument('--hparams_file', type=str, default=None, help='Path to hparams file from PTL training') + parser.add_argument( + '--tp_conversion_only', default=True, action='store_true', help='Only convert TP model to TP model' + ) + parser.add_argument('--model_extracted_dir', type=str, default=None, help='Path to pre-extracted model directory') + + parser.add_argument('--d-model', type=int, default=4096) + parser.add_argument('--mamba-version', type=int, default=2) + parser.add_argument('--mamba-d-state', type=int, default=128) + parser.add_argument('--mamba2-n-groups', type=int, default=8) + parser.add_argument('--mamba2-head-dim', type=int, default=64) + + args = parser.parse_args() + + args.mamba_d_inner = args.d_model * 2 + args.mamba2_n_heads = args.mamba_d_inner // args.mamba2_head_dim + + precision = args.precision + num_gpu_per_node = int(args.num_gpu_per_node) + if args.precision in ["32", "16"]: + precision = int(float(args.precision)) + + if precision in ["bf16", "bf16-mixed"]: + if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): + pass + else: + logging.warning("BF16 is not supported on this device. 
Using FP16 instead.") + precision = precision[2:] + + if precision == 32: + dtype = torch.float32 + elif precision in [16, "16", "16-mixed"]: + dtype = torch.float16 + elif precision in ["bf16", "bf16-mixed"]: + dtype = torch.bfloat16 + else: + dtype = torch.float32 # fallback + + # Built target directory if it does not exist + target_dir = os.path.split(args.target_file)[0] + if not os.path.exists(target_dir): + os.makedirs(target_dir, exist_ok=True) + + tp_size = args.tensor_model_parallel_size + tgt_tp_size = args.target_tensor_model_parallel_size + pp_size = args.pipeline_model_parallel_size + tgt_pp_size = args.target_pipeline_model_parallel_size + pipeline_model_parallel_split_rank = args.target_pipeline_model_parallel_split_rank + vp_size = args.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + convert_vp = vp_size > 1 + if convert_vp: + from megatron.core import parallel_state + + parallel_state.set_virtual_pipeline_model_parallel_world_size(vp_size) + + hparams_filepath = args.hparams_file + if hparams_filepath is None: + logging.warning( + '\n\n\n!!!!!!!!!\n' + 'You are converting a model with virtual pipeline parallelism enabled, \n' + 'but have not passed `hparams_file` argument. \n' + 'This will cause each ckpt file to be temporarily laoded onto GPU memory!\n\n' + 'It is highly recommended to pass `hparams_file` argument to avoid this.\n' + ) + else: + hparams_filepath = None + + # Import the class of the model + cls = model_utils.import_class_by_path(args.model_class) + + if args.model_file is None and args.model_extracted_dir is None: + raise ValueError("Cannot pass model_file and model_extracted_dir as None at the same time.") + + tmp_cfg = cls.restore_from( + restore_path=args.model_file, + trainer=Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision), + map_location=torch.device("cpu"), + return_config=True, + ) + plugins = [] + if precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=tmp_cfg.get('native_amp_init_scale', 2**32), + growth_interval=tmp_cfg.get('native_amp_growth_interval', 1000), + hysteresis=tmp_cfg.get('hysteresis', 2), + ) + # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + + if tmp_cfg.get('megatron_amp_O2', False): + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + precision = None + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") + + if tp_size < 0 or pp_size < 0: + logging.info(f"Loading model config from {args.model_file} to get TP and PP size") + model_config_internal = cls.restore_from( + restore_path=args.model_file, + trainer=trainer, + map_location=torch.device("cpu"), + return_config=True, + ) + + tp_size = model_config_internal.get('tensor_model_parallel_size', 1) + pp_size = model_config_internal.get('pipeline_model_parallel_size', 1) + + # Check if TP conversion only + tp_conversion_only = args.tp_conversion_only + if tp_conversion_only: + logging.info("Converting TP model to TP model only") + + if pp_size > 1: + raise 
ValueError("Provided `--tp_conversion_only` but `--pipeline_model_parallel_size` > 1") + + if tgt_pp_size > 1: + raise ValueError("Provided `--tp_conversion_only` but `--target_pipeline_model_parallel_size` > 1") + + if pipeline_model_parallel_split_rank > 0: + raise ValueError("Provided `--tp_conversion_only` but `--target_pipeline_model_parallel_split_rank` > 0") + + # Force PP size to 1 + pp_size = 1 + tgt_pp_size = 1 + pipeline_model_parallel_split_rank = 0 + + if vp_size is None or vp_size < 0: + vp_size = 1 + + app_state = AppState() + app_state.data_parallel_rank = 0 + app_state.pipeline_model_parallel_size = pp_size + app_state.tensor_model_parallel_size = tp_size + + if vp_size > 1: + app_state.virtual_pipeline_model_parallel_size = vp_size + app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size + + world_size = pp_size * tp_size # pseudo world size for simulating load of a specific rank on a single gpu + + app_state.tensor_model_parallel_rank = 0 + app_state.pipeline_model_parallel_rank = 0 + + # Extract tokenizer artifact from the model to temp directory + logging.info("Extracting tokenizer artifact from NeMo file...") + temp_dir = tempfile.mkdtemp() + tokenizer_model_path = None + with tarfile.open(args.model_file, "r") as tar: + for member in tar.getmembers(): + if '.model' in member.name: + extracted_file = tar.extractfile(member) + extracted_file_path = os.path.join(temp_dir, member.name) + + if tokenizer_model_path is None: + logging.info(f"Found tokenizer. Extracting {member.name} to {extracted_file_path}") + + tokenizer_model_path = extracted_file_path + with open(extracted_file_path, "wb") as f: + f.write(extracted_file.read()) + else: + if args.tokenizer_model_path is None: + logging.warning( + f"\n\nFound multiple tokenizer artifacts in the model file.\n" + f"Using only {tokenizer_model_path}.\n" + f"If this is incorrect, manually pass the correct tokenizer using " + f"`--tokenizer_model_path`.\n\n" + ) + + # If input model has TP > 1 or PP > 1 + # Reconstruct the model to have TP = 1 and PP = 1 + # Note that this is a forward loop that will process PP [0..N] TP [0..M] in sequential order. 
+ + # If input model has TP = 1 and PP = 1 + app_state.model_parallel_size = 1 + + save_restore_connector = NLPSaveRestoreConnector() + + if args.model_extracted_dir is not None: + logging.info(f"Using extracted model directory: {args.model_extracted_dir}") + save_restore_connector.model_extracted_dir = args.model_extracted_dir + + if args.model_file is not None: + model_filepath = args.model_file + else: + model_filepath = args.model_extracted_dir + + tmp_cfg = cls.restore_from( + restore_path=model_filepath, + trainer=trainer, + map_location=torch.device("cpu"), + save_restore_connector=save_restore_connector, + return_config=True, + ) + + tmp_cfg, restore_dict = force_cpu_model(tmp_cfg) + + model = cls.restore_from( + restore_path=model_filepath, + trainer=trainer, + map_location=torch.device("cpu"), + save_restore_connector=save_restore_connector, + override_config_path=tmp_cfg, + ) + + original_model = cls.restore_from( + restore_path=model_filepath, + trainer=trainer, + map_location=torch.device("cpu"), + save_restore_connector=save_restore_connector, + override_config_path=tmp_cfg, + ) + original_model = original_model.to('cpu') + original_model._save_restore_connector = NLPSaveRestoreConnector() + original_model.freeze() + original_model.to(dtype=dtype) + + model.to(dtype=dtype) + + restore_model_config(model.cfg, restore_dict) + + # If target model has TP > 1 or PP > 1 + if tgt_pp_size > 1 or tgt_tp_size > 1: + + # Preserve the TP 1 PP 1 model parameters and names + global_params = [] + global_params.append([p for n, p in model.named_parameters()]) # params + global_params.append([n for n, p in model.named_parameters()]) # names + + logging.debug("Global parameters:") + for idx, (name, p) in enumerate(zip(global_params[1], global_params[0])): + logging.debug(f"{name} - {p.shape}") + + logging.info(f"TP 1 PP 1 Number of Parameters : {len(global_params[0])}") + + world_size = ( + tgt_pp_size * tgt_tp_size + ) # pseudo world size for simulating load of a specific rank on a single gpu + new_global_batch_size = model.cfg.micro_batch_size * world_size + old_global_batch_size = model.cfg.get('global_batch_size', model.cfg.micro_batch_size) + + global_offset = len(global_params[0]) - 1 # -1 cause this indexes the array, range [0, L-1] + logging.info(f"Final layer offset for parameters: {global_offset}") + + for pp_rank in range(tgt_pp_size - 1, -1, -1): # reverse order + + with open_dict(model.cfg): + model.cfg.pipeline_model_parallel_size = tgt_pp_size + model.cfg.tensor_model_parallel_size = tgt_tp_size + + if 'pipeline_model_parallel_split_rank' in model.cfg: + if pipeline_model_parallel_split_rank > 0: + model.cfg.pipeline_model_parallel_split_rank = pipeline_model_parallel_split_rank + elif pp_size > 1: + logging.warning( + f"Model config has `pipeline_model_parallel_split_rank` set to " + f"{model.cfg.pipeline_model_parallel_split_rank} and target PP " + f"size is {tgt_pp_size}. " + f"Provided `pipeline_model_parallel_split_rank` is " + f"{pipeline_model_parallel_split_rank}. " + f"Be careful that the model config is correct " + f"if encoder-decoder models are being converted." 
+ ) + + model.cfg.global_batch_size = old_global_batch_size # Used for restoration + + # Override flag that forces Model to use AppState instead of Trainer + # to determine the world size, global and local rank + # Used for simulating load of a specific rank on a single gpu + os.environ[NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE] = "true" + + # Compute the global rank + global_rank = ( + pp_rank * tgt_tp_size + 0 + ) # tp_rank = 0 needed just for modules, all TP will be merged to this PP rank + + # Update AppState + app_state.world_size = world_size + app_state.global_rank = global_rank + app_state.local_rank = global_rank % num_gpu_per_node + app_state.pipeline_model_parallel_size = tgt_pp_size + app_state.tensor_model_parallel_size = tgt_tp_size + app_state.model_parallel_size = ( + app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size + ) + + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") + if args.tokenizer_model_path is not None: + with open_dict(model.cfg): + model.cfg.tokenizer.model = args.tokenizer_model_path + + else: + if tokenizer_model_path is None: + logging.warning("Could not extract tokenizer model file from checkpoint.") + + else: + # Extract tokenizer info + with open_dict(model.cfg): + model.cfg.tokenizer.model = tokenizer_model_path + + model.cfg, restore_dict = force_cpu_model(model.cfg) + + model = cls(model.cfg, trainer) + model = model.to('cpu') + model._save_restore_connector = NLPSaveRestoreConnector() + model.freeze() + model.to(dtype=dtype) + + restore_model_config(model.cfg, restore_dict) + + # Update global batch size + if old_global_batch_size % new_global_batch_size != 0 or old_global_batch_size < new_global_batch_size: + logging.info( + f"Global batch size {old_global_batch_size} is not divisible by new global batch size {new_global_batch_size}." + f" The model config will be updated with new global batch size {new_global_batch_size}." + ) + with open_dict(model.cfg): + model.cfg.global_batch_size = new_global_batch_size + + logging.info(f"Global rank: {global_rank} Local rank: {app_state.local_rank} World size: {world_size}") + logging.info(f"PP rank: {pp_rank} TP rank: {0}") + logging.info(f"TP 1 PP 1 Number of Layers : {len(global_params[0])}") + logging.info(f"Remaining layer offset for parameters: {global_offset}") + logging.info("\n") + + # Special case for TP conversion only mode + if tp_conversion_only: + logging.info(f"Skipping PP split due to flag `--tp_conversion_only`") + split_tp_partition_only( + args, model, original_model, tgt_tp_size, args.target_file, args.megatron_legacy + ) + break + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/language_modeling/megatron_mamba_finetuning.py b/examples/nlp/language_modeling/megatron_mamba_finetuning.py new file mode 100644 index 000000000000..4953ea747ae2 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_mamba_finetuning.py @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch.multiprocessing as mp
+from omegaconf.omegaconf import OmegaConf
+
+from nemo.collections.nlp.models.language_modeling.megatron_mamba_sft_model import MegatronMambaSFTModel
+from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder
+from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.exp_manager import exp_manager
+
+mp.set_start_method("spawn", force=True)
+
+
+@hydra_runner(config_path="conf", config_name="megatron_mamba_finetuning_config")
+def main(cfg) -> None:
+
+    logging.info("\n\n************** Experiment configuration ***********")
+    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
+
+    precision = cfg.trainer.precision
+    trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer()
+    # Restore the precision value after Trainer is built.
+    cfg.trainer.precision = precision
+    exp_manager(trainer, cfg.exp_manager)
+
+    model_cfg = MegatronMambaSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg)
+    model = MegatronMambaSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer)
+
+    peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme]
+
+    if cfg.model.peft.restore_from_path is not None:
+        # Initialize PEFT weights from a checkpoint instead of randomly.
+        # This is not the same as resuming training because optimizer states are not restored.
+        logging.info(f"PEFT weights will be loaded from {cfg.model.peft.restore_from_path}")
+        model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg))
+    elif peft_cfg_cls is not None:
+        logging.info("Adding adapter weights to the model for PEFT")
+        model.add_adapter(peft_cfg_cls(model_cfg))
+    else:
+        logging.info(f"Running full finetuning since no peft scheme is given.\n{model.summarize()}")
+
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/nlp/language_modeling/megatron_mamba_generate.py b/examples/nlp/language_modeling/megatron_mamba_generate.py
new file mode 100644
index 000000000000..36bbef30069e
--- /dev/null
+++ b/examples/nlp/language_modeling/megatron_mamba_generate.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import torch.multiprocessing as mp
+from omegaconf.omegaconf import OmegaConf
+from nemo.collections.nlp.models.language_modeling.megatron_mamba_sft_model import MegatronMambaSFTModel
+from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder
+from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+from nemo.utils.model_utils import inject_model_parallel_rank
+
+
+mp.set_start_method("spawn", force=True)
+
+
+@hydra_runner(config_path="conf", config_name="megatron_mamba_generate_config")
+def main(cfg) -> None:
+    logging.info("\n\n************** Experiment configuration ***********")
+    logging.info(f"\n{OmegaConf.to_yaml(cfg)}")
+    trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer()
+
+    if cfg.model.peft.restore_from_path:
+        model_cfg = MegatronMambaSFTModel.merge_inference_cfg(cfg.model.peft.restore_from_path, cfg)
+    else:
+        model_cfg = MegatronMambaSFTModel.merge_inference_cfg(cfg.model.restore_from_path, cfg)
+
+    model = MegatronMambaSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer)
+
+    if cfg.model.peft.restore_from_path:
+        model.load_adapters(cfg.model.peft.restore_from_path)
+    elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name:
+        peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme]
+        checkpoint_path = os.path.join(
+            cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name
+        )
+        # checkpoint_path is a dir in case of distributed checkpointing
+        if not os.path.isdir(checkpoint_path):
+            # legacy checkpoint needs model parallel rank injection
+            checkpoint_path = inject_model_parallel_rank(
+                os.path.join(
+                    cfg.model.peft.restore_from_ckpt.checkpoint_dir, cfg.model.peft.restore_from_ckpt.checkpoint_name
+                )
+            )
+        model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg))
+    else:
+        raise NotImplementedError("distributed checkpointing of PEFT weights is not supported")
+
+    model.freeze()
+    logging.info(f"Freezing parameters for PEFT eval:\n{model.summarize()}")
+
+    trainer.test(model)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py
new file mode 100644
index 000000000000..d88413fc5ad9
--- /dev/null
+++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_model.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
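+# Note: the `hybrid_override_pattern` consumed by this model assigns one symbol per decoder
+# layer. The convention assumed here mirrors scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
+# in this change: 'M' -> Mamba (SSM) mixer layer, '*' -> self-attention layer, '-' -> MLP layer.
+# For example, a pure Mamba stack uses hybrid_override_pattern = "M" * num_layers, which is the
+# default built in model_provider_func below.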
+ +import torch +from megatron.core.models.mamba import MambaModel +from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.trainer.trainer import Trainer +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel + + +class MegatronMambaModel(MegatronGPTModel): + """ + Megatron Mamba pretraining. + """ + + def __init__(self, cfg: DictConfig, trainer: Trainer): + + self.vocab_size = cfg.get('vocab_size', 65536) + self.cfg = cfg + super().__init__(cfg=cfg, trainer=trainer) + self.mcore_gpt = True + + def model_provider_func(self, pre_process, post_process): + + self.hybrid_override_pattern = self.cfg.get( + 'hybrid_override_pattern', "M" * self.transformer_config.num_layers + ) + self.transformer_config.add_bias_linear = self.cfg.get('add_bias_linear', False) + self.transformer_config.gated_linear_unit = self.cfg.get('gated_linear_unit', False) + self.transformer_config.layernorm_epsilon = self.cfg.get('layernorm_epsilon', 1e-5) + + model = MambaModel( + config=self.transformer_config, + ngroups=self.cfg.get('ngroups_mamba', 8), + max_sequence_length=self.cfg.get('encoder_seq_length', 4096), + vocab_size=self.cfg.get('vocab_size', 65536), + mamba_stack_spec=mamba_stack_spec, + hybrid_override_pattern=self.hybrid_override_pattern, + ) + + return model + + def forward(self, input_ids, position_ids=None, attention_mask=None, labels=None): + + output_tensor = self.model( + input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask, labels=labels + ) + return output_tensor + + def build_transformer_config(self): + transformer_config = super().build_transformer_config() + return transformer_config + + def on_validation_epoch_end(self): + + averaged_loss = torch.tensor(0.0, dtype=torch.float32).cuda() + return averaged_loss + + def sharded_state_dict(self, prefix: str = ''): + return None + + def _reset_activation_checkpointing_args(self): + return + + def _restore_activation_checkpointing_args(self): + return + + def _reset_sequence_parallelism_args(self): + return + + def _restore_sequence_parallelism_args(self): + return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py.py new file mode 100644 index 000000000000..2d84bc088b0d --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
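+# Typical restore path for this class, as wired up in
+# examples/nlp/language_modeling/megatron_mamba_finetuning.py (reproduced here for reference):
+#
+#     model_cfg = MegatronMambaSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg)
+#     model = MegatronMambaSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer)
+#
+# Optional PEFT adapters can then be attached via model.add_adapter(...) or model.load_adapters(...).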
+
+from omegaconf.dictconfig import DictConfig
+from pytorch_lightning.trainer.trainer import Trainer
+
+from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_sft_model import MegatronGPTSFTModel
+from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel
+
+try:
+    import apex  # noqa: F401  # probe only: assumed minimal check that Apex is installed
+
+    HAVE_APEX = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_APEX = False
+
+__all__ = ['MegatronMambaSFTModel']
+
+
+class MegatronMambaSFTModel(MegatronGPTSFTModel, MegatronMambaModel):
+    """
+    Megatron Mamba Supervised Fine-Tuning
+    """
+
+    def __init__(self, cfg: DictConfig, trainer: Trainer):
+        if not HAVE_APEX:
+            raise ImportError(
+                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
+            )
+
+        super().__init__(cfg, trainer=trainer)
+        self.mcore_gpt = True
+        self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False)
+
+    def _reset_activation_checkpointing_args(self):
+        pass
+
+    def on_validation_model_zero_grad(self) -> None:
+        """
+        Skip gradient zeroing at the beginning of the validation routine.
+        This is needed when overlapping the AllGather of the updated parameters with the following validation step.
+        """
+        if not self.validation_param_sync_overlap:
+            MegatronBaseModel.on_validation_model_zero_grad(self)
diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py
index e8e2859e439f..c6c72868f2b2 100644
--- a/nemo/collections/nlp/modules/common/text_generation_strategy.py
+++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py
@@ -333,7 +333,81 @@ def prepare_batch_at_step(
         tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size]
         return batch, tensor_shape
 
+class MambaModelTextGenerationStrategy(TextGenerationStrategy):
+    def __init__(self, model):
+        super().__init__(model)
+        self.forward_model = self.model.model
+
+    def clip_max_len(self, maxlen: int) -> int:
+        """clip the max len based on the LM model max sequence length"""
+
+        # for positional embedding types that allow length extrapolation, don't clip the max length
+        if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute":
+            if maxlen > self.model.cfg.encoder_seq_length + 1:
+                maxlen = self.model.cfg.encoder_seq_length + 1
+        return maxlen
+
+    def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_attention_mask: bool):
+        """initialize the batch data before the inference steps."""
+        # Move to GPU.
+        tokenizer = self.model.tokenizer
+        tokens = context_tokens.contiguous().cuda()
+        # Get the attention mask and position ids.
+ self.attention_mask, _, self.position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eos_id, + self.model.cfg.get('reset_position_ids', False), + self.model.cfg.get('reset_attention_mask', False), + self.model.cfg.get('eod_mask_loss', False), + compute_attention_mask=compute_attention_mask, + ) + self.attention_mask = None + def prepare_batch_at_step( + self, + tokens: torch.Tensor, + maxlen: int, + micro_batch_size: int, + step: int, + context_length: int, + compute_attention_mask: bool = False, + ) -> Tuple[List[torch.Tensor], List[int]]: + """ + generate the batch used in inference for each of the steps + """ + # types2use = None + # Allocate memory for the entire context. + + tokens2use = tokens + + """Prepare batch for each of the inference steps""" + attention_mask_repeat = None + + batch = [tokens2use, attention_mask_repeat, self.position_ids] + tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size] + return batch, (tensor_shape, context_length) + + def forward_step(self, batch, tensor_shape_and_context_length): + tensor_shape, context_length = tensor_shape_and_context_length + fwd_bwd_function = get_forward_backward_func() + + output_tensor = fwd_bwd_function( + forward_step_func=self.model.get_forward_output_only_func(), + data_iterator=iter( + [ + batch, + ] + ), + model=[self.forward_model], + num_microbatches=get_num_microbatches(), + forward_only=True, + seq_length=tensor_shape[0], + micro_batch_size=tensor_shape[1], + ) + + output_tensor[0]['logits'] = output_tensor[0]['logits'][:, :context_length, :] + return output_tensor + class GriffinModelTextGenerationStrategy(TextGenerationStrategy): def __init__(self, model): super().__init__(model) @@ -981,6 +1055,7 @@ def model_inference_strategy_dispatcher(model, **args): MegatronGPTPromptLearningModel, ) from nemo.collections.nlp.models.language_modeling.megatron_griffin_model import MegatronGriffinModel + from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel from nemo.collections.nlp.modules.common.retro_inference_strategies import ( @@ -991,6 +1066,8 @@ def model_inference_strategy_dispatcher(model, **args): if isinstance(model, MegatronGriffinModel): return GriffinModelTextGenerationStrategy(model) + if isinstance(model, MegatronMambaModel): + return MambaModelTextGenerationStrategy(model) if isinstance(model, MegatronNevaModel): return NevaModelTextGenerationStrategy(model) if isinstance(model, MegatronGPTPromptLearningModel): diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 7d294f6085bb..f4b0b7804e2b 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -178,9 +178,10 @@ def _check_and_add_peft_cfg(self, peft_cfg): for layer in layers: if layer.layer_number in (layer_selection or list(range(1, self.cfg.num_layers + 1))): for name, module in layer.named_modules(): - self._check_and_add_adapter( - name, module, adapter_name, adapter_cfg, name_key_to_mcore_mixins - ) + if not isinstance(module, IdentityOp): + self._check_and_add_adapter( + name, module, adapter_name, adapter_cfg, name_key_to_mcore_mixins + ) else: # Non GPT models, as well as GPT+PTuning do not support layer 
selection if layer_selection is not None: diff --git a/scripts/checkpoint_converters/convert_jamba.py b/scripts/checkpoint_converters/convert_jamba.py new file mode 100644 index 000000000000..ac61c02c318c --- /dev/null +++ b/scripts/checkpoint_converters/convert_jamba.py @@ -0,0 +1,248 @@ +import os +from argparse import ArgumentParser + +import torch +from omegaconf.omegaconf import OmegaConf +from transformers import AutoModelForCausalLM + +from NeMo.nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronJambaModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + +''' +CUDA_VISIBLE_DEVICES="0" python /home/ataghibakhsh/NeMo/scripts/checkpoint_converters/convert_jamba_hf_to_nemo.py --output_path /home/ataghibakhsh/forks/full_jamba.nemo +''' + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--hparams_file", + type=str, + default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_jamba_config.yaml", + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument("--input_name_or_path", type=str, default="ai21labs/Jamba-v0.1") + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" + ) + args = parser.parse_args() + return args + + +def convert(args): + + nemo_config = OmegaConf.load(args.hparams_file) + nemo_config.trainer["precision"] = args.precision + nemo_config.model.tokenizer.type = "ai21labs/Jamba-v0.1" + # nemo_config.model.num_attention_heads=8 + # nemo_config.model.num_query_groups=8 + # nemo_config.model.hidden_size=32 + # nemo_config.model.ffn_hidden_size=112 + # nemo_config.model.num_moe_experts=16 + + nemo_config.model.use_cpu_initialization = True + # print(nemo_config) + # import sys + # sys.exit() + from transformers import AutoConfig, AutoModelForCausalLM + + config = AutoConfig.from_pretrained("ai21labs/Jamba-v0.1") + nemo_config.model.hybrid_override_pattern = "M-MOM-MO*-MOM-MO" * 4 + + # config.hidden_size = int(config.hidden_size / 128) + # config.intermediate_size = int(config.intermediate_size / 128) + # config.num_attention_heads = int(config.num_attention_heads/4) + # config.num_key_value_heads = 8 + # import math + # config.mamba_dt_rank = math.ceil(config.hidden_size / 16) + + # hf_model = AutoModelForCausalLM.from_config(config)#.to("cuda") + # import sys + # sys.exit() + + logging.info(f"Loading checkpoint from HF: `{args.input_name_or_path}`") + hf_model = AutoModelForCausalLM.from_pretrained( + args.input_name_or_path, trust_remote_code=True + ) # , force_download=True) + + trainer = MegatronLMPPTrainerBuilder(nemo_config).create_trainer() + nemo_config.model.use_cpu_initialization = True + nemo_model_from_hf = MegatronJambaModel(nemo_config.model, trainer) + # print(nemo_model_from_hf.state_dict().keys()) + # import sys + # sys.exit() + new_state_dict = {} + + new_state_dict['model.embedding.word_embeddings.weight'] = hf_model.state_dict()['model.embed_tokens.weight'] + new_state_dict['model.decoder.final_norm.weight'] = 
hf_model.state_dict()['model.final_layernorm.weight'] + new_state_dict['model.output_layer.weight'] = hf_model.state_dict()['lm_head.weight'] + for i, symb in enumerate(nemo_model_from_hf.hybrid_override_pattern): + hf_jamba_layer = int(i / 2) + if symb == "M": + + new_state_dict[f'model.decoder.layers.{i}.mixer.A_log'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.A_log' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.D'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.D' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.conv1d.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.conv1d.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.conv1d.bias'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.conv1d.bias' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.in_proj.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.in_proj.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.x_proj.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.x_proj.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.dt_proj.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.dt_proj.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.dt_proj.bias'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.dt_proj.bias' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.out_proj.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.out_proj.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.dt_layernorm.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.dt_layernorm.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.b_layernorm.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.b_layernorm.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mixer.c_layernorm.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.mamba.c_layernorm.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.norm.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.input_layernorm.weight' + ] + if symb == "*": + + new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_weight'] = ( + hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.input_layernorm.weight'] + ) + + new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_proj.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.self_attn.o_proj.weight' + ] + hidden_size = config.hidden_size + head_num = config.num_attention_heads + head_size = hidden_size // head_num + num_query_groups = config.num_key_value_heads + + old_tensor_shape = hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.self_attn.q_proj.weight'].size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.self_attn.q_proj.weight'].view( + *new_q_tensor_shape + ) + k = hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.self_attn.k_proj.weight'].view( + *new_kv_tensor_shape + ) + v = hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.self_attn.v_proj.weight'].view( + *new_kv_tensor_shape + ) + + qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) # .cuda() + heads_per_group = head_num // num_query_groups + for count in range(num_query_groups): + qkv_weights = torch.cat( + 
(qkv_weights, q[count * heads_per_group : (count + 1) * heads_per_group, :, :]) + ) + qkv_weights = torch.cat((qkv_weights, k[count : count + 1, :, :])) + qkv_weights = torch.cat((qkv_weights, v[count : count + 1, :, :])) + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + param_to_weights = lambda param: param.float() + new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_qkv.weight'] = param_to_weights( + qkv_weights + ) + + new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_proj._extra_state'] = ( + nemo_model_from_hf.state_dict()[f'model.decoder.layers.{i}.self_attention.linear_proj._extra_state'] + ) + new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_qkv._extra_state'] = ( + nemo_model_from_hf.state_dict()[f'model.decoder.layers.{i}.self_attention.linear_qkv._extra_state'] + ) + if symb == "-": + new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc1.layer_norm_weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.pre_ff_layernorm.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc1.weight'] = torch.cat( + [ + hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.feed_forward.gate_proj.weight'], + hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.feed_forward.up_proj.weight'], + ] + ) + new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc2.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.feed_forward.down_proj.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc1._extra_state'] = nemo_model_from_hf.state_dict()[ + f'model.decoder.layers.{i}.mlp.linear_fc1._extra_state' + ] + new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc2._extra_state'] = nemo_model_from_hf.state_dict()[ + f'model.decoder.layers.{i}.mlp.linear_fc2._extra_state' + ] + if symb == "O": + new_state_dict[f'model.decoder.layers.{i}.mlp.input_layernorm.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.pre_ff_layernorm.weight' + ] + new_state_dict[f'model.decoder.layers.{i}.mlp.router.weight'] = hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.feed_forward.router.weight' + ] + for j in range(nemo_config.model.num_moe_experts): + new_state_dict[f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc1.weight'] = ( + torch.cat( + [ + hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.feed_forward.experts.{j}.gate_proj.weight' + ], + hf_model.state_dict()[ + f'model.layers.{hf_jamba_layer}.feed_forward.experts.{j}.up_proj.weight' + ], + ] + ) + ) + new_state_dict[f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc2.weight'] = ( + hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.feed_forward.experts.{j}.down_proj.weight'] + ) + new_state_dict[ + f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc1._extra_state' + ] = nemo_model_from_hf.state_dict()[ + f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc1._extra_state' + ] + new_state_dict[ + f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc2._extra_state' + ] = nemo_model_from_hf.state_dict()[ + f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc2._extra_state' + ] + + nemo_model_from_hf.load_state_dict(new_state_dict, strict=True) + dtype = torch_dtype_from_precision(args.precision) + nemo_model_from_hf = nemo_model_from_hf.to(dtype=dtype) + + inpt = torch.randint(10, (1, 10)) # .cuda() + + # out_pyt = hf_model.forward(inpt) + # out_nemo = nemo_model_from_hf.forward(inpt) + # 
print(f"out_pyt = {out_pyt}") + # print(f"out_nemo = {out_nemo}") + + import sys + + sys.exit() + nemo_model_from_hf.save_to(args.output_path) + logging.info(f'Jamba NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) \ No newline at end of file diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py new file mode 100644 index 000000000000..37cabd4a5a98 --- /dev/null +++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py @@ -0,0 +1,159 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from argparse import ArgumentParser +from collections import defaultdict +import torch +from omegaconf.omegaconf import OmegaConf +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils import logging + +''' +Example + +CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py + --input_name_or_path PATH_TO_PYTORCH_WEIGHTS + --output_path OUTPUT_PATH.nemo + --ngroups_mamba 8 + --precision 32 +''' + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--hparams_file", + type=str, + default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_jamba_config.yaml", + required=False, + help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", + ) + parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") + parser.add_argument( + "--input_name_or_path", + type=str, + required=True, + ) + parser.add_argument("--ngroups_mamba", type=int, default=8, help="ngroups for Mamba model") + parser.add_argument( + "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" + ) + args = parser.parse_args() + return args + + +def convert(args): + + checkpoint_weights = torch.load(args.input_name_or_path, map_location='cpu')['model'] + new_state_dict = {} + + if 'backbone' in list(checkpoint_weights.keys())[0]: + + layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'backbone\.layers\.\d+\.', key)] + layer_numbers = set(int(re.search(r'backbone\.layers\.(\d+)\.', key).group(1)) for key in layer_keys) + num_layers = max(layer_numbers) + 1 + + direct_mappings = { + 'model.embedding.word_embeddings.weight': 'backbone.embedding.weight', + 'model.decoder.final_norm.weight': 'backbone.norm_f.weight', + 'model.output_layer.weight': 'lm_head.weight', + } + + for new_key, old_key in direct_mappings.items(): + new_state_dict[new_key] = checkpoint_weights[old_key] + + layer_attributes = [ + 'mixer.A_log', + 'mixer.D', + 'mixer.conv1d.weight', + 'mixer.conv1d.bias', + 'mixer.in_proj.weight', + 'mixer.dt_bias', + 'mixer.out_proj.weight', + 'mixer.norm.weight', + 'norm.weight', + ] + + for i in range(num_layers): + for attr in layer_attributes: + new_key = f'model.decoder.layers.{i}.{attr}' + old_key = f'backbone.layers.{i}.{attr}' + new_state_dict[new_key] = checkpoint_weights[old_key] + + else: + + layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'decoder\.layers\.\d+\.', key)] + layer_numbers = set(int(re.search(r'decoder\.layers\.(\d+)\.', key).group(1)) for key in layer_keys) + num_layers = max(layer_numbers) + 1 + + new_state_dict = {"model." + key: value for key, value in checkpoint_weights.items()} + + layers = defaultdict(list) + + for key in new_state_dict.keys(): + match = re.match(r'model\.decoder\.layers\.(\d+)\.(\w+)', key) + if match: + index, layer_type = match.groups() + layers[index].append(layer_type) + + layer_pattern = '' + for i in range(max(map(int, layers.keys())) + 1): + index_str = str(i) + layer_types = layers.get(index_str, []) + if 'mixer' in layer_types: + layer_pattern += 'M' + elif 'self_attention' in layer_types: + layer_pattern += '*' + elif 'mlp' in layer_types: + layer_pattern += '-' + else: + AssertionError("Layer not found. 
Each layer must be eiher MLP, Mamba, or Attention") + + nemo_config = OmegaConf.load(args.hparams_file) + nemo_config.trainer["precision"] = args.precision + nemo_config.model.vocab_size, nemo_config.model.hidden_size = new_state_dict[ + 'model.embedding.word_embeddings.weight' + ].shape + nemo_config.model.num_layers = num_layers + nemo_config.model.hybrid_override_pattern = layer_pattern + nemo_config.model.ngroups_mamba = args.ngroups_mamba + + if "-" in layer_pattern: + nemo_config.model.ffn_hidden_size = new_state_dict[ + f'model.decoder.layers.{layer_pattern.index("-")}.mlp.linear_fc1.weight' + ].shape[0] + else: + nemo_config.model.ffn_hidden_size = nemo_config.model.hidden_size + + nemo_config.model.use_cpu_initialization = True + + logging.info(f"Loading Mamba2 Pytorch checkpoint : `{args.input_name_or_path}`") + + trainer = MegatronLMPPTrainerBuilder(nemo_config).create_trainer() + nemo_model_from_pyt = MegatronMambaModel(nemo_config.model, trainer) + + nemo_model_from_pyt.load_state_dict(new_state_dict, strict=True) + dtype = torch_dtype_from_precision(args.precision) + nemo_model_from_pyt = nemo_model_from_pyt.to(dtype=dtype) + nemo_model_from_pyt.save_to(args.output_path) + logging.info(f'Mamba2 NeMo model saved to: {args.output_path}') + + +if __name__ == '__main__': + args = get_args() + convert(args) \ No newline at end of file From 73d7c4c1d35dcb8c388350d8e2da4be722f874cb Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Mon, 1 Jul 2024 07:40:38 -0700 Subject: [PATCH 02/21] fix import mixins --- nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index f4b0b7804e2b..d46b7232f38f 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -18,7 +18,7 @@ import torch from omegaconf import DictConfig, OmegaConf, open_dict - +from megatron.core.transformer.identity_op import IdentityOp from nemo.utils.model_utils import inject_model_parallel_rank try: From 66886b539d5b2ba1bd14fd520ba19fad4c87eacc Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Mon, 1 Jul 2024 07:43:06 -0700 Subject: [PATCH 03/21] rm convert jamba --- .../checkpoint_converters/convert_jamba.py | 248 ------------------ 1 file changed, 248 deletions(-) delete mode 100644 scripts/checkpoint_converters/convert_jamba.py diff --git a/scripts/checkpoint_converters/convert_jamba.py b/scripts/checkpoint_converters/convert_jamba.py deleted file mode 100644 index ac61c02c318c..000000000000 --- a/scripts/checkpoint_converters/convert_jamba.py +++ /dev/null @@ -1,248 +0,0 @@ -import os -from argparse import ArgumentParser - -import torch -from omegaconf.omegaconf import OmegaConf -from transformers import AutoModelForCausalLM - -from NeMo.nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronJambaModel -from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder -from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision -from nemo.utils import logging - -''' -CUDA_VISIBLE_DEVICES="0" python /home/ataghibakhsh/NeMo/scripts/checkpoint_converters/convert_jamba_hf_to_nemo.py --output_path /home/ataghibakhsh/forks/full_jamba.nemo -''' - - -def get_args(): - parser = ArgumentParser() - parser.add_argument( - "--hparams_file", - type=str, - 
default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_jamba_config.yaml", - required=False, - help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", - ) - parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") - parser.add_argument("--input_name_or_path", type=str, default="ai21labs/Jamba-v0.1") - parser.add_argument( - "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved" - ) - args = parser.parse_args() - return args - - -def convert(args): - - nemo_config = OmegaConf.load(args.hparams_file) - nemo_config.trainer["precision"] = args.precision - nemo_config.model.tokenizer.type = "ai21labs/Jamba-v0.1" - # nemo_config.model.num_attention_heads=8 - # nemo_config.model.num_query_groups=8 - # nemo_config.model.hidden_size=32 - # nemo_config.model.ffn_hidden_size=112 - # nemo_config.model.num_moe_experts=16 - - nemo_config.model.use_cpu_initialization = True - # print(nemo_config) - # import sys - # sys.exit() - from transformers import AutoConfig, AutoModelForCausalLM - - config = AutoConfig.from_pretrained("ai21labs/Jamba-v0.1") - nemo_config.model.hybrid_override_pattern = "M-MOM-MO*-MOM-MO" * 4 - - # config.hidden_size = int(config.hidden_size / 128) - # config.intermediate_size = int(config.intermediate_size / 128) - # config.num_attention_heads = int(config.num_attention_heads/4) - # config.num_key_value_heads = 8 - # import math - # config.mamba_dt_rank = math.ceil(config.hidden_size / 16) - - # hf_model = AutoModelForCausalLM.from_config(config)#.to("cuda") - # import sys - # sys.exit() - - logging.info(f"Loading checkpoint from HF: `{args.input_name_or_path}`") - hf_model = AutoModelForCausalLM.from_pretrained( - args.input_name_or_path, trust_remote_code=True - ) # , force_download=True) - - trainer = MegatronLMPPTrainerBuilder(nemo_config).create_trainer() - nemo_config.model.use_cpu_initialization = True - nemo_model_from_hf = MegatronJambaModel(nemo_config.model, trainer) - # print(nemo_model_from_hf.state_dict().keys()) - # import sys - # sys.exit() - new_state_dict = {} - - new_state_dict['model.embedding.word_embeddings.weight'] = hf_model.state_dict()['model.embed_tokens.weight'] - new_state_dict['model.decoder.final_norm.weight'] = hf_model.state_dict()['model.final_layernorm.weight'] - new_state_dict['model.output_layer.weight'] = hf_model.state_dict()['lm_head.weight'] - for i, symb in enumerate(nemo_model_from_hf.hybrid_override_pattern): - hf_jamba_layer = int(i / 2) - if symb == "M": - - new_state_dict[f'model.decoder.layers.{i}.mixer.A_log'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.A_log' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.D'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.D' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.conv1d.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.conv1d.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.conv1d.bias'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.conv1d.bias' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.in_proj.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.in_proj.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.x_proj.weight'] = hf_model.state_dict()[ - 
f'model.layers.{hf_jamba_layer}.mamba.x_proj.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.dt_proj.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.dt_proj.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.dt_proj.bias'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.dt_proj.bias' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.out_proj.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.out_proj.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.dt_layernorm.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.dt_layernorm.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.b_layernorm.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.b_layernorm.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.mixer.c_layernorm.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.mamba.c_layernorm.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.norm.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.input_layernorm.weight' - ] - if symb == "*": - - new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_qkv.layer_norm_weight'] = ( - hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.input_layernorm.weight'] - ) - - new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_proj.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.self_attn.o_proj.weight' - ] - hidden_size = config.hidden_size - head_num = config.num_attention_heads - head_size = hidden_size // head_num - num_query_groups = config.num_key_value_heads - - old_tensor_shape = hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.self_attn.q_proj.weight'].size() - new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] - new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] - - q = hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.self_attn.q_proj.weight'].view( - *new_q_tensor_shape - ) - k = hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.self_attn.k_proj.weight'].view( - *new_kv_tensor_shape - ) - v = hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.self_attn.v_proj.weight'].view( - *new_kv_tensor_shape - ) - - qkv_weights = torch.empty((0, head_size) + old_tensor_shape[1:]) # .cuda() - heads_per_group = head_num // num_query_groups - for count in range(num_query_groups): - qkv_weights = torch.cat( - (qkv_weights, q[count * heads_per_group : (count + 1) * heads_per_group, :, :]) - ) - qkv_weights = torch.cat((qkv_weights, k[count : count + 1, :, :])) - qkv_weights = torch.cat((qkv_weights, v[count : count + 1, :, :])) - qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) - - param_to_weights = lambda param: param.float() - new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_qkv.weight'] = param_to_weights( - qkv_weights - ) - - new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_proj._extra_state'] = ( - nemo_model_from_hf.state_dict()[f'model.decoder.layers.{i}.self_attention.linear_proj._extra_state'] - ) - new_state_dict[f'model.decoder.layers.{i}.self_attention.linear_qkv._extra_state'] = ( - nemo_model_from_hf.state_dict()[f'model.decoder.layers.{i}.self_attention.linear_qkv._extra_state'] - ) - if symb == "-": - new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc1.layer_norm_weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.pre_ff_layernorm.weight' - ] - 
new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc1.weight'] = torch.cat( - [ - hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.feed_forward.gate_proj.weight'], - hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.feed_forward.up_proj.weight'], - ] - ) - new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc2.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.feed_forward.down_proj.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc1._extra_state'] = nemo_model_from_hf.state_dict()[ - f'model.decoder.layers.{i}.mlp.linear_fc1._extra_state' - ] - new_state_dict[f'model.decoder.layers.{i}.mlp.linear_fc2._extra_state'] = nemo_model_from_hf.state_dict()[ - f'model.decoder.layers.{i}.mlp.linear_fc2._extra_state' - ] - if symb == "O": - new_state_dict[f'model.decoder.layers.{i}.mlp.input_layernorm.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.pre_ff_layernorm.weight' - ] - new_state_dict[f'model.decoder.layers.{i}.mlp.router.weight'] = hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.feed_forward.router.weight' - ] - for j in range(nemo_config.model.num_moe_experts): - new_state_dict[f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc1.weight'] = ( - torch.cat( - [ - hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.feed_forward.experts.{j}.gate_proj.weight' - ], - hf_model.state_dict()[ - f'model.layers.{hf_jamba_layer}.feed_forward.experts.{j}.up_proj.weight' - ], - ] - ) - ) - new_state_dict[f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc2.weight'] = ( - hf_model.state_dict()[f'model.layers.{hf_jamba_layer}.feed_forward.experts.{j}.down_proj.weight'] - ) - new_state_dict[ - f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc1._extra_state' - ] = nemo_model_from_hf.state_dict()[ - f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc1._extra_state' - ] - new_state_dict[ - f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc2._extra_state' - ] = nemo_model_from_hf.state_dict()[ - f'model.decoder.layers.{i}.mlp.experts.local_experts.{j}.linear_fc2._extra_state' - ] - - nemo_model_from_hf.load_state_dict(new_state_dict, strict=True) - dtype = torch_dtype_from_precision(args.precision) - nemo_model_from_hf = nemo_model_from_hf.to(dtype=dtype) - - inpt = torch.randint(10, (1, 10)) # .cuda() - - # out_pyt = hf_model.forward(inpt) - # out_nemo = nemo_model_from_hf.forward(inpt) - # print(f"out_pyt = {out_pyt}") - # print(f"out_nemo = {out_nemo}") - - import sys - - sys.exit() - nemo_model_from_hf.save_to(args.output_path) - logging.info(f'Jamba NeMo model saved to: {args.output_path}') - - -if __name__ == '__main__': - args = get_args() - convert(args) \ No newline at end of file From f9e2066c492b8a4161a81bdd51048f3bf2c4d82a Mon Sep 17 00:00:00 2001 From: JRD971000 Date: Mon, 1 Jul 2024 14:44:23 +0000 Subject: [PATCH 04/21] Apply isort and black reformatting Signed-off-by: JRD971000 --- examples/nlp/language_modeling/mamba_change_num_partition.py | 2 +- examples/nlp/language_modeling/megatron_mamba_generate.py | 2 +- .../nlp/modules/common/text_generation_strategy.py | 4 +++- nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py | 3 ++- scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py | 2 +- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py index fd2433d636ab..a2bdc9667675 100644 --- 
a/examples/nlp/language_modeling/mamba_change_num_partition.py +++ b/examples/nlp/language_modeling/mamba_change_num_partition.py @@ -326,7 +326,7 @@ def write_tp_pp_split(model, splits, app_state, tp_size, pp_rank, write_path): raise RuntimeError( f"Can not handle parameter {name}, required shape: {param.shape}, split shape: {split_val.shape}." ) - + param.data = split_val idx += 1 diff --git a/examples/nlp/language_modeling/megatron_mamba_generate.py b/examples/nlp/language_modeling/megatron_mamba_generate.py index 36bbef30069e..54621fd1a28b 100644 --- a/examples/nlp/language_modeling/megatron_mamba_generate.py +++ b/examples/nlp/language_modeling/megatron_mamba_generate.py @@ -66,4 +66,4 @@ def main(cfg) -> None: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index c6c72868f2b2..1e0a8b6a8f28 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -333,6 +333,7 @@ def prepare_batch_at_step( tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size] return batch, tensor_shape + class MambaModelTextGenerationStrategy(TextGenerationStrategy): def __init__(self, model): super().__init__(model) @@ -407,7 +408,8 @@ def forward_step(self, batch, tensor_shape_and_context_length): output_tensor[0]['logits'] = output_tensor[0]['logits'][:, :context_length, :] return output_tensor - + + class GriffinModelTextGenerationStrategy(TextGenerationStrategy): def __init__(self, model): super().__init__(model) diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index d46b7232f38f..34ca175470ab 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -17,8 +17,9 @@ from typing import List, Optional, Union import torch -from omegaconf import DictConfig, OmegaConf, open_dict from megatron.core.transformer.identity_op import IdentityOp +from omegaconf import DictConfig, OmegaConf, open_dict + from nemo.utils.model_utils import inject_model_parallel_rank try: diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py index 37cabd4a5a98..ddc24a1fa95a 100644 --- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py @@ -156,4 +156,4 @@ def convert(args): if __name__ == '__main__': args = get_args() - convert(args) \ No newline at end of file + convert(args) From 96ab05c16e49a32953aeaa74f232df421820db3d Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Tue, 2 Jul 2024 08:04:36 -0700 Subject: [PATCH 05/21] more cleanups --- .../conf/megatron_mamba_config.yaml | 1 + .../megatron_mamba_finetuning_config.yaml | 1 + .../conf/megatron_mamba_generate_config.yaml | 1 + .../mamba_change_num_partition.py | 85 ++----------------- .../megatron_mamba_finetuning.py | 6 +- .../megatron_mamba_generate.py | 2 +- ...odel.py.py => megatron_mamba_sft_model.py} | 0 7 files changed, 13 insertions(+), 83 deletions(-) rename nemo/collections/nlp/models/language_modeling/{megatron_mamba_sft_model.py.py => megatron_mamba_sft_model.py} (100%) diff --git a/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml 
b/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml index 2c9a64bc5f04..4a720309031a 100644 --- a/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_mamba_config.yaml @@ -87,6 +87,7 @@ model: vocab_file: null merge_file: null sentencepiece_legacy: False + use_fast: True # Distributed checkpoint setup dist_ckpt_format: 'zarr' # Set to 'torch_dist' to use PyTorch distributed checkpoint format. diff --git a/examples/nlp/language_modeling/conf/megatron_mamba_finetuning_config.yaml b/examples/nlp/language_modeling/conf/megatron_mamba_finetuning_config.yaml index 5b9d3517f44b..3684b61bb186 100644 --- a/examples/nlp/language_modeling/conf/megatron_mamba_finetuning_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_mamba_finetuning_config.yaml @@ -106,6 +106,7 @@ model: vocab_file: null merge_file: null sentencepiece_legacy: False + use_fast: True # precision native_amp_init_scale: 4294967296 # 2 ** 32 diff --git a/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml b/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml index 9b00dff6f32f..35b42e61788f 100644 --- a/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml @@ -89,6 +89,7 @@ model: vocab_file: null merge_file: null sentencepiece_legacy: False + use_fast: True # precision diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py index fd2433d636ab..0a595f17998b 100644 --- a/examples/nlp/language_modeling/mamba_change_num_partition.py +++ b/examples/nlp/language_modeling/mamba_change_num_partition.py @@ -38,7 +38,7 @@ """ Usage: -### Tensor Parallelism and Pipeline Parallelism conversion ### +### Tensor Parallelism conversion ### # Megatron Mamba python /home/ataghibakhsh/NeMo/examples/nlp/language_modeling/mamba_change_num_partition.py \ @@ -46,8 +46,11 @@ --target_file=/home/ataghibakhsh/TP4-ADLR-mamba-hybrid/mamba2-TP4.nemo \ --tensor_model_parallel_size=1 \ --target_tensor_model_parallel_size=4 \ - --precision=bf16 - + --precision=bf16 \ + --d_model=4096 \ + --mamba_version=2 \ + --mamba2_n_groups=8 \ + --mamba2_head_dim=64 """ @@ -221,82 +224,6 @@ def restore_model_config(cfg, original_dict): return cfg -################# -### Utilities ### -################# - - -def compute_tp_splits( - param_name, param, partitions, global_idx, tp_size, pp_size, pp_rank, pp_split_rank, megatron_legacy, model_cfg -): - """ - Function to compute the splits required for tensor-parallelism. - - Args: - param_name: Name of the current parameter of the current model (TP X PP Y) - param: Value of the current parameter of the current compute_tp_splitsmodel (TP X PP Y) - partitions: Partitions of the flattened parameter of the current model (TP 1 PP 1) - global_idx: The index used to select the parameter in the global partition. - tp_size: Int, tensor-parallelism size. - pp_size: Int, pipeline-parallelism size. - pp_rank: Int, pipeline-parallelism rank. - pp_split_rank: Int, pipeline-parallelism split rank. This should be > 1 if TP is being used with EncDec models (T5) - megatron_legacy: Bool, whether the model is a legacy Megatron model or not. - model_cfg: The model config as a OmegaConf DictConfig. - - Returns: - List of torch tensors, each of which is a split of the current parameter. 
- """ - # alias the global index to idx - idx = global_idx - - fast_glu_activation = str(model_cfg.get('activation', '')).lower() in ['fast-geglu', 'fast-swiglu', 'fast-reglu'] - - if param.shape == partitions[0][idx].shape: - split = [partitions[0][idx].data] * tp_size - logging.debug(">> Perfect match, no splitting needed") - elif param.shape[0] == partitions[0][idx].shape[0]: - split = torch.split(partitions[0][idx].data, param.shape[-1], dim=-1) - else: - # For T5-converted weights, the splitting needs to be strided such that q,k,v weights are bunched together on each tensor-parallel rank. - if '.query_key_value.' in param_name and megatron_legacy: # weight or bias - split_dim = partitions[0][idx].data.shape[0] - if split_dim % (tp_size * 3) != 0: - raise ValueError( - f"Can not split Q,K,V parameter {param_name} with shape {param.shape} into tensor parallel size {tp_size}. Not divisible by {tp_size * 3}." - ) - tp_qkv_splits = torch.chunk(partitions[0][idx].data, tp_size * 3, dim=0) - split = [] - for i in range(tp_size): - tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 3, tp_size)]) - split.append(tp_qkv) - elif '.key_value.' in param_name and megatron_legacy: # weight or bias - split_dim = partitions[0][idx].data.shape[0] - if split_dim % (tp_size * 2) != 0: - raise ValueError( - f"Can not split K,V parameter {param_name} with shape {param.shape} into tensor parallel size {tp_size}. Not divisible by {tp_size * 2}." - ) - tp_qkv_splits = torch.chunk(partitions[0][idx].data, tp_size * 2, dim=0) - split = [] - for i in range(tp_size): - tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 2, tp_size)]) - split.append(tp_qkv) - elif ('dense_h_to_4h' in param_name or 'linear_fc1' in param_name) and fast_glu_activation: - # For Megatron GPT model with Fast Glu activation - # Handle gated linear units - # concat all the first halves ('W's) and all the second halves ('V's) - w_split, k_split = torch.chunk(partitions[0][idx].data, 2, dim=0) - w_split = torch.chunk(w_split, tp_size, dim=0) - k_split = torch.chunk(k_split, tp_size, dim=0) - split = [torch.cat(weights, dim=0) for weights in zip(w_split, k_split)] # split per tp rank - - # Regular split for Megatron and NeMo-Megatron models. - else: - split = torch.split(partitions[0][idx].data, param.shape[0], dim=0) - - return split - - def write_tp_pp_split(model, splits, app_state, tp_size, pp_rank, write_path): """ Function to write the given TP PP split to NeMo File. 
diff --git a/examples/nlp/language_modeling/megatron_mamba_finetuning.py b/examples/nlp/language_modeling/megatron_mamba_finetuning.py index 4953ea747ae2..0613ef486ec3 100644 --- a/examples/nlp/language_modeling/megatron_mamba_finetuning.py +++ b/examples/nlp/language_modeling/megatron_mamba_finetuning.py @@ -25,7 +25,7 @@ mp.set_start_method("spawn", force=True) -@hydra_runner(config_path="conf", config_name="megatron_jamba_finetuning_config") +@hydra_runner(config_path="conf", config_name="megatron_mamba_finetuning_config") def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") @@ -37,8 +37,8 @@ def main(cfg) -> None: cfg.trainer.precision = precision exp_manager(trainer, cfg.exp_manager) - model_cfg = MegatronJambaSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) - model = MegatronJambaSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) + model_cfg = MegatronMambaSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) + model = MegatronMambaSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer) peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] diff --git a/examples/nlp/language_modeling/megatron_mamba_generate.py b/examples/nlp/language_modeling/megatron_mamba_generate.py index 36bbef30069e..043a39979de6 100644 --- a/examples/nlp/language_modeling/megatron_mamba_generate.py +++ b/examples/nlp/language_modeling/megatron_mamba_generate.py @@ -27,7 +27,7 @@ mp.set_start_method("spawn", force=True) -@hydra_runner(config_path="conf", config_name="megatron_jamba_generate_config") +@hydra_runner(config_path="conf", config_name="megatron_mamba_generate_config") def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f"\n{OmegaConf.to_yaml(cfg)}") diff --git a/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py.py b/nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py similarity index 100% rename from nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py.py rename to nemo/collections/nlp/models/language_modeling/megatron_mamba_sft_model.py From 2e74b64e768f04b8dc86c9adbdc9ab09aa91640d Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Tue, 2 Jul 2024 09:50:23 -0700 Subject: [PATCH 06/21] use GPT text gen --- .../mamba_change_num_partition.py | 29 +++---- .../common/text_generation_strategy.py | 77 ------------------- 2 files changed, 15 insertions(+), 91 deletions(-) diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py index 8b54a18d71de..5ade5c653852 100644 --- a/examples/nlp/language_modeling/mamba_change_num_partition.py +++ b/examples/nlp/language_modeling/mamba_change_num_partition.py @@ -18,7 +18,7 @@ import tempfile from argparse import ArgumentParser from typing import Dict, List - +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel import torch import torch.nn as nn from omegaconf import OmegaConf, open_dict @@ -47,11 +47,10 @@ --tensor_model_parallel_size=1 \ --target_tensor_model_parallel_size=4 \ --precision=bf16 \ - --d_model=4096 \ - --mamba_version=2 \ - --mamba2_n_groups=8 \ - --mamba2_head_dim=64 - + --d-model=4096 \ + --mamba-version=2 \ + --mamba2-n-groups=8 \ + --mamba2-head-dim=64 """ @@ -310,7 +309,10 @@ def split_tp_partition_only(args, model, original_model, tp_size, write_path=Non # This is done so that the last PP 
rank will save the last TP rank only after all other PP TP ranks are saved # The final rank will then save a new NeMo file with all other ranks inside. write_tp_pp_split(model, splits, app_state, tp_size, pp_rank=0, write_path=write_path) - + + with tarfile.open(write_path, 'r') as tar: + # Extract all contents to the specified path + tar.extractall(path=os.path.dirname(write_path)) def main(): parser = ArgumentParser() @@ -433,12 +435,11 @@ def main(): hparams_filepath = None # Import the class of the model - cls = model_utils.import_class_by_path(args.model_class) if args.model_file is None and args.model_extracted_dir is None: raise ValueError("Cannot pass model_file and model_extracted_dir as None at the same time.") - tmp_cfg = cls.restore_from( + tmp_cfg = MegatronMambaModel.restore_from( restore_path=args.model_file, trainer=Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision), map_location=torch.device("cpu"), @@ -469,7 +470,7 @@ def main(): if tp_size < 0 or pp_size < 0: logging.info(f"Loading model config from {args.model_file} to get TP and PP size") - model_config_internal = cls.restore_from( + model_config_internal = MegatronMambaModel.restore_from( restore_path=args.model_file, trainer=trainer, map_location=torch.device("cpu"), @@ -558,7 +559,7 @@ def main(): else: model_filepath = args.model_extracted_dir - tmp_cfg = cls.restore_from( + tmp_cfg = MegatronMambaModel.restore_from( restore_path=model_filepath, trainer=trainer, map_location=torch.device("cpu"), @@ -568,7 +569,7 @@ def main(): tmp_cfg, restore_dict = force_cpu_model(tmp_cfg) - model = cls.restore_from( + model = MegatronMambaModel.restore_from( restore_path=model_filepath, trainer=trainer, map_location=torch.device("cpu"), @@ -576,7 +577,7 @@ def main(): override_config_path=tmp_cfg, ) - original_model = cls.restore_from( + original_model = MegatronMambaModel.restore_from( restore_path=model_filepath, trainer=trainer, map_location=torch.device("cpu"), @@ -673,7 +674,7 @@ def main(): model.cfg, restore_dict = force_cpu_model(model.cfg) - model = cls(model.cfg, trainer) + model = MegatronMambaModel(model.cfg, trainer) model = model.to('cpu') model._save_restore_connector = NLPSaveRestoreConnector() model.freeze() diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 1e0a8b6a8f28..f1a79bacaffb 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -333,83 +333,6 @@ def prepare_batch_at_step( tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size] return batch, tensor_shape - -class MambaModelTextGenerationStrategy(TextGenerationStrategy): - def __init__(self, model): - super().__init__(model) - self.forward_model = self.model.model - - def clip_max_len(self, maxlen: int) -> int: - """clip the max len based on the LM model max sequence length""" - - # for positional embedding types that allow length extrapolation, don't clip the max length - if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": - if maxlen > self.model.cfg.encoder_seq_length + 1: - maxlen = self.model.cfg.encoder_seq_length + 1 - return maxlen - - def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_attention_mask: bool): - """initialize the batch data before the inference steps.""" - # Move to GPU. 
- tokenizer = self.model.tokenizer - tokens = context_tokens.contiguous().cuda() - # Get the attention mask and postition ids. - self.attention_mask, _, self.position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eos_id, - self.model.cfg.get('reset_position_ids', False), - self.model.cfg.get('reset_attention_mask', False), - self.model.cfg.get('eod_mask_loss', False), - compute_attention_mask=compute_attention_mask, - ) - self.attention_mask = None - - def prepare_batch_at_step( - self, - tokens: torch.Tensor, - maxlen: int, - micro_batch_size: int, - step: int, - context_length: int, - compute_attention_mask: bool = False, - ) -> Tuple[List[torch.Tensor], List[int]]: - """ - generate the batch used in inference for each of the steps - """ - # types2use = None - # Allocate memory for the entire context. - - tokens2use = tokens - - """Prepare batch for each of the inference steps""" - attention_mask_repeat = None - - batch = [tokens2use, attention_mask_repeat, self.position_ids] - tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size] - return batch, (tensor_shape, context_length) - - def forward_step(self, batch, tensor_shape_and_context_length): - tensor_shape, context_length = tensor_shape_and_context_length - fwd_bwd_function = get_forward_backward_func() - - output_tensor = fwd_bwd_function( - forward_step_func=self.model.get_forward_output_only_func(), - data_iterator=iter( - [ - batch, - ] - ), - model=[self.forward_model], - num_microbatches=get_num_microbatches(), - forward_only=True, - seq_length=tensor_shape[0], - micro_batch_size=tensor_shape[1], - ) - - output_tensor[0]['logits'] = output_tensor[0]['logits'][:, :context_length, :] - return output_tensor - - class GriffinModelTextGenerationStrategy(TextGenerationStrategy): def __init__(self, model): super().__init__(model) From 05c377a3b34c462be6876dfb6cba62b1c47ff93c Mon Sep 17 00:00:00 2001 From: JRD971000 Date: Tue, 2 Jul 2024 16:51:13 +0000 Subject: [PATCH 07/21] Apply isort and black reformatting Signed-off-by: JRD971000 --- .../nlp/language_modeling/mamba_change_num_partition.py | 6 ++++-- .../nlp/modules/common/text_generation_strategy.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py index 5ade5c653852..10f974c0582c 100644 --- a/examples/nlp/language_modeling/mamba_change_num_partition.py +++ b/examples/nlp/language_modeling/mamba_change_num_partition.py @@ -18,12 +18,13 @@ import tempfile from argparse import ArgumentParser from typing import Dict, List -from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel + import torch import torch.nn as nn from omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel from nemo.collections.nlp.parts.nlp_overrides import ( NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, GradScaler, @@ -309,11 +310,12 @@ def split_tp_partition_only(args, model, original_model, tp_size, write_path=Non # This is done so that the last PP rank will save the last TP rank only after all other PP TP ranks are saved # The final rank will then save a new NeMo file with all other ranks inside. 
write_tp_pp_split(model, splits, app_state, tp_size, pp_rank=0, write_path=write_path) - + with tarfile.open(write_path, 'r') as tar: # Extract all contents to the specified path tar.extractall(path=os.path.dirname(write_path)) + def main(): parser = ArgumentParser() parser.add_argument("--model_file", type=str, default=None, required=False, help="Path to source .nemo file") diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index f1a79bacaffb..ebd2d02c9350 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -333,6 +333,7 @@ def prepare_batch_at_step( tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size] return batch, tensor_shape + class GriffinModelTextGenerationStrategy(TextGenerationStrategy): def __init__(self, model): super().__init__(model) From 59f176a313ab1ac89d59ad3a9adcfa0d5c4d526c Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Tue, 2 Jul 2024 12:19:39 -0700 Subject: [PATCH 08/21] fixing gbs in TP convetor --- .../mamba_change_num_partition.py | 22 +++++++++++-------- .../convert_mamba2_pyt_to_nemo.py | 10 ++++----- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py index 5ade5c653852..5bb0c5ace8dc 100644 --- a/examples/nlp/language_modeling/mamba_change_num_partition.py +++ b/examples/nlp/language_modeling/mamba_change_num_partition.py @@ -13,15 +13,12 @@ # limitations under the License. import os -import shutil import tarfile import tempfile from argparse import ArgumentParser -from typing import Dict, List from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel import torch -import torch.nn as nn -from omegaconf import OmegaConf, open_dict +from omegaconf import open_dict from pytorch_lightning import Trainer from nemo.collections.nlp.parts.nlp_overrides import ( @@ -32,7 +29,7 @@ NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) -from nemo.utils import logging, model_utils +from nemo.utils import logging from nemo.utils.app_state import AppState """ @@ -41,9 +38,9 @@ ### Tensor Parallelism conversion ### # Megatron Mamba -python /home/ataghibakhsh/NeMo/examples/nlp/language_modeling/mamba_change_num_partition.py \ - --model_file=/home/ataghibakhsh/adlr_mamba2/mamba2-hybrid-8b-3t-4k.nemo \ - --target_file=/home/ataghibakhsh/TP4-ADLR-mamba-hybrid/mamba2-TP4.nemo \ +python /opt/NeMo/examples/nlp/language_modeling/mamba_change_num_partition.py \ + --model_file= \ + --target_file= \ --tensor_model_parallel_size=1 \ --target_tensor_model_parallel_size=4 \ --precision=bf16 \ @@ -340,7 +337,7 @@ def main(): parser.add_argument( "--model_class", type=str, - default="nemo.collections.nlp.models.language_modeling.megatron_jamba_model.MegatronJambaModel", + default="nemo.collections.nlp.models.language_modeling.megatron_mamba_model.MegatronMambaModel", help="NeMo model class. 
This script should support all NeMo megatron models that use Tensor Parallel", ) parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag") @@ -673,6 +670,13 @@ def main(): model.cfg.tokenizer.model = tokenizer_model_path model.cfg, restore_dict = force_cpu_model(model.cfg) + + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_global_batch_size = 1 + _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_micro_batch_size = 1 + model.cfg.global_batch_size = 1 + model.cfg.micro_batch_size = 1 model = MegatronMambaModel(model.cfg, trainer) model = model.to('cpu') diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py index ddc24a1fa95a..f51060564174 100644 --- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py @@ -26,10 +26,10 @@ ''' Example -CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py - --input_name_or_path PATH_TO_PYTORCH_WEIGHTS - --output_path OUTPUT_PATH.nemo - --ngroups_mamba 8 +CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ + --input_name_or_path \ + --output_path \ + --ngroups_mamba 8 \ --precision 32 ''' @@ -39,7 +39,7 @@ def get_args(): parser.add_argument( "--hparams_file", type=str, - default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_jamba_config.yaml", + default=f"{os.path.dirname(__file__)}/../../examples/nlp/language_modeling/conf/megatron_mamba_config.yaml", required=False, help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. 
Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml", ) From dfc24e26a7d75b227fe3dc8979ea0bedf268bcfe Mon Sep 17 00:00:00 2001 From: JRD971000 Date: Tue, 2 Jul 2024 19:21:48 +0000 Subject: [PATCH 09/21] Apply isort and black reformatting Signed-off-by: JRD971000 --- .../nlp/language_modeling/mamba_change_num_partition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/nlp/language_modeling/mamba_change_num_partition.py b/examples/nlp/language_modeling/mamba_change_num_partition.py index 6d5dc17437a8..108ecc308482 100644 --- a/examples/nlp/language_modeling/mamba_change_num_partition.py +++ b/examples/nlp/language_modeling/mamba_change_num_partition.py @@ -16,7 +16,7 @@ import tarfile import tempfile from argparse import ArgumentParser -from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel + import torch from omegaconf import open_dict from pytorch_lightning import Trainer @@ -672,9 +672,9 @@ def main(): model.cfg.tokenizer.model = tokenizer_model_path model.cfg, restore_dict = force_cpu_model(model.cfg) - + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR - + _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_global_batch_size = 1 _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_micro_batch_size = 1 model.cfg.global_batch_size = 1 From 7edd5cc47cc1dc6885ba924f7a1fe7836d5b45ff Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Tue, 2 Jul 2024 13:18:48 -0700 Subject: [PATCH 10/21] add reqs --- nemo/collections/nlp/modules/common/text_generation_strategy.py | 2 +- requirements/requirements_nlp.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index ebd2d02c9350..808c684208a5 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -993,7 +993,7 @@ def model_inference_strategy_dispatcher(model, **args): if isinstance(model, MegatronGriffinModel): return GriffinModelTextGenerationStrategy(model) if isinstance(model, MegatronMambaModel): - return MambaModelTextGenerationStrategy(model) + return GPTModelTextGenerationStrategy(model) if isinstance(model, MegatronNevaModel): return NevaModelTextGenerationStrategy(model) if isinstance(model, MegatronGPTPromptLearningModel): diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 494a9ab6d672..d006ccb7ad65 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -10,6 +10,7 @@ gdown h5py ijson jieba +mamba-ssm==1.2.0.post1 markdown2 matplotlib>=3.3.2 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again From c0afdc48fedc15a8304dbc7d5c3791cf7d7f18a5 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi Date: Wed, 3 Jul 2024 11:54:24 -0700 Subject: [PATCH 11/21] add tutorial --- .../conf/megatron_mamba_generate_config.yaml | 1 - tutorials/llm/mamba/mamba.rst | 301 ++++++++++++++++++ 2 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 tutorials/llm/mamba/mamba.rst diff --git a/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml b/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml index 35b42e61788f..2d34aefffc7e 100644 --- a/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_mamba_generate_config.yaml @@ 
-61,7 +61,6 @@ model:
   attention_dropout: 0.0
   hidden_dropout: 0.0
   hidden_size: 4096
-  bias_activation_fusion: False
   ffn_hidden_size: 14336 # Transformer FFN hidden size. Usually 4 * hidden_size.
   num_attention_heads: 32
   transformer_block_type: pre_ln
diff --git a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst
new file mode 100644
index 000000000000..bab62f10eb1e
--- /dev/null
+++ b/tutorials/llm/mamba/mamba.rst
@@ -0,0 +1,301 @@
+Mamba2 and Mamba2-Transformer Hybrid Models Fine-Tuning and Evaluation
+======================================================================
+
+`State Space Models (SSMs) `__ have recently emerged as a potential replacement for transformers. They have desirable features such as linear time complexity with respect to sequence length and a constant cache size for inference, enabling them to process longer sequences at higher throughput. However, while pure SSM-based models match or exceed Transformers on many tasks, they lag behind Transformer models on tasks that require strong copying or in-context learning abilities. To get the best of both worlds, SSM-Hybrid models combine MLP, Transformer, and SSM blocks in their architecture. As shown in `a study by NVIDIA `__, Mamba2-Hybrid models outperform Transformer baselines of the same size while offering faster inference thanks to the SSM blocks.
+
+
+The Mamba2 models from the `Transformers are SSMs `__ paper are available in five sizes: 130m, 370m, 780m, 1.3b, and 2.7b. The Mamba2-Hybrid models and their Mamba2 baseline released by `NVIDIA `__ are all 8b models.
+
+
+`Low-Rank Adaptation (LoRA) `__ has emerged as a popular Parameter Efficient Fine-Tuning (PEFT) technique that tunes a very small number of additional parameters compared to full fine-tuning, thereby reducing the compute required. LoRA tuning can be applied to the linear layers in the Transformer and MLP blocks of the Mamba2-Hybrid models.
+
+`NVIDIA NeMo
+Framework `__ provides tools to perform fine-tuning on Mamba2 and Mamba2-Hybrid to fit your use case.
+
+Requirements
+-------------
+
+In order to proceed, ensure that you have met the following requirements:
+
+* Full Fine-Tuning System Configuration
+   * Small models (130m, 370m, 780m)
+      * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 40GB, for example: 1 x A6000-40GB.
+
+   * Mid-size models (1.3b, 2.7b)
+      * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 80GB, for example: 1 x H100-80GB or 1 x A100-80GB.
+
+   * Large models (8b)
+      * Access to at least 2 NVIDIA GPUs with a cumulative memory of at least 80GB, for example: 2 x H100-80GB or 2 x A100-80GB.
+
+* LoRA Fine-Tuning (Mamba2-Hybrid only) System Configuration
+   * Access to at least 1 NVIDIA GPU with a cumulative memory of at least 80GB, for example: 1 x H100-80GB or 1 x A100-80GB.
+
+
+
+* A Docker-enabled environment, with `NVIDIA Container Runtime `_ installed, which will make the container GPU-aware.
+
+
+* `Authenticate with NVIDIA NGC `_, and download the `NGC CLI Tool `_.
+
+
+Step-by-step Guide for Fine-Tuning
+----------------------------------
+
+Checkpoints from HuggingFace
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Obtain the desired checkpoint from Hugging Face; an illustrative download command is sketched after the list below.
+
+* `Repository `__ for the Mamba2 models from the `Transformers are SSMs paper `__.
+* `Repository `__ for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__.
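+
+As one illustrative option (an assumption, since the tutorial does not prescribe a specific download method), the checkpoint files can be fetched with the Hugging Face CLI. Here, ``<repository-id>`` and ``<local-dir>`` are placeholders for the repository chosen above and a local target path; they are not values taken from this tutorial.
+
+.. code:: bash
+
+   # Illustrative sketch only: install the Hugging Face Hub CLI and download
+   # every file of the chosen repository into a local directory.
+   # <repository-id> and <local-dir> are placeholders to be replaced by the user.
+   pip install -U "huggingface_hub[cli]"
+   huggingface-cli download <repository-id> --local-dir <local-dir>
+
+The downloaded PyTorch weights can then be used in the conversion step described in the next section, which expects a single PyTorch ``state_dict``.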
+
+
+Convert the PyTorch Checkpoint to a NeMo Checkpoint
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. Get into the NVIDIA NeMo container.
+
+2. Run the conversion script from . For this conversion script, you should provide the PyTorch state dictionary of the model for ``input_name_or_path``, i.e., this argument only accepts a single ``state_dict``.
+
+.. code:: bash
+
+   CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
+   --input_name_or_path  \
+   --output_path  \
+   --ngroups_mamba 8 \
+   --precision bf16
+
+* Note: the ``ngroups_mamba`` parameter should be 1 for the Mamba2 models from the `Transformers are SSMs paper `__ (130m, 370m, 780m, 1.3b, and 2.7b) and 8 for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__ (both 8b).
+
+Model (Tensor) Parallelism for the 8b Models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* Note: Distributed checkpointing for the Mamba2 and Mamba2-Hybrid models will be implemented in the near future. For now, you should use the method below to convert the checkpoint to a different Tensor Parallel (TP) size.
+
+The HuggingFace checkpoint for the 8b model has a TP size of 1, and so does the ``.nemo`` checkpoint obtained in the previous step. To shard the model weights for a larger TP size, use the script from