From 7eac53c217d8f399231f9f390602a7478a5e07c5 Mon Sep 17 00:00:00 2001 From: Dong Hyuk Chang Date: Thu, 25 Jul 2024 14:15:24 -0400 Subject: [PATCH 1/8] Set default Torch version if YY.MM format is not met (#9776) * Torch major and minor versions set to current year and month if YY.MM formatting is not met Signed-off-by: Dong Hyuk Chang * Update nvidia torch version check Signed-off-by: Dong Hyuk Chang * Remove redundant import Signed-off-by: Dong Hyuk Chang * Formatting fix Signed-off-by: Dong Hyuk Chang --------- Signed-off-by: Dong Hyuk Chang Co-authored-by: Dong Hyuk Chang --- .../nlp/models/language_modeling/megatron_base_model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 20d532d4764a..d0d239b21637 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -379,8 +379,11 @@ def _enable_nvidia_optimizations(self): # NVIDIA container version check nvidia_torch_version = os.getenv('NVIDIA_PYTORCH_VERSION', None) - # Support DLFW master container - if nvidia_torch_version == 'master': + def is_official_release_version(nvidia_torch_version): + return re.fullmatch("[0-9][0-9]\.[0-9][0-9].*", nvidia_torch_version) # "YY.MM.*" + + # Support DLFW dev container + if not is_official_release_version(nvidia_torch_version): nvidia_torch_version = datetime.now().strftime('%y.%m') if nvidia_torch_version is not None: @@ -389,7 +392,7 @@ def _enable_nvidia_optimizations(self): except Exception: NVIDIA_TORCH_MAJOR = 0 try: - NVIDIA_TORCH_MINOR = int(nvidia_torch_version.split('.')[1]) + NVIDIA_TORCH_MINOR = int(nvidia_torch_version.split('.')[1][:2]) except Exception: NVIDIA_TORCH_MINOR = 0 From 54916428c28f45394b6b3714934bcb6138109b16 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Thu, 25 Jul 2024 15:45:19 -0700 Subject: [PATCH 2/8] fix arg name (#9848) * fix arg name Signed-off-by: Sangkug Lym * cleanup Signed-off-by: Sangkug Lym * cleanup Signed-off-by: Sangkug Lym --------- Signed-off-by: Sangkug Lym Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --- .../nlp/modules/common/megatron/adapters/parallel_adapters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 8d2d77c55cf2..7167eefda637 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -257,7 +257,7 @@ def __init__( te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.5.0dev") and ( - not self.input_is_parallel and model_parallel_config.tp_comm_disable_qkv + not self.input_is_parallel and getattr(model_parallel_config, "tp_comm_overlap_disable_qkv", False) ): # TE 1.5 introduces the option `return_layernorm_output_gathered`, so the all gather # in the forward method is not needed, so set self._sequence_parallel to False From bd185cbb738f01103aa4652b8cca34784df5e2cc Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Thu, 25 Jul 2024 16:32:59 -0700 Subject: [PATCH 3/8] Added defer wgrad support with mcore optim (#9896) * Added defer wgrad support with mcore optim Signed-off-by: 
Selvaraj Anandaraj * Apply isort and black reformatting Signed-off-by: sanandaraj5597 --------- Signed-off-by: Selvaraj Anandaraj Signed-off-by: sanandaraj5597 Co-authored-by: Selvaraj Anandaraj Co-authored-by: sanandaraj5597 --- .../nlp/models/language_modeling/megatron_gpt_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 1cdee42f580e..41d85d48e497 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -816,7 +816,9 @@ def training_step(self, dataloader_iter): ignore_virtual=True ): if ( - self.cfg.get('defer_embedding_wgrad_compute', False) and self.mcore_gpt + self.cfg.get('defer_embedding_wgrad_compute', False) + and self.mcore_gpt + and not self.use_mcore_dist_optim ): # Silently ignore the optimization if MCORE is not used module_list = self.get_model_module_list() if len(module_list) > 1: @@ -839,7 +841,9 @@ def training_step(self, dataloader_iter): ignore_virtual=True ): if ( - self.cfg.get('defer_embedding_wgrad_compute', False) and self.mcore_gpt + self.cfg.get('defer_embedding_wgrad_compute', False) + and self.mcore_gpt + and not self.use_mcore_dist_optim ): # Silently ignore the optimization if MCORE is not used module_list = self.get_model_module_list() if len(module_list) > 1: From fe1625947fbda50b32b914c12aafff86b1cb534f Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Thu, 25 Jul 2024 21:54:35 -0500 Subject: [PATCH 4/8] tutorial fixes (#9907) --- tutorials/llm/mamba/mamba.rst | 61 ++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst index 2ce5ee5f616b..e619e56c3747 100644 --- a/tutorials/llm/mamba/mamba.rst +++ b/tutorials/llm/mamba/mamba.rst @@ -37,18 +37,36 @@ Step-by-step Guide for Fine-Tuning Checkpoints from HuggingFace ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Obtain the desired checkpoint from HuggigFace. +Obtain the desired checkpoint from HuggigFace. The checkpoints below have different arrangement and there are a few preprocessing step for each. + +1. `Repository `__ for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__. + The checkpoint from this repository is located in files tab under ``release/mp_rank_00/model_optim_rng.pt``. The tokenizer is under files tab and is named ``mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model``. You need both of these for conversion to ``.nemo`` checkpoint. + +2. `Repository `__ for the Mamba2 models from the `Transformers are SSMs paper `__. + For checkpoints from this repository, run the following python script to convert the pytorch checkpoint (`pytorch_model.bin` in the HuggingFace model card) to a format similar to the 8b models: + + .. code:: python + + import torch + import os + + ckpt_path = "/path/to/pytorch_model.bin" + pyt_checkpoint = torch.load(ckpt_path) + new_ckpt_path = os.path.join(os.path.dirname(ckpt_path), f"wrapped_{os.path.basename(ckpt_path)}") + + # Save the new checkpoint which will be used as the input to the conversion script + torch.save({"model": pyt_checkpoint}, new_ckpt_path) + + You will use this ``wrapped_pytorch_model.bin`` for the conversion to ``.nemo`` in the next step. -* `Repository `__ for the Mamba2 models from the `Transformers are SSMs paper `__. 
-* `Repository `__ for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__. Convert the Pytorch Checkpoint to a NeMo Checkpoint ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -1. Get into NVIDIA Container +1. Get into the NVIDIA dev container from `NGC `_, or the 24.07 container (once released). -2. Run the conversion script from . For this conversion script, you should provide the PyTorch state dictionary of the model for ``input_name_or_path``, i.e. this argument only accepts a single ``state_dict``. +2. Run the conversion script from . For this conversion script, you should provide the checkpoint (and tokenizer in the case of 8b models) from the previous step for ``input_name_or_path``. .. code:: bash @@ -56,7 +74,9 @@ Convert the Pytorch Checkpoint to a NeMo Checkpoint --input_name_or_path \ --output_path \ --mamba_ssm_ngroups 8 \ - --precision bf16 + --precision bf16 \ + --tokenizer_path= + * Note: the ``mamba_ssm_ngroups`` parameter should be 1 for the Mamba2 models from the `Transformers are SSMs paper `__ (130m, 370m, 780m, 1.3b, and 2.7b) and 8 for the Mamba2 and Mamba2-Hybrid models by `NVIDIA `__ (both 8b). @@ -69,7 +89,7 @@ The HuggingFace checkpoint for the 8b model is for TP of size 1, and so is the ` .. code:: bash - python /opt/NeMo/examples/nlp/language_modeling/mamba_change_num_partition.py \ + CUDA_VISIBLE_DEVICES="0" python /opt/NeMo/examples/nlp/language_modeling/mamba_change_num_partition.py \ --model_file= \ --target_file= \ --tensor_model_parallel_size=1 \ @@ -79,7 +99,7 @@ The HuggingFace checkpoint for the 8b model is for TP of size 1, and so is the ` After running this script, a ``.nemo`` model along with the TP-size number of folders (4 in this example) will be generated in the target path. The folders for each rank will be displayed as ``mp_rank_00`` to ``mp_rank_03`` in this example. -* Note: You can only use Tensor Parallelism for the 8b models by `NVIDIA `__ (Mamba2 8b and Mamba2-Hybrid 8b). This is due to the fact that the ``nroups`` parameter in the model architecture should be divisible by TP size. ``nroups`` parameter is 8 for NVIDIA models and 1 for other models in the list. +* Note: You can only use Tensor Parallelism for the 8b models by `NVIDIA `__ (Mamba2 8b and Mamba2-Hybrid 8b). This is due to the fact that the ``mamba_ssm_ngroups`` parameter in the model architecture should be divisible by TP size. ``mamba_ssm_ngroups`` parameter is 8 for NVIDIA models and 1 for other models in the list. 
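Before launching the conversion, it can help to double-check this divisibility constraint for your target TP size. The snippet below is only an illustrative sketch (it is not part of any NeMo script); pass the ``mamba_ssm_ngroups`` value described above (8 for the NVIDIA 8b models, 1 for the other Mamba2 models):

.. code:: python

   def supported_tp_sizes(mamba_ssm_ngroups: int, max_tp: int = 8):
       # A TP size is usable only if it evenly divides mamba_ssm_ngroups.
       return [tp for tp in range(1, max_tp + 1) if mamba_ssm_ngroups % tp == 0]

   print(supported_tp_sizes(8))  # [1, 2, 4, 8] -> NVIDIA Mamba2/Mamba2-Hybrid 8b models
   print(supported_tp_sizes(1))  # [1] -> TP cannot be used for the smaller models
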
Run Fine-Tuning ^^^^^^^^^^^^^^^ @@ -93,21 +113,21 @@ Run Fine-Tuning MBS=4 GBS=128 - TP=2 # According to the saved checkpoint + TP=4 # According to the saved checkpoint SP=True # True only if TP>1 otherwise False SEQ_LEN=2048 - NUM_DEVICES=2 + NUM_DEVICES=8 PATH_TO_NEMO_MODEL= TRAIN_DATASET_PATH= VAL_DATASET_PATH= - CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/conf/" + CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/" CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= export NVTE_FUSED_ATTN=1 export NVTE_FLASH_ATTN=0 - torchrun --nproc_per_node=${NUM_DEVICES} + torchrun --nproc_per_node=${NUM_DEVICES} \ /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py \ --config-path=${CONFIG_PATH} \ --config-name=${CONFIG_NAME} \ @@ -135,7 +155,6 @@ Run Fine-Tuning model.optim.name="distributed_fused_adam" \ model.data.train_ds.max_seq_length=${SEQ_LEN} \ model.data.validation_ds.max_seq_length=${SEQ_LEN} \ - model.mcore_gpt=True \ model.micro_batch_size=${MBS} \ model.global_batch_size=${GBS} \ model.restore_from_path=${PATH_TO_NEMO_MODEL} \ @@ -144,8 +163,6 @@ Run Fine-Tuning model.optim.lr=5e-6 \ model.optim.sched.min_lr=1e-7 -* Note: The tokenizer for 8b models (Mamba2 8b and MAmba2-Hybrid 8b) can be found in the `HuggingFace repository `__. Download it a set its path to ``TOKENIZER_MODEL`` (the tokenizer model file is under the name of ```mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model```). For other models, set ``TOKENIZER_MODEL=null`` since it will be downloaded from HuggingFace at the time of run. - Evaluating the Fine-Tuned Model ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -156,13 +173,12 @@ Evaluating the Fine-Tuned Model MBS=32 GBS=64 - TP=2 # According to the fine-tuned checkpoint + TP=4 # According to the fine-tuned checkpoint SP=True # True only if TP>1 otherwise False SEQ_LEN=2048 - NUM_DEVICES=2 + NUM_DEVICES=8 PATH_TO_NEMO_MODEL= - TRAIN_DATASET_PATH= - VAL_DATASET_PATH= + TEST_DATASET="[]" CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/" CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= @@ -170,12 +186,11 @@ Evaluating the Fine-Tuned Model export NVTE_FUSED_ATTN=1 export NVTE_FLASH_ATTN=0 - TEST_DATASET="[]" CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/" CONFIG_NAME="megatron_mamba_generate_config" - MASTER_PORT=15008 torchrun --nproc_per_node=${NUM_DEVICES} /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py \ + torchrun --nproc_per_node=${NUM_DEVICES} /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_generate.py \ --config-path=${CONFIG_PATH} \ --config-name=${CONFIG_NAME} \ trainer.devices=${NUM_DEVICES} \ @@ -196,11 +211,11 @@ Evaluating the Fine-Tuned Model +model.peft.restore_from_ckpt.checkpoint_dir=False \ +model.peft.restore_from_ckpt.checkpoint_name=False \ model.tensor_model_parallel_size=${TP} \ - model.sequence_parallel=$SP \ model.micro_batch_size=${MBS} \ model.global_batch_size=${GBS} \ model.restore_from_path=${PATH_TO_NEMO_MODEL} \ model.data.test_ds.file_names=${TEST_DATASET} \ + model.data.test_ds.names=["squad"] \ model.data.test_ds.global_batch_size=${GBS} \ model.data.test_ds.micro_batch_size=${MBS} \ model.data.test_ds.tokens_to_generate=30 \ @@ -219,7 +234,7 @@ Evaluating the Fine-Tuned Model Inference ^^^^^^^^^ -For running inference on a Mamba model, one should use ``megatron_mamba_eval.py`` script. For example: +For running inference on a Mamba model, one should use ``megatron_mamba_eval.py`` script. 
This evaluation script currently requires tensor/model parallel (TP1) of size one. If your checkpoint has TP>1, use the TP conversion step from above and set ``target_tensor_model_parallel_size=1``. The following is an example for using evaluation script: .. code:: bash From 74c2caffdc81e83d67ce2aa5889c6c7c0aab6f72 Mon Sep 17 00:00:00 2001 From: Huu Tuong Tu <83907151+huutuongtu@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:24:48 +0700 Subject: [PATCH 5/8] [TTS][Vietnamese] Add VietnameseCharsTokenizer (#9665) * Update tts_tokenizers.py * Update tokenizer_utils.py * Update test_tts_tokenizers.py * Apply isort and black reformatting Signed-off-by: huutuongtu * Signed-off-by: Tu [huutu12312vn@gmail.com](mailto:huutu12312vn@gmail.com) * Update ipa_lexicon.py - Signed-off-by: Tu [huutu12312vn@gmail.com](mailto:huutu12312vn@gmail.com) Signed-off-by: XuesongYang --------- Signed-off-by: huutuongtu Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: XuesongYang Co-authored-by: huutuongtu Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: XuesongYang --- .../tokenizers/text_to_speech/ipa_lexicon.py | 29 +++++++++++++- .../text_to_speech/tokenizer_utils.py | 5 +++ .../text_to_speech/tts_tokenizers.py | 38 +++++++++++++++++++ .../text_to_speech/test_tts_tokenizers.py | 13 +++++++ 4 files changed, 84 insertions(+), 1 deletion(-) diff --git a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py index f4081735eb71..6b7677431fc7 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py +++ b/nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py @@ -15,7 +15,9 @@ # fmt: off -SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR"] + +SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN"] + DEFAULT_PUNCTUATION = ( ',', '.', '!', '?', '-', @@ -48,6 +50,19 @@ 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ü', 'ẞ', ), + # ref: https://en.wikipedia.org/wiki/Vietnamese_alphabet + "vi-VN": ( + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', + 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', + 'U', 'V', 'W', 'X', 'Y', 'Z', 'Đ', 'Á', 'À', 'Ã', + 'Ả', 'Ạ', 'Ă', 'Ắ', 'Ằ', 'Ẵ', 'Ẳ', 'Ặ', 'Â', 'Ấ', + 'Ầ', 'Ẫ', 'Ẩ', 'Ậ', 'Ó', 'Ò', 'Õ', 'Ỏ', 'Ọ', 'Ô', + 'Ố', 'Ồ', 'Ỗ', 'Ổ', 'Ộ', 'Ơ', 'Ớ', 'Ờ', 'Ỡ', 'Ở', + 'Ợ', 'É', 'È', 'Ẽ', 'Ẻ', 'Ẹ', 'Ê', 'Ế', 'Ề', 'Ễ', + 'Ể', 'Ệ', 'Ú', 'Ù', 'Ũ', 'Ủ', 'Ụ', 'Ư', 'Ứ', 'Ừ', + 'Ữ', 'Ử', 'Ự', 'Í', 'Ì', 'Ĩ', 'Ỉ', 'Ị', 'Ý', 'Ỳ', + 'Ỹ', 'Ỷ', 'Ỵ', + ), "fr-FR": ( 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', @@ -104,6 +119,18 @@ 'ɽ','ʂ','ʈ','ʧ','ʉ','ʋ','ⱱ','ɤ','ʍ','χ','ʏ','ʑ','ʐ', 'ʔ','ʡ','ʕ','ʢ','ǀ','ǁ','ǂ','ᵻ', 'ʃ','ː', ), + "vi-VN": ( + 'a', 'ə', 'ɛ', 'e', 'i', 'o', 'ɔ', 'u', 'ɨ', + 'b', 'c', 'z', 'j', 'd', 'g', 'h', 'x', 'l', + 'm', 'n', 'ŋ', 'ɲ', 'p', 'f', 'w', 'r', 's', + 'ʃ', 't', 'ʈ', 'ʂ', 'v', 'ʔ', 'ɓ', 'ɗ', 'ɣ', + 'k', 'ʰ', 'ʷ', 'ɕ', 'ʑ', 'ʝ', '̚', '̟', 't͡', + '˧', 'ː', 'ɯ', '̀', '̄', '̌', '̂', 'ˀ', '͡', '˥', + '˩', '̤', '˨', 'ɹ', 'ʲ', '̯', 'ă', 'ə̆', 'ǐ', + '˦', 'æ', 'ɐ', + 'ɜ', 'ɡ', 'ɪ', 'ɬ' 'ɾ', 'ʊ', 'ʌ', 'ʒ', '̃', + '̩', 'θ', 'ᵻ', + ), } GRAPHEME_CHARACTER_CASES = ["upper", "lower", "mixed"] diff --git a/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py b/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py index 
542b18186846..c82d4f3cce19 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tokenizer_utils.py @@ -24,6 +24,7 @@ "english_text_preprocessing", "any_locale_text_preprocessing", "spanish_text_preprocessing", + "vietnamese_text_preprocessing", "italian_text_preprocessing", "any_locale_word_tokenize", "english_word_tokenize", @@ -201,3 +202,7 @@ def chinese_text_preprocessing(text: str) -> str: def french_text_preprocessing(text: str) -> str: return text.lower() + + +def vietnamese_text_preprocessing(text: str) -> str: + return text.lower() diff --git a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py index 4998fbba1ac9..6332c91cad46 100644 --- a/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py +++ b/nemo/collections/common/tokenizers/text_to_speech/tts_tokenizers.py @@ -31,6 +31,7 @@ french_text_preprocessing, italian_text_preprocessing, spanish_text_preprocessing, + vietnamese_text_preprocessing, ) from nemo.utils import logging from nemo.utils.decorators import experimental @@ -202,6 +203,43 @@ def __init__( ) +class VietnameseCharsTokenizer(BaseCharsTokenizer): + + _LOCALE = "vi-VN" + _CHARSET_STR = get_grapheme_character_set(locale=_LOCALE, case="mixed") + + def __init__( + self, + chars=_CHARSET_STR, + punct=True, + apostrophe=True, + add_blank_at=None, + pad_with_space=False, + non_default_punct_list=None, + text_preprocessing_func=vietnamese_text_preprocessing, + ): + """Vietnamese grapheme tokenizer. + Args: + punct: Whether to reserve grapheme for basic punctuation or not. + apostrophe: Whether to use apostrophe or not. + add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None), + if None then no blank in labels. + pad_with_space: Whether to pad text with spaces at the beginning and at the end or not. + non_default_punct_list: List of punctuation marks which will be used instead default. + text_preprocessing_func: Text preprocessing function for correct execution of the tokenizer. By default, it + would keep any word lowercase. + """ + super().__init__( + chars=chars, + punct=punct, + apostrophe=apostrophe, + add_blank_at=add_blank_at, + pad_with_space=pad_with_space, + non_default_punct_list=non_default_punct_list, + text_preprocessing_func=vietnamese_text_preprocessing, + ) + + class GermanCharsTokenizer(BaseCharsTokenizer): _LOCALE = "de-DE" diff --git a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py index 2e2f9bdaaf36..2023d31696b1 100644 --- a/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py +++ b/tests/collections/common/tokenizers/text_to_speech/test_tts_tokenizers.py @@ -21,6 +21,7 @@ IPATokenizer, ItalianCharsTokenizer, SpanishCharsTokenizer, + VietnameseCharsTokenizer, ) from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p @@ -124,6 +125,18 @@ def test_spanish_chars_tokenizer(self): assert chars == expected_output assert len(tokens) == len(input_text) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_vietnamese_chars_tokenizer(self): + input_text = "Xin chào các bạn." + expected_output = "xin chào các bạn." 
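+        # vietnamese_text_preprocessing lowercases the input, hence the lowercased expected output.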
+ + tokenizer = VietnameseCharsTokenizer() + chars, tokens = self._parse_text(tokenizer, input_text) + + assert chars == expected_output + assert len(tokens) == len(input_text) + @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_french_chars_tokenizer(self): From c81f7cf6cb1234bf51843f8fd192f72c52389407 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Fri, 26 Jul 2024 09:46:30 -0400 Subject: [PATCH 6/8] Integrate TRT-LLM v0.11 (#9705) * fix minor import bug Signed-off-by: Onur Yilmaz * Change imports to catch import level errros Signed-off-by: Onur Yilmaz * Update changed trt-llm apis Signed-off-by: Onur Yilmaz * Gemma working version Signed-off-by: Onur Yilmaz * llama working version Signed-off-by: Onur Yilmaz * gpt support Signed-off-by: Onur Yilmaz * remove old tests Signed-off-by: Onur Yilmaz * add new tests Signed-off-by: Onur Yilmaz * lora fix Signed-off-by: Onur Yilmaz * Add a few more params for trt-llm Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * Add params to the load Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Co-authored-by: oyilmaz-nvidia --- nemo/export/tensorrt_llm.py | 51 ++++++- .../trt_llm/converter/model_converter.py | 21 ++- nemo/export/trt_llm/converter/utils.py | 2 +- nemo/export/trt_llm/tensorrt_llm_build.py | 29 ++-- nemo/export/trt_llm/tensorrt_llm_run.py | 56 +++++++- scripts/deploy/nlp/deploy_triton.py | 12 +- tests/export/run.sh | 11 +- tests/infer_data_path.py | 136 +++--------------- 8 files changed, 169 insertions(+), 149 deletions(-) mode change 100644 => 100755 nemo/export/trt_llm/converter/model_converter.py mode change 100644 => 100755 nemo/export/trt_llm/converter/utils.py mode change 100644 => 100755 nemo/export/trt_llm/tensorrt_llm_build.py diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index b4299dfd8945..08f1e4fe74e6 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -38,16 +38,24 @@ is_nemo_file, load_nemo_model, ) -from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm -from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer -from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load, load_distributed, refit +LOGGER = logging.getLogger("NeMo") + +use_model_opt = True +try: + from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm + from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer + from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint +except Exception as e: + LOGGER.warning(f"Cannot import the Model Optimizer, it will not be available. {type(e).__name__}: {e}") + use_model_opt = False + use_deploy = True try: from nemo.deploy.utils import cast_output, str_ndarray2list -except Exception: +except Exception as e: use_deploy = False @@ -67,8 +75,6 @@ def wrapper(*args, **kwargs): except Exception: use_pytriton = False -LOGGER = logging.getLogger("NeMo") - class TensorRTLLM(ITritonDeployable): """ @@ -95,6 +101,8 @@ def __init__( lora_ckpt_list: List[str] = None, load_model: bool = True, use_python_runtime: bool = True, + enable_chunked_context: bool = None, + max_tokens_in_paged_kv_cache: int = None, ): """ Args: @@ -104,9 +112,19 @@ def __init__( use_python_runtime (bool): whether to use python or c++ runtime. 
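+            enable_chunked_context (bool): enables chunked context processing; per the check above, only supported with the TensorRT-LLM C++ runtime (use_python_runtime=False).
+            max_tokens_in_paged_kv_cache (int): maximum number of tokens kept in the paged KV cache; only supported with the C++ runtime.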
""" + if use_python_runtime: + if enable_chunked_context is not None or max_tokens_in_paged_kv_cache is not None: + raise Exception( + "enable_chunked_context and max_tokens_in_paged_kv_cache options " + "work only with the TensorRT-LLM C++ runtime. Please set " + "use_python_runtime=False to use these options." + ) + self.model_dir = model_dir self.lora_ckpt_list = lora_ckpt_list self.use_python_runtime = use_python_runtime + self.enable_chunked_context = enable_chunked_context if enable_chunked_context is not None else False + self.max_tokens_in_paged_kv_cache = max_tokens_in_paged_kv_cache self.model = None self.tokenizer = None self.n_gpus = None @@ -148,6 +166,10 @@ def export( max_lora_rank: int = 64, max_num_tokens: int = None, opt_num_tokens: int = None, + max_seq_len: int = None, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", ): """ Exports nemo checkpoints to TensorRT-LLM. @@ -179,6 +201,10 @@ def export( max_lora_rank (int): maximum lora rank. max_num_tokens (int): opt_num_tokens (int): + max_seq_len (int): + multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False + gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto" + gemm_plugin (str): enable the gpt plugin. Default = "auto" """ if n_gpus is not None: @@ -233,7 +259,12 @@ def export( tmp_dir = tempfile.TemporaryDirectory() nemo_export_dir = Path(tmp_dir.name) - if is_qnemo_checkpoint(nemo_checkpoint_path): + is_qnemo_ckpt = False + if use_model_opt: + if is_qnemo_checkpoint(nemo_checkpoint_path): + is_qnemo_ckpt = True + + if is_qnemo_ckpt: if os.path.isdir(nemo_checkpoint_path): nemo_export_dir = nemo_checkpoint_path else: @@ -310,6 +341,10 @@ def export( paged_context_fmha=paged_context_fmha, max_num_tokens=max_num_tokens, opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") @@ -838,6 +873,8 @@ def _load(self): engine_dir=self.model_dir, lora_ckpt_list=self.lora_ckpt_list, use_python_runtime=self.use_python_runtime, + enable_chunked_context=self.enable_chunked_context, + max_tokens_in_paged_kv_cache=self.max_tokens_in_paged_kv_cache, ) self._load_prompt_tables() except Exception as error: diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py old mode 100644 new mode 100755 index 2a78f6833782..60d50316e9ed --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -22,6 +22,8 @@ from tensorrt_llm._utils import pad_vocab_size from tensorrt_llm.functional import non_gated_version from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.models.gpt.config import GPTConfig +from tensorrt_llm.models.llama.config import LLaMAConfig from tensorrt_llm.models.modeling_utils import PretrainedConfig from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import ( @@ -33,6 +35,15 @@ LOGGER = logging.getLogger("NeMo") +def get_config(decoder_type, config): + if decoder_type == "llama": + return LLaMAConfig(**config) + elif decoder_type == "gpt" or decoder_type == "gptnext": + return GPTConfig(**config) + else: + return PretrainedConfig(**config) + + def prompt_convert(prompt_config, prompt_weights): if "task_templates" in prompt_config: prompt_templates = prompt_config["task_templates"] @@ -156,11 +167,13 @@ def 
model_to_trtllm_ckpt( 'rotary_pct': nemo_model_config.get('rotary_percentage', 1.0), 'rotary_base': nemo_model_config.get('rotary_base', 10000), 'moe_num_experts': nemo_model_config.get('num_moe_experts', 0), - 'moe_top_k': nemo_model_config.get('moe_router_topk'), + 'moe_top_k': nemo_model_config.get('moe_router_topk', 0), 'moe_normalization_mode': nemo_model_config.get( 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE ), - 'moe_tp_mode': nemo_model_config.get('moe_tp_mode', MoeConfig.ParallelismMode.TENSOR_PARALLEL), + 'moe_tp_mode': nemo_model_config.get( + 'moe_tp_mode', 2 + ), # change MoeConfig.ParallelismMode.TENSOR_PARALLEL to 2 'logits_dtype': 'float32', 'world_size': world_size, 'tp_size': tensor_parallel_size, @@ -179,7 +192,7 @@ def model_to_trtllm_ckpt( if use_distributed_convert: config["gpus_per_node"] = gpus_per_node - model_configs.append(PretrainedConfig(**config)) + model_configs.append(get_config(decoder_type, config)) model_configs[0].mapping = tensorrt_llm.Mapping( world_size=world_size, rank=model_parallel_rank, @@ -258,7 +271,7 @@ def model_to_trtllm_ckpt( weights_dict_local["transformer.ln_f.bias"] = ln_f_bias config["gpus_per_node"] = gpus_per_node - model_config = PretrainedConfig(**config) + model_config = get_config(decoder_type, config) model_config.mapping = mapping model_configs.append(model_config) weights_dicts.append(weights_dict_local) diff --git a/nemo/export/trt_llm/converter/utils.py b/nemo/export/trt_llm/converter/utils.py old mode 100644 new mode 100755 index 3768ff4b2844..eab17167cbd5 --- a/nemo/export/trt_llm/converter/utils.py +++ b/nemo/export/trt_llm/converter/utils.py @@ -26,7 +26,7 @@ DECODER_MODEL_TYPE = { "gptj": 'GPTForCausalLM', "gptnext": 'GPTForCausalLM', - "llama": 'LLaMAForCausalLM', + "llama": 'LlamaForCausalLM', "gemma": 'GemmaForCausalLM', "falcon": 'FalconForCausalLM', } diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py old mode 100644 new mode 100755 index b329de2a3b18..d04698c318bf --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -45,41 +45,51 @@ def build_and_save_engine( paged_kv_cache: bool = True, remove_input_padding: bool = True, paged_context_fmha: bool = False, - custom_all_reduce: bool = True, + use_custom_all_reduce: bool = True, use_refit: bool = False, max_num_tokens: int = None, + max_seq_len: int = None, opt_num_tokens: int = None, max_beam_width: int = 1, tokens_per_block: int = 128, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", ): + architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture try: - model_cls = getattr(tensorrt_llm.models, model_config.architecture) + model_cls = getattr(tensorrt_llm.models, architecture) except: raise AttributeError(f"Could not find TRTLLM model type: {model_type}!") logger.set_level("info") - str_dtype = model_config.dtype plugin_config = PluginConfig() - plugin_config.set_gpt_attention_plugin(dtype=str_dtype) - plugin_config.set_gemm_plugin(dtype=str_dtype) - plugin_config.use_custom_all_reduce = custom_all_reduce - plugin_config.set_plugin("multi_block_mode", enable_multi_block_mode) + plugin_config.gpt_attention_plugin = gpt_attention_plugin + plugin_config.gemm_plugin = gemm_plugin + plugin_config.set_nccl_plugin(use_custom_all_reduce=use_custom_all_reduce) + plugin_config.multi_block_mode = enable_multi_block_mode if paged_kv_cache: 
plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block) else: plugin_config.paged_kv_cache = False plugin_config.remove_input_padding = remove_input_padding plugin_config.use_paged_context_fmha = paged_context_fmha + plugin_config.multiple_profiles = multiple_profiles + + if max_seq_len is None: + max_seq_len = max_input_len + max_output_len max_num_tokens, opt_num_tokens = check_max_num_tokens( max_num_tokens=max_num_tokens, opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, max_batch_size=max_batch_size, max_input_len=max_input_len, max_beam_width=max_beam_width, remove_input_padding=remove_input_padding, enable_context_fmha=plugin_config.context_fmha, tokens_per_block=tokens_per_block, + multiple_profiles=multiple_profiles, ) build_dict = { @@ -87,6 +97,7 @@ def build_and_save_engine( 'max_output_len': max_output_len, 'max_batch_size': max_batch_size, 'max_beam_width': max_beam_width, + 'max_seq_len': max_seq_len, 'max_num_tokens': max_num_tokens, 'opt_num_tokens': opt_num_tokens, 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, @@ -95,11 +106,13 @@ def build_and_save_engine( 'strongly_typed': False, 'builder_opt': None, 'use_refit': use_refit, + 'multiple_profiles': multiple_profiles, } build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config) if use_lora_plugin is not None: - build_config.plugin_config.set_lora_plugin(use_lora_plugin) + # build_config.plugin_config.set_lora_plugin(use_lora_plugin) + # build_config.plugin_config._lora_plugin = use_lora_plugin lora_config = LoraConfig( lora_dir=lora_ckpt_list, lora_ckpt_source='nemo', diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index dbbf40cc3cf1..14ad0be699bb 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -26,15 +26,26 @@ import tensorrt_llm import torch from mpi4py.futures import MPIPoolExecutor -from tensorrt_llm.bindings import GptJsonConfig, GptSession, GptSessionConfig, KvCacheConfig, WorldConfig from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig -from tensorrt_llm.runtime.model_runner_cpp import ModelRunnerCppGptSession + from transformers import PreTrainedTokenizer LOGGER = logging.getLogger("NeMo") +use_trtllm_bindings = True +try: + from tensorrt_llm.bindings import GptJsonConfig, GptSession, GptSessionConfig, KvCacheConfig, WorldConfig +except Exception as e: + use_trtllm_bindings = False + +use_cpp_gpt_session = True +try: + from tensorrt_llm.runtime.model_runner_cpp import ModelRunnerCppGptSession +except Exception as e: + use_cpp_gpt_session = False + @dataclass class TensorrtLLMHostContext: @@ -131,6 +142,8 @@ def _load( lora_ckpt_list=None, num_beams=1, use_python_runtime: bool = True, + enable_chunked_context: bool = False, + max_tokens_in_paged_kv_cache: int = None, ): """The impl of `load` API for on a single GPU worker.""" try: @@ -145,7 +158,7 @@ def _load( max_batch_size = config["build_config"]["max_batch_size"] max_input_len = config["build_config"]["max_input_len"] - max_output_len = config["build_config"]["max_output_len"] + # max_output_len = config["build_config"]["max_output_len"] max_beam_width = config["build_config"]["max_beam_width"] runtime_rank = tensorrt_llm.mpi_rank() @@ -166,8 +179,10 @@ def _load( rank=runtime_rank, max_batch_size=max_batch_size, max_input_len=max_input_len, - 
max_output_len=max_output_len, + # max_output_len=max_output_len, max_beam_width=max_beam_width, + enable_chunked_context=enable_chunked_context, + max_tokens_in_paged_kv_cache=max_tokens_in_paged_kv_cache, debug_mode=False, ) @@ -279,6 +294,8 @@ def load( lora_ckpt_list: List[str] = None, num_beams: int = 1, use_python_runtime: bool = True, + enable_chunked_context: bool = False, + max_tokens_in_paged_kv_cache: int = None, ) -> TensorrtLLMHostContext: """Loaded the compiled LLM model and run it. @@ -290,17 +307,42 @@ def load( config = json.load(f) world_size = config["pretrained_config"]["mapping"]["world_size"] if world_size == 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) + _load( + tokenizer, + engine_dir, + lora_ckpt_list, + num_beams, + use_python_runtime, + enable_chunked_context, + max_tokens_in_paged_kv_cache, + ) executor = None elif tensorrt_llm.mpi_world_size() > 1: - _load(tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) + _load( + tokenizer, + engine_dir, + lora_ckpt_list, + num_beams, + use_python_runtime, + enable_chunked_context, + max_tokens_in_paged_kv_cache, + ) executor = None tensorrt_llm.mpi_barrier() else: executor = MPIPoolExecutor(max_workers=world_size) futures = [] for _ in range(world_size): - future = executor.submit(_load, tokenizer, engine_dir, lora_ckpt_list, num_beams, use_python_runtime) + future = executor.submit( + _load, + tokenizer, + engine_dir, + lora_ckpt_list, + num_beams, + use_python_runtime, + enable_chunked_context, + max_tokens_in_paged_kv_cache, + ) futures.append(future) for future in futures: future.result() diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 9d9f0fa200f0..01be9ff63a0d 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -80,7 +80,7 @@ def get_args(argv): parser.add_argument( "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the trt-llm conversion" ) - parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment") + parser.add_argument("-ng", "--num_gpus", default=None, type=int, help="Number of GPUs for the deployment") parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size") parser.add_argument("-pps", "--pipeline_parallelism_size", default=1, type=int, help="Pipeline parallelism size") parser.add_argument( @@ -95,7 +95,13 @@ def get_args(argv): parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model") parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") parser.add_argument("-mnt", "--max_num_tokens", default=None, type=int, help="Max number of tokens") + parser.add_argument("-msl", "--max_seq_len", default=None, type=int, help="Maximum number of sequence length") + parser.add_argument("-mp", "--multiple_profiles", default=False, action='store_true', help="Multiple profiles") parser.add_argument("-ont", "--opt_num_tokens", default=None, type=int, help="Optimum number of tokens") + parser.add_argument( + "-gap", "--gpt_attention_plugin", default="auto", type=str, help="dtype of gpt attention plugin" + ) + parser.add_argument("-gp", "--gemm_plugin", default="auto", type=str, help="dtype of gpt plugin") parser.add_argument( "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" ) @@ -284,6 +290,7 @@ def 
get_trtllm_deployable(args): max_batch_size=args.max_batch_size, max_num_tokens=args.max_num_tokens, opt_num_tokens=args.opt_num_tokens, + max_seq_len=args.max_seq_len, use_parallel_embedding=args.use_parallel_embedding, max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, paged_kv_cache=(not args.no_paged_kv_cache), @@ -293,6 +300,9 @@ def get_trtllm_deployable(args): use_lora_plugin=args.use_lora_plugin, lora_target_modules=args.lora_target_modules, max_lora_rank=args.max_lora_rank, + multiple_profiles=args.multiple_profiles, + gpt_attention_plugin=args.gpt_attention_plugin, + gemm_plugin=args.gemm_plugin, ) except Exception as error: raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) diff --git a/tests/export/run.sh b/tests/export/run.sh index e534e4e87ee9..a2366f0634ea 100644 --- a/tests/export/run.sh +++ b/tests/export/run.sh @@ -36,12 +36,9 @@ python tests/export/nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_ python tests/export/nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_tps 2 --max_tps 8 python tests/export/nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_tps 8 --max_tps 8 python tests/export/nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_tps 8 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_tps 1 --max_tps 8 -python tests/export/nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_tps 1 --max_tps 2 +python tests/export/nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_tps 1 --max_tps 1 python tests/export/nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_tps 2 --max_tps 8 python tests/export/nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_tps 1 --max_tps 1 -python tests/export/nemo_export.py --model_name GEMMA-base --existing_test_models --min_tps 1 --max_tps 1 \ No newline at end of file +python tests/export/nemo_export.py --model_name STARCODER2-15B-base --existing_test_models --min_tps 1 --max_tps 1 +python tests/export/nemo_export.py --model_name GEMMA-base --existing_test_models --min_tps 1 --max_tps 1 +python tests/export/nemo_export.py --model_name Nemotron3-22B-base-32k --existing_test_models --min_tps 2 \ No newline at end of file diff --git a/tests/infer_data_path.py b/tests/infer_data_path.py index 45850dcb366a..4125e77c0a1b 100644 --- a/tests/infer_data_path.py +++ b/tests/infer_data_path.py @@ -19,125 +19,22 @@ def get_infer_test_data(): test_data = {} - test_data["NV-GPT-8B-Base-4k"] = {} - test_data["NV-GPT-8B-Base-4k"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-Base-4k"]["min_tps"] = 1 - test_data["NV-GPT-8B-Base-4k"]["location"] = "Local" - test_data["NV-GPT-8B-Base-4k"]["model_dir"] = "/tmp/NV-GPT-8B-Base-4k/nv-gpt-8b-base-4k_v1.0/" - test_data["NV-GPT-8B-Base-4k"][ + test_data["Nemotron3-22B-base-32k"] = {} + test_data["Nemotron3-22B-base-32k"]["model_type"] = "gptnext" + 
test_data["Nemotron3-22B-base-32k"]["min_tps"] = 2 + test_data["Nemotron3-22B-base-32k"]["location"] = "Local" + test_data["Nemotron3-22B-base-32k"]["model_dir"] = "/tmp/Nemotron3-22B-base-32k/" + test_data["Nemotron3-22B-base-32k"][ "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Base-4k/nv-gpt-8b-base-4k_v1.0/NV-GPT-8B-Base-4k.nemo" - test_data["NV-GPT-8B-Base-4k"]["p_tuning_checkpoint"] = "/opt/checkpoints/NV-GPT-8B-PTuning/nv-gpt-8B-ptuning.nemo" - test_data["NV-GPT-8B-Base-4k"]["prompt_template"] = [ + ] = "/opt/checkpoints/nemotron-3-22b-base-32k_v1.0/mcore-gpt3-22b-3_8T-pi32k-3_5T-cont-10k.nemo" + test_data["Nemotron3-22B-base-32k"]["prompt_template"] = [ "The capital of France is", "Largest animal in the sea is", "Fastest animal in the world is", ] - test_data["NV-GPT-8B-Base-4k"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Base-4k"]["max_output_len"] = 128 - test_data["NV-GPT-8B-Base-4k"]["max_batch_size"] = 10 - - test_data["NV-GPT-8B-Base-16k"] = {} - test_data["NV-GPT-8B-Base-16k"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-Base-16k"]["min_tps"] = 1 - test_data["NV-GPT-8B-Base-16k"]["location"] = "Local" - test_data["NV-GPT-8B-Base-16k"]["model_dir"] = "/tmp/NV-GPT-8B-Base-16k/nv-gpt-8b-base-16k_v1.0/" - test_data["NV-GPT-8B-Base-16k"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Base-16k/nv-gpt-8b-base-16k_v1.0/NV-GPT-8B-Base-16k.nemo" - test_data["NV-GPT-8B-Base-16k"]["prompt_template"] = [ - "The capital of France is", - "Largest animal in the sea is", - "Fastest animal in the world is", - ] - test_data["NV-GPT-8B-Base-16k"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Base-16k"]["max_output_len"] = 128 - test_data["NV-GPT-8B-Base-16k"]["max_batch_size"] = 20 - - test_data["NV-GPT-8B-QA-4k"] = {} - test_data["NV-GPT-8B-QA-4k"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-QA-4k"]["min_tps"] = 1 - test_data["NV-GPT-8B-QA-4k"]["location"] = "Local" - test_data["NV-GPT-8B-QA-4k"]["model_dir"] = "/tmp/NV-GPT-8B-QA-4k/nv-gpt-8b-qa-4k_v1.0/" - test_data["NV-GPT-8B-QA-4k"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-QA-4k/nv-gpt-8b-qa-4k_v1.0/NV-GPT-8B-QA-4k.nemo" - test_data["NV-GPT-8B-QA-4k"]["prompt_template"] = [ - "What is the capital of France?", - "What is the largest animal in the sea?", - "What is the fastest animal in the world?", - ] - test_data["NV-GPT-8B-QA-4k"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-QA-4k"]["max_output_len"] = 96 - test_data["NV-GPT-8B-QA-4k"]["max_batch_size"] = 20 - - test_data["NV-GPT-8B-Chat-4k-SFT"] = {} - test_data["NV-GPT-8B-Chat-4k-SFT"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-Chat-4k-SFT"]["min_tps"] = 1 - test_data["NV-GPT-8B-Chat-4k-SFT"]["location"] = "Local" - test_data["NV-GPT-8B-Chat-4k-SFT"]["model_dir"] = "/tmp/NV-GPT-8B-Chat-4k-SFT/nv-gpt-8b-chat-4k-sft_v1.0/" - test_data["NV-GPT-8B-Chat-4k-SFT"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Chat-4k-SFT/nv-gpt-8b-chat-4k-sft_v1.0/NV-GPT-8B-Chat-4k-SFT.nemo" - test_data["NV-GPT-8B-Chat-4k-SFT"]["prompt_template"] = [ - "What is the capital of France?", - "What is the largest animal in the sea?", - "What is the fastest animal in the world?", - ] - test_data["NV-GPT-8B-Chat-4k-SFT"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Chat-4k-SFT"]["max_output_len"] = 256 - test_data["NV-GPT-8B-Chat-4k-SFT"]["max_batch_size"] = 5 - - test_data["NV-GPT-8B-Chat-4k-RLHF"] = {} - test_data["NV-GPT-8B-Chat-4k-RLHF"]["model_type"] = "gptnext" - 
test_data["NV-GPT-8B-Chat-4k-RLHF"]["min_tps"] = 1 - test_data["NV-GPT-8B-Chat-4k-RLHF"]["location"] = "Local" - test_data["NV-GPT-8B-Chat-4k-RLHF"]["model_dir"] = "/tmp/NV-GPT-8B-Chat-4k-RLHF/nv-gpt-8b-chat-4k-rlhf_v1.0/" - test_data["NV-GPT-8B-Chat-4k-RLHF"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Chat-4k-RLHF/nv-gpt-8b-chat-4k-rlhf_v1.0/NV-GPT-8B-Chat-4k-RLHF.nemo" - test_data["NV-GPT-8B-Chat-4k-RLHF"]["prompt_template"] = [ - "What is the capital of France?", - "What is the largest animal in the sea?", - "What is the fastest animal in the world?", - ] - test_data["NV-GPT-8B-Chat-4k-RLHF"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Chat-4k-RLHF"]["max_output_len"] = 128 - test_data["NV-GPT-8B-Chat-4k-RLHF"]["max_batch_size"] = 10 - - test_data["NV-GPT-8B-Chat-4k-SteerLM"] = {} - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["model_type"] = "gptnext" - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["min_tps"] = 1 - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["location"] = "Local" - test_data["NV-GPT-8B-Chat-4k-SteerLM"][ - "model_dir" - ] = "/tmp/NV-GPT-8B-Chat-4k-SteerLM/nv-gpt-8b-chat-4k-steerlm_v1.0/" - test_data["NV-GPT-8B-Chat-4k-SteerLM"][ - "checkpoint" - ] = "/opt/checkpoints/NV-GPT-8B-Chat-4k-SteerLM/nv-gpt-8b-chat-4k-steerlm_v1.0/NV-GPT-8B-Chat-4k-SteerLM.nemo" - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["prompt_template"] = [ - "What is the capital of France?", - "What is the largest animal in the sea?", - "What is the fastest animal in the world?", - ] - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["max_output_len"] = 128 - test_data["NV-GPT-8B-Chat-4k-SteerLM"]["max_batch_size"] = 10 - - test_data["GPT-43B-Base"] = {} - test_data["GPT-43B-Base"]["model_type"] = "gptnext" - test_data["GPT-43B-Base"]["min_tps"] = 2 - test_data["GPT-43B-Base"]["location"] = "Local" - test_data["GPT-43B-Base"]["model_dir"] = "/tmp/GPT-43B-Base/gpt-43B-base/" - test_data["GPT-43B-Base"]["checkpoint"] = "/opt/checkpoints/GPT-43B-Base/gpt-43B-base.nemo" - test_data["GPT-43B-Base"]["prompt_template"] = [ - "The capital of France is", - "Largest animal in the sea is", - "Fastest animal in the world is", - ] - test_data["GPT-43B-Base"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] - test_data["GPT-43B-Base"]["max_output_len"] = 128 - test_data["GPT-43B-Base"]["max_batch_size"] = 10 + test_data["Nemotron3-22B-base-32k"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] + test_data["Nemotron3-22B-base-32k"]["max_output_len"] = 128 + test_data["Nemotron3-22B-base-32k"]["max_batch_size"] = 10 test_data["LLAMA2-7B-base"] = {} test_data["LLAMA2-7B-base"]["model_type"] = "llama" @@ -367,6 +264,17 @@ def get_infer_test_data(): test_data["STARCODER1-15B-base"]["max_output_len"] = 128 test_data["STARCODER1-15B-base"]["max_batch_size"] = 5 + test_data["STARCODER2-15B-base"] = {} + test_data["STARCODER2-15B-base"]["model_type"] = "starcoder" + test_data["STARCODER2-15B-base"]["min_tps"] = 1 + test_data["STARCODER2-15B-base"]["location"] = "Local" + test_data["STARCODER2-15B-base"]["model_dir"] = "/tmp/STARCODER2-15B-base/trt_llm_model-1/" + test_data["STARCODER2-15B-base"]["checkpoint"] = "/opt/checkpoints/starcoder-2_15b_4k_vfinal/4194b.nemo" + test_data["STARCODER2-15B-base"]["prompt_template"] = ["def fibonnaci(n"] + test_data["STARCODER2-15B-base"]["expected_keyword"] = ["fibonnaci"] + test_data["STARCODER2-15B-base"]["max_output_len"] = 128 + test_data["STARCODER2-15B-base"]["max_batch_size"] = 5 
+ test_data["GEMMA-base"] = {} test_data["GEMMA-base"]["model_type"] = "gemma" test_data["GEMMA-base"]["min_tps"] = 1 From fc0e4ab09025a8584343c8c9818f748a62597c1a Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 26 Jul 2024 08:27:45 -0700 Subject: [PATCH 7/8] add code owner (#9917) --- .github/CODEOWNERS | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000000..ef7434efe377 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +.github/ @pablo-garay @ko3n1g +Dockerfile.ci @pablo-garay @ko3n1g From 67aee7fb975e44bdebe1840527725a095b22580c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2024 19:39:58 +0400 Subject: [PATCH 8/8] Fix Docker build. Make Dockerfile consistent with CI (#9784) (#9915) * Fix Docker build. Make Dockerfile consistent with CI --------- Signed-off-by: Vladimir Bataev Co-authored-by: Vladimir Bataev --- Dockerfile.speech | 24 ++++++++++++++++++------ scripts/installers/install_k2.sh | 2 +- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/Dockerfile.speech b/Dockerfile.speech index cfe7d9eb5fdc..e7cc670a132d 100644 --- a/Dockerfile.speech +++ b/Dockerfile.speech @@ -62,23 +62,28 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* WORKDIR /workspace/ + +ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea +ARG MCORE_TAG=338af51452a53982d202e8386db6233adad1ce86 +ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c # Install megatron core, this can be removed once 0.3 pip package is released # We leave it here in case we need to work off of a specific commit in main RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout c7a1f82d761577e6ca0338d3521eac82f2aa0904 && \ + git checkout ${MCORE_TAG} && \ pip install . # Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771 RUN git clone https://github.com/NVIDIA/apex.git && \ cd apex && \ - git checkout f058162b215791b15507bb542f22ccfde49c872d && \ - pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ + git checkout ${APEX_TAG} && \ + pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir \ + --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ # Transformer Engine 1.2.0 RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \ cd TransformerEngine && \ - git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \ + git fetch origin ${TE_TAG} && \ git checkout FETCH_HEAD && \ git submodule init && git submodule update && \ NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . @@ -126,7 +131,9 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL WORKDIR /tmp/nemo ENV LHOTSE_REQUIRE_TORCHAUDIO=0 COPY requirements . 
-RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done +# exclude requirements_vllm.txt, since `vllm==0.5.x` breaks the container due to hardcoded requirements `torch==2.3.0` +RUN for f in $(ls requirements*.txt | grep -v 'requirements_vllm.txt'); do \ + pip3 install --disable-pip-version-check --no-cache-dir -r $f; done # install flash attention RUN pip install flash-attn @@ -151,7 +158,12 @@ RUN /usr/bin/test -n "$NEMO_VERSION" && \ RUN --mount=from=nemo-src,target=/tmp/nemo,rw cd /tmp/nemo && pip install ".[all]" # Check install -RUN python -c "import nemo.collections.nlp as nemo_nlp" && \ +# NB: adjusting LD_LIBRARY_PATH (only here, should not be persistent!) is a temporary hack +# to avoid failure if CUDA is unavailable (`docker build` does not expose GPUs) +# The error is raised in NeMo Core, and the main reason is reinstalled Transformer-Engine; +RUN export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${CUDA_HOME}/compat/lib.real && \ + python -c "import nemo.collections.asr as nemo_asr" && \ + python -c "import nemo.collections.nlp as nemo_nlp" && \ python -c "import nemo.collections.tts as nemo_tts" && \ python -c "import nemo_text_processing.text_normalization as text_normalization" diff --git a/scripts/installers/install_k2.sh b/scripts/installers/install_k2.sh index 18d948209ab8..6de80ecae3eb 100755 --- a/scripts/installers/install_k2.sh +++ b/scripts/installers/install_k2.sh @@ -15,7 +15,7 @@ # limitations under the License. K2_REPO=https://github.com/k2-fsa/k2 -LATEST_RELEASE=525cfa5 # fix for PyTorch 2.2.0 +LATEST_RELEASE=5735fa7 # fix for PyTorch 2.4.0 # uncomment the following line after the next k2 version is released (>1.24.4) #LATEST_RELEASE=$(git -c 'versionsort.suffix=-' \ # ls-remote --exit-code --refs --sort='version:refname' --tags ${K2_REPO} '*.*' \