From fd751621096647a46c12036acc24c0946797751d Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Tue, 27 Aug 2024 09:53:08 -0700 Subject: [PATCH 1/7] Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins (#10223) * Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Remove duplicate Signed-off-by: Hemil Desai * Add entity to wandb logger Signed-off-by: Hemil Desai * Add documentation Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add warning Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * PR feedback Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add comments Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai --- nemo/collections/llm/recipes/log/default.py | 9 +- nemo/lightning/run/__init__.py | 0 nemo/lightning/run/plugins.py | 165 ++++++++++++++++++++ 3 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 nemo/lightning/run/__init__.py create mode 100644 nemo/lightning/run/plugins.py diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index dc18565a0e06..4d5e9223b535 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -10,14 +10,19 @@ def tensorboard_logger(name: str, save_dir: str = "tb_logs") -> Config[TensorBoa return Config(TensorBoardLogger, save_dir=save_dir, name=name) -def wandb_logger(project: str, name: str) -> Config[WandbLogger]: - return Config( +def wandb_logger(project: str, name: str, entity: Optional[str] = None) -> Config[WandbLogger]: + cfg = Config( WandbLogger, project=project, name=name, config={}, ) + if entity: + cfg.entity = entity + + return cfg + def default_log( ckpt_dir: str, diff --git a/nemo/lightning/run/__init__.py b/nemo/lightning/run/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py new file mode 100644 index 000000000000..0f6a76d4799f --- /dev/null +++ b/nemo/lightning/run/plugins.py @@ -0,0 +1,165 @@ +import copy +import logging +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable, Optional + +import nemo_run as run +import yaml +from nemo_run.core.serialization.yaml import YamlSerializer +from pytorch_lightning import Callback +from pytorch_lightning.loggers import WandbLogger + +from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback +from nemo.utils import logging + +# This file contains plugins based on NeMo-Run's run.Plugin API. +# Plugins operate both on a configured task and an executor at the same time, and are specific to NeMo-Run. +# If you are adding functionality that goes directly into the Pytorch Lightning trainer, you may consider adding a callback instead of a plugin. 
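+#
+# Example of attaching these plugins when launching a run (illustrative sketch only --
+# it assumes NeMo-Run's ``run.run`` accepts a ``plugins`` list, and ``recipe`` / ``executor``
+# are placeholders for an already-configured run.Partial and run.SlurmExecutor):
+#
+#     import nemo_run as run
+#     from nemo.lightning.run.plugins import NsysPlugin, PreemptionPlugin
+#
+#     run.run(
+#         recipe,
+#         executor=executor,
+#         plugins=[
+#             PreemptionPlugin(preempt_time=300),                 # SIGTERM 5 min before the Slurm time limit
+#             NsysPlugin(start_step=10, end_step=20, ranks=[0]),  # nsys profile of rank 0, steps 10-20
+#         ],
+#     )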
+ + +def _merge_callbacks(partial: run.Partial, callbacks: list[run.Config[Callback]]): + if hasattr(partial, "trainer"): + if hasattr(partial.trainer, "callbacks"): + for callback in callbacks: + if callback not in partial.trainer.callbacks: + partial.trainer.callbacks.append(callback) + else: + partial.trainer.callbacks = copy.deepcopy(callbacks) + + +@dataclass(kw_only=True) +class PreemptionPlugin(run.Plugin): + """ + A plugin for setting up Preemption callback and preemption signals. + + Args: + preempt_time (int): The time, in seconds, before the task's time limit at which the executor + will send a SIGTERM preemption signal. This allows tasks to be gracefully + stopped before reaching their time limit, reducing waste and + promoting fair resource usage. The default value is 300 seconds (5 minutes). + This is only supported for ``run.SlurmExecutor``. + callbacks (list[run.Config[Callback]]): A list of callback configurations that the plugin + will merge with the task's existing callbacks. + By default, the list includes NeMo's preemption callback. + """ + + preempt_time: int = 300 + callbacks: list[run.Config[Callback]] = field(default_factory=lambda: [run.Config(PreemptionCallback)]) + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Script): + logging.warning( + f"The {self.__class__.__name__} will have no effect on the task as it's an instance of run.Script" + ) + return + + if isinstance(executor, run.SlurmExecutor): + # Sends a SIGTERM self.preempt_time seconds before hitting time limit + logging.info( + f"{self.__class__.__name__} will send a SIGTERM {self.preempt_time} seconds before the job's time limit for your Slurm executor." + ) + executor.signal = f"TERM@{self.preempt_time}" + + _merge_callbacks(task, callbacks=self.callbacks) + + +@dataclass(kw_only=True) +class NsysPlugin(run.Plugin): + """ + A plugin for nsys profiling. + + The NsysPlugin allows you to profile your run using nsys. + You can specify when to start and end the profiling, on which ranks to run the profiling, + and what to trace during profiling. + + Args: + start_step (int): The step at which to start the nsys profiling. + end_step (int): The step at which to end the nsys profiling. + ranks (Optional[list[int]]): The ranks on which to run the nsys profiling. If not specified, + profiling will be run on rank 0. + nsys_trace (Optional[list[str]]): The events to trace during profiling. If not specified, + 'nvtx' and 'cuda' events will be traced. + """ + + start_step: int + end_step: int + ranks: Optional[list[int]] = None + nsys_trace: Optional[list[str]] = None + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Partial): + nsys_callback = run.Config( + NsysCallback, + start_step=self.start_step, + end_step=self.end_step, + ranks=self.ranks or [0], + ) + callbacks: list[run.Config[Callback]] = [nsys_callback] # type: ignore + _merge_callbacks(task, callbacks=callbacks) + + launcher = executor.get_launcher() + launcher.nsys_profile = True + launcher.nsys_trace = self.nsys_trace or ["nvtx", "cuda"] + + +@dataclass(kw_only=True) +class WandbPlugin(run.Plugin): + """ + A plugin for setting up Weights & Biases. + + This plugin sets a ``WandbLogger`` to ``NeMoLogger``'s ``wandb`` arg, + which in turn initializes the Pytorch Lightning `WandbLogger `_. + + This plugin is only activated if the ``WANDB_API_KEY`` environment variable is set. 
+ The ``WANDB_API_KEY`` environment variables will also be set in the executor's environment variables. + Follow https://docs.wandb.ai/quickstart to retrieve your ``WANDB_API_KEY``. + + If `log_task_config` is True, the plugin will log the task configuration as a config dictionary + to the Weights and Biases logger. + + Args: + name (str): The name for the Weights & Biases run. + logger_fn (Callable[..., run.Config[WandbLogger]]): A callable that returns a Config of ``WandbLogger`` + log_task_config (bool, optional): Whether to log the task configuration to the logger. + Defaults to True. + + Raises: + logging.warning: If the task is an instance of `run.Script`, as the plugin has no effect on such tasks. + """ + + name: str + logger_fn: Callable[..., run.Config[WandbLogger]] + log_task_config: bool = True + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Script): + logging.warning( + f"The {self.__class__.__name__} will have no effect on the task as it's an instance of run.Script" + ) + return + + if "WANDB_API_KEY" in os.environ: + executor.env_vars["WANDB_API_KEY"] = os.environ["WANDB_API_KEY"] + + if hasattr(task, "log") and hasattr(task.log, "wandb"): + task.log.wandb = self.logger_fn(name=self.name) + if self.log_task_config: + partial_config = yaml.safe_load(YamlSerializer().serialize(task)) + partial_config["experiment"] = { + "id": self.experiment_id, + "task_name": self.name, + "executor": executor.info(), + "remote_directory": ( + os.path.join(executor.tunnel.job_dir, Path(executor.job_dir).name) + if isinstance(executor, run.SlurmExecutor) + else None + ), + "local_directory": executor.job_dir, + } + task.log.wandb.config = partial_config + else: + logging.warning( + f"The {self.__class__.__name__} will have no effect as WANDB_API_KEY environment variable is not set." + ) From 38800cdad5d46565a156bc62ebcd2847a4f4d043 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Tue, 27 Aug 2024 14:51:20 -0700 Subject: [PATCH 2/7] [NeMo-UX] Handle absolute logger directories in nemo_logger (#10259) * handle absolute and relative logger directories Signed-off-by: Anna Shors * merge lines Signed-off-by: ashors1 --------- Signed-off-by: Anna Shors Signed-off-by: ashors1 --- nemo/lightning/nemo_logger.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index 6509c384f8cf..bae62f09593b 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -30,11 +30,10 @@ class NeMoLogger(IOMixin): log_global_rank_0_only (bool): Log only on global rank 0. files_to_copy (Optional[List[str]]): List of files to copy to log directory. update_logger_directory (bool): Whether to update logger directory to write to `exp_dir`. - If True, the `save_dir` passed to the logger will be treated as a relative path and - the logger will be reconfigured to write to `exp_dir / save_dir`. This ensures that - all output from an experiment is written to a common directory. If False, the logger's - save_dir will not be overwritten. This argument applies only to TensorBoardLogger and - WandbLogger instances. + If True, the `save_dir` passed to the logger will be reconfigured to write to `exp_dir / save_dir`. + This ensures that all output from an experiment is written to a common directory. + If False, the logger's save_dir will not be overwritten. + This argument applies only to TensorBoardLogger and WandbLogger instances. 
ckpt (Optional[ModelCheckpoint]): Model checkpoint callback. tensorboard: (Optional[TensorBoardLogger]): A PyTorch Lightning TensorBoardLogger instance to add to the trainer. @@ -158,7 +157,7 @@ def _setup_trainer_loggers(self, trainer, dir, version): for logger in trainer.loggers: if isinstance(logger, TensorBoardLogger): logger._version = version or "" - logger._root_dir = Path(dir) / logger.save_dir + logger._root_dir = Path(dir) / os.path.relpath(logger.save_dir) trainer.logger._name = self.name logging.warning( f'"update_logger_directory" is True. Overwriting tensorboard logger "save_dir" to {logger._root_dir}' From 57aa305ef60c67e72c10725402fabe267f1470bb Mon Sep 17 00:00:00 2001 From: Ming <111467530+Victor49152@users.noreply.github.com> Date: Tue, 27 Aug 2024 15:20:22 -0700 Subject: [PATCH 3/7] Add sdxl notebook (#10139) * Add sdxl notebook Signed-off-by: mingyuanm * Rename Signed-off-by: mingyuanm * final Update SDXL notebook Signed-off-by: mingyuanm --------- Signed-off-by: mingyuanm --- docs/source/multimodal/text2img/sd.rst | 2 +- tutorials/multimodal/SDXL Tutorial.ipynb | 253 +++++++++++++++++++++++ 2 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 tutorials/multimodal/SDXL Tutorial.ipynb diff --git a/docs/source/multimodal/text2img/sd.rst b/docs/source/multimodal/text2img/sd.rst index 6f5092f93f5f..549f13bbabf6 100644 --- a/docs/source/multimodal/text2img/sd.rst +++ b/docs/source/multimodal/text2img/sd.rst @@ -163,7 +163,7 @@ Optimization related configurations Training with precached latents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Since the VAE and text encoder remain frozed during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``. +Since the VAE and text encoder remain frozen during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``. Reference ----------- diff --git a/tutorials/multimodal/SDXL Tutorial.ipynb b/tutorials/multimodal/SDXL Tutorial.ipynb new file mode 100644 index 000000000000..92667100b405 --- /dev/null +++ b/tutorials/multimodal/SDXL Tutorial.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "d874e23f-9631-48e0-b635-84e7280bf07b", + "metadata": {}, + "source": [ + "# SDXL Training / Inference Tutorial\n", + "\n", + "### Note:\n", + "Currently, this notebook must be run in a NeMo container (> 24.09) and open_clip_torch<=2.24.0. An example command to launch the container:\n", + "\n", + "```\n", + "docker run --gpus all -it --rm -v :/opt/NeMo -v :/datasets --shm-size=8g \\\n", + " -p 8888:8888 --ulimit memlock=-1 --ulimit \\\n", + " stack=67108864 \n", + "```\n", + "\n", + "\n", + "## Introduction\n", + "\n", + "This notebook illustrates how to train and perform inference using Stable Diffusion XL with the NeMo Toolkit. 
Despite differences in model configs, the training and inference procedure is similar as Stable Diffusion.\n", + "\n", + "The implementation of Stable Diffusion XL is based on [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/abs/2307.01952).\n", + "\n", + "This tutorial will guide you through the following topics:\n", + "\n", + "1. Training a Stable Diffusion XL model.\n", + "2. Performing inference with the trained model.\n", + "\n", + "## Datasets\n", + "\n", + "Please refer to [Dataset Tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/multimodal/Multimodal%20Data%20Preparation.ipynb) for how to prepare a training dataset for Stable diffusion XL.\n", + "\n", + "For a pre-cached Stable Diffusion dataset, each webdataset tar file should, at a minimum, include the pickle files that store the pre-cached image and text features:\n", + "\n", + "```\n", + "t0_r0_0.tar\n", + "|---- 0000.pickle\n", + "|---- 0001.pickle\n", + "...\n", + "```\n", + "\n", + "For non-precached Stable Diffusion dataset, each webdataset tar file should contain the raw texts and corresponding images:\n", + "\n", + "```\n", + "t0_r0_0.tar\n", + "|---- 0000.jpg\n", + "|---- 0000.txt\n", + "|---- 0001.jpg\n", + "|---- 0001.txt\n", + "...\n", + "```\n", + "\n", + "## Encoders Preparation\n", + "\n", + "Depending on whether you precache the dataset, you might also need to first download the image and/or text encoders.\n", + "\n", + "### Option 1: Training on Non-Precached Dataset (Use Encoders During Training)\n", + "\n", + "#### A. Prepare VAE\n", + "To download the default VAE for Stable Diffusion:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730cd137-0fce-4bab-8ac7-219e5c55faf2", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "! wget https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/vae/diffusion_pytorch_model.safetensors\n", + "! mkdir -p /sdxl_ckpts\n", + "! mv diffusion_pytorch_model.safetensors /sdxl_ckpts/vae.safetensors" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fef8b245-7cee-4048-a9ec-3ada90432a89", + "metadata": {}, + "source": [ + "The above command will download the default VAE weights from HuggingFace and save it to `/sdxl_ckpts/vae.safetensors`.\n", + "\n", + "**Note**: if you want to customize the saved location, make sure it is also reflected in your training config.\n", + "#### B. 
Prepare Text Encoder\n", + "For the text encoders used in Stable Diffusion XL, it will be automatically downloaded by the training script we provide.\n", + "\n", + "The type of text encoder used in the sdxl model conditioner can be found in `conditioner_config` in the predefined training configs:\n", + "\n", + "```\n", + " conditioner_config:\n", + " _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner\n", + " emb_models:\n", + " - is_trainable: false\n", + " input_key: captions\n", + " ucg_rate: 0.1\n", + " emb_model:\n", + " _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder\n", + " layer: hidden\n", + " layer_idx: 11\n", + " - is_trainable: false\n", + " ucg_rate: 0.1\n", + " input_key: captions\n", + " emb_model:\n", + " _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2\n", + " arch: ViT-bigG-14\n", + " version: laion2b_s39b_b160k\n", + " freeze: true\n", + " layer: penultimate\n", + " always_return_pooled: true\n", + " legacy: false\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8854eb7a-e822-43f6-a1d5-12357049485a", + "metadata": {}, + "source": [ + "\n", + "### Option 2: Training on Precached Dataset (Training UNet Only)\n", + "\n", + "When using precached dataset (please refer to the [Dataset Tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/multimodal/Multimodal%20Data%20Preparation.ipynb) for details), every text feature and image feature are stored as key-value pairs in `.pickle` file:\n", + "\n", + "```\n", + "{\n", + " image_key: torch.Tensor(),\n", + " text_key: torch.Tensor(),\n", + "}\n", + "```\n", + "\n", + "Make sure in the training config, `cond_stage_key` is associated with `text_key` and `first_stage_key` is associated with `image_key`.\n", + "\n", + "We offer an expample script to convert a dataset from `parquet` file to webdataset `tar` files at [parquet_conversion](https://github.com/NVIDIA/NeMo/blob/main/scripts/multimodal_dataset_conversion/parquet_conversion.py). Three different modes of prechaed training are provided, they are:\n", + "\n", + "1. No Caching: VAE and Text encoders are loaded during training\n", + "2. Text only: Only text features are loaded from dataset during training\n", + "3. Both: Both image and text features are loaded from dataset during training\n", + "\n", + "In each mode, the cached components should be saved in its raw format in tarfiles while cached components should be saved as torch.Tensor()." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5762427b-f60c-4dfd-8318-e55771b25354", + "metadata": {}, + "source": [ + "## Model Config Setup\n", + "\n", + "Now we will begin setting up the config file needed for Stable Diffusion training. We will use [sd_train.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base_train.yaml) as the template.\n", + "\n", + "1. Modify `model.data.train.dataset_path` so that it has all the webdataset info files you want to train on\n", + "2. Modify `model.data.webdataset.local_root_path` to point to your dataset path\n", + "3. Make sure VAE path `model.first_stage_config.from_pretrained` is adjusted if using non-precached dataset\n", + "4. Make sure the `model.precache mode` is set properly with the dataset you prepared, as detailed above.\n", + "5. Configure `exp_manager.exp_dir` for experiment save directory\n", + "6. 
Configure `exp_manager.wandb_logger_kwargs` and/or `exp_manager.create_tensorboard_logger` if needed" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "70f858b3-f7d5-4678-b380-80582337bc23", + "metadata": {}, + "source": [ + "**Note**: Please refer to NeMo Toolkit Developer Guide's Stable Diffusion page for more details on in-depth customizations, including all available optimizations.\n", + "\n", + "## Training\n", + "\n", + "Once everything is set up, training stable diffusion is as simple as running:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "589e3a14-c881-4a56-b2bd-370653059dfc", + "metadata": {}, + "outputs": [], + "source": "! torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py trainer.max_steps=100 model.data.train.dataset_path=/path/to/wdinfo.pkl model.data.webdataset.local_root_path=/path/to/dataset trainer.devices=1 trainer.num_nodes=1 model.micro_batch_size=1 model.global_batch_size=1 model.first_stage_config.from_pretrained=/sdxl_ckpts/vae.safetensors model.fsdp=False" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "892d72dd-c4d7-4ca4-a948-168e187af65c", + "metadata": {}, + "source": [ + "Intermediate checkpoints (during training) and final checkpoint will be saved to `exp_manager.exp_dir` folder. Note that here we use synthetic data for demo purpose." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "087c8b9a-92c3-43d3-86a3-bf7e848dfbd2", + "metadata": {}, + "source": [ + "## Inference\n", + "\n", + "Stable Diffusion XL inference needs a trained NeMo Stable Diffusion checkpoint, along with both the image encoder (VAE) and text encoder (CLIP). The checkpoint can be either a fully trained `.nemo` checkpoint or an intermediate checkpoint from training (typically in `.ckpt` format). \n", + "\n", + "### Inference Config Setup\n", + "\n", + "Now we will begin setting up the config file needed for Stable Diffusion inference. We will use [sd_xl_infer_v2.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer_v2.yaml) as the template.\n", + "\n", + "We generally use [Classifier Free Guidance](https://arxiv.org/abs/2207.12598) for better visual quality, which can be set at `sampling.base.scale`.\n", + "\n", + "NeMo Stable Diffusion supports multiple samplers. Please refer to the developer guide for more details. Samplers can be set at `sampling.base.sampler`.\n", + "\n", + "Inference supports a batch of text prompts, which can be set at `infer.prompt`. One can also generate a configurable number of images per prompt by setting `infer.num_samples`. Generated images will be saved to `out_path`.\n", + "\n", + "You will also need to set the model checkpoint path at `model.restore_from_path` if you are loading from `.nemo` checkpoint, otherwise, mannually set `unet` checkpoints and `vae` checkpoint at `model.unet_config.from_pretrained` and `model.first_stage_config.from_pretrained`, respectively.\n", + "\n", + "### Running the Inference\n", + "\n", + "Once everything is set up, Stable Diffusion inference is as simple as running:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e676c5d-d711-489e-8ab7-3ee20046d88d", + "metadata": {}, + "outputs": [], + "source": "! 
torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py model.restore_from_path=/path/to/stable-diffusion-xl-train.nemo out_path=/sdxl_infer_out" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 19668e5320a2e2af0199b6d5e0b841993be3a634 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Tue, 27 Aug 2024 18:41:35 -0400 Subject: [PATCH 4/7] Add Llama31 Config (#10260) * add llama31 config * Apply isort and black reformatting Signed-off-by: suiyoubi * fix init method * typo * revert llama3-70b init method std --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi Co-authored-by: Chen Cui --- nemo/collections/llm/__init__.py | 6 ++ nemo/collections/llm/gpt/model/__init__.py | 6 ++ nemo/collections/llm/gpt/model/llama.py | 89 +++++++++++++++++++++- 3 files changed, 99 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 812daddf02b6..86373135adb5 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -39,6 +39,9 @@ Llama2Config70B, Llama3Config8B, Llama3Config70B, + Llama31Config8B, + Llama31Config70B, + Llama31Config405B, LlamaConfig, LlamaModel, MaskedTokenLossReduction, @@ -93,6 +96,9 @@ "Llama2Config70B", "Llama3Config8B", "Llama3Config70B", + "Llama31Config8B", + "Llama31Config70B", + "Llama31Config405B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index a0132a34d185..0452c8dc6f89 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -27,6 +27,9 @@ Llama2Config70B, Llama3Config8B, Llama3Config70B, + Llama31Config8B, + Llama31Config70B, + Llama31Config405B, LlamaConfig, LlamaModel, ) @@ -62,6 +65,9 @@ "Llama2Config70B", "Llama3Config8B", "Llama3Config70B", + "Llama31Config8B", + "Llama31Config70B", + "Llama31Config405B", "NemotronConfig", "Nemotron3Config4B", "Nemotron3Config8B", diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index ab2f46378a1e..4f7dd4d37a90 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -1,3 +1,4 @@ +import math from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Annotated, Callable, Optional @@ -9,6 +10,7 @@ from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io, teardown +from nemo.utils import logging if TYPE_CHECKING: from transformers import LlamaConfig as HFLlamaConfig @@ -66,7 +68,7 @@ class Llama3Config(GPTConfig): num_query_groups: int = 8 hidden_dropout: float = 0.0 attention_dropout: float = 0.0 - normalization = "RMSNorm" + normalization: str = "RMSNorm" init_method_std: float = 0.01 layernorm_epsilon: float = 1.0e-05 add_bias_linear: bool = False @@ -80,10 +82,31 @@ class Llama3Config(GPTConfig): bias_dropout_fusion: bool = True apply_rope_fusion: bool = True share_embeddings_and_output_weights: bool = False - position_embedding_type = 
"rope" + position_embedding_type: str = "rope" rotary_percent: float = 1.0 +@dataclass +class Llama31Config(Llama3Config): + scale_factor: int = 8 + low_freq_factor: int = 1 + high_freq_factor: int = 4 + old_context_len: int = 8192 + init_method_std: float = 0.02 + + def configure_model(self, tokenizer) -> "MCoreGPTModel": + model = super().configure_model(tokenizer) + # Apply rope scaling for Llama3.1 model + model.rotary_pos_emb.inv_freq = apply_rope_scaling( + model.rotary_pos_emb.inv_freq, + factor=self.scale_factor, + low_freq_factor=self.low_freq_factor, + high_freq_factor=self.high_freq_factor, + old_context_len=self.old_context_len, + ) + return model + + @dataclass class Llama3Config8B(Llama3Config): rotary_base: int = 500_000 @@ -106,6 +129,38 @@ class Llama3Config70B(Llama3Config): make_vocab_size_divisible_by: int = 128 +@dataclass +class Llama31Config8B(Llama31Config): + rotary_base: int = 500_000 + seq_length: int = 131072 + num_layers: int = 32 + hidden_size: int = 4096 + ffn_hidden_size: int = 14336 + num_attention_heads: int = 32 + + +@dataclass +class Llama31Config70B(Llama31Config): + rotary_base: int = 500_000 + seq_length: int = 131072 + num_layers: int = 80 + hidden_size: int = 8192 + ffn_hidden_size: int = 28672 + num_attention_heads: int = 64 + make_vocab_size_divisible_by: int = 128 + + +@dataclass +class Llama31Config405B(Llama31Config): + rotary_base: int = 500_000 + seq_length: int = 131072 + num_layers: int = 126 + hidden_size: int = 16384 + ffn_hidden_size: int = 53248 + num_attention_heads: int = 128 + make_vocab_size_divisible_by: int = 128 + + @dataclass class CodeLlamaConfig7B(Llama2Config7B): rotary_base: int = 1_000_000 @@ -365,6 +420,33 @@ def _export_linear_fc1(linear_fc1): return gate_proj, up_proj +def apply_rope_scaling( + inv_freq, + factor: int = 8, + low_freq_factor: int = 1, + high_freq_factor: int = 4, + old_context_len: int = 8192, +): + logging.info( + f"Apply rope scaling with factor={factor}, low_freq_factor={low_freq_factor}, high_freq_factor={high_freq_factor}, old_context_len={old_context_len}." 
+ ) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + wavelen = 2 * math.pi / inv_freq + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) + + return inv_freq_llama + + __all__ = [ "LlamaConfig", "Llama2Config7B", @@ -372,6 +454,9 @@ def _export_linear_fc1(linear_fc1): "Llama2Config70B", "Llama3Config8B", "Llama3Config70B", + "Llama31Config8B", + "Llama31Config70B", + "Llama31Config405B", "CodeLlamaConfig7B", "CodeLlamaConfig13B", "CodeLlamaConfig34B", From c7c3eae455be3cda28210e11625f31633e13abe2 Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Tue, 27 Aug 2024 16:30:09 -0700 Subject: [PATCH 5/7] Added offloading support for LoRA adapters (#10237) Signed-off-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj Co-authored-by: Chen Cui --- .../modules/common/megatron/adapters/parallel_adapters.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 4f9f04527038..29eea2d54664 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -177,6 +177,7 @@ def __init__( model_parallel_config = ModelParallelConfig() self._sequence_parallel = model_parallel_config.sequence_parallel model_parallel_config.sequence_parallel = False # SP is irrelevant for the lora linear layer + self.config = model_parallel_config if input_is_parallel: self.linear_in = RowParallelLinear( @@ -298,8 +299,14 @@ def forward(self, x): # this function also handles the backward pass correctly x = gather_from_sequence_parallel_region(x) + if self.config.cpu_offloading and self.config.cpu_offloading_activations: + x.activation_offloading = True x, _ = self.linear_in(x) # (@adithyare) ColumnLinear returns output and bias, we are ignoring the bias term. 
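+        # Note: the `x.activation_offloading = True` tags set in this forward pass (above, and
+        # again before linear_out) mark those tensors for activation offloading to CPU when
+        # cpu_offloading and cpu_offloading_activations are enabled in the parallel config.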
+ x = self.activation(x) + + if self.config.cpu_offloading and self.config.cpu_offloading_activations: + x.activation_offloading = True x, _ = self.linear_out(x) if self._sequence_parallel and self.input_is_parallel: From f53600a3e3b85621985f26b9c1f2b9261b2cac96 Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Tue, 27 Aug 2024 19:35:13 -0400 Subject: [PATCH 6/7] Add Qwen2 to Nemo 2 (#10258) * add qwen2 * typo * Apply isort and black reformatting Signed-off-by: suiyoubi * qwen without pip install issue * Apply isort and black reformatting Signed-off-by: suiyoubi * remove calculate vocab size divisible --------- Signed-off-by: suiyoubi Co-authored-by: suiyoubi --- nemo/collections/llm/__init__.py | 12 + nemo/collections/llm/gpt/model/__init__.py | 14 + nemo/collections/llm/gpt/model/base.py | 12 +- nemo/collections/llm/gpt/model/qwen2.py | 392 +++++++++++++++++++++ 4 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 nemo/collections/llm/gpt/model/qwen2.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 86373135adb5..168f05d2e56e 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -58,6 +58,12 @@ Nemotron4Config340B, NemotronConfig, NemotronModel, + Qwen2Config, + Qwen2Config1P5B, + Qwen2Config7B, + Qwen2Config72B, + Qwen2Config500M, + Qwen2Model, gpt_data_step, gpt_forward_step, ) @@ -117,6 +123,12 @@ "ChatGLM2Config6B", "ChatGLM3Config6B", "ChatGLMModel", + "Qwen2Model", + "Qwen2Config7B", + "Qwen2Config", + "Qwen2Config500M", + "Qwen2Config1P5B", + "Qwen2Config72B", "PreTrainingDataModule", "FineTuningDataModule", "SquadDataModule", diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 0452c8dc6f89..0bf2fc6f1e7b 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -49,6 +49,14 @@ NemotronConfig, NemotronModel, ) +from nemo.collections.llm.gpt.model.qwen2 import ( + Qwen2Config, + Qwen2Config1P5B, + Qwen2Config7B, + Qwen2Config72B, + Qwen2Config500M, + Qwen2Model, +) __all__ = [ "GPTConfig", @@ -93,6 +101,12 @@ "ChatGLM2Config6B", "ChatGLM3Config6B", "ChatGLMModel", + "Qwen2Config", + "Qwen2Config500M", + "Qwen2Config1P5B", + "Qwen2Config7B", + "Qwen2Config72B", + "Qwen2Model", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step", diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 2badfa2b1915..c108415a085e 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -13,6 +13,7 @@ from nemo.lightning import get_vocab_size, io from nemo.lightning.megatron_parallel import MaskedTokenLossReduction from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule +from nemo.utils import logging HAVE_TE = True try: @@ -131,10 +132,19 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": if not isinstance(transformer_layer_spec, ModuleSpec): transformer_layer_spec = transformer_layer_spec(self) + if hasattr(self, 'vocab_size'): + vocab_size = self.vocab_size + logging.info( + f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:" + f" {vocab_size - tokenizer.vocab_size}." 
+ ) + else: + vocab_size = get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by) + return MCoreGPTModel( self, transformer_layer_spec=transformer_layer_spec, - vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by), + vocab_size=vocab_size, max_sequence_length=self.seq_length, fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, parallel_output=self.parallel_output, diff --git a/nemo/collections/llm/gpt/model/qwen2.py b/nemo/collections/llm/gpt/model/qwen2.py new file mode 100644 index 000000000000..eb67dd9d4f0d --- /dev/null +++ b/nemo/collections/llm/gpt/model/qwen2.py @@ -0,0 +1,392 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Annotated, Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel +from nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io, teardown + +if TYPE_CHECKING: + from transformers import AutoModelForCausalLM + from transformers import Qwen2Config as HFQwen2Config + + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +@dataclass +class Qwen2Config(GPTConfig): + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + add_bias_linear: bool = False + add_qkv_bias: bool = True + seq_length: int = 4096 + init_method_std: int = 0.02 + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + vocab_size: int = 151936 + share_embeddings_and_output_weights: Optional[bool] = False + layernorm_epsilon: float = 1e-6 + rotary_base: float = 1000000.0 + position_embedding_type: str = "rope" + apply_query_key_layer_scaling: bool = True + + +@dataclass +class Qwen2Config500M(Qwen2Config): + num_layers: int = 24 + hidden_size: int = 896 + num_attention_heads: int = 14 + num_query_groups: int = 2 + ffn_hidden_size: int = 4864 + + +@dataclass +class Qwen2Config1P5B(Qwen2Config): + num_layers: int = 28 + hidden_size: int = 1536 + num_attention_heads: int = 12 + num_query_groups: int = 2 + ffn_hidden_size: int = 8960 + + +@dataclass +class Qwen2Config7B(Qwen2Config): + num_layers: int = 28 + hidden_size: int = 3584 + num_attention_heads: int = 28 + num_query_groups: int = 4 + ffn_hidden_size: int = 18944 + vocab_size: int = 152064 + + +@dataclass +class Qwen2Config72B(Qwen2Config): + num_layers: int = 80 + hidden_size: int = 8192 + num_attention_heads: int = 64 + num_query_groups: int = 8 + ffn_hidden_size: int = 29568 + vocab_size: int = 152064 + layernorm_epsilon: float = 1e-5 + vocab_size: int = 152064 + + +class Qwen2Model(GPTModel): + def __init__( + self, + config: Annotated[Optional[Qwen2Config], Config[Qwen2Config]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + super().__init__(config or Qwen2Config(), optim=optim, tokenizer=tokenizer, model_transform=model_transform) + + +@io.model_importer(Qwen2Model, "hf") +class HFQwen2Importer(io.ModelConnector["AutoModelForCausalLM", Qwen2Model]): + def init(self) -> Qwen2Model: + return Qwen2Model(self.config, tokenizer=self.tokenizer) + + def apply(self, output_path: Path) -> Path: + from transformers import AutoModelForCausalLM + + source = AutoModelForCausalLM.from_pretrained(str(self), 
trust_remote_code=True) + target = self.init() + trainer = self.nemo_setup(target) + self.convert_state(source, target) + self.nemo_save(output_path, trainer) + + print(f"Converted Qwen model to Nemo, model saved to {output_path}") + + teardown(trainer, target) + del trainer, target + + return output_path + + def convert_state(self, source, target): + mapping = { + "model.embed_tokens.weight": "embedding.word_embeddings.weight", + "model.layers.*.self_attn.o_proj.weight": "decoder.layers.*.self_attention.linear_proj.weight", + "model.layers.*.mlp.down_proj.weight": "decoder.layers.*.mlp.linear_fc2.weight", + "model.layers.*.input_layernorm.weight": "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight", + "model.layers.*.post_attention_layernorm.weight": "decoder.layers.*.mlp.linear_fc1.layer_norm_weight", + "model.norm.weight": "decoder.final_layernorm.weight", + "lm_head.weight": "output_layer.weight", + } + + return io.apply_transforms( + source, target, mapping=mapping, transforms=[_import_qkv, _import_qkv_bias, _import_linear_fc1] + ) + + @property + def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + + return AutoTokenizer(str(self), trust_remote_code=True) + + @property + def config(self) -> Qwen2Config: + from transformers import AutoConfig as HFAutoConfig + + source = HFAutoConfig.from_pretrained(str(self), trust_remote_code=True) + + output = Qwen2Config( + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + num_query_groups=source.num_key_value_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + gated_linear_unit=True, + make_vocab_size_divisible_by=128, + rotary_base=source.rope_theta, + share_embeddings_and_output_weights=False, + ) + + return output + + +@io.model_exporter(Qwen2Model, "hf") +class HFQwen2Exporter(io.ModelConnector[Qwen2Model, "AutoModelForCausalLM"]): + def init(self) -> "AutoModelForCausalLM": + from transformers import AutoModelForCausalLM + + return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) + + def apply(self, output_path: Path) -> Path: + target = self.init() + source, _ = self.nemo_load(str(self)) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + self.tokenizer.save_pretrained(output_path) + + return output_path + + def convert_state(self, source, target): + mapping = { + "embedding.word_embeddings.weight": "model.embed_tokens.weight", + "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "model.norm.weight", + "output_layer.weight": "lm_head.weight", + } + + return io.apply_transforms( + source, target, mapping=mapping, transforms=[_export_qkv, _export_qkv_bias, _export_linear_fc1] + ) + + @property + def tokenizer(self): + return io.load_context(str(self)).model.tokenizer.tokenizer + + @property + def config(self) -> "HFQwen2Config": + from transformers import Qwen2Config as HFQwen2Config + + source: Qwen2Config = io.load_context(str(self)).model.config + 
+ return HFQwen2Config( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + rms_norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + vocab_size=getattr(source, 'vocab_size', self.tokenizer.vocab_size), + sliding_window=source.seq_length, + tie_word_embeddings=False, + ) + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.weight", +) +def _import_qkv(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + old_tensor_shape = q.size() + new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:] + new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:] + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_weights_l = [] + for i in range(num_query_groups): + qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :]) + qkv_weights_l.append(k[i : i + 1, :, :]) + qkv_weights_l.append(v[i : i + 1, :, :]) + qkv_weights = torch.cat(qkv_weights_l) + assert qkv_weights.ndim == 3, qkv_weights.shape + assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape + assert qkv_weights.shape[1] == head_size, qkv_weights.shape + assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape + + qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size]) + + return qkv_weights + + +@io.state_transform( + source_key=( + "model.layers.*.self_attn.q_proj.bias", + "model.layers.*.self_attn.k_proj.bias", + "model.layers.*.self_attn.v_proj.bias", + ), + target_key="decoder.layers.*.self_attention.linear_qkv.bias", +) +def _import_qkv_bias(ctx: io.TransformCTX, q, k, v): + megatron_config = ctx.target.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + + new_q_tensor_shape = (head_num, head_size) + new_kv_tensor_shape = (num_query_groups, head_size) + + q = q.view(*new_q_tensor_shape) + k = k.view(*new_kv_tensor_shape) + v = v.view(*new_kv_tensor_shape) + + qkv_bias = torch.empty((0, head_size)) + for i in range(num_query_groups): + qkv_bias = torch.cat((qkv_bias, q[i * heads_per_group : (i + 1) * heads_per_group, :])) + qkv_bias = torch.cat((qkv_bias, k[i : i + 1, :])) + qkv_bias = torch.cat((qkv_bias, v[i : i + 1, :])) + qkv_bias = qkv_bias.reshape( + [ + head_size * (head_num + 2 * num_query_groups), + ] + ) + return qkv_bias + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "model.layers.*.self_attn.q_proj.weight", + "model.layers.*.self_attn.k_proj.weight", + "model.layers.*.self_attn.v_proj.weight", 
+ ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.bias", + target_key=( + "model.layers.*.self_attn.q_proj.bias", + "model.layers.*.self_attn.k_proj.bias", + "model.layers.*.self_attn.v_proj.bias", + ), +) +def _export_qkv_bias(ctx: io.TransformCTX, qkv_bias): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_num = megatron_config.num_attention_heads + head_size = hidden_size // head_num + qkv_total_dim = head_num + 2 * num_query_groups + + qkv_bias = qkv_bias.reshape([qkv_total_dim, head_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_bias = qkv_bias[q_slice].reshape(-1).cpu() + k_bias = qkv_bias[k_slice].reshape(-1).cpu() + v_bias = qkv_bias[v_slice].reshape(-1).cpu() + + return q_bias, k_bias, v_bias + + +@io.state_transform( + source_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), + target_key="decoder.layers.*.mlp.linear_fc1.weight", +) +def _import_linear_fc1(down, gate): + return torch.cat((down, gate), axis=0).float() + + +@io.state_transform( + source_key="decoder.layers.*.mlp.linear_fc1.weight", + target_key=("model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.up_proj.weight"), +) +def _export_linear_fc1(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj + + +__all__ = [ + "Qwen2Config", + "Qwen2Config500M", + "Qwen2Config1P5B", + "Qwen2Config7B", + "Qwen2Config72B", + "Qwen2Model", +] From e68f981c393441165548e45c65c49ed5283fc0d5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 27 Aug 2024 18:24:31 -0700 Subject: [PATCH 7/7] Lazy import tokenizers (#10213) * Move inflect to lazy import Signed-off-by: Alexandros Koumparoulis * Use lazy imports for tokenizer libraries Signed-off-by: Alexandros Koumparoulis * sacremoses lazy import Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis * fix cyclic import Signed-off-by: Alexandros Koumparoulis * Apply isort and black 
reformatting Signed-off-by: akoumpa * import fix Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * move pangu Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- .../data/en/data_preprocessing.py | 23 ++++-- .../common/parts/preprocessing/cleaners.py | 16 +++- .../common/tokenizers/en_ja_tokenizers.py | 7 +- .../common/tokenizers/indic_tokenizers.py | 4 +- .../common/tokenizers/moses_tokenizers.py | 4 +- .../nlp/modules/common/tokenizer_utils.py | 73 ++++++++++--------- .../niv2/preprocess_niv2.py | 13 +++- .../t0/t0_dataset_preproc.py | 7 +- 8 files changed, 91 insertions(+), 56 deletions(-) diff --git a/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py b/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py index 9523d0974db8..f902e771cde4 100644 --- a/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py +++ b/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py @@ -46,8 +46,8 @@ import os from argparse import ArgumentParser +from functools import cache -import inflect import regex as re from tqdm import tqdm @@ -60,12 +60,21 @@ ) from nemo.utils import logging -engine = inflect.engine() + +@cache +def inflect_engine(): + import inflect + + return inflect.engine() + # these are all words that can appear in a verbalized number, this list will be used later as a filter to detect numbers in verbalizations number_verbalizations = list(range(0, 20)) + list(range(20, 100, 10)) number_verbalizations = ( - [engine.number_to_words(x, zero="zero").replace("-", " ").replace(",", "") for x in number_verbalizations] + [ + inflect_engine().number_to_words(x, zero="zero").replace("-", " ").replace(",", "") + for x in number_verbalizations + ] + ["hundred", "thousand", "million", "billion", "trillion"] + ["point"] ) @@ -85,7 +94,7 @@ def process_url(o): """ def flatten(l): - """ flatten a list of lists """ + """flatten a list of lists""" return [item for sublist in l for item in sublist] if o != '' and '_letter' in o: @@ -129,6 +138,7 @@ def convert2digits(digits: str): Return: res: number verbalization of the integer prefix of the input """ + engine = inflect_engine() res = [] for i, x in enumerate(digits): if x in digit: @@ -145,6 +155,7 @@ def convert2digits(digits: str): def convert(example): + engine = inflect_engine() cls, written, spoken = example written = convert_fraction(written) @@ -288,7 +299,7 @@ def convert(example): def ignore(example): """ This function makes sure specific class types like 'PLAIN', 'ELECTRONIC' etc. are left unchanged. - + Args: example: data example """ @@ -300,7 +311,7 @@ def ignore(example): def process_file(fp): - """ Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory. + """Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory. For more info about the data format, refer to the `text_normalization doc `. 
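The core pattern behind this commit is deferring heavy text-processing and tokenizer imports until first use, so that importing NeMo modules stays fast. A minimal sketch of the pattern, restating the `inflect_engine` helper introduced in the diff above (the `spell_out` caller is a hypothetical illustration, not part of the patch):

```python
from functools import cache


@cache
def inflect_engine():
    # The heavy import runs only on the first call; @cache then returns the same
    # engine instance on every later call, so module import time is unaffected.
    import inflect

    return inflect.engine()


def spell_out(n: int) -> str:
    # Hypothetical caller: replaces the old module-level `engine.number_to_words(n)`.
    return inflect_engine().number_to_words(n)
```

The same idea recurs in the diffs below: imports of `sacremoses`, `pangu`, and the individual NeMo tokenizer classes move from module scope into the constructor or factory branch that first needs them.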
diff --git a/nemo/collections/common/parts/preprocessing/cleaners.py b/nemo/collections/common/parts/preprocessing/cleaners.py
index 40c80115786a..0697abe8792e 100644
--- a/nemo/collections/common/parts/preprocessing/cleaners.py
+++ b/nemo/collections/common/parts/preprocessing/cleaners.py
@@ -14,7 +14,6 @@
 import re
 
-import inflect
 from text_unidecode import unidecode
 
 from nemo.utils import logging
 
@@ -139,7 +138,14 @@
 ]
 
 
-inflect = inflect.engine()
+from functools import cache
+
+
+@cache
+def inflect_engine():
+    import inflect
+
+    return inflect.engine()
 
 
 def clean_text(string, table, punctuation_to_replace, abbreviation_version=None):
@@ -194,11 +200,12 @@ def reset(self):
         self.currency = None
 
     def format_final_number(self, whole_num, decimal):
+        inflect = inflect_engine()
         if self.currency:
             return_string = inflect.number_to_words(whole_num)
             return_string += " dollar" if whole_num == 1 else " dollars"
             if decimal:
-                return_string += " and " + inflect.number_to_words(decimal)
+                return_string += " and " + inflect_engine().number_to_words(decimal)
                 return_string += " cent" if whole_num == decimal else " cents"
             self.reset()
             return return_string
@@ -210,11 +217,12 @@ def format_final_number(self, whole_num, decimal):
         else:
             # Check if there are non-numbers
             def convert_to_word(match):
-                return " " + inflect.number_to_words(match.group(0)) + " "
+                return " " + inflect_engine().number_to_words(match.group(0)) + " "
 
             return re.sub(r'[0-9,]+', convert_to_word, whole_num)
 
     def clean(self, match):
+        inflect = inflect_engine()
         ws = match.group(2)
         number = match.group(3)
         _proceeding_symbol = match.group(7)
diff --git a/nemo/collections/common/tokenizers/en_ja_tokenizers.py b/nemo/collections/common/tokenizers/en_ja_tokenizers.py
index cf58130834e9..c72ae1853deb 100644
--- a/nemo/collections/common/tokenizers/en_ja_tokenizers.py
+++ b/nemo/collections/common/tokenizers/en_ja_tokenizers.py
@@ -14,9 +14,6 @@
 import re
 from typing import List
 
-from pangu import spacing
-from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
-
 try:
     import ipadic
     import MeCab
@@ -36,6 +33,8 @@ class EnJaProcessor:
     """
 
     def __init__(self, lang_id: str):
+        from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
+
         self.lang_id = lang_id
         self.moses_tokenizer = MosesTokenizer(lang=lang_id)
         self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
@@ -81,6 +80,8 @@ def __init__(self):
         self.mecab_tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS + " -Owakati")
 
     def detokenize(self, text: List[str]) -> str:
+        from pangu import spacing
+
         RE_WS_IN_FW = re.compile(
             r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])'
         )
diff --git a/nemo/collections/common/tokenizers/indic_tokenizers.py b/nemo/collections/common/tokenizers/indic_tokenizers.py
index 3b9192c8885b..eaf3aa5c7b64 100644
--- a/nemo/collections/common/tokenizers/indic_tokenizers.py
+++ b/nemo/collections/common/tokenizers/indic_tokenizers.py
@@ -14,8 +14,6 @@
 
 from typing import List
 
-from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
-
 
 class IndicProcessor:
     """
@@ -26,6 +24,8 @@ class IndicProcessor:
     def __init__(self, lang_id: str):
        if lang_id != 'hi':
            raise NotImplementedError
+        from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
+
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(lang=lang_id)
diff --git a/nemo/collections/common/tokenizers/moses_tokenizers.py b/nemo/collections/common/tokenizers/moses_tokenizers.py
index 27e91e6c5262..717427090dd2 100644
--- a/nemo/collections/common/tokenizers/moses_tokenizers.py
+++ b/nemo/collections/common/tokenizers/moses_tokenizers.py
@@ -14,8 +14,6 @@
 
 from typing import List
 
-from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
-
 
 class MosesProcessor:
     """
@@ -23,6 +21,8 @@ class MosesProcessor:
     """
 
     def __init__(self, lang_id: str):
+        from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer
+
        self.moses_tokenizer = MosesTokenizer(lang=lang_id)
        self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
        self.normalizer = MosesPunctNormalizer(lang=lang_id)
diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py
index 4cbadd87fe52..56496d56bc07 100644
--- a/nemo/collections/nlp/modules/common/tokenizer_utils.py
+++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py
@@ -16,28 +16,8 @@
 from dataclasses import MISSING, dataclass
 from typing import Dict, List, Optional
 
-import nemo
-from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer
-from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer
-from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
-from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer
-from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer
-from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer
-from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer
-from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import get_huggingface_pretrained_lm_models_list
-from nemo.collections.nlp.modules.common.lm_utils import get_pretrained_lm_models_list
-from nemo.collections.nlp.parts.nlp_overrides import HAVE_MEGATRON_CORE
 from nemo.utils import logging
 
-try:
-    from nemo.collections.nlp.modules.common.megatron.megatron_utils import get_megatron_tokenizer
-
-    HAVE_MEGATRON_CORE = True
-
-except (ImportError, ModuleNotFoundError):
-    HAVE_MEGATRON_CORE = False
-
-
 __all__ = ['get_tokenizer', 'get_tokenizer_list']
 
 
@@ -96,46 +76,61 @@ def get_tokenizer(
         model better learn word compositionality and become robust to segmentation errors. It has emperically been shown
         to improve inference time BLEU scores.
     """
+
     if special_tokens is None:
         special_tokens_dict = {}
     else:
         special_tokens_dict = special_tokens
 
     if 'megatron' in tokenizer_name:
-        if not HAVE_MEGATRON_CORE:
+        try:
+            from nemo.collections.nlp.modules.common.megatron.megatron_utils import (
+                get_megatron_merges_file,
+                get_megatron_tokenizer,
+                get_megatron_vocab_file,
+            )
+        except (ImportError, ModuleNotFoundError):
             raise ImportError(
                 "Megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
             )
         if vocab_file is None:
-            vocab_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_vocab_file(
-                tokenizer_name
-            )
-            merges_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_merges_file(
-                tokenizer_name
-            )
+            vocab_file = get_megatron_vocab_file(tokenizer_name)
+            merges_file = get_megatron_merges_file(tokenizer_name)
         tokenizer_name = get_megatron_tokenizer(tokenizer_name)
 
     if tokenizer_name == 'sentencepiece':
+        from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
+
         logging.info("tokenizer_model: " + str(tokenizer_model))
-        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
+        return SentencePieceTokenizer(
             model_path=tokenizer_model,
             special_tokens=special_tokens,
             legacy=True,
             chat_template=chat_template,
         )
     elif tokenizer_name == 'tiktoken':
-        return nemo.collections.common.tokenizers.tiktoken_tokenizer.TiktokenTokenizer(vocab_file=vocab_file)
+        from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer
+
+        return TiktokenTokenizer(vocab_file=vocab_file)
     elif tokenizer_name == 'word':
+        from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer
+
         return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict)
     elif tokenizer_name == 'char':
+        from nemo.collections.common.tokenizers.char_tokenizer import CharTokenizer
+
         return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict)
    elif tokenizer_name == 'regex':
+        from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer
+
         return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file)
 
     logging.info(
         f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, merges_files: {merges_file}, "
         f"special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}"
     )
+    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+
     return AutoTokenizer(
         pretrained_model_name=tokenizer_name,
         vocab_file=vocab_file,
@@ -183,6 +178,8 @@ def get_nmt_tokenizer(
         raise ValueError("No Tokenizer path provided or file does not exist!")
 
     if library == 'huggingface':
+        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+
         logging.info(f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}')
         return AutoTokenizer(
             pretrained_model_name=model_name,
@@ -193,26 +190,32 @@ def get_nmt_tokenizer(
             trust_remote_code=trust_remote_code,
         )
     elif library == 'sentencepiece':
+        from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
+
         logging.info(f'Getting SentencePiece with model: {tokenizer_model}')
-        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
+        return SentencePieceTokenizer(
             model_path=tokenizer_model,
             legacy=legacy,
             chat_template=chat_template,
         )
     elif library == 'byte-level':
+        from nemo.collections.common.tokenizers.bytelevel_tokenizers import ByteLevelTokenizer
+
         logging.info(f'Using byte-level tokenization')
         return ByteLevelTokenizer(special_tokens_dict)
     elif library == 'regex':
+        from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer
+
         logging.info(f'Using regex tokenization')
         return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file)
     elif library == 'megatron':
 
         if model_name == 'GPTSentencePieceTokenizer':
+            from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
+
             logging.info("tokenizer_model: ")
             logging.info(tokenizer_model)
-            return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
-                model_path=tokenizer_model, legacy=legacy
-            )
+            return SentencePieceTokenizer(model_path=tokenizer_model, legacy=legacy)
 
         if model_name in megatron_tokenizer_model_map:
             model_name = megatron_tokenizer_model_map[model_name]
@@ -223,8 +226,12 @@ def get_nmt_tokenizer(
             tokenizer_name=model_name, vocab_file=vocab_file, merges_file=merges_file, chat_template=chat_template
         )
     elif library == 'tabular':
+        from nemo.collections.common.tokenizers.tabular_tokenizer import TabularTokenizer
+
         return TabularTokenizer(vocab_file, delimiter=delimiter)
     elif library == 'tiktoken':
+        from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer
+
         return TiktokenTokenizer(vocab_file=vocab_file)
     else:
         raise NotImplementedError(
diff --git a/scripts/nlp_language_modeling/niv2/preprocess_niv2.py b/scripts/nlp_language_modeling/niv2/preprocess_niv2.py
index 073d6da8f32c..6119768e66f2 100644
--- a/scripts/nlp_language_modeling/niv2/preprocess_niv2.py
+++ b/scripts/nlp_language_modeling/niv2/preprocess_niv2.py
@@ -18,8 +18,6 @@
 from argparse import ArgumentParser
 from multiprocessing import Pool
 
-from sacremoses import MosesDetokenizer
-
 from nemo.collections.common.tokenizers import AutoTokenizer
 
 
@@ -99,6 +97,8 @@ def write_dataset_to_file(file_name, output_file_name, detokenizer, tokenizer, i
 
 
 def process_folder(data_folder, output_folder, splits_file, remove_newline):
+    from sacremoses import MosesDetokenizer
+
     detokenizer = MosesDetokenizer('en')
     tokenizer = AutoTokenizer("gpt2")
     assert os.path.isdir(data_folder)
@@ -162,10 +162,15 @@ def process_folder(data_folder, output_folder, splits_file, remove_newline):
         help="Path to output folder where JSONL files will be written.",
     )
     parser.add_argument(
-        "--splits_file_path", type=str, default="default", help="Path to the file that contains splits. ex: ",
+        "--splits_file_path",
+        type=str,
+        default="default",
+        help="Path to the file that contains splits. ex: ",
     )
     parser.add_argument(
-        "--remove_newline", action="store_true", help="Whether to remove newlines from the input and output.",
+        "--remove_newline",
+        action="store_true",
+        help="Whether to remove newlines from the input and output.",
     )
     args = parser.parse_args()
     process_folder(args.niv2_dataset_path, args.jsonl_output_path, args.splits_file_path, args.remove_newline)
diff --git a/scripts/nlp_language_modeling/t0/t0_dataset_preproc.py b/scripts/nlp_language_modeling/t0/t0_dataset_preproc.py
index 618c02c0cc13..53bed36ff8d0 100644
--- a/scripts/nlp_language_modeling/t0/t0_dataset_preproc.py
+++ b/scripts/nlp_language_modeling/t0/t0_dataset_preproc.py
@@ -19,7 +19,6 @@
 from multiprocessing import Pool
 
 import tensorflow as tf
-from sacremoses import MosesDetokenizer
 from tasks_splits_and_features import _TASK_SPLITS_AND_FEATURES_DICT
 
 
@@ -136,6 +135,8 @@ def process_folder(data_folder, folder_name, output_folder, detokenizer, remove_
 
 
 def process_all_folders(data_folder, output_folder, remove_newlines):
+    from sacremoses import MosesDetokenizer
+
     detokenizer = MosesDetokenizer('en')
     assert os.path.isdir(data_folder)
     if not os.path.exists(output_folder):
@@ -170,7 +171,9 @@ def process_all_folders(data_folder, output_folder, remove_newlines):
         help="Path to output folder where JSONL files will be written.",
     )
     parser.add_argument(
-        "--remove_newlines", action="store_true", help="Whether to remove newlines from the input and output.",
+        "--remove_newlines",
+        action="store_true",
+        help="Whether to remove newlines from the input and output.",
    )
     args = parser.parse_args()
     process_all_folders(args.p3_dataset_path, args.jsonl_output_path, args.remove_newlines)