Enable ONNX export of 5B GPT trained with TE FP8 modules #6458

Merged
merged 16 commits into from
May 22, 2023
25 changes: 25 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_gpt_export.yaml
@@ -0,0 +1,25 @@
trainer:
devices: 1
num_nodes: 1
accelerator: gpu
logger: False # logger provided by exp_manager
precision: bf16 # 16, 32, or bf16

model_type: gpt
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: -1 # used for encoder-decoder models (0 for others)
gpt_model_file: null # GPT nemo file path
onnx_model_file: null # ONNX file path
checkpoint_dir: null # Checkpoint directory
checkpoint_name: null # Checkpoint name
hparams_file: null # hparams filepath

export_options:
runtime_check: False
verbose: False
onnx_opset: 17
do_constant_folding: True
cache_support: False
device: 'cuda'
check_tolerance: 0.01
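
A hedged usage sketch (paths are placeholders, not taken from this PR): the export script below reads this config through @hydra_runner, so any key above can be overridden on the command line, e.g.

python examples/nlp/language_modeling/megatron_export.py \
    gpt_model_file=/path/to/gpt.nemo \
    onnx_model_file=/path/to/gpt.onnx \
    export_options.runtime_check=True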
175 changes: 175 additions & 0 deletions examples/nlp/language_modeling/megatron_export.py
@@ -0,0 +1,175 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer

from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel
from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel
from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
from nemo.core import ModelPT
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.app_state import AppState
from nemo.utils.model_utils import inject_model_parallel_rank


def get_model_class(cfg):
if cfg.model_type == 'gpt':
return MegatronGPTModel
elif cfg.model_type == 'bert':
return MegatronBertModel
elif cfg.model_type == 't5':
return MegatronT5Model
elif cfg.model_type == 'bart':
return MegatronBARTModel
elif cfg.model_type == 'nmt':
return MegatronNMTModel
elif cfg.model_type == 'retro':
return MegatronRetrievalModel
else:
raise ValueError(f"Invalid model type: {cfg.model_type}")


@hydra_runner(config_path="conf", config_name="megatron_gpt_export")
def nemo_export(cfg):
"""Convert a nemo model into .onnx ONNX format."""
nemo_in = None
if cfg.gpt_model_file:
nemo_in = cfg.gpt_model_file
elif cfg.checkpoint_dir:
nemo_in = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)
assert nemo_in is not None, "NeMo model not provided. Please provide the path to the .nemo or .ckpt file"

onnx_out = cfg.onnx_model_file

trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
assert (
cfg.trainer.devices * cfg.trainer.num_nodes
== cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

logging.info("Restoring NeMo model from '{}'".format(nemo_in))
try:
if cfg.gpt_model_file:
save_restore_connector = NLPSaveRestoreConnector()
if os.path.isdir(cfg.gpt_model_file):
save_restore_connector.model_extracted_dir = cfg.gpt_model_file

pretrained_cfg = ModelPT.restore_from(
restore_path=cfg.gpt_model_file,
trainer=trainer,
return_config=True,
save_restore_connector=save_restore_connector,
)
OmegaConf.set_struct(pretrained_cfg, True)
with open_dict(pretrained_cfg):
pretrained_cfg.sequence_parallel = False
pretrained_cfg.activations_checkpoint_granularity = None
pretrained_cfg.activations_checkpoint_method = None
pretrained_cfg.precision = trainer.precision
if trainer.precision == "16":
pretrained_cfg.megatron_amp_O2 = False
model = ModelPT.restore_from(
restore_path=cfg.gpt_model_file,
trainer=trainer,
override_config_path=pretrained_cfg,
save_restore_connector=save_restore_connector,
)
elif cfg.checkpoint_dir:
app_state = AppState()
if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1:
app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size
app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size
(
app_state.tensor_model_parallel_rank,
app_state.pipeline_model_parallel_rank,
app_state.model_parallel_size,
app_state.data_parallel_size,
app_state.pipeline_model_parallel_split_rank,
app_state.virtual_pipeline_model_parallel_rank,
) = fake_initialize_model_parallel(
world_size=app_state.model_parallel_size,
rank=trainer.global_rank,
tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
)
checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
model_cls = get_model_class(cfg)
model = model_cls.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
else:
raise ValueError("need at least a nemo file or checkpoint dir")
except Exception as e:
logging.error(
"Failed to restore model from NeMo file : {}. Please make sure you have the latest NeMo package installed with [all] dependencies.".format(
nemo_in
)
)
raise e

logging.info("Model {} restored from '{}'".format(model.__class__.__name__, nemo_in))

# Export
check_trace = cfg.export_options.runtime_check

try:
model.to(device=cfg.export_options.device).freeze()
model.eval()
model.export(
onnx_out,
onnx_opset_version=cfg.export_options.onnx_opset,
do_constant_folding=cfg.export_options.do_constant_folding,
dynamic_axes={
'input_ids': {0: "sequence", 1: "batch"},
'position_ids': {0: "sequence", 1: "batch"},
'logits': {0: "sequence", 1: "batch"},
},
check_trace=check_trace,
check_tolerance=cfg.export_options.check_tolerance,
verbose=cfg.export_options.verbose,
)
except Exception as e:
logging.error(
"Export failed. Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format(
model.__class__
)
)
raise e


if __name__ == '__main__':
nemo_export()
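
A hedged follow-up sketch, not part of this PR: one way to sanity-check the exported file with onnxruntime (an assumed extra dependency). The input and output names come from the wrapper's input_names/output_names defined further below; shapes and dtypes are illustrative and lean on the dynamic sequence/batch axes declared in model.export() above.

import numpy as np
import onnxruntime as ort  # assumed installed; not a NeMo requirement

sess = ort.InferenceSession("gpt.onnx", providers=["CPUExecutionProvider"])
batch, seq = 1, 6
feed = {
    "input_ids": np.zeros((batch, seq), dtype=np.int64),
    "position_ids": np.tile(np.arange(seq, dtype=np.int64), (batch, 1)),
    # boolean causal mask; True marks positions to mask out, as in
    # get_ltor_masks_and_position_ids
    "attention_mask": np.triu(np.ones((1, 1, seq, seq), dtype=bool), k=1),
}
(logits,) = sess.run(None, feed)
print(logits.shape)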
106 changes: 105 additions & 1 deletion nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -14,8 +14,9 @@

import itertools
import queue
import warnings
from functools import partial
-from typing import Any, Iterator, List, Optional, Union
+from typing import Any, Dict, Iterator, List, Optional, Union

import numpy as np
import torch
@@ -36,6 +36,7 @@
from nemo.collections.nlp.modules.common.megatron.utils import (
average_losses_across_data_parallel_group,
get_all_params_for_weight_decay_optimization,
get_ltor_masks_and_position_ids,
get_params_for_weight_decay_optimization,
)
from nemo.collections.nlp.modules.common.text_generation_utils import (
@@ -53,7 +55,9 @@
)
from nemo.collections.nlp.parts.nlp_overrides import GradScaler
from nemo.collections.nlp.parts.utils_funcs import get_last_rank
from nemo.core.classes import Exportable
from nemo.core.classes.common import PretrainedModelInfo
from nemo.core.neural_types import ChannelType, NeuralType
from nemo.utils import logging

try:
@@ -88,6 +92,99 @@
HAVE_TE = False


class MegatronGPTExportableModel(torch.nn.Module, Exportable):
"""
Megatron GPT Wrapper for ONNX export
"""

def __init__(self, model):
super().__init__()
self.model = model
self.fp8_enabled = model.cfg.get('fp8', False)
self.fp8_recipe = None
if self.fp8_enabled and HAVE_TE:
self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling(
margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3
)

self.dtype = None
if model.cfg['precision'] == 'bf16':
self.dtype = torch.bfloat16
elif int(model.cfg['precision']) == 32:
self.dtype = torch.float
elif int(model.cfg['precision']) == 16:
self.dtype = torch.float16
else:
raise ValueError(f"precision: {model.cfg['precision']} is not supported.")

def forward(self, tokens, position_ids, attention_mask):
# Both branches below trace the same forward pass; the FP8 path additionally
# enters TE's onnx_export and fp8_autocast contexts.
if self.fp8_enabled and HAVE_TE:
with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast(
enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe
), torch.no_grad(), torch.inference_mode(), torch.autocast(
'cuda', dtype=self.dtype
), warnings.catch_warnings():
warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*')
assert tokens.shape == position_ids.shape
assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1]
output_tensor = self.model.forward(
tokens=tokens.cuda(),
text_position_ids=position_ids.cuda(),
attention_mask=attention_mask.cuda(),
labels=None,
)
else:
with torch.no_grad(), torch.inference_mode(), torch.autocast(
'cuda', dtype=self.dtype
), warnings.catch_warnings():
warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*')
assert tokens.shape == position_ids.shape
assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1]
output_tensor = self.model.forward(
tokens=tokens.cuda(),
text_position_ids=position_ids.cuda(),
attention_mask=attention_mask.cuda(),
labels=None,
)

return output_tensor

def freeze(self):
for param in self.parameters():
param.requires_grad = False

def input_example(self, max_batch=1, max_dim=768, seq_len=6):
# Check notice — Code scanning / CodeQL: explicit returns mixed with implicit
# (fall-through) returns; the implicit path returns None if the loop below
# never executes.
ids = [self.model.tokenizer.text_to_ids(text) for text in ["how is the weather on Sunday"]]
id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids]
masks_and_position_ids = [
get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False)
for id_tensor in id_tensors
]
for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids):
attn_mask, _, pos_ids = attn_mask_and_pos_ids
return tokens, pos_ids, attn_mask

@property
def input_types(self) -> Optional[Dict[str, NeuralType]]:
return {
"input_ids": NeuralType(('B', 'T'), ChannelType()),
"position_ids": NeuralType(('B', 'T'), ChannelType()),
"attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()),
}

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
return {"logits": NeuralType(('B', 'T', 'D'), ChannelType())}

@property
def input_names(self) -> List[str]:
return ['input_ids', 'position_ids', 'attention_mask']

@property
def output_names(self) -> List[str]:
return ['logits']
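
A minimal usage sketch for the wrapper above (hedged: assumes `gpt` is an already-restored MegatronGPTModel and a CUDA device, since forward() moves inputs via .cuda()):

wrapper = MegatronGPTExportableModel(gpt)  # or gpt.mgpt_wrapper, added further below
wrapper.freeze()
tokens, pos_ids, attn_mask = wrapper.input_example()
logits = wrapper(tokens, pos_ids, attn_mask)  # the same call that gets traced during export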


class MegatronGPTModel(MegatronBaseModel, TextGeneration):
"""
Megatron GPT pretraining
@@ -1113,6 +1210,13 @@ def parameters(self):
else:
return self.model.parameters()

@property
def mgpt_wrapper(self):
"""ONNX-export-friendly view of this model (see MegatronGPTExportableModel above)."""
return MegatronGPTExportableModel(self)

def list_export_subnets(self):
"""Point Exportable.export() at the wrapper above instead of the raw training module."""
return ['mgpt_wrapper']

def _reset_activation_checkpointing_args(self):
""" Disables activation checkpointing completely and saves the values so that
_restore_activation_checkpointing_args can restore them later. This function must always be
8 changes: 6 additions & 2 deletions nemo/collections/nlp/modules/common/megatron/attention.py
@@ -697,8 +697,12 @@ def __init__(
super(CoreAttention, self).__init__()

self.precision = precision
-self.fp16 = precision == 16
-self.bf16 = precision == 'bf16'
+self.fp16 = False
+self.bf16 = False
+if precision == 'bf16':
+self.bf16 = True
+elif int(precision) == 16:
+self.fp16 = True
self.multi_query_attention = multi_query_attention

self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
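
A hedged aside on the change above (module.py below gets the identical reordering): precision can arrive as the int 16 or the string "16" — megatron_export.py above compares trainer.precision == "16" — so int(precision) == 16 normalizes both, and 'bf16' must be checked first because int('bf16') would raise ValueError. A standalone sketch of the same normalization:

for p in ('bf16', 16, '16'):
    is_bf16 = p == 'bf16'
    is_fp16 = not is_bf16 and int(p) == 16
    print(p, '->', 'bf16' if is_bf16 else 'fp16' if is_fp16 else 'other')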
12 changes: 6 additions & 6 deletions nemo/collections/nlp/modules/common/megatron/module.py
@@ -262,17 +262,17 @@ def __init__(self, module, precision):
super().__init__()
self.precision = precision

-if precision == 16:
-self.add_module('module', module.half())
+if precision == 'bf16':
+self.add_module('module', module.bfloat16())

def float16_converter(val):
-return val.half()
+return val.bfloat16()

-elif precision == 'bf16':
-self.add_module('module', module.bfloat16())
+elif int(precision) == 16:
+self.add_module('module', module.half())

def float16_converter(val):
-return val.bfloat16()
+return val.half()

else:
raise Exception(
3 changes: 2 additions & 1 deletion nemo/utils/export_utils.py
@@ -309,7 +309,8 @@ def replace_FusedScaleMaskSoftmax(n: nn.Module) -> Optional[nn.Linear]:
Equivalent LayerNorm module
"""
if not isinstance(n, FusedScaleMaskSoftmax):
raise ValueError("This function can only change the FusedScaleMaskSoftmax module.")
logging.warning("This function can only change the FusedScaleMaskSoftmax module.")
return n

# disable the fusion only
mod = FusedScaleMaskSoftmax(
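
A minimal sketch of the replace-helper contract this change relies on (illustrative names, not the actual NeMo API): returning the module unchanged instead of raising lets a module-walking loop apply a helper to every submodule and swap only where it matches.

import torch.nn as nn

def maybe_replace(m: nn.Module) -> nn.Module:
    if not isinstance(m, nn.Linear):
        return m  # not applicable: hand the module back unchanged, as above
    return nn.Identity()  # export-friendly stand-in

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
model = nn.Sequential(*(maybe_replace(c) for c in model.children()))
print(model)  # Identity() replaced the Linear; ReLU passed through untouched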