From e57472e9ed54e4f7c331ae51f02b0714917b89b7 Mon Sep 17 00:00:00 2001
From: Asfiya Baig
Date: Sun, 16 Apr 2023 18:27:23 -0700
Subject: [PATCH 01/14] add GPT FP8 ONNX export support

Signed-off-by: Asfiya Baig
---
 .../conf/megatron_gpt_export.yaml             |  22 ++
 .../language_modeling/megatron_gpt_export.py  | 225 ++++++++++++++++++
 .../nlp/modules/common/megatron/module.py     |   2 +-
 nemo/utils/export_utils.py                    |   7 +-
 4 files changed, 253 insertions(+), 3 deletions(-)
 create mode 100644 examples/nlp/language_modeling/conf/megatron_gpt_export.yaml
 create mode 100644 examples/nlp/language_modeling/megatron_gpt_export.py

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml
new file mode 100644
index 000000000000..63d2d7ee9a40
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml
@@ -0,0 +1,22 @@
+trainer:
+  devices: 1
+  num_nodes: 1
+  accelerator: gpu
+  logger: False # logger provided by exp_manager
+  precision: bf16 # 16, 32, or bf16
+
+tensor_model_parallel_size: 1
+pipeline_model_parallel_size: 1
+pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others)
+gpt_model_file: null # GPT nemo file path
+onnx_model_file: null # ONNX file path
+
+export_options:
+  autocast: True
+  runtime_check: False
+  verbose: None
+  onnx_opset: 17
+  do_constant_folding: True
+  cache_support: True
+  device: 'cuda'
+  check_tolerance: 0.01
diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py
new file mode 100644
index 000000000000..b0a243964a9c
--- /dev/null
+++ b/examples/nlp/language_modeling/megatron_gpt_export.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import os +import sys +import warnings +from typing import Optional, List, Dict + +import torch +from omegaconf import OmegaConf, open_dict +from pytorch_lightning import Trainer + +from nemo.core.config import hydra_runner +from nemo.core.classes import Exportable +from nemo.core.neural_types import ChannelType, NeuralType +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.utils import logging + +import transformer_engine.pytorch as te +from transformer_engine.common import recipe + +try: + from contextlib import nullcontext +except ImportError: + # handle python < 3.7 + from contextlib import suppress as nullcontext + +class MegatronGPTExportableModel(torch.nn.Module, Exportable): + def __init__(self, model): + super().__init__() + self.model = model + self.dtype = None + if model.cfg['precision'] == 'bf16': + self.dtype = torch.bfloat16 + elif int(model.cfg['precision']) == 32: + self.dtype = torch.float + elif int(model.cfg['precision']) == 16: + self.dtype = torch.float16 + else: + raise ValueError(f"precision: {model.cfg['precision']} is not supported.") + + def forward(self, id_tensors, masks_and_position_ids): + output_tensors = [] + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + assert tokens.shape == pos_ids.shape + assert attn_mask.shape[2] == attn_mask.shape[3] == tokens.shape[1] == pos_ids.shape[1] + with torch.autocast('cuda', dtype=self.dtype): + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=pos_ids.cuda(), + attention_mask=attn_mask.cuda(), + labels=None, + ) + + output_tensors.append(output_tensor) + return output_tensors + + def freeze(self): + for param in self.parameters(): + param.requires_grad = False + + def input_example(self, max_batch=1, max_dim=768, seq_len=6): + ids = [self.model.tokenizer.text_to_ids(text) for text in ['hi there']] + id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] + masks_and_position_ids = [ + get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False) + for id_tensor in id_tensors + ] + + return id_tensors, masks_and_position_ids + + def get_dynamic_axes(self): + dynamic_axes = { + 'id_tensors': {0: "BS", 1: "sequence"}, + 'masks_and_position_ids': {0: "BS", 2: "sequence", 3: "sequence"}, + } + return dynamic_axes + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "id_tensors": NeuralType(('B'), ChannelType()), + "masks_and_position_ids": NeuralType(('B'), ChannelType()), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"log_probs": NeuralType(('B', 'T', 'D'), ChannelType())} + + @property + def input_names(self) -> List[str]: + return ['id_tensors', 'masks_and_position_ids'] + + @property + def output_names(self) -> List[str]: + return ['log_probs'] + +@hydra_runner(config_path="conf", config_name="megatron_gpt_export") +def nemo_export(cfg): + """Convert a .nemo saved model into .onnx ONNX format.""" + nemo_in = cfg.gpt_model_file + onnx_out = cfg.onnx_model_file + + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) + assert ( + cfg.trainer.devices * cfg.trainer.num_nodes + == cfg.tensor_model_parallel_size * 
cfg.pipeline_model_parallel_size + ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" + + + logging.info("Restoring NeMo model from '{}'".format(nemo_in)) + try: + with torch.inference_mode(): + # Restore instance from .nemo file using generic model restore_from + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file + + pretrained_cfg = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + OmegaConf.set_struct(pretrained_cfg, True) + with open_dict(pretrained_cfg): + pretrained_cfg.sequence_parallel = False + pretrained_cfg.activations_checkpoint_granularity = None + pretrained_cfg.activations_checkpoint_method = None + pretrained_cfg.precision = trainer.precision + model = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + override_config_path=pretrained_cfg, + save_restore_connector=save_restore_connector, + ) + except Exception as e: + logging.error( + "Failed to restore model from NeMo file : {}. Please make sure you have the latest NeMo package installed with [all] dependencies.".format( + nemo_in + ) + ) + raise e + + logging.info("Model {} restored from '{}'".format(model.__class__.__name__, nemo_in)) + + if not isinstance(model, Exportable): + logging.error("Your NeMo model class ({}) is not Exportable.".format(model.__class__.__name__)) + sys.exit(1) + + # + # Add custom export parameters here + # + check_trace = cfg.export_options.runtime_check + + if cfg.export_options.cache_support and hasattr(model, "encoder") and hasattr(model.encoder, "export_cache_support"): + model.encoder.export_cache_support = True + logging.info("Caching support is enabled.") + model.encoder.setup_streaming_params() + + autocast = nullcontext + if cfg.export_options.autocast: + autocast = torch.cuda.amp.autocast + fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3) + try: + with autocast(), torch.no_grad(), torch.inference_mode(), te.onnx_export(True), \ + te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe), warnings.catch_warnings(): + warnings.filterwarnings( + action='ignore', + category=torch.jit.TracerWarning, + module=r'.*' + ) + + model.to(device=cfg.export_options.device).freeze() + model.eval() + exportable_model = MegatronGPTExportableModel(model) + + exportable_model.export( + onnx_out, + onnx_opset_version=cfg.export_options.onnx_opset, + do_constant_folding=cfg.export_options.do_constant_folding, + dynamic_axes=exportable_model.get_dynamic_axes(), + check_trace=check_trace, + check_tolerance=cfg.export_options.check_tolerance, + verbose=cfg.export_options.verbose, + ) + except Exception as e: + logging.error( + "Export failed. 
Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format( + model.__class__ + ) + ) + raise e + + +if __name__ == '__main__': + nemo_export() diff --git a/nemo/collections/nlp/modules/common/megatron/module.py b/nemo/collections/nlp/modules/common/megatron/module.py index 58ce7a7bae18..598272366b44 100644 --- a/nemo/collections/nlp/modules/common/megatron/module.py +++ b/nemo/collections/nlp/modules/common/megatron/module.py @@ -262,7 +262,7 @@ def __init__(self, module, precision): super().__init__() self.precision = precision - if precision == 16: + if int(precision) == 16: self.add_module('module', module.half()) def float16_converter(val): diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index 9fa2bc239eb8..7131f9ce6fc1 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -309,7 +309,8 @@ def replace_FusedScaleMaskSoftmax(n: nn.Module) -> Optional[nn.Linear]: Equivalent LayerNorm module """ if not isinstance(n, FusedScaleMaskSoftmax): - raise ValueError("This function can only change the FusedScaleMaskSoftmax module.") + logging.warning("This function can only change the FusedScaleMaskSoftmax module.") + return n # disable the fusion only mod = FusedScaleMaskSoftmax( @@ -454,7 +455,9 @@ def replace_for_export(model: nn.Module) -> nn.Module: } replace_modules(model, default_Apex_replacements) - replace_modules(model, default_replacements) + # Apply CastToFloat for torch.float32 + if model.dtype==torch.float32: + replace_modules(model, default_replacements) # This one has to be the last replace_modules(model, script_replacements) From f64d546d6b1cf714614b051957b2b374f5085975 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Wed, 19 Apr 2023 11:47:50 -0700 Subject: [PATCH 02/14] changes 1. Add dynamic axes for inputs 2. 
Update model input_example to resolve size error by TE Signed-off-by: Asfiya Baig --- .../conf/megatron_gpt_export.yaml | 5 +- .../language_modeling/megatron_gpt_export.py | 50 +++++++++---------- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml index 63d2d7ee9a40..798d156146d5 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml @@ -12,11 +12,10 @@ gpt_model_file: null # GPT nemo file path onnx_model_file: null # ONNX file path export_options: - autocast: True runtime_check: False - verbose: None + verbose: False onnx_opset: 17 do_constant_folding: True - cache_support: True + cache_support: False device: 'cuda' check_tolerance: 0.01 diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index b0a243964a9c..0773bf143a12 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -66,49 +66,48 @@ def __init__(self, model): else: raise ValueError(f"precision: {model.cfg['precision']} is not supported.") - def forward(self, id_tensors, masks_and_position_ids): - output_tensors = [] - for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): - attn_mask, _, pos_ids = attn_mask_and_pos_ids - assert tokens.shape == pos_ids.shape - assert attn_mask.shape[2] == attn_mask.shape[3] == tokens.shape[1] == pos_ids.shape[1] - with torch.autocast('cuda', dtype=self.dtype): - output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=pos_ids.cuda(), - attention_mask=attn_mask.cuda(), - labels=None, - ) - - output_tensors.append(output_tensor) - return output_tensors + def forward(self, tokens, position_ids, attention_mask): + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + with torch.autocast('cuda', dtype=self.dtype): + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + labels=None, + ) + + return output_tensor def freeze(self): for param in self.parameters(): param.requires_grad = False def input_example(self, max_batch=1, max_dim=768, seq_len=6): - ids = [self.model.tokenizer.text_to_ids(text) for text in ['hi there']] + ids = [self.model.tokenizer.text_to_ids(text) for text in ["how is the weather on Sunday"]] id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] masks_and_position_ids = [ get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False) for id_tensor in id_tensors ] - - return id_tensors, masks_and_position_ids + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + return tokens, pos_ids, attn_mask def get_dynamic_axes(self): dynamic_axes = { 'id_tensors': {0: "BS", 1: "sequence"}, - 'masks_and_position_ids': {0: "BS", 2: "sequence", 3: "sequence"}, + 'position_ids': {0: "BS", 1: "sequence"}, + 'attention_mask': {0: "BS", 2: "sequence", 3: "sequence"}, } return dynamic_axes @property def input_types(self) -> Optional[Dict[str, NeuralType]]: return { - "id_tensors": NeuralType(('B'), ChannelType()), - "masks_and_position_ids": NeuralType(('B'), ChannelType()), + 
"id_tensors": NeuralType(('B', 'T'), ChannelType()), + "position_ids": NeuralType(('B', 'T'), ChannelType()), + "attention_mask": NeuralType(('B', 'D', 'T', 'T'), ChannelType()), } @property @@ -117,7 +116,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: @property def input_names(self) -> List[str]: - return ['id_tensors', 'masks_and_position_ids'] + return ['id_tensors', 'position_ids', 'attention_mask'] @property def output_names(self) -> List[str]: @@ -186,12 +185,9 @@ def nemo_export(cfg): logging.info("Caching support is enabled.") model.encoder.setup_streaming_params() - autocast = nullcontext - if cfg.export_options.autocast: - autocast = torch.cuda.amp.autocast fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3) try: - with autocast(), torch.no_grad(), torch.inference_mode(), te.onnx_export(True), \ + with torch.no_grad(), torch.inference_mode(), te.onnx_export(True), \ te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe), warnings.catch_warnings(): warnings.filterwarnings( action='ignore', From 0f5b9b71d9259b0484ba58575e03b9543fdaf96a Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Wed, 19 Apr 2023 16:21:18 -0700 Subject: [PATCH 03/14] Conform to Python style guidelines Signed-off-by: Asfiya Baig --- .../language_modeling/megatron_gpt_export.py | 61 +++++++++---------- nemo/utils/export_utils.py | 2 +- 2 files changed, 29 insertions(+), 34 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index 0773bf143a12..e2221b762962 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -29,28 +29,22 @@ import os import sys import warnings -from typing import Optional, List, Dict +from typing import Dict, List, Optional import torch +import transformer_engine.pytorch as te from omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer +from transformer_engine.common import recipe -from nemo.core.config import hydra_runner -from nemo.core.classes import Exportable -from nemo.core.neural_types import ChannelType, NeuralType -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.core.classes import Exportable +from nemo.core.config import hydra_runner +from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils import logging -import transformer_engine.pytorch as te -from transformer_engine.common import recipe - -try: - from contextlib import nullcontext -except ImportError: - # handle python < 3.7 - from contextlib import suppress as nullcontext class MegatronGPTExportableModel(torch.nn.Module, Exportable): def __init__(self, model): @@ -71,11 +65,11 @@ def forward(self, tokens, position_ids, attention_mask): assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] with torch.autocast('cuda', dtype=self.dtype): output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + 
labels=None, + ) return output_tensor @@ -93,21 +87,21 @@ def input_example(self, max_batch=1, max_dim=768, seq_len=6): for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): attn_mask, _, pos_ids = attn_mask_and_pos_ids return tokens, pos_ids, attn_mask - + def get_dynamic_axes(self): dynamic_axes = { - 'id_tensors': {0: "BS", 1: "sequence"}, - 'position_ids': {0: "BS", 1: "sequence"}, - 'attention_mask': {0: "BS", 2: "sequence", 3: "sequence"}, + 'id_tensors': {0: "BS", 1: "sequence"}, + 'position_ids': {0: "BS", 1: "sequence"}, + 'attention_mask': {0: "BS", 2: "sequence", 3: "sequence"}, } return dynamic_axes - + @property def input_types(self) -> Optional[Dict[str, NeuralType]]: return { "id_tensors": NeuralType(('B', 'T'), ChannelType()), "position_ids": NeuralType(('B', 'T'), ChannelType()), - "attention_mask": NeuralType(('B', 'D', 'T', 'T'), ChannelType()), + "attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()), } @property @@ -122,6 +116,7 @@ def input_names(self) -> List[str]: def output_names(self) -> List[str]: return ['log_probs'] + @hydra_runner(config_path="conf", config_name="megatron_gpt_export") def nemo_export(cfg): """Convert a .nemo saved model into .onnx ONNX format.""" @@ -134,7 +129,6 @@ def nemo_export(cfg): == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" - logging.info("Restoring NeMo model from '{}'".format(nemo_in)) try: with torch.inference_mode(): @@ -180,20 +174,21 @@ def nemo_export(cfg): # check_trace = cfg.export_options.runtime_check - if cfg.export_options.cache_support and hasattr(model, "encoder") and hasattr(model.encoder, "export_cache_support"): + if ( + cfg.export_options.cache_support + and hasattr(model, "encoder") + and hasattr(model.encoder, "export_cache_support") + ): model.encoder.export_cache_support = True logging.info("Caching support is enabled.") model.encoder.setup_streaming_params() fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3) try: - with torch.no_grad(), torch.inference_mode(), te.onnx_export(True), \ - te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe), warnings.catch_warnings(): - warnings.filterwarnings( - action='ignore', - category=torch.jit.TracerWarning, - module=r'.*' - ) + with torch.no_grad(), torch.inference_mode(), te.onnx_export(True), te.fp8_autocast( + enabled=True, fp8_recipe=fp8_recipe + ), warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') model.to(device=cfg.export_options.device).freeze() model.eval() diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index 7131f9ce6fc1..edfad0af62e5 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -456,7 +456,7 @@ def replace_for_export(model: nn.Module) -> nn.Module: replace_modules(model, default_Apex_replacements) # Apply CastToFloat for torch.float32 - if model.dtype==torch.float32: + if model.dtype == torch.float32: replace_modules(model, default_replacements) # This one has to be the last replace_modules(model, script_replacements) From 2c5d5fb677e4fe96925fa72066795fde4272d7e5 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Wed, 26 Apr 2023 16:58:48 -0700 Subject: [PATCH 04/14] refactor to avoid typecasting bf16 string Signed-off-by: Asfiya Baig --- .../nlp/modules/common/megatron/module.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/nemo/collections/nlp/modules/common/megatron/module.py b/nemo/collections/nlp/modules/common/megatron/module.py index 598272366b44..0a340985eec2 100644 --- a/nemo/collections/nlp/modules/common/megatron/module.py +++ b/nemo/collections/nlp/modules/common/megatron/module.py @@ -262,17 +262,17 @@ def __init__(self, module, precision): super().__init__() self.precision = precision - if int(precision) == 16: - self.add_module('module', module.half()) + if precision == 'bf16': + self.add_module('module', module.bfloat16()) def float16_converter(val): - return val.half() + return val.bfloat16() - elif precision == 'bf16': - self.add_module('module', module.bfloat16()) + elif int(precision) == 16: + self.add_module('module', module.half()) def float16_converter(val): - return val.bfloat16() + return val.half() else: raise Exception( From 162cb6c9bc23f587b8d7f1a6bf56d362a42edb6a Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Thu, 27 Apr 2023 14:50:03 -0700 Subject: [PATCH 05/14] fix attribute error in export_utils Signed-off-by: Asfiya Baig --- nemo/utils/export_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index edfad0af62e5..b4debf746a98 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -456,7 +456,7 @@ def replace_for_export(model: nn.Module) -> nn.Module: replace_modules(model, default_Apex_replacements) # Apply CastToFloat for torch.float32 - if model.dtype == torch.float32: + if hasattr(model, 'dtype') and model.dtype == torch.float32: replace_modules(model, default_replacements) # This one has to be the last replace_modules(model, script_replacements) From 29917eec292cdaf8c849c7cd982845d83ad48ee6 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Thu, 4 May 2023 08:28:47 -0700 Subject: [PATCH 06/14] set constant_folding to False by default Signed-off-by: Asfiya Baig --- examples/nlp/language_modeling/conf/megatron_gpt_export.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml index 798d156146d5..49616fea02ab 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml @@ -15,7 +15,7 @@ export_options: runtime_check: False verbose: False onnx_opset: 17 - do_constant_folding: True + do_constant_folding: False cache_support: False device: 'cuda' check_tolerance: 0.01 From bb16f60d6fc6517e8925a1f283451869bedd05a0 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Tue, 9 May 2023 10:42:29 -0700 Subject: [PATCH 07/14] refactor exportable wrapper into model class definition Signed-off-by: Asfiya Baig --- .../language_modeling/megatron_gpt_export.py | 115 ++---------------- .../language_modeling/megatron_gpt_model.py | 83 ++++++++++++- 2 files changed, 93 insertions(+), 105 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index e2221b762962..baf77b7653c6 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -38,85 +38,12 @@ from transformer_engine.common import recipe from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids from nemo.collections.nlp.parts.nlp_overrides import 
NLPDDPStrategy, NLPSaveRestoreConnector from nemo.core.classes import Exportable from nemo.core.config import hydra_runner -from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils import logging -class MegatronGPTExportableModel(torch.nn.Module, Exportable): - def __init__(self, model): - super().__init__() - self.model = model - self.dtype = None - if model.cfg['precision'] == 'bf16': - self.dtype = torch.bfloat16 - elif int(model.cfg['precision']) == 32: - self.dtype = torch.float - elif int(model.cfg['precision']) == 16: - self.dtype = torch.float16 - else: - raise ValueError(f"precision: {model.cfg['precision']} is not supported.") - - def forward(self, tokens, position_ids, attention_mask): - assert tokens.shape == position_ids.shape - assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] - with torch.autocast('cuda', dtype=self.dtype): - output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) - - return output_tensor - - def freeze(self): - for param in self.parameters(): - param.requires_grad = False - - def input_example(self, max_batch=1, max_dim=768, seq_len=6): - ids = [self.model.tokenizer.text_to_ids(text) for text in ["how is the weather on Sunday"]] - id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] - masks_and_position_ids = [ - get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False) - for id_tensor in id_tensors - ] - for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): - attn_mask, _, pos_ids = attn_mask_and_pos_ids - return tokens, pos_ids, attn_mask - - def get_dynamic_axes(self): - dynamic_axes = { - 'id_tensors': {0: "BS", 1: "sequence"}, - 'position_ids': {0: "BS", 1: "sequence"}, - 'attention_mask': {0: "BS", 2: "sequence", 3: "sequence"}, - } - return dynamic_axes - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "id_tensors": NeuralType(('B', 'T'), ChannelType()), - "position_ids": NeuralType(('B', 'T'), ChannelType()), - "attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"log_probs": NeuralType(('B', 'T', 'D'), ChannelType())} - - @property - def input_names(self) -> List[str]: - return ['id_tensors', 'position_ids', 'attention_mask'] - - @property - def output_names(self) -> List[str]: - return ['log_probs'] - - @hydra_runner(config_path="conf", config_name="megatron_gpt_export") def nemo_export(cfg): """Convert a .nemo saved model into .onnx ONNX format.""" @@ -169,40 +96,20 @@ def nemo_export(cfg): logging.error("Your NeMo model class ({}) is not Exportable.".format(model.__class__.__name__)) sys.exit(1) - # - # Add custom export parameters here - # + # Export check_trace = cfg.export_options.runtime_check - if ( - cfg.export_options.cache_support - and hasattr(model, "encoder") - and hasattr(model.encoder, "export_cache_support") - ): - model.encoder.export_cache_support = True - logging.info("Caching support is enabled.") - model.encoder.setup_streaming_params() - - fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3) try: - with torch.no_grad(), torch.inference_mode(), te.onnx_export(True), te.fp8_autocast( - enabled=True, fp8_recipe=fp8_recipe - ), warnings.catch_warnings(): - warnings.filterwarnings(action='ignore', 
category=torch.jit.TracerWarning, module=r'.*') - - model.to(device=cfg.export_options.device).freeze() - model.eval() - exportable_model = MegatronGPTExportableModel(model) - - exportable_model.export( - onnx_out, - onnx_opset_version=cfg.export_options.onnx_opset, - do_constant_folding=cfg.export_options.do_constant_folding, - dynamic_axes=exportable_model.get_dynamic_axes(), - check_trace=check_trace, - check_tolerance=cfg.export_options.check_tolerance, - verbose=cfg.export_options.verbose, - ) + model.to(device=cfg.export_options.device).freeze() + model.eval() + model.mgpt_wrapper().export( + onnx_out, + onnx_opset_version=cfg.export_options.onnx_opset, + do_constant_folding=cfg.export_options.do_constant_folding, + check_trace=check_trace, + check_tolerance=cfg.export_options.check_tolerance, + verbose=cfg.export_options.verbose, + ) except Exception as e: logging.error( "Export failed. Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index e9545361b88d..8a3f1e386442 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -15,7 +15,7 @@ import itertools import queue from functools import partial -from typing import Any, Iterator, List, Optional, Union +from typing import Any, Iterator, List, Optional, Union, Dict import numpy as np import torch @@ -37,6 +37,7 @@ average_losses_across_data_parallel_group, get_all_params_for_weight_decay_optimization, get_params_for_weight_decay_optimization, + get_ltor_masks_and_position_ids, ) from nemo.collections.nlp.modules.common.text_generation_utils import ( generate, @@ -53,6 +54,8 @@ ) from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.collections.nlp.parts.utils_funcs import get_last_rank +from nemo.core.classes import Exportable +from nemo.core.neural_types import ChannelType, NeuralType from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -88,6 +91,81 @@ HAVE_TE = False +class MegatronGPTExportableModel(torch.nn.Module, Exportable): + """ + Megatron GPT Wrapper for ONNX export + """ + def __init__(self, model): + super().__init__() + self.model = model + self.fp8_enabled = model.cfg.get('fp8', False) + if self.fp8_enabled and HAVE_TE: + self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling(margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3) + + self.dtype = None + if model.cfg['precision'] == 'bf16': + self.dtype = torch.bfloat16 + elif int(model.cfg['precision']) == 32: + self.dtype = torch.float + elif int(model.cfg['precision']) == 16: + self.dtype = torch.float16 + else: + raise ValueError(f"precision: {model.cfg['precision']} is not supported.") + + def forward(self, tokens, position_ids, attention_mask): + with transformer_engine.pytorch.onnx_export(self.fp8_enabled), \ + transformer_engine.pytorch.fp8_autocast(enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe), \ + torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), \ + warnings.catch_warnings() if self.fp8_enabled and HAVE_TE else torch.no_grad(), \ + torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), \ + warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', 
category=torch.jit.TracerWarning, module=r'.*') + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + labels=None, + ) + + return output_tensor + + def freeze(self): + for param in self.parameters(): + param.requires_grad = False + + def input_example(self, max_batch=1, max_dim=768, seq_len=6): + ids = [self.model.tokenizer.text_to_ids(text) for text in ["how is the weather on Sunday"]] + id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] + masks_and_position_ids = [ + get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False) + for id_tensor in id_tensors + ] + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + return tokens, pos_ids, attn_mask + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "id_tensors": NeuralType(('B', 'T'), ChannelType()), + "position_ids": NeuralType(('B', 'T'), ChannelType()), + "attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"log_probs": NeuralType(('B', 'T', 'D'), ChannelType())} + + @property + def input_names(self) -> List[str]: + return ['id_tensors', 'position_ids', 'attention_mask'] + + @property + def output_names(self) -> List[str]: + return ['log_probs'] + class MegatronGPTModel(MegatronBaseModel, TextGeneration): """ Megatron GPT pretraining @@ -971,6 +1049,9 @@ def setup_test_data(self, cfg): ) self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) + def mgpt_wrapper(self): + return MegatronGPTExportableModel(self) + def generate( self, inputs: Union[List[str], torch.Tensor, List[dict]], From 580c026ee7c2ec3a2f604dda69a247f0af23d573 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Tue, 9 May 2023 10:44:14 -0700 Subject: [PATCH 08/14] remove conditional replacement of modules Signed-off-by: Asfiya Baig --- nemo/utils/export_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index b4debf746a98..60203ed6b9f0 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -455,9 +455,7 @@ def replace_for_export(model: nn.Module) -> nn.Module: } replace_modules(model, default_Apex_replacements) - # Apply CastToFloat for torch.float32 - if hasattr(model, 'dtype') and model.dtype == torch.float32: - replace_modules(model, default_replacements) + replace_modules(model, default_replacements) # This one has to be the last replace_modules(model, script_replacements) From 2357e7773929a3475bbc8684321a9968b065690a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 9 May 2023 17:45:07 +0000 Subject: [PATCH 09/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron_gpt_model.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8a3f1e386442..83afd1833ffb 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -13,6 +13,7 @@ # limitations under the License. import itertools +import warnings import queue from functools import partial from typing import Any, Iterator, List, Optional, Union, Dict @@ -36,8 +37,8 @@ from nemo.collections.nlp.modules.common.megatron.utils import ( average_losses_across_data_parallel_group, get_all_params_for_weight_decay_optimization, - get_params_for_weight_decay_optimization, get_ltor_masks_and_position_ids, + get_params_for_weight_decay_optimization, ) from nemo.collections.nlp.modules.common.text_generation_utils import ( generate, @@ -55,8 +56,8 @@ from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes import Exportable -from nemo.core.neural_types import ChannelType, NeuralType from nemo.core.classes.common import PretrainedModelInfo +from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils import logging try: @@ -95,12 +96,15 @@ class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ Megatron GPT Wrapper for ONNX export """ + def __init__(self, model): super().__init__() self.model = model self.fp8_enabled = model.cfg.get('fp8', False) if self.fp8_enabled and HAVE_TE: - self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling(margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3) + self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling( + margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3 + ) self.dtype = None if model.cfg['precision'] == 'bf16': @@ -113,21 +117,22 @@ def __init__(self, model): raise ValueError(f"precision: {model.cfg['precision']} is not supported.") def forward(self, tokens, position_ids, attention_mask): - with transformer_engine.pytorch.onnx_export(self.fp8_enabled), \ - transformer_engine.pytorch.fp8_autocast(enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe), \ - torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), \ - warnings.catch_warnings() if self.fp8_enabled and HAVE_TE else torch.no_grad(), \ - torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), \ - warnings.catch_warnings(): - warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') - assert tokens.shape == position_ids.shape - assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] - output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) + with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast( + enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe + ), torch.no_grad(), torch.inference_mode(), torch.autocast( + 'cuda', dtype=self.dtype + ), warnings.catch_warnings() if self.fp8_enabled and HAVE_TE else torch.no_grad(), torch.inference_mode(), torch.autocast( + 'cuda', dtype=self.dtype + ), warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + 
attention_mask=attention_mask.cuda(), + labels=None, + ) return output_tensor @@ -166,6 +171,7 @@ def input_names(self) -> List[str]: def output_names(self) -> List[str]: return ['log_probs'] + class MegatronGPTModel(MegatronBaseModel, TextGeneration): """ Megatron GPT pretraining From eaeafd790ea89905ee99ee91fb5d75ac202d8f2a Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Tue, 9 May 2023 13:05:15 -0700 Subject: [PATCH 10/14] set fp8_recipe to None by default Signed-off-by: Asfiya Baig --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 83afd1833ffb..61fe5696ca29 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -101,6 +101,7 @@ def __init__(self, model): super().__init__() self.model = model self.fp8_enabled = model.cfg.get('fp8', False) + self.fp8_recipe = None if self.fp8_enabled and HAVE_TE: self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling( margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3 From 26470b93784576951e3c60886fac0464874d4fd6 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Tue, 16 May 2023 23:46:49 +0000 Subject: [PATCH 11/14] address all comments Signed-off-by: Asfiya Baig --- .../conf/megatron_gpt_export.yaml | 6 +- .../language_modeling/megatron_gpt_export.py | 89 +++++++++++++++---- .../language_modeling/megatron_gpt_model.py | 56 +++++++----- 3 files changed, 108 insertions(+), 43 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml index 49616fea02ab..24d0c1548e69 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml @@ -5,17 +5,21 @@ trainer: logger: False # logger provided by exp_manager precision: bf16 # 16, 32, or bf16 +model_type: gpt tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) gpt_model_file: null # GPT nemo file path onnx_model_file: null # ONNX file path +checkpoint_dir: null # Checkpoint directory +checkpoint_name: null # Checkpoint name +hparams_file: null # hparams filepath export_options: runtime_check: False verbose: False onnx_opset: 17 - do_constant_folding: False + do_constant_folding: True cache_support: False device: 'cuda' check_tolerance: 0.01 diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index baf77b7653c6..96d6f4911a4b 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -27,27 +27,51 @@ # limitations under the License. 
import os -import sys -import warnings -from typing import Dict, List, Optional - -import torch -import transformer_engine.pytorch as te from omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer -from transformer_engine.common import recipe +from nemo.core import ModelPT +from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel +from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel +from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model +from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.core.classes import Exportable from nemo.core.config import hydra_runner from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.model_utils import inject_model_parallel_rank + +def get_model_class(cfg): + if cfg.model_type == 'gpt': + return MegatronGPTModel + elif cfg.model_type == 'bert': + return MegatronBertModel + elif cfg.model_type == 't5': + return MegatronT5Model + elif cfg.model_type == 'bart': + return MegatronBARTModel + elif cfg.model_type == 'nmt': + return MegatronNMTModel + elif cfg.model_type == 'retro': + return MegatronRetrievalModel + else: + raise ValueError("Invalid Model Type") + @hydra_runner(config_path="conf", config_name="megatron_gpt_export") def nemo_export(cfg): - """Convert a .nemo saved model into .onnx ONNX format.""" - nemo_in = cfg.gpt_model_file + """Convert a nemo model into .onnx ONNX format.""" + nemo_in = None + if cfg.gpt_model_file: + nemo_in = cfg.gpt_model_file + elif cfg.checkpoint_dir: + nemo_in = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) + assert nemo_in is not None, "NeMo model not provided. 
Please provide the path to the .nemo or .ckpt file" + onnx_out = cfg.onnx_model_file trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) @@ -58,13 +82,12 @@ def nemo_export(cfg): logging.info("Restoring NeMo model from '{}'".format(nemo_in)) try: - with torch.inference_mode(): - # Restore instance from .nemo file using generic model restore_from + if cfg.gpt_model_file: save_restore_connector = NLPSaveRestoreConnector() if os.path.isdir(cfg.gpt_model_file): save_restore_connector.model_extracted_dir = cfg.gpt_model_file - pretrained_cfg = MegatronGPTModel.restore_from( + pretrained_cfg = ModelPT.restore_from( restore_path=cfg.gpt_model_file, trainer=trainer, return_config=True, @@ -76,12 +99,39 @@ def nemo_export(cfg): pretrained_cfg.activations_checkpoint_granularity = None pretrained_cfg.activations_checkpoint_method = None pretrained_cfg.precision = trainer.precision - model = MegatronGPTModel.restore_from( + if trainer.precision == "16": + pretrained_cfg.megatron_amp_O2 = False + model = ModelPT.restore_from( restore_path=cfg.gpt_model_file, trainer=trainer, override_config_path=pretrained_cfg, save_restore_connector=save_restore_connector, ) + elif cfg.checkpoint_dir: + app_state = AppState() + if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, + ) + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) + model_cls = get_model_class(cfg) + model = model_cls.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) + else: + raise ValueError("need at least a nemo file or checkpoint dir") except Exception as e: logging.error( "Failed to restore model from NeMo file : {}. 
Please make sure you have the latest NeMo package installed with [all] dependencies.".format( @@ -92,20 +142,21 @@ def nemo_export(cfg): logging.info("Model {} restored from '{}'".format(model.__class__.__name__, nemo_in)) - if not isinstance(model, Exportable): - logging.error("Your NeMo model class ({}) is not Exportable.".format(model.__class__.__name__)) - sys.exit(1) - # Export check_trace = cfg.export_options.runtime_check try: model.to(device=cfg.export_options.device).freeze() model.eval() - model.mgpt_wrapper().export( + model.export( onnx_out, onnx_opset_version=cfg.export_options.onnx_opset, do_constant_folding=cfg.export_options.do_constant_folding, + dynamic_axes = { + 'input_ids': {0: "sequence", 1: "batch"}, + 'position_ids': {0: "sequence", 1: "batch"}, + 'logits': {0: "sequence", 1: "batch"} + }, check_trace=check_trace, check_tolerance=cfg.export_options.check_tolerance, verbose=cfg.export_options.verbose, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 61fe5696ca29..735bd80b3ac4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -118,22 +118,28 @@ def __init__(self, model): raise ValueError(f"precision: {model.cfg['precision']} is not supported.") def forward(self, tokens, position_ids, attention_mask): - with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast( - enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe - ), torch.no_grad(), torch.inference_mode(), torch.autocast( - 'cuda', dtype=self.dtype - ), warnings.catch_warnings() if self.fp8_enabled and HAVE_TE else torch.no_grad(), torch.inference_mode(), torch.autocast( - 'cuda', dtype=self.dtype - ), warnings.catch_warnings(): - warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') - assert tokens.shape == position_ids.shape - assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] - output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) + if self.fp8_enabled and HAVE_TE: + with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast(enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe), torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + labels=None, + ) + else: + with torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + labels=None, + ) return output_tensor @@ 
-155,22 +161,22 @@ def input_example(self, max_batch=1, max_dim=768, seq_len=6): @property def input_types(self) -> Optional[Dict[str, NeuralType]]: return { - "id_tensors": NeuralType(('B', 'T'), ChannelType()), + "input_ids": NeuralType(('B', 'T'), ChannelType()), "position_ids": NeuralType(('B', 'T'), ChannelType()), "attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()), } @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"log_probs": NeuralType(('B', 'T', 'D'), ChannelType())} + return {"logits": NeuralType(('B', 'T', 'D'), ChannelType())} @property def input_names(self) -> List[str]: - return ['id_tensors', 'position_ids', 'attention_mask'] + return ['input_ids', 'position_ids', 'attention_mask'] @property def output_names(self) -> List[str]: - return ['log_probs'] + return ['logits'] class MegatronGPTModel(MegatronBaseModel, TextGeneration): @@ -1056,9 +1062,6 @@ def setup_test_data(self, cfg): ) self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) - def mgpt_wrapper(self): - return MegatronGPTExportableModel(self) - def generate( self, inputs: Union[List[str], torch.Tensor, List[dict]], @@ -1201,6 +1204,13 @@ def parameters(self): else: return self.model.parameters() + @property + def mgpt_wrapper(self): + return MegatronGPTExportableModel(self) + + def list_export_subnets(self): + return ['mgpt_wrapper'] + def _reset_activation_checkpointing_args(self): """ Disables activation checkpointing completely and saves the values so that _restore_activation_checkpointing_args can restore them later. This function must always be From 76e3d8fa0b406240e093ea7879380b6a3453dc18 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 May 2023 23:52:03 +0000 Subject: [PATCH 12/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/language_modeling/megatron_gpt_export.py | 9 +++++---- .../models/language_modeling/megatron_gpt_model.py | 14 ++++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index 96d6f4911a4b..bf9157884bfc 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -27,10 +27,10 @@ # limitations under the License. 
import os + from omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer -from nemo.core import ModelPT from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -39,12 +39,13 @@ from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.core import ModelPT from nemo.core.config import hydra_runner from nemo.utils import logging - from nemo.utils.app_state import AppState from nemo.utils.model_utils import inject_model_parallel_rank + def get_model_class(cfg): if cfg.model_type == 'gpt': return MegatronGPTModel @@ -152,10 +153,10 @@ def nemo_export(cfg): onnx_out, onnx_opset_version=cfg.export_options.onnx_opset, do_constant_folding=cfg.export_options.do_constant_folding, - dynamic_axes = { + dynamic_axes={ 'input_ids': {0: "sequence", 1: "batch"}, 'position_ids': {0: "sequence", 1: "batch"}, - 'logits': {0: "sequence", 1: "batch"} + 'logits': {0: "sequence", 1: "batch"}, }, check_trace=check_trace, check_tolerance=cfg.export_options.check_tolerance, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 735bd80b3ac4..ef27450ff159 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -13,10 +13,10 @@ # limitations under the License. 
 import itertools
-import warnings
 import queue
+import warnings
 from functools import partial
-from typing import Any, Iterator, List, Optional, Union, Dict
+from typing import Any, Dict, Iterator, List, Optional, Union
 
 import numpy as np
 import torch
@@ -119,7 +119,11 @@ def __init__(self, model):
 
     def forward(self, tokens, position_ids, attention_mask):
         if self.fp8_enabled and HAVE_TE:
-            with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast(enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe), torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), warnings.catch_warnings():
+            with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast(
+                enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe
+            ), torch.no_grad(), torch.inference_mode(), torch.autocast(
+                'cuda', dtype=self.dtype
+            ), warnings.catch_warnings():
                 warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*')
                 assert tokens.shape == position_ids.shape
                 assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1]
@@ -130,7 +134,9 @@ def forward(self, tokens, position_ids, attention_mask):
                     labels=None,
                 )
         else:
-            with torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), warnings.catch_warnings():
+            with torch.no_grad(), torch.inference_mode(), torch.autocast(
+                'cuda', dtype=self.dtype
+            ), warnings.catch_warnings():
                 warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*')
                 assert tokens.shape == position_ids.shape
                 assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1]

From 7948f3517da62369d0adde4b1469b85e710a7632 Mon Sep 17 00:00:00 2001
From: Asfiya Baig
Date: Wed, 17 May 2023 00:04:32 +0000
Subject: [PATCH 13/14] typecast precision check for fp16

Signed-off-by: Asfiya Baig
---
 nemo/collections/nlp/modules/common/megatron/attention.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py
index c025c1fc32ba..aaeb05d43cde 100644
--- a/nemo/collections/nlp/modules/common/megatron/attention.py
+++ b/nemo/collections/nlp/modules/common/megatron/attention.py
@@ -697,8 +697,12 @@ def __init__(
         super(CoreAttention, self).__init__()
 
         self.precision = precision
-        self.fp16 = precision == 16
-        self.bf16 = precision == 'bf16'
+        self.fp16 = False
+        self.bf16 = False
+        if precision == 'bf16':
+            self.bf16 = True
+        elif int(precision) == 16:
+            self.fp16 = True
 
         self.multi_query_attention = multi_query_attention
         self.apply_query_key_layer_scaling = apply_query_key_layer_scaling

From 4a83c47e62b2625d36400096e0dd6ccd890e8ceb Mon Sep 17 00:00:00 2001
From: Asfiya Baig
Date: Thu, 18 May 2023 19:01:39 +0000
Subject: [PATCH 14/14] rename export script

Signed-off-by: Asfiya Baig
---
 .../{megatron_gpt_export.py => megatron_export.py}          | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/nlp/language_modeling/{megatron_gpt_export.py => megatron_export.py} (100%)

diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_export.py
similarity index 100%
rename from examples/nlp/language_modeling/megatron_gpt_export.py
rename to examples/nlp/language_modeling/megatron_export.py
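
Usage note: with the series applied, a GPT checkpoint trained with fp8: true can be exported to ONNX by restoring it and calling its export() method; the Transformer Engine onnx_export / fp8_autocast contexts run inside MegatronGPTExportableModel.forward(), which list_export_subnets() exposes as the mgpt_wrapper subnet. The sketch below is a minimal single-GPU illustration with placeholder paths (tensor/pipeline parallel size 1); it omits the config overrides (sequence parallelism, activation checkpointing, precision) that the full megatron_export.py script applies before restoring, so treat it as an outline rather than a drop-in replacement for the script.

    # Minimal sketch, not the shipped script: restore a .nemo GPT checkpoint and
    # export it to ONNX. Assumes transformer_engine is installed, the model fits
    # on one GPU (TP=PP=1), and both paths below are placeholders.
    from pytorch_lightning import Trainer

    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
    from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector

    trainer = Trainer(strategy=NLPDDPStrategy(), devices=1, accelerator='gpu', precision='bf16')
    model = MegatronGPTModel.restore_from(
        restore_path='/path/to/megatron_gpt.nemo',  # placeholder checkpoint path
        trainer=trainer,
        save_restore_connector=NLPSaveRestoreConnector(),
    )
    model.to(device='cuda').freeze()
    model.eval()

    # Exportable.export() walks list_export_subnets(); after this series that returns
    # ['mgpt_wrapper'], i.e. MegatronGPTExportableModel, whose forward() enables
    # te.onnx_export / te.fp8_autocast when the restored config has fp8: true.
    model.export(
        '/path/to/megatron_gpt_fp8.onnx',  # placeholder output path
        onnx_opset_version=17,
        do_constant_folding=True,
        check_trace=False,
    )

The shipped script drives the same flow through Hydra, roughly: python examples/nlp/language_modeling/megatron_export.py model_type=gpt gpt_model_file=/path/to/megatron_gpt.nemo onnx_model_file=/path/to/megatron_gpt_fp8.onnx, with opset, constant folding, runtime check, and tolerance taken from the export_options block of megatron_gpt_export.yaml.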