From e57472e9ed54e4f7c331ae51f02b0714917b89b7 Mon Sep 17 00:00:00 2001
From: Asfiya Baig
Date: Sun, 16 Apr 2023 18:27:23 -0700
Subject: [PATCH 01/14] add GPT FP8 ONNX export support

Signed-off-by: Asfiya Baig
---
 .../conf/megatron_gpt_export.yaml             |  22 ++
 .../language_modeling/megatron_gpt_export.py  | 225 ++++++++++++++++++
 .../nlp/modules/common/megatron/module.py     |   2 +-
 nemo/utils/export_utils.py                    |   7 +-
 4 files changed, 253 insertions(+), 3 deletions(-)
 create mode 100644 examples/nlp/language_modeling/conf/megatron_gpt_export.yaml
 create mode 100644 examples/nlp/language_modeling/megatron_gpt_export.py

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml
new file mode 100644
index 000000000000..63d2d7ee9a40
--- /dev/null
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml
@@ -0,0 +1,22 @@
+trainer:
+  devices: 1
+  num_nodes: 1
+  accelerator: gpu
+  logger: False # logger provided by exp_manager
+  precision: bf16 # 16, 32, or bf16
+
+tensor_model_parallel_size: 1
+pipeline_model_parallel_size: 1
+pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others)
+gpt_model_file: null # GPT nemo file path
+onnx_model_file: null # ONNX file path
+
+export_options:
+  autocast: True
+  runtime_check: False
+  verbose: None
+  onnx_opset: 17
+  do_constant_folding: True
+  cache_support: True
+  device: 'cuda'
+  check_tolerance: 0.01
diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py
new file mode 100644
index 000000000000..b0a243964a9c
--- /dev/null
+++ b/examples/nlp/language_modeling/megatron_gpt_export.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import os +import sys +import warnings +from typing import Optional, List, Dict + +import torch +from omegaconf import OmegaConf, open_dict +from pytorch_lightning import Trainer + +from nemo.core.config import hydra_runner +from nemo.core.classes import Exportable +from nemo.core.neural_types import ChannelType, NeuralType +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.utils import logging + +import transformer_engine.pytorch as te +from transformer_engine.common import recipe + +try: + from contextlib import nullcontext +except ImportError: + # handle python < 3.7 + from contextlib import suppress as nullcontext + +class MegatronGPTExportableModel(torch.nn.Module, Exportable): + def __init__(self, model): + super().__init__() + self.model = model + self.dtype = None + if model.cfg['precision'] == 'bf16': + self.dtype = torch.bfloat16 + elif int(model.cfg['precision']) == 32: + self.dtype = torch.float + elif int(model.cfg['precision']) == 16: + self.dtype = torch.float16 + else: + raise ValueError(f"precision: {model.cfg['precision']} is not supported.") + + def forward(self, id_tensors, masks_and_position_ids): + output_tensors = [] + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + assert tokens.shape == pos_ids.shape + assert attn_mask.shape[2] == attn_mask.shape[3] == tokens.shape[1] == pos_ids.shape[1] + with torch.autocast('cuda', dtype=self.dtype): + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=pos_ids.cuda(), + attention_mask=attn_mask.cuda(), + labels=None, + ) + + output_tensors.append(output_tensor) + return output_tensors + + def freeze(self): + for param in self.parameters(): + param.requires_grad = False + + def input_example(self, max_batch=1, max_dim=768, seq_len=6): + ids = [self.model.tokenizer.text_to_ids(text) for text in ['hi there']] + id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] + masks_and_position_ids = [ + get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False) + for id_tensor in id_tensors + ] + + return id_tensors, masks_and_position_ids + + def get_dynamic_axes(self): + dynamic_axes = { + 'id_tensors': {0: "BS", 1: "sequence"}, + 'masks_and_position_ids': {0: "BS", 2: "sequence", 3: "sequence"}, + } + return dynamic_axes + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "id_tensors": NeuralType(('B'), ChannelType()), + "masks_and_position_ids": NeuralType(('B'), ChannelType()), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"log_probs": NeuralType(('B', 'T', 'D'), ChannelType())} + + @property + def input_names(self) -> List[str]: + return ['id_tensors', 'masks_and_position_ids'] + + @property + def output_names(self) -> List[str]: + return ['log_probs'] + +@hydra_runner(config_path="conf", config_name="megatron_gpt_export") +def nemo_export(cfg): + """Convert a .nemo saved model into .onnx ONNX format.""" + nemo_in = cfg.gpt_model_file + onnx_out = cfg.onnx_model_file + + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) + assert ( + cfg.trainer.devices * cfg.trainer.num_nodes + == cfg.tensor_model_parallel_size * 
cfg.pipeline_model_parallel_size + ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" + + + logging.info("Restoring NeMo model from '{}'".format(nemo_in)) + try: + with torch.inference_mode(): + # Restore instance from .nemo file using generic model restore_from + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file + + pretrained_cfg = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, + ) + OmegaConf.set_struct(pretrained_cfg, True) + with open_dict(pretrained_cfg): + pretrained_cfg.sequence_parallel = False + pretrained_cfg.activations_checkpoint_granularity = None + pretrained_cfg.activations_checkpoint_method = None + pretrained_cfg.precision = trainer.precision + model = MegatronGPTModel.restore_from( + restore_path=cfg.gpt_model_file, + trainer=trainer, + override_config_path=pretrained_cfg, + save_restore_connector=save_restore_connector, + ) + except Exception as e: + logging.error( + "Failed to restore model from NeMo file : {}. Please make sure you have the latest NeMo package installed with [all] dependencies.".format( + nemo_in + ) + ) + raise e + + logging.info("Model {} restored from '{}'".format(model.__class__.__name__, nemo_in)) + + if not isinstance(model, Exportable): + logging.error("Your NeMo model class ({}) is not Exportable.".format(model.__class__.__name__)) + sys.exit(1) + + # + # Add custom export parameters here + # + check_trace = cfg.export_options.runtime_check + + if cfg.export_options.cache_support and hasattr(model, "encoder") and hasattr(model.encoder, "export_cache_support"): + model.encoder.export_cache_support = True + logging.info("Caching support is enabled.") + model.encoder.setup_streaming_params() + + autocast = nullcontext + if cfg.export_options.autocast: + autocast = torch.cuda.amp.autocast + fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3) + try: + with autocast(), torch.no_grad(), torch.inference_mode(), te.onnx_export(True), \ + te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe), warnings.catch_warnings(): + warnings.filterwarnings( + action='ignore', + category=torch.jit.TracerWarning, + module=r'.*' + ) + + model.to(device=cfg.export_options.device).freeze() + model.eval() + exportable_model = MegatronGPTExportableModel(model) + + exportable_model.export( + onnx_out, + onnx_opset_version=cfg.export_options.onnx_opset, + do_constant_folding=cfg.export_options.do_constant_folding, + dynamic_axes=exportable_model.get_dynamic_axes(), + check_trace=check_trace, + check_tolerance=cfg.export_options.check_tolerance, + verbose=cfg.export_options.verbose, + ) + except Exception as e: + logging.error( + "Export failed. 
Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format( + model.__class__ + ) + ) + raise e + + +if __name__ == '__main__': + nemo_export() diff --git a/nemo/collections/nlp/modules/common/megatron/module.py b/nemo/collections/nlp/modules/common/megatron/module.py index 58ce7a7bae18..598272366b44 100644 --- a/nemo/collections/nlp/modules/common/megatron/module.py +++ b/nemo/collections/nlp/modules/common/megatron/module.py @@ -262,7 +262,7 @@ def __init__(self, module, precision): super().__init__() self.precision = precision - if precision == 16: + if int(precision) == 16: self.add_module('module', module.half()) def float16_converter(val): diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index 9fa2bc239eb8..7131f9ce6fc1 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -309,7 +309,8 @@ def replace_FusedScaleMaskSoftmax(n: nn.Module) -> Optional[nn.Linear]: Equivalent LayerNorm module """ if not isinstance(n, FusedScaleMaskSoftmax): - raise ValueError("This function can only change the FusedScaleMaskSoftmax module.") + logging.warning("This function can only change the FusedScaleMaskSoftmax module.") + return n # disable the fusion only mod = FusedScaleMaskSoftmax( @@ -454,7 +455,9 @@ def replace_for_export(model: nn.Module) -> nn.Module: } replace_modules(model, default_Apex_replacements) - replace_modules(model, default_replacements) + # Apply CastToFloat for torch.float32 + if model.dtype==torch.float32: + replace_modules(model, default_replacements) # This one has to be the last replace_modules(model, script_replacements) From f64d546d6b1cf714614b051957b2b374f5085975 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Wed, 19 Apr 2023 11:47:50 -0700 Subject: [PATCH 02/14] changes 1. Add dynamic axes for inputs 2. 
Update model input_example to resolve size error by TE Signed-off-by: Asfiya Baig --- .../conf/megatron_gpt_export.yaml | 5 +- .../language_modeling/megatron_gpt_export.py | 50 +++++++++---------- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml index 63d2d7ee9a40..798d156146d5 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml @@ -12,11 +12,10 @@ gpt_model_file: null # GPT nemo file path onnx_model_file: null # ONNX file path export_options: - autocast: True runtime_check: False - verbose: None + verbose: False onnx_opset: 17 do_constant_folding: True - cache_support: True + cache_support: False device: 'cuda' check_tolerance: 0.01 diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index b0a243964a9c..0773bf143a12 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -66,49 +66,48 @@ def __init__(self, model): else: raise ValueError(f"precision: {model.cfg['precision']} is not supported.") - def forward(self, id_tensors, masks_and_position_ids): - output_tensors = [] - for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): - attn_mask, _, pos_ids = attn_mask_and_pos_ids - assert tokens.shape == pos_ids.shape - assert attn_mask.shape[2] == attn_mask.shape[3] == tokens.shape[1] == pos_ids.shape[1] - with torch.autocast('cuda', dtype=self.dtype): - output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=pos_ids.cuda(), - attention_mask=attn_mask.cuda(), - labels=None, - ) - - output_tensors.append(output_tensor) - return output_tensors + def forward(self, tokens, position_ids, attention_mask): + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + with torch.autocast('cuda', dtype=self.dtype): + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + labels=None, + ) + + return output_tensor def freeze(self): for param in self.parameters(): param.requires_grad = False def input_example(self, max_batch=1, max_dim=768, seq_len=6): - ids = [self.model.tokenizer.text_to_ids(text) for text in ['hi there']] + ids = [self.model.tokenizer.text_to_ids(text) for text in ["how is the weather on Sunday"]] id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] masks_and_position_ids = [ get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False) for id_tensor in id_tensors ] - - return id_tensors, masks_and_position_ids + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + return tokens, pos_ids, attn_mask def get_dynamic_axes(self): dynamic_axes = { 'id_tensors': {0: "BS", 1: "sequence"}, - 'masks_and_position_ids': {0: "BS", 2: "sequence", 3: "sequence"}, + 'position_ids': {0: "BS", 1: "sequence"}, + 'attention_mask': {0: "BS", 2: "sequence", 3: "sequence"}, } return dynamic_axes @property def input_types(self) -> Optional[Dict[str, NeuralType]]: return { - "id_tensors": NeuralType(('B'), ChannelType()), - "masks_and_position_ids": NeuralType(('B'), ChannelType()), + 
"id_tensors": NeuralType(('B', 'T'), ChannelType()), + "position_ids": NeuralType(('B', 'T'), ChannelType()), + "attention_mask": NeuralType(('B', 'D', 'T', 'T'), ChannelType()), } @property @@ -117,7 +116,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: @property def input_names(self) -> List[str]: - return ['id_tensors', 'masks_and_position_ids'] + return ['id_tensors', 'position_ids', 'attention_mask'] @property def output_names(self) -> List[str]: @@ -186,12 +185,9 @@ def nemo_export(cfg): logging.info("Caching support is enabled.") model.encoder.setup_streaming_params() - autocast = nullcontext - if cfg.export_options.autocast: - autocast = torch.cuda.amp.autocast fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3) try: - with autocast(), torch.no_grad(), torch.inference_mode(), te.onnx_export(True), \ + with torch.no_grad(), torch.inference_mode(), te.onnx_export(True), \ te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe), warnings.catch_warnings(): warnings.filterwarnings( action='ignore', From 0f5b9b71d9259b0484ba58575e03b9543fdaf96a Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Wed, 19 Apr 2023 16:21:18 -0700 Subject: [PATCH 03/14] Conform to Python style guidelines Signed-off-by: Asfiya Baig --- .../language_modeling/megatron_gpt_export.py | 61 +++++++++---------- nemo/utils/export_utils.py | 2 +- 2 files changed, 29 insertions(+), 34 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index 0773bf143a12..e2221b762962 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -29,28 +29,22 @@ import os import sys import warnings -from typing import Optional, List, Dict +from typing import Dict, List, Optional import torch +import transformer_engine.pytorch as te from omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer +from transformer_engine.common import recipe -from nemo.core.config import hydra_runner -from nemo.core.classes import Exportable -from nemo.core.neural_types import ChannelType, NeuralType -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.core.classes import Exportable +from nemo.core.config import hydra_runner +from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils import logging -import transformer_engine.pytorch as te -from transformer_engine.common import recipe - -try: - from contextlib import nullcontext -except ImportError: - # handle python < 3.7 - from contextlib import suppress as nullcontext class MegatronGPTExportableModel(torch.nn.Module, Exportable): def __init__(self, model): @@ -71,11 +65,11 @@ def forward(self, tokens, position_ids, attention_mask): assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] with torch.autocast('cuda', dtype=self.dtype): output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + 
labels=None, + ) return output_tensor @@ -93,21 +87,21 @@ def input_example(self, max_batch=1, max_dim=768, seq_len=6): for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): attn_mask, _, pos_ids = attn_mask_and_pos_ids return tokens, pos_ids, attn_mask - + def get_dynamic_axes(self): dynamic_axes = { - 'id_tensors': {0: "BS", 1: "sequence"}, - 'position_ids': {0: "BS", 1: "sequence"}, - 'attention_mask': {0: "BS", 2: "sequence", 3: "sequence"}, + 'id_tensors': {0: "BS", 1: "sequence"}, + 'position_ids': {0: "BS", 1: "sequence"}, + 'attention_mask': {0: "BS", 2: "sequence", 3: "sequence"}, } return dynamic_axes - + @property def input_types(self) -> Optional[Dict[str, NeuralType]]: return { "id_tensors": NeuralType(('B', 'T'), ChannelType()), "position_ids": NeuralType(('B', 'T'), ChannelType()), - "attention_mask": NeuralType(('B', 'D', 'T', 'T'), ChannelType()), + "attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()), } @property @@ -122,6 +116,7 @@ def input_names(self) -> List[str]: def output_names(self) -> List[str]: return ['log_probs'] + @hydra_runner(config_path="conf", config_name="megatron_gpt_export") def nemo_export(cfg): """Convert a .nemo saved model into .onnx ONNX format.""" @@ -134,7 +129,6 @@ def nemo_export(cfg): == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size" - logging.info("Restoring NeMo model from '{}'".format(nemo_in)) try: with torch.inference_mode(): @@ -180,20 +174,21 @@ def nemo_export(cfg): # check_trace = cfg.export_options.runtime_check - if cfg.export_options.cache_support and hasattr(model, "encoder") and hasattr(model.encoder, "export_cache_support"): + if ( + cfg.export_options.cache_support + and hasattr(model, "encoder") + and hasattr(model.encoder, "export_cache_support") + ): model.encoder.export_cache_support = True logging.info("Caching support is enabled.") model.encoder.setup_streaming_params() fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3) try: - with torch.no_grad(), torch.inference_mode(), te.onnx_export(True), \ - te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe), warnings.catch_warnings(): - warnings.filterwarnings( - action='ignore', - category=torch.jit.TracerWarning, - module=r'.*' - ) + with torch.no_grad(), torch.inference_mode(), te.onnx_export(True), te.fp8_autocast( + enabled=True, fp8_recipe=fp8_recipe + ), warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') model.to(device=cfg.export_options.device).freeze() model.eval() diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index 7131f9ce6fc1..edfad0af62e5 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -456,7 +456,7 @@ def replace_for_export(model: nn.Module) -> nn.Module: replace_modules(model, default_Apex_replacements) # Apply CastToFloat for torch.float32 - if model.dtype==torch.float32: + if model.dtype == torch.float32: replace_modules(model, default_replacements) # This one has to be the last replace_modules(model, script_replacements) From 2c5d5fb677e4fe96925fa72066795fde4272d7e5 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Wed, 26 Apr 2023 16:58:48 -0700 Subject: [PATCH 04/14] refactor to avoid typecasting bf16 string Signed-off-by: Asfiya Baig --- .../nlp/modules/common/megatron/module.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/nemo/collections/nlp/modules/common/megatron/module.py b/nemo/collections/nlp/modules/common/megatron/module.py index 598272366b44..0a340985eec2 100644 --- a/nemo/collections/nlp/modules/common/megatron/module.py +++ b/nemo/collections/nlp/modules/common/megatron/module.py @@ -262,17 +262,17 @@ def __init__(self, module, precision): super().__init__() self.precision = precision - if int(precision) == 16: - self.add_module('module', module.half()) + if precision == 'bf16': + self.add_module('module', module.bfloat16()) def float16_converter(val): - return val.half() + return val.bfloat16() - elif precision == 'bf16': - self.add_module('module', module.bfloat16()) + elif int(precision) == 16: + self.add_module('module', module.half()) def float16_converter(val): - return val.bfloat16() + return val.half() else: raise Exception( From 162cb6c9bc23f587b8d7f1a6bf56d362a42edb6a Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Thu, 27 Apr 2023 14:50:03 -0700 Subject: [PATCH 05/14] fix attribute error in export_utils Signed-off-by: Asfiya Baig --- nemo/utils/export_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index edfad0af62e5..b4debf746a98 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -456,7 +456,7 @@ def replace_for_export(model: nn.Module) -> nn.Module: replace_modules(model, default_Apex_replacements) # Apply CastToFloat for torch.float32 - if model.dtype == torch.float32: + if hasattr(model, 'dtype') and model.dtype == torch.float32: replace_modules(model, default_replacements) # This one has to be the last replace_modules(model, script_replacements) From 29917eec292cdaf8c849c7cd982845d83ad48ee6 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Thu, 4 May 2023 08:28:47 -0700 Subject: [PATCH 06/14] set constant_folding to False by default Signed-off-by: Asfiya Baig --- examples/nlp/language_modeling/conf/megatron_gpt_export.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml index 798d156146d5..49616fea02ab 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml @@ -15,7 +15,7 @@ export_options: runtime_check: False verbose: False onnx_opset: 17 - do_constant_folding: True + do_constant_folding: False cache_support: False device: 'cuda' check_tolerance: 0.01 From bb16f60d6fc6517e8925a1f283451869bedd05a0 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Tue, 9 May 2023 10:42:29 -0700 Subject: [PATCH 07/14] refactor exportable wrapper into model class definition Signed-off-by: Asfiya Baig --- .../language_modeling/megatron_gpt_export.py | 115 ++---------------- .../language_modeling/megatron_gpt_model.py | 83 ++++++++++++- 2 files changed, 93 insertions(+), 105 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index e2221b762962..baf77b7653c6 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -38,85 +38,12 @@ from transformer_engine.common import recipe from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids from nemo.collections.nlp.parts.nlp_overrides import 
NLPDDPStrategy, NLPSaveRestoreConnector from nemo.core.classes import Exportable from nemo.core.config import hydra_runner -from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils import logging -class MegatronGPTExportableModel(torch.nn.Module, Exportable): - def __init__(self, model): - super().__init__() - self.model = model - self.dtype = None - if model.cfg['precision'] == 'bf16': - self.dtype = torch.bfloat16 - elif int(model.cfg['precision']) == 32: - self.dtype = torch.float - elif int(model.cfg['precision']) == 16: - self.dtype = torch.float16 - else: - raise ValueError(f"precision: {model.cfg['precision']} is not supported.") - - def forward(self, tokens, position_ids, attention_mask): - assert tokens.shape == position_ids.shape - assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] - with torch.autocast('cuda', dtype=self.dtype): - output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) - - return output_tensor - - def freeze(self): - for param in self.parameters(): - param.requires_grad = False - - def input_example(self, max_batch=1, max_dim=768, seq_len=6): - ids = [self.model.tokenizer.text_to_ids(text) for text in ["how is the weather on Sunday"]] - id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] - masks_and_position_ids = [ - get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False) - for id_tensor in id_tensors - ] - for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): - attn_mask, _, pos_ids = attn_mask_and_pos_ids - return tokens, pos_ids, attn_mask - - def get_dynamic_axes(self): - dynamic_axes = { - 'id_tensors': {0: "BS", 1: "sequence"}, - 'position_ids': {0: "BS", 1: "sequence"}, - 'attention_mask': {0: "BS", 2: "sequence", 3: "sequence"}, - } - return dynamic_axes - - @property - def input_types(self) -> Optional[Dict[str, NeuralType]]: - return { - "id_tensors": NeuralType(('B', 'T'), ChannelType()), - "position_ids": NeuralType(('B', 'T'), ChannelType()), - "attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()), - } - - @property - def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"log_probs": NeuralType(('B', 'T', 'D'), ChannelType())} - - @property - def input_names(self) -> List[str]: - return ['id_tensors', 'position_ids', 'attention_mask'] - - @property - def output_names(self) -> List[str]: - return ['log_probs'] - - @hydra_runner(config_path="conf", config_name="megatron_gpt_export") def nemo_export(cfg): """Convert a .nemo saved model into .onnx ONNX format.""" @@ -169,40 +96,20 @@ def nemo_export(cfg): logging.error("Your NeMo model class ({}) is not Exportable.".format(model.__class__.__name__)) sys.exit(1) - # - # Add custom export parameters here - # + # Export check_trace = cfg.export_options.runtime_check - if ( - cfg.export_options.cache_support - and hasattr(model, "encoder") - and hasattr(model.encoder, "export_cache_support") - ): - model.encoder.export_cache_support = True - logging.info("Caching support is enabled.") - model.encoder.setup_streaming_params() - - fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3) try: - with torch.no_grad(), torch.inference_mode(), te.onnx_export(True), te.fp8_autocast( - enabled=True, fp8_recipe=fp8_recipe - ), warnings.catch_warnings(): - warnings.filterwarnings(action='ignore', 
category=torch.jit.TracerWarning, module=r'.*') - - model.to(device=cfg.export_options.device).freeze() - model.eval() - exportable_model = MegatronGPTExportableModel(model) - - exportable_model.export( - onnx_out, - onnx_opset_version=cfg.export_options.onnx_opset, - do_constant_folding=cfg.export_options.do_constant_folding, - dynamic_axes=exportable_model.get_dynamic_axes(), - check_trace=check_trace, - check_tolerance=cfg.export_options.check_tolerance, - verbose=cfg.export_options.verbose, - ) + model.to(device=cfg.export_options.device).freeze() + model.eval() + model.mgpt_wrapper().export( + onnx_out, + onnx_opset_version=cfg.export_options.onnx_opset, + do_constant_folding=cfg.export_options.do_constant_folding, + check_trace=check_trace, + check_tolerance=cfg.export_options.check_tolerance, + verbose=cfg.export_options.verbose, + ) except Exception as e: logging.error( "Export failed. Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index e9545361b88d..8a3f1e386442 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -15,7 +15,7 @@ import itertools import queue from functools import partial -from typing import Any, Iterator, List, Optional, Union +from typing import Any, Iterator, List, Optional, Union, Dict import numpy as np import torch @@ -37,6 +37,7 @@ average_losses_across_data_parallel_group, get_all_params_for_weight_decay_optimization, get_params_for_weight_decay_optimization, + get_ltor_masks_and_position_ids, ) from nemo.collections.nlp.modules.common.text_generation_utils import ( generate, @@ -53,6 +54,8 @@ ) from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.collections.nlp.parts.utils_funcs import get_last_rank +from nemo.core.classes import Exportable +from nemo.core.neural_types import ChannelType, NeuralType from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging @@ -88,6 +91,81 @@ HAVE_TE = False +class MegatronGPTExportableModel(torch.nn.Module, Exportable): + """ + Megatron GPT Wrapper for ONNX export + """ + def __init__(self, model): + super().__init__() + self.model = model + self.fp8_enabled = model.cfg.get('fp8', False) + if self.fp8_enabled and HAVE_TE: + self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling(margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3) + + self.dtype = None + if model.cfg['precision'] == 'bf16': + self.dtype = torch.bfloat16 + elif int(model.cfg['precision']) == 32: + self.dtype = torch.float + elif int(model.cfg['precision']) == 16: + self.dtype = torch.float16 + else: + raise ValueError(f"precision: {model.cfg['precision']} is not supported.") + + def forward(self, tokens, position_ids, attention_mask): + with transformer_engine.pytorch.onnx_export(self.fp8_enabled), \ + transformer_engine.pytorch.fp8_autocast(enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe), \ + torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), \ + warnings.catch_warnings() if self.fp8_enabled and HAVE_TE else torch.no_grad(), \ + torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), \ + warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', 
category=torch.jit.TracerWarning, module=r'.*') + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + labels=None, + ) + + return output_tensor + + def freeze(self): + for param in self.parameters(): + param.requires_grad = False + + def input_example(self, max_batch=1, max_dim=768, seq_len=6): + ids = [self.model.tokenizer.text_to_ids(text) for text in ["how is the weather on Sunday"]] + id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids] + masks_and_position_ids = [ + get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False) + for id_tensor in id_tensors + ] + for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids): + attn_mask, _, pos_ids = attn_mask_and_pos_ids + return tokens, pos_ids, attn_mask + + @property + def input_types(self) -> Optional[Dict[str, NeuralType]]: + return { + "id_tensors": NeuralType(('B', 'T'), ChannelType()), + "position_ids": NeuralType(('B', 'T'), ChannelType()), + "attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()), + } + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + return {"log_probs": NeuralType(('B', 'T', 'D'), ChannelType())} + + @property + def input_names(self) -> List[str]: + return ['id_tensors', 'position_ids', 'attention_mask'] + + @property + def output_names(self) -> List[str]: + return ['log_probs'] + class MegatronGPTModel(MegatronBaseModel, TextGeneration): """ Megatron GPT pretraining @@ -971,6 +1049,9 @@ def setup_test_data(self, cfg): ) self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) + def mgpt_wrapper(self): + return MegatronGPTExportableModel(self) + def generate( self, inputs: Union[List[str], torch.Tensor, List[dict]], From 580c026ee7c2ec3a2f604dda69a247f0af23d573 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Tue, 9 May 2023 10:44:14 -0700 Subject: [PATCH 08/14] remove conditional replacement of modules Signed-off-by: Asfiya Baig --- nemo/utils/export_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index b4debf746a98..60203ed6b9f0 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -455,9 +455,7 @@ def replace_for_export(model: nn.Module) -> nn.Module: } replace_modules(model, default_Apex_replacements) - # Apply CastToFloat for torch.float32 - if hasattr(model, 'dtype') and model.dtype == torch.float32: - replace_modules(model, default_replacements) + replace_modules(model, default_replacements) # This one has to be the last replace_modules(model, script_replacements) From 2357e7773929a3475bbc8684321a9968b065690a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 9 May 2023 17:45:07 +0000 Subject: [PATCH 09/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron_gpt_model.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8a3f1e386442..83afd1833ffb 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -13,6 +13,7 @@ # limitations under the License. import itertools +import warnings import queue from functools import partial from typing import Any, Iterator, List, Optional, Union, Dict @@ -36,8 +37,8 @@ from nemo.collections.nlp.modules.common.megatron.utils import ( average_losses_across_data_parallel_group, get_all_params_for_weight_decay_optimization, - get_params_for_weight_decay_optimization, get_ltor_masks_and_position_ids, + get_params_for_weight_decay_optimization, ) from nemo.collections.nlp.modules.common.text_generation_utils import ( generate, @@ -55,8 +56,8 @@ from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes import Exportable -from nemo.core.neural_types import ChannelType, NeuralType from nemo.core.classes.common import PretrainedModelInfo +from nemo.core.neural_types import ChannelType, NeuralType from nemo.utils import logging try: @@ -95,12 +96,15 @@ class MegatronGPTExportableModel(torch.nn.Module, Exportable): """ Megatron GPT Wrapper for ONNX export """ + def __init__(self, model): super().__init__() self.model = model self.fp8_enabled = model.cfg.get('fp8', False) if self.fp8_enabled and HAVE_TE: - self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling(margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3) + self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling( + margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3 + ) self.dtype = None if model.cfg['precision'] == 'bf16': @@ -113,21 +117,22 @@ def __init__(self, model): raise ValueError(f"precision: {model.cfg['precision']} is not supported.") def forward(self, tokens, position_ids, attention_mask): - with transformer_engine.pytorch.onnx_export(self.fp8_enabled), \ - transformer_engine.pytorch.fp8_autocast(enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe), \ - torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), \ - warnings.catch_warnings() if self.fp8_enabled and HAVE_TE else torch.no_grad(), \ - torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), \ - warnings.catch_warnings(): - warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') - assert tokens.shape == position_ids.shape - assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] - output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) + with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast( + enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe + ), torch.no_grad(), torch.inference_mode(), torch.autocast( + 'cuda', dtype=self.dtype + ), warnings.catch_warnings() if self.fp8_enabled and HAVE_TE else torch.no_grad(), torch.inference_mode(), torch.autocast( + 'cuda', dtype=self.dtype + ), warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + 
attention_mask=attention_mask.cuda(), + labels=None, + ) return output_tensor @@ -166,6 +171,7 @@ def input_names(self) -> List[str]: def output_names(self) -> List[str]: return ['log_probs'] + class MegatronGPTModel(MegatronBaseModel, TextGeneration): """ Megatron GPT pretraining From eaeafd790ea89905ee99ee91fb5d75ac202d8f2a Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Tue, 9 May 2023 13:05:15 -0700 Subject: [PATCH 10/14] set fp8_recipe to None by default Signed-off-by: Asfiya Baig --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 83afd1833ffb..61fe5696ca29 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -101,6 +101,7 @@ def __init__(self, model): super().__init__() self.model = model self.fp8_enabled = model.cfg.get('fp8', False) + self.fp8_recipe = None if self.fp8_enabled and HAVE_TE: self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling( margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3 From 26470b93784576951e3c60886fac0464874d4fd6 Mon Sep 17 00:00:00 2001 From: Asfiya Baig Date: Tue, 16 May 2023 23:46:49 +0000 Subject: [PATCH 11/14] address all comments Signed-off-by: Asfiya Baig --- .../conf/megatron_gpt_export.yaml | 6 +- .../language_modeling/megatron_gpt_export.py | 89 +++++++++++++++---- .../language_modeling/megatron_gpt_model.py | 56 +++++++----- 3 files changed, 108 insertions(+), 43 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml index 49616fea02ab..24d0c1548e69 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_export.yaml @@ -5,17 +5,21 @@ trainer: logger: False # logger provided by exp_manager precision: bf16 # 16, 32, or bf16 +model_type: gpt tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) gpt_model_file: null # GPT nemo file path onnx_model_file: null # ONNX file path +checkpoint_dir: null # Checkpoint directory +checkpoint_name: null # Checkpoint name +hparams_file: null # hparams filepath export_options: runtime_check: False verbose: False onnx_opset: 17 - do_constant_folding: False + do_constant_folding: True cache_support: False device: 'cuda' check_tolerance: 0.01 diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index baf77b7653c6..96d6f4911a4b 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -27,27 +27,51 @@ # limitations under the License. 
import os -import sys -import warnings -from typing import Dict, List, Optional - -import torch -import transformer_engine.pytorch as te from omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer -from transformer_engine.common import recipe +from nemo.core import ModelPT +from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel +from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel +from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model +from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector -from nemo.core.classes import Exportable from nemo.core.config import hydra_runner from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.model_utils import inject_model_parallel_rank + +def get_model_class(cfg): + if cfg.model_type == 'gpt': + return MegatronGPTModel + elif cfg.model_type == 'bert': + return MegatronBertModel + elif cfg.model_type == 't5': + return MegatronT5Model + elif cfg.model_type == 'bart': + return MegatronBARTModel + elif cfg.model_type == 'nmt': + return MegatronNMTModel + elif cfg.model_type == 'retro': + return MegatronRetrievalModel + else: + raise ValueError("Invalid Model Type") + @hydra_runner(config_path="conf", config_name="megatron_gpt_export") def nemo_export(cfg): - """Convert a .nemo saved model into .onnx ONNX format.""" - nemo_in = cfg.gpt_model_file + """Convert a nemo model into .onnx ONNX format.""" + nemo_in = None + if cfg.gpt_model_file: + nemo_in = cfg.gpt_model_file + elif cfg.checkpoint_dir: + nemo_in = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) + assert nemo_in is not None, "NeMo model not provided. 
Please provide the path to the .nemo or .ckpt file" + onnx_out = cfg.onnx_model_file trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) @@ -58,13 +82,12 @@ def nemo_export(cfg): logging.info("Restoring NeMo model from '{}'".format(nemo_in)) try: - with torch.inference_mode(): - # Restore instance from .nemo file using generic model restore_from + if cfg.gpt_model_file: save_restore_connector = NLPSaveRestoreConnector() if os.path.isdir(cfg.gpt_model_file): save_restore_connector.model_extracted_dir = cfg.gpt_model_file - pretrained_cfg = MegatronGPTModel.restore_from( + pretrained_cfg = ModelPT.restore_from( restore_path=cfg.gpt_model_file, trainer=trainer, return_config=True, @@ -76,12 +99,39 @@ def nemo_export(cfg): pretrained_cfg.activations_checkpoint_granularity = None pretrained_cfg.activations_checkpoint_method = None pretrained_cfg.precision = trainer.precision - model = MegatronGPTModel.restore_from( + if trainer.precision == "16": + pretrained_cfg.megatron_amp_O2 = False + model = ModelPT.restore_from( restore_path=cfg.gpt_model_file, trainer=trainer, override_config_path=pretrained_cfg, save_restore_connector=save_restore_connector, ) + elif cfg.checkpoint_dir: + app_state = AppState() + if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, + ) + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) + model_cls = get_model_class(cfg) + model = model_cls.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer) + else: + raise ValueError("need at least a nemo file or checkpoint dir") except Exception as e: logging.error( "Failed to restore model from NeMo file : {}. 
Please make sure you have the latest NeMo package installed with [all] dependencies.".format( @@ -92,20 +142,21 @@ def nemo_export(cfg): logging.info("Model {} restored from '{}'".format(model.__class__.__name__, nemo_in)) - if not isinstance(model, Exportable): - logging.error("Your NeMo model class ({}) is not Exportable.".format(model.__class__.__name__)) - sys.exit(1) - # Export check_trace = cfg.export_options.runtime_check try: model.to(device=cfg.export_options.device).freeze() model.eval() - model.mgpt_wrapper().export( + model.export( onnx_out, onnx_opset_version=cfg.export_options.onnx_opset, do_constant_folding=cfg.export_options.do_constant_folding, + dynamic_axes = { + 'input_ids': {0: "sequence", 1: "batch"}, + 'position_ids': {0: "sequence", 1: "batch"}, + 'logits': {0: "sequence", 1: "batch"} + }, check_trace=check_trace, check_tolerance=cfg.export_options.check_tolerance, verbose=cfg.export_options.verbose, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 61fe5696ca29..735bd80b3ac4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -118,22 +118,28 @@ def __init__(self, model): raise ValueError(f"precision: {model.cfg['precision']} is not supported.") def forward(self, tokens, position_ids, attention_mask): - with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast( - enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe - ), torch.no_grad(), torch.inference_mode(), torch.autocast( - 'cuda', dtype=self.dtype - ), warnings.catch_warnings() if self.fp8_enabled and HAVE_TE else torch.no_grad(), torch.inference_mode(), torch.autocast( - 'cuda', dtype=self.dtype - ), warnings.catch_warnings(): - warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') - assert tokens.shape == position_ids.shape - assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] - output_tensor = self.model.forward( - tokens=tokens.cuda(), - text_position_ids=position_ids.cuda(), - attention_mask=attention_mask.cuda(), - labels=None, - ) + if self.fp8_enabled and HAVE_TE: + with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast(enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe), torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + labels=None, + ) + else: + with torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), warnings.catch_warnings(): + warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*') + assert tokens.shape == position_ids.shape + assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1] + output_tensor = self.model.forward( + tokens=tokens.cuda(), + text_position_ids=position_ids.cuda(), + attention_mask=attention_mask.cuda(), + labels=None, + ) return output_tensor @@ 
-155,22 +161,22 @@ def input_example(self, max_batch=1, max_dim=768, seq_len=6): @property def input_types(self) -> Optional[Dict[str, NeuralType]]: return { - "id_tensors": NeuralType(('B', 'T'), ChannelType()), + "input_ids": NeuralType(('B', 'T'), ChannelType()), "position_ids": NeuralType(('B', 'T'), ChannelType()), "attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()), } @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - return {"log_probs": NeuralType(('B', 'T', 'D'), ChannelType())} + return {"logits": NeuralType(('B', 'T', 'D'), ChannelType())} @property def input_names(self) -> List[str]: - return ['id_tensors', 'position_ids', 'attention_mask'] + return ['input_ids', 'position_ids', 'attention_mask'] @property def output_names(self) -> List[str]: - return ['log_probs'] + return ['logits'] class MegatronGPTModel(MegatronBaseModel, TextGeneration): @@ -1056,9 +1062,6 @@ def setup_test_data(self, cfg): ) self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) - def mgpt_wrapper(self): - return MegatronGPTExportableModel(self) - def generate( self, inputs: Union[List[str], torch.Tensor, List[dict]], @@ -1201,6 +1204,13 @@ def parameters(self): else: return self.model.parameters() + @property + def mgpt_wrapper(self): + return MegatronGPTExportableModel(self) + + def list_export_subnets(self): + return ['mgpt_wrapper'] + def _reset_activation_checkpointing_args(self): """ Disables activation checkpointing completely and saves the values so that _restore_activation_checkpointing_args can restore them later. This function must always be From 76e3d8fa0b406240e093ea7879380b6a3453dc18 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 May 2023 23:52:03 +0000 Subject: [PATCH 12/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/language_modeling/megatron_gpt_export.py | 9 +++++---- .../models/language_modeling/megatron_gpt_model.py | 14 ++++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_gpt_export.py index 96d6f4911a4b..bf9157884bfc 100644 --- a/examples/nlp/language_modeling/megatron_gpt_export.py +++ b/examples/nlp/language_modeling/megatron_gpt_export.py @@ -27,10 +27,10 @@ # limitations under the License. 
import os + from omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer -from nemo.core import ModelPT from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -39,12 +39,13 @@ from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.core import ModelPT from nemo.core.config import hydra_runner from nemo.utils import logging - from nemo.utils.app_state import AppState from nemo.utils.model_utils import inject_model_parallel_rank + def get_model_class(cfg): if cfg.model_type == 'gpt': return MegatronGPTModel @@ -152,10 +153,10 @@ def nemo_export(cfg): onnx_out, onnx_opset_version=cfg.export_options.onnx_opset, do_constant_folding=cfg.export_options.do_constant_folding, - dynamic_axes = { + dynamic_axes={ 'input_ids': {0: "sequence", 1: "batch"}, 'position_ids': {0: "sequence", 1: "batch"}, - 'logits': {0: "sequence", 1: "batch"} + 'logits': {0: "sequence", 1: "batch"}, }, check_trace=check_trace, check_tolerance=cfg.export_options.check_tolerance, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 735bd80b3ac4..ef27450ff159 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -13,10 +13,10 @@ # limitations under the License. 
 import itertools
-import warnings
 import queue
+import warnings
 from functools import partial
-from typing import Any, Iterator, List, Optional, Union, Dict
+from typing import Any, Dict, Iterator, List, Optional, Union
 
 import numpy as np
 import torch
@@ -119,7 +119,11 @@ def __init__(self, model):
 
     def forward(self, tokens, position_ids, attention_mask):
         if self.fp8_enabled and HAVE_TE:
-            with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast(enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe), torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), warnings.catch_warnings():
+            with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast(
+                enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe
+            ), torch.no_grad(), torch.inference_mode(), torch.autocast(
+                'cuda', dtype=self.dtype
+            ), warnings.catch_warnings():
                 warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*')
                 assert tokens.shape == position_ids.shape
                 assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1]
@@ -130,7 +134,9 @@ def forward(self, tokens, position_ids, attention_mask):
                     labels=None,
                 )
         else:
-            with torch.no_grad(), torch.inference_mode(), torch.autocast('cuda', dtype=self.dtype), warnings.catch_warnings():
+            with torch.no_grad(), torch.inference_mode(), torch.autocast(
+                'cuda', dtype=self.dtype
+            ), warnings.catch_warnings():
                 warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*')
                 assert tokens.shape == position_ids.shape
                 assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1]

From 7948f3517da62369d0adde4b1469b85e710a7632 Mon Sep 17 00:00:00 2001
From: Asfiya Baig
Date: Wed, 17 May 2023 00:04:32 +0000
Subject: [PATCH 13/14] typecast precision check for fp16

Signed-off-by: Asfiya Baig
---
 nemo/collections/nlp/modules/common/megatron/attention.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/nlp/modules/common/megatron/attention.py b/nemo/collections/nlp/modules/common/megatron/attention.py
index c025c1fc32ba..aaeb05d43cde 100644
--- a/nemo/collections/nlp/modules/common/megatron/attention.py
+++ b/nemo/collections/nlp/modules/common/megatron/attention.py
@@ -697,8 +697,12 @@ def __init__(
         super(CoreAttention, self).__init__()
 
         self.precision = precision
-        self.fp16 = precision == 16
-        self.bf16 = precision == 'bf16'
+        self.fp16 = False
+        self.bf16 = False
+        if precision == 'bf16':
+            self.bf16 = True
+        elif int(precision) == 16:
+            self.fp16 = True
 
         self.multi_query_attention = multi_query_attention
         self.apply_query_key_layer_scaling = apply_query_key_layer_scaling

From 4a83c47e62b2625d36400096e0dd6ccd890e8ceb Mon Sep 17 00:00:00 2001
From: Asfiya Baig
Date: Thu, 18 May 2023 19:01:39 +0000
Subject: [PATCH 14/14] rename export script

Signed-off-by: Asfiya Baig
---
 .../{megatron_gpt_export.py => megatron_export.py}          | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/nlp/language_modeling/{megatron_gpt_export.py => megatron_export.py} (100%)

diff --git a/examples/nlp/language_modeling/megatron_gpt_export.py b/examples/nlp/language_modeling/megatron_export.py
similarity index 100%
rename from examples/nlp/language_modeling/megatron_gpt_export.py
rename to examples/nlp/language_modeling/megatron_export.py
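
Usage note: with the series applied, a GPT checkpoint trained with fp8: true can be exported to ONNX by restoring it and calling its export() method; the Transformer Engine onnx_export / fp8_autocast contexts run inside MegatronGPTExportableModel.forward(), which list_export_subnets() exposes as the mgpt_wrapper subnet. The sketch below is a minimal single-GPU illustration with placeholder paths (tensor/pipeline parallel size 1); it omits the config overrides (sequence parallelism, activation checkpointing, precision) that the full megatron_export.py script applies before restoring, so treat it as an outline rather than a drop-in replacement for the script.

    # Minimal sketch, not the shipped script: restore a .nemo GPT checkpoint and
    # export it to ONNX. Assumes transformer_engine is installed, the model fits
    # on one GPU (TP=PP=1), and both paths below are placeholders.
    from pytorch_lightning import Trainer

    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
    from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector

    trainer = Trainer(strategy=NLPDDPStrategy(), devices=1, accelerator='gpu', precision='bf16')
    model = MegatronGPTModel.restore_from(
        restore_path='/path/to/megatron_gpt.nemo',  # placeholder checkpoint path
        trainer=trainer,
        save_restore_connector=NLPSaveRestoreConnector(),
    )
    model.to(device='cuda').freeze()
    model.eval()

    # Exportable.export() walks list_export_subnets(); after this series that returns
    # ['mgpt_wrapper'], i.e. MegatronGPTExportableModel, whose forward() enables
    # te.onnx_export / te.fp8_autocast when the restored config has fp8: true.
    model.export(
        '/path/to/megatron_gpt_fp8.onnx',  # placeholder output path
        onnx_opset_version=17,
        do_constant_folding=True,
        check_trace=False,
    )

The shipped script drives the same flow through Hydra, roughly: python examples/nlp/language_modeling/megatron_export.py model_type=gpt gpt_model_file=/path/to/megatron_gpt.nemo onnx_model_file=/path/to/megatron_gpt_fp8.onnx, with opset, constant folding, runtime check, and tolerance taken from the export_options block of megatron_gpt_export.yaml.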