Enable ONNX export of 5B GPT trained with TE FP8 modules #6458

Merged
merged 16 commits into from
May 22, 2023
25 changes: 25 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_gpt_export.yaml
@@ -0,0 +1,25 @@
trainer:
devices: 1
num_nodes: 1
accelerator: gpu
logger: False # logger provided by exp_manager
precision: bf16 # 16, 32, or bf16

model_type: gpt
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: -1 # used for encoder-decoder models (0 for others)
gpt_model_file: null # GPT nemo file path
onnx_model_file: null # ONNX file path
checkpoint_dir: null # Checkpoint directory
checkpoint_name: null # Checkpoint name
hparams_file: null # hparams filepath

export_options:
runtime_check: False
verbose: False
onnx_opset: 17
do_constant_folding: True
cache_support: False
device: 'cuda'
check_tolerance: 0.01
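
A hedged usage sketch (paths are placeholders, not taken from this PR): the export script below reads this config through @hydra_runner, so any key above can be overridden on the command line, e.g.

python examples/nlp/language_modeling/megatron_export.py \
    gpt_model_file=/path/to/gpt.nemo \
    onnx_model_file=/path/to/gpt.onnx \
    export_options.runtime_check=True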
175 changes: 175 additions & 0 deletions examples/nlp/language_modeling/megatron_export.py
@@ -0,0 +1,175 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer

from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel
from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel
from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
from nemo.core import ModelPT
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.app_state import AppState
from nemo.utils.model_utils import inject_model_parallel_rank


def get_model_class(cfg):
if cfg.model_type == 'gpt':
return MegatronGPTModel
elif cfg.model_type == 'bert':
return MegatronBertModel
elif cfg.model_type == 't5':
return MegatronT5Model
elif cfg.model_type == 'bart':
return MegatronBARTModel
elif cfg.model_type == 'nmt':
return MegatronNMTModel
elif cfg.model_type == 'retro':
return MegatronRetrievalModel
else:
raise ValueError(f"Invalid model type: {cfg.model_type}")


@hydra_runner(config_path="conf", config_name="megatron_gpt_export")
def nemo_export(cfg):
"""Convert a nemo model into .onnx ONNX format."""
nemo_in = None
if cfg.gpt_model_file:
nemo_in = cfg.gpt_model_file
elif cfg.checkpoint_dir:
nemo_in = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)
assert nemo_in is not None, "NeMo model not provided. Please provide the path to the .nemo or .ckpt file"

onnx_out = cfg.onnx_model_file

trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
assert (
cfg.trainer.devices * cfg.trainer.num_nodes
== cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

logging.info("Restoring NeMo model from '{}'".format(nemo_in))
try:
if cfg.gpt_model_file:
save_restore_connector = NLPSaveRestoreConnector()
if os.path.isdir(cfg.gpt_model_file):
save_restore_connector.model_extracted_dir = cfg.gpt_model_file

pretrained_cfg = ModelPT.restore_from(
restore_path=cfg.gpt_model_file,
trainer=trainer,
return_config=True,
save_restore_connector=save_restore_connector,
)
OmegaConf.set_struct(pretrained_cfg, True)
with open_dict(pretrained_cfg):
pretrained_cfg.sequence_parallel = False
pretrained_cfg.activations_checkpoint_granularity = None
pretrained_cfg.activations_checkpoint_method = None
pretrained_cfg.precision = trainer.precision
if trainer.precision == "16":
pretrained_cfg.megatron_amp_O2 = False
model = ModelPT.restore_from(
restore_path=cfg.gpt_model_file,
trainer=trainer,
override_config_path=pretrained_cfg,
save_restore_connector=save_restore_connector,
)
elif cfg.checkpoint_dir:
app_state = AppState()
if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1:
app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size
app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size
(
app_state.tensor_model_parallel_rank,
app_state.pipeline_model_parallel_rank,
app_state.model_parallel_size,
app_state.data_parallel_size,
app_state.pipeline_model_parallel_split_rank,
app_state.virtual_pipeline_model_parallel_rank,
) = fake_initialize_model_parallel(
world_size=app_state.model_parallel_size,
rank=trainer.global_rank,
tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
)
checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
model_cls = get_model_class(cfg)
model = model_cls.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
else:
raise ValueError("need at least a nemo file or checkpoint dir")
except Exception as e:
logging.error(
"Failed to restore model from NeMo file : {}. Please make sure you have the latest NeMo package installed with [all] dependencies.".format(
nemo_in
)
)
raise e

logging.info("Model {} restored from '{}'".format(model.__class__.__name__, nemo_in))

# Export
check_trace = cfg.export_options.runtime_check

try:
model.to(device=cfg.export_options.device).freeze()
model.eval()
model.export(
onnx_out,
onnx_opset_version=cfg.export_options.onnx_opset,
do_constant_folding=cfg.export_options.do_constant_folding,
dynamic_axes={
'input_ids': {0: "sequence", 1: "batch"},
'position_ids': {0: "sequence", 1: "batch"},
'logits': {0: "sequence", 1: "batch"},
},
check_trace=check_trace,
check_tolerance=cfg.export_options.check_tolerance,
verbose=cfg.export_options.verbose,
)
except Exception as e:
logging.error(
"Export failed. Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format(
model.__class__
)
)
raise e


if __name__ == '__main__':
nemo_export()
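
A hedged follow-up sketch, not part of this PR: one way to sanity-check the exported file with onnxruntime (an assumed extra dependency). The input and output names come from the wrapper's input_names/output_names defined further below; shapes and dtypes are illustrative and lean on the dynamic sequence/batch axes declared in model.export() above.

import numpy as np
import onnxruntime as ort  # assumed installed; not a NeMo requirement

sess = ort.InferenceSession("gpt.onnx", providers=["CPUExecutionProvider"])
batch, seq = 1, 6
feed = {
    "input_ids": np.zeros((batch, seq), dtype=np.int64),
    "position_ids": np.tile(np.arange(seq, dtype=np.int64), (batch, 1)),
    # boolean causal mask; True marks positions to mask out, as in
    # get_ltor_masks_and_position_ids
    "attention_mask": np.triu(np.ones((1, 1, seq, seq), dtype=bool), k=1),
}
(logits,) = sess.run(None, feed)
print(logits.shape)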
106 changes: 105 additions & 1 deletion nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -14,8 +14,9 @@

import itertools
import queue
import warnings
from functools import partial
-from typing import Any, Iterator, List, Optional, Union
+from typing import Any, Dict, Iterator, List, Optional, Union

import numpy as np
import torch
@@ -36,6 +36,7 @@
from nemo.collections.nlp.modules.common.megatron.utils import (
average_losses_across_data_parallel_group,
get_all_params_for_weight_decay_optimization,
get_ltor_masks_and_position_ids,
get_params_for_weight_decay_optimization,
)
from nemo.collections.nlp.modules.common.text_generation_utils import (
@@ -53,7 +55,9 @@
)
from nemo.collections.nlp.parts.nlp_overrides import GradScaler
from nemo.collections.nlp.parts.utils_funcs import get_last_rank
from nemo.core.classes import Exportable
from nemo.core.classes.common import PretrainedModelInfo
from nemo.core.neural_types import ChannelType, NeuralType
from nemo.utils import logging

try:
@@ -88,6 +92,99 @@
HAVE_TE = False


class MegatronGPTExportableModel(torch.nn.Module, Exportable):
"""
Megatron GPT Wrapper for ONNX export
"""

def __init__(self, model):
super().__init__()
self.model = model
self.fp8_enabled = model.cfg.get('fp8', False)
self.fp8_recipe = None
if self.fp8_enabled and HAVE_TE:
self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling(
margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3
)

self.dtype = None
if model.cfg['precision'] == 'bf16':
self.dtype = torch.bfloat16
elif int(model.cfg['precision']) == 32:
self.dtype = torch.float
elif int(model.cfg['precision']) == 16:
self.dtype = torch.float16
else:
raise ValueError(f"precision: {model.cfg['precision']} is not supported.")

def forward(self, tokens, position_ids, attention_mask):
# Both branches below trace the same forward pass; the FP8 path additionally
# enters TE's onnx_export and fp8_autocast contexts.
if self.fp8_enabled and HAVE_TE:
with transformer_engine.pytorch.onnx_export(self.fp8_enabled), transformer_engine.pytorch.fp8_autocast(
enabled=self.fp8_enabled, fp8_recipe=self.fp8_recipe
), torch.no_grad(), torch.inference_mode(), torch.autocast(
'cuda', dtype=self.dtype
), warnings.catch_warnings():
warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*')
assert tokens.shape == position_ids.shape
assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1]
output_tensor = self.model.forward(
tokens=tokens.cuda(),
text_position_ids=position_ids.cuda(),
attention_mask=attention_mask.cuda(),
labels=None,
)
else:
with torch.no_grad(), torch.inference_mode(), torch.autocast(
'cuda', dtype=self.dtype
), warnings.catch_warnings():
warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning, module=r'.*')
assert tokens.shape == position_ids.shape
assert attention_mask.shape[2] == attention_mask.shape[3] == tokens.shape[1] == position_ids.shape[1]
output_tensor = self.model.forward(
tokens=tokens.cuda(),
text_position_ids=position_ids.cuda(),
attention_mask=attention_mask.cuda(),
labels=None,
)

return output_tensor

def freeze(self):
for param in self.parameters():
param.requires_grad = False

def input_example(self, max_batch=1, max_dim=768, seq_len=6):
# Check notice — Code scanning / CodeQL: explicit returns mixed with implicit
# (fall-through) returns; the implicit path returns None if the loop below
# never executes.
ids = [self.model.tokenizer.text_to_ids(text) for text in ["how is the weather on Sunday"]]
id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids]
masks_and_position_ids = [
get_ltor_masks_and_position_ids(id_tensor, self.model.tokenizer.eos_id, False, False, False)
for id_tensor in id_tensors
]
for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids):
attn_mask, _, pos_ids = attn_mask_and_pos_ids
return tokens, pos_ids, attn_mask

@property
def input_types(self) -> Optional[Dict[str, NeuralType]]:
return {
"input_ids": NeuralType(('B', 'T'), ChannelType()),
"position_ids": NeuralType(('B', 'T'), ChannelType()),
"attention_mask": NeuralType(('D', 'D', 'T', 'T'), ChannelType()),
}

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
return {"logits": NeuralType(('B', 'T', 'D'), ChannelType())}

@property
def input_names(self) -> List[str]:
return ['input_ids', 'position_ids', 'attention_mask']

@property
def output_names(self) -> List[str]:
return ['logits']
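
A minimal usage sketch for the wrapper above (hedged: assumes `gpt` is an already-restored MegatronGPTModel and a CUDA device, since forward() moves inputs via .cuda()):

wrapper = MegatronGPTExportableModel(gpt)  # or gpt.mgpt_wrapper, added further below
wrapper.freeze()
tokens, pos_ids, attn_mask = wrapper.input_example()
logits = wrapper(tokens, pos_ids, attn_mask)  # the same call that gets traced during export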


class MegatronGPTModel(MegatronBaseModel, TextGeneration):
"""
Megatron GPT pretraining
@@ -1113,6 +1210,13 @@ def parameters(self):
else:
return self.model.parameters()

@property
def mgpt_wrapper(self):
"""ONNX-export-friendly view of this model (see MegatronGPTExportableModel above)."""
return MegatronGPTExportableModel(self)

def list_export_subnets(self):
"""Point Exportable.export() at the wrapper above instead of the raw training module."""
return ['mgpt_wrapper']

def _reset_activation_checkpointing_args(self):
""" Disables activation checkpointing completely and saves the values so that
_restore_activation_checkpointing_args can restore them later. This function must always be
8 changes: 6 additions & 2 deletions nemo/collections/nlp/modules/common/megatron/attention.py
@@ -697,8 +697,12 @@ def __init__(
super(CoreAttention, self).__init__()

self.precision = precision
-self.fp16 = precision == 16
-self.bf16 = precision == 'bf16'
+self.fp16 = False
+self.bf16 = False
+if precision == 'bf16':
+self.bf16 = True
+elif int(precision) == 16:
+self.fp16 = True
self.multi_query_attention = multi_query_attention

self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
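
A hedged aside on the change above (module.py below gets the identical reordering): precision can arrive as the int 16 or the string "16" — megatron_export.py above compares trainer.precision == "16" — so int(precision) == 16 normalizes both, and 'bf16' must be checked first because int('bf16') would raise ValueError. A standalone sketch of the same normalization:

for p in ('bf16', 16, '16'):
    is_bf16 = p == 'bf16'
    is_fp16 = not is_bf16 and int(p) == 16
    print(p, '->', 'bf16' if is_bf16 else 'fp16' if is_fp16 else 'other')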
12 changes: 6 additions & 6 deletions nemo/collections/nlp/modules/common/megatron/module.py
@@ -262,17 +262,17 @@ def __init__(self, module, precision):
super().__init__()
self.precision = precision

-if precision == 16:
-self.add_module('module', module.half())
+if precision == 'bf16':
+self.add_module('module', module.bfloat16())

def float16_converter(val):
-return val.half()
+return val.bfloat16()

-elif precision == 'bf16':
-self.add_module('module', module.bfloat16())
+elif int(precision) == 16:
+self.add_module('module', module.half())

def float16_converter(val):
-return val.bfloat16()
+return val.half()

else:
raise Exception(
3 changes: 2 additions & 1 deletion nemo/utils/export_utils.py
@@ -309,7 +309,8 @@ def replace_FusedScaleMaskSoftmax(n: nn.Module) -> Optional[nn.Linear]:
Equivalent LayerNorm module
"""
if not isinstance(n, FusedScaleMaskSoftmax):
raise ValueError("This function can only change the FusedScaleMaskSoftmax module.")
logging.warning("This function can only change the FusedScaleMaskSoftmax module.")
return n

# disable the fusion only
mod = FusedScaleMaskSoftmax(
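
A minimal sketch of the replace-helper contract this change relies on (illustrative names, not the actual NeMo API): returning the module unchanged instead of raising lets a module-walking loop apply a helper to every submodule and swap only where it matches.

import torch.nn as nn

def maybe_replace(m: nn.Module) -> nn.Module:
    if not isinstance(m, nn.Linear):
        return m  # not applicable: hand the module back unchanged, as above
    return nn.Identity()  # export-friendly stand-in

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
model = nn.Sequential(*(maybe_replace(c) for c in model.children()))
print(model)  # Identity() replaced the Linear; ReLU passed through untouched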