Draft: Expose MCore Cudagraph interface (NVIDIA#10121)
* cuda graph modules

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* bug fixes

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* add mcore cuda graph interface

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* add mcore cuda graph interface

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* fix whitespace

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* change te import path

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* add cudagraph manager

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: JimmyZhang12 <JimmyZhang12@users.noreply.github.com>

* separate external cudagraph flag

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: JimmyZhang12 <JimmyZhang12@users.noreply.github.com>

* separate config flags

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: JimmyZhang12 <JimmyZhang12@users.noreply.github.com>

* update mcore changes

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* add check

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: JimmyZhang12 <JimmyZhang12@users.noreply.github.com>

---------

Signed-off-by: Jimmy Zhang <jiemingz@nvidia.com>
Signed-off-by: JimmyZhang12 <JimmyZhang12@users.noreply.github.com>
Co-authored-by: Jimmy Zhang <jiemingz@nvidia.com>
Co-authored-by: JimmyZhang12 <JimmyZhang12@users.noreply.github.com>
Co-authored-by: Pablo Garay <palenq@gmail.com>
Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
Signed-off-by: George Armstrong <georgea@nvidia.com>
5 people authored and gwarmstrong committed Sep 19, 2024
1 parent 3ce2c70 commit 34041b5
Showing 2 changed files with 22 additions and 3 deletions.
@@ -15,8 +15,8 @@
from importlib.metadata import version
from typing import Any, Callable, Optional

import packaging
import torch
from pkg_resources import packaging

from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults
from nemo.collections.nlp.parts import utils_funcs
@@ -30,6 +30,7 @@
try:
from megatron.core import parallel_state, tensor_parallel
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.cuda_graphs import CudaGraphManager
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules, get_num_layers_to_build
from megatron.core.transformer.transformer_layer import BaseTransformerLayer
@@ -235,6 +236,10 @@ def __init__(self, config, layer_number=1, hidden_dropout=None):
transformer_layer_args["ub_atomic_gemm_rs"] = config.tp_comm_atomic_rs
super().__init__(**transformer_layer_args)

if self.config.enable_cuda_graph and self.training:
assert not config.cpu_offloading and config.recompute_granularity is None, "Cudagraphs not supported"
self.add_module('cudagraph_manager', CudaGraphManager())

# Called by MCore's TransformerBlock.forward
# megatron/core/transformer/transformer_block.py
def forward(
@@ -261,8 +266,8 @@ def forward(
self.is_first_microbatch = False
context = None

# CUDA graph requires returned values to be Tensors
if self.config.enable_cuda_graph and self.training:
# External CUDA graph requires returned values to be Tensors
if hasattr(self.config, 'external_cuda_graph') and self.config.external_cuda_graph and self.training:
return hidden_states
return hidden_states, context
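
The hunk above swaps the early-return condition from `enable_cuda_graph` to the new `external_cuda_graph` flag: when graphs are captured outside NeMo, the layer has to return a bare Tensor instead of the usual `(hidden_states, context)` pair, since `context` is `None` at this point and graph capture only tolerates Tensor outputs. A minimal illustration of that constraint with stock PyTorch is below; `torch.cuda.make_graphed_callables` stands in for whatever external tool performs the capture (it is not the interface added by this commit) and the snippet needs a CUDA device:

    # Illustration only: graph-captured callables must return Tensors.
    # make_graphed_callables is a stand-in for the external capture path,
    # not the mechanism this commit adds. Requires a GPU.
    import torch
    from torch import nn


    class TensorOnlyLayer(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(16, 16)

        def forward(self, hidden_states):
            # Returning (hidden_states, None) is what the NeMo layer avoids
            # here: graph capture expects Tensor (or tuple-of-Tensor) outputs.
            return self.linear(hidden_states)


    layer = TensorOnlyLayer().cuda()
    sample = torch.randn(4, 16, device='cuda', requires_grad=True)
    graphed = torch.cuda.make_graphed_callables(layer, (sample,))
    out = graphed(sample)  # replays the captured forward graph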

@@ -318,6 +323,11 @@ def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = (), meta

return sharded_state_dict

def __call__(self, *args, **kwargs):
if hasattr(self, 'cudagraph_manager'):
return self.cudagraph_manager(self, args, kwargs)
return super().__call__(*args, **kwargs)


# Use this spec to use the full Transformer layer from Transformer Engine
def get_gpt_full_te_layer_autocast_spec(transformer_config) -> ModuleSpec:
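
The layer-side changes above attach MCore's `CudaGraphManager` as a submodule during `__init__` (training only, with CPU offloading and activation recomputation ruled out) and override `__call__` so every invocation is routed through the manager instead of going straight to `forward`. Below is a self-contained sketch of that interception pattern; the `Sketch*` classes are illustrative stand-ins, and only the `add_module('cudagraph_manager', ...)` registration and the `__call__` override mirror the diff.

    # Minimal sketch of the call-interception pattern (illustrative names).
    import torch
    from torch import nn


    class SketchGraphManager(nn.Module):
        """Stand-in for megatron.core.transformer.cuda_graphs.CudaGraphManager.
        A real manager captures the layer's forward pass into a CUDA graph
        during warm-up and replays it afterwards; this stub just runs the
        forward eagerly so the control flow stays visible."""

        def forward(self, module, args, kwargs):
            return module.forward(*args, **kwargs)


    class SketchLayer(nn.Module):
        def __init__(self, hidden_size, enable_cuda_graph=False, training=True):
            super().__init__()
            self.linear = nn.Linear(hidden_size, hidden_size)
            if enable_cuda_graph and training:
                # Same registration pattern as the diff.
                self.add_module('cudagraph_manager', SketchGraphManager())

        def forward(self, hidden_states):
            return self.linear(hidden_states)

        def __call__(self, *args, **kwargs):
            # Mirrors the diff: hand the manager the module plus raw args/kwargs.
            if hasattr(self, 'cudagraph_manager'):
                return self.cudagraph_manager(self, args, kwargs)
            return super().__call__(*args, **kwargs)


    layer = SketchLayer(8, enable_cuda_graph=True)
    out = layer(torch.randn(2, 8))  # routed through SketchGraphManager

Guarding on `hasattr(self, 'cudagraph_manager')` means layers built without `enable_cuda_graph` take the normal `nn.Module.__call__` path and behave exactly as before. The remaining hunks below are in the second changed file, where `build_transformer_config` validates and forwards the new flag.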
@@ -832,6 +832,7 @@ def training_step(self, dataloader_iter):
module = module.module
if not self.mcore_gpt:
module = module.language_model

if hasattr(module, 'embedding'):
for param in module.embedding.parameters():
param.data_ptr()
@@ -2115,6 +2116,13 @@ def build_transformer_config(self) -> TransformerConfig:
else:
raise ValueError(f"fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.")

if self.cfg.get('enable_cuda_graph', False):
assert HAVE_TE, "Transformer Engine is required for cudagraphs."
assert self.cfg.get(
'use_te_rng_tracker', False
), "Transformer engine's RNG tracker is required for cudagraphs, this can be enabled with \
'use_te_rng_tracker=True'."

# any configs that are not in the nemo model config will be added here
model_specific_configs = {
'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma,
@@ -2132,6 +2140,7 @@ def build_transformer_config(self) -> TransformerConfig:
'moe_z_loss_coeff': self.cfg.get('moe_z_loss_coeff', None), # 1e-3 would be a good start value for z-loss
'moe_input_jitter_eps': self.cfg.get('moe_input_jitter_eps', None),
'moe_token_dropping': self.cfg.get('moe_token_dropping', False), # TODO: Support token dropping.
'enable_cuda_graph': self.cfg.get('enable_cuda_graph', False),
}
if model_specific_configs['num_moe_experts'] is not None:
assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE'
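
On the config side, `build_transformer_config` now refuses `enable_cuda_graph` unless Transformer Engine is available and `use_te_rng_tracker=True`, and it forwards the flag into the `TransformerConfig` through `model_specific_configs`. A hedged sketch of how those checks compose with the layer-level assertion is below; the OmegaConf wrapper and the helper function are illustrative, and only the key names and assertion messages come from this diff.

    # Hedged sketch: key names mirror the diff; the OmegaConf wrapper and the
    # helper function are illustrative, not part of NeMo.
    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            'enable_cuda_graph': True,      # forwarded into TransformerConfig
            'use_te_rng_tracker': True,     # required by the new check
            'external_cuda_graph': False,   # alternate path: graphs captured outside NeMo
            'cpu_offloading': False,        # ruled out by the layer-level assertion
            'recompute_granularity': None,  # likewise
        }
    )


    def check_cuda_graph_cfg(cfg, have_te=True):
        # Mirrors the validation added in build_transformer_config and the layer __init__.
        if cfg.get('enable_cuda_graph', False):
            assert have_te, "Transformer Engine is required for cudagraphs."
            assert cfg.get('use_te_rng_tracker', False), (
                "Transformer engine's RNG tracker is required for cudagraphs, "
                "this can be enabled with 'use_te_rng_tracker=True'."
            )
            assert not cfg.cpu_offloading and cfg.recompute_granularity is None, "Cudagraphs not supported"


    check_cuda_graph_cfg(cfg)

With `enable_cuda_graph` left at its default of `False`, none of the new assertions fire and the existing code path is unchanged.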
