diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
index cdff4a0036d1..b04ff248a326 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
@@ -15,8 +15,8 @@
 from importlib.metadata import version
 from typing import Any, Callable, Optional
 
-import packaging
 import torch
+from pkg_resources import packaging
 
 from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults
 from nemo.collections.nlp.parts import utils_funcs
@@ -30,6 +30,7 @@ try:
     from megatron.core import parallel_state, tensor_parallel
     from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
+    from megatron.core.transformer.cuda_graphs import CudaGraphManager
     from megatron.core.transformer.spec_utils import ModuleSpec
     from megatron.core.transformer.transformer_block import TransformerBlockSubmodules, get_num_layers_to_build
     from megatron.core.transformer.transformer_layer import BaseTransformerLayer
@@ -235,6 +236,10 @@ def __init__(self, config, layer_number=1, hidden_dropout=None):
             transformer_layer_args["ub_atomic_gemm_rs"] = config.tp_comm_atomic_rs
         super().__init__(**transformer_layer_args)
 
+        if self.config.enable_cuda_graph and self.training:
+            assert not config.cpu_offloading and config.recompute_granularity is None, "Cudagraphs not supported"
+            self.add_module('cudagraph_manager', CudaGraphManager())
+
     # Called by MCore's TransformerBlock.forward
     # megatron/core/transformer/transformer_block.py
     def forward(
@@ -261,8 +266,8 @@ def forward(
         self.is_first_microbatch = False
         context = None
 
-        # CUDA graph requires returned values to be Tensors
-        if self.config.enable_cuda_graph and self.training:
+        # External CUDA graph requires returned values to be Tensors
+        if hasattr(self.config, 'external_cuda_graph') and self.config.external_cuda_graph and self.training:
             return hidden_states
         return hidden_states, context
 
@@ -318,6 +323,11 @@ def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = (), meta
 
         return sharded_state_dict
 
+    def __call__(self, *args, **kwargs):
+        if hasattr(self, 'cudagraph_manager'):
+            return self.cudagraph_manager(self, args, kwargs)
+        return super().__call__(*args, **kwargs)
+
 
 # Use this spec to use the full Transformer layer from Transformer Engine
 def get_gpt_full_te_layer_autocast_spec(transformer_config) -> ModuleSpec:
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 1ae05b58890a..e3d426ec9275 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -832,6 +832,7 @@ def training_step(self, dataloader_iter):
                     module = module.module
                 if not self.mcore_gpt:
                     module = module.language_model
+
                 if hasattr(module, 'embedding'):
                     for param in module.embedding.parameters():
                         param.data_ptr()
@@ -2115,6 +2116,13 @@ def build_transformer_config(self) -> TransformerConfig:
             else:
                 raise ValueError(f"fp8 enabled but fp8_format (fp8_e4m3 | fp8_hybrid) is not set.")
 
+        if self.cfg.get('enable_cuda_graph', False):
+            assert HAVE_TE, "Transformer Engine is required for cudagraphs."
+            assert self.cfg.get(
+                'use_te_rng_tracker', False
+            ), "Transformer engine's RNG tracker is required for cudagraphs, this can be enabled with \
+                'use_te_rng_tracker=True'."
+
         # any configs that are not in the nemo model config will be added here
         model_specific_configs = {
             'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma,
@@ -2132,6 +2140,7 @@ def build_transformer_config(self) -> TransformerConfig:
             'moe_z_loss_coeff': self.cfg.get('moe_z_loss_coeff', None),  # 1e-3 would be a good start value for z-loss
             'moe_input_jitter_eps': self.cfg.get('moe_input_jitter_eps', None),
             'moe_token_dropping': self.cfg.get('moe_token_dropping', False),  # TODO: Support token dropping.
+            'enable_cuda_graph': self.cfg.get('enable_cuda_graph', False),
         }
         if model_specific_configs['num_moe_experts'] is not None:
            assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE'
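Usage note (not part of the patch): the asserts added to build_transformer_config() imply how a model config has to be set up before CUDA graphs can be enabled. The snippet below is a minimal sketch, assuming a standard NeMo GPT YAML loaded with OmegaConf; the file path is hypothetical, and only the enable_cuda_graph and use_te_rng_tracker keys come directly from this diff.

# Minimal sketch of opting into CUDA graphs under the constraints asserted above.
from omegaconf import OmegaConf

cfg = OmegaConf.load("megatron_gpt_config.yaml")  # hypothetical config path
cfg.model.enable_cuda_graph = True   # read by build_transformer_config() and forwarded into TransformerConfig
cfg.model.use_te_rng_tracker = True  # required by the assert above whenever enable_cuda_graph is set
# The layer-level assert additionally requires cpu_offloading to be off and
# recompute_granularity to be None on the resulting TransformerConfig.

Transformer Engine itself must also be importable (the HAVE_TE assert), so this only applies to TE-based builds.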