diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
index bb1e854e09c40..938b9fbddc39f 100644
--- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
+++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
@@ -739,8 +739,7 @@ def dummy():
                 config=self.transformer_config,
                 transformer_layer_spec=get_specs(
                     self.spec_name,
-                    self.transformer_config.num_moe_experts,
-                    self.transformer_config.moe_grouped_gemm,
+                    self.transformer_config,
                     self.transformer_engine,
                 ),
                 vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size),
diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py
index a197ae2918806..d4aa3755b385b 100644
--- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py
+++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py
@@ -501,8 +501,7 @@ def __init__(
             add_class_token = True
         vision_layer_spec = get_specs(
             model_cfg.text.get('name', ''),
-            vision_transformer_config.num_moe_experts,
-            vision_transformer_config.moe_grouped_gemm,
+            vision_transformer_config,
             model_cfg.get('transformer_engine', True),
         )
         vision_layer_spec.submodules.self_attention.params['attn_mask_type'] = MCoreAttnMaskType.no_mask
@@ -527,8 +526,7 @@ def __init__(
             config=text_transformer_config,
             transformer_layer_spec=get_specs(
                 model_cfg.text.get('name', ''),
-                text_transformer_config.num_moe_experts,
-                text_transformer_config.moe_grouped_gemm,
+                text_transformer_config,
                 model_cfg.get('transformer_engine', True),
             ),
             vocab_size=model_cfg.text.get('override_vocab_size', padded_vocab_size),
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
index 6cce2b42be9c0..f3299d488fd01 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py
@@ -35,7 +35,9 @@ try:
     from megatron.core import parallel_state, tensor_parallel
+    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
     from megatron.core.transformer.spec_utils import ModuleSpec
+    from megatron.core.transformer.transformer_block import TransformerBlockSubmodules, get_num_layers_to_build
     from megatron.core.transformer.transformer_layer import BaseTransformerLayer
     from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
@@ -322,8 +324,10 @@ def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = (), meta
 # Use this spec to use the full Transformer layer from Transformer Engine
-def get_gpt_full_te_layer_autocast_spec() -> ModuleSpec:
+def get_gpt_full_te_layer_autocast_spec(transformer_config) -> ModuleSpec:
     if not HAVE_MEGATRON_CORE or not HAVE_TE:
         raise ImportError(IMPORT_ERROR)
-
-    return ModuleSpec(module=TETransformerLayerAutocast)
+    num_layers = get_num_layers_to_build(transformer_config)
+    return TransformerBlockSubmodules(
+        layer_specs=[ModuleSpec(module=TETransformerLayerAutocast)] * num_layers, layer_norm=FusedLayerNorm
+    )
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 41d85d48e497a..6e7a145679e01 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -138,7 +138,11 @@ def mcore_supports_moe() -> bool:
 ## TODO: This function will not work if TE is not installed
-def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True, hyena_cfg: Dict = None):
+def get_specs(spec_name, transformer_config=None, use_te=True, hyena_cfg: Dict = None):
+    # else cases for backwards compatibility with neva
+    num_experts = transformer_config.num_moe_experts if transformer_config else None
+    moe_grouped_gemm = transformer_config.moe_grouped_gemm if transformer_config else False
+
     if num_experts is not None:
         assert mcore_supports_moe(), "Megatron-core >= v0.5.0 is required for MoE"
@@ -148,7 +152,7 @@ def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True,
         "": get_gpt_layer_local_spec(num_experts, moe_grouped_gemm),
         "te_gpt": get_gpt_layer_with_transformer_engine_spec(num_experts, moe_grouped_gemm),
         "megatron_falcon_gpt": get_falcon_layer_spec(),
-        "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(),
+        "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(transformer_config),
         "modelopt": get_gpt_layer_modelopt_spec(num_experts),
         "te_gpt_hyena": get_gpt_layer_with_te_and_hyena_spec(hyena_cfg),
     }
@@ -415,8 +419,7 @@ def model_provider_func(self, pre_process, post_process):
             config=self.transformer_config,
             transformer_layer_spec=get_specs(
                 self.spec_name,
-                self.transformer_config.num_moe_experts,
-                self.transformer_config.moe_grouped_gemm,
+                self.transformer_config,
                 self.transformer_engine,
                 self.cfg.get('hyena', None),
             ),
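
A minimal usage sketch of the new get_specs() signature, not part of the patch. It assumes NeMo, Megatron-Core and Transformer Engine are installed and that megatron.core.parallel_state has already been initialized (the full-TE-layer spec now calls get_num_layers_to_build(), which reads the pipeline-parallel world size); the TransformerConfig values below are illustrative only.

from megatron.core.transformer.transformer_config import TransformerConfig

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import get_specs

# Illustrative config; real runs build this from the model cfg. The MoE fields are the
# ones get_specs() now reads off the config instead of taking as positional arguments.
transformer_config = TransformerConfig(
    num_layers=2,
    hidden_size=128,
    num_attention_heads=8,
    num_moe_experts=None,
    moe_grouped_gemm=False,
)

# Old call: get_specs(spec_name, num_experts, moe_grouped_gemm, use_te, hyena_cfg)
# New call: the whole TransformerConfig is passed and the MoE fields come from it.
spec = get_specs("megatron_gpt_full_te_layer_autocast", transformer_config, use_te=True)

# For this spec name the result is now a TransformerBlockSubmodules with one
# ModuleSpec(module=TETransformerLayerAutocast) per locally built layer and a
# FusedLayerNorm final layer norm, rather than a single ModuleSpec.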