Commit

Mixtral to NeMo conversion script. (NVIDIA#8155)
* HF-Mixtral to NeMo conversion script.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
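As a rough, hypothetical sketch of the kind of key remapping such a converter performs (the script itself is in the part of the diff that does not render below): the HF-side names follow transformers' Mixtral checkpoint layout, while the NeMo/Megatron-side names here are assumed placeholders.

def remap_moe_layer(hf_state_dict, layer: int, expert: int) -> dict:
    # Hypothetical illustration only; not the converter added by this commit.
    hf = f"model.layers.{layer}.block_sparse_moe"
    nemo = f"model.decoder.layers.{layer}.mlp"  # assumed Megatron-Core-style prefix
    return {
        # One router (gate) per decoder layer.
        f"{nemo}.router.weight": hf_state_dict[f"{hf}.gate.weight"],
        # w2 is each expert's down-projection; w1/w3 would be handled similarly
        # (typically fused into a single tensor on the Megatron side).
        f"{nemo}.experts.local_experts.{expert}.linear_fc2.weight": hf_state_dict[
            f"{hf}.experts.{expert}.w2.weight"
        ],
    }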

* Pass MoE options from NeMo config to TransformerConfig.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Add version check for get_gpt_layer_with_transformer_engine_spec

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Determine MoE support by attempting to import MoETokenDispatcher.

Using importlib.metadata.version would be an alternative; however,
(a) it requires Megatron-Core to be installed via pip (not always the case), and
(b) one might override Megatron's location (e.g. via PYTHONPATH), in which case
importlib.metadata would report a version that does not match the code actually imported.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
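A minimal sketch contrasting the two detection strategies; the distribution name "megatron-core" and the 0.5.0 threshold are taken from the assert message in the diff below and are assumptions as far as this sketch is concerned.

import importlib.metadata


def moe_available_feature_probe() -> bool:
    # Same idea as mcore_supports_moe() below: if the MoE symbol imports, the
    # megatron.core that Python actually resolves (pip or PYTHONPATH) supports MoE.
    try:
        from megatron.core.transformer.moe.router import TopKRouter  # noqa: F401

        return True
    except ImportError:
        return False


def moe_available_version_probe(min_version=(0, 5)) -> bool:
    # The rejected alternative: reads pip metadata only, so it fails when
    # megatron-core is not pip-installed and can disagree with the copy that
    # PYTHONPATH resolves to.
    try:
        major, minor, *_ = importlib.metadata.version("megatron-core").split(".")
        return (int(major), int(minor)) >= min_version
    except importlib.metadata.PackageNotFoundError:
        return False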

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Mixtral-NeMo to Mixtral-HF converter.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
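For the reverse direction, a heavily simplified outline using the transformers API; it assumes a state dict already remapped to the HF layout and uses the public Mixtral checkpoint name only as a config source.

from transformers import AutoConfig, MixtralForCausalLM


def export_to_hf(hf_state_dict, output_dir: str) -> None:
    # Build an empty HF Mixtral with a matching config, load the remapped weights,
    # then write a standard HF checkpoint directory.
    config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
    model = MixtralForCausalLM(config)
    model.load_state_dict(hf_state_dict)
    model.save_pretrained(output_dir)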

* fixup: Update mcore_supports_moe due to a file rename in upcoming MoE changes.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Mixtral-converters: use `set_expert_model_parallel_world_size` to specify MoE world size.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
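A minimal sketch of what this amounts to, assuming the setter lives in megatron.core.parallel_state (as in recent Megatron-Core releases):

from megatron.core import parallel_state


def init_expert_parallelism_for_conversion(expert_model_parallel_size: int = 1) -> None:
    # Checkpoint conversion typically runs in a single process, so an expert-parallel
    # world size of 1 keeps all experts on the local rank.
    parallel_state.set_expert_model_parallel_world_size(expert_model_parallel_size)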

* Fix import

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Jenkins: install lightning.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Match latest MoE parameter names.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Eric Harper <complex451@gmail.com>
Signed-off-by: Sasha Meister <ameister@nvidia.com>
3 people authored and sashameister committed Feb 15, 2024
1 parent 4c89f1e commit c8a50ed
Showing 4 changed files with 633 additions and 7 deletions.
Jenkinsfile: 2 changes (1 addition, 1 deletion)
@@ -35,7 +35,7 @@ pipeline {
 
     stage('Install test requirements') {
       steps {
-        sh 'apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt'
+        sh 'apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt && pip install -r requirements/requirements_lightning.txt'
       }
     }
@@ -18,7 +18,7 @@
 import warnings
 from contextlib import nullcontext
 from dataclasses import fields
-from functools import partial
+from functools import cache, partial
 from importlib.metadata import version
 from typing import Any, Dict, Iterator, List, Optional, Union
 
@@ -113,11 +113,30 @@
     HAVE_TE = False
 
 
-def get_specs(spec_name):
-    name_spec_dict = {"": get_gpt_layer_with_transformer_engine_spec(), "megatron_falcon_gpt": get_falcon_layer_spec()}
-    if spec_name not in name_spec_dict:
+@cache
+def mcore_supports_moe() -> bool:
+    global HAVE_MEGATRON_CORE
+    if not HAVE_MEGATRON_CORE:
+        return False
+    try:
+        from megatron.core.transformer.moe.router import TopKRouter
+
+        return True
+    except ImportError:
+        return False
+
+
+def get_specs(spec_name, num_experts=None):
+    if spec_name == '':
+        if num_experts is not None:
+            assert mcore_supports_moe(), "Megatron-core >= v0.5.0 is required for MoE"
+            return get_gpt_layer_with_transformer_engine_spec(num_experts)
+        else:
+            return get_gpt_layer_with_transformer_engine_spec()
+    elif spec_name == 'megatron_falcon_gpt':
+        return get_falcon_layer_spec()
+    else:
         raise ValueError(f"Spec name '{spec_name}' is not recognized.")
-    return name_spec_dict[spec_name]
 
 
 class MegatronGPTExportableModel(torch.nn.Module, Exportable):
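A quick usage sketch for the new signature defined above (calls assume this module's imports):

dense_spec = get_specs('')                      # dense GPT layer spec, as before
moe_spec = get_specs('', num_experts=8)         # MoE-enabled spec, e.g. Mixtral's 8 experts
falcon_spec = get_specs('megatron_falcon_gpt')  # Falcon spec, unchanged by this commit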
@@ -328,7 +347,7 @@ def model_provider_func(self, pre_process, post_process):
         if self.mcore_gpt:
             model = MCoreGPTModel(
                 config=self.transformer_config,
-                transformer_layer_spec=get_specs(self.spec_name),
+                transformer_layer_spec=get_specs(self.spec_name, self.transformer_config.num_moe_experts),
                 vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size),
                 max_sequence_length=self.cfg.get('encoder_seq_length', 512),
                 pre_process=pre_process,
@@ -1683,7 +1702,26 @@ def build_transformer_config(self) -> TransformerConfig:
             'normalization': normalization,
             'fp8': fp8,
             'tp_comm_overlap': ub_tp_comm_overlap,
+            # MoE related
+            'num_experts': self.cfg.get('num_experts', None),
+            'moe_router_load_balancing_type': self.cfg.get('moe_router_load_balancing_type', 'aux_loss'),
+            'moe_router_topk': self.cfg.get('moe_router_topk', 2),
+            'moe_grouped_gemm': self.cfg.get('moe_grouped_gemm', False),
+            'moe_aux_loss_coeff': self.cfg.get(
+                'moe_aux_loss_coeff', 0
+            ),  # 1e-2 would be a good start value for load balance loss.
+            'moe_z_loss_coeff': self.cfg.get('moe_z_loss_coeff', None),  # 1e-3 would be a good start value for z-loss
+            'moe_input_jitter_eps': self.cfg.get('moe_input_jitter_eps', None),
+            'moe_token_dropping': self.cfg.get('moe_token_dropping', False),  # TODO: Support token dropping.
         }
+        if model_specific_configs['num_experts'] is not None:
+            assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE'
+        elif not mcore_supports_moe():
+            if 'num_experts' in model_specific_configs:
+                del model_specific_configs['num_experts']
+            moe_keys = list(filter(lambda x: x.startswith('moe_'), model_specific_configs.keys()))
+            for k in moe_keys:
+                del model_specific_configs[k]
 
         transformer_config = super().build_transformer_config()
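For reference, an illustrative NeMo model-config fragment (not part of the diff) exercising the new keys; the values are examples, whereas the defaults in the code above are num_experts=None, moe_router_topk=2, and moe_aux_loss_coeff=0.

from omegaconf import OmegaConf

moe_overrides = OmegaConf.create(
    {
        'num_experts': 8,                         # Mixtral 8x7B routes over 8 experts
        'moe_router_topk': 2,                     # 2 experts per token
        'moe_router_load_balancing_type': 'aux_loss',
        'moe_aux_loss_coeff': 1e-2,               # the value suggested in the inline comment
        'moe_grouped_gemm': False,
    }
)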

