Commit

Mixtral to NeMo conversion script. (NVIDIA#8155)
* HF-Mixtral to NeMo conversion script.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
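As a rough, hypothetical sketch of the kind of key remapping such a converter performs (the script itself is in the part of the diff that does not render below): the HF-side names follow transformers' Mixtral checkpoint layout, while the NeMo/Megatron-side names here are assumed placeholders.

def remap_moe_layer(hf_state_dict, layer: int, expert: int) -> dict:
    # Hypothetical illustration only; not the converter added by this commit.
    hf = f"model.layers.{layer}.block_sparse_moe"
    nemo = f"model.decoder.layers.{layer}.mlp"  # assumed Megatron-Core-style prefix
    return {
        # One router (gate) per decoder layer.
        f"{nemo}.router.weight": hf_state_dict[f"{hf}.gate.weight"],
        # w2 is each expert's down-projection; w1/w3 would be handled similarly
        # (typically fused into a single tensor on the Megatron side).
        f"{nemo}.experts.local_experts.{expert}.linear_fc2.weight": hf_state_dict[
            f"{hf}.experts.{expert}.w2.weight"
        ],
    }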

* Pass MoE options from NeMo config to TransformerConfig.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Add version check for get_gpt_layer_with_transformer_engine_spec

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Determine MoE support by attempting to import MoETokenDispatcher.

Using importlib.metadata.version would be an alternative; however,
(a) it requires Megatron-Core to be installed via pip (not always the case), and
(b) one might override Megatron's location (e.g. via PYTHONPATH), in which case
importlib.metadata would report a version that does not match the code actually imported.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
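A minimal sketch contrasting the two detection strategies; the distribution name "megatron-core" and the 0.5.0 threshold are taken from the assert message in the diff below and are assumptions as far as this sketch is concerned.

import importlib.metadata


def moe_available_feature_probe() -> bool:
    # Same idea as mcore_supports_moe() below: if the MoE symbol imports, the
    # megatron.core that Python actually resolves (pip or PYTHONPATH) supports MoE.
    try:
        from megatron.core.transformer.moe.router import TopKRouter  # noqa: F401

        return True
    except ImportError:
        return False


def moe_available_version_probe(min_version=(0, 5)) -> bool:
    # The rejected alternative: reads pip metadata only, so it fails when
    # megatron-core is not pip-installed and can disagree with the copy that
    # PYTHONPATH resolves to.
    try:
        major, minor, *_ = importlib.metadata.version("megatron-core").split(".")
        return (int(major), int(minor)) >= min_version
    except importlib.metadata.PackageNotFoundError:
        return False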

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Mixtral-NeMo to Mixtral-HF converter.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
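For the reverse direction, a heavily simplified outline using the transformers API; it assumes a state dict already remapped to the HF layout and uses the public Mixtral checkpoint name only as a config source.

from transformers import AutoConfig, MixtralForCausalLM


def export_to_hf(hf_state_dict, output_dir: str) -> None:
    # Build an empty HF Mixtral with a matching config, load the remapped weights,
    # then write a standard HF checkpoint directory.
    config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
    model = MixtralForCausalLM(config)
    model.load_state_dict(hf_state_dict)
    model.save_pretrained(output_dir)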

* fixup: Update mcore_supports_moe due to a file rename in upcoming MoE changes.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Mixtral-converters: use `set_expert_model_parallel_world_size` to specify MoE world size.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
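A minimal sketch of what this amounts to, assuming the setter lives in megatron.core.parallel_state (as in recent Megatron-Core releases):

from megatron.core import parallel_state


def init_expert_parallelism_for_conversion(expert_model_parallel_size: int = 1) -> None:
    # Checkpoint conversion typically runs in a single process, so an expert-parallel
    # world size of 1 keeps all experts on the local rank.
    parallel_state.set_expert_model_parallel_world_size(expert_model_parallel_size)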

* Fix import

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Jenkins: install lightning.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Match latest MoE parameter names.

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Eric Harper <complex451@gmail.com>
Signed-off-by: Sasha Meister <ameister@nvidia.com>
3 people authored and sashameister committed Feb 15, 2024
1 parent 4c89f1e commit c8a50ed
Showing 4 changed files with 633 additions and 7 deletions.
Jenkinsfile: 2 changes (1 addition, 1 deletion)
@@ -35,7 +35,7 @@ pipeline {
 
     stage('Install test requirements') {
       steps {
-        sh 'apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt'
+        sh 'apt-get update && apt-get install -y bc && pip install -r requirements/requirements_test.txt && pip install -r requirements/requirements_lightning.txt'
       }
     }
@@ -18,7 +18,7 @@
 import warnings
 from contextlib import nullcontext
 from dataclasses import fields
-from functools import partial
+from functools import cache, partial
 from importlib.metadata import version
 from typing import Any, Dict, Iterator, List, Optional, Union
 
@@ -113,11 +113,30 @@
     HAVE_TE = False
 
 
-def get_specs(spec_name):
-    name_spec_dict = {"": get_gpt_layer_with_transformer_engine_spec(), "megatron_falcon_gpt": get_falcon_layer_spec()}
-    if spec_name not in name_spec_dict:
+@cache
+def mcore_supports_moe() -> bool:
+    global HAVE_MEGATRON_CORE
+    if not HAVE_MEGATRON_CORE:
+        return False
+    try:
+        from megatron.core.transformer.moe.router import TopKRouter
+
+        return True
+    except ImportError:
+        return False
+
+
+def get_specs(spec_name, num_experts=None):
+    if spec_name == '':
+        if num_experts is not None:
+            assert mcore_supports_moe(), "Megatron-core >= v0.5.0 is required for MoE"
+            return get_gpt_layer_with_transformer_engine_spec(num_experts)
+        else:
+            return get_gpt_layer_with_transformer_engine_spec()
+    elif spec_name == 'megatron_falcon_gpt':
+        return get_falcon_layer_spec()
+    else:
         raise ValueError(f"Spec name '{spec_name}' is not recognized.")
-    return name_spec_dict[spec_name]
 
 
 class MegatronGPTExportableModel(torch.nn.Module, Exportable):
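A quick usage sketch for the new signature defined above (calls assume this module's imports):

dense_spec = get_specs('')                      # dense GPT layer spec, as before
moe_spec = get_specs('', num_experts=8)         # MoE-enabled spec, e.g. Mixtral's 8 experts
falcon_spec = get_specs('megatron_falcon_gpt')  # Falcon spec, unchanged by this commit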
@@ -328,7 +347,7 @@ def model_provider_func(self, pre_process, post_process):
         if self.mcore_gpt:
             model = MCoreGPTModel(
                 config=self.transformer_config,
-                transformer_layer_spec=get_specs(self.spec_name),
+                transformer_layer_spec=get_specs(self.spec_name, self.transformer_config.num_moe_experts),
                 vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size),
                 max_sequence_length=self.cfg.get('encoder_seq_length', 512),
                 pre_process=pre_process,
@@ -1683,7 +1702,26 @@ def build_transformer_config(self) -> TransformerConfig:
             'normalization': normalization,
             'fp8': fp8,
             'tp_comm_overlap': ub_tp_comm_overlap,
+            # MoE related
+            'num_experts': self.cfg.get('num_experts', None),
+            'moe_router_load_balancing_type': self.cfg.get('moe_router_load_balancing_type', 'aux_loss'),
+            'moe_router_topk': self.cfg.get('moe_router_topk', 2),
+            'moe_grouped_gemm': self.cfg.get('moe_grouped_gemm', False),
+            'moe_aux_loss_coeff': self.cfg.get(
+                'moe_aux_loss_coeff', 0
+            ),  # 1e-2 would be a good start value for load balance loss.
+            'moe_z_loss_coeff': self.cfg.get('moe_z_loss_coeff', None),  # 1e-3 would be a good start value for z-loss
+            'moe_input_jitter_eps': self.cfg.get('moe_input_jitter_eps', None),
+            'moe_token_dropping': self.cfg.get('moe_token_dropping', False),  # TODO: Support token dropping.
         }
+        if model_specific_configs['num_experts'] is not None:
+            assert mcore_supports_moe(), 'Megatron-core >= v0.5.0 is required for MoE'
+        elif not mcore_supports_moe():
+            if 'num_experts' in model_specific_configs:
+                del model_specific_configs['num_experts']
+            moe_keys = list(filter(lambda x: x.startswith('moe_'), model_specific_configs.keys()))
+            for k in moe_keys:
+                del model_specific_configs[k]
 
         transformer_config = super().build_transformer_config()
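For reference, an illustrative NeMo model-config fragment (not part of the diff) exercising the new keys; the values are examples, whereas the defaults in the code above are num_experts=None, moe_router_topk=2, and moe_aux_loss_coeff=0.

from omegaconf import OmegaConf

moe_overrides = OmegaConf.create(
    {
        'num_experts': 8,                         # Mixtral 8x7B routes over 8 experts
        'moe_router_topk': 2,                     # 2 experts per token
        'moe_router_load_balancing_type': 'aux_loss',
        'moe_aux_loss_coeff': 1e-2,               # the value suggested in the inline comment
        'moe_grouped_gemm': False,
    }
)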

