From 79510f41066b307d65946c7c11143d65bb1f762e Mon Sep 17 00:00:00 2001
From: haze188
Date: Mon, 1 Jul 2024 06:10:10 +0000
Subject: [PATCH 1/4] [misc] fix typos

---
 colossalai/shardformer/policies/deepseek.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
index f8f39e66c121..1d64c643ebdb 100644
--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -39,11 +39,11 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         if self.shard_config.enable_sequence_parallelism:
             self.shard_config.enable_sequence_parallelism = False
             raise NotImplementedError(
-                "Mixtral dosen't support sequence parallelism now, will ignore the sequence parallelism flag."
+                "Deepseek doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
             )
 
         if self.shard_config.enable_tensor_parallelism:
-            raise NotImplementedError("Tensor parallelism is not supported for Mixtral model now.")
+            raise NotImplementedError("Tensor parallelism is not supported for Deepseek model now.")
 
         if getattr(self.shard_config, "ep_group", None) is None:
             raise ValueError("You must pass in ep_group via shard_config for expert parallel!")
@@ -117,7 +117,7 @@ def get_held_layers(self) -> List[Module]:
         """Get pipeline layers for current stage."""
         assert self.pipeline_stage_manager is not None
 
-        if self.model.__class__.__name__ == "MixtralModel":
+        if self.model.__class__.__name__ == "DeepseekModel":
             module = self.model
         else:
             module = self.model.model
@@ -145,7 +145,7 @@ def module_policy(self):
             # set None as default
             self.set_pipeline_forward(
                 model_cls=DeepseekModel,
-                new_forward=DeepseekPipelineForwards.mixtral_model_forward,
+                new_forward=DeepseekPipelineForwards.deepseek_model_forward,
                 policy=policy,
             )
         return policy

From e9bf95ef286da4f68a10a3fcac5de024c7049531 Mon Sep 17 00:00:00 2001
From: haze188
Date: Thu, 4 Jul 2024 08:03:42 +0000
Subject: [PATCH 2/4] [Feature] deepseek support via auto model, remove modeling file

---
 colossalai/cluster/process_group_mesh.py    |  2 +-
 colossalai/shardformer/modeling/deepseek.py | 51 +++++++++++++--------
 colossalai/shardformer/policies/deepseek.py | 17 +++----
 tests/test_moe/test_deepseek_layer.py       | 21 +++++----
 tests/test_moe/test_moe_checkpoint.py       | 15 +-----
 5 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/colossalai/cluster/process_group_mesh.py b/colossalai/cluster/process_group_mesh.py
index 1319a4529093..b6aff0d72fe6 100644
--- a/colossalai/cluster/process_group_mesh.py
+++ b/colossalai/cluster/process_group_mesh.py
@@ -147,7 +147,7 @@ def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) ->
             ProcessGroup: The process group with the given ranks.
         """
         ranks_in_group = sorted(ranks_in_group)
-        if tuple(ranks_in_group) not in self._group_to_ranks:
+        if tuple(ranks_in_group) not in self._ranks_to_group:
             group = dist.new_group(ranks_in_group, backend=backend)
             self._ranks_to_group[tuple(ranks_in_group)] = group
             self._group_to_ranks[group] = tuple(ranks_in_group)
diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
index 91391639dd50..6e79ce144cc8 100644
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -2,32 +2,47 @@
 
 import torch
 import torch.distributed as dist
+import torch.nn as nn
 from torch.distributed import ProcessGroup
 
 # from colossalai.tensor.moe_tensor.moe_info import MoeParallelInfo
 from torch.nn import CrossEntropyLoss
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import is_flash_attn_2_available, logging
 
 from colossalai.lazy import LazyInitContext
 from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler, all_to_all_uneven
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import (
-    AddAuxiliaryLoss,
-    CausalLMOutputWithPast,
-    DeepseekForCausalLM,
-    DeepseekMLP,
-    DeepseekModel,
-    DeepseekMoE,
-)
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none
 
 
-class EPDeepseekMoE(DeepseekMoE):
-    def __init__(self, config: DeepseekConfig):
-        super().__init__(config)
+# copied from modeling_deepseek.py
+class AddAuxiliaryLoss(torch.autograd.Function):
+    """
+    The trick function of adding auxiliary (aux) loss,
+    which includes the gradient of the aux loss during backpropagation.
+    """
+
+    @staticmethod
+    def forward(ctx, x, loss):
+        assert loss.numel() == 1
+        ctx.dtype = loss.dtype
+        ctx.required_aux_loss = loss.requires_grad
+        return x
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_loss = None
+        if ctx.required_aux_loss:
+            grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
+        return grad_output, grad_loss
+
+
+class EPDeepseekMoE(nn.Module):
+    def __init__(self):
+        super(EPDeepseekMoE, self).__init__()
 
     def setup_ep(self, ep_group: ProcessGroup):
         ep_group = ep_group
@@ -44,9 +59,9 @@ def setup_ep(self, ep_group: ProcessGroup):
             p.ep_group = ep_group
 
     @staticmethod
-    def from_native_module(module: Union[DeepseekMoE, DeepseekMLP], *args, **kwargs) -> "EPDeepseekMoE":
+    def from_native_module(module: Union["DeepseekMoE", "DeepseekMLP"], *args, **kwargs) -> "EPDeepseekMoE":
         LazyInitContext.materialize(module)
-        if isinstance(module, DeepseekMLP):
+        if module.__class__.__name__ == "DeepseekMLP":
             return module
         module.__class__ = EPDeepseekMoE
         assert "ep_group" in kwargs, "You should pass ep_group in SubModuleReplacementDescription via shard_config!!"
@@ -120,7 +135,7 @@ class DeepseekPipelineForwards:
 
     @staticmethod
     def deepseek_model_forward(
-        self: DeepseekModel,
+        self: "DeepseekModel",
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
@@ -147,9 +162,9 @@ def deepseek_model_forward(
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, DeepseekForCausalLM
+        >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
-        >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -303,7 +318,7 @@ def custom_forward(*inputs):
 
     @staticmethod
     def deepseek_for_causal_lm_forward(
-        self: DeepseekForCausalLM,
+        self: "DeepseekForCausalLM",
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
index 1d64c643ebdb..07b86cd638c8 100644
--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -7,11 +7,6 @@
 from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
 from colossalai.shardformer.modeling.deepseek import DeepseekPipelineForwards, EPDeepseekMoE
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import (
-    DeepseekDecoderLayer,
-    DeepseekForCausalLM,
-    DeepseekModel,
-)
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
 __all__ = ["DeepseekPolicy", "DeepseekForCausalLMPolicy"]
 
@@ -57,7 +52,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     )
                 ],
                 policy=policy,
-                target_key=DeepseekDecoderLayer,
+                target_key="DeepseekDecoderLayer",
             )
 
         # optimization configuration
@@ -74,7 +69,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     ),
                 ],
                 policy=policy,
-                target_key=DeepseekDecoderLayer,
+                target_key="DeepseekDecoderLayer",
             )
 
             self.append_or_create_submodule_replacement(
@@ -83,7 +78,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 target_module=FusedRMSNorm,
             ),
             policy=policy,
-            target_key=DeepseekModel,
+            target_key="DeepseekModel",
         )
 
         if self.shard_config.enable_flash_attention:
@@ -144,7 +139,7 @@ def module_policy(self):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=DeepseekModel,
+                model_cls="DeepseekModel",
                 new_forward=DeepseekPipelineForwards.deepseek_model_forward,
                 policy=policy,
             )
@@ -167,7 +162,7 @@ def module_policy(self):
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for casual lm
             new_item = {
-                DeepseekForCausalLM: ModulePolicyDescription(
+                "DeepseekForCausalLM": ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
                             suffix="lm_head",
@@ -182,7 +177,7 @@ def module_policy(self):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=DeepseekForCausalLM,
+                model_cls="DeepseekForCausalLM",
                 new_forward=DeepseekPipelineForwards.deepseek_for_causal_lm_forward,
                 policy=policy,
             )
diff --git a/tests/test_moe/test_deepseek_layer.py b/tests/test_moe/test_deepseek_layer.py
index 06dfbfe3b515..328ffb1de5f8 100644
--- a/tests/test_moe/test_deepseek_layer.py
+++ b/tests/test_moe/test_deepseek_layer.py
@@ -4,12 +4,11 @@
 import torch
 import torch.distributed as dist
 from torch.testing import assert_close
+from transformers import AutoConfig, AutoModel
 
 import colossalai
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.shardformer.modeling.deepseek import EPDeepseekMoE
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekMoE
 from colossalai.testing.utils import spawn
 
 tokens, n_experts = 7, 4
@@ -25,14 +24,18 @@ def check_deepseek_moe_layer():
         pp_size=1,
         ep_size=dist.get_world_size(),
     )
-    config = DeepseekConfig(
-        hidden_size=hidden_size,
-        intermediate_size=hidden_size * 2,
-        n_routed_experts=n_experts,
-        num_experts_per_tok=top_k,
-    )
+
+    config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
+    config.num_hidden_layers = 1
+    config.n_routed_experts = n_experts
+    config.num_experts_per_tok = top_k
+    config.hidden_size = hidden_size
+    config.intermediate_size = hidden_size * 2
+    config.first_k_dense_replace = 0
+    config.num_attention_heads = 2
     torch.manual_seed(0)
-    orig_model = DeepseekMoE(config).cuda()
+    # get the moe layer in auto model
+    orig_model = AutoModel.from_config(config, trust_remote_code=True).layers[0].mlp.cuda()
     x = torch.rand(1, tokens, hidden_size, requires_grad=True).cuda()
     orig_output = orig_model(x)
     model = deepcopy(orig_model)
diff --git a/tests/test_moe/test_moe_checkpoint.py b/tests/test_moe/test_moe_checkpoint.py
index f3c5726ea0ae..8113b32d0411 100644
--- a/tests/test_moe/test_moe_checkpoint.py
+++ b/tests/test_moe/test_moe_checkpoint.py
@@ -14,8 +14,6 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.checkpoint_io import MoECheckpointIO
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekForCausalLM
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
 from colossalai.testing import parameterize, spawn
 from colossalai.testing.utils import spawn
@@ -91,21 +89,10 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_grou
             num_experts_per_tok=top_k,
             num_attention_heads=2,
             num_key_value_heads=2,
+            num_hidden_layers=4,
         ),
         MixtralForCausalLM,
     ],
-    [
-        DeepseekConfig(
-            hidden_size=hidden_size,
-            intermediate_size=hidden_size * 2,
-            n_routed_experts=n_experts,
-            num_experts_per_tok=top_k,
-            num_attention_heads=2,
-            num_key_value_heads=2,
-            first_k_dense_replace=4,
-        ),
-        DeepseekForCausalLM,
-    ],
     ],
 )
 def check_moe_checkpoint(test_config):

From 4030aa6ea039948b9b58307b45861acf150859d8 Mon Sep 17 00:00:00 2001
From: haze188
Date: Thu, 4 Jul 2024 08:17:45 +0000
Subject: [PATCH 3/4] [misc] delete useless file

---
 examples/language/llama/scripts/benchmark_7B/hosts.txt | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 examples/language/llama/scripts/benchmark_7B/hosts.txt

diff --git a/examples/language/llama/scripts/benchmark_7B/hosts.txt b/examples/language/llama/scripts/benchmark_7B/hosts.txt
deleted file mode 100644
index c9c165ebb978..000000000000
--- a/examples/language/llama/scripts/benchmark_7B/hosts.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-10.20.1.170
-10.20.1.83

From c6abfbc64637fd0cf2edcc79039220921d84c717 Mon Sep 17 00:00:00 2001
From: haze188
Date: Thu, 4 Jul 2024 08:19:21 +0000
Subject: [PATCH 4/4] [misc] fix typos

---
 tests/test_moe/test_moe_checkpoint.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_moe/test_moe_checkpoint.py b/tests/test_moe/test_moe_checkpoint.py
index 8113b32d0411..164301695865 100644
--- a/tests/test_moe/test_moe_checkpoint.py
+++ b/tests/test_moe/test_moe_checkpoint.py
@@ -89,7 +89,6 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_grou
             num_experts_per_tok=top_k,
             num_attention_heads=2,
             num_key_value_heads=2,
-            num_hidden_layers=4,
         ),
         MixtralForCausalLM,
     ],
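
Usage note (illustrative, not part of the patch series): with the vendored deepseek_moe_16b_base modeling files removed, the policy matches modules by class name and the model itself is expected to come from the Hugging Face auto classes with trust_remote_code=True. The sketch below shows one way the resulting expert-parallel path could be driven end to end. It is an assumption pieced together from the plugin and booster APIs that appear in the tests of this series; argument names such as tp_size and precision, and the exact launch helper, may differ between Colossal-AI versions.

```python
# Illustrative sketch only (not from the patches): driving the auto-model Deepseek
# path with expert parallelism. APIs are taken from the tests in this series where
# possible; anything beyond that is an assumption and may vary by Colossal-AI version.
import torch
import torch.distributed as dist
from transformers import AutoModelForCausalLM

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin

# Assumes launch via torchrun; older releases expect a config dict here.
colossalai.launch_from_torch()

# No vendored modeling file is needed: the checkpoint's own remote code supplies
# DeepseekModel / DeepseekForCausalLM, which the policy now matches by name.
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/deepseek-moe-16b-base",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# ep_size > 1 makes the plugin hand an ep_group to shard_config, which DeepseekPolicy
# requires before it swaps each MoE block for EPDeepseekMoE.
plugin = MoeHybridParallelPlugin(tp_size=1, pp_size=1, ep_size=dist.get_world_size(), precision="bf16")
booster = Booster(plugin=plugin)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
model, optimizer, *_ = booster.boost(model, optimizer)
```

With ep_size equal to the world size, the routed experts are partitioned across all ranks; EPDeepseekMoE.setup_ep prepares this by tagging each expert parameter with the ep_group, as shown in patch 2/4.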