[Feature] remove modeling file, use auto config. #5884

Merged · 5 commits · Jul 4, 2024
Changes from 1 commit
[Feature] deepseek support via auto model, remove modeling file
Hz188 committed Jul 4, 2024

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
commit e9bf95ef286da4f68a10a3fcac5de024c7049531
2 changes: 1 addition & 1 deletion colossalai/cluster/process_group_mesh.py
@@ -147,7 +147,7 @@ def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) ->
             ProcessGroup: The process group with the given ranks.
         """
         ranks_in_group = sorted(ranks_in_group)
-        if tuple(ranks_in_group) not in self._group_to_ranks:
+        if tuple(ranks_in_group) not in self._ranks_to_group:
             group = dist.new_group(ranks_in_group, backend=backend)
             self._ranks_to_group[tuple(ranks_in_group)] = group
             self._group_to_ranks[group] = tuple(ranks_in_group)
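This one-line fix matters because `_ranks_to_group` is keyed by rank tuples while `_group_to_ranks` is keyed by group handles; probing the wrong map meant a cached group was never found, so `dist.new_group` was re-created for the same ranks. A minimal sketch of the intended caching pattern, with simplified stand-in names rather than the real `ProcessGroupMesh` code:

```python
# Simplified sketch of the rank-tuple -> group cache (illustrative only).
class GroupCache:
    def __init__(self):
        self._ranks_to_group = {}  # tuple of ranks -> group handle
        self._group_to_ranks = {}  # group handle -> tuple of ranks

    def get_group(self, ranks, create_group):
        key = tuple(sorted(ranks))
        # The membership test must use the dict keyed by rank tuples;
        # checking _group_to_ranks (as before the fix) never hits.
        if key not in self._ranks_to_group:
            group = create_group(key)
            self._ranks_to_group[key] = group
            self._group_to_ranks[group] = key
        return self._ranks_to_group[key]
```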
51 changes: 33 additions & 18 deletions colossalai/shardformer/modeling/deepseek.py
@@ -2,32 +2,47 @@

 import torch
 import torch.distributed as dist
+import torch.nn as nn
 from torch.distributed import ProcessGroup

 # from colossalai.tensor.moe_tensor.moe_info import MoeParallelInfo
 from torch.nn import CrossEntropyLoss
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import is_flash_attn_2_available, logging

 from colossalai.lazy import LazyInitContext
 from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler, all_to_all_uneven
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import (
-    AddAuxiliaryLoss,
-    CausalLMOutputWithPast,
-    DeepseekForCausalLM,
-    DeepseekMLP,
-    DeepseekModel,
-    DeepseekMoE,
-)
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none


-class EPDeepseekMoE(DeepseekMoE):
-    def __init__(self, config: DeepseekConfig):
-        super().__init__(config)
+# copied from modeling_deepseek.py
+class AddAuxiliaryLoss(torch.autograd.Function):
+    """
+    The trick function of adding auxiliary (aux) loss,
+    which includes the gradient of the aux loss during backpropagation.
+    """
+
+    @staticmethod
+    def forward(ctx, x, loss):
+        assert loss.numel() == 1
+        ctx.dtype = loss.dtype
+        ctx.required_aux_loss = loss.requires_grad
+        return x
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_loss = None
+        if ctx.required_aux_loss:
+            grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
+        return grad_output, grad_loss
+
+
+class EPDeepseekMoE(nn.Module):
+    def __init__(self):
+        super(EPDeepseekMoE, self).__init__()

     def setup_ep(self, ep_group: ProcessGroup):
         ep_group = ep_group
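`AddAuxiliaryLoss` is copied from the upstream `modeling_deepseek.py` so the bundled modeling file can be dropped: its forward pass returns the activations unchanged, while the backward pass injects a unit gradient for the auxiliary load-balancing loss so it still influences training. A small, hedged check of that behaviour (standalone snippet, not part of the PR's tests; assumes the class above is in scope):

```python
import torch

# Assumes the AddAuxiliaryLoss class added in the hunk above is in scope.
h = torch.randn(2, 3, requires_grad=True)
aux_loss = torch.rand(1, requires_grad=True)    # numel() == 1, as forward asserts

hidden = h * 2.0                                # stand-in for an intermediate activation
out = AddAuxiliaryLoss.apply(hidden, aux_loss)  # forward: values pass through unchanged

out.sum().backward()
print(h.grad)         # ordinary gradient, unaffected by the aux-loss hook
print(aux_loss.grad)  # tensor([1.]) -- the aux loss receives a unit gradient
```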
@@ -44,9 +59,9 @@ def setup_ep(self, ep_group: ProcessGroup):
             p.ep_group = ep_group

     @staticmethod
-    def from_native_module(module: Union[DeepseekMoE, DeepseekMLP], *args, **kwargs) -> "EPDeepseekMoE":
+    def from_native_module(module: Union["DeepseekMoE", "DeepseekMLP"], *args, **kwargs) -> "EPDeepseekMoE":
         LazyInitContext.materialize(module)
-        if isinstance(module, DeepseekMLP):
+        if module.__class__.__name__ == "DeepseekMLP":
             return module
         module.__class__ = EPDeepseekMoE
         assert "ep_group" in kwargs, "You should pass ep_group in SubModuleReplacementDescription via shard_config!!"
@@ -120,7 +135,7 @@ class DeepseekPipelineForwards:

     @staticmethod
     def deepseek_model_forward(
-        self: DeepseekModel,
+        self: "DeepseekModel",
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
@@ -147,9 +162,9 @@ def deepseek_model_forward(
         Example:

         ```python
-        >>> from transformers import AutoTokenizer, DeepseekForCausalLM
+        >>> from transformers import AutoTokenizer, AutoModelForCausalLM

-        >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

         >>> prompt = "Hey, are you conscious? Can you talk to me?"
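The docstring keeps the transformers placeholder paths; for a DeepSeek checkpoint whose modeling code lives on the Hub, loading through the Auto classes additionally needs `trust_remote_code=True`, which the snippet above omits. A hedged end-to-end sketch (model id and generation settings are illustrative, not taken from the PR):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "deepseek-ai/deepseek-moe-16b-base"  # any DeepSeek MoE repo with remote code works the same way

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```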
@@ -303,7 +318,7 @@ def custom_forward(*inputs):

     @staticmethod
     def deepseek_for_causal_lm_forward(
-        self: DeepseekForCausalLM,
+        self: "DeepseekForCausalLM",
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
17 changes: 6 additions & 11 deletions colossalai/shardformer/policies/deepseek.py
@@ -7,11 +7,6 @@

 from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
 from colossalai.shardformer.modeling.deepseek import DeepseekPipelineForwards, EPDeepseekMoE
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import (
-    DeepseekDecoderLayer,
-    DeepseekForCausalLM,
-    DeepseekModel,
-)
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription

 __all__ = ["DeepseekPolicy", "DeepseekForCausalLMPolicy"]
@@ -57,7 +52,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 )
             ],
             policy=policy,
-            target_key=DeepseekDecoderLayer,
+            target_key="DeepseekDecoderLayer",
         )

         # optimization configuration
@@ -74,7 +69,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 ),
             ],
             policy=policy,
-            target_key=DeepseekDecoderLayer,
+            target_key="DeepseekDecoderLayer",
         )

         self.append_or_create_submodule_replacement(
@@ -83,7 +78,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 target_module=FusedRMSNorm,
             ),
             policy=policy,
-            target_key=DeepseekModel,
+            target_key="DeepseekModel",
         )

         if self.shard_config.enable_flash_attention:
@@ -144,7 +139,7 @@ def module_policy(self):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=DeepseekModel,
+                model_cls="DeepseekModel",
                 new_forward=DeepseekPipelineForwards.deepseek_model_forward,
                 policy=policy,
             )
@@ -167,7 +162,7 @@ def module_policy(self):
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for casual lm
             new_item = {
-                DeepseekForCausalLM: ModulePolicyDescription(
+                "DeepseekForCausalLM": ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
                             suffix="lm_head",
@@ -182,7 +177,7 @@ def module_policy(self):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=DeepseekForCausalLM,
+                model_cls="DeepseekForCausalLM",
                 new_forward=DeepseekPipelineForwards.deepseek_for_causal_lm_forward,
                 policy=policy,
             )
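With the modeling file gone, the policy can no longer reference DeepSeek classes directly, so `target_key` and `model_cls` become class-name strings that the shardformer machinery has to match against the instantiated remote-code modules at shard time. A rough sketch of how such string keys can be resolved, using hypothetical helper names rather than the real policy internals (which this diff does not show):

```python
import torch.nn as nn

# Hypothetical resolver: walk the instantiated model and match submodules by
# class name, since "DeepseekDecoderLayer" etc. only exist as remote code.
def find_modules_by_name(model: nn.Module, class_name: str):
    return [m for m in model.modules() if m.__class__.__name__ == class_name]

# e.g. apply a replacement description to every decoder layer:
# for layer in find_modules_by_name(model, "DeepseekDecoderLayer"):
#     apply_submodule_replacements(layer, descriptions)  # hypothetical helper
```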
21 changes: 12 additions & 9 deletions tests/test_moe/test_deepseek_layer.py
@@ -4,12 +4,11 @@
 import torch
 import torch.distributed as dist
 from torch.testing import assert_close
+from transformers import AutoConfig, AutoModel

 import colossalai
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.shardformer.modeling.deepseek import EPDeepseekMoE
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekMoE
 from colossalai.testing.utils import spawn

 tokens, n_experts = 7, 4
@@ -25,14 +24,18 @@ def check_deepseek_moe_layer():
         pp_size=1,
         ep_size=dist.get_world_size(),
     )
-    config = DeepseekConfig(
-        hidden_size=hidden_size,
-        intermediate_size=hidden_size * 2,
-        n_routed_experts=n_experts,
-        num_experts_per_tok=top_k,
-    )
+
+    config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
+    config.num_hidden_layers = 1
+    config.n_routed_experts = n_experts
+    config.num_experts_per_tok = top_k
+    config.hidden_size = hidden_size
+    config.intermediate_size = hidden_size * 2
+    config.first_k_dense_replace = 0
+    config.num_attention_heads = 2
     torch.manual_seed(0)
-    orig_model = DeepseekMoE(config).cuda()
+    # get the moe layer in auto model
+    orig_model = AutoModel.from_config(config, trust_remote_code=True).layers[0].mlp.cuda()
     x = torch.rand(1, tokens, hidden_size, requires_grad=True).cuda()
     orig_output = orig_model(x)
     model = deepcopy(orig_model)
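The test now builds the reference MoE layer from the Hub config instead of the removed local classes: it fetches only the config (plus the remote modeling code), shrinks it to a toy size, and instantiates randomly initialised weights from the config rather than downloading a checkpoint. A condensed sketch of that pattern, assuming network access to the `deepseek-ai/deepseek-moe-16b-base` repo and illustrative toy sizes:

```python
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
# Shrink to a toy model: one layer, few experts, small hidden size.
config.num_hidden_layers = 1
config.n_routed_experts = 4
config.num_experts_per_tok = 2
config.hidden_size = 32
config.intermediate_size = 64
config.first_k_dense_replace = 0   # make layer 0 an MoE layer instead of a dense one
config.num_attention_heads = 2

# from_config builds randomly initialised weights; no checkpoint download.
model = AutoModel.from_config(config, trust_remote_code=True)
moe_layer = model.layers[0].mlp    # the module the EP test wraps and compares against
```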
15 changes: 1 addition & 14 deletions tests/test_moe/test_moe_checkpoint.py
@@ -14,8 +14,6 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.checkpoint_io import MoECheckpointIO
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekForCausalLM
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
 from colossalai.testing import parameterize, spawn
 from colossalai.testing.utils import spawn
@@ -91,21 +89,10 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_grou
                 num_experts_per_tok=top_k,
                 num_attention_heads=2,
                 num_key_value_heads=2,
+                num_hidden_layers=4,
             ),
             MixtralForCausalLM,
         ],
-        [
-            DeepseekConfig(
-                hidden_size=hidden_size,
-                intermediate_size=hidden_size * 2,
-                n_routed_experts=n_experts,
-                num_experts_per_tok=top_k,
-                num_attention_heads=2,
-                num_key_value_heads=2,
-                first_k_dense_replace=4,
-            ),
-            DeepseekForCausalLM,
-        ],
     ],
 )
 def check_moe_checkpoint(test_config):
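The DeepSeek entry disappears from the `@parameterize` list because `DeepseekConfig` and `DeepseekForCausalLM` can no longer be imported once the local modeling file is removed. If DeepSeek coverage is kept in this test, the config would have to be built through the Auto classes instead; a hedged sketch of what such an entry could look like (not taken from this commit, values are illustrative):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Hypothetical replacement entry: build a tiny DeepSeek config via remote code.
deepseek_config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
deepseek_config.hidden_size = 32
deepseek_config.intermediate_size = 64
deepseek_config.n_routed_experts = 4
deepseek_config.num_experts_per_tok = 2
deepseek_config.num_attention_heads = 2
deepseek_config.num_key_value_heads = 2
deepseek_config.first_k_dense_replace = 1
deepseek_config.num_hidden_layers = 2

# The checkpoint test would then instantiate the model with:
# model = AutoModelForCausalLM.from_config(deepseek_config, trust_remote_code=True)
```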