[Feature] remove modeling file, use auto config. #5884

Merged · 5 commits · Jul 4, 2024
Changes from 1 commit
[Feature] deepseek support via auto model, remove modeling file
Hz188 committed Jul 4, 2024

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
commit e9bf95ef286da4f68a10a3fcac5de024c7049531
2 changes: 1 addition & 1 deletion colossalai/cluster/process_group_mesh.py
@@ -147,7 +147,7 @@ def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) ->
             ProcessGroup: The process group with the given ranks.
         """
         ranks_in_group = sorted(ranks_in_group)
-        if tuple(ranks_in_group) not in self._group_to_ranks:
+        if tuple(ranks_in_group) not in self._ranks_to_group:
             group = dist.new_group(ranks_in_group, backend=backend)
             self._ranks_to_group[tuple(ranks_in_group)] = group
             self._group_to_ranks[group] = tuple(ranks_in_group)
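This one-line fix matters because `_ranks_to_group` is keyed by rank tuples while `_group_to_ranks` is keyed by group handles; probing the wrong map meant a cached group was never found, so `dist.new_group` was re-created for the same ranks. A minimal sketch of the intended caching pattern, with simplified stand-in names rather than the real `ProcessGroupMesh` code:

```python
# Simplified sketch of the rank-tuple -> group cache (illustrative only).
class GroupCache:
    def __init__(self):
        self._ranks_to_group = {}  # tuple of ranks -> group handle
        self._group_to_ranks = {}  # group handle -> tuple of ranks

    def get_group(self, ranks, create_group):
        key = tuple(sorted(ranks))
        # The membership test must use the dict keyed by rank tuples;
        # checking _group_to_ranks (as before the fix) never hits.
        if key not in self._ranks_to_group:
            group = create_group(key)
            self._ranks_to_group[key] = group
            self._group_to_ranks[group] = key
        return self._ranks_to_group[key]
```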
51 changes: 33 additions & 18 deletions colossalai/shardformer/modeling/deepseek.py
@@ -2,32 +2,47 @@

 import torch
 import torch.distributed as dist
+import torch.nn as nn
 from torch.distributed import ProcessGroup

 # from colossalai.tensor.moe_tensor.moe_info import MoeParallelInfo
 from torch.nn import CrossEntropyLoss
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import is_flash_attn_2_available, logging

 from colossalai.lazy import LazyInitContext
 from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler, all_to_all_uneven
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import (
-    AddAuxiliaryLoss,
-    CausalLMOutputWithPast,
-    DeepseekForCausalLM,
-    DeepseekMLP,
-    DeepseekModel,
-    DeepseekMoE,
-)
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none


-class EPDeepseekMoE(DeepseekMoE):
-    def __init__(self, config: DeepseekConfig):
-        super().__init__(config)
+# copied from modeling_deepseek.py
+class AddAuxiliaryLoss(torch.autograd.Function):
+    """
+    The trick function of adding auxiliary (aux) loss,
+    which includes the gradient of the aux loss during backpropagation.
+    """
+
+    @staticmethod
+    def forward(ctx, x, loss):
+        assert loss.numel() == 1
+        ctx.dtype = loss.dtype
+        ctx.required_aux_loss = loss.requires_grad
+        return x
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_loss = None
+        if ctx.required_aux_loss:
+            grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
+        return grad_output, grad_loss
+
+
+class EPDeepseekMoE(nn.Module):
+    def __init__(self):
+        super(EPDeepseekMoE, self).__init__()

     def setup_ep(self, ep_group: ProcessGroup):
         ep_group = ep_group
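`AddAuxiliaryLoss` is copied from the upstream `modeling_deepseek.py` so the bundled modeling file can be dropped: its forward pass returns the activations unchanged, while the backward pass injects a unit gradient for the auxiliary load-balancing loss so it still influences training. A small, hedged check of that behaviour (standalone snippet, not part of the PR's tests; assumes the class above is in scope):

```python
import torch

# Assumes the AddAuxiliaryLoss class added in the hunk above is in scope.
h = torch.randn(2, 3, requires_grad=True)
aux_loss = torch.rand(1, requires_grad=True)    # numel() == 1, as forward asserts

hidden = h * 2.0                                # stand-in for an intermediate activation
out = AddAuxiliaryLoss.apply(hidden, aux_loss)  # forward: values pass through unchanged

out.sum().backward()
print(h.grad)         # ordinary gradient, unaffected by the aux-loss hook
print(aux_loss.grad)  # tensor([1.]) -- the aux loss receives a unit gradient
```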
@@ -44,9 +59,9 @@ def setup_ep(self, ep_group: ProcessGroup):
             p.ep_group = ep_group

     @staticmethod
-    def from_native_module(module: Union[DeepseekMoE, DeepseekMLP], *args, **kwargs) -> "EPDeepseekMoE":
+    def from_native_module(module: Union["DeepseekMoE", "DeepseekMLP"], *args, **kwargs) -> "EPDeepseekMoE":
         LazyInitContext.materialize(module)
-        if isinstance(module, DeepseekMLP):
+        if module.__class__.__name__ == "DeepseekMLP":
             return module
         module.__class__ = EPDeepseekMoE
         assert "ep_group" in kwargs, "You should pass ep_group in SubModuleReplacementDescription via shard_config!!"
@@ -120,7 +135,7 @@ class DeepseekPipelineForwards:

     @staticmethod
     def deepseek_model_forward(
-        self: DeepseekModel,
+        self: "DeepseekModel",
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
@@ -147,9 +162,9 @@ def deepseek_model_forward(
         Example:

         ```python
-        >>> from transformers import AutoTokenizer, DeepseekForCausalLM
+        >>> from transformers import AutoTokenizer, AutoModelForCausalLM

-        >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

         >>> prompt = "Hey, are you conscious? Can you talk to me?"
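The docstring keeps the transformers placeholder paths; for a DeepSeek checkpoint whose modeling code lives on the Hub, loading through the Auto classes additionally needs `trust_remote_code=True`, which the snippet above omits. A hedged end-to-end sketch (model id and generation settings are illustrative, not taken from the PR):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "deepseek-ai/deepseek-moe-16b-base"  # any DeepSeek MoE repo with remote code works the same way

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```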
@@ -303,7 +318,7 @@ def custom_forward(*inputs):

     @staticmethod
     def deepseek_for_causal_lm_forward(
-        self: DeepseekForCausalLM,
+        self: "DeepseekForCausalLM",
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
17 changes: 6 additions & 11 deletions colossalai/shardformer/policies/deepseek.py
@@ -7,11 +7,6 @@

 from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
 from colossalai.shardformer.modeling.deepseek import DeepseekPipelineForwards, EPDeepseekMoE
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import (
-    DeepseekDecoderLayer,
-    DeepseekForCausalLM,
-    DeepseekModel,
-)
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription

 __all__ = ["DeepseekPolicy", "DeepseekForCausalLMPolicy"]
@@ -57,7 +52,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 )
             ],
             policy=policy,
-            target_key=DeepseekDecoderLayer,
+            target_key="DeepseekDecoderLayer",
         )

         # optimization configuration
@@ -74,7 +69,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 ),
             ],
             policy=policy,
-            target_key=DeepseekDecoderLayer,
+            target_key="DeepseekDecoderLayer",
         )

         self.append_or_create_submodule_replacement(
@@ -83,7 +78,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 target_module=FusedRMSNorm,
             ),
             policy=policy,
-            target_key=DeepseekModel,
+            target_key="DeepseekModel",
         )

         if self.shard_config.enable_flash_attention:
@@ -144,7 +139,7 @@ def module_policy(self):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=DeepseekModel,
+                model_cls="DeepseekModel",
                 new_forward=DeepseekPipelineForwards.deepseek_model_forward,
                 policy=policy,
             )
@@ -167,7 +162,7 @@ def module_policy(self):
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for casual lm
             new_item = {
-                DeepseekForCausalLM: ModulePolicyDescription(
+                "DeepseekForCausalLM": ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
                             suffix="lm_head",
@@ -182,7 +177,7 @@ def module_policy(self):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=DeepseekForCausalLM,
+                model_cls="DeepseekForCausalLM",
                 new_forward=DeepseekPipelineForwards.deepseek_for_causal_lm_forward,
                 policy=policy,
             )
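With the modeling file gone, the policy can no longer reference DeepSeek classes directly, so `target_key` and `model_cls` become class-name strings that the shardformer machinery has to match against the instantiated remote-code modules at shard time. A rough sketch of how such string keys can be resolved, using hypothetical helper names rather than the real policy internals (which this diff does not show):

```python
import torch.nn as nn

# Hypothetical resolver: walk the instantiated model and match submodules by
# class name, since "DeepseekDecoderLayer" etc. only exist as remote code.
def find_modules_by_name(model: nn.Module, class_name: str):
    return [m for m in model.modules() if m.__class__.__name__ == class_name]

# e.g. apply a replacement description to every decoder layer:
# for layer in find_modules_by_name(model, "DeepseekDecoderLayer"):
#     apply_submodule_replacements(layer, descriptions)  # hypothetical helper
```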
21 changes: 12 additions & 9 deletions tests/test_moe/test_deepseek_layer.py
@@ -4,12 +4,11 @@
 import torch
 import torch.distributed as dist
 from torch.testing import assert_close
+from transformers import AutoConfig, AutoModel

 import colossalai
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.shardformer.modeling.deepseek import EPDeepseekMoE
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekMoE
 from colossalai.testing.utils import spawn

 tokens, n_experts = 7, 4
@@ -25,14 +24,18 @@ def check_deepseek_moe_layer():
         pp_size=1,
         ep_size=dist.get_world_size(),
     )
-    config = DeepseekConfig(
-        hidden_size=hidden_size,
-        intermediate_size=hidden_size * 2,
-        n_routed_experts=n_experts,
-        num_experts_per_tok=top_k,
-    )
+
+    config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
+    config.num_hidden_layers = 1
+    config.n_routed_experts = n_experts
+    config.num_experts_per_tok = top_k
+    config.hidden_size = hidden_size
+    config.intermediate_size = hidden_size * 2
+    config.first_k_dense_replace = 0
+    config.num_attention_heads = 2
     torch.manual_seed(0)
-    orig_model = DeepseekMoE(config).cuda()
+    # get the moe layer in auto model
+    orig_model = AutoModel.from_config(config, trust_remote_code=True).layers[0].mlp.cuda()
     x = torch.rand(1, tokens, hidden_size, requires_grad=True).cuda()
     orig_output = orig_model(x)
     model = deepcopy(orig_model)
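The test now builds the reference MoE layer from the Hub config instead of the removed local classes: it fetches only the config (plus the remote modeling code), shrinks it to a toy size, and instantiates randomly initialised weights from the config rather than downloading a checkpoint. A condensed sketch of that pattern, assuming network access to the `deepseek-ai/deepseek-moe-16b-base` repo and illustrative toy sizes:

```python
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
# Shrink to a toy model: one layer, few experts, small hidden size.
config.num_hidden_layers = 1
config.n_routed_experts = 4
config.num_experts_per_tok = 2
config.hidden_size = 32
config.intermediate_size = 64
config.first_k_dense_replace = 0   # make layer 0 an MoE layer instead of a dense one
config.num_attention_heads = 2

# from_config builds randomly initialised weights; no checkpoint download.
model = AutoModel.from_config(config, trust_remote_code=True)
moe_layer = model.layers[0].mlp    # the module the EP test wraps and compares against
```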
15 changes: 1 addition & 14 deletions tests/test_moe/test_moe_checkpoint.py
@@ -14,8 +14,6 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.checkpoint_io import MoECheckpointIO
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
-from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import DeepseekForCausalLM
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
 from colossalai.testing import parameterize, spawn
 from colossalai.testing.utils import spawn
@@ -91,21 +89,10 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_grou
                 num_experts_per_tok=top_k,
                 num_attention_heads=2,
                 num_key_value_heads=2,
+                num_hidden_layers=4,
             ),
             MixtralForCausalLM,
         ],
-        [
-            DeepseekConfig(
-                hidden_size=hidden_size,
-                intermediate_size=hidden_size * 2,
-                n_routed_experts=n_experts,
-                num_experts_per_tok=top_k,
-                num_attention_heads=2,
-                num_key_value_heads=2,
-                first_k_dense_replace=4,
-            ),
-            DeepseekForCausalLM,
-        ],
     ],
 )
 def check_moe_checkpoint(test_config):
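The DeepSeek entry disappears from the `@parameterize` list because `DeepseekConfig` and `DeepseekForCausalLM` can no longer be imported once the local modeling file is removed. If DeepSeek coverage is kept in this test, the config would have to be built through the Auto classes instead; a hedged sketch of what such an entry could look like (not taken from this commit, values are illustrative):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Hypothetical replacement entry: build a tiny DeepSeek config via remote code.
deepseek_config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
deepseek_config.hidden_size = 32
deepseek_config.intermediate_size = 64
deepseek_config.n_routed_experts = 4
deepseek_config.num_experts_per_tok = 2
deepseek_config.num_attention_heads = 2
deepseek_config.num_key_value_heads = 2
deepseek_config.first_k_dense_replace = 1
deepseek_config.num_hidden_layers = 2

# The checkpoint test would then instantiate the model with:
# model = AutoModelForCausalLM.from_config(deepseek_config, trust_remote_code=True)
```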