[Feature] remove modeling file, use auto config. #5884

Merged · 5 commits · Jul 4, 2024
2 changes: 1 addition & 1 deletion colossalai/cluster/process_group_mesh.py
@@ -147,7 +147,7 @@ def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) ->
ProcessGroup: The process group with the given ranks.
"""
ranks_in_group = sorted(ranks_in_group)
- if tuple(ranks_in_group) not in self._group_to_ranks:
+ if tuple(ranks_in_group) not in self._ranks_to_group:
group = dist.new_group(ranks_in_group, backend=backend)
self._ranks_to_group[tuple(ranks_in_group)] = group
self._group_to_ranks[group] = tuple(ranks_in_group)
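For context on the one-line fix above: `_ranks_to_group` maps rank tuples to process groups, while `_group_to_ranks` is the reverse map keyed by `ProcessGroup` objects. Testing a rank tuple against `_group_to_ranks` therefore appears to never match, so a fresh group would be created on every call; checking the rank-keyed dict reuses the cache. A minimal sketch of the corrected caching pattern, using the names from the diff (the rest of the class is omitted, and `torch.distributed` must already be initialized):

```python
from typing import List, Optional

import torch.distributed as dist


class ProcessGroupCacheSketch:
    """Sketch of the group cache touched by this diff; not the full ProcessGroupMesh."""

    def __init__(self):
        self._ranks_to_group = {}  # tuple of ranks -> ProcessGroup
        self._group_to_ranks = {}  # ProcessGroup -> tuple of ranks

    def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None):
        ranks_in_group = sorted(ranks_in_group)
        # Look up the rank-keyed dict; the reverse dict is keyed by ProcessGroup
        # objects, so probing it with a rank tuple would always miss and a new
        # group would be created on every call.
        if tuple(ranks_in_group) not in self._ranks_to_group:
            group = dist.new_group(ranks_in_group, backend=backend)
            self._ranks_to_group[tuple(ranks_in_group)] = group
            self._group_to_ranks[group] = tuple(ranks_in_group)
        return self._ranks_to_group[tuple(ranks_in_group)]
```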
51 changes: 33 additions & 18 deletions colossalai/shardformer/modeling/deepseek.py
@@ -2,32 +2,47 @@

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed import ProcessGroup

# from colossalai.tensor.moe_tensor.moe_info import MoeParallelInfo
from torch.nn import CrossEntropyLoss
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import is_flash_attn_2_available, logging

from colossalai.lazy import LazyInitContext
from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler, all_to_all_uneven
from colossalai.pipeline.stage_manager import PipelineStageManager
- from colossalai.shardformer.modeling.deepseek_moe_16b_base.configuration_deepseek import DeepseekConfig
- from colossalai.shardformer.modeling.deepseek_moe_16b_base.modeling_deepseek import (
-     AddAuxiliaryLoss,
-     CausalLMOutputWithPast,
-     DeepseekForCausalLM,
-     DeepseekMLP,
-     DeepseekModel,
-     DeepseekMoE,
- )
from colossalai.shardformer.shard import ShardConfig
from colossalai.shardformer.shard.utils import set_tensors_to_none


- class EPDeepseekMoE(DeepseekMoE):
-     def __init__(self, config: DeepseekConfig):
-         super().__init__(config)
# copied from modeling_deepseek.py
class AddAuxiliaryLoss(torch.autograd.Function):
"""
The trick function of adding auxiliary (aux) loss,
which includes the gradient of the aux loss during backpropagation.
"""

@staticmethod
def forward(ctx, x, loss):
assert loss.numel() == 1
ctx.dtype = loss.dtype
ctx.required_aux_loss = loss.requires_grad
return x

@staticmethod
def backward(ctx, grad_output):
grad_loss = None
if ctx.required_aux_loss:
grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
return grad_output, grad_loss


class EPDeepseekMoE(nn.Module):
def __init__(self):
super(EPDeepseekMoE, self).__init__()

def setup_ep(self, ep_group: ProcessGroup):
ep_group = ep_group
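A quick illustration of how the copied `AddAuxiliaryLoss` is meant to be used: the forward pass returns `x` untouched, and the backward pass injects a gradient of 1 into the auxiliary loss, so the router's load-balancing term is trained without changing the main loss value. The snippet below is a hedged usage sketch with made-up tensors; in the model, the actual call site lives in the MoE forward of the remote `modeling_deepseek.py`.

```python
import torch

# Stand-in tensors; in the model, aux_loss is the router's load-balancing loss.
hidden_states = torch.randn(2, 8, requires_grad=True)
aux_loss = torch.full((1,), 0.01, requires_grad=True)

# Forward is an identity on hidden_states; backward also returns d(aux_loss) = 1.0.
out = AddAuxiliaryLoss.apply(hidden_states, aux_loss)
out.sum().backward()

print(torch.equal(out, hidden_states))  # True: values pass through unchanged
print(aux_loss.grad)                    # tensor([1.]): the aux loss receives gradient
```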
@@ -44,9 +59,9 @@ def setup_ep(self, ep_group: ProcessGroup):
p.ep_group = ep_group

@staticmethod
- def from_native_module(module: Union[DeepseekMoE, DeepseekMLP], *args, **kwargs) -> "EPDeepseekMoE":
+ def from_native_module(module: Union["DeepseekMoE", "DeepseekMLP"], *args, **kwargs) -> "EPDeepseekMoE":
LazyInitContext.materialize(module)
- if isinstance(module, DeepseekMLP):
+ if module.__class__.__name__ == "DeepseekMLP":
return module
module.__class__ = EPDeepseekMoE
assert "ep_group" in kwargs, "You should pass ep_group in SubModuleReplacementDescription via shard_config!!"
@@ -120,7 +135,7 @@ class DeepseekPipelineForwards:

@staticmethod
def deepseek_model_forward(
- self: DeepseekModel,
+ self: "DeepseekModel",
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
Expand All @@ -147,9 +162,9 @@ def deepseek_model_forward(
Example:

```python
- >>> from transformers import AutoTokenizer, DeepseekForCausalLM
+ >>> from transformers import AutoTokenizer, AutoModelForCausalLM

- >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+ >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

>>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -303,7 +318,7 @@ def custom_forward(*inputs):

@staticmethod
def deepseek_for_causal_lm_forward(
- self: DeepseekForCausalLM,
+ self: "DeepseekForCausalLM",
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
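The docstring edits above (`DeepseekForCausalLM` → `AutoModelForCausalLM`) reflect the overall direction of this PR: rather than vendoring a modeling file, the model is resolved through the `Auto` classes from the checkpoint's own configuration. A minimal loading sketch, assuming a DeepSeek-MoE checkpoint that ships its remote code (the path below is a placeholder, not a real model ID):

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "path/to/deepseek-moe-checkpoint"  # placeholder

# trust_remote_code=True lets transformers import the configuration_deepseek.py /
# modeling_deepseek.py shipped with the checkpoint, so no local copy is required.
config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
```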
Empty file.

This file was deleted.
