Merge branch 'feature/huggingface#35425' of github.com:bzantium/transformers into feature/huggingface#35425
arthur@huggingface.co committed Feb 14, 2025
2 parents e0a49ac + 4df42f0 commit dfd9abc
Showing 3 changed files with 243 additions and 393 deletions.
11 changes: 9 additions & 2 deletions src/transformers/models/deepseek_v3/configuration_deepseek_v3.py
@@ -135,11 +135,18 @@ class DeepseekV3Config(PretrainedConfig):
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `DeepseekV3Model`
    base_model_tp_plan = {
        "layers.*.self_attn.q_b_proj": "colwise",
        "layers.*.self_attn.kv_b_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.gate_proj": "colwise",
        "layers.*.up_proj": "colwise",
        "layers.*.down_proj": "rowwise",
    }

    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=129280,
@@ -194,7 +201,7 @@ def __init__(
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
-       self.q_head_dim = qk_nope_head_dim + qk_rope_head_dim
+       self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
        self.head_dim = qk_rope_head_dim
        self.n_group = n_group
        self.topk_group = topk_group
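The `base_model_tp_plan` and `base_model_pp_plan` attributes added above describe how `DeepseekV3Model` can be sharded: the TP plan maps projection weights to column- or row-wise splits, and the PP plan declares each top-level module's input and output tensors for pipeline stages. A minimal sketch of how a TP plan is typically consumed when loading under `torchrun`; the checkpoint name is illustrative and not taken from this commit:

```python
# Sketch only: run under `torchrun --nproc-per-node=<gpus>` with a
# transformers version that supports tp_plan and includes deepseek_v3.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "deepseek-ai/DeepSeek-V3"  # illustrative checkpoint name

# tp_plan="auto" picks up base_model_tp_plan from the model config and
# shards the "colwise"/"rowwise" weights across the available GPUs.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    tp_plan="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=8)[0]))
```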
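The second hunk renames `q_head_dim` to `qk_head_dim`: the per-head query/key width is the sum of the non-rotary and rotary sub-dimensions. A quick sketch of the arithmetic, assuming the published DeepSeek-V3 defaults of `qk_nope_head_dim=128` and `qk_rope_head_dim=64` (the defaults are not shown in this hunk):

```python
# Sketch: the renamed attribute is derived from two sub-dimensions,
# not set directly by the user.
from transformers import DeepseekV3Config

config = DeepseekV3Config()  # stock defaults
assert config.qk_head_dim == config.qk_nope_head_dim + config.qk_rope_head_dim
print(config.qk_head_dim)  # 128 + 64 = 192 with the assumed defaults
```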
