feat(modeling_internlm2.py): update model type to INTERNLM2_PUBLIC #69

Merged · 3 commits · Mar 1, 2024
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_20B.py
@@ -1,6 +1,6 @@
 # Copyright (c) InternLM. All rights reserved.

-model_type = "INTERNLM2"
+model_type = "INTERNLM2_PUBLIC"

 VOCAB_SIZE = 92544
 HIDDEN_SIZE = 6144
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_7B.py
@@ -1,6 +1,6 @@
 # Copyright (c) InternLM. All rights reserved.

-model_type = "INTERNLM2"
+model_type = "INTERNLM2_PUBLIC"

 VOCAB_SIZE = 92544
 HIDDEN_SIZE = 4096
14 changes: 12 additions & 2 deletions internlm/model/modeling_internlm2.py
@@ -49,7 +49,7 @@
 from internlm.utils.logger import get_logger
 from internlm.utils.registry import MODEL_INITIALIZER

-MODEL_TYPE = "INTERNLM2"
+MODEL_TYPE = "INTERNLM2_PUBLIC"

 logger = get_logger(__file__)
 RMSNorm = try_import_RMSNorm()
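Note for reviewers: the `model_type` string in the configs selects the model builder registered under `MODEL_TYPE` via `MODEL_INITIALIZER`, which is why both the configs and `modeling_internlm2.py` are changed in lockstep here. A minimal sketch of that string-keyed registry pattern is below; the `register_module`/`get_module` names are placeholders for illustration, not the exact `internlm.utils.registry` API.

```python
# Minimal sketch of a string-keyed model registry. The method names below are
# illustrative assumptions, not the real internlm.utils.registry interface.
class Registry:
    def __init__(self):
        self._modules = {}

    def register_module(self, name):
        def decorator(fn):
            self._modules[name] = fn  # map the model_type string to its builder
            return fn
        return decorator

    def get_module(self, name):
        if name not in self._modules:
            raise KeyError(f"Unknown model_type: {name!r}")
        return self._modules[name]


MODEL_INITIALIZER = Registry()
MODEL_TYPE = "INTERNLM2_PUBLIC"


@MODEL_INITIALIZER.register_module(MODEL_TYPE)
def build_model_with_cfg(**kwargs):
    ...


# A config that still said model_type = "INTERNLM2" would now fail the lookup,
# so the config files must use the new string as well.
builder = MODEL_INITIALIZER.get_module("INTERNLM2_PUBLIC")
```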
@@ -786,6 +786,7 @@ class PackedFlashLlama1D(nn.Module):
         out_head_init_std (float): std used to init output lmhead weight. 0.02 by default,
         init_type (str): Initialization type. Use uniform or normal. "normal" by default,
         rope_base (int): The value of `base` for rotary position embeddings. 10000 by default.
+        norm_head (bool): Whether to use norm head. False by default.
         tp_mode (str): The string value of tensor parallel mode, should be in ["mtp", "msp", "fsp", "isp"],
                        "mtp" by default.
     """
@@ -831,6 +832,7 @@ def __init__(
         out_head_init_std: float = 0.02,
         init_type: str = "normal",
         rope_base: int = 10000,
+        norm_head: bool = False,
         tp_mode: str = "mtp",
     ):
         super().__init__()
@@ -921,14 +923,20 @@ def __init__(
         else:
             self.norm = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)

-        self.output = head_cls(
+        if norm_head and not issubclass(head_cls, InternLM2ScaleColumnParallelLinear):
+            raise TypeError(
+                "Parameter ``norm_head`` should only be True when head_cls is "
+                f"``InternLM2ScaleColumnParallelLinear``, instead of {head_cls}."
+            )
+        self.output = head_cls(  # pylint: disable=E1123
             in_features=hidden_size,
             out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size,
             process_group=gpc.get_group(ParallelMode.TENSOR),
             bias=False,
             device=device,
             dtype=dtype,
             weight_scale=embed_grad_scale,
+            norm_head=norm_head,
         )
         for _, param in self.output.named_parameters():
             if init_type == "normal":
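For context on the new flag: `norm_head` presumably normalizes the lm-head weight rows before the logits projection (in the style of Baichuan2's NormHead), which is why the guard above restricts it to `InternLM2ScaleColumnParallelLinear`. The sketch below illustrates that idea under this assumption; it is not a copy of the repo's implementation, which additionally handles tensor parallelism and `weight_scale`.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


# Rough sketch of a "norm head" output projection. Assumption: norm_head=True
# L2-normalizes each vocabulary row of the head weight before computing logits,
# so logits depend on the direction of the row rather than its magnitude.
class NormHeadLinear(nn.Module):
    def __init__(self, in_features: int, out_features: int, norm_head: bool = False):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.normal_(self.weight, std=0.02)
        self.norm_head = norm_head

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        weight = self.weight
        if self.norm_head:
            # Normalize each row of the vocab projection to unit L2 norm.
            weight = F.normalize(weight, dim=-1)
        return F.linear(x, weight)


# Usage: hidden states (batch, seq, hidden) -> logits over the vocabulary.
head = NormHeadLinear(in_features=4096, out_features=92544, norm_head=True)
logits = head(torch.randn(2, 16, 4096))
```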
@@ -1068,6 +1076,7 @@ def build_model_with_cfg(
     out_head_init_std: float = 0.02,
     init_type: str = "normal",
     rope_base: int = 10000,
+    norm_head: bool = False,
     max_position_embeddings=2048,
     use_dynamic_ntk_rope=False,
 ):
@@ -1147,6 +1156,7 @@ def build_model_with_cfg(
         out_head_init_std=out_head_init_std,
         init_type=init_type,
         rope_base=rope_base,
+        norm_head=norm_head,
         max_position_embeddings=max_position_embeddings,
         use_dynamic_ntk_rope=use_dynamic_ntk_rope,
     )