diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index a976d8774b..ecef21c9d4 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -141,7 +141,9 @@ class DescrptDPA1(NativeOP, BaseDescriptor):
         Time-step `dt` in the resnet construction:
         y = x + dt * \phi (Wx + b)
     trainable: bool
-        If the weights of embedding net are trainable.
+        If the weights of this descriptor are trainable.
+    trainable_ln: bool
+        Whether to use trainable shift and scale weights in layer normalization.
     type_one_side: bool
         If 'False', type embeddings of both neighbor and central atoms are considered.
         If 'True', only type embeddings of neighbor atoms are considered.
@@ -222,6 +224,7 @@ def __init__(
         scaling_factor=1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth_type_embedding: bool = True,
         concat_output_tebd: bool = True,
         spin: Optional[Any] = None,
@@ -254,6 +257,7 @@ def __init__(
         self.tebd_input_mode = tebd_input_mode
         self.resnet_dt = resnet_dt
         self.trainable = trainable
+        self.trainable_ln = trainable_ln
         self.type_one_side = type_one_side
         self.attn = attn
         self.attn_layer = attn_layer
@@ -306,6 +310,7 @@ def __init__(
             scaling_factor=self.scaling_factor,
             normalize=self.normalize,
             temperature=self.temperature,
+            trainable_ln=self.trainable_ln,
             smooth=self.smooth,
             precision=self.precision,
         )
@@ -539,6 +544,7 @@ def serialize(self) -> dict:
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "smooth_type_embedding": self.smooth,
             "type_one_side": self.type_one_side,
             "concat_output_tebd": self.concat_output_tebd,
@@ -607,6 +613,7 @@ def __init__(
         scaling_factor: float = 1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth: bool = True,
         precision: str = DEFAULT_PRECISION,
     ):
@@ -621,6 +628,7 @@ def __init__(
         self.scaling_factor = scaling_factor
         self.normalize = normalize
         self.temperature = temperature
+        self.trainable_ln = trainable_ln
         self.smooth = smooth
         self.precision = precision
         self.network_type = NeighborGatedAttentionLayer
@@ -635,6 +643,7 @@ def __init__(
                 scaling_factor=scaling_factor,
                 normalize=normalize,
                 temperature=temperature,
+                trainable_ln=trainable_ln,
                 smooth=smooth,
                 precision=precision,
             )
@@ -690,6 +699,7 @@ def serialize(self):
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "precision": self.precision,
             "attention_layers": [layer.serialize() for layer in self.attention_layers],
         }
@@ -725,6 +735,7 @@ def __init__(
         scaling_factor: float = 1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth: bool = True,
         precision: str = DEFAULT_PRECISION,
     ):
@@ -738,6 +749,7 @@ def __init__(
         self.scaling_factor = scaling_factor
         self.normalize = normalize
         self.temperature = temperature
+        self.trainable_ln = trainable_ln
         self.precision = precision
         self.attention_layer = GatedAttentionLayer(
             nnei,
@@ -751,7 +763,9 @@ def __init__(
             smooth=smooth,
             precision=precision,
         )
-        self.attn_layer_norm = LayerNorm(self.embed_dim, precision=precision)
+        self.attn_layer_norm = LayerNorm(
+            self.embed_dim, trainable=self.trainable_ln, precision=precision
+        )
 
     def call(
         self,
@@ -783,6 +797,7 @@ def serialize(self) -> dict:
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "precision": self.precision,
             "attention_layer": self.attention_layer.serialize(),
             "attn_layer_norm": self.attn_layer_norm.serialize(),
diff --git a/deepmd/dpmodel/utils/network.py b/deepmd/dpmodel/utils/network.py
index 88e97ee3c4..3490b654dd 100644
--- a/deepmd/dpmodel/utils/network.py
+++ b/deepmd/dpmodel/utils/network.py
@@ -399,6 +399,7 @@ def __init__(
         num_in: int,
         eps: float = 1e-5,
         uni_init: bool = True,
+        trainable: bool = True,
         precision: str = DEFAULT_PRECISION,
     ) -> None:
         self.eps = eps
@@ -417,6 +418,8 @@ def __init__(
         if self.uni_init:
             self.w = np.ones_like(self.w)
             self.b = np.zeros_like(self.b)
+        # only to keep consistent with other backends
+        self.trainable = trainable
 
     def serialize(self) -> dict:
         """Serialize the layer to a dict.
@@ -434,6 +437,7 @@ def serialize(self) -> dict:
             "@class": "LayerNorm",
             "@version": 1,
             "eps": self.eps,
+            "trainable": self.trainable,
             "precision": self.precision,
             "@variables": data,
         }
@@ -477,6 +481,8 @@ def __setitem__(self, key, value):
             self.w = value
         elif key in ("b", "bias"):
             self.b = value
+        elif key == "trainable":
+            self.trainable = value
         elif key == "precision":
             self.precision = value
         elif key == "eps":
@@ -489,6 +495,8 @@ def __getitem__(self, key):
             return self.w
         elif key in ("b", "bias"):
             return self.b
+        elif key == "trainable":
+            return self.trainable
         elif key == "precision":
             return self.precision
         elif key == "eps":
@@ -512,21 +520,20 @@ def call(self, x: np.ndarray) -> np.ndarray:
         np.ndarray
             The output.
         """
-        if self.w is None or self.b is None:
-            raise ValueError("w/b must be set")
         y = self.layer_norm_numpy(x, (self.num_in,), self.w, self.b, self.eps)
         return y
 
     @staticmethod
-    def layer_norm_numpy(x, shape, weight, bias, eps):
+    def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5):
         # mean and variance
         mean = np.mean(x, axis=tuple(range(-len(shape), 0)), keepdims=True)
         var = np.var(x, axis=tuple(range(-len(shape), 0)), keepdims=True)
         # normalize
         x_normalized = (x - mean) / np.sqrt(var + eps)
         # shift and scale
-        x_ln = x_normalized * weight + bias
-        return x_ln
+        if weight is not None and bias is not None:
+            x_normalized = x_normalized * weight + bias
+        return x_normalized
 
 
 def make_multilayer_network(T_NetworkLayer, ModuleBase):
diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py
index aec2e75e6e..e95af36674 100644
--- a/deepmd/pt/model/descriptor/dpa1.py
+++ b/deepmd/pt/model/descriptor/dpa1.py
@@ -123,7 +123,9 @@ class DescrptDPA1(BaseDescriptor, torch.nn.Module):
         Time-step `dt` in the resnet construction:
         y = x + dt * \phi (Wx + b)
     trainable: bool
-        If the weights of embedding net are trainable.
+        If the weights of this descriptor are trainable.
+    trainable_ln: bool
+        Whether to use trainable shift and scale weights in layer normalization.
     type_one_side: bool
         If 'False', type embeddings of both neighbor and central atoms are considered.
         If 'True', only type embeddings of neighbor atoms are considered.
@@ -205,6 +207,7 @@ def __init__(
         temperature=None,
         concat_output_tebd: bool = True,
         trainable: bool = True,
+        trainable_ln: bool = True,
         smooth_type_embedding: bool = True,
         type_one_side: bool = False,
         # not implemented
@@ -252,6 +255,7 @@ def __init__(
             type_one_side=type_one_side,
             exclude_types=exclude_types,
             env_protection=env_protection,
+            trainable_ln=trainable_ln,
             old_impl=old_impl,
         )
         self.type_embedding = TypeEmbedNet(ntypes, tebd_dim, precision=precision)
@@ -385,6 +389,7 @@ def serialize(self) -> dict:
             "scaling_factor": obj.scaling_factor,
             "normalize": obj.normalize,
             "temperature": obj.temperature,
+            "trainable_ln": obj.trainable_ln,
             "smooth_type_embedding": obj.smooth,
             "type_one_side": obj.type_one_side,
             "concat_output_tebd": self.concat_output_tebd,
diff --git a/deepmd/pt/model/descriptor/se_atten.py b/deepmd/pt/model/descriptor/se_atten.py
index cfd0a7f95d..d857bc31f7 100644
--- a/deepmd/pt/model/descriptor/se_atten.py
+++ b/deepmd/pt/model/descriptor/se_atten.py
@@ -83,6 +83,7 @@ def __init__(
         type_one_side: bool = False,
         exclude_types: List[Tuple[int, int]] = [],
         env_protection: float = 0.0,
+        trainable_ln: bool = True,
         type: Optional[str] = None,
         old_impl: bool = False,
     ):
@@ -119,6 +120,7 @@ def __init__(
         self.smooth = smooth
         self.type_one_side = type_one_side
         self.env_protection = env_protection
+        self.trainable_ln = trainable_ln
         self.old_impl = old_impl
 
         if isinstance(sel, int):
@@ -157,6 +159,7 @@ def __init__(
                 scaling_factor=self.scaling_factor,
                 normalize=self.normalize,
                 temperature=self.temperature,
+                trainable_ln=self.trainable_ln,
                 smooth=self.smooth,
                 precision=self.precision,
             )
@@ -468,6 +471,7 @@ def __init__(
         scaling_factor: float = 1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth: bool = True,
         precision: str = DEFAULT_PRECISION,
     ):
@@ -482,6 +486,7 @@ def __init__(
         self.scaling_factor = scaling_factor
         self.normalize = normalize
         self.temperature = temperature
+        self.trainable_ln = trainable_ln
         self.smooth = smooth
         self.precision = precision
         self.network_type = NeighborGatedAttentionLayer
@@ -497,7 +502,8 @@ def __init__(
                 scaling_factor=scaling_factor,
                 normalize=normalize,
                 temperature=temperature,
-                smooth=self.smooth,
+                trainable_ln=trainable_ln,
+                smooth=smooth,
                 precision=precision,
             )
         )
@@ -563,6 +569,7 @@ def serialize(self) -> dict:
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "precision": self.precision,
             "attention_layers": [layer.serialize() for layer in self.attention_layers],
         }
@@ -598,6 +605,7 @@ def __init__(
         normalize: bool = True,
         temperature: Optional[float] = None,
         smooth: bool = True,
+        trainable_ln: bool = True,
         precision: str = DEFAULT_PRECISION,
     ):
         """Construct a neighbor-wise attention layer."""
@@ -611,6 +619,7 @@ def __init__(
         self.normalize = normalize
         self.temperature = temperature
         self.precision = precision
+        self.trainable_ln = trainable_ln
         self.attention_layer = GatedAttentionLayer(
             nnei,
             embed_dim,
@@ -623,7 +632,9 @@ def __init__(
             smooth=smooth,
             precision=precision,
         )
-        self.attn_layer_norm = LayerNorm(self.embed_dim, precision=precision)
+        self.attn_layer_norm = LayerNorm(
+            self.embed_dim, trainable=trainable_ln, precision=precision
+        )
 
     def forward(
         self,
@@ -655,6 +666,7 @@ def serialize(self) -> dict:
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "precision": self.precision,
             "attention_layer": self.attention_layer.serialize(),
             "attn_layer_norm": self.attn_layer_norm.serialize(),
diff --git a/deepmd/pt/model/network/layernorm.py b/deepmd/pt/model/network/layernorm.py
index efb4836db7..27b9808010 100644
--- a/deepmd/pt/model/network/layernorm.py
+++ b/deepmd/pt/model/network/layernorm.py
@@ -31,6 +31,7 @@ def __init__(
         bavg: float = 0.0,
         stddev: float = 1.0,
         precision: str = DEFAULT_PRECISION,
+        trainable: bool = True,
     ):
         self.eps = eps
         self.uni_init = uni_init
@@ -50,6 +51,10 @@ def __init__(
         if self.uni_init:
             nn.init.ones_(self.matrix.data)
             nn.init.zeros_(self.bias.data)
+        self.trainable = trainable
+        if not self.trainable:
+            self.matrix.requires_grad = False
+            self.bias.requires_grad = False
 
     def dim_out(self) -> int:
         return self.matrix.shape[0]
@@ -73,10 +78,8 @@ def forward(
         mean = xx.mean(dim=-1, keepdim=True)
         variance = xx.var(dim=-1, unbiased=False, keepdim=True)
         yy = (xx - mean) / torch.sqrt(variance + self.eps)
-        if self.matrix is not None:
-            yy = yy * self.matrix
-        if self.bias is not None:
-            yy = yy + self.bias
+        if self.matrix is not None and self.bias is not None:
+            yy = yy * self.matrix + self.bias
         return yy
 
     def serialize(self) -> dict:
@@ -90,6 +93,7 @@ def serialize(self) -> dict:
         nl = DPLayerNorm(
             self.matrix.shape[0],
             eps=self.eps,
+            trainable=self.trainable,
             precision=self.precision,
         )
         nl.w = to_numpy_array(self.matrix)
@@ -110,6 +114,7 @@ def deserialize(cls, data: dict) -> "LayerNorm":
         obj = cls(
             nl["matrix"].shape[0],
             eps=nl["eps"],
+            trainable=nl["trainable"],
             precision=nl["precision"],
         )
         prec = PRECISION_DICT[obj.precision]
diff --git a/deepmd/tf/descriptor/se_atten.py b/deepmd/tf/descriptor/se_atten.py
index 15db4f5a22..dcf785d6f4 100644
--- a/deepmd/tf/descriptor/se_atten.py
+++ b/deepmd/tf/descriptor/se_atten.py
@@ -181,6 +181,7 @@ def __init__(
         scaling_factor=1.0,
         normalize=True,
         temperature=None,
+        trainable_ln: bool = True,
         concat_output_tebd: bool = True,
         env_protection: float = 0.0,  # not implement!!
         **kwargs,
@@ -233,6 +234,7 @@ def __init__(
             raise ValueError("`model/type_map` is not set or empty!")
         self.stripped_type_embedding = stripped_type_embedding
         self.smooth = smooth_type_embedding
+        self.trainable_ln = trainable_ln
         self.ntypes = ntypes
         self.att_n = attn
         self.attn_layer = attn_layer
@@ -251,12 +253,6 @@ def __init__(
         std_ones = np.ones([self.ntypes, self.ndescrpt]).astype(
             GLOBAL_NP_FLOAT_PRECISION
         )
-        # self.beta = np.zeros([self.attn_layer, self.filter_neuron[-1]]).astype(
-        #     GLOBAL_NP_FLOAT_PRECISION
-        # )
-        # self.gamma = np.ones([self.attn_layer, self.filter_neuron[-1]]).astype(
-        #     GLOBAL_NP_FLOAT_PRECISION
-        # )
         self.attention_layer_variables = None
         sub_graph = tf.Graph()
         with sub_graph.as_default():
@@ -891,38 +887,6 @@ def _lookup_type_embedding(
             return self.embedding_input_2
         return self.embedding_input
 
-    def _feedforward(self, input_xyz, d_in, d_mid):
-        residual = input_xyz
-        input_xyz = tf.nn.relu(
-            one_layer(
-                input_xyz,
-                d_mid,
-                name="c_ffn1",
-                reuse=tf.AUTO_REUSE,
-                seed=self.seed,
-                activation_fn=None,
-                precision=self.filter_precision,
-                trainable=True,
-                uniform_seed=self.uniform_seed,
-                initial_variables=self.attention_layer_variables,
-            )
-        )
-        input_xyz = one_layer(
-            input_xyz,
-            d_in,
-            name="c_ffn2",
-            reuse=tf.AUTO_REUSE,
-            seed=self.seed,
-            activation_fn=None,
-            precision=self.filter_precision,
-            trainable=True,
-            uniform_seed=self.uniform_seed,
-            initial_variables=self.attention_layer_variables,
-        )
-        input_xyz += residual
-        input_xyz = tf.keras.layers.LayerNormalization()(input_xyz)
-        return input_xyz
-
     def _scaled_dot_attn(
         self,
         Q,
@@ -1068,15 +1032,9 @@ def _attention_layers(
                     reuse=tf.AUTO_REUSE,
                     seed=self.seed,
                     uniform_seed=self.uniform_seed,
-                    trainable=trainable,
+                    trainable=self.trainable_ln,
                     initial_variables=self.attention_layer_variables,
                 )
-                # input_xyz = tf.keras.layers.LayerNormalization(
-                #     beta_initializer=tf.constant_initializer(self.beta[i]),
-                #     gamma_initializer=tf.constant_initializer(self.gamma[i]),
-                #     dtype=self.filter_precision,
-                # )(input_xyz)
-                # input_xyz = self._feedforward(input_xyz, outputs_size[-1], self.att_n)
         return input_xyz
 
     def _filter_lower(
@@ -1384,27 +1342,6 @@ def init_variables(
         self.attention_layer_variables = get_attention_layer_variables_from_graph_def(
             graph_def, suffix=suffix
         )
-        # if self.attn_layer > 0:
-        #     self.beta[0] = self.attention_layer_variables[
-        #         f"attention_layer_0{suffix}/layer_normalization/beta"
-        #     ]
-        #     self.gamma[0] = self.attention_layer_variables[
-        #         f"attention_layer_0{suffix}/layer_normalization/gamma"
-        #     ]
-        #     for i in range(1, self.attn_layer):
-        #         self.beta[i] = self.attention_layer_variables[
-        #             f"attention_layer_{i}{suffix}/layer_normalization_{i}/beta"
-        #         ]
-        #         self.gamma[i] = self.attention_layer_variables[
-        #             f"attention_layer_{i}{suffix}/layer_normalization_{i}/gamma"
-        #         ]
-        #     for i in range(self.attn_layer):
-        #         self.beta[i] = self.attention_layer_variables[
-        #             f"attention_layer_{i}{suffix}/layer_normalization/beta"
-        #         ]
-        #         self.gamma[i] = self.attention_layer_variables[
-        #             f"attention_layer_{i}{suffix}/layer_normalization/gamma"
-        #         ]
 
         if self.stripped_type_embedding:
             self.two_side_embeeding_net_variables = (
@@ -1527,6 +1464,7 @@ def serialize_attention_layers(
         hidden_dim: int,
         dotr: bool,
         do_mask: bool,
+        trainable_ln: bool,
         variables: dict,
         bias: bool = True,
         suffix: str = "",
@@ -1538,6 +1476,7 @@ def serialize_attention_layers(
             "hidden_dim": hidden_dim,
             "dotr": dotr,
             "do_mask": do_mask,
+            "trainable_ln": trainable_ln,
             "precision": self.precision.name,
             "attention_layers": [],
         }
@@ -1592,6 +1531,7 @@ def serialize_attention_layers(
 
             layer_norm = LayerNorm(
                 embed_dim,
+                trainable=self.trainable_ln,
                 precision=self.precision.name,
            )
             layer_norm["matrix"] = attention_layer_params[layer_idx][
@@ -1609,6 +1549,7 @@ def serialize_attention_layers(
                         "smooth": self.smooth,
                     },
                     "attn_layer_norm": layer_norm.serialize(),
+                    "trainable_ln": self.trainable_ln,
                 }
             )
         return data
@@ -1778,6 +1719,7 @@ def serialize(self, suffix: str = "") -> dict:
             "activation_function": self.activation_function_name,
             "resnet_dt": self.filter_resnet_dt,
             "smooth_type_embedding": self.smooth,
+            "trainable_ln": self.trainable_ln,
             "precision": self.filter_precision.name,
             "embeddings": self.serialize_network(
                 ntypes=self.ntypes,
@@ -1799,6 +1741,7 @@ def serialize(self, suffix: str = "") -> dict:
                 hidden_dim=self.att_n,
                 dotr=self.attn_dotr,
                 do_mask=self.attn_mask,
+                trainable_ln=self.trainable_ln,
                 variables=self.attention_layer_variables,
                 suffix=suffix,
             ),
@@ -1844,7 +1787,9 @@ class DescrptDPA1Compat(DescrptSeAtten):
         Time-step `dt` in the resnet construction:
         y = x + dt * \phi (Wx + b)
     trainable: bool
-        If the weights of embedding net are trainable.
+        If the weights of this descriptor are trainable.
+    trainable_ln: bool
+        Whether to use trainable shift and scale weights in layer normalization.
     type_one_side: bool
         If 'False', type embeddings of both neighbor and central atoms are considered.
         If 'True', only type embeddings of neighbor atoms are considered.
@@ -1917,6 +1862,7 @@ def __init__(
         scaling_factor=1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth_type_embedding: bool = True,
         concat_output_tebd: bool = True,
         spin: Optional[Any] = None,
@@ -1963,6 +1909,7 @@ def __init__(
             attn_mask=attn_mask,
             multi_task=True,
             stripped_type_embedding=False,
+            trainable_ln=trainable_ln,
             smooth_type_embedding=smooth_type_embedding,
             env_protection=env_protection,
         )
diff --git a/deepmd/tf/utils/network.py b/deepmd/tf/utils/network.py
index 916f783050..7918b58d0c 100644
--- a/deepmd/tf/utils/network.py
+++ b/deepmd/tf/utils/network.py
@@ -200,177 +200,6 @@ def layernorm(
     return output
 
 
-# class LayerNormCompat:
-#     """Implementation of Layer Normalization layer for testing with other backend references.
-#
-#     Parameters
-#     ----------
-#     num_in : int
-#         The input dimension of the layer.
-#     eps : float, optional
-#         A small value added to prevent division by zero in calculations.
-#     uni_init : bool, optional
-#         If initialize the weights to be zeros and ones.
-#     precision : str, optional
-#         The precision of the layer parameters. Supported options are |PRECISION|
-#     """
-#
-#     def __init__(
-#         self,
-#         num_in: int,
-#         eps: float = 1e-5,
-#         uni_init: bool = True,
-#         precision: str = "default",
-#     ) -> None:
-#         self.eps = eps
-#         self.uni_init = uni_init
-#         self.num_in = num_in
-#         self.filter_precision = get_precision(precision)
-#         self.layer_norm_variables = None
-#
-#     def build(
-#         self,
-#         inputs,
-#         input_shape: List[int],
-#         reuse=None,
-#         suffix="",
-#     ):
-#         """Build the computational graph for the layer normalization.
-#
-#         Parameters
-#         ----------
-#         input_shape
-#             The shape of the input tensor.
-#         reuse
-#             The weights in the networks should be reused when get the variable.
-#         suffix
-#             Name suffix to identify this layer
-#
-#         Returns
-#         -------
-#         normalized_output
-#             The computational graph for the normalized output
-#         """
-#         assert input_shape[-1] == self.num_in
-#         name = "layer_norm" + suffix
-#         with tf.variable_scope(name, reuse=reuse):
-#             gamma = tf.get_variable(
-#                 "gamma",
-#                 shape=[self.num_in],
-#                 initializer=tf.ones_initializer(),
-#                 dtype=self.filter_precision,
-#                 trainable=True,
-#             )
-#             beta = tf.get_variable(
-#                 "beta",
-#                 shape=[self.num_in],
-#                 initializer=tf.zeros_initializer(),
-#                 dtype=self.filter_precision,
-#                 trainable=True,
-#             )
-#         normalized_output = tf.contrib.layers.layer_norm(
-#             inputs=input,
-#             begin_norm_axis=-1,
-#             begin_params_axis=-1,
-#             epsilon=self.eps,
-#             activation_fn=None,
-#             param_initializers={
-#                 "gamma": tf.ones_initializer(),
-#                 "beta": tf.zeros_initializer(),
-#             },
-#             trainable=True,
-#             reuse=reuse,
-#             variables_collections=None,
-#             outputs_collections=None,
-#             data_format="NHWC",
-#             name=name,
-#         )
-#         return normalized_output
-#
-#     def init_variables(
-#         self,
-#         graph: tf.Graph,
-#         graph_def: tf.GraphDef,
-#         suffix="",
-#         model_type="original_model",
-#     ) -> None:
-#         """Init the layer norm variables with the given dict.
-#
-#         Parameters
-#         ----------
-#         graph : tf.Graph
-#             The input frozen model graph
-#         graph_def : tf.GraphDef
-#             The input frozen model graph_def
-#         suffix
-#             Name suffix to identify this layer
-#         model_type
-#             Indicator of whether this model is a compressed model
-#         """
-#         self.layer_norm_variables = get_layer_norm_variables_from_graph_def(
-#             graph_def, suffix=suffix
-#         )
-#
-#     @classmethod
-#     def deserialize(cls, data: dict, suffix: str = ""):
-#         """Deserialize the layer from a dict.
-#
-#         Parameters
-#         ----------
-#         data : dict
-#             The dict to deserialize from.
-#         suffix : str, optional
-#             The suffix of the scope
-#
-#         Returns
-#         -------
-#         LayerNorm
-#             The deserialized layer
-#         """
-#         data = data.copy()
-#         check_version_compatibility(data.pop("@version", 1), 1, 1)
-#         data_cls = data.pop("@class")
-#         assert data_cls == "LayerNorm", f"Invalid class {data_cls}"
-#         variables = data.pop("@variables")
-#         obj = cls(
-#             num_in=variables["w"].shape[0],
-#             eps=data.pop("eps"),
-#             precision=data.pop("precision"),
-#         )
-#         obj.layer_norm_variables = {
-#             f"layer_norm{suffix}/gamma": variables["w"],
-#             f"layer_norm{suffix}/beta": variables["b"],
-#         }
-#         return obj
-#
-#     def serialize(self, suffix: str = "") -> dict:
-#         """Serialize the layer to a dict.
-#
-#         Parameters
-#         ----------
-#         suffix : str, optional
-#             The suffix of the scope
-#
-#         Returns
-#         -------
-#         dict
-#             The serialized layer.
-#         """
-#         assert self.layer_norm_variables is not None
-#         gamma = self.layer_norm_variables[f"layer_norm{suffix}/gamma"]
-#         beta = self.layer_norm_variables[f"layer_norm{suffix}/beta"]
-#         return {
-#             "@class": "LayerNorm",
-#             "@version": 1,
-#             "eps": self.eps,
-#             "precision": self.filter_precision.name,
-#             "@variables": {
-#                 "w": gamma,
-#                 "b": beta,
-#             },
-#         }
-
-
 def embedding_net_rand_seed_shift(network_size):
     shift = 3 * (len(network_size) + 1)
     return shift
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index c51900e9a9..8dd2be2b6b 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -478,6 +478,9 @@ def descrpt_se_atten_args():
     doc_stripped_type_embedding = "Whether to strip the type embedding into a separated embedding network. Setting it to `False` will fall back to the previous version of `se_atten` which is non-compressible."
     doc_smooth_type_embedding = f"Whether to use smooth process in attention weights calculation. {doc_only_tf_supported} When using stripped type embedding, whether to dot smooth factor on the network output of type embedding to keep the network smooth, instead of setting `set_davg_zero` to be True."
     doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `se_atten` descriptor or `atom_ener` in the energy fitting is used"
+    doc_trainable_ln = (
+        "Whether to use trainable shift and scale weights in layer normalization."
+    )
     doc_tebd_dim = "The dimension of atom type embedding."
     doc_temperature = "The scaling factor of normalization in calculations of attention weights, which is used to scale the matmul(Q, K)."
     doc_scaling_factor = (
@@ -507,11 +510,15 @@ def descrpt_se_atten_args():
             bool,
             optional=True,
             default=False,
+            alias=["smooth_type_embdding"],
             doc=doc_smooth_type_embedding,
         ),
         Argument(
             "set_davg_zero", bool, optional=True, default=True, doc=doc_set_davg_zero
         ),
+        Argument(
+            "trainable_ln", bool, optional=True, default=True, doc=doc_trainable_ln
+        ),
         # pt only
         Argument(
             "tebd_dim",
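Note (illustration, not part of the patch): the argcheck hunk above exposes `trainable_ln` next to `set_davg_zero`, defaulting to `true`, and the numpy backend now treats the affine parameters of `layer_norm_numpy` as optional. Because `uni_init` initializes the scale and shift to ones and zeros, a frozen layer norm (`trainable_ln: false`, with no pre-trained values loaded) reduces to a plain normalization followed by an identity affine transform. A minimal standalone sketch of that behavior, reusing the names from the diff but independent of deepmd:

    # Illustrative only -- mirrors the semantics of LayerNorm.layer_norm_numpy
    # after this change: weight/bias are optional, and the uni_init values
    # (ones/zeros) make the shift-and-scale step a no-op.
    import numpy as np


    def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5):
        axes = tuple(range(-len(shape), 0))
        mean = np.mean(x, axis=axes, keepdims=True)
        var = np.var(x, axis=axes, keepdims=True)
        x_normalized = (x - mean) / np.sqrt(var + eps)
        if weight is not None and bias is not None:
            # shift and scale; skipped entirely when no affine parameters are given
            x_normalized = x_normalized * weight + bias
        return x_normalized


    rng = np.random.default_rng(0)
    x = rng.normal(size=(4, 8))
    w, b = np.ones(8), np.zeros(8)  # uni_init / frozen (trainable_ln: false) parameters
    out = layer_norm_numpy(x, (8,), w, b)
    assert np.allclose(out, layer_norm_numpy(x, (8,)))  # identity affine == no affine

The equality only holds for the ones/zeros initialization; once the parameters are actually trained (`trainable_ln` left at its default of `true`), the two calls differ.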