diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index a976d8774b..ecef21c9d4 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -141,7 +141,9 @@ class DescrptDPA1(NativeOP, BaseDescriptor):
         Time-step `dt` in the resnet construction:
         y = x + dt * \phi (Wx + b)
     trainable: bool
-        If the weights of embedding net are trainable.
+        If the weights of this descriptor are trainable.
+    trainable_ln: bool
+        Whether to use trainable shift and scale weights in layer normalization.
     type_one_side: bool
         If 'False', type embeddings of both neighbor and central atoms are considered.
         If 'True', only type embeddings of neighbor atoms are considered.
@@ -222,6 +224,7 @@ def __init__(
         scaling_factor=1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth_type_embedding: bool = True,
         concat_output_tebd: bool = True,
         spin: Optional[Any] = None,
@@ -254,6 +257,7 @@ def __init__(
         self.tebd_input_mode = tebd_input_mode
         self.resnet_dt = resnet_dt
         self.trainable = trainable
+        self.trainable_ln = trainable_ln
         self.type_one_side = type_one_side
         self.attn = attn
         self.attn_layer = attn_layer
@@ -306,6 +310,7 @@ def __init__(
             scaling_factor=self.scaling_factor,
             normalize=self.normalize,
             temperature=self.temperature,
+            trainable_ln=self.trainable_ln,
             smooth=self.smooth,
             precision=self.precision,
         )
@@ -539,6 +544,7 @@ def serialize(self) -> dict:
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "smooth_type_embedding": self.smooth,
             "type_one_side": self.type_one_side,
             "concat_output_tebd": self.concat_output_tebd,
@@ -607,6 +613,7 @@ def __init__(
         scaling_factor: float = 1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth: bool = True,
         precision: str = DEFAULT_PRECISION,
     ):
@@ -621,6 +628,7 @@ def __init__(
         self.scaling_factor = scaling_factor
         self.normalize = normalize
         self.temperature = temperature
+        self.trainable_ln = trainable_ln
         self.smooth = smooth
         self.precision = precision
         self.network_type = NeighborGatedAttentionLayer
@@ -635,6 +643,7 @@ def __init__(
                 scaling_factor=scaling_factor,
                 normalize=normalize,
                 temperature=temperature,
+                trainable_ln=trainable_ln,
                 smooth=smooth,
                 precision=precision,
             )
@@ -690,6 +699,7 @@ def serialize(self):
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "precision": self.precision,
             "attention_layers": [layer.serialize() for layer in self.attention_layers],
         }
@@ -725,6 +735,7 @@ def __init__(
         scaling_factor: float = 1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth: bool = True,
         precision: str = DEFAULT_PRECISION,
     ):
@@ -738,6 +749,7 @@ def __init__(
         self.scaling_factor = scaling_factor
         self.normalize = normalize
         self.temperature = temperature
+        self.trainable_ln = trainable_ln
         self.precision = precision
         self.attention_layer = GatedAttentionLayer(
             nnei,
@@ -751,7 +763,9 @@ def __init__(
             smooth=smooth,
             precision=precision,
         )
-        self.attn_layer_norm = LayerNorm(self.embed_dim, precision=precision)
+        self.attn_layer_norm = LayerNorm(
+            self.embed_dim, trainable=self.trainable_ln, precision=precision
+        )
 
     def call(
         self,
@@ -783,6 +797,7 @@ def serialize(self) -> dict:
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "precision": self.precision,
             "attention_layer": self.attention_layer.serialize(),
             "attn_layer_norm": self.attn_layer_norm.serialize(),
diff --git a/deepmd/dpmodel/utils/network.py b/deepmd/dpmodel/utils/network.py
index 88e97ee3c4..3490b654dd 100644
--- a/deepmd/dpmodel/utils/network.py
+++ b/deepmd/dpmodel/utils/network.py
@@ -399,6 +399,7 @@ def __init__(
         num_in: int,
         eps: float = 1e-5,
         uni_init: bool = True,
+        trainable: bool = True,
         precision: str = DEFAULT_PRECISION,
     ) -> None:
         self.eps = eps
@@ -417,6 +418,8 @@ def __init__(
         if self.uni_init:
             self.w = np.ones_like(self.w)
             self.b = np.zeros_like(self.b)
+        # only to keep consistent with other backends
+        self.trainable = trainable
 
     def serialize(self) -> dict:
         """Serialize the layer to a dict.
@@ -434,6 +437,7 @@ def serialize(self) -> dict:
             "@class": "LayerNorm",
             "@version": 1,
             "eps": self.eps,
+            "trainable": self.trainable,
             "precision": self.precision,
             "@variables": data,
         }
@@ -477,6 +481,8 @@ def __setitem__(self, key, value):
             self.w = value
         elif key in ("b", "bias"):
             self.b = value
+        elif key == "trainable":
+            self.trainable = value
         elif key == "precision":
             self.precision = value
         elif key == "eps":
@@ -489,6 +495,8 @@ def __getitem__(self, key):
             return self.w
         elif key in ("b", "bias"):
             return self.b
+        elif key == "trainable":
+            return self.trainable
         elif key == "precision":
             return self.precision
         elif key == "eps":
@@ -512,21 +520,20 @@ def call(self, x: np.ndarray) -> np.ndarray:
         np.ndarray
             The output.
         """
-        if self.w is None or self.b is None:
-            raise ValueError("w/b must be set")
         y = self.layer_norm_numpy(x, (self.num_in,), self.w, self.b, self.eps)
         return y
 
     @staticmethod
-    def layer_norm_numpy(x, shape, weight, bias, eps):
+    def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5):
         # mean and variance
         mean = np.mean(x, axis=tuple(range(-len(shape), 0)), keepdims=True)
         var = np.var(x, axis=tuple(range(-len(shape), 0)), keepdims=True)
         # normalize
         x_normalized = (x - mean) / np.sqrt(var + eps)
         # shift and scale
-        x_ln = x_normalized * weight + bias
-        return x_ln
+        if weight is not None and bias is not None:
+            x_normalized = x_normalized * weight + bias
+        return x_normalized
 
 
 def make_multilayer_network(T_NetworkLayer, ModuleBase):
diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py
index aec2e75e6e..e95af36674 100644
--- a/deepmd/pt/model/descriptor/dpa1.py
+++ b/deepmd/pt/model/descriptor/dpa1.py
@@ -123,7 +123,9 @@ class DescrptDPA1(BaseDescriptor, torch.nn.Module):
         Time-step `dt` in the resnet construction:
         y = x + dt * \phi (Wx + b)
     trainable: bool
-        If the weights of embedding net are trainable.
+        If the weights of this descriptor are trainable.
+    trainable_ln: bool
+        Whether to use trainable shift and scale weights in layer normalization.
     type_one_side: bool
         If 'False', type embeddings of both neighbor and central atoms are considered.
         If 'True', only type embeddings of neighbor atoms are considered.
@@ -205,6 +207,7 @@ def __init__(
         temperature=None,
         concat_output_tebd: bool = True,
         trainable: bool = True,
+        trainable_ln: bool = True,
         smooth_type_embedding: bool = True,
         type_one_side: bool = False,
         # not implemented
@@ -252,6 +255,7 @@ def __init__(
             type_one_side=type_one_side,
             exclude_types=exclude_types,
             env_protection=env_protection,
+            trainable_ln=trainable_ln,
             old_impl=old_impl,
         )
         self.type_embedding = TypeEmbedNet(ntypes, tebd_dim, precision=precision)
@@ -385,6 +389,7 @@ def serialize(self) -> dict:
             "scaling_factor": obj.scaling_factor,
             "normalize": obj.normalize,
             "temperature": obj.temperature,
+            "trainable_ln": obj.trainable_ln,
             "smooth_type_embedding": obj.smooth,
             "type_one_side": obj.type_one_side,
             "concat_output_tebd": self.concat_output_tebd,
diff --git a/deepmd/pt/model/descriptor/se_atten.py b/deepmd/pt/model/descriptor/se_atten.py
index cfd0a7f95d..d857bc31f7 100644
--- a/deepmd/pt/model/descriptor/se_atten.py
+++ b/deepmd/pt/model/descriptor/se_atten.py
@@ -83,6 +83,7 @@ def __init__(
         type_one_side: bool = False,
         exclude_types: List[Tuple[int, int]] = [],
         env_protection: float = 0.0,
+        trainable_ln: bool = True,
         type: Optional[str] = None,
         old_impl: bool = False,
     ):
@@ -119,6 +120,7 @@ def __init__(
         self.smooth = smooth
         self.type_one_side = type_one_side
         self.env_protection = env_protection
+        self.trainable_ln = trainable_ln
         self.old_impl = old_impl
 
         if isinstance(sel, int):
@@ -157,6 +159,7 @@ def __init__(
                 scaling_factor=self.scaling_factor,
                 normalize=self.normalize,
                 temperature=self.temperature,
+                trainable_ln=self.trainable_ln,
                 smooth=self.smooth,
                 precision=self.precision,
             )
@@ -468,6 +471,7 @@ def __init__(
         scaling_factor: float = 1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth: bool = True,
         precision: str = DEFAULT_PRECISION,
     ):
@@ -482,6 +486,7 @@ def __init__(
         self.scaling_factor = scaling_factor
         self.normalize = normalize
         self.temperature = temperature
+        self.trainable_ln = trainable_ln
         self.smooth = smooth
         self.precision = precision
         self.network_type = NeighborGatedAttentionLayer
@@ -497,7 +502,8 @@ def __init__(
                 scaling_factor=scaling_factor,
                 normalize=normalize,
                 temperature=temperature,
-                smooth=self.smooth,
+                trainable_ln=trainable_ln,
+                smooth=smooth,
                 precision=precision,
             )
         )
@@ -563,6 +569,7 @@ def serialize(self) -> dict:
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "precision": self.precision,
             "attention_layers": [layer.serialize() for layer in self.attention_layers],
         }
@@ -598,6 +605,7 @@ def __init__(
         normalize: bool = True,
         temperature: Optional[float] = None,
         smooth: bool = True,
+        trainable_ln: bool = True,
         precision: str = DEFAULT_PRECISION,
     ):
         """Construct a neighbor-wise attention layer."""
@@ -611,6 +619,7 @@ def __init__(
         self.normalize = normalize
         self.temperature = temperature
         self.precision = precision
+        self.trainable_ln = trainable_ln
         self.attention_layer = GatedAttentionLayer(
             nnei,
             embed_dim,
@@ -623,7 +632,9 @@ def __init__(
             smooth=smooth,
             precision=precision,
         )
-        self.attn_layer_norm = LayerNorm(self.embed_dim, precision=precision)
+        self.attn_layer_norm = LayerNorm(
+            self.embed_dim, trainable=trainable_ln, precision=precision
+        )
 
     def forward(
         self,
@@ -655,6 +666,7 @@ def serialize(self) -> dict:
             "scaling_factor": self.scaling_factor,
             "normalize": self.normalize,
             "temperature": self.temperature,
+            "trainable_ln": self.trainable_ln,
             "precision": self.precision,
             "attention_layer": self.attention_layer.serialize(),
             "attn_layer_norm": self.attn_layer_norm.serialize(),
diff --git a/deepmd/pt/model/network/layernorm.py b/deepmd/pt/model/network/layernorm.py
index efb4836db7..27b9808010 100644
--- a/deepmd/pt/model/network/layernorm.py
+++ b/deepmd/pt/model/network/layernorm.py
@@ -31,6 +31,7 @@ def __init__(
         bavg: float = 0.0,
         stddev: float = 1.0,
         precision: str = DEFAULT_PRECISION,
+        trainable: bool = True,
     ):
         self.eps = eps
         self.uni_init = uni_init
@@ -50,6 +51,10 @@ def __init__(
         if self.uni_init:
             nn.init.ones_(self.matrix.data)
             nn.init.zeros_(self.bias.data)
+        self.trainable = trainable
+        if not self.trainable:
+            self.matrix.requires_grad = False
+            self.bias.requires_grad = False
 
     def dim_out(self) -> int:
         return self.matrix.shape[0]
@@ -73,10 +78,8 @@ def forward(
         mean = xx.mean(dim=-1, keepdim=True)
         variance = xx.var(dim=-1, unbiased=False, keepdim=True)
         yy = (xx - mean) / torch.sqrt(variance + self.eps)
-        if self.matrix is not None:
-            yy = yy * self.matrix
-        if self.bias is not None:
-            yy = yy + self.bias
+        if self.matrix is not None and self.bias is not None:
+            yy = yy * self.matrix + self.bias
         return yy
 
     def serialize(self) -> dict:
@@ -90,6 +93,7 @@ def serialize(self) -> dict:
         nl = DPLayerNorm(
             self.matrix.shape[0],
             eps=self.eps,
+            trainable=self.trainable,
             precision=self.precision,
         )
         nl.w = to_numpy_array(self.matrix)
@@ -110,6 +114,7 @@ def deserialize(cls, data: dict) -> "LayerNorm":
         obj = cls(
             nl["matrix"].shape[0],
             eps=nl["eps"],
+            trainable=nl["trainable"],
             precision=nl["precision"],
         )
         prec = PRECISION_DICT[obj.precision]
diff --git a/deepmd/tf/descriptor/se_atten.py b/deepmd/tf/descriptor/se_atten.py
index 15db4f5a22..dcf785d6f4 100644
--- a/deepmd/tf/descriptor/se_atten.py
+++ b/deepmd/tf/descriptor/se_atten.py
@@ -181,6 +181,7 @@ def __init__(
         scaling_factor=1.0,
         normalize=True,
         temperature=None,
+        trainable_ln: bool = True,
         concat_output_tebd: bool = True,
         env_protection: float = 0.0,  # not implement!!
         **kwargs,
@@ -233,6 +234,7 @@ def __init__(
             raise ValueError("`model/type_map` is not set or empty!")
         self.stripped_type_embedding = stripped_type_embedding
         self.smooth = smooth_type_embedding
+        self.trainable_ln = trainable_ln
         self.ntypes = ntypes
         self.att_n = attn
         self.attn_layer = attn_layer
@@ -251,12 +253,6 @@ def __init__(
         std_ones = np.ones([self.ntypes, self.ndescrpt]).astype(
             GLOBAL_NP_FLOAT_PRECISION
         )
-        # self.beta = np.zeros([self.attn_layer, self.filter_neuron[-1]]).astype(
-        #     GLOBAL_NP_FLOAT_PRECISION
-        # )
-        # self.gamma = np.ones([self.attn_layer, self.filter_neuron[-1]]).astype(
-        #     GLOBAL_NP_FLOAT_PRECISION
-        # )
         self.attention_layer_variables = None
         sub_graph = tf.Graph()
         with sub_graph.as_default():
@@ -891,38 +887,6 @@ def _lookup_type_embedding(
             return self.embedding_input_2
         return self.embedding_input
 
-    def _feedforward(self, input_xyz, d_in, d_mid):
-        residual = input_xyz
-        input_xyz = tf.nn.relu(
-            one_layer(
-                input_xyz,
-                d_mid,
-                name="c_ffn1",
-                reuse=tf.AUTO_REUSE,
-                seed=self.seed,
-                activation_fn=None,
-                precision=self.filter_precision,
-                trainable=True,
-                uniform_seed=self.uniform_seed,
-                initial_variables=self.attention_layer_variables,
-            )
-        )
-        input_xyz = one_layer(
-            input_xyz,
-            d_in,
-            name="c_ffn2",
-            reuse=tf.AUTO_REUSE,
-            seed=self.seed,
-            activation_fn=None,
-            precision=self.filter_precision,
-            trainable=True,
-            uniform_seed=self.uniform_seed,
-            initial_variables=self.attention_layer_variables,
-        )
-        input_xyz += residual
-        input_xyz = tf.keras.layers.LayerNormalization()(input_xyz)
-        return input_xyz
-
     def _scaled_dot_attn(
         self,
         Q,
@@ -1068,15 +1032,9 @@ def _attention_layers(
                     reuse=tf.AUTO_REUSE,
                     seed=self.seed,
                     uniform_seed=self.uniform_seed,
-                    trainable=trainable,
+                    trainable=self.trainable_ln,
                     initial_variables=self.attention_layer_variables,
                 )
-                # input_xyz = tf.keras.layers.LayerNormalization(
-                #     beta_initializer=tf.constant_initializer(self.beta[i]),
-                #     gamma_initializer=tf.constant_initializer(self.gamma[i]),
-                #     dtype=self.filter_precision,
-                # )(input_xyz)
-                # input_xyz = self._feedforward(input_xyz, outputs_size[-1], self.att_n)
         return input_xyz
 
     def _filter_lower(
@@ -1384,27 +1342,6 @@ def init_variables(
         self.attention_layer_variables = get_attention_layer_variables_from_graph_def(
             graph_def, suffix=suffix
         )
-        # if self.attn_layer > 0:
-        #     self.beta[0] = self.attention_layer_variables[
-        #         f"attention_layer_0{suffix}/layer_normalization/beta"
-        #     ]
-        #     self.gamma[0] = self.attention_layer_variables[
-        #         f"attention_layer_0{suffix}/layer_normalization/gamma"
-        #     ]
-        #     for i in range(1, self.attn_layer):
-        #         self.beta[i] = self.attention_layer_variables[
-        #             f"attention_layer_{i}{suffix}/layer_normalization_{i}/beta"
-        #         ]
-        #         self.gamma[i] = self.attention_layer_variables[
-        #             f"attention_layer_{i}{suffix}/layer_normalization_{i}/gamma"
-        #         ]
-        #     for i in range(self.attn_layer):
-        #         self.beta[i] = self.attention_layer_variables[
-        #             f"attention_layer_{i}{suffix}/layer_normalization/beta"
-        #         ]
-        #         self.gamma[i] = self.attention_layer_variables[
-        #             f"attention_layer_{i}{suffix}/layer_normalization/gamma"
-        #         ]
 
         if self.stripped_type_embedding:
             self.two_side_embeeding_net_variables = (
@@ -1527,6 +1464,7 @@ def serialize_attention_layers(
         hidden_dim: int,
         dotr: bool,
         do_mask: bool,
+        trainable_ln: bool,
         variables: dict,
         bias: bool = True,
         suffix: str = "",
@@ -1538,6 +1476,7 @@ def serialize_attention_layers(
             "hidden_dim": hidden_dim,
             "dotr": dotr,
             "do_mask": do_mask,
+            "trainable_ln": trainable_ln,
             "precision": self.precision.name,
             "attention_layers": [],
         }
@@ -1592,6 +1531,7 @@ def serialize_attention_layers(
 
             layer_norm = LayerNorm(
                 embed_dim,
+                trainable=self.trainable_ln,
                 precision=self.precision.name,
            )
             layer_norm["matrix"] = attention_layer_params[layer_idx][
@@ -1609,6 +1549,7 @@ def serialize_attention_layers(
                         "smooth": self.smooth,
                     },
                     "attn_layer_norm": layer_norm.serialize(),
+                    "trainable_ln": self.trainable_ln,
                 }
             )
         return data
@@ -1778,6 +1719,7 @@ def serialize(self, suffix: str = "") -> dict:
             "activation_function": self.activation_function_name,
             "resnet_dt": self.filter_resnet_dt,
             "smooth_type_embedding": self.smooth,
+            "trainable_ln": self.trainable_ln,
             "precision": self.filter_precision.name,
             "embeddings": self.serialize_network(
                 ntypes=self.ntypes,
@@ -1799,6 +1741,7 @@ def serialize(self, suffix: str = "") -> dict:
                 hidden_dim=self.att_n,
                 dotr=self.attn_dotr,
                 do_mask=self.attn_mask,
+                trainable_ln=self.trainable_ln,
                 variables=self.attention_layer_variables,
                 suffix=suffix,
             ),
@@ -1844,7 +1787,9 @@ class DescrptDPA1Compat(DescrptSeAtten):
         Time-step `dt` in the resnet construction:
         y = x + dt * \phi (Wx + b)
     trainable: bool
-        If the weights of embedding net are trainable.
+        If the weights of this descriptor are trainable.
+    trainable_ln: bool
+        Whether to use trainable shift and scale weights in layer normalization.
     type_one_side: bool
         If 'False', type embeddings of both neighbor and central atoms are considered.
         If 'True', only type embeddings of neighbor atoms are considered.
@@ -1917,6 +1862,7 @@ def __init__(
         scaling_factor=1.0,
         normalize: bool = True,
         temperature: Optional[float] = None,
+        trainable_ln: bool = True,
         smooth_type_embedding: bool = True,
         concat_output_tebd: bool = True,
         spin: Optional[Any] = None,
@@ -1963,6 +1909,7 @@ def __init__(
             attn_mask=attn_mask,
             multi_task=True,
             stripped_type_embedding=False,
+            trainable_ln=trainable_ln,
             smooth_type_embedding=smooth_type_embedding,
             env_protection=env_protection,
         )
diff --git a/deepmd/tf/utils/network.py b/deepmd/tf/utils/network.py
index 916f783050..7918b58d0c 100644
--- a/deepmd/tf/utils/network.py
+++ b/deepmd/tf/utils/network.py
@@ -200,177 +200,6 @@ def layernorm(
     return output
 
 
-# class LayerNormCompat:
-#     """Implementation of Layer Normalization layer for testing with other backend references.
-#
-#     Parameters
-#     ----------
-#     num_in : int
-#         The input dimension of the layer.
-#     eps : float, optional
-#         A small value added to prevent division by zero in calculations.
-#     uni_init : bool, optional
-#         If initialize the weights to be zeros and ones.
-#     precision : str, optional
-#         The precision of the layer parameters. Supported options are |PRECISION|
-#     """
-#
-#     def __init__(
-#         self,
-#         num_in: int,
-#         eps: float = 1e-5,
-#         uni_init: bool = True,
-#         precision: str = "default",
-#     ) -> None:
-#         self.eps = eps
-#         self.uni_init = uni_init
-#         self.num_in = num_in
-#         self.filter_precision = get_precision(precision)
-#         self.layer_norm_variables = None
-#
-#     def build(
-#         self,
-#         inputs,
-#         input_shape: List[int],
-#         reuse=None,
-#         suffix="",
-#     ):
-#         """Build the computational graph for the layer normalization.
-#
-#         Parameters
-#         ----------
-#         input_shape
-#             The shape of the input tensor.
-#         reuse
-#             The weights in the networks should be reused when get the variable.
-#         suffix
-#             Name suffix to identify this layer
-#
-#         Returns
-#         -------
-#         normalized_output
-#             The computational graph for the normalized output
-#         """
-#         assert input_shape[-1] == self.num_in
-#         name = "layer_norm" + suffix
-#         with tf.variable_scope(name, reuse=reuse):
-#             gamma = tf.get_variable(
-#                 "gamma",
-#                 shape=[self.num_in],
-#                 initializer=tf.ones_initializer(),
-#                 dtype=self.filter_precision,
-#                 trainable=True,
-#             )
-#             beta = tf.get_variable(
-#                 "beta",
-#                 shape=[self.num_in],
-#                 initializer=tf.zeros_initializer(),
-#                 dtype=self.filter_precision,
-#                 trainable=True,
-#             )
-#         normalized_output = tf.contrib.layers.layer_norm(
-#             inputs=input,
-#             begin_norm_axis=-1,
-#             begin_params_axis=-1,
-#             epsilon=self.eps,
-#             activation_fn=None,
-#             param_initializers={
-#                 "gamma": tf.ones_initializer(),
-#                 "beta": tf.zeros_initializer(),
-#             },
-#             trainable=True,
-#             reuse=reuse,
-#             variables_collections=None,
-#             outputs_collections=None,
-#             data_format="NHWC",
-#             name=name,
-#         )
-#         return normalized_output
-#
-#     def init_variables(
-#         self,
-#         graph: tf.Graph,
-#         graph_def: tf.GraphDef,
-#         suffix="",
-#         model_type="original_model",
-#     ) -> None:
-#         """Init the layer norm variables with the given dict.
-#
-#         Parameters
-#         ----------
-#         graph : tf.Graph
-#             The input frozen model graph
-#         graph_def : tf.GraphDef
-#             The input frozen model graph_def
-#         suffix
-#             Name suffix to identify this layer
-#         model_type
-#             Indicator of whether this model is a compressed model
-#         """
-#         self.layer_norm_variables = get_layer_norm_variables_from_graph_def(
-#             graph_def, suffix=suffix
-#         )
-#
-#     @classmethod
-#     def deserialize(cls, data: dict, suffix: str = ""):
-#         """Deserialize the layer from a dict.
-#
-#         Parameters
-#         ----------
-#         data : dict
-#             The dict to deserialize from.
-#         suffix : str, optional
-#             The suffix of the scope
-#
-#         Returns
-#         -------
-#         LayerNorm
-#             The deserialized layer
-#         """
-#         data = data.copy()
-#         check_version_compatibility(data.pop("@version", 1), 1, 1)
-#         data_cls = data.pop("@class")
-#         assert data_cls == "LayerNorm", f"Invalid class {data_cls}"
-#         variables = data.pop("@variables")
-#         obj = cls(
-#             num_in=variables["w"].shape[0],
-#             eps=data.pop("eps"),
-#             precision=data.pop("precision"),
-#         )
-#         obj.layer_norm_variables = {
-#             f"layer_norm{suffix}/gamma": variables["w"],
-#             f"layer_norm{suffix}/beta": variables["b"],
-#         }
-#         return obj
-#
-#     def serialize(self, suffix: str = "") -> dict:
-#         """Serialize the layer to a dict.
-#
-#         Parameters
-#         ----------
-#         suffix : str, optional
-#             The suffix of the scope
-#
-#         Returns
-#         -------
-#         dict
-#             The serialized layer.
-#         """
-#         assert self.layer_norm_variables is not None
-#         gamma = self.layer_norm_variables[f"layer_norm{suffix}/gamma"]
-#         beta = self.layer_norm_variables[f"layer_norm{suffix}/beta"]
-#         return {
-#             "@class": "LayerNorm",
-#             "@version": 1,
-#             "eps": self.eps,
-#             "precision": self.filter_precision.name,
-#             "@variables": {
-#                 "w": gamma,
-#                 "b": beta,
-#             },
-#         }
-
-
 def embedding_net_rand_seed_shift(network_size):
     shift = 3 * (len(network_size) + 1)
     return shift
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index c51900e9a9..8dd2be2b6b 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -478,6 +478,9 @@ def descrpt_se_atten_args():
     doc_stripped_type_embedding = "Whether to strip the type embedding into a separated embedding network. Setting it to `False` will fall back to the previous version of `se_atten` which is non-compressible."
     doc_smooth_type_embedding = f"Whether to use smooth process in attention weights calculation. {doc_only_tf_supported} When using stripped type embedding, whether to dot smooth factor on the network output of type embedding to keep the network smooth, instead of setting `set_davg_zero` to be True."
     doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `se_atten` descriptor or `atom_ener` in the energy fitting is used"
+    doc_trainable_ln = (
+        "Whether to use trainable shift and scale weights in layer normalization."
+    )
     doc_tebd_dim = "The dimension of atom type embedding."
     doc_temperature = "The scaling factor of normalization in calculations of attention weights, which is used to scale the matmul(Q, K)."
     doc_scaling_factor = (
@@ -507,11 +510,15 @@ def descrpt_se_atten_args():
             bool,
             optional=True,
             default=False,
+            alias=["smooth_type_embdding"],
             doc=doc_smooth_type_embedding,
         ),
         Argument(
             "set_davg_zero", bool, optional=True, default=True, doc=doc_set_davg_zero
         ),
+        Argument(
+            "trainable_ln", bool, optional=True, default=True, doc=doc_trainable_ln
+        ),
         # pt only
         Argument(
             "tebd_dim",
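Note (illustration, not part of the patch): the argcheck hunk above exposes `trainable_ln` next to `set_davg_zero`, defaulting to `true`, and the numpy backend now treats the affine parameters of `layer_norm_numpy` as optional. Because `uni_init` initializes the scale and shift to ones and zeros, a frozen layer norm (`trainable_ln: false`, with no pre-trained values loaded) reduces to a plain normalization followed by an identity affine transform. A minimal standalone sketch of that behavior, reusing the names from the diff but independent of deepmd:

    # Illustrative only -- mirrors the semantics of LayerNorm.layer_norm_numpy
    # after this change: weight/bias are optional, and the uni_init values
    # (ones/zeros) make the shift-and-scale step a no-op.
    import numpy as np


    def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5):
        axes = tuple(range(-len(shape), 0))
        mean = np.mean(x, axis=axes, keepdims=True)
        var = np.var(x, axis=axes, keepdims=True)
        x_normalized = (x - mean) / np.sqrt(var + eps)
        if weight is not None and bias is not None:
            # shift and scale; skipped entirely when no affine parameters are given
            x_normalized = x_normalized * weight + bias
        return x_normalized


    rng = np.random.default_rng(0)
    x = rng.normal(size=(4, 8))
    w, b = np.ones(8), np.zeros(8)  # uni_init / frozen (trainable_ln: false) parameters
    out = layer_norm_numpy(x, (8,), w, b)
    assert np.allclose(out, layer_norm_numpy(x, (8,)))  # identity affine == no affine

The equality only holds for the ones/zeros initialization; once the parameters are actually trained (`trainable_ln` left at its default of `true`), the two calls differ.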