Commit
Add trainable option for layernorm
iProzd committed Apr 22, 2024
1 parent 3bc25da commit 85c7d6e
Showing 8 changed files with 79 additions and 252 deletions.
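For context, a minimal sketch of how the new option is meant to be used from the PyTorch backend. The argument names other than trainable_ln follow the existing DescrptDPA1 signature; the concrete values are illustrative only and are not part of this commit.

from deepmd.pt.model.descriptor.dpa1 import DescrptDPA1

# Illustrative arguments; `trainable_ln` is the option added by this commit.
descriptor = DescrptDPA1(
    rcut=6.0,
    rcut_smth=0.5,
    sel=120,
    ntypes=2,
    attn_layer=2,
    trainable_ln=False,  # keep the layer-norm shift/scale weights fixed during training
)

With trainable_ln=True (the default) behaviour is unchanged; the NumPy (dpmodel) backend stores the same flag only to keep (de)serialization consistent across backends.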
19 changes: 17 additions & 2 deletions deepmd/dpmodel/descriptor/dpa1.py
@@ -141,7 +141,9 @@ class DescrptDPA1(NativeOP, BaseDescriptor):
Time-step `dt` in the resnet construction:
y = x + dt * \phi (Wx + b)
trainable: bool
If the weights of embedding net are trainable.
If the weights of this descriptor are trainable.
trainable_ln: bool
Whether to use trainable shift and scale weights in layer normalization.
type_one_side: bool
If 'False', type embeddings of both neighbor and central atoms are considered.
If 'True', only type embeddings of neighbor atoms are considered.
@@ -222,6 +224,7 @@ def __init__(
scaling_factor=1.0,
normalize: bool = True,
temperature: Optional[float] = None,
trainable_ln: bool = True,
smooth_type_embedding: bool = True,
concat_output_tebd: bool = True,
spin: Optional[Any] = None,
@@ -254,6 +257,7 @@ def __init__(
self.tebd_input_mode = tebd_input_mode
self.resnet_dt = resnet_dt
self.trainable = trainable
self.trainable_ln = trainable_ln
self.type_one_side = type_one_side
self.attn = attn
self.attn_layer = attn_layer
@@ -306,6 +310,7 @@ def __init__(
scaling_factor=self.scaling_factor,
normalize=self.normalize,
temperature=self.temperature,
trainable_ln=self.trainable_ln,
smooth=self.smooth,
precision=self.precision,
)
@@ -539,6 +544,7 @@ def serialize(self) -> dict:
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"smooth_type_embedding": self.smooth,
"type_one_side": self.type_one_side,
"concat_output_tebd": self.concat_output_tebd,
@@ -607,6 +613,7 @@ def __init__(
scaling_factor: float = 1.0,
normalize: bool = True,
temperature: Optional[float] = None,
trainable_ln: bool = True,
smooth: bool = True,
precision: str = DEFAULT_PRECISION,
):
@@ -621,6 +628,7 @@ def __init__(
self.scaling_factor = scaling_factor
self.normalize = normalize
self.temperature = temperature
self.trainable_ln = trainable_ln
self.smooth = smooth
self.precision = precision
self.network_type = NeighborGatedAttentionLayer
@@ -635,6 +643,7 @@ def __init__(
scaling_factor=scaling_factor,
normalize=normalize,
temperature=temperature,
trainable_ln=trainable_ln,
smooth=smooth,
precision=precision,
)
@@ -690,6 +699,7 @@ def serialize(self):
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"precision": self.precision,
"attention_layers": [layer.serialize() for layer in self.attention_layers],
}
@@ -725,6 +735,7 @@ def __init__(
scaling_factor: float = 1.0,
normalize: bool = True,
temperature: Optional[float] = None,
trainable_ln: bool = True,
smooth: bool = True,
precision: str = DEFAULT_PRECISION,
):
@@ -738,6 +749,7 @@ def __init__(
self.scaling_factor = scaling_factor
self.normalize = normalize
self.temperature = temperature
self.trainable_ln = trainable_ln
self.precision = precision
self.attention_layer = GatedAttentionLayer(
nnei,
@@ -751,7 +763,9 @@ def __init__(
smooth=smooth,
precision=precision,
)
self.attn_layer_norm = LayerNorm(self.embed_dim, precision=precision)
self.attn_layer_norm = LayerNorm(
self.embed_dim, trainable=self.trainable_ln, precision=precision
)

def call(
self,
@@ -783,6 +797,7 @@ def serialize(self) -> dict:
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"precision": self.precision,
"attention_layer": self.attention_layer.serialize(),
"attn_layer_norm": self.attn_layer_norm.serialize(),
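The serialization changes above add a single "trainable_ln" key at each level. A hedged round-trip sketch for the NumPy backend follows; the constructor arguments are illustrative, and deserialize is assumed to be the usual counterpart of serialize in this module.

from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1

# Illustrative arguments; only `trainable_ln` is the option added by this commit.
d = DescrptDPA1(rcut=6.0, rcut_smth=0.5, sel=[60], ntypes=2, trainable_ln=False)

data = d.serialize()
assert data["trainable_ln"] is False  # new top-level key written by serialize()

d2 = DescrptDPA1.deserialize(data)    # assumed counterpart of serialize()
assert d2.trainable_ln is False       # the flag survives the round trip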
17 changes: 12 additions & 5 deletions deepmd/dpmodel/utils/network.py
@@ -399,6 +399,7 @@ def __init__(
num_in: int,
eps: float = 1e-5,
uni_init: bool = True,
trainable: bool = True,
precision: str = DEFAULT_PRECISION,
) -> None:
self.eps = eps
@@ -417,6 +418,8 @@ def __init__(
if self.uni_init:
self.w = np.ones_like(self.w)
self.b = np.zeros_like(self.b)
# kept only to stay consistent with other backends
self.trainable = trainable

def serialize(self) -> dict:
"""Serialize the layer to a dict.
@@ -434,6 +437,7 @@ def serialize(self) -> dict:
"@class": "LayerNorm",
"@version": 1,
"eps": self.eps,
"trainable": self.trainable,
"precision": self.precision,
"@variables": data,
}
@@ -477,6 +481,8 @@ def __setitem__(self, key, value):
self.w = value
elif key in ("b", "bias"):
self.b = value
elif key == "trainable":
self.trainable = value
elif key == "precision":
self.precision = value
elif key == "eps":
@@ -489,6 +495,8 @@ def __getitem__(self, key):
return self.w
elif key in ("b", "bias"):
return self.b
elif key == "trainable":
return self.trainable
elif key == "precision":
return self.precision
elif key == "eps":
@@ -512,21 +520,20 @@ def call(self, x: np.ndarray) -> np.ndarray:
np.ndarray
The output.
"""
if self.w is None or self.b is None:
raise ValueError("w/b must be set")
y = self.layer_norm_numpy(x, (self.num_in,), self.w, self.b, self.eps)
return y

@staticmethod
def layer_norm_numpy(x, shape, weight, bias, eps):
def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5):
# mean and variance
mean = np.mean(x, axis=tuple(range(-len(shape), 0)), keepdims=True)
var = np.var(x, axis=tuple(range(-len(shape), 0)), keepdims=True)
# normalize
x_normalized = (x - mean) / np.sqrt(var + eps)
# shift and scale
x_ln = x_normalized * weight + bias
return x_ln
if weight is not None and bias is not None:
x_normalized = x_normalized * weight + bias
return x_normalized


def make_multilayer_network(T_NetworkLayer, ModuleBase):
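As a standalone illustration of what layer_norm_numpy computes after this change, here is a plain NumPy sketch mirroring the static method above (a re-implementation for clarity, not an import of it):

import numpy as np

def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5):
    # normalize over the trailing axes given by `shape`
    axes = tuple(range(-len(shape), 0))
    mean = np.mean(x, axis=axes, keepdims=True)
    var = np.var(x, axis=axes, keepdims=True)
    y = (x - mean) / np.sqrt(var + eps)
    # shift and scale only when both parameters are provided (the new behaviour)
    if weight is not None and bias is not None:
        y = y * weight + bias
    return y

x = np.random.rand(4, 8)
w, b = np.ones(8), np.zeros(8)
# with unit weight and zero bias, the parametrized and parameter-free calls agree
np.testing.assert_allclose(layer_norm_numpy(x, (8,), w, b), layer_norm_numpy(x, (8,)))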
7 changes: 6 additions & 1 deletion deepmd/pt/model/descriptor/dpa1.py
@@ -123,7 +123,9 @@ class DescrptDPA1(BaseDescriptor, torch.nn.Module):
Time-step `dt` in the resnet construction:
y = x + dt * \phi (Wx + b)
trainable: bool
If the weights of embedding net are trainable.
If the weights of this descriptor are trainable.
trainable_ln: bool
Whether to use trainable shift and scale weights in layer normalization.
type_one_side: bool
If 'False', type embeddings of both neighbor and central atoms are considered.
If 'True', only type embeddings of neighbor atoms are considered.
@@ -205,6 +207,7 @@ def __init__(
temperature=None,
concat_output_tebd: bool = True,
trainable: bool = True,
trainable_ln: bool = True,
smooth_type_embedding: bool = True,
type_one_side: bool = False,
# not implemented
@@ -252,6 +255,7 @@ def __init__(
type_one_side=type_one_side,
exclude_types=exclude_types,
env_protection=env_protection,
trainable_ln=trainable_ln,
old_impl=old_impl,
)
self.type_embedding = TypeEmbedNet(ntypes, tebd_dim, precision=precision)
@@ -385,6 +389,7 @@ def serialize(self) -> dict:
"scaling_factor": obj.scaling_factor,
"normalize": obj.normalize,
"temperature": obj.temperature,
"trainable_ln": obj.trainable_ln,
"smooth_type_embedding": obj.smooth,
"type_one_side": obj.type_one_side,
"concat_output_tebd": self.concat_output_tebd,
16 changes: 14 additions & 2 deletions deepmd/pt/model/descriptor/se_atten.py
@@ -83,6 +83,7 @@ def __init__(
type_one_side: bool = False,
exclude_types: List[Tuple[int, int]] = [],
env_protection: float = 0.0,
trainable_ln: bool = True,
type: Optional[str] = None,
old_impl: bool = False,
):
@@ -119,6 +120,7 @@ def __init__(
self.smooth = smooth
self.type_one_side = type_one_side
self.env_protection = env_protection
self.trainable_ln = trainable_ln
self.old_impl = old_impl

if isinstance(sel, int):
@@ -157,6 +159,7 @@ def __init__(
scaling_factor=self.scaling_factor,
normalize=self.normalize,
temperature=self.temperature,
trainable_ln=self.trainable_ln,
smooth=self.smooth,
precision=self.precision,
)
@@ -468,6 +471,7 @@ def __init__(
scaling_factor: float = 1.0,
normalize: bool = True,
temperature: Optional[float] = None,
trainable_ln: bool = True,
smooth: bool = True,
precision: str = DEFAULT_PRECISION,
):
@@ -482,6 +486,7 @@ def __init__(
self.scaling_factor = scaling_factor
self.normalize = normalize
self.temperature = temperature
self.trainable_ln = trainable_ln
self.smooth = smooth
self.precision = precision
self.network_type = NeighborGatedAttentionLayer
@@ -497,7 +502,8 @@ def __init__(
scaling_factor=scaling_factor,
normalize=normalize,
temperature=temperature,
smooth=self.smooth,
trainable_ln=trainable_ln,
smooth=smooth,
precision=precision,
)
)
@@ -563,6 +569,7 @@ def serialize(self) -> dict:
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"precision": self.precision,
"attention_layers": [layer.serialize() for layer in self.attention_layers],
}
@@ -598,6 +605,7 @@ def __init__(
normalize: bool = True,
temperature: Optional[float] = None,
smooth: bool = True,
trainable_ln: bool = True,
precision: str = DEFAULT_PRECISION,
):
"""Construct a neighbor-wise attention layer."""
@@ -611,6 +619,7 @@ def __init__(
self.normalize = normalize
self.temperature = temperature
self.precision = precision
self.trainable_ln = trainable_ln
self.attention_layer = GatedAttentionLayer(
nnei,
embed_dim,
@@ -623,7 +632,9 @@ def __init__(
smooth=smooth,
precision=precision,
)
self.attn_layer_norm = LayerNorm(self.embed_dim, precision=precision)
self.attn_layer_norm = LayerNorm(
self.embed_dim, trainable=trainable_ln, precision=precision
)

def forward(
self,
@@ -655,6 +666,7 @@ def serialize(self) -> dict:
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"precision": self.precision,
"attention_layer": self.attention_layer.serialize(),
"attn_layer_norm": self.attn_layer_norm.serialize(),
13 changes: 9 additions & 4 deletions deepmd/pt/model/network/layernorm.py
@@ -31,6 +31,7 @@ def __init__(
bavg: float = 0.0,
stddev: float = 1.0,
precision: str = DEFAULT_PRECISION,
trainable: bool = True,
):
self.eps = eps
self.uni_init = uni_init
@@ -50,6 +51,10 @@ def __init__(
if self.uni_init:
nn.init.ones_(self.matrix.data)
nn.init.zeros_(self.bias.data)
self.trainable = trainable
if not self.trainable:
self.matrix.requires_grad = False
self.bias.requires_grad = False

def dim_out(self) -> int:
return self.matrix.shape[0]
@@ -73,10 +78,8 @@ def forward(
mean = xx.mean(dim=-1, keepdim=True)
variance = xx.var(dim=-1, unbiased=False, keepdim=True)
yy = (xx - mean) / torch.sqrt(variance + self.eps)
if self.matrix is not None:
yy = yy * self.matrix
if self.bias is not None:
yy = yy + self.bias
if self.matrix is not None and self.bias is not None:
yy = yy * self.matrix + self.bias
return yy

def serialize(self) -> dict:
@@ -90,6 +93,7 @@ def serialize(self) -> dict:
nl = DPLayerNorm(
self.matrix.shape[0],
eps=self.eps,
trainable=self.trainable,
precision=self.precision,
)
nl.w = to_numpy_array(self.matrix)
@@ -110,6 +114,7 @@ def deserialize(cls, data: dict) -> "LayerNorm":
obj = cls(
nl["matrix"].shape[0],
eps=nl["eps"],
trainable=nl["trainable"],
precision=nl["precision"],
)
prec = PRECISION_DICT[obj.precision]
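What trainable=False does in the PyTorch LayerNorm above, shown with plain torch.nn.Parameter stand-ins for matrix and bias (a sketch under those naming assumptions, not the class itself):

import torch

# Stand-ins for the layer norm's scale ("matrix") and shift ("bias") parameters.
matrix = torch.nn.Parameter(torch.ones(8))
bias = torch.nn.Parameter(torch.zeros(8))

# Mirrors the `if not self.trainable:` branch above: the parameters stay
# registered on the module, but an optimizer will no longer update them.
for p in (matrix, bias):
    p.requires_grad = False

assert [p for p in (matrix, bias) if p.requires_grad] == []

# The forward pass itself is unchanged: normalize, then scale and shift.
x = torch.randn(4, 8)
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
y = (x - mean) / torch.sqrt(var + 1e-5) * matrix + bias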
