Commit
Add trainable option for layernorm
iProzd committed Apr 22, 2024
1 parent 3bc25da commit 85c7d6e
Showing 8 changed files with 79 additions and 252 deletions.
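For context, a minimal sketch of how the new option is meant to be used from the PyTorch backend. The argument names other than trainable_ln follow the existing DescrptDPA1 signature; the concrete values are illustrative only and are not part of this commit.

from deepmd.pt.model.descriptor.dpa1 import DescrptDPA1

# Illustrative arguments; `trainable_ln` is the option added by this commit.
descriptor = DescrptDPA1(
    rcut=6.0,
    rcut_smth=0.5,
    sel=120,
    ntypes=2,
    attn_layer=2,
    trainable_ln=False,  # keep the layer-norm shift/scale weights fixed during training
)

With trainable_ln=True (the default) behaviour is unchanged; the NumPy (dpmodel) backend stores the same flag only to keep (de)serialization consistent across backends.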
19 changes: 17 additions & 2 deletions deepmd/dpmodel/descriptor/dpa1.py
@@ -141,7 +141,9 @@ class DescrptDPA1(NativeOP, BaseDescriptor):
Time-step `dt` in the resnet construction:
y = x + dt * \phi (Wx + b)
trainable: bool
If the weights of embedding net are trainable.
If the weights of this descriptor are trainable.
trainable_ln: bool
Whether to use trainable shift and scale weights in layer normalization.
type_one_side: bool
If 'False', type embeddings of both neighbor and central atoms are considered.
If 'True', only type embeddings of neighbor atoms are considered.
@@ -222,6 +224,7 @@ def __init__(
scaling_factor=1.0,
normalize: bool = True,
temperature: Optional[float] = None,
trainable_ln: bool = True,
smooth_type_embedding: bool = True,
concat_output_tebd: bool = True,
spin: Optional[Any] = None,
@@ -254,6 +257,7 @@ def __init__(
self.tebd_input_mode = tebd_input_mode
self.resnet_dt = resnet_dt
self.trainable = trainable
self.trainable_ln = trainable_ln
self.type_one_side = type_one_side
self.attn = attn
self.attn_layer = attn_layer
@@ -306,6 +310,7 @@ def __init__(
scaling_factor=self.scaling_factor,
normalize=self.normalize,
temperature=self.temperature,
trainable_ln=self.trainable_ln,
smooth=self.smooth,
precision=self.precision,
)
@@ -539,6 +544,7 @@ def serialize(self) -> dict:
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"smooth_type_embedding": self.smooth,
"type_one_side": self.type_one_side,
"concat_output_tebd": self.concat_output_tebd,
@@ -607,6 +613,7 @@ def __init__(
scaling_factor: float = 1.0,
normalize: bool = True,
temperature: Optional[float] = None,
trainable_ln: bool = True,
smooth: bool = True,
precision: str = DEFAULT_PRECISION,
):
@@ -621,6 +628,7 @@ def __init__(
self.scaling_factor = scaling_factor
self.normalize = normalize
self.temperature = temperature
self.trainable_ln = trainable_ln
self.smooth = smooth
self.precision = precision
self.network_type = NeighborGatedAttentionLayer
@@ -635,6 +643,7 @@ def __init__(
scaling_factor=scaling_factor,
normalize=normalize,
temperature=temperature,
trainable_ln=trainable_ln,
smooth=smooth,
precision=precision,
)
@@ -690,6 +699,7 @@ def serialize(self):
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"precision": self.precision,
"attention_layers": [layer.serialize() for layer in self.attention_layers],
}
@@ -725,6 +735,7 @@ def __init__(
scaling_factor: float = 1.0,
normalize: bool = True,
temperature: Optional[float] = None,
trainable_ln: bool = True,
smooth: bool = True,
precision: str = DEFAULT_PRECISION,
):
@@ -738,6 +749,7 @@ def __init__(
self.scaling_factor = scaling_factor
self.normalize = normalize
self.temperature = temperature
self.trainable_ln = trainable_ln
self.precision = precision
self.attention_layer = GatedAttentionLayer(
nnei,
@@ -751,7 +763,9 @@ def __init__(
smooth=smooth,
precision=precision,
)
self.attn_layer_norm = LayerNorm(self.embed_dim, precision=precision)
self.attn_layer_norm = LayerNorm(
self.embed_dim, trainable=self.trainable_ln, precision=precision
)

def call(
self,
@@ -783,6 +797,7 @@ def serialize(self) -> dict:
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"precision": self.precision,
"attention_layer": self.attention_layer.serialize(),
"attn_layer_norm": self.attn_layer_norm.serialize(),
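The serialization changes above add a single "trainable_ln" key at each level. A hedged round-trip sketch for the NumPy backend follows; the constructor arguments are illustrative, and deserialize is assumed to be the usual counterpart of serialize in this module.

from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1

# Illustrative arguments; only `trainable_ln` is the option added by this commit.
d = DescrptDPA1(rcut=6.0, rcut_smth=0.5, sel=[60], ntypes=2, trainable_ln=False)

data = d.serialize()
assert data["trainable_ln"] is False  # new top-level key written by serialize()

d2 = DescrptDPA1.deserialize(data)    # assumed counterpart of serialize()
assert d2.trainable_ln is False       # the flag survives the round trip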
17 changes: 12 additions & 5 deletions deepmd/dpmodel/utils/network.py
@@ -399,6 +399,7 @@ def __init__(
num_in: int,
eps: float = 1e-5,
uni_init: bool = True,
trainable: bool = True,
precision: str = DEFAULT_PRECISION,
) -> None:
self.eps = eps
@@ -417,6 +418,8 @@ def __init__(
if self.uni_init:
self.w = np.ones_like(self.w)
self.b = np.zeros_like(self.b)
# kept only to stay consistent with other backends
self.trainable = trainable

def serialize(self) -> dict:
"""Serialize the layer to a dict.
@@ -434,6 +437,7 @@ def serialize(self) -> dict:
"@class": "LayerNorm",
"@version": 1,
"eps": self.eps,
"trainable": self.trainable,
"precision": self.precision,
"@variables": data,
}
@@ -477,6 +481,8 @@ def __setitem__(self, key, value):
self.w = value
elif key in ("b", "bias"):
self.b = value
elif key == "trainable":
self.trainable = value
elif key == "precision":
self.precision = value
elif key == "eps":
@@ -489,6 +495,8 @@ def __getitem__(self, key):
return self.w
elif key in ("b", "bias"):
return self.b
elif key == "trainable":
return self.trainable
elif key == "precision":
return self.precision
elif key == "eps":
@@ -512,21 +520,20 @@ def call(self, x: np.ndarray) -> np.ndarray:
np.ndarray
The output.
"""
if self.w is None or self.b is None:
raise ValueError("w/b must be set")
y = self.layer_norm_numpy(x, (self.num_in,), self.w, self.b, self.eps)
return y

@staticmethod
def layer_norm_numpy(x, shape, weight, bias, eps):
def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5):
# mean and variance
mean = np.mean(x, axis=tuple(range(-len(shape), 0)), keepdims=True)
var = np.var(x, axis=tuple(range(-len(shape), 0)), keepdims=True)
# normalize
x_normalized = (x - mean) / np.sqrt(var + eps)
# shift and scale
x_ln = x_normalized * weight + bias
return x_ln
if weight is not None and bias is not None:
x_normalized = x_normalized * weight + bias
return x_normalized


def make_multilayer_network(T_NetworkLayer, ModuleBase):
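As a standalone illustration of what layer_norm_numpy computes after this change, here is a plain NumPy sketch mirroring the static method above (a re-implementation for clarity, not an import of it):

import numpy as np

def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5):
    # normalize over the trailing axes given by `shape`
    axes = tuple(range(-len(shape), 0))
    mean = np.mean(x, axis=axes, keepdims=True)
    var = np.var(x, axis=axes, keepdims=True)
    y = (x - mean) / np.sqrt(var + eps)
    # shift and scale only when both parameters are provided (the new behaviour)
    if weight is not None and bias is not None:
        y = y * weight + bias
    return y

x = np.random.rand(4, 8)
w, b = np.ones(8), np.zeros(8)
# with unit weight and zero bias, the parametrized and parameter-free calls agree
np.testing.assert_allclose(layer_norm_numpy(x, (8,), w, b), layer_norm_numpy(x, (8,)))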
7 changes: 6 additions & 1 deletion deepmd/pt/model/descriptor/dpa1.py
@@ -123,7 +123,9 @@ class DescrptDPA1(BaseDescriptor, torch.nn.Module):
Time-step `dt` in the resnet construction:
y = x + dt * \phi (Wx + b)
trainable: bool
If the weights of embedding net are trainable.
If the weights of this descriptor are trainable.
trainable_ln: bool
Whether to use trainable shift and scale weights in layer normalization.
type_one_side: bool
If 'False', type embeddings of both neighbor and central atoms are considered.
If 'True', only type embeddings of neighbor atoms are considered.
@@ -205,6 +207,7 @@ def __init__(
temperature=None,
concat_output_tebd: bool = True,
trainable: bool = True,
trainable_ln: bool = True,
smooth_type_embedding: bool = True,
type_one_side: bool = False,
# not implemented
@@ -252,6 +255,7 @@ def __init__(
type_one_side=type_one_side,
exclude_types=exclude_types,
env_protection=env_protection,
trainable_ln=trainable_ln,
old_impl=old_impl,
)
self.type_embedding = TypeEmbedNet(ntypes, tebd_dim, precision=precision)
@@ -385,6 +389,7 @@ def serialize(self) -> dict:
"scaling_factor": obj.scaling_factor,
"normalize": obj.normalize,
"temperature": obj.temperature,
"trainable_ln": obj.trainable_ln,
"smooth_type_embedding": obj.smooth,
"type_one_side": obj.type_one_side,
"concat_output_tebd": self.concat_output_tebd,
16 changes: 14 additions & 2 deletions deepmd/pt/model/descriptor/se_atten.py
@@ -83,6 +83,7 @@ def __init__(
type_one_side: bool = False,
exclude_types: List[Tuple[int, int]] = [],
env_protection: float = 0.0,
trainable_ln: bool = True,
type: Optional[str] = None,
old_impl: bool = False,
):
@@ -119,6 +120,7 @@ def __init__(
self.smooth = smooth
self.type_one_side = type_one_side
self.env_protection = env_protection
self.trainable_ln = trainable_ln
self.old_impl = old_impl

if isinstance(sel, int):
@@ -157,6 +159,7 @@ def __init__(
scaling_factor=self.scaling_factor,
normalize=self.normalize,
temperature=self.temperature,
trainable_ln=self.trainable_ln,
smooth=self.smooth,
precision=self.precision,
)
@@ -468,6 +471,7 @@ def __init__(
scaling_factor: float = 1.0,
normalize: bool = True,
temperature: Optional[float] = None,
trainable_ln: bool = True,
smooth: bool = True,
precision: str = DEFAULT_PRECISION,
):
@@ -482,6 +486,7 @@ def __init__(
self.scaling_factor = scaling_factor
self.normalize = normalize
self.temperature = temperature
self.trainable_ln = trainable_ln
self.smooth = smooth
self.precision = precision
self.network_type = NeighborGatedAttentionLayer
@@ -497,7 +502,8 @@ def __init__(
scaling_factor=scaling_factor,
normalize=normalize,
temperature=temperature,
smooth=self.smooth,
trainable_ln=trainable_ln,
smooth=smooth,
precision=precision,
)
)
@@ -563,6 +569,7 @@ def serialize(self) -> dict:
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"precision": self.precision,
"attention_layers": [layer.serialize() for layer in self.attention_layers],
}
@@ -598,6 +605,7 @@ def __init__(
normalize: bool = True,
temperature: Optional[float] = None,
smooth: bool = True,
trainable_ln: bool = True,
precision: str = DEFAULT_PRECISION,
):
"""Construct a neighbor-wise attention layer."""
@@ -611,6 +619,7 @@ def __init__(
self.normalize = normalize
self.temperature = temperature
self.precision = precision
self.trainable_ln = trainable_ln
self.attention_layer = GatedAttentionLayer(
nnei,
embed_dim,
@@ -623,7 +632,9 @@ def __init__(
smooth=smooth,
precision=precision,
)
self.attn_layer_norm = LayerNorm(self.embed_dim, precision=precision)
self.attn_layer_norm = LayerNorm(
self.embed_dim, trainable=trainable_ln, precision=precision
)

def forward(
self,
@@ -655,6 +666,7 @@ def serialize(self) -> dict:
"scaling_factor": self.scaling_factor,
"normalize": self.normalize,
"temperature": self.temperature,
"trainable_ln": self.trainable_ln,
"precision": self.precision,
"attention_layer": self.attention_layer.serialize(),
"attn_layer_norm": self.attn_layer_norm.serialize(),
13 changes: 9 additions & 4 deletions deepmd/pt/model/network/layernorm.py
@@ -31,6 +31,7 @@ def __init__(
bavg: float = 0.0,
stddev: float = 1.0,
precision: str = DEFAULT_PRECISION,
trainable: bool = True,
):
self.eps = eps
self.uni_init = uni_init
@@ -50,6 +51,10 @@ def __init__(
if self.uni_init:
nn.init.ones_(self.matrix.data)
nn.init.zeros_(self.bias.data)
self.trainable = trainable
if not self.trainable:
self.matrix.requires_grad = False
self.bias.requires_grad = False

def dim_out(self) -> int:
return self.matrix.shape[0]
@@ -73,10 +78,8 @@ def forward(
mean = xx.mean(dim=-1, keepdim=True)
variance = xx.var(dim=-1, unbiased=False, keepdim=True)
yy = (xx - mean) / torch.sqrt(variance + self.eps)
if self.matrix is not None:
yy = yy * self.matrix
if self.bias is not None:
yy = yy + self.bias
if self.matrix is not None and self.bias is not None:
yy = yy * self.matrix + self.bias
return yy

def serialize(self) -> dict:
@@ -90,6 +93,7 @@ def serialize(self) -> dict:
nl = DPLayerNorm(
self.matrix.shape[0],
eps=self.eps,
trainable=self.trainable,
precision=self.precision,
)
nl.w = to_numpy_array(self.matrix)
@@ -110,6 +114,7 @@ def deserialize(cls, data: dict) -> "LayerNorm":
obj = cls(
nl["matrix"].shape[0],
eps=nl["eps"],
trainable=nl["trainable"],
precision=nl["precision"],
)
prec = PRECISION_DICT[obj.precision]
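What trainable=False does in the PyTorch LayerNorm above, shown with plain torch.nn.Parameter stand-ins for matrix and bias (a sketch under those naming assumptions, not the class itself):

import torch

# Stand-ins for the layer norm's scale ("matrix") and shift ("bias") parameters.
matrix = torch.nn.Parameter(torch.ones(8))
bias = torch.nn.Parameter(torch.zeros(8))

# Mirrors the `if not self.trainable:` branch above: the parameters stay
# registered on the module, but an optimizer will no longer update them.
for p in (matrix, bias):
    p.requires_grad = False

assert [p for p in (matrix, bias) if p.requires_grad] == []

# The forward pass itself is unchanged: normalize, then scale and shift.
x = torch.randn(4, 8)
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
y = (x - mean) / torch.sqrt(var + 1e-5) * matrix + bias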
