diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py
index bff3dd93bc..9df3a5fb32 100644
--- a/deepmd/pt/model/task/dipole.py
+++ b/deepmd/pt/model/task/dipole.py
@@ -132,14 +132,6 @@ def output_def(self) -> FittingOutputDef:
             ]
         )
 
-    @property
-    def data_stat_key(self):
-        """
-        Get the keys for the data statistic of the fitting.
-        Return a list of statistic names needed, such as "bias_atom_e".
-        """
-        return []
-
     def forward(
         self,
         descriptor: torch.Tensor,
diff --git a/deepmd/pt/model/task/ener.py b/deepmd/pt/model/task/ener.py
index 8479111819..ff7ae6f8ec 100644
--- a/deepmd/pt/model/task/ener.py
+++ b/deepmd/pt/model/task/ener.py
@@ -28,8 +28,11 @@
 from deepmd.pt.utils.env import (
     DEFAULT_PRECISION,
 )
-from deepmd.pt.utils.stat import (
-    compute_output_bias,
+from deepmd.pt.utils.utils import (
+    to_numpy_array,
+)
+from deepmd.utils.out_stat import (
+    compute_stats_from_redu,
 )
 from deepmd.utils.path import (
     DPPath,
@@ -135,16 +138,8 @@ def serialize(self) -> dict:
         data["atom_ener"] = self.atom_ener
         return data
 
-    @property
-    def data_stat_key(self):
-        """
-        Get the keys for the data statistic of the fitting.
-        Return a list of statistic names needed, such as "bias_atom_e".
-        """
-        return ["bias_atom_e"]
-
     def compute_output_stats(self, merged, stat_file_path: Optional[DPPath] = None):
-        energy = [item["energy"] for item in merged]
+        energy = [item[self.var_name] for item in merged]
         data_mixed_type = "real_natoms_vec" in merged[0]
         if data_mixed_type:
             input_natoms = [item["real_natoms_vec"] for item in merged]
@@ -155,7 +150,22 @@ def compute_output_stats(self, merged, stat_file_path: Optional[DPPath] = None):
         if stat_file_path is not None and stat_file_path.is_file():
             bias_atom_e = stat_file_path.load_numpy()
         else:
-            bias_atom_e = compute_output_bias(energy, input_natoms, rcond=self.rcond)
+            # shape: (nframes, ndim)
+            merged_energy = to_numpy_array(torch.cat(energy))
+            # shape: (nframes, ntypes)
+            merged_natoms = to_numpy_array(torch.cat(input_natoms)[:, 2:])
+            if self.atom_ener is not None and len(self.atom_ener) > 0:
+                assigned_atom_ener = np.array(
+                    [ee if ee is not None else np.nan for ee in self.atom_ener]
+                )
+            else:
+                assigned_atom_ener = None
+            bias_atom_e, _ = compute_stats_from_redu(
+                merged_energy,
+                merged_natoms,
+                assigned_bias=assigned_atom_ener,
+                rcond=self.rcond,
+            )
             if stat_file_path is not None:
                 stat_file_path.save_numpy(bias_atom_e)
         assert all(x is not None for x in [bias_atom_e])
diff --git a/deepmd/pt/model/task/fitting.py b/deepmd/pt/model/task/fitting.py
index 0c64983f60..20876d9be7 100644
--- a/deepmd/pt/model/task/fitting.py
+++ b/deepmd/pt/model/task/fitting.py
@@ -92,14 +92,6 @@ def share_params(self, base_class, shared_level, resume=False):
         else:
             raise NotImplementedError
 
-    @property
-    def data_stat_key(self):
-        """
-        Get the keys for the data statistic of the fitting.
-        Return a list of statistic names needed, such as "bias_atom_e".
-        """
-        raise NotImplementedError("data_stat_key is not implemented!")
-
     def change_energy_bias(
         self, config, model, old_type_map, new_type_map, bias_shift="delta", ntest=10
     ):
diff --git a/deepmd/pt/model/task/polarizability.py b/deepmd/pt/model/task/polarizability.py
index 13b0d56e31..1bc4798c48 100644
--- a/deepmd/pt/model/task/polarizability.py
+++ b/deepmd/pt/model/task/polarizability.py
@@ -160,14 +160,6 @@ def output_def(self) -> FittingOutputDef:
             ]
         )
 
-    @property
-    def data_stat_key(self):
-        """
-        Get the keys for the data statistic of the fitting.
-        Return a list of statistic names needed, such as "bias_atom_e".
-        """
-        return []
-
     def forward(
         self,
         descriptor: torch.Tensor,
diff --git a/deepmd/pt/utils/env_mat_stat.py b/deepmd/pt/utils/env_mat_stat.py
index 70b7228440..cd2943e6a8 100644
--- a/deepmd/pt/utils/env_mat_stat.py
+++ b/deepmd/pt/utils/env_mat_stat.py
@@ -80,7 +80,7 @@ def iter(
         Parameters
         ----------
         data : List[Dict[str, torch.Tensor]]
-            The environment matrix.
+            The data.
 
         Yields
         ------
diff --git a/deepmd/pt/utils/stat.py b/deepmd/pt/utils/stat.py
index 38f71d6994..4c769f019e 100644
--- a/deepmd/pt/utils/stat.py
+++ b/deepmd/pt/utils/stat.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import logging
 
-import numpy as np
 import torch
 
 log = logging.getLogger(__name__)
@@ -57,23 +56,3 @@ def make_stat_input(datasets, dataloaders, nbatches):
                 sys_stat[key] = sys_stat_list
         lst.append(sys_stat)
     return lst
-
-
-def compute_output_bias(energy, natoms, rcond=None):
-    """Update output bias for fitting net.
-
-    Args:
-    - energy: Batched energy with shape [nframes, 1].
-    - natoms: Batched atom statisics with shape [self.ntypes+2].
-
-    Returns
-    -------
-    - energy_coef: Average enery per atom for each element.
-    """
-    for i in range(len(energy)):
-        energy[i] = energy[i].mean(dim=0, keepdim=True)
-        natoms[i] = natoms[i].double().mean(dim=0, keepdim=True)
-    sys_ener = torch.cat(energy).cpu()
-    sys_tynatom = torch.cat(natoms)[:, 2:].cpu()
-    energy_coef, _, _, _ = np.linalg.lstsq(sys_tynatom, sys_ener, rcond)
-    return energy_coef
diff --git a/deepmd/tf/fit/dos.py b/deepmd/tf/fit/dos.py
index e8681f47ea..0cc5a7df62 100644
--- a/deepmd/tf/fit/dos.py
+++ b/deepmd/tf/fit/dos.py
@@ -43,6 +43,9 @@
 from deepmd.tf.utils.network import (
     one_layer_rand_seed_shift,
 )
+from deepmd.utils.out_stat import (
+    compute_stats_from_redu,
+)
 
 log = logging.getLogger(__name__)
 
@@ -225,8 +228,10 @@ def _compute_output_stats(self, all_stat, rcond=1e-3, mixed_type=False):
         sys_tynatom = np.reshape(sys_tynatom, [nsys, -1])
         sys_tynatom = sys_tynatom[:, 2:]
 
-        dos_shift, resd, rank, s_value = np.linalg.lstsq(
-            sys_tynatom, sys_dos, rcond=rcond
+        dos_shift, _ = compute_stats_from_redu(
+            sys_dos,
+            sys_tynatom,
+            rcond=rcond,
         )
 
         return dos_shift
diff --git a/deepmd/tf/fit/ener.py b/deepmd/tf/fit/ener.py
index 106e10839d..a842df50bd 100644
--- a/deepmd/tf/fit/ener.py
+++ b/deepmd/tf/fit/ener.py
@@ -53,6 +53,9 @@
 from deepmd.tf.utils.spin import (
     Spin,
 )
+from deepmd.utils.out_stat import (
+    compute_stats_from_redu,
+)
 from deepmd.utils.version import (
     check_version_compatibility,
 )
@@ -295,21 +298,17 @@ def _compute_output_stats(self, all_stat, rcond=1e-3, mixed_type=False):
             # In this situation, we directly use these assigned energies instead of computing stats.
             # This will make the loss decrease quickly
             assigned_atom_ener = np.array(
-                [ee for ee in self.atom_ener_v if ee is not None]
+                [ee if ee is not None else np.nan for ee in self.atom_ener_v]
             )
-            assigned_ener_idx = [
-                ii for ii, ee in enumerate(self.atom_ener_v) if ee is not None
-            ]
-            # np.dot out size: nframe
-            sys_ener -= np.dot(sys_tynatom[:, assigned_ener_idx], assigned_atom_ener)
-            sys_tynatom[:, assigned_ener_idx] = 0.0
-        energy_shift, resd, rank, s_value = np.linalg.lstsq(
-            sys_tynatom, sys_ener, rcond=rcond
+        else:
+            assigned_atom_ener = None
+        energy_shift, _ = compute_stats_from_redu(
+            sys_ener.reshape(-1, 1),
+            sys_tynatom,
+            assigned_bias=assigned_atom_ener,
+            rcond=rcond,
         )
-        if len(self.atom_ener) > 0:
-            for ii in assigned_ener_idx:
-                energy_shift[ii] = self.atom_ener_v[ii]
-        return energy_shift
+        return energy_shift.ravel()
 
     def compute_input_stats(self, all_stat: dict, protection: float = 1e-2) -> None:
         """Compute the input statistics.
diff --git a/deepmd/tf/fit/polar.py b/deepmd/tf/fit/polar.py
index 002082ad2e..7ac31809f3 100644
--- a/deepmd/tf/fit/polar.py
+++ b/deepmd/tf/fit/polar.py
@@ -151,16 +151,14 @@ def get_out_size(self) -> int:
         """Get the output size. Should be 9."""
         return 9
 
-    def compute_input_stats(self, all_stat, protection=1e-2):
-        """Compute the input statistics.
+    def compute_output_stats(self, all_stat):
+        """Compute the output statistics.
 
         Parameters
         ----------
         all_stat
             Dictionary of inputs.
             can be prepared by model.make_stat_input
-        protection
-            Divided-by-zero protection
         """
         if "polarizability" not in all_stat.keys():
             self.avgeig = np.zeros([9])
diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py
index 20111558cf..592b1f9748 100644
--- a/deepmd/utils/data_system.py
+++ b/deepmd/utils/data_system.py
@@ -22,6 +22,9 @@
 from deepmd.utils.data import (
     DeepmdData,
 )
+from deepmd.utils.out_stat import (
+    compute_stats_from_redu,
+)
 
 log = logging.getLogger(__name__)
 
@@ -248,10 +251,12 @@ def compute_energy_shift(self, rcond=None, key="energy"):
         sys_tynatom = np.array(self.natoms_vec, dtype=GLOBAL_NP_FLOAT_PRECISION)
         sys_tynatom = np.reshape(sys_tynatom, [self.nsystems, -1])
         sys_tynatom = sys_tynatom[:, 2:]
-        energy_shift, resd, rank, s_value = np.linalg.lstsq(
-            sys_tynatom, sys_ener, rcond=rcond
+        energy_shift, _ = compute_stats_from_redu(
+            sys_ener.reshape(-1, 1),
+            sys_tynatom,
+            rcond=rcond,
         )
-        return energy_shift
+        return energy_shift.ravel()
 
     def add_dict(self, adict: dict) -> None:
         """Add items to the data system by a `dict`.
diff --git a/deepmd/utils/out_stat.py b/deepmd/utils/out_stat.py
new file mode 100644
index 0000000000..8f68e32417
--- /dev/null
+++ b/deepmd/utils/out_stat.py
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Output statistics."""
+from typing import (
+    Optional,
+    Tuple,
+)
+
+import numpy as np
+
+
+def compute_stats_from_redu(
+    output_redu: np.ndarray,
+    natoms: np.ndarray,
+    assigned_bias: Optional[np.ndarray] = None,
+    rcond: Optional[float] = None,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute the output statistics.
+
+    Given the reduced output value and the number of atoms of each type,
+    compute the least-squares solution as the atomic output bias and std.
+
+    Parameters
+    ----------
+    output_redu
+        The reduced output value, shape is [nframes, ndim].
+    natoms
+        The number of atoms of each type, shape is [nframes, ntypes].
+    assigned_bias
+        The assigned output bias, shape is [ntypes, ndim]. A type whose
+        row contains any nan is treated as not assigned.
+    rcond
+        Cut-off ratio for small singular values, passed to np.linalg.lstsq.
+
+    Returns
+    -------
+    np.ndarray
+        The computed output bias, shape is [ntypes, ndim].
+    np.ndarray
+        The std of the least-squares residual over frames, shape is [ndim].
+    """
+    output_redu = np.array(output_redu)
+    natoms = np.array(natoms)
+    # check shape
+    assert output_redu.ndim == 2
+    assert natoms.ndim == 2
+    assert output_redu.shape[0] == natoms.shape[0]  # nframes
+    if assigned_bias is not None:
+        assigned_bias = np.array(assigned_bias).reshape(
+            natoms.shape[1], output_redu.shape[1]
+        )
+    # compute output bias
+    if assigned_bias is not None:
+        # Atomic energies stats are incorrect if atomic energies are assigned.
+        # In this situation, we directly use these assigned energies instead of computing stats.
+        # This will make the loss decrease quickly
+        assigned_bias_atom_mask = ~np.isnan(assigned_bias).any(axis=1)
+        # assigned_bias_masked: nmask, ndim
+        assigned_bias_masked = assigned_bias[assigned_bias_atom_mask]
+        # assigned_bias_natoms: nframes, nmask
+        assigned_bias_natoms = natoms[:, assigned_bias_atom_mask]
+        # output_redu: nframes, ndim
+        output_redu -= np.einsum(
+            "ij,jk->ik", assigned_bias_natoms, assigned_bias_masked
+        )
+        # zero the columns of the assigned types before the least-squares fit
+        natoms[:, assigned_bias_atom_mask] = 0
+
+    # computed_output_bias: ntypes, ndim
+    computed_output_bias, _, _, _ = np.linalg.lstsq(natoms, output_redu, rcond=rcond)
+    if assigned_bias is not None:
+        # overwrite the rows of the assigned types with the assigned values
+        computed_output_bias[assigned_bias_atom_mask] = assigned_bias_masked
+    # rest_redu: nframes, ndim
+    rest_redu = output_redu - np.einsum("ij,jk->ik", natoms, computed_output_bias)
+    output_std = rest_redu.std(axis=0)
+    return computed_output_bias, output_std
+
+
+def compute_stats_from_atomic(
+    output: np.ndarray,
+    atype: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute the output statistics.
+
+    Given the output value and the type of atoms,
+    compute the per-type mean (bias) and std of the atomic output.
+
+    Parameters
+    ----------
+    output
+        The output value, shape is [nframes, nloc, ndim].
+    atype
+        The type of atoms, shape is [nframes, nloc].
+
+    Returns
+    -------
+    np.ndarray
+        The computed output bias, shape is [ntypes, ndim].
+    np.ndarray
+        The computed output std, shape is [ntypes, ndim].
+    """
+    output = np.array(output)
+    atype = np.array(atype)
+    # check shape
+    assert output.ndim == 3
+    assert atype.ndim == 2
+    assert output.shape[:2] == atype.shape
+    # compute output bias; ntypes is inferred from the largest type index in atype
+    nframes, nloc, ndim = output.shape
+    ntypes = atype.max() + 1
+    output_bias = np.zeros((ntypes, ndim))
+    output_std = np.zeros((ntypes, ndim))
+    for type_i in range(ntypes):
+        mask = atype == type_i
+        output_bias[type_i] = output[mask].mean(axis=0)
+        output_std[type_i] = output[mask].std(axis=0)
+    return output_bias, output_std
diff --git a/source/tests/common/test_out_stat.py b/source/tests/common/test_out_stat.py
new file mode 100644
index 0000000000..c0cfc25071
--- /dev/null
+++ b/source/tests/common/test_out_stat.py
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+import numpy as np
+
+from deepmd.utils.out_stat import (
+    compute_stats_from_atomic,
+    compute_stats_from_redu,
+)
+
+
+class TestOutStat(unittest.TestCase):
+    def setUp(self) -> None:
+        rng = np.random.default_rng(20240227)
+        ndim = 5
+        nframes = 1000
+        ntypes = 3
+        nloc = 1000
+        self.atype = rng.integers(0, ntypes, size=(nframes, nloc))
+        # compute the number of atoms for each type in each frame
+        self.natoms = np.zeros((nframes, ntypes), dtype=np.int64)
+        for i in range(ntypes):
+            self.natoms[:, i] = (self.atype == i).sum(axis=1)
+        self.mean = rng.random((ntypes, ndim)) * 1e4
+        self.std = rng.random((ntypes, ndim)) * 1e-3
+
+        # draw per-atom outputs from a normal distribution set by each atom's type
+        self.output = rng.normal(
+            loc=self.mean[self.atype, :],
+            scale=self.std[self.atype, :],
+            size=(nframes, nloc, ndim),
+        )
+        self.output_redu = self.output.sum(axis=1)
+
+        return super().setUp()
+
+    def test_compute_stats_from_redu(self):
+        bias, std = compute_stats_from_redu(self.output_redu, self.natoms)
+        np.testing.assert_allclose(bias, self.mean, rtol=1e-7)
+        reference_std = np.array(
+            [
+                0.01700638138272794,
+                0.01954897296228177,
+                0.020281857747683162,
+                0.010741237959989648,
+                0.020258211828681347,
+            ]
+        )
+        np.testing.assert_allclose(
+            std,
+            reference_std,
+            rtol=1e-7,
+        )
+        # the fitted bias must reproduce the reduced output
+        np.testing.assert_allclose(
+            self.output_redu,
+            self.natoms @ bias,
+            rtol=1e-7,
+        )
+
+    def test_compute_stats_from_redu_with_assigned_bias(self):
+        assigned_bias = np.full_like(self.mean, np.nan)
+        assigned_bias[0] = self.mean[0]
+        bias, std = compute_stats_from_redu(
+            self.output_redu,
+            self.natoms,
+            assigned_bias=assigned_bias,
+        )
+        np.testing.assert_allclose(bias, self.mean, rtol=1e-7)
+        np.testing.assert_allclose(bias[0], self.mean[0], rtol=1e-14)
+        reference_std = np.array(
+            [
+                0.017015794087883902,
+                0.019549011723239484,
+                0.020285565914828625,
+                0.01074124012073672,
+                0.020283557003416414,
+            ]
+        )
+        np.testing.assert_allclose(
+            std,
+            reference_std,
+            rtol=1e-7,
+        )
+        # the fitted bias must reproduce the reduced output
+        np.testing.assert_allclose(
+            self.output_redu,
+            self.natoms @ bias,
+            rtol=1e-7,
+        )
+
+    def test_compute_stats_from_atomic(self):
+        bias, std = compute_stats_from_atomic(self.output, self.atype)
+        np.testing.assert_allclose(bias, self.mean)
+        reference_std = np.array(
+            [
+                [
+                    0.0005452949516910239,
+                    0.000686732800598535,
+                    0.00089423457667224,
+                    7.818017989121455e-05,
+                    0.0004758637035637342,
+                ],
+                [
+                    2.0610161678825724e-05,
+                    0.0007728218734771541,
+                    0.0004754659308165858,
+                    0.0001809007655290948,
+                    0.0008187364708029638,
+                ],
+                [
+                    0.0007935836092665254,
+                    0.00031176505013516624,
+                    0.0005469653430009186,
+                    0.0005652240916389281,
+                    0.0006087722080071852,
+                ],
+            ]
+        )
+        np.testing.assert_allclose(
+            std,
+            reference_std,
+            rtol=1e-7,
+        )
diff --git a/source/tests/pt/test_stat.py b/source/tests/pt/test_stat.py
index 1e3c707d6f..98d4e59d95 100644
--- a/source/tests/pt/test_stat.py
+++ b/source/tests/pt/test_stat.py
@@ -20,15 +20,15 @@
 from deepmd.pt.model.descriptor.dpa1 import (
     DescrptDPA1,
 )
+from deepmd.pt.model.task.ener import (
+    EnergyFittingNet,
+)
 from deepmd.pt.utils import (
     env,
 )
 from deepmd.pt.utils.dataloader import (
     DpLoaderSet,
 )
-from deepmd.pt.utils.stat import (
-    compute_output_bias,
-)
 from deepmd.pt.utils.stat import make_stat_input as my_make
 from deepmd.tf.common import (
     expand_sys_str,
@@ -145,9 +145,14 @@ def my_merge(energy, natoms):
         dp_fn = EnerFitting(
             self.dp_d.get_ntypes(), self.dp_d.get_dim_out(), self.n_neuron
         )
-        dp_fn.compute_output_stats(self.dp_sampled)
-        bias_atom_e = compute_output_bias(energy, natoms)
-        self.assertTrue(np.allclose(dp_fn.bias_atom_e, bias_atom_e[:, 0]))
+        dp_fn.compute_output_stats(self.dp_sampled, mixed_type=self.mixed_type)
+        pt_fn = EnergyFittingNet(
+            self.dp_d.get_ntypes(), self.dp_d.get_dim_out(), self.n_neuron
+        )
+        pt_fn.compute_output_stats(self.my_sampled)
+        np.testing.assert_allclose(
+            dp_fn.bias_atom_e, pt_fn.bias_atom_e.detach().cpu().numpy().ravel()
+        )
 
     # temporarily delete this function for performance of seeds in tf and pytorch may be different
     """
diff --git a/source/tests/tf/common.py b/source/tests/tf/common.py
index a83397c11c..0bcb29b4b5 100644
--- a/source/tests/tf/common.py
+++ b/source/tests/tf/common.py
@@ -17,6 +17,9 @@
     tf,
 )
 from deepmd.tf.utils import random as dp_random
+from deepmd.utils.out_stat import (
+    compute_stats_from_redu,
+)
 
 if GLOBAL_NP_FLOAT_PRECISION == np.float32:
     global_default_fv_hh = 1e-2
@@ -1041,10 +1044,12 @@ def compute_energy_shift(self):
         sys_tynatom = np.array(self.natoms_vec, dtype=GLOBAL_NP_FLOAT_PRECISION)
         sys_tynatom = np.reshape(sys_tynatom, [self.nsystems, -1])
         sys_tynatom = sys_tynatom[:, 2:]
-        energy_shift, resd, rank, s_value = np.linalg.lstsq(
-            sys_tynatom, sys_ener, rcond=None
+        energy_shift, _ = compute_stats_from_redu(
+            sys_ener.reshape(-1, 1),
+            sys_tynatom,
+            rcond=None,
         )
-        return energy_shift
+        return energy_shift.ravel()
 
     def process_sys_weights(self, sys_weights):
         sys_weights = np.array(sys_weights)