From b5fad1686f356ad40b42c10b09e5dc92e637a3d4 Mon Sep 17 00:00:00 2001 From: jac16 Date: Thu, 25 Jul 2024 15:03:27 -0400 Subject: [PATCH] Update tests --- src/alchemlyb/convergence/convergence.py | 106 +++++++++++++--------- src/alchemlyb/estimators/bar_.py | 19 ++-- src/alchemlyb/estimators/mbar_.py | 21 ++++- src/alchemlyb/tests/test_convergence.py | 62 +++++++++++-- src/alchemlyb/tests/test_visualisation.py | 11 ++- 5 files changed, 158 insertions(+), 61 deletions(-) diff --git a/src/alchemlyb/convergence/convergence.py b/src/alchemlyb/convergence/convergence.py index 3a6b0500..0eb85535 100644 --- a/src/alchemlyb/convergence/convergence.py +++ b/src/alchemlyb/convergence/convergence.py @@ -118,16 +118,7 @@ def forward_backward_convergence( for i in range(1, num + 1): logger.info("Backward analysis: {:.2f}%".format(100 * i / num)) sample = [] - for ii, data in enumerate(df_list): - if ( - estimator in ["MBAR", "BAR"] - and len(np.unique(np.array([x[1] for x in data.index.to_numpy()]))) > 1 - ): - raise ValueError( - "Restrict to a single fep-lambda value for a meaningful result in df_list[{}]".format( - ii - ) - ) + for data in df_list: sample.append(data[-len(data) // num * i :]) mean, error = _forward_backward_convergence_estimate( sample, estimator, my_estimator, error_tol, **kwargs @@ -453,44 +444,73 @@ def moving_average(df_list, estimator="MBAR", num=10, **kwargs): estimator_fit = estimators_dispatch[estimator](**kwargs).fit logger.info(f"Use {estimator} estimator for convergence analysis.") - logger.info("Begin Moving Average Analysis") + logger.info("Check indices") + if estimator in ["MBAR"]: + index_1 = [ + np.unique(np.array([x[1] for x in data.index.to_numpy()])) + for data in df_list + ] + if len(np.unique(index_1)) == 1 and len(df_list[0].index[0]) > 2: + index_2 = [ + np.unique(np.array([x[2] for x in data.index.to_numpy()])) + for data in df_list + ] + if len(np.unique(index_2)) > 1: + raise ValueError( + "Restrict to a single fep-lambda value for 
a meaningful result. index[2] for each file" + " in df_list: {}".format(index_2) + ) + elif len(np.unique(index_1)) != 1: + raise ValueError( + "Restrict to a single fep-lambda value for a meaningful result. index[1] for each file" + " in df_list: {}".format(index_1) + ) + elif estimator in ["BAR"]: + index_1 = [ + np.unique(np.array([x[1] for x in data.index.to_numpy()])) + for data in df_list + ] + if len(np.unique(index_1)) == 1 and len(df_list[0].index[0]) > 2: + index_2 = [ + np.unique(np.array([x[2] for x in data.index.to_numpy()])) + for data in df_list + ] + if len(np.unique(index_2)) != 2: + raise ValueError( + "Restrict to a fep-lambda value and its forward adjacent state for a meaningful " + "result. index[2] for each file in df_list: {}".format(index_2) + ) + elif len(np.unique(index_1)) != 2: + raise ValueError( + "Restrict to a fep-lambda value and its forward adjacent state for a meaningful " + "result. index[1] for each file in df_list: {}".format(index_1) + ) + logger.info("Begin Moving Average Analysis") average_list = [] average_error_list = [] + + # Concatenate dataframes + sample = [] + data = df_list[0] + for tmp_data in df_list[1:]: + data = concat([data, tmp_data]) + for i in range(1, num): logger.info("Moving Average Analysis: {:.2f}%".format(100 * i / num)) - sample = [] - for ii, data in enumerate(df_list): - fep_values = np.unique(np.array([x[1] for x in data.index.to_numpy()])) - if estimator == "MBAR": - if len(fep_values) > 1: - raise ValueError( - "Restrict to a single fep-lambda value for a meaningful result in df_list[{}]".format( - ii - ) - ) - else: - sample.append( - data[len(data) // num * (i - 1) : len(data) // num * i] - ) - elif estimator == "BAR": - if len(fep_values) > 2: - raise ValueError( - "Restrict to a fep-lambda value and its forward adjacent state for a meaningful result in df_list[{}]".format( - ii - ) - ) - else: - data1 = data.iloc[ - data.index.get_level_values("fep-lambda").isin([fep_values[0]]) - ] - data2 = 
data.iloc[ - data.index.get_level_values("fep-lambda").isin([fep_values[1]]) - ] - lx = min(len(data1), len(data2)) - ind1, ind2 = lx // num * (i - 1), lx // num * i - sample.append(concat([data1[ind1:ind2], data2[ind1:ind2]])) - sample = concat(sample) + if estimator == "MBAR": + sample = data[len(data) // num * (i - 1) : len(data) // num * i] + elif estimator == "BAR": + ind, indices = 1, np.unique(np.array([x[1] for x in data.index.to_numpy()])) + if len(indices) != 2 and len(df_list[0].index[0]) > 2: + ind, indices = 2, np.unique( + np.array([x[2] for x in data.index.to_numpy()]) + ) + data1 = data.iloc[data.index.get_level_values(ind).isin([indices[0]])] + data2 = data.iloc[data.index.get_level_values(ind).isin([indices[1]])] + lx = min(len(data1), len(data2)) + ind1, ind2 = lx // num * (i - 1), lx // num * i + sample = concat([data1[ind1:ind2], data2[ind1:ind2]]) result = estimator_fit(sample) average_list.append(result.delta_f_.iloc[0, -1]) diff --git a/src/alchemlyb/estimators/bar_.py b/src/alchemlyb/estimators/bar_.py index 998785cc..bbd20982 100644 --- a/src/alchemlyb/estimators/bar_.py +++ b/src/alchemlyb/estimators/bar_.py @@ -88,21 +88,22 @@ def fit(self, u_nk): # sort by state so that rows from same state are in contiguous blocks u_nk = u_nk.sort_index(level=u_nk.index.names[1:]) + # get a list of the lambda states that are sampled + self._states_ = u_nk.columns.values.tolist() + # group u_nk by lambda states groups = u_nk.groupby(level=u_nk.index.names[1:]) N_k = [ (len(groups.get_group(i)) if i in groups.groups else 0) for i in u_nk.columns ] - - # get a list of the lambda states that are sampled - self._states_ = [x for i, x in enumerate(u_nk.columns.values.tolist()) if N_k[i] > 0] - N_k = [x for x in N_k if x > 0] - + states = [x for i, x in enumerate(self._states_) if N_k[i] > 0] # Now get free energy differences and their uncertainties for each step deltas = np.array([]) d_deltas = np.array([]) for k in range(len(N_k) - 1): + if N_k[k] == 0 or 
N_k[k + 1] == 0: + continue # get us from lambda step k uk = groups.get_group(self._states_[k]) # get w_F @@ -110,7 +111,7 @@ def fit(self, u_nk): # get us from lambda step k+1 uk1 = groups.get_group(self._states_[k + 1]) - + # get w_R w_r = uk1.iloc[:, k] - uk1.iloc[:, k + 1] @@ -152,13 +153,11 @@ def fit(self, u_nk): ad_delta += np.diagflat(np.array(dout), k=j + 1) # yield standard delta_f_ free energies between each state - self._delta_f_ = pd.DataFrame( - adelta - adelta.T, columns=self._states_, index=self._states_ - ) + self._delta_f_ = pd.DataFrame(adelta - adelta.T, columns=states, index=states) # yield standard deviation d_delta_f_ between each state self._d_delta_f_ = pd.DataFrame( - np.sqrt(ad_delta + ad_delta.T), columns=self._states_, index=self._states_ + np.sqrt(ad_delta + ad_delta.T), columns=states, index=states ) self._delta_f_.attrs = u_nk.attrs self._d_delta_f_.attrs = u_nk.attrs diff --git a/src/alchemlyb/estimators/mbar_.py b/src/alchemlyb/estimators/mbar_.py index e0ab594a..d0a2b31e 100644 --- a/src/alchemlyb/estimators/mbar_.py +++ b/src/alchemlyb/estimators/mbar_.py @@ -33,7 +33,7 @@ class MBAR(BaseEstimator, _EstimatorMixOut): .. versionchanged:: 2.3.0 The new default is now "BAR" as it provides a substantial speedup over the previous default `None`. - + method : str, optional, default="robust" The optimization routine to use. 
This can be any of the methods @@ -135,6 +135,25 @@ def fit(self, u_nk): ) bar.fit(u_nk) initial_f_k = bar.delta_f_.iloc[0, :] + states = [ + x + for i, x in enumerate(self._states_[:-1]) + if N_k[i] > 0 and N_k[i + 1] > 0 + ] + if len(bar.delta_f_.iloc[0, :]) != len(self._states_): + states = [ + x + for i, x in enumerate(self._states_[:-1]) + if N_k[i] > 0 and N_k[i + 1] > 0 + ] + initial_f_k = pd.Series( + [ + initial_f_k.loc[x] if x in states else np.nan + for x in self._states_ + ], + index=self._states_, + dtype=float, + ) else: initial_f_k = self.initial_f_k diff --git a/src/alchemlyb/tests/test_convergence.py b/src/alchemlyb/tests/test_convergence.py index 4e2afad9..a547a3c8 100644 --- a/src/alchemlyb/tests/test_convergence.py +++ b/src/alchemlyb/tests/test_convergence.py @@ -31,14 +31,64 @@ def test_convergence_fep(gmx_benzene_Coulomb_u_nk, estimator): assert convergence.loc[9, "Backward"] == pytest.approx(3.04, 0.01) +@pytest.mark.parametrize("estimator", ["DUMMY"]) +def test_moving_average_error_1(gmx_ABFE_complex_u_nk, estimator): + with pytest.raises(ValueError, match=r"Estimator DUMMY is not available .*"): + _ = moving_average(gmx_ABFE_complex_u_nk, estimator) + + +@pytest.mark.parametrize("estimator", ["MBAR"]) +def test_moving_average_error_2_mbar(gmx_ABFE_complex_u_nk, estimator): + df_list = gmx_ABFE_complex_u_nk[10:15] + with pytest.raises( + ValueError, + match=r"Restrict to a single fep-lambda value for a meaningful result. .*", + ): + _ = moving_average(df_list, estimator) + + df_list = gmx_ABFE_complex_u_nk[14:17] + with pytest.raises( + ValueError, + match=r"Restrict to a single fep-lambda value for a meaningful result. 
.*", + ): + _ = moving_average(df_list, estimator) + + +@pytest.mark.parametrize("estimator", ["BAR"]) +def test_moving_average_error_2_bar(gmx_ABFE_complex_u_nk, estimator): + df_list = gmx_ABFE_complex_u_nk[10:13] + with pytest.raises( + ValueError, + match=r"Restrict to a fep-lambda value and its forward adjacent state .*", + ): + _ = moving_average(df_list, estimator) + + df_list = gmx_ABFE_complex_u_nk[14:17] + with pytest.raises( + ValueError, + match=r"Restrict to a fep-lambda value and its forward adjacent state .*", + ): + _ = moving_average(df_list, estimator) + + +@pytest.mark.parametrize("estimator", ["BAR"]) +def test_moving_average_bar(gmx_ABFE_complex_u_nk, estimator): + df_avg = moving_average(gmx_ABFE_complex_u_nk[14:16], estimator) + assert df_avg.shape == (9, 2) + assert df_avg.loc[0, "FE"] == pytest.approx(0.658, 0.01) + assert df_avg.loc[0, "FE_Error"] == pytest.approx(0.054, 0.1) + assert df_avg.loc[8, "FE"] == pytest.approx(0.926, 0.01) + assert df_avg.loc[8, "FE_Error"] == pytest.approx(0.05, 0.1) + + @pytest.mark.parametrize("estimator", ["MBAR"]) -def test_moving_average_fep(gmx_benzene_Coulomb_u_nk, estimator): - df_avg = moving_average(gmx_benzene_Coulomb_u_nk, estimator) +def test_moving_average_mbar(gmx_benzene_Coulomb_u_nk, estimator): + df_avg = moving_average([gmx_benzene_Coulomb_u_nk[0]], estimator) assert df_avg.shape == (9, 2) - assert df_avg.loc[0, "FE"] == pytest.approx(3.01, 0.01) - assert df_avg.loc[0, "FE_Error"] == pytest.approx(0.067, 0.01) - assert df_avg.loc[8, "FE"] == pytest.approx(3.10, 0.01) - assert df_avg.loc[8, "FE_Error"] == pytest.approx(0.066, 0.01) + assert df_avg.loc[0, "FE"] == pytest.approx(3.41, 0.01) + assert df_avg.loc[0, "FE_Error"] == pytest.approx(0.22, 0.01) + assert df_avg.loc[8, "FE"] == pytest.approx(2.83, 0.01) + assert df_avg.loc[8, "FE_Error"] == pytest.approx(0.33, 0.01) def test_convergence_wrong_estimator(gmx_benzene_Coulomb_dHdl): diff --git a/src/alchemlyb/tests/test_visualisation.py 
b/src/alchemlyb/tests/test_visualisation.py index f133450c..1d1fe203 100644 --- a/src/alchemlyb/tests/test_visualisation.py +++ b/src/alchemlyb/tests/test_visualisation.py @@ -147,7 +147,7 @@ def test_plot_dF_state( def test_plot_convergence_dataframe(gmx_benzene_Coulomb_u_nk): - df = forward_backward_convergence(gmx_benzene_Coulomb_u_nk, "MBAR") + df = forward_backward_convergence([gmx_benzene_Coulomb_u_nk[0]], "MBAR") ax = plot_convergence(df) assert isinstance(ax, matplotlib.axes.Axes) plt.close(ax.figure) @@ -241,6 +241,15 @@ def test_plot_moving_average(gmx_benzene_Coulomb_u_nk): assert isinstance(ax, matplotlib.axes.Axes) plt.close(ax.figure) + ax = plot_moving_average(df, units="kJ/mol") + assert isinstance(ax, matplotlib.axes.Axes) + plt.close(ax.figure) + + df = df.drop("FE_Error", axis=1) + ax = plot_moving_average(df) + assert isinstance(ax, matplotlib.axes.Axes) + plt.close(ax.figure) + class Test_Units: @staticmethod