From 1d0a111586e736d4c990d133274963942a34b4eb Mon Sep 17 00:00:00 2001
From: Zhiyi Wu
Date: Thu, 1 Jun 2023 20:17:54 +0100
Subject: [PATCH] parquet parser for dataframe serialisation (#317)

* update

* update

* Update docs/parsing.rst

Co-authored-by: Oliver Beckstein

* Update src/alchemlyb/parsing/parquet.py

Co-authored-by: Oliver Beckstein

* Update src/alchemlyb/parsing/parquet.py

Co-authored-by: Oliver Beckstein

* Update src/alchemlyb/parsing/parquet.py

Co-authored-by: Oliver Beckstein

* update

* update

---------

Co-authored-by: William (Zhiyi) Wu
Co-authored-by: Oliver Beckstein
---
 CHANGES                                     |  4 +
 devtools/conda-envs/test_env.yaml           |  1 +
 docs/parsing.rst                            | 24 ++++++
 docs/parsing/alchemlyb.parsing.parquet.rst  |  8 ++
 environment.yml                             |  1 +
 setup.py                                    |  1 +
 src/alchemlyb/parsing/parquet.py            | 84 +++++++++++++++++++++
 src/alchemlyb/tests/parsing/test_parquet.py | 23 ++++++
 src/alchemlyb/tests/test_workflow_ABFE.py   | 31 ++++++++
 src/alchemlyb/workflows/abfe.py             | 17 +++--
 10 files changed, 189 insertions(+), 5 deletions(-)
 create mode 100644 docs/parsing/alchemlyb.parsing.parquet.rst
 create mode 100644 src/alchemlyb/parsing/parquet.py
 create mode 100644 src/alchemlyb/tests/parsing/test_parquet.py

diff --git a/CHANGES b/CHANGES
index 9f5c527a..1924039c 100644
--- a/CHANGES
+++ b/CHANGES
@@ -17,6 +17,10 @@ The rules for this file:
 
 * 2.1.0
 
+Enhancements
+  - Add a parser to read serialised pandas dataframes (parquet) (issue #316, PR #317).
+  - workflow.ABFE allows parquet as input (issue #316, PR #317).
+
 Fixes
   - Fix the case where visualisation.plot_convergence would fail when the
     final error is NaN (issue #318, PR #317).
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
index 63b439cc..b9dcd4e0 100644
--- a/devtools/conda-envs/test_env.yaml
+++ b/devtools/conda-envs/test_env.yaml
@@ -9,6 +9,7 @@ dependencies:
 - scipy
 - scikit-learn
 - matplotlib
+- pyarrow
 
 # Testing
 - pytest
diff --git a/docs/parsing.rst b/docs/parsing.rst
index 17d158a2..d107278e 100644
--- a/docs/parsing.rst
+++ b/docs/parsing.rst
@@ -49,6 +49,29 @@
 requires some care due to shortcomings in how pandas currently handles
 metadata (see issue `pandas-dev/pandas#28283
 <https://github.com/pandas-dev/pandas/issues/28283>`_).
 
+Serialisation
+'''''''''''''
+
+Alchemlyb data structures (``dHdl`` and ``u_nk``) can be serialised as dataframes
+and made persistent.
+We use the `parquet <https://parquet.apache.org/>`_
+format for serialising (writing) to a file and de-serialising (reading) from a
+parquet file.
+
+For serialisation we simply use the :meth:`pandas.DataFrame.to_parquet` method of
+a :class:`pandas.DataFrame`. For loading alchemlyb data we provide the
+:func:`alchemlyb.parsing.parquet.extract_dHdl` and
+:func:`alchemlyb.parsing.parquet.extract_u_nk` functions as shown in the example::
+
+    from alchemlyb.parsing.parquet import extract_dHdl, extract_u_nk
+
+    # u_nk and dHdl are dataframes produced by one of the alchemlyb parsers
+    u_nk.to_parquet(path='u_nk.parquet', index=True)
+    dHdl.to_parquet(path='dHdl.parquet', index=True)
+
+    new_u_nk = extract_u_nk('u_nk.parquet', T=300)
+    new_dHdl = extract_dHdl('dHdl.parquet', T=300)
+
 
 .. _dHdl:
@@ -211,4 +234,5 @@ See the documentation for the package you are using for more details on parser
    u
    amber
    namd
    gomc
+   parquet
diff --git a/docs/parsing/alchemlyb.parsing.parquet.rst b/docs/parsing/alchemlyb.parsing.parquet.rst
new file mode 100644
index 00000000..be1a03b3
--- /dev/null
+++ b/docs/parsing/alchemlyb.parsing.parquet.rst
@@ -0,0 +1,8 @@
+
+
+API Reference
+-------------
+This submodule includes these parsing functions:
+
+.. autofunction:: alchemlyb.parsing.parquet.extract_u_nk
+.. autofunction:: alchemlyb.parsing.parquet.extract_dHdl
\ No newline at end of file
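Because parquet does not store :attr:`pandas.DataFrame.attrs`, the extract functions re-attach the metadata from their ``T`` argument. A minimal round-trip sketch, assuming the standard alchemlyb metadata keys ``temperature`` and ``energy_unit`` and using an alchemtest dataset::

    from alchemtest.gmx import load_benzene
    from alchemlyb.parsing.gmx import extract_u_nk as gmx_extract_u_nk
    from alchemlyb.parsing.parquet import extract_u_nk

    # build a u_nk dataframe with one of the regular parsers
    u_nk = gmx_extract_u_nk(load_benzene()["data"]["Coulomb"][0], T=300)
    u_nk.to_parquet(path="u_nk.parquet", index=True)

    # the parquet round trip drops df.attrs; extract_u_nk re-attaches them from T
    new_u_nk = extract_u_nk("u_nk.parquet", T=300)
    assert new_u_nk.attrs["temperature"] == 300
    assert new_u_nk.attrs["energy_unit"] == "kT"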
diff --git a/environment.yml b/environment.yml
index ddee2d3c..4d2d9bda 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,4 +8,5 @@ dependencies:
 - pymbar>=4
 - scipy
 - scikit-learn
+- pyarrow
 - matplotlib
diff --git a/setup.py b/setup.py
index a1c55403..e95a2f9b 100755
--- a/setup.py
+++ b/setup.py
@@ -52,5 +52,6 @@
         "scipy",
         "scikit-learn",
         "matplotlib",
+        "pyarrow",
     ],
 )
diff --git a/src/alchemlyb/parsing/parquet.py b/src/alchemlyb/parsing/parquet.py
new file mode 100644
index 00000000..180817ae
--- /dev/null
+++ b/src/alchemlyb/parsing/parquet.py
@@ -0,0 +1,84 @@
+import pandas as pd
+
+from . import _init_attrs
+
+
+@_init_attrs
+def extract_u_nk(path, T):
+    r"""Return reduced potentials `u_nk` (unit: kT) from a pandas parquet file.
+
+    The parquet file should be serialised from the dataframe output
+    by any parser with the command
+    ``u_nk_df.to_parquet(path=path, index=True)``.
+
+    Parameters
+    ----------
+    path : str
+        Path to parquet file to extract dataframe from.
+    T : float
+        Temperature in Kelvin of the simulations.
+
+    Returns
+    -------
+    u_nk : DataFrame
+        Potential energy for each alchemical state (k) for each frame (n).
+
+
+    Note
+    ----
+    The pyarrow serialiser handles float or string column names fine but
+    converts a multi-lambda column name such as ``(0.0, 0.0)`` to
+    ``"('0.0', '0.0')"``. This parser restores the original tuple column
+    names. Parquet serialisation also does not preserve
+    :attr:`pandas.DataFrame.attrs`, so the temperature is assigned in this
+    function.
+
+
+    .. versionadded:: 2.1.0
+
+    """
+    u_nk = pd.read_parquet(path)
+    columns = list(u_nk.columns)
+    if isinstance(columns[0], str) and columns[0][0] == "(":
+        # restore tuple column names that pyarrow stringified,
+        # e.g. "('0.0', '0.0')" -> (0.0, 0.0)
+        new_columns = []
+        for column in columns:
+            new_columns.append(
+                tuple(
+                    map(
+                        float, column[1:-1].replace('"', "").replace("'", "").split(",")
+                    )
+                )
+            )
+        u_nk.columns = new_columns
+    return u_nk
+
+
+@_init_attrs
+def extract_dHdl(path, T):
+    r"""Return gradients `dH/dl` (unit: kT) from a pandas parquet file.
+
+    The parquet file should be serialised from the dataframe output
+    by any parser with the command
+    ``dHdl_df.to_parquet(path=path, index=True)``.
+
+    Parameters
+    ----------
+    path : str
+        Path to parquet file to extract dataframe from.
+    T : float
+        Temperature in Kelvin of the simulations.
+
+    Returns
+    -------
+    dH/dl : DataFrame
+        dH/dl as a function of time for this lambda window.
+
+    Note
+    ----
+    Parquet serialisation does not preserve :attr:`pandas.DataFrame.attrs`,
+    so the temperature is assigned in this function.
+
+
+    .. versionadded:: 2.1.0
+
+    """
+    return pd.read_parquet(path)
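The column-name restoration in ``extract_u_nk`` is easy to miss in the diff, so here is the same logic as a standalone sketch: pyarrow stringifies tuple column names on write, and the parser maps them back to tuples of floats on read::

    # what pyarrow makes of the tuple column name (0.0, 0.0)
    stringified = "('0.0', '0.0')"
    restored = tuple(
        map(float, stringified[1:-1].replace('"', "").replace("'", "").split(","))
    )
    assert restored == (0.0, 0.0)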
diff --git a/src/alchemlyb/tests/parsing/test_parquet.py b/src/alchemlyb/tests/parsing/test_parquet.py
new file mode 100644
index 00000000..a2d788f6
--- /dev/null
+++ b/src/alchemlyb/tests/parsing/test_parquet.py
@@ -0,0 +1,23 @@
+import pytest
+
+from alchemlyb.parsing.parquet import extract_u_nk, extract_dHdl
+
+
+@pytest.mark.parametrize(
+    "dHdl_list", ["gmx_benzene_Coulomb_dHdl", "gmx_ABFE_complex_dHdl"]
+)
+def test_extract_dHdl(dHdl_list, request, tmp_path):
+    dHdl = request.getfixturevalue(dHdl_list)[0]
+    dHdl.to_parquet(path=str(tmp_path / "dhdl.parquet"), index=True)
+    new_dHdl = extract_dHdl(str(tmp_path / "dhdl.parquet"), T=300)
+    assert (new_dHdl.columns == dHdl.columns).all()
+    assert (new_dHdl.index == dHdl.index).all()
+
+
+@pytest.mark.parametrize("u_nk_list", ["gmx_benzene_VDW_u_nk", "gmx_ABFE_complex_n_uk"])
+def test_extract_u_nk(u_nk_list, request, tmp_path):
+    u_nk = request.getfixturevalue(u_nk_list)[0]
+    u_nk.to_parquet(path=str(tmp_path / "u_nk.parquet"), index=True)
+    new_u_nk = extract_u_nk(str(tmp_path / "u_nk.parquet"), T=300)
+    assert (new_u_nk.columns == u_nk.columns).all()
+    assert (new_u_nk.index == u_nk.index).all()
diff --git a/src/alchemlyb/tests/test_workflow_ABFE.py b/src/alchemlyb/tests/test_workflow_ABFE.py
index 5a874102..dfaa23fd 100644
--- a/src/alchemlyb/tests/test_workflow_ABFE.py
+++ b/src/alchemlyb/tests/test_workflow_ABFE.py
@@ -5,6 +5,7 @@
 from alchemtest.amber import load_bace_example
 from alchemtest.gmx import load_ABFE
 
+import alchemlyb.parsing.amber
 from alchemlyb.workflows.abfe import ABFE
 
 
@@ -397,3 +398,33 @@ def test_summary(self, workflow):
         """Test if the summary is right."""
         summary = workflow.generate_result()
         assert np.isclose(summary["TI"]["Stages"]["TOTAL"], 1.40405980473, 0.1)
+
+
+class Test_automatic_parquet:
+    """Test the full automatic workflow for load_ABFE from parquet data."""
+
+    @staticmethod
+    @pytest.fixture(scope="class")
+    def workflow(tmp_path_factory):
+        outdir = tmp_path_factory.mktemp("out")
+        for i, u_nk in enumerate(load_bace_example()["data"]["complex"]["vdw"]):
+            df = alchemlyb.parsing.amber.extract_u_nk(u_nk, T=298)
+            df.to_parquet(path=f"{outdir}/u_nk_{i}.parquet", index=True)
+
+        workflow = ABFE(
+            units="kcal/mol",
+            software="PARQUET",
+            dir=str(outdir),
+            prefix="u_nk_",
+            suffix="parquet",
+            T=298.0,
+            outdirectory=str(outdir),
+        )
+        workflow.read()
+        workflow.estimate(estimators="BAR")
+        return workflow
+
+    def test_summary(self, workflow):
+        """Test if the summary is right."""
+        summary = workflow.generate_result()
+        assert np.isclose(summary["BAR"]["Stages"]["TOTAL"], 1.40405980473, 0.1)
diff --git a/src/alchemlyb/workflows/abfe.py b/src/alchemlyb/workflows/abfe.py
index 60013600..4a00fbab 100644
--- a/src/alchemlyb/workflows/abfe.py
+++ b/src/alchemlyb/workflows/abfe.py
@@ -12,7 +12,7 @@
 from .. import concat
 from ..convergence import forward_backward_convergence
 from ..estimators import MBAR, BAR, TI, FEP_ESTIMATORS, TI_ESTIMATORS
-from ..parsing import gmx, amber
+from ..parsing import gmx, amber, parquet
 from ..postprocessors.units import get_unit_converter
 from ..preprocessing.subsampling import decorrelate_dhdl, decorrelate_u_nk
 from ..visualisation import (
@@ -39,7 +39,7 @@ class ABFE(WorkflowBase):
         The unit used for printing and plotting results. {'kcal/mol',
         'kJ/mol', 'kT'}. Default: 'kT'.
     software : str
-        The software used for generating input (case-insensitive). {'GROMACS', 'AMBER'}.
+        The software used for generating input (case-insensitive). {'GROMACS', 'AMBER', 'PARQUET'}.
         This option chooses the appropriate parser for the input file.
     dir : str
         Directory in which data files are stored. Default: os.path.curdir.
@@ -64,7 +64,9 @@ class ABFE(WorkflowBase):
 
     .. versionadded:: 1.0.0
     .. versionchanged:: 2.0.1
        The `dir` argument expects a real directory without wildcards and wildcards will no longer
-       work as expected. Use `prefix` to specify wildcard-based patterns to search under `dir`.
+       work as expected. Use `prefix` to specify wildcard-based patterns to search under `dir`.
+    .. versionchanged:: 2.1.0
+       Serialised dataframes can be read via software='PARQUET'.
     """

     def __init__(
@@ -86,8 +88,10 @@ def __init__(
             f"{software}"
         )
         reg_exp = "**/" + prefix + "*" + suffix
-        if '*' in dir:
-            warnings.warn(f"A real directory is expected in `dir`={dir}, wildcard expressions should be supplied to `prefix`.")
+        if "*" in dir:
+            warnings.warn(
+                f"A real directory is expected in `dir`={dir}, wildcard expressions should be supplied to `prefix`."
+            )
         if not Path(dir).is_dir():
             raise ValueError(f"The input directory `dir`={dir} is not a directory.")
         self.file_list = list(map(str, Path(dir).glob(reg_exp)))
@@ -105,6 +109,9 @@ def __init__(
         elif software == "AMBER":
             self._extract_u_nk = amber.extract_u_nk
             self._extract_dHdl = amber.extract_dHdl
+        elif software == "PARQUET":
+            self._extract_u_nk = parquet.extract_u_nk
+            self._extract_dHdl = parquet.extract_dHdl
         else:
             raise NotImplementedError(f"{software} parser not found.")
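Taken together, the ``Test_automatic_parquet`` class above doubles as a usage recipe for the new parser in the workflow. A condensed sketch (the directory name is a placeholder; one parquet file per lambda window is assumed to have been written beforehand with ``df.to_parquet(path=f"u_nk_{i}.parquet", index=True)``)::

    from alchemlyb.workflows.abfe import ABFE

    workflow = ABFE(
        units="kcal/mol",
        software="PARQUET",   # selects the alchemlyb.parsing.parquet parser
        dir="parquet_data",   # hypothetical directory holding u_nk_*.parquet
        prefix="u_nk_",
        suffix="parquet",
        T=298.0,
    )
    workflow.read()
    workflow.estimate(estimators="BAR")
    summary = workflow.generate_result()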