From 1d0a111586e736d4c990d133274963942a34b4eb Mon Sep 17 00:00:00 2001
From: Zhiyi Wu
Date: Thu, 1 Jun 2023 20:17:54 +0100
Subject: [PATCH] parquet parser for dataframe serialisation (#317)

* update

* update

* Update docs/parsing.rst

Co-authored-by: Oliver Beckstein

* Update src/alchemlyb/parsing/parquet.py

Co-authored-by: Oliver Beckstein

* Update src/alchemlyb/parsing/parquet.py

Co-authored-by: Oliver Beckstein

* Update src/alchemlyb/parsing/parquet.py

Co-authored-by: Oliver Beckstein

* update

* update

---------

Co-authored-by: William (Zhiyi) Wu
Co-authored-by: Oliver Beckstein
---
 CHANGES                                     |  4 +
 devtools/conda-envs/test_env.yaml           |  1 +
 docs/parsing.rst                            | 24 ++++++
 docs/parsing/alchemlyb.parsing.parquet.rst  |  8 ++
 environment.yml                             |  1 +
 setup.py                                    |  1 +
 src/alchemlyb/parsing/parquet.py            | 84 +++++++++++++++++++++
 src/alchemlyb/tests/parsing/test_parquet.py | 23 ++++++
 src/alchemlyb/tests/test_workflow_ABFE.py   | 31 ++++++++
 src/alchemlyb/workflows/abfe.py             | 17 +++--
 10 files changed, 189 insertions(+), 5 deletions(-)
 create mode 100644 docs/parsing/alchemlyb.parsing.parquet.rst
 create mode 100644 src/alchemlyb/parsing/parquet.py
 create mode 100644 src/alchemlyb/tests/parsing/test_parquet.py

diff --git a/CHANGES b/CHANGES
index 9f5c527a..1924039c 100644
--- a/CHANGES
+++ b/CHANGES
@@ -17,6 +17,10 @@ The rules for this file:
 
 * 2.1.0
 
+Enhancements
+  - Add a parser to read serialised pandas dataframes (parquet) (issue #316, PR #317).
+  - workflow.ABFE allows parquet as input (issue #316, PR #317).
+
 Fixes
   - Fix the case where visualisation.plot_convergence would fail when the
     final error is NaN (issue #318, PR #317).
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
index 63b439cc..b9dcd4e0 100644
--- a/devtools/conda-envs/test_env.yaml
+++ b/devtools/conda-envs/test_env.yaml
@@ -9,6 +9,7 @@ dependencies:
 - scipy
 - scikit-learn
 - matplotlib
+- pyarrow
 
 # Testing
 - pytest
diff --git a/docs/parsing.rst b/docs/parsing.rst
index 17d158a2..d107278e 100644
--- a/docs/parsing.rst
+++ b/docs/parsing.rst
@@ -49,6 +49,29 @@
 requires some care due to shortcomings in how pandas currently handles
 metadata (see issue `pandas-dev/pandas#28283
 <https://github.com/pandas-dev/pandas/issues/28283>`_).
 
+Serialisation
+'''''''''''''
+
+Alchemlyb data structures (``dHdl`` and ``u_nk``) can be serialised as dataframes
+and made persistent.
+We use the `parquet <https://parquet.apache.org/>`_
+format for serialising (writing) to a file and de-serialising (reading) from a
+parquet file.
+
+For serialisation we simply use the :meth:`pandas.DataFrame.to_parquet` method of
+a :class:`pandas.DataFrame`. For loading alchemlyb data we provide the
+:func:`alchemlyb.parsing.parquet.extract_dHdl` and
+:func:`alchemlyb.parsing.parquet.extract_u_nk` functions as shown in the example::
+
+    from alchemlyb.parsing.parquet import extract_dHdl, extract_u_nk
+
+    # u_nk and dHdl are dataframes produced by one of the alchemlyb parsers
+    u_nk.to_parquet(path='u_nk.parquet', index=True)
+    dHdl.to_parquet(path='dHdl.parquet', index=True)
+
+    new_u_nk = extract_u_nk('u_nk.parquet', T=300)
+    new_dHdl = extract_dHdl('dHdl.parquet', T=300)
+
 
 .. _dHdl:
@@ -211,4 +234,5 @@ See the documentation for the package you are using for more details on parser
    u
    amber
    namd
    gomc
+   parquet
diff --git a/docs/parsing/alchemlyb.parsing.parquet.rst b/docs/parsing/alchemlyb.parsing.parquet.rst
new file mode 100644
index 00000000..be1a03b3
--- /dev/null
+++ b/docs/parsing/alchemlyb.parsing.parquet.rst
@@ -0,0 +1,8 @@
+
+
+API Reference
+-------------
+This submodule includes these parsing functions:
+
+.. autofunction:: alchemlyb.parsing.parquet.extract_u_nk
+.. autofunction:: alchemlyb.parsing.parquet.extract_dHdl
\ No newline at end of file
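Because parquet does not store :attr:`pandas.DataFrame.attrs`, the extract functions re-attach the metadata from their ``T`` argument. A minimal round-trip sketch, assuming the standard alchemlyb metadata keys ``temperature`` and ``energy_unit`` and using an alchemtest dataset::

    from alchemtest.gmx import load_benzene
    from alchemlyb.parsing.gmx import extract_u_nk as gmx_extract_u_nk
    from alchemlyb.parsing.parquet import extract_u_nk

    # build a u_nk dataframe with one of the regular parsers
    u_nk = gmx_extract_u_nk(load_benzene()["data"]["Coulomb"][0], T=300)
    u_nk.to_parquet(path="u_nk.parquet", index=True)

    # the parquet round trip drops df.attrs; extract_u_nk re-attaches them from T
    new_u_nk = extract_u_nk("u_nk.parquet", T=300)
    assert new_u_nk.attrs["temperature"] == 300
    assert new_u_nk.attrs["energy_unit"] == "kT"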
diff --git a/environment.yml b/environment.yml
index ddee2d3c..4d2d9bda 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,4 +8,5 @@ dependencies:
 - pymbar>=4
 - scipy
 - scikit-learn
+- pyarrow
 - matplotlib
diff --git a/setup.py b/setup.py
index a1c55403..e95a2f9b 100755
--- a/setup.py
+++ b/setup.py
@@ -52,5 +52,6 @@
         "scipy",
         "scikit-learn",
         "matplotlib",
+        "pyarrow",
     ],
 )
diff --git a/src/alchemlyb/parsing/parquet.py b/src/alchemlyb/parsing/parquet.py
new file mode 100644
index 00000000..180817ae
--- /dev/null
+++ b/src/alchemlyb/parsing/parquet.py
@@ -0,0 +1,84 @@
+import pandas as pd
+
+from . import _init_attrs
+
+
+@_init_attrs
+def extract_u_nk(path, T):
+    r"""Return reduced potentials `u_nk` (unit: kT) from a pandas parquet file.
+
+    The parquet file should be serialised from the dataframe output
+    by any parser with the command
+    ``u_nk_df.to_parquet(path=path, index=True)``.
+
+    Parameters
+    ----------
+    path : str
+        Path to parquet file to extract dataframe from.
+    T : float
+        Temperature in Kelvin of the simulations.
+
+    Returns
+    -------
+    u_nk : DataFrame
+        Potential energy for each alchemical state (k) for each frame (n).
+
+
+    Note
+    ----
+    The pyarrow serialiser handles float or string column names fine but
+    converts a multi-lambda column name such as ``(0.0, 0.0)`` to
+    ``"('0.0', '0.0')"``. This parser restores the original tuple column
+    names. Parquet serialisation also does not preserve
+    :attr:`pandas.DataFrame.attrs`, so the temperature is assigned in this
+    function.
+
+
+    .. versionadded:: 2.1.0
+
+    """
+    u_nk = pd.read_parquet(path)
+    columns = list(u_nk.columns)
+    if isinstance(columns[0], str) and columns[0][0] == "(":
+        # restore tuple column names that pyarrow stringified,
+        # e.g. "('0.0', '0.0')" -> (0.0, 0.0)
+        new_columns = []
+        for column in columns:
+            new_columns.append(
+                tuple(
+                    map(
+                        float, column[1:-1].replace('"', "").replace("'", "").split(",")
+                    )
+                )
+            )
+        u_nk.columns = new_columns
+    return u_nk
+
+
+@_init_attrs
+def extract_dHdl(path, T):
+    r"""Return gradients `dH/dl` (unit: kT) from a pandas parquet file.
+
+    The parquet file should be serialised from the dataframe output
+    by any parser with the command
+    ``dHdl_df.to_parquet(path=path, index=True)``.
+
+    Parameters
+    ----------
+    path : str
+        Path to parquet file to extract dataframe from.
+    T : float
+        Temperature in Kelvin of the simulations.
+
+    Returns
+    -------
+    dH/dl : DataFrame
+        dH/dl as a function of time for this lambda window.
+
+    Note
+    ----
+    Parquet serialisation does not preserve :attr:`pandas.DataFrame.attrs`,
+    so the temperature is assigned in this function.
+
+
+    .. versionadded:: 2.1.0
+
+    """
+    return pd.read_parquet(path)
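The column-name restoration in ``extract_u_nk`` is easy to miss in the diff, so here is the same logic as a standalone sketch: pyarrow stringifies tuple column names on write, and the parser maps them back to tuples of floats on read::

    # what pyarrow makes of the tuple column name (0.0, 0.0)
    stringified = "('0.0', '0.0')"
    restored = tuple(
        map(float, stringified[1:-1].replace('"', "").replace("'", "").split(","))
    )
    assert restored == (0.0, 0.0)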
diff --git a/src/alchemlyb/tests/parsing/test_parquet.py b/src/alchemlyb/tests/parsing/test_parquet.py
new file mode 100644
index 00000000..a2d788f6
--- /dev/null
+++ b/src/alchemlyb/tests/parsing/test_parquet.py
@@ -0,0 +1,23 @@
+import pytest
+
+from alchemlyb.parsing.parquet import extract_u_nk, extract_dHdl
+
+
+@pytest.mark.parametrize(
+    "dHdl_list", ["gmx_benzene_Coulomb_dHdl", "gmx_ABFE_complex_dHdl"]
+)
+def test_extract_dHdl(dHdl_list, request, tmp_path):
+    dHdl = request.getfixturevalue(dHdl_list)[0]
+    dHdl.to_parquet(path=str(tmp_path / "dhdl.parquet"), index=True)
+    new_dHdl = extract_dHdl(str(tmp_path / "dhdl.parquet"), T=300)
+    assert (new_dHdl.columns == dHdl.columns).all()
+    assert (new_dHdl.index == dHdl.index).all()
+
+
+@pytest.mark.parametrize("u_nk_list", ["gmx_benzene_VDW_u_nk", "gmx_ABFE_complex_n_uk"])
+def test_extract_u_nk(u_nk_list, request, tmp_path):
+    u_nk = request.getfixturevalue(u_nk_list)[0]
+    u_nk.to_parquet(path=str(tmp_path / "u_nk.parquet"), index=True)
+    new_u_nk = extract_u_nk(str(tmp_path / "u_nk.parquet"), T=300)
+    assert (new_u_nk.columns == u_nk.columns).all()
+    assert (new_u_nk.index == u_nk.index).all()
diff --git a/src/alchemlyb/tests/test_workflow_ABFE.py b/src/alchemlyb/tests/test_workflow_ABFE.py
index 5a874102..dfaa23fd 100644
--- a/src/alchemlyb/tests/test_workflow_ABFE.py
+++ b/src/alchemlyb/tests/test_workflow_ABFE.py
@@ -5,6 +5,7 @@
 from alchemtest.amber import load_bace_example
 from alchemtest.gmx import load_ABFE
 
+import alchemlyb.parsing.amber
 from alchemlyb.workflows.abfe import ABFE
 
 
@@ -397,3 +398,33 @@ def test_summary(self, workflow):
         """Test if the summary is right."""
         summary = workflow.generate_result()
         assert np.isclose(summary["TI"]["Stages"]["TOTAL"], 1.40405980473, 0.1)
+
+
+class Test_automatic_parquet:
+    """Test the full automatic workflow for load_ABFE from parquet data."""
+
+    @staticmethod
+    @pytest.fixture(scope="class")
+    def workflow(tmp_path_factory):
+        outdir = tmp_path_factory.mktemp("out")
+        for i, u_nk in enumerate(load_bace_example()["data"]["complex"]["vdw"]):
+            df = alchemlyb.parsing.amber.extract_u_nk(u_nk, T=298)
+            df.to_parquet(path=f"{outdir}/u_nk_{i}.parquet", index=True)
+
+        workflow = ABFE(
+            units="kcal/mol",
+            software="PARQUET",
+            dir=str(outdir),
+            prefix="u_nk_",
+            suffix="parquet",
+            T=298.0,
+            outdirectory=str(outdir),
+        )
+        workflow.read()
+        workflow.estimate(estimators="BAR")
+        return workflow
+
+    def test_summary(self, workflow):
+        """Test if the summary is right."""
+        summary = workflow.generate_result()
+        assert np.isclose(summary["BAR"]["Stages"]["TOTAL"], 1.40405980473, 0.1)
diff --git a/src/alchemlyb/workflows/abfe.py b/src/alchemlyb/workflows/abfe.py
index 60013600..4a00fbab 100644
--- a/src/alchemlyb/workflows/abfe.py
+++ b/src/alchemlyb/workflows/abfe.py
@@ -12,7 +12,7 @@
 from .. import concat
 from ..convergence import forward_backward_convergence
 from ..estimators import MBAR, BAR, TI, FEP_ESTIMATORS, TI_ESTIMATORS
-from ..parsing import gmx, amber
+from ..parsing import gmx, amber, parquet
 from ..postprocessors.units import get_unit_converter
 from ..preprocessing.subsampling import decorrelate_dhdl, decorrelate_u_nk
 from ..visualisation import (
@@ -39,7 +39,7 @@ class ABFE(WorkflowBase):
         The unit used for printing and plotting results. {'kcal/mol',
         'kJ/mol', 'kT'}. Default: 'kT'.
     software : str
-        The software used for generating input (case-insensitive). {'GROMACS', 'AMBER'}.
+        The software used for generating input (case-insensitive). {'GROMACS', 'AMBER', 'PARQUET'}.
         This option chooses the appropriate parser for the input file.
     dir : str
         Directory in which data files are stored. Default: os.path.curdir.
@@ -64,7 +64,9 @@ class ABFE(WorkflowBase):
 
     .. versionadded:: 1.0.0
     .. versionchanged:: 2.0.1
        The `dir` argument expects a real directory without wildcards and wildcards will no longer
-       work as expected. Use `prefix` to specify wildcard-based patterns to search under `dir`.
+       work as expected. Use `prefix` to specify wildcard-based patterns to search under `dir`.
+    .. versionchanged:: 2.1.0
+       Serialised dataframes can be read via software='PARQUET'.
     """

     def __init__(
@@ -86,8 +88,10 @@ def __init__(
             f"{software}"
         )
         reg_exp = "**/" + prefix + "*" + suffix
-        if '*' in dir:
-            warnings.warn(f"A real directory is expected in `dir`={dir}, wildcard expressions should be supplied to `prefix`.")
+        if "*" in dir:
+            warnings.warn(
+                f"A real directory is expected in `dir`={dir}, wildcard expressions should be supplied to `prefix`."
+            )
         if not Path(dir).is_dir():
             raise ValueError(f"The input directory `dir`={dir} is not a directory.")
         self.file_list = list(map(str, Path(dir).glob(reg_exp)))
@@ -105,6 +109,9 @@ def __init__(
         elif software == "AMBER":
             self._extract_u_nk = amber.extract_u_nk
             self._extract_dHdl = amber.extract_dHdl
+        elif software == "PARQUET":
+            self._extract_u_nk = parquet.extract_u_nk
+            self._extract_dHdl = parquet.extract_dHdl
         else:
             raise NotImplementedError(f"{software} parser not found.")
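Taken together, the ``Test_automatic_parquet`` class above doubles as a usage recipe for the new parser in the workflow. A condensed sketch (the directory name is a placeholder; one parquet file per lambda window is assumed to have been written beforehand with ``df.to_parquet(path=f"u_nk_{i}.parquet", index=True)``)::

    from alchemlyb.workflows.abfe import ABFE

    workflow = ABFE(
        units="kcal/mol",
        software="PARQUET",   # selects the alchemlyb.parsing.parquet parser
        dir="parquet_data",   # hypothetical directory holding u_nk_*.parquet
        prefix="u_nk_",
        suffix="parquet",
        T=298.0,
    )
    workflow.read()
    workflow.estimate(estimators="BAR")
    summary = workflow.generate_result()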