-
Notifications
You must be signed in to change notification settings - Fork 51
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
parquet parser for dataframe serialisation (#317)
* update * update * Update docs/parsing.rst Co-authored-by: Oliver Beckstein <orbeckst@gmail.com> * Update src/alchemlyb/parsing/parquet.py Co-authored-by: Oliver Beckstein <orbeckst@gmail.com> * Update src/alchemlyb/parsing/parquet.py Co-authored-by: Oliver Beckstein <orbeckst@gmail.com> * Update src/alchemlyb/parsing/parquet.py Co-authored-by: Oliver Beckstein <orbeckst@gmail.com> * update * update --------- Co-authored-by: William (Zhiyi) Wu <zwu@exscientia.co.uk> Co-authored-by: Oliver Beckstein <orbeckst@gmail.com>
- Loading branch information
1 parent
064c4fe
commit 1d0a111
Showing
10 changed files
with
189 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ dependencies: | |
- scipy | ||
- scikit-learn | ||
- matplotlib | ||
- pyarrow | ||
|
||
# Testing | ||
- pytest | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
|
||
|
||
API Reference | ||
------------- | ||
This submodule includes these parsing functions: | ||
|
||
.. autofunction:: alchemlyb.parsing.parquet.extract_u_nk | ||
.. autofunction:: alchemlyb.parsing.parquet.extract_dHdl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,4 +8,5 @@ dependencies: | |
- pymbar>=4 | ||
- scipy | ||
- scikit-learn | ||
- pyarrow | ||
- matplotlib |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,5 +52,6 @@ | |
"scipy", | ||
"scikit-learn", | ||
"matplotlib", | ||
"pyarrow", | ||
], | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import pandas as pd | ||
|
||
from . import _init_attrs | ||
|
||
|
||
@_init_attrs
def extract_u_nk(path, T):
    r"""Return reduced potentials `u_nk` (unit: kT) from a pandas parquet file.

    The parquet file should be serialised from the dataframe output
    from any parser with command
    (``u_nk_df.to_parquet(path=path, index=True)``).

    Parameters
    ----------
    path : str
        Path to parquet file to extract dataframe from.
    T : float
        Temperature in Kelvin of the simulations.

    Returns
    -------
    u_nk : DataFrame
        Potential energy for each alchemical state (k) for each frame (n).

    Note
    ----
    pyarrow serializers would handle the float or string column name fine but will
    convert multi-lambda column name from `(0.0, 0.0)` to `"('0.0', '0.0')"`.
    This parser will restore the correct column name.
    Also parquet serialisation doesn't preserve the :attr:`pandas.DataFrame.attrs`.
    So the temperature is assigned in this function.


    .. versionadded:: 2.1.0

    """
    # NOTE(review): T is not referenced in this body; presumably the
    # ``_init_attrs`` decorator consumes it -- confirm against its definition.
    u_nk = pd.read_parquet(path)
    columns = list(u_nk.columns)
    # The first column decides whether the names are stringified tuples.
    # Guard against a frame without columns and use ``startswith`` so that an
    # empty-string column name cannot raise IndexError.
    if columns and isinstance(columns[0], str) and columns[0].startswith("("):
        # Drop the surrounding parentheses and any quote characters, then
        # convert each comma-separated component back to a float.
        u_nk.columns = [
            tuple(
                map(float, column[1:-1].replace('"', "").replace("'", "").split(","))
            )
            for column in columns
        ]
    return u_nk
|
||
|
||
@_init_attrs
def extract_dHdl(path, T):
    r"""Return gradients `dH/dl` (unit: kT) from a pandas parquet file.

    The parquet file should be serialised from the dataframe output
    from any parser with command
    (``dHdl_df.to_parquet(path=path, index=True)``).

    Parameters
    ----------
    path : str
        Path to parquet file to extract dataframe from.
    T : float
        Temperature in Kelvin the simulations sampled.

    Returns
    -------
    dH/dl : DataFrame
        dH/dl as a function of time for this lambda window.

    Note
    ----
    Parquet serialisation doesn't preserve the :attr:`pandas.DataFrame.attrs`.
    So the temperature is assigned in this function.


    .. versionadded:: 2.1.0

    """
    # The stored frame is handed back exactly as read.
    dHdl = pd.read_parquet(path)
    return dHdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import pytest | ||
|
||
from alchemlyb.parsing.parquet import extract_u_nk, extract_dHdl | ||
|
||
|
||
@pytest.mark.parametrize(
    "dHdl_list", ["gmx_benzene_Coulomb_dHdl", "gmx_ABFE_complex_dHdl"]
)
def test_extract_dHdl(dHdl_list, request, tmp_path):
    """Round-trip a dHdl frame through parquet and compare its labels."""
    parquet_file = str(tmp_path / "dhdl.parquet")
    original = request.getfixturevalue(dHdl_list)[0]
    original.to_parquet(path=parquet_file, index=True)
    restored = extract_dHdl(parquet_file, T=300)
    # Column and index labels must survive serialisation unchanged.
    assert (restored.columns == original.columns).all()
    assert (restored.index == original.index).all()
|
||
|
||
# The fixture id "gmx_ABFE_complex_n_uk" is kept verbatim; it is defined
# outside this file and must match that definition.
@pytest.mark.parametrize("u_nk_list", ["gmx_benzene_VDW_u_nk", "gmx_ABFE_complex_n_uk"])
def test_extract_u_nk(u_nk_list, request, tmp_path):
    """Round-trip a u_nk frame through parquet and compare its labels.

    The function name must be unique within the module: reusing another
    test's name rebinds it, and only the last definition is collected.
    """
    u_nk = request.getfixturevalue(u_nk_list)[0]
    u_nk.to_parquet(path=str(tmp_path / "u_nk.parquet"), index=True)
    new_u_nk = extract_u_nk(str(tmp_path / "u_nk.parquet"), T=300)
    # Covers the multi-lambda case, where tuple column names come back as
    # strings and the parser has to restore them.
    assert (new_u_nk.columns == u_nk.columns).all()
    assert (new_u_nk.index == u_nk.index).all()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters