Skip to content

Commit

Permalink
Sync series and dataframe in statistical_inefficieny() (#199)
Browse files Browse the repository at this point in the history
* Fixes #198
* Sync series and dataframe when subsampling:
  Make sure that both the subsampling series and the dataframe are sliced
  when calling statistical_inefficieny(). This ensure that the series and
  the dataframe are in sync before the subsampled indices of the series
  are applied to the dataframe.
* Add tests: This adds a number of tests for slicing() and statistical_inefficieny():
  * Test that slicing() respects upper and lower time bounds (currently
    passes)
  * Test that statistical_inefficieny() respects upper and lower time bounds
    when it is used without series to subsample (currently passes)
  * Test that statistical_inefficieny() respects upper and lower time bounds
    when it is used without series to subsample (currently fails)
  * Test that first using slicing() on the data frame, then
    statistical_inefficieny() without time bounds yields the same results as
    a single call to statistical_inefficieny() with time bounds (currently
    fails)
  * Extract input sanity check in separate function
  * Extract test for unchanged data in separate test
  * Parametrize tests of statistical_inefficiency to merge cases
    which use `series` with the ones that don't
  * Parametrize all tests to use `conservative` `True` and `False`
  * Remove conservative keyword where it didn't belong
  * Use pytest fixture to load data at test time
* Add notes to docstring, CHANGES, and AUTHORS
  • Loading branch information
ptmerz authored Jul 7, 2022
1 parent 72622c9 commit bc39ef3
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 3 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ Chronological list of authors

2022
- Irfan Alibay (@IAlibay)
- Pascal Merz (@ptmerz)
4 changes: 3 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ The rules for this file:

------------------------------------------------------------------------------

*/*/2022 xiki-tempula, IAlibay, dotsdl, orbeckst
*/*/2022 xiki-tempula, IAlibay, dotsdl, orbeckst, ptmerz

* 0.7.0

Expand All @@ -35,6 +35,8 @@ Fixes
- Fixes setup.py and setup.cfg to prevent installations with Python versions
lower than 3.7 (Issue #193)
- added AutoMBAR to convergence analysis (#189)
- Fixed subsampling in statistical_inefficiency when lower or step keywords
are used (Issue #198, PR #199)


12/28/2021 schlaicha, xiki-tempula, jhenin, ttjoseph, orbeckst
Expand Down
7 changes: 7 additions & 0 deletions src/alchemlyb/preprocessing/subsampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,12 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None,
inefficiency was _rounded_ (instead of ``ceil()``) and thus one could
end up with correlated data.
.. versionchanged:: 0.7.0
Fixed a bug that would effectively ignore the ``lower`` and ``step``
keywords when returning the subsampled DataFrame object. See
`issue #198 <https://github.com/alchemistry/alchemlyb/issues/198>`_ for
more details.
"""
if _check_multiple_times(df):
if drop_duplicates:
Expand Down Expand Up @@ -295,6 +301,7 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None,
raise ValueError("series and data must be sampled at the same times")

series = slicing(series, lower=lower, upper=upper, step=step)
df = slicing(df, lower=lower, upper=upper, step=step)

# calculate statistical inefficiency of series (could use fft=True but needs test)
statinef = statisticalInefficiency(series, fast=False)
Expand Down
172 changes: 170 additions & 2 deletions src/alchemlyb/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,16 @@

import alchemtest.gmx


def gmx_benzene_dHdl():
dataset = alchemtest.gmx.load_benzene()
return gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300)

# When issue #206 is addressed make the gmx_benzene_dHdl() function the
# fixture, remove the wrapper below, and replace
# gmx_benzene_dHdl_fixture --> gmx_benzene_dHdl
@pytest.fixture()
def gmx_benzene_dHdl_fixture():
return gmx_benzene_dHdl()

@pytest.fixture()
def gmx_ABFE():
Expand All @@ -41,7 +46,6 @@ def gmx_benzene_u_nk_fixture():
dataset = alchemtest.gmx.load_benzene()
return gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300)


def gmx_benzene_u_nk():
dataset = alchemtest.gmx.load_benzene()
return gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300)
Expand All @@ -56,6 +60,18 @@ def gmx_benzene_u_nk_full():
dataset = alchemtest.gmx.load_benzene()
return alchemlyb.concat([gmx.extract_u_nk(i, T=300) for i in dataset['data']['Coulomb']])


def _check_data_is_outside_bounds(data, lower, upper):
"""
Helper function to make sure that `data` has entries that are
below the `lower` bound, and above the `upper` bound.
This is used by slicing tests to make sure that the data
provided is appropriate for the tests.
"""
assert any(data.reset_index()['time'] < lower)
assert any(data.reset_index()['time'] > upper)


class TestSlicing:
"""Test slicing functionality.
Expand All @@ -68,6 +84,51 @@ def slicer(self, *args, **kwargs):
def test_basic_slicing(self, data, size):
assert len(self.slicer(data, lower=1000, upper=34000, step=5)) == size

@pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
[
('gmx_benzene_dHdl_fixture', 1000, 34000),
('gmx_benzene_u_nk_fixture', 1000, 34000),
])
def test_data_is_unchanged(self, dataloader, lower, upper, request):
"""
Test that slicing does not change the underlying data
"""
# Load data
data = request.getfixturevalue(dataloader)
# Check that the input data is appropriate for the test
_check_data_is_outside_bounds(data, lower, upper)

# Slice data, and test that we didn't change the input data
original_length = len(data)
sliced = self.slicer(data,
lower=lower,
upper=upper,
step=5)
assert len(data) == original_length

@pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
[
('gmx_benzene_dHdl_fixture', 1000, 34000),
('gmx_benzene_u_nk_fixture', 1000, 34000),
])
def test_lower_and_upper_bound(self, dataloader, lower, upper, request):
"""
Test that the lower and upper time is respected
"""
# Load data
data = request.getfixturevalue(dataloader)
# Check that the input data is appropriate for the test
_check_data_is_outside_bounds(data, lower, upper)

# Slice data, and test that we don't observe times outside
# the prescribed range
sliced = self.slicer(data,
lower=lower,
upper=upper,
step=5)
assert all(sliced.reset_index()['time'] >= lower)
assert all(sliced.reset_index()['time'] <= upper)

@pytest.mark.parametrize('data', [gmx_benzene_dHdl(),
gmx_benzene_u_nk()])
def test_disordered_exception(self, data):
Expand Down Expand Up @@ -213,6 +274,113 @@ def test_raise_ValueError_for_mismatched_data(self, series):
with pytest.raises(ValueError):
self.slicer(data, series=series)

@pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
[
('gmx_benzene_dHdl_fixture', 1000, 34000),
('gmx_benzene_u_nk_fixture', 1000, 34000),
])
@pytest.mark.parametrize('use_series', [True, False])
@pytest.mark.parametrize('conservative', [True, False])
def test_data_is_unchanged(
self, dataloader, use_series, lower, upper, conservative, request
):
"""
Test that using statistical_inefficiency does not change the underlying data
statistical_inefficiency is equivalent to slicing it its `series` parameter
is not set. If the `series` parameter is set, additional inefficiency
calculations are performed. We want to test both behaviors. The behavior
is toggled using the `use_series` argument.
"""
# Load data
data = request.getfixturevalue(dataloader)
# Check that the input data is appropriate for the test
_check_data_is_outside_bounds(data, lower, upper)

# Define subsampling series if required
series = data.sum(axis=1) if use_series else None

# Slice data, and test that we didn't change the input data
original_length = len(data)
self.slicer(data,
series=series,
lower=lower,
upper=upper,
step=5,
conservative=conservative)
assert len(data) == original_length

@pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
[
('gmx_benzene_dHdl_fixture', 1000, 34000),
('gmx_benzene_u_nk_fixture', 1000, 34000),
])
@pytest.mark.parametrize('use_series', [True, False])
@pytest.mark.parametrize('conservative', [True, False])
def test_lower_and_upper_bound_slicer(
self, dataloader, use_series, lower, upper, conservative, request
):
"""
Test that the lower and upper time is respected when using statistical_inefficiency
statistical_inefficiency is equivalent to slicing it its `series` parameter
is not set. If the `series` parameter is set, additional inefficiency
calculations are performed. We want to test both behaviors. The behavior
is toggled using the `use_series` argument.
"""
# Load data
data = request.getfixturevalue(dataloader)
# Check that the input data is appropriate for the test
_check_data_is_outside_bounds(data, lower, upper)

# Define subsampling series if required
series = data.sum(axis=1) if use_series else None

# Slice data, and test that we don't observe times outside
# the prescribed range
sliced = self.slicer(data,
series=series,
lower=lower,
upper=upper,
step=5,
conservative=conservative)
assert all(sliced.reset_index()['time'] >= lower)
assert all(sliced.reset_index()['time'] <= upper)

@pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
[
('gmx_benzene_dHdl_fixture', 1000, 34000),
('gmx_benzene_u_nk_fixture', 1000, 34000),
])
@pytest.mark.parametrize('conservative', [True, False])
def test_slicing_inefficiency_equivalence(
self, dataloader, lower, upper, conservative, request
):
"""
Test that first slicing the data frame, then subsampling is equivalent to
subsampling with lower / upper bounds set
"""
# Load data
data = request.getfixturevalue(dataloader)
# Check that the input data is appropriate for the test
_check_data_is_outside_bounds(data, lower, upper)

# Slice dataframe, then subsample it based on the sum of its components
sliced_data = slicing(data, lower=lower, upper=upper)
subsampled_sliced_data = self.slicer(sliced_data,
series=sliced_data.sum(axis=1),
conservative=conservative)

# Subsample the dataframe based on the sum of its components while
# also specifying the slicing range
subsampled_data = self.slicer(data,
series=data.sum(axis=1),
lower=lower,
upper=upper,
conservative=conservative)

assert (subsampled_sliced_data == subsampled_data).all(axis=None)


class TestEquilibriumDetection(TestSlicing, CorrelatedPreprocessors):

Expand Down

0 comments on commit bc39ef3

Please sign in to comment.