Sync series and dataframe in statistical_inefficieny() (#199)

* Fixes #198 * Sync series and dataframe when subsampling: Make sure that both the subsampling series and the dataframe are sliced when calling statistical_inefficieny(). This ensure that the series and the dataframe are in sync before the subsampled indices of the series are applied to the dataframe. * Add tests: This adds a number of tests for slicing() and statistical_inefficieny(): * Test that slicing() respects upper and lower time bounds (currently passes) * Test that statistical_inefficieny() respects upper and lower time bounds when it is used without series to subsample (currently passes) * Test that statistical_inefficieny() respects upper and lower time bounds when it is used without series to subsample (currently fails) * Test that first using slicing() on the data frame, then statistical_inefficieny() without time bounds yields the same results as a single call to statistical_inefficieny() with time bounds (currently fails) * Extract input sanity check in separate function * Extract test for unchanged data in separate test * Parametrize tests of statistical_inefficiency to merge cases which use `series` with the ones that don't * Parametrize all tests to use `conservative` `True` and `False` * Remove conservative keyword where it didn't belong * Use pytest fixture to load data at test time * Add notes to docstring, CHANGES, and AUTHORS
alchemistry · Jul 7, 2022 · bc39ef3 · bc39ef3
1 parent 72622c9
commit bc39ef3
Show file tree

Hide file tree

Showing 4 changed files with 181 additions and 3 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -45,3 +45,4 @@ Chronological list of authors
 
 2022
   - Irfan Alibay (@IAlibay)
+  - Pascal Merz (@ptmerz)
diff --git a/CHANGES b/CHANGES
@@ -14,7 +14,7 @@ The rules for this file:
 
 ------------------------------------------------------------------------------
 
-*/*/2022 xiki-tempula, IAlibay, dotsdl, orbeckst
+*/*/2022 xiki-tempula, IAlibay, dotsdl, orbeckst, ptmerz
 
   * 0.7.0
 
@@ -35,6 +35,8 @@ Fixes
   - Fixes setup.py and setup.cfg to prevent installations with Python versions
     lower than 3.7 (Issue #193)
   - added AutoMBAR to convergence analysis (#189)
+  - Fixed subsampling in statistical_inefficiency when lower or step keywords
+    are used (Issue #198, PR #199)
 
 
 12/28/2021 schlaicha, xiki-tempula, jhenin, ttjoseph, orbeckst

diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
@@ -241,6 +241,12 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None,
        inefficiency was _rounded_ (instead of ``ceil()``) and thus one could
        end up with correlated data.
 
+    .. versionchanged:: 0.7.0
+       Fixed a bug that would effectively ignore the ``lower`` and ``step``
+       keywords when returning the subsampled DataFrame object. See
+       `issue #198 <https://github.com/alchemistry/alchemlyb/issues/198>`_ for 
+       more details.
+
     """
     if _check_multiple_times(df):
         if drop_duplicates:
@@ -295,6 +301,7 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None,
             raise ValueError("series and data must be sampled at the same times")
 
         series = slicing(series, lower=lower, upper=upper, step=step)
+        df = slicing(df, lower=lower, upper=upper, step=step)
 
         # calculate statistical inefficiency of series (could use fft=True but needs test)
         statinef  = statisticalInefficiency(series, fast=False)

diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py
@@ -15,11 +15,16 @@
 
 import alchemtest.gmx
 
-
 def gmx_benzene_dHdl():
     dataset = alchemtest.gmx.load_benzene()
     return gmx.extract_dHdl(dataset['data']['Coulomb'][0], T=300)
 
+# When issue #206 is addressed make the gmx_benzene_dHdl() function the 
+# fixture, remove the wrapper below, and replace 
+# gmx_benzene_dHdl_fixture --> gmx_benzene_dHdl
+@pytest.fixture()
+def gmx_benzene_dHdl_fixture():
+    return gmx_benzene_dHdl()
 
 @pytest.fixture()
 def gmx_ABFE():
@@ -41,7 +46,6 @@ def gmx_benzene_u_nk_fixture():
     dataset = alchemtest.gmx.load_benzene()
     return gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300)
 
-
 def gmx_benzene_u_nk():
     dataset = alchemtest.gmx.load_benzene()
     return gmx.extract_u_nk(dataset['data']['Coulomb'][0], T=300)
@@ -56,6 +60,18 @@ def gmx_benzene_u_nk_full():
     dataset = alchemtest.gmx.load_benzene()
     return alchemlyb.concat([gmx.extract_u_nk(i, T=300) for i in dataset['data']['Coulomb']])
 
+
+def _check_data_is_outside_bounds(data, lower, upper):
+    """
+    Helper function to make sure that `data` has entries that are
+    below the `lower` bound, and above the `upper` bound.
+    This is used by slicing tests to make sure that the data
+    provided is appropriate for the tests.
+    """
+    assert any(data.reset_index()['time'] < lower)
+    assert any(data.reset_index()['time'] > upper)
+
+
 class TestSlicing:
     """Test slicing functionality.
 
@@ -68,6 +84,51 @@ def slicer(self, *args, **kwargs):
     def test_basic_slicing(self, data, size):
         assert len(self.slicer(data, lower=1000, upper=34000, step=5)) == size
 
+    @pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
+                             [
+                                 ('gmx_benzene_dHdl_fixture', 1000, 34000),
+                                 ('gmx_benzene_u_nk_fixture', 1000, 34000),
+                             ])
+    def test_data_is_unchanged(self, dataloader, lower, upper, request):
+        """
+        Test that slicing does not change the underlying data
+        """
+        # Load data
+        data = request.getfixturevalue(dataloader)
+        # Check that the input data is appropriate for the test
+        _check_data_is_outside_bounds(data, lower, upper)
+
+        # Slice data, and test that we didn't change the input data
+        original_length = len(data)
+        sliced = self.slicer(data,
+                             lower=lower,
+                             upper=upper,
+                             step=5)
+        assert len(data) == original_length
+
+    @pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
+                             [
+                                 ('gmx_benzene_dHdl_fixture', 1000, 34000),
+                                 ('gmx_benzene_u_nk_fixture', 1000, 34000),
+                             ])
+    def test_lower_and_upper_bound(self, dataloader, lower, upper, request):
+        """
+        Test that the lower and upper time is respected
+        """
+        # Load data
+        data = request.getfixturevalue(dataloader)
+        # Check that the input data is appropriate for the test
+        _check_data_is_outside_bounds(data, lower, upper)
+
+        # Slice data, and test that we don't observe times outside
+        # the prescribed range
+        sliced = self.slicer(data,
+                             lower=lower,
+                             upper=upper,
+                             step=5)
+        assert all(sliced.reset_index()['time'] >= lower)
+        assert all(sliced.reset_index()['time'] <= upper)
+
     @pytest.mark.parametrize('data', [gmx_benzene_dHdl(),
                                       gmx_benzene_u_nk()])
     def test_disordered_exception(self, data):
@@ -213,6 +274,113 @@ def test_raise_ValueError_for_mismatched_data(self, series):
         with pytest.raises(ValueError):
             self.slicer(data, series=series)
 
+    @pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
+                             [
+                                 ('gmx_benzene_dHdl_fixture', 1000, 34000),
+                                 ('gmx_benzene_u_nk_fixture', 1000, 34000),
+                             ])
+    @pytest.mark.parametrize('use_series', [True, False])
+    @pytest.mark.parametrize('conservative', [True, False])
+    def test_data_is_unchanged(
+            self, dataloader, use_series, lower, upper, conservative, request
+    ):
+        """
+        Test that using statistical_inefficiency does not change the underlying data
+
+        statistical_inefficiency is equivalent to slicing it its `series` parameter
+        is not set. If the `series` parameter is set, additional inefficiency
+        calculations are performed. We want to test both behaviors. The behavior
+        is toggled using the `use_series` argument.
+        """
+        # Load data
+        data = request.getfixturevalue(dataloader)
+        # Check that the input data is appropriate for the test
+        _check_data_is_outside_bounds(data, lower, upper)
+
+        # Define subsampling series if required
+        series = data.sum(axis=1) if use_series else None
+
+        # Slice data, and test that we didn't change the input data
+        original_length = len(data)
+        self.slicer(data,
+                    series=series,
+                    lower=lower,
+                    upper=upper,
+                    step=5,
+                    conservative=conservative)
+        assert len(data) == original_length
+
+    @pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
+                             [
+                                 ('gmx_benzene_dHdl_fixture', 1000, 34000),
+                                 ('gmx_benzene_u_nk_fixture', 1000, 34000),
+                             ])
+    @pytest.mark.parametrize('use_series', [True, False])
+    @pytest.mark.parametrize('conservative', [True, False])
+    def test_lower_and_upper_bound_slicer(
+            self, dataloader, use_series, lower, upper, conservative, request
+    ):
+        """
+        Test that the lower and upper time is respected when using statistical_inefficiency
+
+        statistical_inefficiency is equivalent to slicing it its `series` parameter
+        is not set. If the `series` parameter is set, additional inefficiency
+        calculations are performed. We want to test both behaviors. The behavior
+        is toggled using the `use_series` argument.
+        """
+        # Load data
+        data = request.getfixturevalue(dataloader)
+        # Check that the input data is appropriate for the test
+        _check_data_is_outside_bounds(data, lower, upper)
+
+        # Define subsampling series if required
+        series = data.sum(axis=1) if use_series else None
+
+        # Slice data, and test that we don't observe times outside
+        # the prescribed range
+        sliced = self.slicer(data,
+                             series=series,
+                             lower=lower,
+                             upper=upper,
+                             step=5,
+                             conservative=conservative)
+        assert all(sliced.reset_index()['time'] >= lower)
+        assert all(sliced.reset_index()['time'] <= upper)
+
+    @pytest.mark.parametrize(('dataloader', 'lower', 'upper'),
+                             [
+                                 ('gmx_benzene_dHdl_fixture', 1000, 34000),
+                                 ('gmx_benzene_u_nk_fixture', 1000, 34000),
+                             ])
+    @pytest.mark.parametrize('conservative', [True, False])
+    def test_slicing_inefficiency_equivalence(
+            self, dataloader, lower, upper, conservative, request
+    ):
+        """
+        Test that first slicing the data frame, then subsampling is equivalent to
+        subsampling with lower / upper bounds set
+        """
+        # Load data
+        data = request.getfixturevalue(dataloader)
+        # Check that the input data is appropriate for the test
+        _check_data_is_outside_bounds(data, lower, upper)
+
+        # Slice dataframe, then subsample it based on the sum of its components
+        sliced_data = slicing(data, lower=lower, upper=upper)
+        subsampled_sliced_data = self.slicer(sliced_data,
+                                             series=sliced_data.sum(axis=1),
+                                             conservative=conservative)
+
+        # Subsample the dataframe based on the sum of its components while
+        # also specifying the slicing range
+        subsampled_data = self.slicer(data,
+                                      series=data.sum(axis=1),
+                                      lower=lower,
+                                      upper=upper,
+                                      conservative=conservative)
+
+        assert (subsampled_sliced_data == subsampled_data).all(axis=None)
+
 
 class TestEquilibriumDetection(TestSlicing, CorrelatedPreprocessors):
Original file line number	Diff line number	Diff line change
Expand Up		@@ -45,3 +45,4 @@ Chronological list of authors

		2022
		- Irfan Alibay (@IAlibay)
		- Pascal Merz (@ptmerz)