From 34177bc4d09b3f20f07ea294c94e2956f77669d0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 16 Jan 2021 01:10:13 +0000 Subject: [PATCH] Backport PR #39191: Revert "BUG/REG: RollingGroupby MultiIndex levels dropped (#38737)" --- doc/source/whatsnew/v1.2.1.rst | 1 - pandas/core/shared_docs.py | 2 +- pandas/core/window/rolling.py | 28 ++++++++++------- pandas/tests/window/test_groupby.py | 47 +++-------------------------- 4 files changed, 23 insertions(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 55fddb8b732e2..ebac06b80ad67 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -15,7 +15,6 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`to_csv` that created corrupted zip files when there were more rows than ``chunksize`` (:issue:`38714`) -- Fixed regression in ``groupby().rolling()`` where :class:`MultiIndex` levels were dropped (:issue:`38523`) - Fixed regression in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) - Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) - Fixed regression in :meth:`DataFrame.any` and :meth:`DataFrame.all` not returning a result for tz-aware ``datetime64`` columns (:issue:`38723`) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 4007ef50932fc..3aeb3b664b27f 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -108,7 +108,7 @@ Note this does not influence the order of observations within each group. Groupby preserves the order of rows within each group. group_keys : bool, default True - When calling ``groupby().apply()``, add group keys to index to identify pieces. + When calling apply, add group keys to index to identify pieces. squeeze : bool, default False Reduce the dimensionality of the return type if possible, otherwise return a consistent type. diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e50a907901dc7..e6185f8ae0679 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -767,22 +767,28 @@ def _apply( numba_cache_key, **kwargs, ) - # Reconstruct the resulting MultiIndex + # Reconstruct the resulting MultiIndex from tuples # 1st set of levels = group by labels - # 2nd set of levels = original DataFrame/Series index - grouped_object_index = self.obj.index - grouped_index_name = [*grouped_object_index.names] - groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] - result_index_names = groupby_keys + grouped_index_name + # 2nd set of levels = original index + # Ignore 2nd set of levels if a group by label include an index level + result_index_names = [ + grouping.name for grouping in self._groupby.grouper._groupings + ] + grouped_object_index = None - drop_columns = [ + column_keys = [ key - for key in groupby_keys + for key in result_index_names if key not in self.obj.index.names or key is None ] - if len(drop_columns) != len(groupby_keys): - # Our result will have kept groupby columns which should be dropped - result = result.drop(columns=drop_columns, errors="ignore") + + if len(column_keys) == len(result_index_names): + grouped_object_index = self.obj.index + grouped_index_name = [*grouped_object_index.names] + result_index_names += grouped_index_name + else: + # Our result will have still kept the column in the result + result = result.drop(columns=column_keys, errors="ignore") codes = self._groupby.grouper.codes levels = self._groupby.grouper.levels diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index f915da3330ba7..b89fb35ac3a70 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -556,31 +556,23 @@ def test_groupby_rolling_nans_in_index(self, rollings, key): with pytest.raises(ValueError, match=f"{key} must be monotonic"): df.groupby("c").rolling("60min", **rollings) - @pytest.mark.parametrize("group_keys", [True, False]) - def test_groupby_rolling_group_keys(self, group_keys): + def test_groupby_rolling_group_keys(self): # GH 37641 - # GH 38523: GH 37641 actually was not a bug. - # group_keys only applies to groupby.apply directly arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) s = Series([1, 2, 3], index=index) - result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean() + result = s.groupby(["idx1", "idx2"], group_keys=False).rolling(1).mean() expected = Series( [1.0, 2.0, 3.0], index=MultiIndex.from_tuples( - [ - ("val1", "val1", "val1", "val1"), - ("val1", "val1", "val1", "val1"), - ("val2", "val2", "val2", "val2"), - ], - names=["idx1", "idx2", "idx1", "idx2"], + [("val1", "val1"), ("val1", "val1"), ("val2", "val2")], + names=["idx1", "idx2"], ), ) tm.assert_series_equal(result, expected) def test_groupby_rolling_index_level_and_column_label(self): - # The groupby keys should not appear as a resulting column arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) @@ -589,12 +581,7 @@ def test_groupby_rolling_index_level_and_column_label(self): expected = DataFrame( {"B": [0.0, 1.0, 2.0]}, index=MultiIndex.from_tuples( - [ - ("val1", 1, "val1", "val1"), - ("val1", 1, "val1", "val1"), - ("val2", 2, "val2", "val2"), - ], - names=["idx1", "A", "idx1", "idx2"], + [("val1", 1), ("val1", 1), ("val2", 2)], names=["idx1", "A"] ), ) tm.assert_frame_equal(result, expected) @@ -653,30 +640,6 @@ def test_groupby_rolling_resulting_multiindex(self): ) tm.assert_index_equal(result.index, expected_index) - def test_groupby_level(self): - # GH 38523 - arrays = [ - ["Falcon", "Falcon", "Parrot", "Parrot"], - ["Captive", "Wild", "Captive", "Wild"], - ] - index = MultiIndex.from_arrays(arrays, names=("Animal", "Type")) - df = DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index) - result = df.groupby(level=0)["Max Speed"].rolling(2).sum() - expected = Series( - [np.nan, 740.0, np.nan, 50.0], - index=MultiIndex.from_tuples( - [ - ("Falcon", "Falcon", "Captive"), - ("Falcon", "Falcon", "Wild"), - ("Parrot", "Parrot", "Captive"), - ("Parrot", "Parrot", "Wild"), - ], - names=["Animal", "Animal", "Type"], - ), - name="Max Speed", - ) - tm.assert_series_equal(result, expected) - class TestExpanding: def setup_method(self):