From a37f1a45e83d7f803d7fcab7d384da28e9c1e714 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 29 Dec 2020 18:56:07 +0000 Subject: [PATCH] BUG/REG: RollingGroupby MultiIndex levels dropped (#38737) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.2.1.rst | 1 + pandas/core/shared_docs.py | 2 +- pandas/core/window/rolling.py | 28 +++++++---------- pandas/tests/window/test_groupby.py | 47 ++++++++++++++++++++++++++--- 4 files changed, 55 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 649b17e255f3d..a756239ee6798 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`) - :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`) +- Fixed a regression in ``groupby().rolling()`` where :class:`MultiIndex` levels were dropped (:issue:`38523`) - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) - diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index ad2eafe7295b0..c1aed4eb3409b 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -108,7 +108,7 @@ Note this does not influence the order of observations within each group. Groupby preserves the order of rows within each group. group_keys : bool, default True - When calling apply, add group keys to index to identify pieces. + When calling ``groupby().apply()``, add group keys to index to identify pieces. squeeze : bool, default False Reduce the dimensionality of the return type if possible, otherwise return a consistent type. diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 440e0f210bd57..db8a48300206b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -769,28 +769,22 @@ def _apply( numba_cache_key, **kwargs, ) - # Reconstruct the resulting MultiIndex from tuples + # Reconstruct the resulting MultiIndex # 1st set of levels = group by labels - # 2nd set of levels = original index - # Ignore 2nd set of levels if a group by label include an index level - result_index_names = [ - grouping.name for grouping in self._groupby.grouper._groupings - ] - grouped_object_index = None + # 2nd set of levels = original DataFrame/Series index + grouped_object_index = self.obj.index + grouped_index_name = [*grouped_object_index.names] + groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] + result_index_names = groupby_keys + grouped_index_name - column_keys = [ + drop_columns = [ key - for key in result_index_names + for key in groupby_keys if key not in self.obj.index.names or key is None ] - - if len(column_keys) == len(result_index_names): - grouped_object_index = self.obj.index - grouped_index_name = [*grouped_object_index.names] - result_index_names += grouped_index_name - else: - # Our result will have still kept the column in the result - result = result.drop(columns=column_keys, errors="ignore") + if len(drop_columns) != len(groupby_keys): + # Our result will have kept groupby columns which should be dropped + result = result.drop(columns=drop_columns, errors="ignore") codes = self._groupby.grouper.codes levels = self._groupby.grouper.levels diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index b89fb35ac3a70..f915da3330ba7 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -556,23 +556,31 @@ def test_groupby_rolling_nans_in_index(self, rollings, key): with pytest.raises(ValueError, match=f"{key} must be monotonic"): df.groupby("c").rolling("60min", **rollings) - def test_groupby_rolling_group_keys(self): + @pytest.mark.parametrize("group_keys", [True, False]) + def test_groupby_rolling_group_keys(self, group_keys): # GH 37641 + # GH 38523: GH 37641 actually was not a bug. + # group_keys only applies to groupby.apply directly arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) s = Series([1, 2, 3], index=index) - result = s.groupby(["idx1", "idx2"], group_keys=False).rolling(1).mean() + result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean() expected = Series( [1.0, 2.0, 3.0], index=MultiIndex.from_tuples( - [("val1", "val1"), ("val1", "val1"), ("val2", "val2")], - names=["idx1", "idx2"], + [ + ("val1", "val1", "val1", "val1"), + ("val1", "val1", "val1", "val1"), + ("val2", "val2", "val2", "val2"), + ], + names=["idx1", "idx2", "idx1", "idx2"], ), ) tm.assert_series_equal(result, expected) def test_groupby_rolling_index_level_and_column_label(self): + # The groupby keys should not appear as a resulting column arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) @@ -581,7 +589,12 @@ def test_groupby_rolling_index_level_and_column_label(self): expected = DataFrame( {"B": [0.0, 1.0, 2.0]}, index=MultiIndex.from_tuples( - [("val1", 1), ("val1", 1), ("val2", 2)], names=["idx1", "A"] + [ + ("val1", 1, "val1", "val1"), + ("val1", 1, "val1", "val1"), + ("val2", 2, "val2", "val2"), + ], + names=["idx1", "A", "idx1", "idx2"], ), ) tm.assert_frame_equal(result, expected) @@ -640,6 +653,30 @@ def test_groupby_rolling_resulting_multiindex(self): ) tm.assert_index_equal(result.index, expected_index) + def test_groupby_level(self): + # GH 38523 + arrays = [ + ["Falcon", "Falcon", "Parrot", "Parrot"], + ["Captive", "Wild", "Captive", "Wild"], + ] + index = MultiIndex.from_arrays(arrays, names=("Animal", "Type")) + df = DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index) + result = df.groupby(level=0)["Max Speed"].rolling(2).sum() + expected = Series( + [np.nan, 740.0, np.nan, 50.0], + index=MultiIndex.from_tuples( + [ + ("Falcon", "Falcon", "Captive"), + ("Falcon", "Falcon", "Wild"), + ("Parrot", "Parrot", "Captive"), + ("Parrot", "Parrot", "Wild"), + ], + names=["Animal", "Animal", "Type"], + ), + name="Max Speed", + ) + tm.assert_series_equal(result, expected) + class TestExpanding: def setup_method(self):