Skip to content

Commit

Permalink
CLN/BUG: Clean/Simplify _wrap_applied_output (#35792)
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshadrach authored Aug 26, 2020
1 parent 9f3e429 commit d90b73b
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 71 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`)
- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`)
- Bug in :meth:`DataFrameGroupBy.apply` that would drop a :class:`CategoricalIndex` when grouped on (:issue:`35792`)

Reshaping
^^^^^^^^^
Expand Down
90 changes: 25 additions & 65 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1197,57 +1197,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
return self.obj._constructor(index=keys)

key_names = self.grouper.names

# GH12824
first_not_none = next(com.not_none(*values), None)

if first_not_none is None:
# GH9684. If all values are None, then this will throw an error.
# We'd prefer it return an empty dataframe.
# GH9684 - All values are None, return an empty frame.
return self.obj._constructor()
elif isinstance(first_not_none, DataFrame):
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
else:
if len(self.grouper.groupings) > 1:
key_index = self.grouper.result_index

else:
ping = self.grouper.groupings[0]
if len(keys) == ping.ngroups:
key_index = ping.group_index
key_index.name = key_names[0]

key_lookup = Index(keys)
indexer = key_lookup.get_indexer(key_index)

# reorder the values
values = [values[i] for i in indexer]

# update due to the potential reorder
first_not_none = next(com.not_none(*values), None)
else:

key_index = Index(keys, name=key_names[0])

# don't use the key indexer
if not self.as_index:
key_index = None
key_index = self.grouper.result_index if self.as_index else None

# make Nones an empty object
if first_not_none is None:
return self.obj._constructor()
elif isinstance(first_not_none, NDFrame):
if isinstance(first_not_none, Series):

# this is to silence a DeprecationWarning
# TODO: Remove when default dtype of empty Series is object
kwargs = first_not_none._construct_axes_dict()
if isinstance(first_not_none, Series):
backup = create_series_with_explicit_dtype(
**kwargs, dtype_if_empty=object
)
else:
backup = first_not_none._constructor(**kwargs)
backup = create_series_with_explicit_dtype(
**kwargs, dtype_if_empty=object
)

values = [x if (x is not None) else backup for x in values]

Expand All @@ -1256,7 +1224,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
if isinstance(v, Series):
applied_index = self._selected_obj._get_axis(self.axis)
all_indexed_same = all_indexes_same([x.index for x in values])
all_indexed_same = all_indexes_same((x.index for x in values))
singular_series = len(values) == 1 and applied_index.nlevels == 1

# GH3596
Expand Down Expand Up @@ -1288,7 +1256,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
# GH 8467
return self._concat_objects(keys, values, not_indexed_same=True)

if self.axis == 0 and isinstance(v, ABCSeries):
# GH6124 if the list of Series have a consistent name,
# then propagate that name to the result.
index = v.index.copy()
Expand All @@ -1301,34 +1268,27 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(names) == 1:
index.name = list(names)[0]

# normally use vstack as its faster than concat
# and if we have mi-columns
if (
isinstance(v.index, MultiIndex)
or key_index is None
or isinstance(key_index, MultiIndex)
):
stacked_values = np.vstack([np.asarray(v) for v in values])
result = self.obj._constructor(
stacked_values, index=key_index, columns=index
)
else:
# GH5788 instead of stacking; concat gets the
# dtypes correct
from pandas.core.reshape.concat import concat

result = concat(
values,
keys=key_index,
names=key_index.names,
axis=self.axis,
).unstack()
result.columns = index
elif isinstance(v, ABCSeries):
# Combine values
# vstack+constructor is faster than concat and handles MI-columns
stacked_values = np.vstack([np.asarray(v) for v in values])

if self.axis == 0:
index = key_index
columns = v.index.copy()
if columns.name is None:
# GH6124 - propagate name of Series when it's consistent
names = {v.name for v in values}
if len(names) == 1:
columns.name = list(names)[0]
else:
index = v.index
columns = key_index
stacked_values = stacked_values.T

result = self.obj._constructor(
stacked_values.T, index=v.index, columns=key_index
stacked_values, index=index, columns=columns
)

elif not self.as_index:
# We add grouping column below, so create a frame here
result = DataFrame(
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,15 +297,16 @@ def all_indexes_same(indexes):
Parameters
----------
indexes : list of Index objects
indexes : iterable of Index objects
Returns
-------
bool
True if all indexes contain the same elements, False otherwise.
"""
first = indexes[0]
for index in indexes[1:]:
itr = iter(indexes)
first = next(itr)
for index in itr:
if not first.equals(index):
return False
return True
7 changes: 4 additions & 3 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,13 +861,14 @@ def test_apply_multi_level_name(category):
b = [1, 2] * 5
if category:
b = pd.Categorical(b, categories=[1, 2, 3])
expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B")
else:
expected_index = pd.Index([1, 2], name="B")
df = pd.DataFrame(
{"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
).set_index(["A", "B"])
result = df.groupby("B").apply(lambda x: x.sum())
expected = pd.DataFrame(
{"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B")
)
expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index)
tm.assert_frame_equal(result, expected)
assert df.index.names == ["A", "B"]

Expand Down

0 comments on commit d90b73b

Please sign in to comment.