Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN/BUG: Clean/Simplify _wrap_applied_output #35792

Merged
merged 7 commits into from
Aug 26, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -249,9 +249,8 @@ Groupby/resample/rolling

- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
- Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
-
-
- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`)
- Bug in :meth:`DataFrameGroupBy.apply` that would drop a :class:`CategoricalIndex` when grouped on (:issue:`35792`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add the issue number here? (In this case, use this PR's number, since there isn't a separate issue.)


Reshaping
^^^^^^^^^
Expand Down
88 changes: 23 additions & 65 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1203,57 +1203,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
return self.obj._constructor(index=keys)

key_names = self.grouper.names

# GH12824
first_not_none = next(com.not_none(*values), None)

if first_not_none is None:
# GH9684. If all values are None, then this will throw an error.
# We'd prefer it return an empty dataframe.
# GH9684 - All values are None, return an empty frame.
return self.obj._constructor()
elif isinstance(first_not_none, DataFrame):
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
else:
if len(self.grouper.groupings) > 1:
key_index = self.grouper.result_index

else:
ping = self.grouper.groupings[0]
if len(keys) == ping.ngroups:
key_index = ping.group_index
key_index.name = key_names[0]

key_lookup = Index(keys)
indexer = key_lookup.get_indexer(key_index)

# reorder the values
values = [values[i] for i in indexer]

# update due to the potential reorder
first_not_none = next(com.not_none(*values), None)
else:

key_index = Index(keys, name=key_names[0])

# don't use the key indexer
if not self.as_index:
key_index = None
key_index = self.grouper.result_index if self.as_index else None

# make Nones an empty object
if first_not_none is None:
return self.obj._constructor()
elif isinstance(first_not_none, NDFrame):
if isinstance(first_not_none, Series):

# this is to silence a DeprecationWarning
# TODO: Remove when default dtype of empty Series is object
kwargs = first_not_none._construct_axes_dict()
if isinstance(first_not_none, Series):
backup = create_series_with_explicit_dtype(
**kwargs, dtype_if_empty=object
)
else:
backup = first_not_none._constructor(**kwargs)
backup = create_series_with_explicit_dtype(
**kwargs, dtype_if_empty=object
)

values = [x if (x is not None) else backup for x in values]

Expand All @@ -1262,7 +1230,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
if isinstance(v, Series):
applied_index = self._selected_obj._get_axis(self.axis)
all_indexed_same = all_indexes_same([x.index for x in values])
all_indexed_same = all_indexes_same((x.index for x in values))
singular_series = len(values) == 1 and applied_index.nlevels == 1

# GH3596
Expand Down Expand Up @@ -1294,7 +1262,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
# GH 8467
return self._concat_objects(keys, values, not_indexed_same=True)

if self.axis == 0 and isinstance(v, ABCSeries):
# GH6124 if the list of Series have a consistent name,
# then propagate that name to the result.
index = v.index.copy()
Expand All @@ -1307,34 +1274,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(names) == 1:
index.name = list(names)[0]

# normally use vstack as its faster than concat
# and if we have mi-columns
if (
isinstance(v.index, MultiIndex)
or key_index is None
or isinstance(key_index, MultiIndex)
):
stacked_values = np.vstack([np.asarray(v) for v in values])
result = self.obj._constructor(
stacked_values, index=key_index, columns=index
)
else:
# GH5788 instead of stacking; concat gets the
# dtypes correct
from pandas.core.reshape.concat import concat

result = concat(
values,
keys=key_index,
names=key_index.names,
axis=self.axis,
).unstack()
result.columns = index
elif isinstance(v, ABCSeries):
stacked_values = np.vstack([np.asarray(v) for v in values])

jreback marked this conversation as resolved.
Show resolved Hide resolved
if self.axis == 0:
index = key_index
columns = v.index.copy()
if columns.name is None:
# GH6124 - propagate name of Series when it's consistent
names = {v.name for v in values}
if len(names) == 1:
columns.name = list(names)[0]
else:
index = v.index
columns = key_index
stacked_values = stacked_values.T

result = self.obj._constructor(
stacked_values.T, index=v.index, columns=key_index
stacked_values, index=index, columns=columns
)

elif not self.as_index:
# We add grouping column below, so create a frame here
result = DataFrame(
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,15 +297,16 @@ def all_indexes_same(indexes):

Parameters
----------
indexes : list of Index objects
indexes : iterable of Index objects

Returns
-------
bool
True if all indexes contain the same elements, False otherwise.
"""
first = indexes[0]
for index in indexes[1:]:
itr = iter(indexes)
first = next(itr)
for index in itr:
if not first.equals(index):
return False
return True
7 changes: 4 additions & 3 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,13 +861,14 @@ def test_apply_multi_level_name(category):
b = [1, 2] * 5
if category:
b = pd.Categorical(b, categories=[1, 2, 3])
expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B")
else:
expected_index = pd.Index([1, 2], name="B")
df = pd.DataFrame(
{"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
).set_index(["A", "B"])
result = df.groupby("B").apply(lambda x: x.sum())
expected = pd.DataFrame(
{"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B")
)
expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index)
tm.assert_frame_equal(result, expected)
assert df.index.names == ["A", "B"]

Expand Down