BUG: Series groupby does not include nan counts for all categorical l…
OliverHofkens authored and proost committed Dec 19, 2019
1 parent bc30ab1 commit 933181a
Showing 4 changed files with 141 additions and 7 deletions.
doc/source/whatsnew/v1.0.0.rst: 41 additions & 0 deletions
@@ -194,6 +194,47 @@ New repr for :class:`pandas.core.arrays.IntervalArray`
pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])
All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`)

- :meth:`SeriesGroupBy.count`
- :meth:`SeriesGroupBy.size`
- :meth:`SeriesGroupBy.nunique`
- :meth:`SeriesGroupBy.nth`

.. ipython:: python
df = pd.DataFrame({
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
"cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
"value": [0.1] * 4,
})
df
*pandas 0.25.x*

.. code-block:: ipython
In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
Out[2]:
cat_1 cat_2
A A 1
B 1
B A 1
B 1
Name: value, dtype: int64
*pandas 1.0.0*

.. ipython:: python
df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
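For reference, the result now contains every combination of the two categoricals, with unobserved groups counted as 0 rather than dropped. Roughly (illustrative; the published docs generate this output at build time):

.. code-block:: ipython

   In [3]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
   Out[3]:
   cat_1  cat_2
   A      A        1
          B        1
          C        0
   B      A        1
          B        1
          C        0
   C      A        0
          B        0
          C        0
   Name: value, dtype: int64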
.. _whatsnew_1000.api.other:

Other API changes
pandas/core/groupby/generic.py: 4 additions & 2 deletions
@@ -557,7 +557,8 @@ def nunique(self, dropna: bool = True) -> Series:
res, out = np.zeros(len(ri), dtype=out.dtype), res
res[ids[idx]] = out

-        return Series(res, index=ri, name=self._selection_name)
+        result = Series(res, index=ri, name=self._selection_name)
+        return self._reindex_output(result, fill_value=0)

@Appender(Series.describe.__doc__)
def describe(self, **kwargs):
@@ -709,12 +710,13 @@ def count(self) -> Series:
minlength = ngroups or 0
out = np.bincount(ids[mask], minlength=minlength)

-        return Series(
+        result = Series(
            out,
            index=self.grouper.result_index,
            name=self._selection_name,
            dtype="int64",
        )
+        return self._reindex_output(result, fill_value=0)

def _apply_to_column_groupbys(self, func):
""" return a pass thru """
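A minimal usage sketch of what these two changes enable (not part of the commit; assumes a pandas build that includes this fix). Both count and nunique now reindex their result over the full product of declared categories, filling unobserved combinations with 0:

import pandas as pd

df = pd.DataFrame(
    {
        "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
        "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
        "value": [0.1, 0.2, 0.3, 0.4],
    }
)

grouped = df.groupby(["cat_1", "cat_2"], observed=False)["value"]

# All 9 category combinations appear in the index; the 5 unobserved
# combinations are filled with 0 instead of being left out.
print(grouped.count())
print(grouped.nunique())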
pandas/core/groupby/groupby.py: 17 additions & 5 deletions
@@ -39,6 +39,7 @@ class providing the base-class of operations.
)
from pandas.core.dtypes.missing import isna, notna

+from pandas._typing import FrameOrSeries, Scalar
from pandas.core import nanops
import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical, try_cast_to_ea
@@ -1296,7 +1297,7 @@ def size(self):

if isinstance(self.obj, Series):
result.name = self.obj.name
-        return result
+        return self._reindex_output(result, fill_value=0)

@classmethod
def _add_numeric_operations(cls):
@@ -1740,6 +1741,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame:
if not self.observed and isinstance(result_index, CategoricalIndex):
out = out.reindex(result_index)

+        out = self._reindex_output(out)
return out.sort_index() if self.sort else out

# dropna is truthy
@@ -2380,7 +2382,9 @@ def tail(self, n=5):
mask = self._cumcount_array(ascending=False) < n
return self._selected_obj[mask]

-    def _reindex_output(self, output):
+    def _reindex_output(
+        self, output: FrameOrSeries, fill_value: Scalar = np.NaN
+    ) -> FrameOrSeries:
"""
If we have categorical groupers, then we might want to make sure that
we have a fully re-indexed output to the levels. This means expanding
@@ -2394,8 +2398,10 @@ def _reindex_output(self, output):
Parameters
----------
-        output: Series or DataFrame
+        output : Series or DataFrame
+            Object resulting from grouping and applying an operation.
+        fill_value : scalar, default np.NaN
+            Value to use for unobserved categories if self.observed is False.
Returns
-------
@@ -2426,7 +2432,11 @@ def _reindex_output(self, output):
).sortlevel()

if self.as_index:
-            d = {self.obj._get_axis_name(self.axis): index, "copy": False}
+            d = {
+                self.obj._get_axis_name(self.axis): index,
+                "copy": False,
+                "fill_value": fill_value,
+            }
return output.reindex(**d)

# GH 13204
@@ -2448,7 +2458,9 @@ def _reindex_output(self, output):
output = output.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
-        output = output.set_index(self.grouper.result_index).reindex(index, copy=False)
+        output = output.set_index(self.grouper.result_index).reindex(
+            index, copy=False, fill_value=fill_value
+        )

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
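The groupby.py changes thread a fill_value through _reindex_output so that count-like results can be expanded with zeros instead of NaN. A rough standalone sketch of the underlying reindexing idea, using only public pandas API (not the internal implementation):

import numpy as np
import pandas as pd

# A grouped result that only contains the observed category combinations.
observed = pd.Series(
    [1, 1, 1, 1],
    index=pd.MultiIndex.from_tuples(
        [("A", "A"), ("A", "B"), ("B", "A"), ("B", "B")], names=["cat_1", "cat_2"]
    ),
)

# The full Cartesian product of the declared categories of both groupers.
full_index = pd.MultiIndex.from_product(
    [list("ABC"), list("ABC")], names=["cat_1", "cat_2"]
)

# Count-like aggregations expand with fill_value=0 and keep an integer dtype ...
counts = observed.reindex(full_index, fill_value=0)

# ... while most other aggregations keep the default fill value of NaN.
with_nans = observed.reindex(full_index)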
pandas/tests/groupby/test_categorical.py: 79 additions & 0 deletions
@@ -1252,3 +1252,82 @@ def test_get_nonexistent_category():
{"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
)
)


def test_series_groupby_on_2_categoricals_unobserved(
reduction_func: str, observed: bool
):
# GH 17605

if reduction_func == "ngroup":
pytest.skip("ngroup is not truly a reduction")

df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")),
"cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")),
"value": [0.1] * 4,
}
)
args = {"nth": [0]}.get(reduction_func, [])

expected_length = 4 if observed else 16

series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"]
agg = getattr(series_groupby, reduction_func)
result = agg(*args)

assert len(result) == expected_length


@pytest.mark.parametrize(
"func, zero_or_nan",
[
("all", np.NaN),
("any", np.NaN),
("count", 0),
("first", np.NaN),
("idxmax", np.NaN),
("idxmin", np.NaN),
("last", np.NaN),
("mad", np.NaN),
("max", np.NaN),
("mean", np.NaN),
("median", np.NaN),
("min", np.NaN),
("nth", np.NaN),
("nunique", 0),
("prod", np.NaN),
("quantile", np.NaN),
("sem", np.NaN),
("size", 0),
("skew", np.NaN),
("std", np.NaN),
("sum", np.NaN),
("var", np.NaN),
],
)
def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan):
# GH 17605
# Tests whether the unobserved categories in the result contain 0 or NaN
df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
"cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
"value": [0.1] * 4,
}
)
unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
args = {"nth": [0]}.get(func, [])

series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
agg = getattr(series_groupby, func)
result = agg(*args)

for idx in unobserved:
val = result.loc[idx]
assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)

# If we expect unobserved values to be zero, we also expect the dtype to be int
if zero_or_nan == 0:
assert np.issubdtype(result.dtype, np.integer)
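To run just these new tests from a pandas checkout, something along these lines should work (the -k pattern simply matches both test names above):

pytest pandas/tests/groupby/test_categorical.py -k "on_2_categoricals_unobserved" -v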
