From 75b6682bc3b7c494cdf9be52ddb131731fb04197 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 19 Nov 2023 09:57:54 -0500 Subject: [PATCH 1/5] PERF: groupby.nunique --- pandas/_libs/groupby.pyi | 7 ++++ pandas/_libs/groupby.pyx | 37 +++++++++++++++++ pandas/core/groupby/generic.py | 73 +++++++++++++++------------------- 3 files changed, 77 insertions(+), 40 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 609663c721950..0f8fff9837ad0 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -213,3 +213,10 @@ def group_cummax( result_mask: np.ndarray | None = ..., skipna: bool = ..., ) -> None: ... +def group_nunique( + out: np.ndarray, # int64_t[::1] + codes: np.ndarray, # ndarray[int64_t, ndim=1] + values: np.ndarray, # ndarray[int64_t, ndim=1] + labels: np.ndarray, # const intp_t[::1] + dropna: bool, +) -> None: ... diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 6f9b33b042959..f5057748085a0 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -47,6 +47,12 @@ from pandas._libs.dtypes cimport ( numeric_object_t, numeric_t, ) +from pandas._libs.khash cimport ( + kh_destroy_int64, + kh_init_int64, + kh_put_int64, + kh_resize_int64, +) from pandas._libs.missing cimport checknull @@ -2154,3 +2160,34 @@ def group_cummax( skipna=skipna, compute_max=True, ) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nunique( + int64_t[::1] out, + ndarray[int64_t, ndim=1] codes, + ndarray[int64_t, ndim=1] values, + const intp_t[::1] labels, + bint dropna, +): + cdef: + Py_ssize_t i, N = len(values) + int ret = 0 + int64_t val + intp_t lab + + with nogil: + table = kh_init_int64() + kh_resize_int64(table, N) + for i in range(N): + lab = labels[i] + if lab < 0: + continue + if dropna and codes[i] < 0: + continue + val = values[i] + kh_put_int64(table, val, &ret) + if ret != 0: + out[lab] += 1 + kh_destroy_int64(table) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1bdba5a3e71fb..194ad2e69c221 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -28,6 +28,8 @@ Interval, lib, ) +import pandas._libs.groupby as libgroupby +from pandas._libs.hashtable import duplicated from pandas.errors import SpecificationError from pandas.util._decorators import ( Appender, @@ -84,6 +86,7 @@ default_index, ) from pandas.core.series import Series +from pandas.core.sorting import get_group_index from pandas.core.util.numba_ import maybe_use_numba from pandas.plotting import boxplot_frame_groupby @@ -672,55 +675,45 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: 2023-02-01 1 Freq: MS, dtype: int64 """ - ids, _, _ = self.grouper.group_info - + ids, _, ngroups = self.grouper.group_info val = self.obj._values - - codes, _ = algorithms.factorize(val, sort=False) - sorter = np.lexsort((codes, ids)) - codes = codes[sorter] - ids = ids[sorter] - - # group boundaries are where group ids change - # unique observations are where sorted values change - idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] - inc = np.r_[1, codes[1:] != codes[:-1]] - - # 1st item of each group is a new unique observation - mask = codes == -1 - if dropna: - inc[idx] = 1 - inc[mask] = 0 + codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) + nvals = len(uniques) + + if ngroups * nvals <= np.iinfo(np.int64).max: + # Fastpath - can only take if we won't overflow when computing values + values = nvals * ids + codes + res = np.zeros(ngroups, 
dtype="int64") + libgroupby.group_nunique(res, codes, values, ids, dropna) else: - inc[mask & np.r_[False, mask[:-1]]] = 0 - inc[idx] = 1 - - out = np.add.reduceat(inc, idx).astype("int64", copy=False) - if len(ids): - # NaN/NaT group exists if the head of ids is -1, - # so remove it from res and exclude its index from idx - if ids[0] == -1: - res = out[1:] - idx = idx[np.flatnonzero(idx)] - else: - res = out - else: - res = out[1:] - ri = self.grouper.result_index - - # we might have duplications among the bins - if len(res) != len(ri): - res, out = np.zeros(len(ri), dtype=out.dtype), res - if len(ids) > 0: - # GH#21334s - res[ids[idx]] = out + if self.grouper.has_dropped_na: + mask = ids >= 0 + ids = ids[mask] + codes = codes[mask] + group_index = get_group_index( + [ids, codes], + (len(ids), 2), + sort=False, + xnull=dropna, + ) + if dropna: + mask = group_index < 0 + if mask.any(): + ids = ids[~mask] + group_index = group_index[~mask] + mask = duplicated(group_index, "first") + res = np.bincount(ids[~mask]) + if res.shape[0] < ngroups: + res = np.pad(res, (0, ngroups - res.shape[0])) + ri = self.grouper.result_index result: Series | DataFrame = self.obj._constructor( res, index=ri, name=self.obj.name ) if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) + return self._reindex_output(result, fill_value=0) @doc(Series.describe) From 8002f100044a6ad906cb761aaddcfcd412add1ff Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 26 Nov 2023 08:55:35 -0500 Subject: [PATCH 2/5] Remove fastpath --- pandas/core/groupby/generic.py | 52 +++++++++++++++------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 194ad2e69c221..cf20da9d53568 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -28,7 +28,6 @@ Interval, lib, ) -import pandas._libs.groupby as libgroupby from pandas._libs.hashtable import duplicated from pandas.errors import SpecificationError from pandas.util._decorators import ( @@ -677,34 +676,30 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: """ ids, _, ngroups = self.grouper.group_info val = self.obj._values - codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) - nvals = len(uniques) - - if ngroups * nvals <= np.iinfo(np.int64).max: - # Fastpath - can only take if we won't overflow when computing values - values = nvals * ids + codes - res = np.zeros(ngroups, dtype="int64") - libgroupby.group_nunique(res, codes, values, ids, dropna) - else: - if self.grouper.has_dropped_na: - mask = ids >= 0 + codes, _ = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) + + if self.grouper.has_dropped_na: + mask = ids >= 0 + ids = ids[mask] + codes = codes[mask] + + group_index = get_group_index( + labels=[ids, codes], + shape=(len(ids), 2), + sort=False, + xnull=dropna, + ) + + if dropna: + mask = group_index >= 0 + if (~mask).all(): ids = ids[mask] - codes = codes[mask] - group_index = get_group_index( - [ids, codes], - (len(ids), 2), - sort=False, - xnull=dropna, - ) - if dropna: - mask = group_index < 0 - if mask.any(): - ids = ids[~mask] - group_index = group_index[~mask] - mask = duplicated(group_index, "first") - res = np.bincount(ids[~mask]) - if res.shape[0] < ngroups: - res = np.pad(res, (0, ngroups - res.shape[0])) + group_index = group_index[mask] + + mask = duplicated(group_index, "first") + res = np.bincount(ids[~mask]) + if res.shape[0] < ngroups: + res 
= np.pad(res, (0, ngroups - res.shape[0])) ri = self.grouper.result_index result: Series | DataFrame = self.obj._constructor( @@ -713,7 +708,6 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) - return self._reindex_output(result, fill_value=0) @doc(Series.describe) From f382a64990951dc8c8912f9283fa8e69e4b720d7 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 26 Nov 2023 15:12:03 -0500 Subject: [PATCH 3/5] Remove fastpath --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/groupby.pyi | 7 ------- pandas/_libs/groupby.pyx | 37 ---------------------------------- pandas/core/groupby/generic.py | 10 ++++----- 4 files changed, 5 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index fbff38aeefc51..746692fdac2a0 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -426,6 +426,7 @@ Performance improvements - Performance improvement in :meth:`Series.str.get_dummies` when dtype is ``"string[pyarrow]"`` or ``"string[pyarrow_numpy]"`` (:issue:`56110`) - Performance improvement in :meth:`Series.str` methods (:issue:`55736`) - Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`) +- Performance improvement in :meth:`DataFrameGroupBy.nunique` and :meth:`SeriesGroupBy.nunique` (:issue:`55972`) - Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - Performance improvement when indexing into a non-unique index (:issue:`55816`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 013c214a8d187..135828a23648a 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -212,10 +212,3 @@ def group_cummax( result_mask: np.ndarray | None = ..., skipna: bool = ..., ) -> None: ... -def group_nunique( - out: np.ndarray, # int64_t[::1] - codes: np.ndarray, # ndarray[int64_t, ndim=1] - values: np.ndarray, # ndarray[int64_t, ndim=1] - labels: np.ndarray, # const intp_t[::1] - dropna: bool, -) -> None: ... 
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c8fbf83d63e19..19d71b0a6fde3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -47,12 +47,6 @@ from pandas._libs.dtypes cimport ( numeric_object_t, numeric_t, ) -from pandas._libs.khash cimport ( - kh_destroy_int64, - kh_init_int64, - kh_put_int64, - kh_resize_int64, -) from pandas._libs.missing cimport checknull @@ -2044,34 +2038,3 @@ def group_cummax( skipna=skipna, compute_max=True, ) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nunique( - int64_t[::1] out, - ndarray[int64_t, ndim=1] codes, - ndarray[int64_t, ndim=1] values, - const intp_t[::1] labels, - bint dropna, -): - cdef: - Py_ssize_t i, N = len(values) - int ret = 0 - int64_t val - intp_t lab - - with nogil: - table = kh_init_int64() - kh_resize_int64(table, N) - for i in range(N): - lab = labels[i] - if lab < 0: - continue - if dropna and codes[i] < 0: - continue - val = values[i] - kh_put_int64(table, val, &ret) - if ret != 0: - out[lab] += 1 - kh_destroy_int64(table) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 12d4ad02efe96..d80612def27d9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -676,7 +676,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: """ ids, _, ngroups = self.grouper.group_info val = self.obj._values - codes, _ = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) + codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) if self.grouper.has_dropped_na: mask = ids >= 0 @@ -685,21 +685,19 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: group_index = get_group_index( labels=[ids, codes], - shape=(len(ids), 2), + shape=(ngroups, len(uniques)), sort=False, xnull=dropna, ) if dropna: mask = group_index >= 0 - if (~mask).all(): + if (~mask).any(): ids = ids[mask] group_index = group_index[mask] mask = duplicated(group_index, "first") - res = np.bincount(ids[~mask]) - if res.shape[0] < ngroups: - res = np.pad(res, (0, ngroups - res.shape[0])) + res = np.bincount(ids[~mask], minlength=ngroups) ri = self.grouper.result_index result: Series | DataFrame = self.obj._constructor( From bb1e0be18d47dca937507d0026b3de7408a2ff66 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 28 Nov 2023 17:24:13 -0500 Subject: [PATCH 4/5] int32 fixup --- pandas/core/groupby/generic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d80612def27d9..2897e60e016c2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -675,6 +675,8 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: Freq: MS, dtype: int64 """ ids, _, ngroups = self.grouper.group_info + # We bincount ids below; result should always be int64 + ids = ensure_int64(ids) val = self.obj._values codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) From 89e31b9df3687f0eb21f4c1733fb37b4b9f00892 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 28 Nov 2023 22:07:43 -0500 Subject: [PATCH 5/5] fixup --- pandas/core/groupby/generic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2897e60e016c2..5a2f8d8454526 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -675,8 +675,6 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: Freq: MS, dtype: int64 """ ids, 
_, ngroups = self.grouper.group_info - # We bincount ids below; result should always be int64 - ids = ensure_int64(ids) val = self.obj._values codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) @@ -700,6 +698,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: mask = duplicated(group_index, "first") res = np.bincount(ids[~mask], minlength=ngroups) + res = ensure_int64(res) ri = self.grouper.result_index result: Series | DataFrame = self.obj._constructor(
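
For reference, the approach these patches converge on counts distinct values per group by de-duplicating (group id, value code) pairs: factorize the values, fold each pair into a single integer key, keep only the first occurrence of every key, and bincount the surviving rows per group. The sketch below reproduces that idea with public NumPy/pandas APIs only, not the internal helpers the patch uses (get_group_index, duplicated, ensure_int64); the function name nunique_per_group and its signature are illustrative and not part of the pandas codebase.

    import numpy as np
    import pandas as pd


    def nunique_per_group(group_ids, values, ngroups, dropna=True):
        # Hypothetical helper mirroring the patched SeriesGroupBy.nunique logic.
        # group_ids holds integer labels in [0, ngroups); -1 marks rows whose
        # group key was dropped (mirrors grouper.has_dropped_na).
        group_ids = np.asarray(group_ids, dtype=np.int64)
        codes, _ = pd.factorize(values, use_na_sentinel=dropna)
        codes = np.asarray(codes, dtype=np.int64)

        # Discard rows belonging to dropped group keys.
        keep = group_ids >= 0
        ids = group_ids[keep]
        codes = codes[keep]

        if dropna:
            # With use_na_sentinel=True, factorize encodes missing values as -1;
            # they must not contribute to the distinct count.
            keep = codes >= 0
            ids = ids[keep]
            codes = codes[keep]

        # Collapse each (group id, value code) pair into one int64 key, keep the
        # first occurrence of each pair, then count surviving rows per group.
        # (Patch 1's fastpath guarded this product against int64 overflow; the
        # sketch does not.)
        nvals = codes.max() + 1 if len(codes) else 1
        keys = ids * nvals + codes
        first = ~pd.Series(keys).duplicated(keep="first").to_numpy()
        return np.bincount(ids[first], minlength=ngroups)


    # Two groups: {1.0} -> 1 distinct; {2.0, NaN, 3.0} -> 2 with dropna=True.
    ids = np.array([0, 0, 1, 1, 1])
    vals = np.array([1.0, 1.0, 2.0, np.nan, 3.0])
    print(nunique_per_group(ids, vals, ngroups=2))  # [1 2]

Encoding each (group, value) pair as a single key is what lets one duplicated pass plus a bincount replace the previous lexsort/np.add.reduceat bookkeeping in nunique, which is where the performance improvement tracked in the whatsnew entry (:issue:`55972`) comes from.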