From 9c4c22cef2cd679903c1471b720507360029e007 Mon Sep 17 00:00:00 2001 From: Noam Hershtig Date: Mon, 11 Feb 2019 21:39:27 +0200 Subject: [PATCH] Refactor groupby group_prod, group_var, group_mean, group_ohlc (#25249) --- pandas/_libs/groupby.pyx | 217 ++++++++++++++++++++++++++++- pandas/_libs/groupby_helper.pxi.in | 213 ---------------------------- pandas/tests/test_algos.py | 4 +- 3 files changed, 216 insertions(+), 218 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 950ba3f89ffb7e..e6b6e2c8a0055c 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -382,6 +382,10 @@ def group_any_all(uint8_t[:] out, if values[i] == flag_val: out[lab] = flag_val +# ---------------------------------------------------------------------- +# group_add, group_prod, group_var, group_mean, group_ohlc +# ---------------------------------------------------------------------- + @cython.wraparound(False) @cython.boundscheck(False) @@ -396,9 +400,9 @@ def _group_add(floating[:, :] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, count - ndarray[floating, ndim=2] sumx, nobs + floating[:, :] sumx, nobs - if not len(values) == len(labels): + if len(values) != len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) @@ -407,7 +411,6 @@ def _group_add(floating[:, :] out, N, K = (values).shape with nogil: - for i in range(N): lab = labels[i] if lab < 0: @@ -433,5 +436,213 @@ def _group_add(floating[:, :] out, group_add_float32 = _group_add['float'] group_add_float64 = _group_add['double'] + +@cython.wraparound(False) +@cython.boundscheck(False) +def _group_prod(floating[:, :] out, + int64_t[:] counts, + floating[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=0): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + floating val, count + floating[:, :] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] < min_count: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] + + +group_prod_float32 = _group_prod['float'] +group_prod_float64 = _group_prod['double'] + + +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +def _group_var(floating[:, :] out, + int64_t[:] counts, + floating[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + floating val, ct, oldmean + floating[:, :] nobs, mean + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + mean = np.zeros_like(out) + + N, K = (values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] /= (ct - 1) + + +group_var_float32 = _group_var['float'] +group_var_float64 = _group_var['double'] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def _group_mean(floating[:, :] out, + int64_t[:] counts, + floating[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + floating val, count + floating[:, :] sumx, nobs + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count + + +group_mean_float32 = _group_mean['float'] +group_mean_float64 = _group_mean['double'] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def _group_ohlc(floating[:, :] out, + int64_t[:] counts, + floating[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + floating val, count + Py_ssize_t ngroups = len(counts) + + assert min_count == -1, "'min_count' only used in add and prod" + + if len(labels) == 0: + return + + N, K = (values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + if K > 1: + raise NotImplementedError("Argument 'values' must have only " + "one dimension") + out[:] = np.nan + + with nogil: + for i in range(N): + lab = labels[i] + if lab == -1: + continue + + counts[lab] += 1 + val = values[i, 0] + if val != val: + continue + + if out[lab, 0] != out[lab, 0]: + out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val + else: + out[lab, 1] = max(out[lab, 1], val) + out[lab, 2] = min(out[lab, 2], val) + out[lab, 3] = val + + +group_ohlc_float32 = _group_ohlc['float'] +group_ohlc_float64 = _group_ohlc['double'] + # generated from template include "groupby_helper.pxi" diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index db7018e1a72544..63cd4d6ac6ff2c 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -8,219 +8,6 @@ cdef extern from "numpy/npy_math.h": float64_t NAN "NPY_NAN" _int64_max = np.iinfo(np.int64).max -# ---------------------------------------------------------------------- -# group_prod, group_var, group_mean, group_ohlc -# ---------------------------------------------------------------------- - -{{py: - -# name, c_type -dtypes = [('float64', 'float64_t'), - ('float32', 'float32_t')] - -def get_dispatch(dtypes): - - for name, c_type in dtypes: - yield name, c_type -}} - -{{for name, c_type in get_dispatch(dtypes)}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=0): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val, count - ndarray[{{c_type}}, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] < min_count: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val, ct, oldmean - ndarray[{{c_type}}, ndim=2] nobs, mean - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = (values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) -# add passing bin edges, instead of labels - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val, count - ndarray[{{c_type}}, ndim=2] sumx, nobs - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - {{c_type}} val, count - Py_ssize_t ngroups = len(counts) - - assert min_count == -1, "'min_count' only used in add and prod" - - if len(labels) == 0: - return - - N, K = (values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out[:] = np.nan - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val - -{{endfor}} - # ---------------------------------------------------------------------- # group_nth, group_last, group_rank # ---------------------------------------------------------------------- diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3d28b17750540f..888cf78a1c66ae 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1222,7 +1222,7 @@ def test_group_var_constant(self): class TestGroupVarFloat64(GroupVarTestMixin): __test__ = True - algo = libgroupby.group_var_float64 + algo = staticmethod(libgroupby.group_var_float64) dtype = np.float64 rtol = 1e-5 @@ -1245,7 +1245,7 @@ def test_group_var_large_inputs(self): class TestGroupVarFloat32(GroupVarTestMixin): __test__ = True - algo = libgroupby.group_var_float32 + algo = staticmethod(libgroupby.group_var_float32) dtype = np.float32 rtol = 1e-2