Skip to content

Commit

Permalink
BUG: make dense ranks results scale to 100 percent (#21203)
Browse files Browse the repository at this point in the history
(cherry picked from commit b237b11)
  • Loading branch information
peterpanmj authored and jorisvandenbossche committed Jun 9, 2018
1 parent ea737c7 commit 110cf95
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 13 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ Groupby/Resample/Rolling

- Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`)
- Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`)
- Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True``

Strings
^^^^^^^
Expand Down
18 changes: 12 additions & 6 deletions pandas/_libs/groupby_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
bint is_datetimelike, object ties_method,
bint ascending, bint pct, object na_option):
"""
Provides the rank of values within each group.
Provides the rank of values within each group.

Parameters
----------
Expand Down Expand Up @@ -451,8 +451,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
"""
cdef:
TiebreakEnumType tiebreak
Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0
Py_ssize_t grp_vals_seen=1, grp_na_count=0
Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
ndarray[int64_t] _as
ndarray[float64_t, ndim=2] grp_sizes
ndarray[{{c_type}}] masked_vals
Expand Down Expand Up @@ -563,6 +563,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
dups = sum_ranks = 0
val_start = i
grp_vals_seen += 1
grp_tie_count +=1

# Similar to the previous conditional, check now if we are moving
# to a new group. If so, keep track of the index where the new
Expand All @@ -571,11 +572,16 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
# (used by pct calculations later). also be sure to reset any of
# the items helping to calculate dups
if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
for j in range(grp_start, i + 1):
grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
if tiebreak != TIEBREAK_DENSE:
for j in range(grp_start, i + 1):
grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
else:
for j in range(grp_start, i + 1):
grp_sizes[_as[j], 0] = (grp_tie_count -
(grp_na_count > 0))
dups = sum_ranks = 0
grp_na_count = 0
val_start = i + 1
grp_tie_count = 0
grp_start = i + 1
grp_vals_seen = 1

Expand Down
14 changes: 7 additions & 7 deletions pandas/tests/groupby/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ def test_rank_apply():
('first', False, False, [3., 4., 1., 5., 2.]),
('first', False, True, [.6, .8, .2, 1., .4]),
('dense', True, False, [1., 1., 3., 1., 2.]),
('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]),
('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]),
('dense', False, False, [3., 3., 1., 3., 2.]),
('dense', False, True, [.6, .6, .2, .6, .4]),
('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. / 3.]),
])
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
key = np.repeat(grps, len(vals))
Expand Down Expand Up @@ -126,7 +126,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
@pytest.mark.parametrize("grps", [
['qux'], ['qux', 'quux']])
@pytest.mark.parametrize("vals", [
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
[pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
pd.Timestamp('2018-01-06'), np.nan, np.nan]
Expand Down Expand Up @@ -167,11 +167,11 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
('dense', True, 'keep', False,
[1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
('dense', True, 'keep', True,
[0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]),
[1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]),
('dense', False, 'keep', False,
[3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
('dense', False, 'keep', True,
[.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
[3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]),
('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
('average', True, 'no_na', True,
[0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
Expand All @@ -198,10 +198,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
[0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
('dense', True, 'no_na', True,
[0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]),
[0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]),
('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
('dense', False, 'no_na', True,
[0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5])
[0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.])
])
def test_rank_args_missing(grps, vals, ties_method, ascending,
na_option, pct, exp):
Expand Down

0 comments on commit 110cf95

Please sign in to comment.