Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Dense ranking with percent now uses 100% basis #15639

Merged
merged 6 commits into from
Mar 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,7 @@ Offsets

Numeric
^^^^^^^
- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`)
- Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`)
- Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`)
- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`)
Expand Down
10 changes: 8 additions & 2 deletions pandas/_libs/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
sum_ranks = dups = 0
{{endif}}
if pct:
return ranks / count
if tiebreak == TIEBREAK_DENSE:
return ranks / total_tie_count
else:
return ranks / count
else:
return ranks

Expand Down Expand Up @@ -385,7 +388,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0
if pct:
ranks[i, :] /= count
if tiebreak == TIEBREAK_DENSE:
ranks[i, :] /= total_tie_count
else:
ranks[i, :] /= count
if axis == 0:
return ranks.T
else:
Expand Down
43 changes: 37 additions & 6 deletions pandas/tests/frame/test_rank.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*-
import pytest
from datetime import timedelta, datetime
from distutils.version import LooseVersion
from numpy import nan
import numpy as np
import pandas.util.testing as tm

from pandas import Series, DataFrame
from distutils.version import LooseVersion
from datetime import timedelta, datetime
from numpy import nan

from pandas.compat import product
from pandas.util.testing import assert_frame_equal
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
from pandas import Series, DataFrame
from pandas.compat import product


class TestRank(TestData):
Expand Down Expand Up @@ -266,3 +266,34 @@ def _check2d(df, expected, method='average', axis=0):
continue
frame = df if dtype is None else df.astype(dtype)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OT: happy to clean these tests up above / parameterize and such. (next PR).

_check2d(frame, results[method], method=method, axis=axis)


@pytest.mark.parametrize(
"method,exp", [("dense",
[[1., 1., 1.],
[1., 0.5, 2. / 3],
[1., 0.5, 1. / 3]]),
("min",
[[1. / 3, 1., 1.],
[1. / 3, 1. / 3, 2. / 3],
[1. / 3, 1. / 3, 1. / 3]]),
("max",
[[1., 1., 1.],
[1., 2. / 3, 2. / 3],
[1., 2. / 3, 1. / 3]]),
("average",
[[2. / 3, 1., 1.],
[2. / 3, 0.5, 2. / 3],
[2. / 3, 0.5, 1. / 3]]),
("first",
[[1. / 3, 1., 1.],
[2. / 3, 1. / 3, 2. / 3],
[3. / 3, 2. / 3, 1. / 3]])])
def test_rank_pct_true(method, exp):
# see gh-15630.

df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
result = df.rank(method=method, pct=True)

expected = DataFrame(exp)
tm.assert_frame_equal(result, expected)
93 changes: 93 additions & 0 deletions pandas/tests/series/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,3 +376,96 @@ def test_rank_modify_inplace(self):
s.rank()
result = s
assert_series_equal(result, expected)


# GH15630, pct should be on 100% basis when method='dense'

@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might consider putting these in a single parameterize as well (though not as clear cut as the frame case where you can share the input frame)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah...I can't think of a clean way to do this at the moment. I'll address the other changes first, and then maybe we'll see if any inspiration strikes. Otherwise, I would leave it alone.

@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1., 1.]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 2, 2. / 2, 2. / 2]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
([1, 1, 3, 3, 5, 5], [1. / 3, 1. / 3, 2. / 3, 2. / 3, 3. / 3, 3. / 3]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_dense_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='dense', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1. / 2, 1. / 2]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 3, 2. / 3, 2. / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1. / 5, 1. / 5, 4. / 5, 4. / 5, 3. / 5]),
([1, 1, 3, 3, 5, 5], [1. / 6, 1. / 6, 3. / 6, 3. / 6, 5. / 6, 5. / 6]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_min_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='min', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1., 1.]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 3, 3. / 3, 3. / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [2. / 5, 2. / 5, 5. / 5, 5. / 5, 3. / 5]),
([1, 1, 3, 3, 5, 5], [2. / 6, 2. / 6, 4. / 6, 4. / 6, 6. / 6, 6. / 6]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_max_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='max', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1.5 / 2, 1.5 / 2]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 3, 2.5 / 3, 2.5 / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3. / 5]),
([1, 1, 3, 3, 5, 5],
[1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_average_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='average', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)


@pytest.mark.parametrize('dtype', ['f8', 'i8'])
@pytest.mark.parametrize('ser, exp', [
([1], [1.]),
([1, 2], [1. / 2, 2. / 2]),
([2, 2], [1. / 2, 2. / 2.]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([1, 2, 2], [1. / 3, 2. / 3, 3. / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1. / 5, 2. / 5, 4. / 5, 5. / 5, 3. / 5]),
([1, 1, 3, 3, 5, 5], [1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 6. / 6]),
([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])])
def test_rank_first_pct(dtype, ser, exp):
s = Series(ser).astype(dtype)
result = s.rank(method='first', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)