Skip to content

Commit

Permalink
BUG: stabilize sort_values algorithms for Series and time-like Indices (
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexKirko authored and ukarroum committed Nov 2, 2020
1 parent f7fb67d commit b9c9afe
Show file tree
Hide file tree
Showing 19 changed files with 69 additions and 107 deletions.
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,13 @@ Optional libraries below the lowest tested version may still work, but are not c

See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.

.. _whatsnew_120.api.other:

Other API changes
^^^^^^^^^^^^^^^^^

- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting :class:`DataFrame` on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily the last in the list of duplicate counts, and its position corresponds to the position in the original :class:`Series`. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`)

.. ---------------------------------------------------------------------------
.. _whatsnew_120.deprecations:
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1181,10 +1181,8 @@ def compute(self, method: str) -> Series:

# slow method
if n >= len(self.obj):
reverse_it = self.keep == "last" or method == "nlargest"
ascending = method == "nsmallest"
slc = np.s_[::-1] if reverse_it else np.s_[:]
return dropped[slc].sort_values(ascending=ascending).head(n)
return dropped.sort_values(ascending=ascending).head(n)

# fast method
arr, pandas_dtype = _ensure_data(dropped.values)
Expand Down
16 changes: 8 additions & 8 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -933,9 +933,9 @@ def value_counts(
>>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
>>> index.value_counts()
3.0 2
4.0 1
2.0 1
1.0 1
2.0 1
4.0 1
dtype: int64
With `normalize` set to `True`, returns the relative frequency by
Expand All @@ -944,9 +944,9 @@ def value_counts(
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
>>> s.value_counts(normalize=True)
3.0 0.4
4.0 0.2
2.0 0.2
1.0 0.2
2.0 0.2
4.0 0.2
dtype: float64
**bins**
Expand All @@ -957,8 +957,8 @@ def value_counts(
number of half-open bins.
>>> s.value_counts(bins=3)
(2.0, 3.0] 2
(0.996, 2.0] 2
(2.0, 3.0] 2
(3.0, 4.0] 1
dtype: int64
Expand All @@ -968,10 +968,10 @@ def value_counts(
>>> s.value_counts(dropna=False)
3.0 2
NaN 1
4.0 1
2.0 1
1.0 1
2.0 1
4.0 1
NaN 1
dtype: int64
"""
result = value_counts(
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5563,8 +5563,8 @@ def value_counts(
>>> df.value_counts()
num_legs num_wings
4 0 2
6 0 1
2 2 1
6 0 1
dtype: int64
>>> df.value_counts(sort=False)
Expand All @@ -5584,8 +5584,8 @@ def value_counts(
>>> df.value_counts(normalize=True)
num_legs num_wings
4 0 0.50
6 0 0.25
2 2 0.25
6 0 0.25
dtype: float64
"""
if subset is None:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10106,7 +10106,7 @@ def describe(
categorical
count 3
unique 3
top f
top d
freq 1
Excluding numeric columns from a ``DataFrame`` description.
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4542,9 +4542,7 @@ def sort_values(

# GH 35584. Sort missing values according to na_position kwarg
# ignore na_position for MultiIndex
if not isinstance(
self, (ABCMultiIndex, ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex)
):
if not isinstance(self, ABCMultiIndex):
_as = nargsort(
items=idx, ascending=ascending, na_position=na_position, key=key
)
Expand Down
46 changes: 9 additions & 37 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
from pandas.core.indexing import check_bool_indexer
from pandas.core.internals import SingleBlockManager
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import ensure_key_mapped
from pandas.core.sorting import ensure_key_mapped, nargsort
from pandas.core.strings import StringMethods
from pandas.core.tools.datetimes import to_datetime

Expand Down Expand Up @@ -3288,29 +3288,6 @@ def sort_values(
"sort in-place you must create a copy"
)

def _try_kind_sort(arr):
arr = ensure_key_mapped(arr, key)
arr = getattr(arr, "_values", arr)

# easier to ask forgiveness than permission
try:
# if kind==mergesort, it can fail for object dtype
return arr.argsort(kind=kind)
except TypeError:
# stable sort not available for object dtype
# uses the argsort default quicksort
return arr.argsort(kind="quicksort")

arr = self._values
sorted_index = np.empty(len(self), dtype=np.int32)

bad = isna(arr)

good = ~bad
idx = ibase.default_index(len(self))

argsorted = _try_kind_sort(self[good])

if is_list_like(ascending):
if len(ascending) != 1:
raise ValueError(
Expand All @@ -3321,21 +3298,16 @@ def _try_kind_sort(arr):
if not is_bool(ascending):
raise ValueError("ascending must be boolean")

if not ascending:
argsorted = argsorted[::-1]

if na_position == "last":
n = good.sum()
sorted_index[:n] = idx[good][argsorted]
sorted_index[n:] = idx[bad]
elif na_position == "first":
n = bad.sum()
sorted_index[n:] = idx[good][argsorted]
sorted_index[:n] = idx[bad]
else:
if na_position not in ["first", "last"]:
raise ValueError(f"invalid na_position: {na_position}")

result = self._constructor(arr[sorted_index], index=self.index[sorted_index])
# GH 35922. Make sorting stable by leveraging nargsort
values_to_sort = ensure_key_mapped(self, key)._values if key else self._values
sorted_index = nargsort(values_to_sort, kind, ascending, na_position)

result = self._constructor(
self._values[sorted_index], index=self.index[sorted_index]
)

if ignore_index:
result.index = ibase.default_index(len(sorted_index))
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/boolean/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,11 @@ def test_ufunc_reduce_raises(values):
def test_value_counts_na():
arr = pd.array([True, False, pd.NA], dtype="boolean")
result = arr.value_counts(dropna=False)
expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
expected = pd.Series([1, 1], index=[False, True], dtype="Int64")
tm.assert_series_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def test_arrow_roundtrip():
def test_value_counts_na():
arr = pd.array(["a", "b", "a", pd.NA], dtype="string")
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
Expand Down
12 changes: 8 additions & 4 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,16 +153,16 @@ def test_value_counts_bins(index_or_series):
# these return the same
res4 = s1.value_counts(bins=4, dropna=True)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
tm.assert_series_equal(res4, exp4)

res4 = s1.value_counts(bins=4, dropna=False)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
tm.assert_series_equal(res4, exp4)

res4n = s1.value_counts(bins=4, normalize=True)
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2]))
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]))
tm.assert_series_equal(res4n, exp4n)

# handle NA's properly
Expand Down Expand Up @@ -239,7 +239,11 @@ def test_value_counts_datetime64(index_or_series):
tm.assert_series_equal(result, expected_s)

result = s.value_counts(dropna=False)
expected_s[pd.NaT] = 1
# GH 35922. NaN-like now sorts to the beginning of duplicate counts
idx = pd.to_datetime(
["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"]
)
expected_s = Series([3, 2, 1, 1], index=idx)
tm.assert_series_equal(result, expected_s)

unique = s.unique()
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,11 @@ def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
result = ser.sort_values(ascending=ascending, key=sort_by_key)
expected = ser.iloc[[2, 0, 1]]
if not ascending:
expected = expected[::-1]
# GH 35922. Expect stable sort
if ser.nunique() == 2:
expected = ser.iloc[[0, 1, 2]]
else:
expected = ser.iloc[[1, 0, 2]]

self.assert_series_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def test_describe_bool_frame(self):
)
result = df.describe()
expected = DataFrame(
{"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]},
{"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
Expand All @@ -79,7 +79,7 @@ def test_describe_bool_frame(self):
)
result = df.describe()
expected = DataFrame(
{"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]},
{"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/frame/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_data_frame_value_counts_default():
expected = pd.Series(
data=[2, 1, 1],
index=pd.MultiIndex.from_arrays(
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
[(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
),
)

Expand All @@ -65,7 +65,7 @@ def test_data_frame_value_counts_normalize():
expected = pd.Series(
data=[0.5, 0.25, 0.25],
index=pd.MultiIndex.from_arrays(
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
[(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
),
)

Expand All @@ -78,7 +78,7 @@ def test_data_frame_value_counts_single_col_default():
result = df.value_counts()
expected = pd.Series(
data=[2, 1, 1],
index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]),
index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]),
)

tm.assert_series_equal(result, expected)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/indexes/datetimes/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,15 +231,15 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
index = DatetimeIndex(index_dates, tz=tz, name="idx")
expected = DatetimeIndex(expected_dates, tz=tz, name="idx")

ordered = index.sort_values()
ordered = index.sort_values(na_position="first")
tm.assert_index_equal(ordered, expected)
assert ordered.freq is None

ordered = index.sort_values(ascending=False)
tm.assert_index_equal(ordered, expected[::-1])
assert ordered.freq is None

ordered, indexer = index.sort_values(return_indexer=True)
ordered, indexer = index.sort_values(return_indexer=True, na_position="first")
tm.assert_index_equal(ordered, expected)

exp = np.array([0, 4, 3, 1, 2])
Expand All @@ -249,7 +249,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
ordered, indexer = index.sort_values(return_indexer=True, ascending=False)
tm.assert_index_equal(ordered, expected[::-1])

exp = np.array([2, 1, 3, 4, 0])
exp = np.array([2, 1, 3, 0, 4])
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
assert ordered.freq is None

Expand Down
14 changes: 5 additions & 9 deletions pandas/tests/indexes/period/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def _check_freq(index, expected_index):

pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D")

result = pidx.sort_values()
result = pidx.sort_values(na_position="first")
expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D")
tm.assert_index_equal(result, expected)
assert result.freq == "D"
Expand Down Expand Up @@ -247,15 +247,15 @@ def test_order(self):
)

for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]:
ordered = idx.sort_values()
ordered = idx.sort_values(na_position="first")
tm.assert_index_equal(ordered, expected)
assert ordered.freq == "D"

ordered = idx.sort_values(ascending=False)
tm.assert_index_equal(ordered, expected[::-1])
assert ordered.freq == "D"

ordered, indexer = idx.sort_values(return_indexer=True)
ordered, indexer = idx.sort_values(return_indexer=True, na_position="first")
tm.assert_index_equal(ordered, expected)

exp = np.array([0, 4, 3, 1, 2])
Expand All @@ -265,7 +265,7 @@ def test_order(self):
ordered, indexer = idx.sort_values(return_indexer=True, ascending=False)
tm.assert_index_equal(ordered, expected[::-1])

exp = np.array([2, 1, 3, 4, 0])
exp = np.array([2, 1, 3, 0, 4])
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
assert ordered.freq == "D"

Expand Down Expand Up @@ -332,12 +332,8 @@ def test_freq_setter_deprecated(self):
idx.freq = pd.offsets.Day()


@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)")
def test_order_stability_compat():
# GH 35584. The new implementation of sort_values for Index.sort_values
# is stable when sorting in descending order. Datetime-like sort_values
# currently aren't stable. xfail should be removed after
# the implementations' behavior is synchronized (xref GH 35922)
# GH 35922. sort_values is stable both for normal and datetime-like Index
pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A")
iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx")
ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False)
Expand Down
Loading

0 comments on commit b9c9afe

Please sign in to comment.