-
-
Notifications
You must be signed in to change notification settings - Fork 18.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: stabilize sort_values algorithms for Series and time-like Indices #37310
Changes from 57 commits
151c425
7332255
546b9fa
12eb535
9b51d42
7805d1e
2b75f78
2844a97
965a547
29d47ee
151196d
d71bfc8
aff28ac
0b8aae9
e7cebc4
06931e0
0b24c3e
1b98bff
4076f0c
08aadd3
6f904e6
5c7eea9
75aad12
e503dca
759de34
12741f2
d433952
dc906df
dbf295e
cbe528e
6658b73
076fa7a
cdf63a3
dd30ec9
5cac5c7
7a68a45
4c72dbf
3599591
3ee9b8b
d63293c
a5c8f65
fc90ea9
408abe0
5f53cfc
4370f27
6099344
bfa2b28
bc004ec
cd7111e
5fbbf7d
95b9f25
1dc9b89
2946e46
488596c
c8cd8cd
ab71697
05f60d5
8669e89
00b454c
5d5f3d0
3d7f47c
351a003
8de6ac9
18bb141
9b97302
3a88ebe
d41789e
812f312
e28ce4d
0719633
61ac60d
d495064
36932cd
2156c64
e6f5741
c823043
cd66748
37a6439
d09da99
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4525,9 +4525,7 @@ def sort_values( | |
|
||
# GH 35584. Sort missing values according to na_position kwarg | ||
# ignore na_position for MultiIndex | ||
if not isinstance( | ||
self, (ABCMultiIndex, ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) | ||
): | ||
if not isinstance(self, ABCMultiIndex): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Support everything except |
||
_as = nargsort( | ||
items=idx, ascending=ascending, na_position=na_position, key=key | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -92,7 +92,7 @@ | |
from pandas.core.indexing import check_bool_indexer | ||
from pandas.core.internals import SingleBlockManager | ||
from pandas.core.shared_docs import _shared_docs | ||
from pandas.core.sorting import ensure_key_mapped | ||
from pandas.core.sorting import ensure_key_mapped, nargsort | ||
from pandas.core.strings import StringMethods | ||
from pandas.core.tools.datetimes import to_datetime | ||
|
||
|
@@ -3274,29 +3274,6 @@ def sort_values( | |
"sort in-place you must create a copy" | ||
) | ||
|
||
def _try_kind_sort(arr): | ||
arr = ensure_key_mapped(arr, key) | ||
arr = getattr(arr, "_values", arr) | ||
|
||
# easier to ask forgiveness than permission | ||
try: | ||
# if kind==mergesort, it can fail for object dtype | ||
return arr.argsort(kind=kind) | ||
except TypeError: | ||
# stable sort not available for object dtype | ||
# uses the argsort default quicksort | ||
return arr.argsort(kind="quicksort") | ||
|
||
arr = self._values | ||
sorted_index = np.empty(len(self), dtype=np.int32) | ||
|
||
bad = isna(arr) | ||
|
||
good = ~bad | ||
idx = ibase.default_index(len(self)) | ||
|
||
argsorted = _try_kind_sort(self[good]) | ||
|
||
if is_list_like(ascending): | ||
if len(ascending) != 1: | ||
raise ValueError( | ||
|
@@ -3307,21 +3284,16 @@ def _try_kind_sort(arr): | |
if not is_bool(ascending): | ||
raise ValueError("ascending must be boolean") | ||
|
||
if not ascending: | ||
argsorted = argsorted[::-1] | ||
|
||
if na_position == "last": | ||
n = good.sum() | ||
sorted_index[:n] = idx[good][argsorted] | ||
sorted_index[n:] = idx[bad] | ||
elif na_position == "first": | ||
n = bad.sum() | ||
sorted_index[n:] = idx[good][argsorted] | ||
sorted_index[:n] = idx[bad] | ||
# GH 35922. Make sorting stable by leveraging nargsort | ||
if key: | ||
ser = ensure_key_mapped(self, key) | ||
sorted_index = nargsort(ser._values, kind, ascending, na_position) | ||
else: | ||
raise ValueError(f"invalid na_position: {na_position}") | ||
sorted_index = nargsort(self._values, kind, ascending, na_position) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback Thanks for the advice! I think that the separate handling of non-missing values and the whole There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. great, i think can write like
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. leave your comment on L3290 as well |
||
|
||
result = self._constructor(arr[sorted_index], index=self.index[sorted_index]) | ||
result = self._constructor( | ||
self._values[sorted_index], index=self.index[sorted_index] | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we change |
||
|
||
if ignore_index: | ||
result.index = ibase.default_index(len(sorted_index)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -125,7 +125,11 @@ def test_sort_values(self, data_for_sorting, ascending, sort_by_key): | |
result = ser.sort_values(ascending=ascending, key=sort_by_key) | ||
expected = ser.iloc[[2, 0, 1]] | ||
if not ascending: | ||
expected = expected[::-1] | ||
# GH 35922. Expect stable sort | ||
if ser.nunique() == 2: | ||
expected = ser.iloc[[0, 1, 2]] | ||
else: | ||
expected = ser.iloc[[1, 0, 2]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some of our parametrizations in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok this is fine |
||
|
||
self.assert_series_equal(result, expected) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -231,15 +231,15 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture) | |
index = DatetimeIndex(index_dates, tz=tz, name="idx") | ||
expected = DatetimeIndex(expected_dates, tz=tz, name="idx") | ||
|
||
ordered = index.sort_values() | ||
ordered = index.sort_values(na_position="first") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NaTs used to be sorted to the beginning ignoring na_position (because their numerical representation is -big_number) - this was an ancillary bug. Now they get sorted to the end by default, and the user needs to supply |
||
tm.assert_index_equal(ordered, expected) | ||
assert ordered.freq is None | ||
|
||
ordered = index.sort_values(ascending=False) | ||
tm.assert_index_equal(ordered, expected[::-1]) | ||
assert ordered.freq is None | ||
|
||
ordered, indexer = index.sort_values(return_indexer=True) | ||
ordered, indexer = index.sort_values(return_indexer=True, na_position="first") | ||
tm.assert_index_equal(ordered, expected) | ||
|
||
exp = np.array([0, 4, 3, 1, 2]) | ||
|
@@ -249,7 +249,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture) | |
ordered, indexer = index.sort_values(return_indexer=True, ascending=False) | ||
tm.assert_index_equal(ordered, expected[::-1]) | ||
|
||
exp = np.array([2, 1, 3, 4, 0]) | ||
exp = np.array([2, 1, 3, 0, 4]) | ||
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) | ||
assert ordered.freq is None | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that
sort_values
is stable, we don't need this extra reversal stuff fornlargest
(it was used to stabilize the sort).