diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f9e6a86e4f02d..3937b6e281a42 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -309,6 +309,13 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. +.. _whatsnew_200.api.other: + +Other API changes +^^^^^^^^^^^^^^^^^ + +- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting :class:`DataFrame` on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, count of missing values is no longer the last in the list of duplicate counts, and its position corresponds to the position in the original :class:`Series`. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beggining. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`) + .. --------------------------------------------------------------------------- .. _whatsnew_120.deprecations: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a310ec5312cf4..e9e04ace784b6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1181,10 +1181,8 @@ def compute(self, method: str) -> Series: # slow method if n >= len(self.obj): - reverse_it = self.keep == "last" or method == "nlargest" ascending = method == "nsmallest" - slc = np.s_[::-1] if reverse_it else np.s_[:] - return dropped[slc].sort_values(ascending=ascending).head(n) + return dropped.sort_values(ascending=ascending).head(n) # fast method arr, pandas_dtype = _ensure_data(dropped.values) diff --git a/pandas/core/base.py b/pandas/core/base.py index a22aac926e36e..c91e4db004f2a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -933,9 +933,9 @@ def value_counts( >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 - 4.0 1 - 2.0 1 1.0 1 + 2.0 1 + 4.0 1 dtype: int64 With `normalize` set to `True`, returns the relative frequency by @@ -944,9 +944,9 @@ def value_counts( >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) >>> s.value_counts(normalize=True) 3.0 0.4 - 4.0 0.2 - 2.0 0.2 1.0 0.2 + 2.0 0.2 + 4.0 0.2 dtype: float64 **bins** @@ -957,8 +957,8 @@ def value_counts( number of half-open bins. >>> s.value_counts(bins=3) - (2.0, 3.0] 2 (0.996, 2.0] 2 + (2.0, 3.0] 2 (3.0, 4.0] 1 dtype: int64 @@ -968,10 +968,10 @@ def value_counts( >>> s.value_counts(dropna=False) 3.0 2 - NaN 1 - 4.0 1 - 2.0 1 1.0 1 + 2.0 1 + 4.0 1 + NaN 1 dtype: int64 """ result = value_counts( diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c13164a4c4f85..5134529d9c21f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5563,8 +5563,8 @@ def value_counts( >>> df.value_counts() num_legs num_wings 4 0 2 - 6 0 1 2 2 1 + 6 0 1 dtype: int64 >>> df.value_counts(sort=False) @@ -5584,8 +5584,8 @@ def value_counts( >>> df.value_counts(normalize=True) num_legs num_wings 4 0 0.50 - 6 0 0.25 2 2 0.25 + 6 0 0.25 dtype: float64 """ if subset is None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c4ca34c3b74a6..c90ab9cceea8c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10106,7 +10106,7 @@ def describe( categorical count 3 unique 3 - top f + top d freq 1 Excluding numeric columns from a ``DataFrame`` description. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d1a52011a3ad7..20a17c27fe282 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4542,9 +4542,7 @@ def sort_values( # GH 35584. Sort missing values according to na_position kwarg # ignore na_position for MultiIndex - if not isinstance( - self, (ABCMultiIndex, ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) - ): + if not isinstance(self, ABCMultiIndex): _as = nargsort( items=idx, ascending=ascending, na_position=na_position, key=key ) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2cd861cc11b28..19d07a8c5e6bf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -92,7 +92,7 @@ from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager from pandas.core.shared_docs import _shared_docs -from pandas.core.sorting import ensure_key_mapped +from pandas.core.sorting import ensure_key_mapped, nargsort from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime @@ -3288,29 +3288,6 @@ def sort_values( "sort in-place you must create a copy" ) - def _try_kind_sort(arr): - arr = ensure_key_mapped(arr, key) - arr = getattr(arr, "_values", arr) - - # easier to ask forgiveness than permission - try: - # if kind==mergesort, it can fail for object dtype - return arr.argsort(kind=kind) - except TypeError: - # stable sort not available for object dtype - # uses the argsort default quicksort - return arr.argsort(kind="quicksort") - - arr = self._values - sorted_index = np.empty(len(self), dtype=np.int32) - - bad = isna(arr) - - good = ~bad - idx = ibase.default_index(len(self)) - - argsorted = _try_kind_sort(self[good]) - if is_list_like(ascending): if len(ascending) != 1: raise ValueError( @@ -3321,21 +3298,16 @@ def _try_kind_sort(arr): if not is_bool(ascending): raise ValueError("ascending must be boolean") - if not ascending: - argsorted = argsorted[::-1] - - if na_position == "last": - n = good.sum() - sorted_index[:n] = idx[good][argsorted] - sorted_index[n:] = idx[bad] - elif na_position == "first": - n = bad.sum() - sorted_index[n:] = idx[good][argsorted] - sorted_index[:n] = idx[bad] - else: + if na_position not in ["first", "last"]: raise ValueError(f"invalid na_position: {na_position}") - result = self._constructor(arr[sorted_index], index=self.index[sorted_index]) + # GH 35922. Make sorting stable by leveraging nargsort + values_to_sort = ensure_key_mapped(self, key)._values if key else self._values + sorted_index = nargsort(values_to_sort, kind, ascending, na_position) + + result = self._constructor( + self._values[sorted_index], index=self.index[sorted_index] + ) if ignore_index: result.index = ibase.default_index(len(sorted_index)) diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 1547f08fa66b0..7665c350e3443 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -77,11 +77,11 @@ def test_ufunc_reduce_raises(values): def test_value_counts_na(): arr = pd.array([True, False, pd.NA], dtype="boolean") result = arr.value_counts(dropna=False) - expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") + expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) - expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + expected = pd.Series([1, 1], index=[False, True], dtype="Int64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 56a8e21edd004..089bbcf4e0e3f 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -301,7 +301,7 @@ def test_arrow_roundtrip(): def test_value_counts_na(): arr = pd.array(["a", "b", "a", pd.NA], dtype="string") result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") + expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64") tm.assert_series_equal(result, expected) result = arr.value_counts(dropna=True) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index def7e41e22fb1..1a6cba1ace35f 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -153,16 +153,16 @@ def test_value_counts_bins(index_or_series): # these return the same res4 = s1.value_counts(bins=4, dropna=True) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2])) tm.assert_series_equal(res4, exp4) res4 = s1.value_counts(bins=4, dropna=False) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2])) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) + exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly @@ -239,7 +239,11 @@ def test_value_counts_datetime64(index_or_series): tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) - expected_s[pd.NaT] = 1 + # GH 35922. NaN-like now sorts to the beginning of duplicate counts + idx = pd.to_datetime( + ["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"] + ) + expected_s = Series([3, 2, 1, 1], index=idx) tm.assert_series_equal(result, expected_s) unique = s.unique() diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 23e20a2c0903a..e973b1247941f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -125,7 +125,11 @@ def test_sort_values(self, data_for_sorting, ascending, sort_by_key): result = ser.sort_values(ascending=ascending, key=sort_by_key) expected = ser.iloc[[2, 0, 1]] if not ascending: - expected = expected[::-1] + # GH 35922. Expect stable sort + if ser.nunique() == 2: + expected = ser.iloc[[0, 1, 2]] + else: + expected = ser.iloc[[1, 0, 2]] self.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index a90781cf43c16..f77b7cd4a6c3b 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -56,7 +56,7 @@ def test_describe_bool_frame(self): ) result = df.describe() expected = DataFrame( - {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, + {"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]}, index=["count", "unique", "top", "freq"], ) tm.assert_frame_equal(result, expected) @@ -79,7 +79,7 @@ def test_describe_bool_frame(self): ) result = df.describe() expected = DataFrame( - {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, + {"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]}, index=["count", "unique", "top", "freq"], ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index c409b0bbe6fa9..23f9ebdb4479d 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -48,7 +48,7 @@ def test_data_frame_value_counts_default(): expected = pd.Series( data=[2, 1, 1], index=pd.MultiIndex.from_arrays( - [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] + [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] ), ) @@ -65,7 +65,7 @@ def test_data_frame_value_counts_normalize(): expected = pd.Series( data=[0.5, 0.25, 0.25], index=pd.MultiIndex.from_arrays( - [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] + [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] ), ) @@ -78,7 +78,7 @@ def test_data_frame_value_counts_single_col_default(): result = df.value_counts() expected = pd.Series( data=[2, 1, 1], - index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]), + index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 9cf0d2035fa67..1d64fde103e9e 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -231,7 +231,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture) index = DatetimeIndex(index_dates, tz=tz, name="idx") expected = DatetimeIndex(expected_dates, tz=tz, name="idx") - ordered = index.sort_values() + ordered = index.sort_values(na_position="first") tm.assert_index_equal(ordered, expected) assert ordered.freq is None @@ -239,7 +239,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture) tm.assert_index_equal(ordered, expected[::-1]) assert ordered.freq is None - ordered, indexer = index.sort_values(return_indexer=True) + ordered, indexer = index.sort_values(return_indexer=True, na_position="first") tm.assert_index_equal(ordered, expected) exp = np.array([0, 4, 3, 1, 2]) @@ -249,7 +249,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture) ordered, indexer = index.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0]) + exp = np.array([2, 1, 3, 0, 4]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 74ca6ec59736b..10134b20e7d3e 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -178,7 +178,7 @@ def _check_freq(index, expected_index): pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") - result = pidx.sort_values() + result = pidx.sort_values(na_position="first") expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D") tm.assert_index_equal(result, expected) assert result.freq == "D" @@ -247,7 +247,7 @@ def test_order(self): ) for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: - ordered = idx.sort_values() + ordered = idx.sort_values(na_position="first") tm.assert_index_equal(ordered, expected) assert ordered.freq == "D" @@ -255,7 +255,7 @@ def test_order(self): tm.assert_index_equal(ordered, expected[::-1]) assert ordered.freq == "D" - ordered, indexer = idx.sort_values(return_indexer=True) + ordered, indexer = idx.sort_values(return_indexer=True, na_position="first") tm.assert_index_equal(ordered, expected) exp = np.array([0, 4, 3, 1, 2]) @@ -265,7 +265,7 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0]) + exp = np.array([2, 1, 3, 0, 4]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq == "D" @@ -332,12 +332,8 @@ def test_freq_setter_deprecated(self): idx.freq = pd.offsets.Day() -@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)") def test_order_stability_compat(): - # GH 35584. The new implementation of sort_values for Index.sort_values - # is stable when sorting in descending order. Datetime-like sort_values - # currently aren't stable. xfail should be removed after - # the implementations' behavior is synchronized (xref GH 35922) + # GH 35922. sort_values is stable both for normal and datetime-like Index pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 6a681ede8ff42..d47582566fe94 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,14 +13,7 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import ( - CategoricalIndex, - DatetimeIndex, - MultiIndex, - PeriodIndex, - RangeIndex, - TimedeltaIndex, -) +from pandas import CategoricalIndex, MultiIndex, RangeIndex import pandas._testing as tm @@ -498,12 +491,7 @@ def test_ravel_deprecation(self, index): @pytest.mark.parametrize("na_position", [None, "middle"]) def test_sort_values_invalid_na_position(index_with_missing, na_position): - if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): - # datetime-like indices will get na_position kwarg as part of - # synchronizing duplicate-sorting behavior, because we currently expect - # them, other indices, and Series to sort differently (xref 35922) - pytest.xfail("sort_values does not support na_position kwarg") - elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): + if isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): pytest.xfail("missing value sorting order not defined for index type") if na_position not in ["first", "last"]: @@ -516,12 +504,7 @@ def test_sort_values_with_missing(index_with_missing, na_position): # GH 35584. Test that sort_values works with missing values, # sort non-missing and place missing according to na_position - if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): - # datetime-like indices will get na_position kwarg as part of - # synchronizing duplicate-sorting behavior, because we currently expect - # them, other indices, and Series to sort differently (xref 35922) - pytest.xfail("sort_values does not support na_position kwarg") - elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): + if isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): pytest.xfail("missing value sorting order not defined for index type") missing_count = np.sum(index_with_missing.isna()) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index b74160e7e0635..15b94eafe2f27 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -134,7 +134,7 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0]) + exp = np.array([2, 1, 3, 0, 4]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index e4fdfb2838b70..f22b1be672190 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -185,7 +185,7 @@ def test_value_counts_categorical_with_nan(self): ( Series([False, True, True, pd.NA]), False, - Series([2, 1, 1], index=[True, False, pd.NA]), + Series([2, 1, 1], index=[True, pd.NA, False]), ), ( Series([False, True, True, pd.NA]), @@ -195,7 +195,7 @@ def test_value_counts_categorical_with_nan(self): ( Series(range(3), index=[True, False, np.nan]).index, False, - Series([1, 1, 1], index=[True, False, pd.NA]), + Series([1, 1, 1], index=[pd.NA, False, True]), ), ], ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index aefbcee76b5d7..88286448de900 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1178,7 +1178,7 @@ def test_dropna(self): ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=False), - Series([2, 1, 1], index=[True, False, np.nan]), + Series([2, 1, 1], index=[True, np.nan, False]), ) tm.assert_series_equal( Series([10.3, 5.0, 5.0]).value_counts(dropna=True), @@ -1197,7 +1197,7 @@ def test_dropna(self): # 32-bit linux has a different ordering if IS64: result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False) - expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan]) + expected = Series([2, 1, 1], index=[5.0, np.nan, 10.3]) tm.assert_series_equal(result, expected) def test_value_counts_normalized(self): @@ -1208,12 +1208,12 @@ def test_value_counts_normalized(self): s_typed = s.astype(t) result = s_typed.value_counts(normalize=True, dropna=False) expected = Series( - [0.6, 0.2, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=t) + [0.6, 0.2, 0.2], index=Series([np.nan, 1.0, 2.0], dtype=t) ) tm.assert_series_equal(result, expected) result = s_typed.value_counts(normalize=True, dropna=True) - expected = Series([0.5, 0.5], index=Series([2.0, 1.0], dtype=t)) + expected = Series([0.5, 0.5], index=Series([1.0, 2.0], dtype=t)) tm.assert_series_equal(result, expected) def test_value_counts_uint64(self): @@ -1224,7 +1224,7 @@ def test_value_counts_uint64(self): tm.assert_series_equal(result, expected) arr = np.array([-1, 2 ** 63], dtype=object) - expected = Series([1, 1], index=[-1, 2 ** 63]) + expected = Series([1, 1], index=[2 ** 63, -1]) result = algos.value_counts(arr) # 32-bit linux has a different ordering