Skip to content

Commit

Permalink
BUG: stabilize sort_values algorithms for Series and time-like Indices (
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexKirko authored and ukarroum committed Nov 2, 2020
1 parent f7fb67d commit b9c9afe
Show file tree
Hide file tree
Showing 19 changed files with 69 additions and 107 deletions.
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,13 @@ Optional libraries below the lowest tested version may still work, but are not c

See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.

.. _whatsnew_120.api.other:

Other API changes
^^^^^^^^^^^^^^^^^

- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting :class:`DataFrame` on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily the last in the list of duplicate counts, and its position corresponds to the position in the original :class:`Series`. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`)

.. ---------------------------------------------------------------------------
.. _whatsnew_120.deprecations:
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1181,10 +1181,8 @@ def compute(self, method: str) -> Series:

# slow method
if n >= len(self.obj):
reverse_it = self.keep == "last" or method == "nlargest"
ascending = method == "nsmallest"
slc = np.s_[::-1] if reverse_it else np.s_[:]
return dropped[slc].sort_values(ascending=ascending).head(n)
return dropped.sort_values(ascending=ascending).head(n)

# fast method
arr, pandas_dtype = _ensure_data(dropped.values)
Expand Down
16 changes: 8 additions & 8 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -933,9 +933,9 @@ def value_counts(
>>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
>>> index.value_counts()
3.0 2
4.0 1
2.0 1
1.0 1
2.0 1
4.0 1
dtype: int64
With `normalize` set to `True`, returns the relative frequency by
Expand All @@ -944,9 +944,9 @@ def value_counts(
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
>>> s.value_counts(normalize=True)
3.0 0.4
4.0 0.2
2.0 0.2
1.0 0.2
2.0 0.2
4.0 0.2
dtype: float64
**bins**
Expand All @@ -957,8 +957,8 @@ def value_counts(
number of half-open bins.
>>> s.value_counts(bins=3)
(2.0, 3.0] 2
(0.996, 2.0] 2
(2.0, 3.0] 2
(3.0, 4.0] 1
dtype: int64
Expand All @@ -968,10 +968,10 @@ def value_counts(
>>> s.value_counts(dropna=False)
3.0 2
NaN 1
4.0 1
2.0 1
1.0 1
2.0 1
4.0 1
NaN 1
dtype: int64
"""
result = value_counts(
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5563,8 +5563,8 @@ def value_counts(
>>> df.value_counts()
num_legs num_wings
4 0 2
6 0 1
2 2 1
6 0 1
dtype: int64
>>> df.value_counts(sort=False)
Expand All @@ -5584,8 +5584,8 @@ def value_counts(
>>> df.value_counts(normalize=True)
num_legs num_wings
4 0 0.50
6 0 0.25
2 2 0.25
6 0 0.25
dtype: float64
"""
if subset is None:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10106,7 +10106,7 @@ def describe(
categorical
count 3
unique 3
top f
top d
freq 1
Excluding numeric columns from a ``DataFrame`` description.
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4542,9 +4542,7 @@ def sort_values(

# GH 35584. Sort missing values according to na_position kwarg
# ignore na_position for MultiIndex
if not isinstance(
self, (ABCMultiIndex, ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex)
):
if not isinstance(self, ABCMultiIndex):
_as = nargsort(
items=idx, ascending=ascending, na_position=na_position, key=key
)
Expand Down
46 changes: 9 additions & 37 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
from pandas.core.indexing import check_bool_indexer
from pandas.core.internals import SingleBlockManager
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import ensure_key_mapped
from pandas.core.sorting import ensure_key_mapped, nargsort
from pandas.core.strings import StringMethods
from pandas.core.tools.datetimes import to_datetime

Expand Down Expand Up @@ -3288,29 +3288,6 @@ def sort_values(
"sort in-place you must create a copy"
)

def _try_kind_sort(arr):
arr = ensure_key_mapped(arr, key)
arr = getattr(arr, "_values", arr)

# easier to ask forgiveness than permission
try:
# if kind==mergesort, it can fail for object dtype
return arr.argsort(kind=kind)
except TypeError:
# stable sort not available for object dtype
# uses the argsort default quicksort
return arr.argsort(kind="quicksort")

arr = self._values
sorted_index = np.empty(len(self), dtype=np.int32)

bad = isna(arr)

good = ~bad
idx = ibase.default_index(len(self))

argsorted = _try_kind_sort(self[good])

if is_list_like(ascending):
if len(ascending) != 1:
raise ValueError(
Expand All @@ -3321,21 +3298,16 @@ def _try_kind_sort(arr):
if not is_bool(ascending):
raise ValueError("ascending must be boolean")

if not ascending:
argsorted = argsorted[::-1]

if na_position == "last":
n = good.sum()
sorted_index[:n] = idx[good][argsorted]
sorted_index[n:] = idx[bad]
elif na_position == "first":
n = bad.sum()
sorted_index[n:] = idx[good][argsorted]
sorted_index[:n] = idx[bad]
else:
if na_position not in ["first", "last"]:
raise ValueError(f"invalid na_position: {na_position}")

result = self._constructor(arr[sorted_index], index=self.index[sorted_index])
# GH 35922. Make sorting stable by leveraging nargsort
values_to_sort = ensure_key_mapped(self, key)._values if key else self._values
sorted_index = nargsort(values_to_sort, kind, ascending, na_position)

result = self._constructor(
self._values[sorted_index], index=self.index[sorted_index]
)

if ignore_index:
result.index = ibase.default_index(len(sorted_index))
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/boolean/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,11 @@ def test_ufunc_reduce_raises(values):
def test_value_counts_na():
arr = pd.array([True, False, pd.NA], dtype="boolean")
result = arr.value_counts(dropna=False)
expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
expected = pd.Series([1, 1], index=[False, True], dtype="Int64")
tm.assert_series_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def test_arrow_roundtrip():
def test_value_counts_na():
arr = pd.array(["a", "b", "a", pd.NA], dtype="string")
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
Expand Down
12 changes: 8 additions & 4 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,16 +153,16 @@ def test_value_counts_bins(index_or_series):
# these return the same
res4 = s1.value_counts(bins=4, dropna=True)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
tm.assert_series_equal(res4, exp4)

res4 = s1.value_counts(bins=4, dropna=False)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
tm.assert_series_equal(res4, exp4)

res4n = s1.value_counts(bins=4, normalize=True)
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2]))
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]))
tm.assert_series_equal(res4n, exp4n)

# handle NA's properly
Expand Down Expand Up @@ -239,7 +239,11 @@ def test_value_counts_datetime64(index_or_series):
tm.assert_series_equal(result, expected_s)

result = s.value_counts(dropna=False)
expected_s[pd.NaT] = 1
# GH 35922. NaN-like now sorts to the beginning of duplicate counts
idx = pd.to_datetime(
["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"]
)
expected_s = Series([3, 2, 1, 1], index=idx)
tm.assert_series_equal(result, expected_s)

unique = s.unique()
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,11 @@ def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
result = ser.sort_values(ascending=ascending, key=sort_by_key)
expected = ser.iloc[[2, 0, 1]]
if not ascending:
expected = expected[::-1]
# GH 35922. Expect stable sort
if ser.nunique() == 2:
expected = ser.iloc[[0, 1, 2]]
else:
expected = ser.iloc[[1, 0, 2]]

self.assert_series_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def test_describe_bool_frame(self):
)
result = df.describe()
expected = DataFrame(
{"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]},
{"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
Expand All @@ -79,7 +79,7 @@ def test_describe_bool_frame(self):
)
result = df.describe()
expected = DataFrame(
{"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]},
{"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/frame/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_data_frame_value_counts_default():
expected = pd.Series(
data=[2, 1, 1],
index=pd.MultiIndex.from_arrays(
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
[(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
),
)

Expand All @@ -65,7 +65,7 @@ def test_data_frame_value_counts_normalize():
expected = pd.Series(
data=[0.5, 0.25, 0.25],
index=pd.MultiIndex.from_arrays(
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
[(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
),
)

Expand All @@ -78,7 +78,7 @@ def test_data_frame_value_counts_single_col_default():
result = df.value_counts()
expected = pd.Series(
data=[2, 1, 1],
index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]),
index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]),
)

tm.assert_series_equal(result, expected)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/indexes/datetimes/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,15 +231,15 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
index = DatetimeIndex(index_dates, tz=tz, name="idx")
expected = DatetimeIndex(expected_dates, tz=tz, name="idx")

ordered = index.sort_values()
ordered = index.sort_values(na_position="first")
tm.assert_index_equal(ordered, expected)
assert ordered.freq is None

ordered = index.sort_values(ascending=False)
tm.assert_index_equal(ordered, expected[::-1])
assert ordered.freq is None

ordered, indexer = index.sort_values(return_indexer=True)
ordered, indexer = index.sort_values(return_indexer=True, na_position="first")
tm.assert_index_equal(ordered, expected)

exp = np.array([0, 4, 3, 1, 2])
Expand All @@ -249,7 +249,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
ordered, indexer = index.sort_values(return_indexer=True, ascending=False)
tm.assert_index_equal(ordered, expected[::-1])

exp = np.array([2, 1, 3, 4, 0])
exp = np.array([2, 1, 3, 0, 4])
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
assert ordered.freq is None

Expand Down
14 changes: 5 additions & 9 deletions pandas/tests/indexes/period/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def _check_freq(index, expected_index):

pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D")

result = pidx.sort_values()
result = pidx.sort_values(na_position="first")
expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D")
tm.assert_index_equal(result, expected)
assert result.freq == "D"
Expand Down Expand Up @@ -247,15 +247,15 @@ def test_order(self):
)

for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]:
ordered = idx.sort_values()
ordered = idx.sort_values(na_position="first")
tm.assert_index_equal(ordered, expected)
assert ordered.freq == "D"

ordered = idx.sort_values(ascending=False)
tm.assert_index_equal(ordered, expected[::-1])
assert ordered.freq == "D"

ordered, indexer = idx.sort_values(return_indexer=True)
ordered, indexer = idx.sort_values(return_indexer=True, na_position="first")
tm.assert_index_equal(ordered, expected)

exp = np.array([0, 4, 3, 1, 2])
Expand All @@ -265,7 +265,7 @@ def test_order(self):
ordered, indexer = idx.sort_values(return_indexer=True, ascending=False)
tm.assert_index_equal(ordered, expected[::-1])

exp = np.array([2, 1, 3, 4, 0])
exp = np.array([2, 1, 3, 0, 4])
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
assert ordered.freq == "D"

Expand Down Expand Up @@ -332,12 +332,8 @@ def test_freq_setter_deprecated(self):
idx.freq = pd.offsets.Day()


@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)")
def test_order_stability_compat():
# GH 35584. The new implementation of sort_values for Index.sort_values
# is stable when sorting in descending order. Datetime-like sort_values
# currently aren't stable. xfail should be removed after
# the implementations' behavior is synchronized (xref GH 35922)
# GH 35922. sort_values is stable both for normal and datetime-like Index
pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A")
iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx")
ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False)
Expand Down
Loading

0 comments on commit b9c9afe

Please sign in to comment.