BUG: stabilize sort_values algorithms for Series and time-like Indices #37310

Merged Oct 31, 2020 · 79 commits
Changes from 48 commits

Commits
All commits are by AlexKirko.

151c425  BUG: stabilize sorting in Series.sort_values (Oct 20, 2020)
7332255  DOC: add comment to nargsort call in Series.sort_values (Oct 20, 2020)
546b9fa  use nargsort with indices: Period, DateTime, TimeDelta (Oct 21, 2020)
12eb535  Merge branch 'master' into stable-dupe-sort (Oct 21, 2020)
9b51d42  mv NaNs to the end of dupe lists in value_counts (Oct 21, 2020)
7805d1e  CLN: remove extra comment indents (Oct 21, 2020)
2b75f78  attempt to mimic previous count_values behavior by reversing before sort (Oct 21, 2020)
2844a97  CLN: clean-up unnecessary import (Oct 21, 2020)
965a547  Revert "attempt to mimic previous count_values behavior by reversing … (Oct 21, 2020)
29d47ee  TST: alter tests in test_algos (Oct 21, 2020)
151196d  TST: alter value_counts dupe order in boolean/test_function (Oct 21, 2020)
d71bfc8  mv NaT to end of dupe sort order, alter test_value_counts (Oct 21, 2020)
aff28ac  REFACT: use tuple unpacking for element swap (Oct 21, 2020)
0b8aae9  DOC: clarify comments in algorithms/value_counts (Oct 22, 2020)
e7cebc4  stop forcing NaN-like to be at the end of dupe order (Oct 22, 2020)
06931e0  TST: NaN-like is now first among duplicates in count_values (Oct 22, 2020)
0b24c3e  CLN: remove unnecessary is_bool import in series.py (Oct 22, 2020)
1b98bff  TST: value_counts NaN dupe order change in test_string.py (Oct 22, 2020)
4076f0c  TST: value_counts NaN dupe order in test_value_counts.py (Oct 22, 2020)
08aadd3  CLN: rm unnecessary assignment from test_value_counts (Oct 22, 2020)
6f904e6  TST: expect stable sort in extension/base/methods.py (Oct 22, 2020)
5c7eea9  BUG: support objs that raise when cast to their class (Oct 22, 2020)
75aad12  TST: fix stable sort expectation in test_sort_values in methods.py (Oct 22, 2020)
e503dca  TST: change top expect for dupe counts in frame/test_describe (Oct 22, 2020)
759de34  BUG: clean up crutches in Series.nlargest (Oct 22, 2020)
12741f2  TST: change dupe order expect in frame/methods/test_value_counts (Oct 22, 2020)
d433952  TST: change dupe order expectation in indexes/datetimes/test_ops (Oct 22, 2020)
dc906df  TST: specify na-position in indexes/datetimes/test_ops.py (Oct 22, 2020)
dbf295e  TST: specify na-position in indexes/period/test_ops.py (Oct 22, 2020)
cbe528e  TST: remove xfail from test_order_stability_compat (Oct 22, 2020)
6658b73  TST: change dupe order expect for indexes/timedeltas/test_ops (Oct 22, 2020)
076fa7a  BUG: reintroduce ascending param error-catching to sort_values (Oct 22, 2020)
cdf63a3  BUG: reintroduce proper key func support to Series.sort_values (Oct 22, 2020)
dd30ec9  BUG: fix bug in key func support (Oct 22, 2020)
5cac5c7  TST: alter dupe order expect in series/methods/test_value_counts (Oct 22, 2020)
7a68a45  CLN: remove unused variable in Series.sort_values (Oct 23, 2020)
4c72dbf  BUG: set values in key func support in Series.sort_values (Oct 23, 2020)
3599591  DOC: add whatsnew (Oct 23, 2020)
3ee9b8b  Merge branch 'master' into stable-dupe-sort (Oct 23, 2020)
d63293c  BUG: add SparseArray sorting with key func support (Oct 23, 2020)
a5c8f65  TST: remove datetime-like xfails when testing indices with missing (Oct 23, 2020)
fc90ea9  TST: fix expect dupe sort order in doctests (Oct 23, 2020)
408abe0  BUG: fix bug in Series reconstruction after sorting (Oct 23, 2020)
5f53cfc  TST: fix expect dupe sort order in more doctests (Oct 23, 2020)
4370f27  BUG: support key func changing ndarray dtype (Oct 23, 2020)
6099344  REFACT: reuse keyed (Oct 23, 2020)
bfa2b28  TST: remove datetime-like xfails from invalid_na_position (Oct 23, 2020)
bc004ec  TST: fix expect dupe sort order in more doctests in base.py (Oct 23, 2020)
cd7111e  CLN: remove unnecessary imports in indexes/test_common.py (Oct 23, 2020)
5fbbf7d  CLN: run black (Oct 23, 2020)
95b9f25  Merge branch 'master' into stable-dupe-sort (Oct 23, 2020)
1dc9b89  incorporate both conflicting whatsnew (Oct 27, 2020)
2946e46  try removing if in ensure_key_mapped to find tests (Oct 27, 2020)
488596c  BUG: just apply key func to the whole Series in sort_values (Oct 27, 2020)
c8cd8cd  remove legacy try/except with default to quicksort (Oct 27, 2020)
ab71697  DOC: expand whatsnew (Oct 27, 2020)
05f60d5  CLN: remove unnecessary SparseArray import (Oct 27, 2020)
8669e89  restart tests (Oct 27, 2020)
00b454c  restart tests again (windows_np18 misbehaving) (Oct 27, 2020)
5d5f3d0  restart tests again (Oct 27, 2020)
3d7f47c  DOC: add information on sorting NaTs to whatsnew (Oct 27, 2020)
351a003  DOC: phrasing change in whatsnew (Oct 27, 2020)
8de6ac9  Merge branch 'master' into stable-dupe-sort (Oct 28, 2020)
18bb141  DOC: clarify whatsnew (Oct 28, 2020)
9b97302  CLN: clean up unnecessary newlines in sorting.py (Oct 28, 2020)
3a88ebe  DOC: clarify whatsnew some more (Oct 28, 2020)
d41789e  bring back na_position validation in Series.sort_values (Oct 28, 2020)
812f312  DOC: add to whatsnew (Oct 28, 2020)
e28ce4d  DOC: clarify NaTs sorting changes in whatsnew (Oct 28, 2020)
0719633  DOC: add other api changes to whatsnew; move doc there (Oct 28, 2020)
61ac60d  CLN: run black (Oct 28, 2020)
d495064  Merge branch 'master' into stable-dupe-sort (Oct 29, 2020)
36932cd  DOC: attempt fixing malformed link in whatsnew (Oct 29, 2020)
2156c64  Revert "DOC: attempt fixing malformed link in whatsnew" (Oct 29, 2020)
e6f5741  DOC: fix broken link in whatsnew (Oct 29, 2020)
c823043  restart tests (Oct 29, 2020)
cd66748  Merge branch 'master' into stable-dupe-sort (Oct 30, 2020)
37a6439  REFACT: clean up key if/else in Series.sort_values (Oct 31, 2020)
d09da99  Merge branch 'master' into stable-dupe-sort (Oct 31, 2020)
doc/source/whatsnew/v1.2.0.rst (1 change: 1 addition & 0 deletions)
@@ -531,6 +531,7 @@ Other
- Fixed metadata propagation in the :class:`Series.dt` and :class:`Series.str` accessors and :class:`DataFrame.duplicated` and :class:`DataFrame.stack` methods (:issue:`28283`)
- Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`)
- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`)
- Sorting in descending order being unstable when using :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses (:issue:`35992`)
Review comment (Contributor): Can you expand this slightly to say that the sorted results will differ from prior versions? I think this should also move to the API changes section.

Reply (AlexKirko, Member Author, Oct 27, 2020): @jreback Expanded the message quite a bit, created the "Other API changes" section in whatsnew, and moved the doc there.
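To illustrate the behavior the whatsnew entry describes, here is a small sketch of my own (not part of the PR; it assumes a pandas build with this change applied): a stable descending sort keeps duplicate timestamps in their original relative order.

```python
import pandas as pd

# Two equal timestamps at positions 0 and 2
idx = pd.DatetimeIndex(["2020-01-02", "2020-01-01", "2020-01-02"])

ordered, indexer = idx.sort_values(return_indexer=True, ascending=False)

# With a stable descending sort, the tied "2020-01-02" entries keep their
# original relative order, so the indexer should be [0, 2, 1].
print(indexer)
```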


.. ---------------------------------------------------------------------------

pandas/core/algorithms.py (4 changes: 1 addition & 3 deletions)
@@ -1181,10 +1181,8 @@ def compute(self, method: str) -> Series:

# slow method
if n >= len(self.obj):
reverse_it = self.keep == "last" or method == "nlargest"
ascending = method == "nsmallest"
slc = np.s_[::-1] if reverse_it else np.s_[:]
return dropped[slc].sort_values(ascending=ascending).head(n)
return dropped.sort_values(ascending=ascending).head(n)
Note (AlexKirko, Member Author): Now that sort_values is stable, we don't need this extra reversal stuff for nlargest (it was used to stabilize the sort).
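A small sketch of my own (not from the PR) of the equivalence this slow path now relies on: with a stable sort underneath, sort_values(ascending=False).head(n) already keeps tied values in their original order, so no explicit reversal is needed.

```python
import pandas as pd

s = pd.Series([3, 3, 1], index=["a", "b", "c"])

# Tied values keep their original order ("a" before "b") in both cases.
print(s.nlargest(2))
print(s.sort_values(ascending=False).head(2))
```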


# fast method
arr, pandas_dtype = _ensure_data(dropped.values)
pandas/core/base.py (16 changes: 8 additions & 8 deletions)
@@ -933,9 +933,9 @@ def value_counts(
>>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
>>> index.value_counts()
3.0 2
4.0 1
2.0 1
1.0 1
2.0 1
4.0 1
dtype: int64

With `normalize` set to `True`, returns the relative frequency by
@@ -944,9 +944,9 @@
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
>>> s.value_counts(normalize=True)
3.0 0.4
4.0 0.2
2.0 0.2
1.0 0.2
2.0 0.2
4.0 0.2
dtype: float64

**bins**
@@ -957,8 +957,8 @@
number of half-open bins.

>>> s.value_counts(bins=3)
(2.0, 3.0] 2
(0.996, 2.0] 2
(2.0, 3.0] 2
(3.0, 4.0] 1
dtype: int64

@@ -968,10 +968,10 @@

>>> s.value_counts(dropna=False)
3.0 2
NaN 1
4.0 1
2.0 1
1.0 1
2.0 1
4.0 1
NaN 1
dtype: int64
"""
result = value_counts(
pandas/core/frame.py (4 changes: 2 additions & 2 deletions)
@@ -5535,8 +5535,8 @@ def value_counts(
>>> df.value_counts()
num_legs num_wings
4 0 2
6 0 1
2 2 1
6 0 1
dtype: int64

>>> df.value_counts(sort=False)
@@ -5556,8 +5556,8 @@
>>> df.value_counts(normalize=True)
num_legs num_wings
4 0 0.50
6 0 0.25
2 2 0.25
6 0 0.25
dtype: float64
"""
if subset is None:
pandas/core/generic.py (2 changes: 1 addition & 1 deletion)
@@ -10108,7 +10108,7 @@ def describe(
categorical
count 3
unique 3
top f
top d
freq 1

Excluding numeric columns from a ``DataFrame`` description.
pandas/core/indexes/base.py (4 changes: 1 addition & 3 deletions)
@@ -4525,9 +4525,7 @@ def sort_values(

# GH 35584. Sort missing values according to na_position kwarg
# ignore na_position for MultiIndex
if not isinstance(
self, (ABCMultiIndex, ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex)
):
if not isinstance(self, ABCMultiIndex):
Note (AlexKirko, Member Author): Support everything except MultiIndex (see the PR header).
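Because datetime-like indexes now go through nargsort as well, na_position is honored for them. A quick sketch of my own (not from the PR, assuming this change is applied):

```python
import pandas as pd

idx = pd.DatetimeIndex([pd.NaT, "2020-01-02", "2020-01-01"])

# NaT now sorts to the end by default; pass na_position="first" to get the
# old placement at the front.
print(idx.sort_values())
print(idx.sort_values(na_position="first"))
```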

_as = nargsort(
items=idx, ascending=ascending, na_position=na_position, key=key
)
pandas/core/series.py (58 changes: 19 additions & 39 deletions)
@@ -72,7 +72,7 @@
from pandas.core.aggregation import aggregate, transform
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.categorical import CategoricalAccessor
from pandas.core.arrays.sparse import SparseAccessor
from pandas.core.arrays.sparse import SparseAccessor, SparseArray
import pandas.core.common as com
from pandas.core.construction import (
array as pd_array,
@@ -92,7 +92,7 @@
from pandas.core.indexing import check_bool_indexer
from pandas.core.internals import SingleBlockManager
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import ensure_key_mapped
from pandas.core.sorting import ensure_key_mapped, nargsort
from pandas.core.strings import StringMethods
from pandas.core.tools.datetimes import to_datetime

@@ -3274,29 +3274,6 @@ def sort_values(
"sort in-place you must create a copy"
)

def _try_kind_sort(arr):
arr = ensure_key_mapped(arr, key)
arr = getattr(arr, "_values", arr)

# easier to ask forgiveness than permission
try:
# if kind==mergesort, it can fail for object dtype
return arr.argsort(kind=kind)
except TypeError:
# stable sort not available for object dtype
# uses the argsort default quicksort
return arr.argsort(kind="quicksort")

arr = self._values
sorted_index = np.empty(len(self), dtype=np.int32)

bad = isna(arr)

good = ~bad
idx = ibase.default_index(len(self))

argsorted = _try_kind_sort(self[good])

if is_list_like(ascending):
if len(ascending) != 1:
raise ValueError(
@@ -3307,21 +3284,24 @@ def _try_kind_sort(arr):
if not is_bool(ascending):
raise ValueError("ascending must be boolean")

if not ascending:
argsorted = argsorted[::-1]

if na_position == "last":
n = good.sum()
sorted_index[:n] = idx[good][argsorted]
sorted_index[n:] = idx[bad]
elif na_position == "first":
n = bad.sum()
sorted_index[n:] = idx[good][argsorted]
sorted_index[:n] = idx[bad]
else:
raise ValueError(f"invalid na_position: {na_position}")
arr = self._values

result = self._constructor(arr[sorted_index], index=self.index[sorted_index])
if key:
if isinstance(arr, SparseArray):
# SparseArray doesn't store NaNs item-by-item, so pass everything
arr = ensure_key_mapped(self, key)._values
Note (AlexKirko, Member Author): SparseArray doesn't support item assignment, so we just pass everything. The key func automatically runs only on non-NaNs.

Reply (Contributor): Hmm, it would be nice to unify this.

Reply (AlexKirko, Member Author): I agree, but so far I have been unable to. I would appreciate any suggestions. The problem is that SparseArray doesn't support setting elements, so you can't do arr[good] = something. The workarounds I was able to Google are far too ugly to use.

Reply (Contributor): Why do you need to convert this to _values before the assignment? I think that would make this agnostic (and you could just select ._values when passing to nargsort).

Reply (AlexKirko, Member Author): Thanks! Since we got rid of processing non-missing and missing values separately, this is now fixed.
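For context, a minimal sketch of my own (not from the PR) of the limitation being discussed; it assumes SparseArray still rejects item assignment with a TypeError, as in recent pandas versions.

```python
import numpy as np
import pandas as pd

arr = pd.arrays.SparseArray([1.0, np.nan, 2.0])

# SparseArray stores data sparsely and does not implement __setitem__,
# so an in-place masked assignment like arr[mask] = keyed is not possible.
try:
    arr[0] = 5.0
except TypeError as err:
    print("item assignment not supported:", err)
```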

else:
good = ~isna(arr)
keyed = ensure_key_mapped(self[good], key)._values
arr = arr.astype(keyed.dtype)
arr[good] = keyed

# GH 35922. Make sorting stable by leveraging nargsort
sorted_index = nargsort(arr, kind, ascending, na_position)

result = self._constructor(
self._values[sorted_index], index=self.index[sorted_index]
)
Note (AlexKirko, Member Author): Since we change arr when we sort with a key function, use the original values here.
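A short illustration of my own (not from the PR) of why the original values matter: the key's output only determines the ordering, while the returned Series must still contain the unmodified values.

```python
import pandas as pd

s = pd.Series(["b", "B", "a"])

# Sorted by the lower-cased key, but the result keeps the original strings;
# the tie between "b" and "B" is resolved stably (original order preserved).
print(s.sort_values(key=lambda x: x.str.lower()))
```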


if ignore_index:
result.index = ibase.default_index(len(sorted_index))
pandas/core/sorting.py (16 changes: 14 additions & 2 deletions)
@@ -374,7 +374,16 @@ def nargsort(
if not ascending:
non_nans = non_nans[::-1]
non_nan_idx = non_nan_idx[::-1]
indexer = non_nan_idx[non_nans.argsort(kind=kind)]

# GH 35922. Move support for object sort here from Series.sort_values
Review comment (Contributor): This comment is not super useful; what would help is an explanation of why you need this try/except, e.g. why the code from L380 was moved here.

Reply (AlexKirko, Member Author): No longer relevant, since I removed the try/except.

try:
# if kind==mergesort, it can fail for object dtype
Review comment (Contributor): Why can this fail?

Reply (AlexKirko, Member Author, Oct 27, 2020): @jreback I moved this code from Series.sort_values, fearing that I didn't understand why it was necessary, but it doesn't seem to be. I've removed the try/except, tested sorting object-dtype Series with different kind settings, and run the test suite, and nothing goes wrong. I think we can safely drop it.

indexer = non_nan_idx[non_nans.argsort(kind=kind)]
except TypeError:
# stable sort not available for object dtype
# uses the argsort default quicksort
indexer = non_nan_idx[non_nans.argsort(kind="quicksort")]

if not ascending:
indexer = indexer[::-1]
# Finally, place the NaNs at the end or the beginning according to
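Related to the discussion above, a quick check of my own (not from the PR): on the NumPy versions pandas currently supports, a stable/mergesort argsort works fine on comparable object-dtype data, which is consistent with dropping the quicksort fallback.

```python
import numpy as np

arr = np.array(["b", "a", "b"], dtype=object)

# Mergesort (stable) on object dtype: ties keep their original order.
print(np.argsort(arr, kind="mergesort"))  # expected: [1 0 2]
```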
@@ -499,7 +508,10 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None):
result = Index(result)
else:
type_of_values = type(values)
result = type_of_values(result) # try to revert to original type otherwise
# GH 35922. Support sorting objects that raise when cast to their type
if not isinstance(result, type_of_values):
# try to revert to original type otherwise
result = type_of_values(result)
Note (AlexKirko, Member Author): Some of the supported types don't like being cast to themselves.

Review comment (Contributor): What does this fail on?

Reply (AlexKirko, Member Author): It seems other fixes made this safeguard unnecessary. Now that we pass a Series into the function, there are no more failures. If I remember correctly, it was failing on tz-aware datetimes: a key function would strip the timezone information, and then the cast would fail because it wouldn't know what to do with the timezones in values.

except TypeError:
raise TypeError(
f"User-provided `key` function returned an invalid type {type(result)} \
pandas/tests/arrays/boolean/test_function.py (4 changes: 2 additions & 2 deletions)
@@ -77,11 +77,11 @@ def test_ufunc_reduce_raises(values):
def test_value_counts_na():
arr = pd.array([True, False, pd.NA], dtype="boolean")
result = arr.value_counts(dropna=False)
expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
expected = pd.Series([1, 1], index=[False, True], dtype="Int64")
tm.assert_series_equal(result, expected)


pandas/tests/arrays/string_/test_string.py (2 changes: 1 addition & 1 deletion)
@@ -301,7 +301,7 @@ def test_arrow_roundtrip():
def test_value_counts_na():
arr = pd.array(["a", "b", "a", pd.NA], dtype="string")
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
pandas/tests/base/test_value_counts.py (12 changes: 8 additions & 4 deletions)
@@ -153,16 +153,16 @@ def test_value_counts_bins(index_or_series):
# these return the same
res4 = s1.value_counts(bins=4, dropna=True)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
tm.assert_series_equal(res4, exp4)

res4 = s1.value_counts(bins=4, dropna=False)
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
tm.assert_series_equal(res4, exp4)

res4n = s1.value_counts(bins=4, normalize=True)
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2]))
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]))
tm.assert_series_equal(res4n, exp4n)

# handle NA's properly
@@ -239,7 +239,11 @@ def test_value_counts_datetime64(index_or_series):
tm.assert_series_equal(result, expected_s)

result = s.value_counts(dropna=False)
expected_s[pd.NaT] = 1
# GH 35922. NaN-like now sorts to the beginning of duplicate counts
idx = pd.to_datetime(
["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"]
)
expected_s = Series([3, 2, 1, 1], index=idx)
tm.assert_series_equal(result, expected_s)

unique = s.unique()
pandas/tests/extension/base/methods.py (6 changes: 5 additions & 1 deletion)
@@ -125,7 +125,11 @@ def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
result = ser.sort_values(ascending=ascending, key=sort_by_key)
expected = ser.iloc[[2, 0, 1]]
if not ascending:
expected = expected[::-1]
# GH 35922. Expect stable sort
if ser.nunique() == 2:
expected = ser.iloc[[0, 1, 2]]
else:
expected = ser.iloc[[1, 0, 2]]
Note (AlexKirko, Member Author, Oct 23, 2020): Some of our parametrizations in data_for_sorting contain duplicates and some don't. Since they are all structured the same way, we can detect the different expected sort orders by checking the number of unique values. If it looks ugly to you, we could split data_for_sorting for this test.

Reply (Contributor): OK, this is fine.
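For intuition, a tiny sketch of my own (not the fixture used by these tests, and assuming a pandas build with this PR's change) of what a stable descending sort means for duplicates:

```python
import pandas as pd

s = pd.Series([2, 1, 2])

# Using an explicitly stable kind: the two tied 2s keep positions 0 then 2,
# so the resulting index order is [0, 2, 1].
print(s.sort_values(ascending=False, kind="mergesort").index.tolist())
```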


self.assert_series_equal(result, expected)

pandas/tests/frame/methods/test_describe.py (4 changes: 2 additions & 2 deletions)
@@ -56,7 +56,7 @@ def test_describe_bool_frame(self):
)
result = df.describe()
expected = DataFrame(
{"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]},
{"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
@@ -79,7 +79,7 @@ def test_describe_bool_frame(self):
)
result = df.describe()
expected = DataFrame(
{"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]},
{"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
index=["count", "unique", "top", "freq"],
)
tm.assert_frame_equal(result, expected)
pandas/tests/frame/methods/test_value_counts.py (6 changes: 3 additions & 3 deletions)
@@ -48,7 +48,7 @@ def test_data_frame_value_counts_default():
expected = pd.Series(
data=[2, 1, 1],
index=pd.MultiIndex.from_arrays(
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
[(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
),
)

@@ -65,7 +65,7 @@ def test_data_frame_value_counts_normalize():
expected = pd.Series(
data=[0.5, 0.25, 0.25],
index=pd.MultiIndex.from_arrays(
[(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
[(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
),
)

@@ -78,7 +78,7 @@ def test_data_frame_value_counts_single_col_default():
result = df.value_counts()
expected = pd.Series(
data=[2, 1, 1],
index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]),
index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]),
)

tm.assert_series_equal(result, expected)
pandas/tests/indexes/datetimes/test_ops.py (6 changes: 3 additions & 3 deletions)
@@ -231,15 +231,15 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
index = DatetimeIndex(index_dates, tz=tz, name="idx")
expected = DatetimeIndex(expected_dates, tz=tz, name="idx")

ordered = index.sort_values()
ordered = index.sort_values(na_position="first")
Note (AlexKirko, Member Author, Oct 23, 2020): NaTs used to be sorted to the beginning regardless of na_position (because their numerical representation is a large negative number); this was an ancillary bug. Now they sort to the end by default, and the user needs to supply na_position to get the previous behavior.

tm.assert_index_equal(ordered, expected)
assert ordered.freq is None

ordered = index.sort_values(ascending=False)
tm.assert_index_equal(ordered, expected[::-1])
assert ordered.freq is None

ordered, indexer = index.sort_values(return_indexer=True)
ordered, indexer = index.sort_values(return_indexer=True, na_position="first")
tm.assert_index_equal(ordered, expected)

exp = np.array([0, 4, 3, 1, 2])
@@ -249,7 +249,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
ordered, indexer = index.sort_values(return_indexer=True, ascending=False)
tm.assert_index_equal(ordered, expected[::-1])

exp = np.array([2, 1, 3, 4, 0])
exp = np.array([2, 1, 3, 0, 4])
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
assert ordered.freq is None
