pandas-dev · mroeschke · Aug 18, 2022 · Apr 1, 2022 · Apr 19, 2022 · Apr 20, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -278,6 +278,8 @@ Other enhancements
 - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
 - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
 - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`)
+- The method :meth:`.ExtensionArray.factorize` can now be passed ``use_na_sentinel=False`` for determining how null values are to be treated. (:issue:`46601`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:
@@ -997,6 +999,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.DataFrameGroupBy.describe` and :meth:`.SeriesGroupBy.describe` produces inconsistent results for empty datasets (:issue:`41575`)
 - Bug in :meth:`DataFrame.resample` reduction methods when used with ``on`` would attempt to aggregate the provided column (:issue:`47079`)
 - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would not respect ``dropna=False`` when the input DataFrame/Series had a NaN values in a :class:`MultiIndex` (:issue:`46783`)
+- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` with ``dropna=False`` and ``sort=False`` would put any null groups at the end instead the order that they are encountered (:issue:`46584`)
 
 Reshaping
 ^^^^^^^^^

@@ -658,7 +658,7 @@ cdef class {{name}}HashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None, object mask=None):
+                  object na_value=None, object mask=None, ignore_na=True):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -690,7 +690,7 @@ cdef class {{name}}HashTable(HashTable):
         """
         uniques_vector = {{name}}Vector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True, mask=mask,
+                            na_value=na_value, ignore_na=ignore_na, mask=mask,
                             return_inverse=True)
 
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -1037,7 +1037,7 @@ cdef class StringHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None, object mask=None):
+                  object na_value=None, object mask=None, ignore_na=True):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1067,7 +1067,7 @@ cdef class StringHashTable(HashTable):
         """
         uniques_vector = ObjectVector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True,
+                            na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
@@ -1290,7 +1290,7 @@ cdef class PyObjectHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None, object mask=None):
+                  object na_value=None, object mask=None, ignore_na=True):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1320,7 +1320,7 @@ cdef class PyObjectHashTable(HashTable):
         """
         uniques_vector = ObjectVector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True,
+                            na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -509,7 +509,7 @@ def f(c, v):
 
 def factorize_array(
     values: np.ndarray,
-    na_sentinel: int = -1,
+    na_sentinel: int | None = -1,
     size_hint: int | None = None,
     na_value: object = None,
     mask: npt.NDArray[np.bool_] | None = None,
@@ -540,6 +540,10 @@ def factorize_array(
     codes : ndarray[np.intp]
     uniques : ndarray
     """
+    ignore_na = na_sentinel is not None
+    if not ignore_na:
+        na_sentinel = -1
+
     original = values
     if values.dtype.kind in ["m", "M"]:
         # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
@@ -552,7 +556,11 @@ def factorize_array(
 
     table = hash_klass(size_hint or len(values))
     uniques, codes = table.factorize(
-        values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
+        values,
+        na_sentinel=na_sentinel,
+        na_value=na_value,
+        mask=mask,
+        ignore_na=ignore_na,
     )
 
     # re-cast e.g. i8->dt64/td64, uint8->bool
@@ -737,10 +745,7 @@ def factorize(
 
     # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
     # of values, assign na_sentinel=-1 to replace code value for NaN.
-    dropna = True
-    if na_sentinel is None:
-        na_sentinel = -1
-        dropna = False
+    dropna = na_sentinel is not None
 
     if (
         isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
@@ -753,38 +758,58 @@ def factorize(
 
     elif not isinstance(values.dtype, np.dtype):
         if (
-            na_sentinel == -1
-            and "use_na_sentinel" in inspect.signature(values.factorize).parameters
-        ):
+            na_sentinel == -1 or na_sentinel is None
+        ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters:
             # Avoid using catch_warnings when possible
             # GH#46910 - TimelikeOps has deprecated signature
             codes, uniques = values.factorize(  # type: ignore[call-arg]
-                use_na_sentinel=True
+                use_na_sentinel=na_sentinel is not None
             )
         else:
+            na_sentinel_arg = -1 if na_sentinel is None else na_sentinel
             with warnings.catch_warnings():
                 # We've already warned above
                 warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
-                codes, uniques = values.factorize(na_sentinel=na_sentinel)
+                codes, uniques = values.factorize(na_sentinel=na_sentinel_arg)
 
     else:
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex
+        # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
+        #       na_sentinel is None we append NA on the end because safe_sort does not
+        #       handle null values in uniques.
+        if na_sentinel is None and sort:
+            na_sentinel_arg = -1
+        elif na_sentinel is None:
+            na_sentinel_arg = None
+        else:
+            na_sentinel_arg = na_sentinel
         codes, uniques = factorize_array(
-            values, na_sentinel=na_sentinel, size_hint=size_hint
+            values,
+            na_sentinel=na_sentinel_arg,
+            size_hint=size_hint,
         )
 
     if sort and len(uniques) > 0:
+        if na_sentinel is None:
+            # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
+            na_sentinel = -1
         uniques, codes = safe_sort(
             uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )
 
-    code_is_na = codes == na_sentinel
-    if not dropna and code_is_na.any():
-        # na_value is set based on the dtype of uniques, and compat set to False is
-        # because we do not want na_value to be 0 for integers
-        na_value = na_value_for_dtype(uniques.dtype, compat=False)
-        uniques = np.append(uniques, [na_value])
-        codes = np.where(code_is_na, len(uniques) - 1, codes)
+    if not dropna and sort:
+        # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
+        if na_sentinel is None:
+            na_sentinel_arg = -1
+        else:
+            na_sentinel_arg = na_sentinel
+        code_is_na = codes == na_sentinel_arg
+        if code_is_na.any():
+            # na_value is set based on the dtype of uniques, and compat set to False is
+            # because we do not want na_value to be 0 for integers
+            na_value = na_value_for_dtype(uniques.dtype, compat=False)
+            uniques = np.append(uniques, [na_value])
+            codes = np.where(code_is_na, len(uniques) - 1, codes)
 
     uniques = _reconstruct_data(uniques, original.dtype, original)
 

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -18,6 +18,7 @@
 from pandas.compat import (
     pa_version_under1p01,
     pa_version_under2p0,
+    pa_version_under4p0,
     pa_version_under5p0,
     pa_version_under6p0,
 )
@@ -338,20 +339,32 @@ def factorize(
         use_na_sentinel: bool | lib.NoDefault = lib.no_default,
     ) -> tuple[np.ndarray, ExtensionArray]:
         resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
-        if resolved_na_sentinel is None:
-            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        if pa_version_under4p0:
+            encoded = self._data.dictionary_encode()
         else:
-            na_sentinel = resolved_na_sentinel
-        encoded = self._data.dictionary_encode()
+            null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
+            encoded = self._data.dictionary_encode(null_encoding=null_encoding)
         indices = pa.chunked_array(
             [c.indices for c in encoded.chunks], type=encoded.type.index_type
         ).to_pandas()
         if indices.dtype.kind == "f":
-            indices[np.isnan(indices)] = na_sentinel
+            indices[np.isnan(indices)] = (
+                resolved_na_sentinel if resolved_na_sentinel is not None else -1
+            )
         indices = indices.astype(np.int64, copy=False)
 
         if encoded.num_chunks:
             uniques = type(self)(encoded.chunk(0).dictionary)
+            if resolved_na_sentinel is None and pa_version_under4p0:
+                # TODO: share logic with BaseMaskedArray.factorize
+                # Insert na with the proper code
+                na_mask = indices.values == -1
+                na_index = na_mask.argmax()
+                if na_mask[na_index]:
+                    uniques = uniques.insert(na_index, self.dtype.na_value)
+                    na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1
+                    indices[indices >= na_code] += 1
+                    indices[indices == -1] = na_code
         else:
             uniques = type(self)(pa.array([], type=encoded.type.value_type))
 

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1080,14 +1080,10 @@ def factorize(
         # 2. ExtensionArray.factorize.
         #    Complete control over factorization.
         resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
-        if resolved_na_sentinel is None:
-            raise NotImplementedError("Encoding NaN values is not yet implemented")
-        else:
-            na_sentinel = resolved_na_sentinel
         arr, na_value = self._values_for_factorize()
 
         codes, uniques = factorize_array(
-            arr, na_sentinel=na_sentinel, na_value=na_value
+            arr, na_sentinel=resolved_na_sentinel, na_value=na_value
         )
 
         uniques_ea = self._from_factorized(uniques, self)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -875,19 +875,39 @@ def factorize(
         use_na_sentinel: bool | lib.NoDefault = lib.no_default,
     ) -> tuple[np.ndarray, ExtensionArray]:
         resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel)
-        if resolved_na_sentinel is None:
-            raise NotImplementedError("Encoding NaN values is not yet implemented")
-        else:
-            na_sentinel = resolved_na_sentinel
         arr = self._data
         mask = self._mask
 
-        codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
+        # Pass non-None na_sentinel; recode and add NA to uniques if necessary below
+        na_sentinel_arg = -1 if resolved_na_sentinel is None else resolved_na_sentinel
+        codes, uniques = factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask)
 
         # check that factorize_array correctly preserves dtype.
         assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
 
-        uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
+        has_na = mask.any()
+        if resolved_na_sentinel is not None or not has_na:
+            size = len(uniques)
+        else:
+            # Make room for an NA value
+            size = len(uniques) + 1
+        uniques_mask = np.zeros(size, dtype=bool)
+        if resolved_na_sentinel is None and has_na:
+            na_index = mask.argmax()
+            # Insert na with the proper code
+            if na_index == 0:
+                na_code = np.intp(0)
+            else:
+                # mypy error: Slice index must be an integer or None
+                # https://github.com/python/mypy/issues/2410
+                na_code = codes[:na_index].argmax() + 1  # type: ignore[misc]
+            codes[codes >= na_code] += 1
+            codes[codes == -1] = na_code
+            # dummy value for uniques; not used since uniques_mask will be True
+            uniques = np.insert(uniques, na_code, 0)
+            uniques_mask[na_code] = True
+        uniques_ea = type(self)(uniques, uniques_mask)
+
         return codes, uniques_ea
 
     @doc(ExtensionArray._values_for_argsort)

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -861,6 +861,10 @@ def factorize(
         codes, uniques = algos.factorize(
             np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
         )
+        if na_sentinel is lib.no_default:
+            na_sentinel = -1
+        if use_na_sentinel is lib.no_default or use_na_sentinel:
+            codes[codes == -1] = na_sentinel
         uniques_sp = SparseArray(uniques, dtype=self.dtype)
         return codes, uniques_sp
 

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -378,8 +378,8 @@ def __arrow_array__(self, type=None):
     def _values_for_factorize(self):
         arr = self._ndarray.copy()
         mask = self.isna()
-        arr[mask] = -1
-        return arr, -1
+        arr[mask] = None
+        return arr, None
 
     def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)

@@ -657,9 +657,10 @@ def group_index(self) -> Index:
 
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
-        if self._passed_categorical:
+        if self._dropna and self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
-            # preserving the categories / ordered attributes
+            # preserving the categories / ordered attributes;
+            # doesn't (yet - GH#46909) handle dropna=False
             cat = self.grouping_vector
             categories = cat.categories