Refactor fillna logic to push specifics toward Frame subclasses and Column subclasses (rapidsai#15957)

Essentially 2 reorganizations: 1. `Frame.fillna` input argument logic was pushed toward its subclasses `Series`/`DataFrame`/`IndexedFrame` where appropriate. 2. `Column.fillna` was made generic; Column subclasses now implement `_validate_fillna_value`, which `Column.fillna` uses to validate the fill value. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: rapidsai#15957
lithomas1 · Jun 25, 2024 · bc08662 · bc08662
1 parent 1bc1f45
commit bc08662
Show file tree

Hide file tree

Showing 12 changed files with 155 additions and 261 deletions.
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -1068,51 +1068,34 @@ def notnull(self) -> ColumnBase:
 
         return result
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        """
-        Fill null values with *fill_value*
-        """
-        if fill_value is not None:
-            fill_is_scalar = np.isscalar(fill_value)
-
-            if fill_is_scalar:
-                if fill_value == _DEFAULT_CATEGORICAL_VALUE:
-                    fill_value = self.codes.dtype.type(fill_value)
-                else:
-                    try:
-                        fill_value = self._encode(fill_value)
-                        fill_value = self.codes.dtype.type(fill_value)
-                    except ValueError as err:
-                        err_msg = "fill value must be in categories"
-                        raise ValueError(err_msg) from err
+    def _validate_fillna_value(
+        self, fill_value: ScalarLike | ColumnLike
+    ) -> cudf.Scalar | ColumnBase:
+        """Align fill_value for .fillna based on column type."""
+        if cudf.api.types.is_scalar(fill_value):
+            if fill_value != _DEFAULT_CATEGORICAL_VALUE:
+                try:
+                    fill_value = self._encode(fill_value)
+                except ValueError as err:
+                    raise ValueError(
+                        f"{fill_value=} must be in categories"
+                    ) from err
+            return cudf.Scalar(fill_value, dtype=self.codes.dtype)
+        else:
+            fill_value = column.as_column(fill_value, nan_as_null=False)
+            if isinstance(fill_value.dtype, CategoricalDtype):
+                if self.dtype != fill_value.dtype:
+                    raise TypeError(
+                        "Cannot set a categorical with another without identical categories"
+                    )
             else:
-                fill_value = column.as_column(fill_value, nan_as_null=False)
-                if isinstance(fill_value, CategoricalColumn):
-                    if self.dtype != fill_value.dtype:
-                        raise TypeError(
-                            "Cannot set a Categorical with another, "
-                            "without identical categories"
-                        )
-                # TODO: only required if fill_value has a subset of the
-                # categories:
-                fill_value = fill_value._set_categories(
-                    self.categories,
-                    is_unique=True,
-                )
-                fill_value = column.as_column(fill_value.codes).astype(
-                    self.codes.dtype
+                raise TypeError(
+                    "Cannot set a categorical with non-categorical data"
                 )
-
-        # Validation of `fill_value` will have to be performed
-        # before returning self.
-        if not self.nullable:
-            return self
-
-        return super().fillna(fill_value, method=method)
+            fill_value = fill_value._set_categories(
+                self.categories,
+            )
+            return fill_value.codes.astype(self.codes.dtype)
 
     def indices_of(
         self, value: ScalarLike
@@ -1372,11 +1355,13 @@ def _set_categories(
         if not (is_unique or new_cats.is_unique):
             new_cats = cudf.Series(new_cats)._column.unique()
 
+        if cur_cats.equals(new_cats, check_dtypes=True):
+            # TODO: Internal usages don't always need a copy; add a copy keyword
+            # as_ordered shallow copies
+            return self.copy().as_ordered(ordered=ordered)
+
         cur_codes = self.codes
-        max_cat_size = (
-            len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats)
-        )
-        out_code_dtype = min_unsigned_type(max_cat_size)
+        out_code_dtype = min_unsigned_type(max(len(cur_cats), len(new_cats)))
 
         cur_order = column.as_column(range(len(cur_codes)))
         old_codes = column.as_column(

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -666,15 +666,32 @@ def _check_scatter_key_length(
                 f"{num_keys}"
             )
 
+    def _validate_fillna_value(
+        self, fill_value: ScalarLike | ColumnLike
+    ) -> cudf.Scalar | ColumnBase:
+        """Align fill_value for .fillna based on column type."""
+        if is_scalar(fill_value):
+            return cudf.Scalar(fill_value, dtype=self.dtype)
+        return as_column(fill_value)
+
     def fillna(
         self,
-        fill_value: Any = None,
-        method: str | None = None,
+        fill_value: ScalarLike | ColumnLike,
+        method: Literal["ffill", "bfill", None] = None,
     ) -> Self:
         """Fill null values with ``value``.
 
         Returns a copy with null filled.
         """
+        if not self.has_nulls(include_nan=True):
+            return self.copy()
+        elif method is None:
+            if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar(
+                fill_value
+            ):
+                return self.copy()
+            else:
+                fill_value = self._validate_fillna_value(fill_value)
         return libcudf.replace.replace_nulls(
             input_col=self.nans_to_nulls(),
             replacement=fill_value,

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -8,18 +8,17 @@
 import locale
 import re
 from locale import nl_langinfo
-from typing import TYPE_CHECKING, Any, Literal, Sequence, cast
+from typing import TYPE_CHECKING, Literal, Sequence, cast
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.labeling import label_bins
 from cudf._lib.search import search_sorted
-from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
+from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals.timezones import (
     check_ambiguous_and_nonexistent,
@@ -641,22 +640,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         else:
             return result_col
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        if fill_value is not None:
-            if cudf.utils.utils._isnat(fill_value):
-                return self.copy(deep=True)
-            if is_scalar(fill_value):
-                if not isinstance(fill_value, cudf.Scalar):
-                    fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
-            else:
-                fill_value = column.as_column(fill_value, nan_as_null=False)
-
-        return super().fillna(fill_value, method)
-
     def indices_of(
         self, value: ScalarLike
     ) -> cudf.core.column.NumericalColumn:

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
@@ -4,12 +4,11 @@
 
 import warnings
 from decimal import Decimal
-from typing import TYPE_CHECKING, Any, Sequence, cast
+from typing import TYPE_CHECKING, Sequence, cast
 
 import cupy as cp
 import numpy as np
 import pyarrow as pa
-from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -31,7 +30,7 @@
 from .numerical_base import NumericalBaseColumn
 
 if TYPE_CHECKING:
-    from cudf._typing import ColumnBinaryOperand, Dtype
+    from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
 
 
 class DecimalBaseColumn(NumericalBaseColumn):
@@ -135,30 +134,20 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):
 
         return result
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        """Fill null values with ``value``.
-
-        Returns a copy with null filled.
-        """
+    def _validate_fillna_value(
+        self, fill_value: ScalarLike | ColumnLike
+    ) -> cudf.Scalar | ColumnBase:
+        """Align fill_value for .fillna based on column type."""
         if isinstance(fill_value, (int, Decimal)):
-            fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
-        elif (
-            isinstance(fill_value, DecimalBaseColumn)
-            or isinstance(fill_value, cudf.core.column.NumericalColumn)
-            and is_integer_dtype(fill_value.dtype)
+            return cudf.Scalar(fill_value, dtype=self.dtype)
+        elif isinstance(fill_value, ColumnBase) and (
+            isinstance(self.dtype, DecimalDtype) or self.dtype.kind in "iu"
         ):
-            fill_value = fill_value.astype(self.dtype)
-        else:
-            raise TypeError(
-                "Decimal columns only support using fillna with decimal and "
-                "integer values"
-            )
-
-        return super().fillna(fill_value, method=method)
+            return fill_value.astype(self.dtype)
+        raise TypeError(
+            "Decimal columns only support using fillna with decimal and "
+            "integer values"
+        )
 
     def normalize_binop_value(self, other):
         if isinstance(other, ColumnBase):

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -532,57 +532,26 @@ def find_and_replace(
             replaced, df._data["old"], df._data["new"]
         )
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        """
-        Fill null values with *fill_value*
-        """
-        col = self.nans_to_nulls()
-
-        if col.null_count == 0:
-            return col
-
-        if method is not None:
-            return super().fillna(fill_value, method)
-
-        if fill_value is None:
-            raise ValueError("Must specify either 'fill_value' or 'method'")
-
-        if (
-            isinstance(fill_value, cudf.Scalar)
-            and fill_value.dtype == col.dtype
-        ):
-            return super().fillna(fill_value, method)
-
-        if np.isscalar(fill_value):
-            # cast safely to the same dtype as self
-            fill_value_casted = col.dtype.type(fill_value)
-            if not np.isnan(fill_value) and (fill_value_casted != fill_value):
+    def _validate_fillna_value(
+        self, fill_value: ScalarLike | ColumnLike
+    ) -> cudf.Scalar | ColumnBase:
+        """Align fill_value for .fillna based on column type."""
+        if is_scalar(fill_value):
+            cudf_obj = cudf.Scalar(fill_value)
+            if not as_column(cudf_obj).can_cast_safely(self.dtype):
                 raise TypeError(
                     f"Cannot safely cast non-equivalent "
-                    f"{type(fill_value).__name__} to {col.dtype.name}"
+                    f"{type(fill_value).__name__} to {self.dtype.name}"
                 )
-            fill_value = cudf.Scalar(fill_value_casted)
         else:
-            fill_value = column.as_column(fill_value, nan_as_null=False)
-            if is_integer_dtype(col.dtype):
-                # cast safely to the same dtype as self
-                if fill_value.dtype != col.dtype:
-                    new_fill_value = fill_value.astype(col.dtype)
-                    if not (new_fill_value == fill_value).all():
-                        raise TypeError(
-                            f"Cannot safely cast non-equivalent "
-                            f"{fill_value.dtype.type.__name__} to "
-                            f"{col.dtype.type.__name__}"
-                        )
-                    fill_value = new_fill_value
-            else:
-                fill_value = fill_value.astype(col.dtype)
-
-        return super().fillna(fill_value, method)
+            cudf_obj = as_column(fill_value, nan_as_null=False)
+            if not cudf_obj.can_cast_safely(self.dtype):  # type: ignore[attr-defined]
+                raise TypeError(
+                    f"Cannot safely cast non-equivalent "
+                    f"{cudf_obj.dtype.type.__name__} to "
+                    f"{self.dtype.type.__name__}"
+                )
+        return cudf_obj.astype(self.dtype)
 
     def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
         """

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -5,12 +5,11 @@
 import re
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Sequence, cast, overload
+from typing import TYPE_CHECKING, Sequence, cast, overload
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from typing_extensions import Self
 
 import cudf
 import cudf.api.types
@@ -5838,21 +5837,6 @@ def find_and_replace(
             res = self
         return libcudf.replace.replace(res, df._data["old"], df._data["new"])
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        if fill_value is not None:
-            if not is_scalar(fill_value):
-                fill_value = column.as_column(fill_value, dtype=self.dtype)
-            elif cudf._lib.scalar._is_null_host_scalar(fill_value):
-                # Trying to fill <NA> with <NA> value? Return copy.
-                return self.copy(deep=True)
-            else:
-                fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
-        return super().fillna(fill_value, method=method)
-
     def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar:
         if (
             isinstance(other, (column.ColumnBase, cudf.Scalar))

diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
@@ -4,12 +4,11 @@
 
 import datetime
 import functools
-from typing import TYPE_CHECKING, Any, Sequence, cast
+from typing import TYPE_CHECKING, Sequence, cast
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -252,22 +251,6 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
     def time_unit(self) -> str:
         return np.datetime_data(self.dtype)[0]
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        if fill_value is not None:
-            if cudf.utils.utils._isnat(fill_value):
-                return self.copy(deep=True)
-            if is_scalar(fill_value):
-                fill_value = cudf.Scalar(fill_value)
-                dtype = self.dtype
-                fill_value = fill_value.astype(dtype)
-            else:
-                fill_value = column.as_column(fill_value, nan_as_null=False)
-        return super().fillna(fill_value, method)
-
     def as_numerical_column(
         self, dtype: Dtype
     ) -> "cudf.core.column.NumericalColumn":