From 47a0a87db454cc767ab5f74beb2198a480d6f2c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:13:29 -1000 Subject: [PATCH] Type & reduce cupy usage (#16277) There are some cupy usages that don't seem _strictly_ necessary (generating starting data, array type conversion) in some APIs. IMO we should prefer using CPU data/the existing data structure/Column ops over cupy when possible closes https://github.com/rapidsai/cudf/issues/12133 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16277 --- python/cudf/cudf/core/_base_index.py | 4 ++- python/cudf/cudf/core/column/column.py | 8 +++--- python/cudf/cudf/core/column/datetime.py | 6 ++-- python/cudf/cudf/core/column/numerical.py | 10 ++----- python/cudf/cudf/core/cut.py | 6 ++-- python/cudf/cudf/core/dataframe.py | 18 +++++++----- python/cudf/cudf/core/frame.py | 6 ++-- python/cudf/cudf/core/groupby/groupby.py | 23 ++++++++------- python/cudf/cudf/core/index.py | 34 ++++++++++++----------- python/cudf/cudf/core/multiindex.py | 13 +++++---- python/cudf/cudf/core/tools/datetimes.py | 9 +++--- python/cudf/cudf/tests/test_datetime.py | 15 ++-------- 12 files changed, 74 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e160fa697ee..9ba2d161619 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -38,6 +38,8 @@ if TYPE_CHECKING: from collections.abc import Generator + import cupy + from cudf.core.column_accessor import ColumnAccessor @@ -2001,7 +2003,7 @@ def drop_duplicates( self._column_names, ) - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> cupy.ndarray: """ Indicate duplicate index values. diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f633d527681..fd3664ecac4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -721,7 +721,7 @@ def notnull(self) -> ColumnBase: return result def indices_of( - self, value: ScalarLike | Self + self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: """ Find locations of value in the column @@ -735,10 +735,10 @@ def indices_of( ------- Column of indices that match value """ - if not isinstance(value, ColumnBase): - value = as_column([value], dtype=self.dtype) + if not is_scalar(value): + raise ValueError("value must be a scalar") else: - assert len(value) == 1 + value = as_column(value, dtype=self.dtype, length=1) mask = libcudf.search.contains(value, self) return apply_boolean_mask( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 409c44f6eee..004a059af95 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -629,9 +629,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: - value = column.as_column( - pd.to_datetime(value), dtype=self.dtype - ).astype("int64") + value = ( + pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64") + ) return self.astype("int64").indices_of(value) @property diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b8fa00e9643..7f05a5f91a1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -5,7 +5,6 @@ import functools from typing import TYPE_CHECKING, Any, Callable, Sequence, cast -import cupy as cp import numpy as np import pandas as pd from typing_extensions import Self @@ -13,7 +12,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib import pylibcudf -from cudf._lib.types import size_type_dtype from cudf.api.types import ( is_bool_dtype, is_float_dtype, @@ -131,12 +129,8 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: and self.dtype.kind in {"c", "f"} and np.isnan(value) ): - return column.as_column( - cp.argwhere( - cp.isnan(self.data_array_view(mode="read")) - ).flatten(), - dtype=size_type_dtype, - ) + nan_col = libcudf.unary.is_nan(self) + return nan_col.indices_of(True) else: return super().indices_of(value) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index d9f62f51f92..197f46ee9fe 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -188,9 +188,6 @@ def cut( # adjust bin edges decimal precision int_label_bins = np.around(bins, precision) - # the inputs is a column of the values in the array x - input_arr = as_column(x) - # checking for the correct inclusivity values if right: closed = "right" @@ -242,6 +239,9 @@ def cut( labels if len(set(labels)) == len(labels) else None ) + # the inputs is a column of the values in the array x + input_arr = as_column(x) + if isinstance(bins, pd.IntervalIndex): # get the left and right edges of the bins as columns # we cannot typecast an IntervalIndex, so we need to diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2aa1b95e2d1..2121e623c1c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -429,7 +429,7 @@ def _setitem_tuple_arg(self, key, value): else: value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: # If the inner dimension is 1, it's broadcastable to # all columns of the dataframe. indexed_shape = columns_df.loc[key[0]].shape @@ -566,7 +566,7 @@ def _setitem_tuple_arg(self, key, value): # TODO: consolidate code path with identical counterpart # in `_DataFrameLocIndexer._setitem_tuple_arg` value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: indexed_shape = columns_df.iloc[key[0]].shape if value.shape[1] == 1: if value.shape[0] != indexed_shape[0]: @@ -2199,8 +2199,8 @@ def from_dict( orient = orient.lower() if orient == "index": - if len(data) > 0 and isinstance( - next(iter(data.values())), (cudf.Series, cupy.ndarray) + if isinstance( + next(iter(data.values()), None), (cudf.Series, cupy.ndarray) ): result = cls(data).T result.columns = ( @@ -5698,7 +5698,13 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): @classmethod @_performance_tracking - def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): + def _from_arrays( + cls, + data: np.ndarray | cupy.ndarray, + index=None, + columns=None, + nan_as_null=False, + ): """Convert a numpy/cupy array to DataFrame. Parameters @@ -5716,8 +5722,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): ------- DataFrame """ - - data = cupy.asarray(data) if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found: {data.ndim}" diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 253d200f7d4..802751e47ad 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1189,7 +1189,7 @@ def searchsorted( side: Literal["left", "right"] = "left", ascending: bool = True, na_position: Literal["first", "last"] = "last", - ): + ) -> ScalarLike | cupy.ndarray: """Find indices where elements should be inserted to maintain order Parameters @@ -1527,7 +1527,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs - ): + ) -> list[dict[Any, ColumnBase]]: # Note: There are some operations that may be supported by libcudf but # are not supported by pandas APIs. In particular, libcudf binary # operations support logical and/or operations as well as @@ -1538,7 +1538,7 @@ def _apply_cupy_ufunc_to_operands( # without cupy. mask = None - data = [{} for _ in range(ufunc.nout)] + data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)] for name, (left, right, _, _) in operands.items(): cupy_inputs = [] for inp in (left, right) if ufunc.nin == 2 else (left,): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index eccb3acabf6..8659d7c2392 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -35,7 +35,12 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType + from cudf._typing import ( + AggType, + DataFrameOrSeries, + MultiColumnAggType, + ScalarLike, + ) def _deprecate_collect(): @@ -357,7 +362,7 @@ def groups(self): ) @cached_property - def indices(self): + def indices(self) -> dict[ScalarLike, cp.ndarray]: """ Dict {group name -> group indices}. @@ -1015,18 +1020,16 @@ def ngroup(self, ascending=True): if ascending: # Count ascending from 0 to num_groups - 1 - group_ids = cudf.Series._from_data({None: cp.arange(num_groups)}) + groups = range(num_groups) elif has_null_group: # Count descending from num_groups - 1 to 0, but subtract one more # for the null group making it num_groups - 2 to -1. - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 2, -2, -1)} - ) + groups = range(num_groups - 2, -2, -1) else: # Count descending from num_groups - 1 to 0 - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 1, -1, -1)} - ) + groups = range(num_groups - 1, -1, -1) + + group_ids = cudf.Series._from_data({None: as_column(groups)}) if has_null_group: group_ids.iloc[-1] = cudf.NA @@ -1713,7 +1716,7 @@ def rolling_avg(val, avg): return grouped_values.apply_chunks(function, **kwargs) @_performance_tracking - def _broadcast(self, values): + def _broadcast(self, values: cudf.Series) -> cudf.Series: """ Broadcast the results of an aggregation to the group diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b398ee2343e..4164f981fca 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -103,7 +103,7 @@ def __subclasscheck__(self, subclass): def _lexsorted_equal_range( idx: Index | cudf.MultiIndex, - key_as_table: Frame, + keys: list[ColumnBase], is_sorted: bool, ) -> tuple[int, int, ColumnBase | None]: """Get equal range for key in lexicographically sorted index. If index @@ -118,13 +118,13 @@ def _lexsorted_equal_range( sort_vals = idx lower_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="right", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) @@ -260,7 +260,9 @@ def searchsorted( ), "Invalid ascending flag" return search_range(value, self._range, side=side) - def factorize(self, sort: bool = False, use_na_sentinel: bool = True): + def factorize( + self, sort: bool = False, use_na_sentinel: bool = True + ) -> tuple[cupy.ndarray, Self]: if sort and self.step < 0: codes = cupy.arange(len(self) - 1, -1, -1) uniques = self[::-1] @@ -753,15 +755,16 @@ def difference(self, other, sort=None): super().difference(other, sort=sort) ) - def _try_reconstruct_range_index(self, index): - if isinstance(index, RangeIndex) or index.dtype.kind == "f": + def _try_reconstruct_range_index( + self, index: BaseIndex + ) -> Self | BaseIndex: + if isinstance(index, RangeIndex) or index.dtype.kind not in "iu": return index # Evenly spaced values can return a # RangeIndex instead of a materialized Index. - if not index._column.has_nulls(): + if not index._column.has_nulls(): # type: ignore[attr-defined] uniques = cupy.unique(cupy.diff(index.values)) - if len(uniques) == 1 and uniques[0].get() != 0: - diff = uniques[0].get() + if len(uniques) == 1 and (diff := uniques[0].get()) != 0: new_range = range(index[0], index[-1] + diff, diff) return type(self)(new_range, name=index.name) return index @@ -1309,7 +1312,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) @_performance_tracking - def get_loc(self, key): + def get_loc(self, key) -> int | slice | cupy.ndarray: if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -1317,9 +1320,8 @@ def get_loc(self, key): self.is_monotonic_increasing or self.is_monotonic_decreasing ) - target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, target_as_table, is_sorted + self, [as_column([key])], is_sorted ) if lower_bound == upper_bound: @@ -1330,7 +1332,7 @@ def get_loc(self, key): return ( lower_bound if is_sorted - else sort_inds.element_indexing(lower_bound) + else sort_inds.element_indexing(lower_bound) # type: ignore[union-attr] ) if is_sorted: @@ -1339,8 +1341,8 @@ def get_loc(self, key): return slice(lower_bound, upper_bound) # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).values + mask = cupy.full(len(self), False) + true_inds = sort_inds.slice(lower_bound, upper_bound).values # type: ignore[union-attr] mask[true_inds] = True return mask @@ -2076,7 +2078,7 @@ def day_of_year(self): @property # type: ignore @_performance_tracking - def is_leap_year(self): + def is_leap_year(self) -> cupy.ndarray: """ Boolean indicator if the date belongs to a leap year. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6503dae6ff5..3ed72ff812a 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1926,17 +1926,18 @@ def get_loc(self, key): # Handle partial key search. If length of `key` is less than `nlevels`, # Only search levels up to `len(key)` level. - key_as_table = cudf.core.frame.Frame( - {i: column.as_column(k, length=1) for i, k in enumerate(key)} - ) partial_index = self.__class__._from_data( - data=self._data.select_by_index(slice(key_as_table._num_columns)) + data=self._data.select_by_index(slice(len(key))) ) ( lower_bound, upper_bound, sort_inds, - ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted) + ) = _lexsorted_equal_range( + partial_index, + [column.as_column(k, length=1) for k in key], + is_sorted, + ) if lower_bound == upper_bound: raise KeyError(key) @@ -1961,7 +1962,7 @@ def get_loc(self, key): return true_inds # Not sorted and not unique. Return a boolean mask - mask = cp.full(self._data.nrows, False) + mask = cp.full(len(self), False) mask[true_inds] = True return mask diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 064e8fc667d..c6e2b5d10e1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -6,7 +6,6 @@ import warnings from typing import Literal, Sequence -import cupy as cp import numpy as np import pandas as pd import pandas.tseries.offsets as pd_offset @@ -894,7 +893,7 @@ def date_range( # integers and divide the number range evenly with `periods` elements. start = cudf.Scalar(start, dtype=dtype).value.astype("int64") end = cudf.Scalar(end, dtype=dtype).value.astype("int64") - arr = cp.linspace(start=start, stop=end, num=periods) + arr = np.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz) @@ -991,8 +990,10 @@ def date_range( stop = end_estim.astype("int64") start = start.value.astype("int64") step = _offset_to_nanoseconds_lower_bound(offset) - arr = cp.arange(start=start, stop=stop, step=step, dtype="int64") - res = cudf.core.column.as_column(arr).astype("datetime64[ns]") + arr = range(int(start), int(stop), step) + res = cudf.core.column.as_column(arr, dtype="int64").astype( + "datetime64[ns]" + ) return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize( tz diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 092e9790c63..7ab9ff2ef23 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1534,18 +1534,7 @@ def test_date_range_start_end_periods(start, end, periods): ) -def test_date_range_start_end_freq(request, start, end, freq): - request.applymarker( - pytest.mark.xfail( - condition=( - start == "1831-05-08 15:23:21" - and end == "1996-11-21 04:05:30" - and freq == "110546789ms" - ), - reason="https://github.com/rapidsai/cudf/issues/12133", - ) - ) - +def test_date_range_start_end_freq(start, end, freq): if isinstance(freq, str): _gfreq = _pfreq = freq else: @@ -1561,7 +1550,7 @@ def test_date_range_start_end_freq(request, start, end, freq): ) -def test_date_range_start_freq_periods(request, start, freq, periods): +def test_date_range_start_freq_periods(start, freq, periods): if isinstance(freq, str): _gfreq = _pfreq = freq else: