Skip to content

Commit

Permalink
Type & reduce cupy usage (#16277)
Browse files Browse the repository at this point in the history
There are some cupy usages that don't seem _strictly_ necessary (generating starting data, array type conversion) in some APIs. IMO we should prefer using CPU data, the existing data structure, or Column ops over cupy when possible.

closes #12133

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #16277
  • Loading branch information
mroeschke authored Jul 16, 2024
1 parent dba46e7 commit 47a0a87
Show file tree
Hide file tree
Showing 12 changed files with 74 additions and 78 deletions.
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
if TYPE_CHECKING:
from collections.abc import Generator

import cupy

from cudf.core.column_accessor import ColumnAccessor


Expand Down Expand Up @@ -2001,7 +2003,7 @@ def drop_duplicates(
self._column_names,
)

def duplicated(self, keep="first"):
def duplicated(self, keep="first") -> cupy.ndarray:
"""
Indicate duplicate index values.
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ def notnull(self) -> ColumnBase:
return result

def indices_of(
self, value: ScalarLike | Self
self, value: ScalarLike
) -> cudf.core.column.NumericalColumn:
"""
Find locations of value in the column
Expand All @@ -735,10 +735,10 @@ def indices_of(
-------
Column of indices that match value
"""
if not isinstance(value, ColumnBase):
value = as_column([value], dtype=self.dtype)
if not is_scalar(value):
raise ValueError("value must be a scalar")
else:
assert len(value) == 1
value = as_column(value, dtype=self.dtype, length=1)
mask = libcudf.search.contains(value, self)
return apply_boolean_mask(
[as_column(range(0, len(self)), dtype=size_type_dtype)], mask
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,9 +629,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
def indices_of(
self, value: ScalarLike
) -> cudf.core.column.NumericalColumn:
value = column.as_column(
pd.to_datetime(value), dtype=self.dtype
).astype("int64")
value = (
pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64")
)
return self.astype("int64").indices_of(value)

@property
Expand Down
10 changes: 2 additions & 8 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,13 @@
import functools
from typing import TYPE_CHECKING, Any, Callable, Sequence, cast

import cupy as cp
import numpy as np
import pandas as pd
from typing_extensions import Self

import cudf
from cudf import _lib as libcudf
from cudf._lib import pylibcudf
from cudf._lib.types import size_type_dtype
from cudf.api.types import (
is_bool_dtype,
is_float_dtype,
Expand Down Expand Up @@ -131,12 +129,8 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn:
and self.dtype.kind in {"c", "f"}
and np.isnan(value)
):
return column.as_column(
cp.argwhere(
cp.isnan(self.data_array_view(mode="read"))
).flatten(),
dtype=size_type_dtype,
)
nan_col = libcudf.unary.is_nan(self)
return nan_col.indices_of(True)
else:
return super().indices_of(value)

Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,6 @@ def cut(
# adjust bin edges decimal precision
int_label_bins = np.around(bins, precision)

# the inputs is a column of the values in the array x
input_arr = as_column(x)

# checking for the correct inclusivity values
if right:
closed = "right"
Expand Down Expand Up @@ -242,6 +239,9 @@ def cut(
labels if len(set(labels)) == len(labels) else None
)

# the inputs is a column of the values in the array x
input_arr = as_column(x)

if isinstance(bins, pd.IntervalIndex):
# get the left and right edges of the bins as columns
# we cannot typecast an IntervalIndex, so we need to
Expand Down
18 changes: 11 additions & 7 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def _setitem_tuple_arg(self, key, value):

else:
value = cupy.asarray(value)
if cupy.ndim(value) == 2:
if value.ndim == 2:
# If the inner dimension is 1, it's broadcastable to
# all columns of the dataframe.
indexed_shape = columns_df.loc[key[0]].shape
Expand Down Expand Up @@ -566,7 +566,7 @@ def _setitem_tuple_arg(self, key, value):
# TODO: consolidate code path with identical counterpart
# in `_DataFrameLocIndexer._setitem_tuple_arg`
value = cupy.asarray(value)
if cupy.ndim(value) == 2:
if value.ndim == 2:
indexed_shape = columns_df.iloc[key[0]].shape
if value.shape[1] == 1:
if value.shape[0] != indexed_shape[0]:
Expand Down Expand Up @@ -2199,8 +2199,8 @@ def from_dict(

orient = orient.lower()
if orient == "index":
if len(data) > 0 and isinstance(
next(iter(data.values())), (cudf.Series, cupy.ndarray)
if isinstance(
next(iter(data.values()), None), (cudf.Series, cupy.ndarray)
):
result = cls(data).T
result.columns = (
Expand Down Expand Up @@ -5698,7 +5698,13 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):

@classmethod
@_performance_tracking
def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
def _from_arrays(
cls,
data: np.ndarray | cupy.ndarray,
index=None,
columns=None,
nan_as_null=False,
):
"""Convert a numpy/cupy array to DataFrame.
Parameters
Expand All @@ -5716,8 +5722,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
-------
DataFrame
"""

data = cupy.asarray(data)
if data.ndim != 1 and data.ndim != 2:
raise ValueError(
f"records dimension expected 1 or 2 but found: {data.ndim}"
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1189,7 +1189,7 @@ def searchsorted(
side: Literal["left", "right"] = "left",
ascending: bool = True,
na_position: Literal["first", "last"] = "last",
):
) -> ScalarLike | cupy.ndarray:
"""Find indices where elements should be inserted to maintain order
Parameters
Expand Down Expand Up @@ -1527,7 +1527,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
@acquire_spill_lock()
def _apply_cupy_ufunc_to_operands(
self, ufunc, cupy_func, operands, **kwargs
):
) -> list[dict[Any, ColumnBase]]:
# Note: There are some operations that may be supported by libcudf but
# are not supported by pandas APIs. In particular, libcudf binary
# operations support logical and/or operations as well as
Expand All @@ -1538,7 +1538,7 @@ def _apply_cupy_ufunc_to_operands(
# without cupy.

mask = None
data = [{} for _ in range(ufunc.nout)]
data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)]
for name, (left, right, _, _) in operands.items():
cupy_inputs = []
for inp in (left, right) if ufunc.nin == 2 else (left,):
Expand Down
23 changes: 13 additions & 10 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,12 @@
from cudf.utils.utils import GetAttrGetItemMixin

if TYPE_CHECKING:
from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
from cudf._typing import (
AggType,
DataFrameOrSeries,
MultiColumnAggType,
ScalarLike,
)


def _deprecate_collect():
Expand Down Expand Up @@ -357,7 +362,7 @@ def groups(self):
)

@cached_property
def indices(self):
def indices(self) -> dict[ScalarLike, cp.ndarray]:
"""
Dict {group name -> group indices}.
Expand Down Expand Up @@ -1015,18 +1020,16 @@ def ngroup(self, ascending=True):

if ascending:
# Count ascending from 0 to num_groups - 1
group_ids = cudf.Series._from_data({None: cp.arange(num_groups)})
groups = range(num_groups)
elif has_null_group:
# Count descending from num_groups - 1 to 0, but subtract one more
# for the null group making it num_groups - 2 to -1.
group_ids = cudf.Series._from_data(
{None: cp.arange(num_groups - 2, -2, -1)}
)
groups = range(num_groups - 2, -2, -1)
else:
# Count descending from num_groups - 1 to 0
group_ids = cudf.Series._from_data(
{None: cp.arange(num_groups - 1, -1, -1)}
)
groups = range(num_groups - 1, -1, -1)

group_ids = cudf.Series._from_data({None: as_column(groups)})

if has_null_group:
group_ids.iloc[-1] = cudf.NA
Expand Down Expand Up @@ -1713,7 +1716,7 @@ def rolling_avg(val, avg):
return grouped_values.apply_chunks(function, **kwargs)

@_performance_tracking
def _broadcast(self, values):
def _broadcast(self, values: cudf.Series) -> cudf.Series:
"""
Broadcast the results of an aggregation to the group
Expand Down
34 changes: 18 additions & 16 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def __subclasscheck__(self, subclass):

def _lexsorted_equal_range(
idx: Index | cudf.MultiIndex,
key_as_table: Frame,
keys: list[ColumnBase],
is_sorted: bool,
) -> tuple[int, int, ColumnBase | None]:
"""Get equal range for key in lexicographically sorted index. If index
Expand All @@ -118,13 +118,13 @@ def _lexsorted_equal_range(
sort_vals = idx
lower_bound = search_sorted(
[*sort_vals._data.columns],
[*key_as_table._columns],
keys,
side="left",
ascending=sort_vals.is_monotonic_increasing,
).element_indexing(0)
upper_bound = search_sorted(
[*sort_vals._data.columns],
[*key_as_table._columns],
keys,
side="right",
ascending=sort_vals.is_monotonic_increasing,
).element_indexing(0)
Expand Down Expand Up @@ -260,7 +260,9 @@ def searchsorted(
), "Invalid ascending flag"
return search_range(value, self._range, side=side)

def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
def factorize(
self, sort: bool = False, use_na_sentinel: bool = True
) -> tuple[cupy.ndarray, Self]:
if sort and self.step < 0:
codes = cupy.arange(len(self) - 1, -1, -1)
uniques = self[::-1]
Expand Down Expand Up @@ -753,15 +755,16 @@ def difference(self, other, sort=None):
super().difference(other, sort=sort)
)

def _try_reconstruct_range_index(self, index):
if isinstance(index, RangeIndex) or index.dtype.kind == "f":
def _try_reconstruct_range_index(
self, index: BaseIndex
) -> Self | BaseIndex:
if isinstance(index, RangeIndex) or index.dtype.kind not in "iu":
return index
# Evenly spaced values can return a
# RangeIndex instead of a materialized Index.
if not index._column.has_nulls():
if not index._column.has_nulls(): # type: ignore[attr-defined]
uniques = cupy.unique(cupy.diff(index.values))
if len(uniques) == 1 and uniques[0].get() != 0:
diff = uniques[0].get()
if len(uniques) == 1 and (diff := uniques[0].get()) != 0:
new_range = range(index[0], index[-1] + diff, diff)
return type(self)(new_range, name=index.name)
return index
Expand Down Expand Up @@ -1309,17 +1312,16 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
return _return_get_indexer_result(result_series.to_cupy())

@_performance_tracking
def get_loc(self, key):
def get_loc(self, key) -> int | slice | cupy.ndarray:
if not is_scalar(key):
raise TypeError("Should be a scalar-like")

is_sorted = (
self.is_monotonic_increasing or self.is_monotonic_decreasing
)

target_as_table = cudf.core.frame.Frame({"None": as_column([key])})
lower_bound, upper_bound, sort_inds = _lexsorted_equal_range(
self, target_as_table, is_sorted
self, [as_column([key])], is_sorted
)

if lower_bound == upper_bound:
Expand All @@ -1330,7 +1332,7 @@ def get_loc(self, key):
return (
lower_bound
if is_sorted
else sort_inds.element_indexing(lower_bound)
else sort_inds.element_indexing(lower_bound) # type: ignore[union-attr]
)

if is_sorted:
Expand All @@ -1339,8 +1341,8 @@ def get_loc(self, key):
return slice(lower_bound, upper_bound)

# Not sorted and not unique. Return a boolean mask
mask = cupy.full(self._data.nrows, False)
true_inds = sort_inds.slice(lower_bound, upper_bound).values
mask = cupy.full(len(self), False)
true_inds = sort_inds.slice(lower_bound, upper_bound).values # type: ignore[union-attr]
mask[true_inds] = True
return mask

Expand Down Expand Up @@ -2076,7 +2078,7 @@ def day_of_year(self):

@property # type: ignore
@_performance_tracking
def is_leap_year(self):
def is_leap_year(self) -> cupy.ndarray:
"""
Boolean indicator if the date belongs to a leap year.
Expand Down
13 changes: 7 additions & 6 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1926,17 +1926,18 @@ def get_loc(self, key):

# Handle partial key search. If length of `key` is less than `nlevels`,
# Only search levels up to `len(key)` level.
key_as_table = cudf.core.frame.Frame(
{i: column.as_column(k, length=1) for i, k in enumerate(key)}
)
partial_index = self.__class__._from_data(
data=self._data.select_by_index(slice(key_as_table._num_columns))
data=self._data.select_by_index(slice(len(key)))
)
(
lower_bound,
upper_bound,
sort_inds,
) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted)
) = _lexsorted_equal_range(
partial_index,
[column.as_column(k, length=1) for k in key],
is_sorted,
)

if lower_bound == upper_bound:
raise KeyError(key)
Expand All @@ -1961,7 +1962,7 @@ def get_loc(self, key):
return true_inds

# Not sorted and not unique. Return a boolean mask
mask = cp.full(self._data.nrows, False)
mask = cp.full(len(self), False)
mask[true_inds] = True
return mask

Expand Down
Loading

0 comments on commit 47a0a87

Please sign in to comment.