Skip to content

Commit

Permalink
Ensure objects with __interface__ are converted to cupy/numpy arrays (#…
Browse files Browse the repository at this point in the history
…16436)

#16277 removed a universal cast to
a cupy array in `_from_arrays`. Although the typing suggested this
method should only accept `np.ndarray` or `cupy.ndarray`, the method is
actually called on any object implementing `__cuda_array_interface__` or
`__array_interface__` (e.g. a numba device array), which caused a
performance regression in cuspatial
(see rapidsai/cuspatial#1413).

closes #16434


```python
In [1]: import cupy, numba.cuda

In [2]: import cudf

In [3]: cupy_array = cupy.ones((10_000, 100))

In [4]: %timeit cudf.DataFrame(cupy_array)
3.88 ms ± 52 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [5]: %timeit cudf.DataFrame(numba.cuda.to_device(cupy_array))
3.99 ms ± 35.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

---------

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
  • Loading branch information
mroeschke and bdice authored Aug 1, 2024
1 parent ed5e4aa commit 445a75f
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 14 deletions.
7 changes: 7 additions & 0 deletions python/cudf/benchmarks/API/bench_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import string

import numba.cuda
import numpy
import pytest
import pytest_cases
Expand All @@ -16,6 +17,12 @@ def bench_construction(benchmark, N):
benchmark(cudf.DataFrame, {None: cupy.random.rand(N)})


@pytest.mark.parametrize("N", [100, 100_000])
@pytest.mark.pandas_incompatible
def bench_construction_numba_device_array(benchmark, N):
    """Benchmark ``cudf.DataFrame`` construction from a numba device array.

    The input is a ``(100, N)`` array of ones that is copied to the device
    with ``numba.cuda.to_device`` before the timed call, so only the
    ``cudf.DataFrame`` constructor itself is measured.
    """
    # Build the input up front; only the constructor call is benchmarked.
    host_values = numpy.ones((100, N))
    device_values = numba.cuda.to_device(host_values)
    benchmark(cudf.DataFrame, device_values)


@benchmark_with_object(cls="dataframe", dtype="float", cols=6)
@pytest.mark.parametrize(
"expr", ["a+b", "a+b+c+d+e", "a / (sin(a) + cos(b)) * tanh(d*e*f)"]
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1843,8 +1843,7 @@ def as_column(
else:
mask = None

arbitrary = cupy.asarray(arbitrary)
arbitrary = cupy.ascontiguousarray(arbitrary)
arbitrary = cupy.asarray(arbitrary, order="C")

data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write"))
col = build_column(data, dtype=arbitrary.dtype, mask=mask)
Expand Down
34 changes: 22 additions & 12 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,6 @@ def __init__(
)
elif hasattr(data, "__cuda_array_interface__"):
arr_interface = data.__cuda_array_interface__

# descr is an optional field of the _cuda_ary_iface_
if "descr" in arr_interface:
if len(arr_interface["descr"]) == 1:
Expand Down Expand Up @@ -5835,17 +5834,18 @@ def from_records(
@_performance_tracking
def _from_arrays(
cls,
data: np.ndarray | cupy.ndarray,
data,
index=None,
columns=None,
nan_as_null=False,
):
"""Convert a numpy/cupy array to DataFrame.
"""
Convert an object implementing an array interface to DataFrame.
Parameters
----------
data : numpy/cupy array of ndim 1 or 2,
dimensions greater than 2 are not supported yet.
data : object of ndim 1 or 2,
Object implementing ``__array_interface__`` or ``__cuda_array_interface__``
index : Index or array-like
Index to use for resulting frame. Will default to
RangeIndex if no indexing information part of input data and
Expand All @@ -5857,13 +5857,23 @@ def _from_arrays(
-------
DataFrame
"""
if data.ndim != 1 and data.ndim != 2:
array_data: np.ndarray | cupy.ndarray
if hasattr(data, "__cuda_array_interface__"):
array_data = cupy.asarray(data, order="F")
elif hasattr(data, "__array_interface__"):
array_data = np.asarray(data, order="F")
else:
raise ValueError(
f"records dimension expected 1 or 2 but found: {data.ndim}"
"data must be an object implementing __cuda_array_interface__ or __array_interface__"
)

if array_data.ndim not in {1, 2}:
raise ValueError(
f"records dimension expected 1 or 2 but found: {array_data.ndim}"
)

if data.ndim == 2:
num_cols = data.shape[1]
num_cols = array_data.shape[1]
else:
# Since we validate ndim to be either 1 or 2 above,
# this case can be assumed to be ndim == 1.
Expand All @@ -5881,14 +5891,14 @@ def _from_arrays(
raise ValueError("Duplicate column names are not allowed")
names = columns

if data.ndim == 2:
if array_data.ndim == 2:
ca_data = {
k: column.as_column(data[:, i], nan_as_null=nan_as_null)
k: column.as_column(array_data[:, i], nan_as_null=nan_as_null)
for i, k in enumerate(names)
}
elif data.ndim == 1:
elif array_data.ndim == 1:
ca_data = {
names[0]: column.as_column(data, nan_as_null=nan_as_null)
names[0]: column.as_column(array_data, nan_as_null=nan_as_null)
}

if index is not None:
Expand Down

0 comments on commit 445a75f

Please sign in to comment.