From 445a75fca4d8d12d2230fef507dbfb696b6968fb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 1 Aug 2024 02:45:30 -1000 Subject: [PATCH] Ensure objects with __interface__ are converted to cupy/numpy arrays (#16436) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://github.com/rapidsai/cudf/pull/16277 removed a universal cast to a `cupy.array` in `_from_arrays`. Although the typing suggested this method should only accept `np.ndarray` or `cupy.ndarray`, this method is called on any object implementing the `__cuda_array_interface__` or `__array_interface__` (e.g. `numba.DeviceArray`) which caused a performance regression in cuspatial https://github.com/rapidsai/cuspatial/issues/1413 closes #16434 ```python In [1]: import cupy, numba.cuda In [2]: import cudf In [3]: cupy_array = cupy.ones((10_000, 100)) In [4]: %timeit cudf.DataFrame(cupy_array) 3.88 ms ± 52 μs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [5]: %timeit cudf.DataFrame(numba.cuda.to_device(cupy_array)) 3.99 ms ± 35.4 μs per loop (mean ± std. dev. 
of 7 runs, 100 loops each) ``` --------- Co-authored-by: Bradley Dice --- python/cudf/benchmarks/API/bench_dataframe.py | 7 ++++ python/cudf/cudf/core/column/column.py | 3 +- python/cudf/cudf/core/dataframe.py | 34 ++++++++++++------- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index 59d73015962..ba243eb6a7c 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -4,6 +4,7 @@ import string +import numba.cuda import numpy import pytest import pytest_cases @@ -16,6 +17,12 @@ def bench_construction(benchmark, N): benchmark(cudf.DataFrame, {None: cupy.random.rand(N)}) +@pytest.mark.parametrize("N", [100, 100_000]) +@pytest.mark.pandas_incompatible +def bench_construction_numba_device_array(benchmark, N): + benchmark(cudf.DataFrame, numba.cuda.to_device(numpy.ones((100, N)))) + + @benchmark_with_object(cls="dataframe", dtype="float", cols=6) @pytest.mark.parametrize( "expr", ["a+b", "a+b+c+d+e", "a / (sin(a) + cos(b)) * tanh(d*e*f)"] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 32e6aade65b..7e0d8ced595 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1843,8 +1843,7 @@ def as_column( else: mask = None - arbitrary = cupy.asarray(arbitrary) - arbitrary = cupy.ascontiguousarray(arbitrary) + arbitrary = cupy.asarray(arbitrary, order="C") data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) col = build_column(data, dtype=arbitrary.dtype, mask=mask) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1d7136e61e3..dca0c0b821a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -782,7 +782,6 @@ def __init__( ) elif hasattr(data, "__cuda_array_interface__"): arr_interface = data.__cuda_array_interface__ - # descr 
is an optional field of the _cuda_ary_iface_ if "descr" in arr_interface: if len(arr_interface["descr"]) == 1: @@ -5835,17 +5834,18 @@ def from_records( @_performance_tracking def _from_arrays( cls, - data: np.ndarray | cupy.ndarray, + data, index=None, columns=None, nan_as_null=False, ): - """Convert a numpy/cupy array to DataFrame. + """ + Convert an object implementing an array interface to DataFrame. Parameters ---------- - data : numpy/cupy array of ndim 1 or 2, - dimensions greater than 2 are not supported yet. + data : object of ndim 1 or 2, + Object implementing ``__array_interface__`` or ``__cuda_array_interface__`` index : Index or array-like Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and @@ -5857,13 +5857,23 @@ def _from_arrays( ------- DataFrame """ - if data.ndim != 1 and data.ndim != 2: + array_data: np.ndarray | cupy.ndarray + if hasattr(data, "__cuda_array_interface__"): + array_data = cupy.asarray(data, order="F") + elif hasattr(data, "__array_interface__"): + array_data = np.asarray(data, order="F") + else: raise ValueError( - f"records dimension expected 1 or 2 but found: {data.ndim}" + "data must be an object implementing __cuda_array_interface__ or __array_interface__" + ) + + if array_data.ndim not in {1, 2}: + raise ValueError( + f"records dimension expected 1 or 2 but found: {array_data.ndim}" ) if data.ndim == 2: - num_cols = data.shape[1] + num_cols = array_data.shape[1] else: # Since we validate ndim to be either 1 or 2 above, # this case can be assumed to be ndim == 1. 
@@ -5881,14 +5891,14 @@ def _from_arrays( raise ValueError("Duplicate column names are not allowed") names = columns - if data.ndim == 2: + if array_data.ndim == 2: ca_data = { - k: column.as_column(data[:, i], nan_as_null=nan_as_null) + k: column.as_column(array_data[:, i], nan_as_null=nan_as_null) for i, k in enumerate(names) } - elif data.ndim == 1: + elif array_data.ndim == 1: ca_data = { - names[0]: column.as_column(data, nan_as_null=nan_as_null) + names[0]: column.as_column(array_data, nan_as_null=nan_as_null) } if index is not None: