Ensure objects with __interface__ are converted to cupy/numpy arrays (#…

…16436) #16277 removed a universal cast to a `cupy.array` in `_from_arrays`. Although the typing suggested this method should only accept `np.ndarray` or `cupy.ndarray`, the method is called on any object implementing `__cuda_array_interface__` or `__array_interface__` (e.g. `numba.DeviceArray`), which caused a performance regression in cuspatial (rapidsai/cuspatial#1413).

closes #16434

```python
In [1]: import cupy, numba.cuda

In [2]: import cudf

In [3]: cupy_array = cupy.ones((10_000, 100))

In [4]: %timeit cudf.DataFrame(cupy_array)
3.88 ms ± 52 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [5]: %timeit cudf.DataFrame(numba.cuda.to_device(cupy_array))
3.99 ms ± 35.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

---------

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
rapidsai · Aug 1, 2024 · 445a75f · 445a75f
1 parent ed5e4aa
commit 445a75f
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 14 deletions.
diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py
@@ -4,6 +4,7 @@
 
 import string
 
+import numba.cuda
 import numpy
 import pytest
 import pytest_cases
@@ -16,6 +17,12 @@ def bench_construction(benchmark, N):
     benchmark(cudf.DataFrame, {None: cupy.random.rand(N)})
 
 
+@pytest.mark.parametrize("N", [100, 100_000])
+@pytest.mark.pandas_incompatible
+def bench_construction_numba_device_array(benchmark, N):
+    benchmark(cudf.DataFrame, numba.cuda.to_device(numpy.ones((100, N))))
+
+
 @benchmark_with_object(cls="dataframe", dtype="float", cols=6)
 @pytest.mark.parametrize(
     "expr", ["a+b", "a+b+c+d+e", "a / (sin(a) + cos(b)) * tanh(d*e*f)"]

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1843,8 +1843,7 @@ def as_column(
         else:
             mask = None
 
-        arbitrary = cupy.asarray(arbitrary)
-        arbitrary = cupy.ascontiguousarray(arbitrary)
+        arbitrary = cupy.asarray(arbitrary, order="C")
 
         data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write"))
         col = build_column(data, dtype=arbitrary.dtype, mask=mask)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -782,7 +782,6 @@ def __init__(
             )
         elif hasattr(data, "__cuda_array_interface__"):
             arr_interface = data.__cuda_array_interface__
-
             # descr is an optional field of the _cuda_ary_iface_
             if "descr" in arr_interface:
                 if len(arr_interface["descr"]) == 1:
@@ -5835,17 +5834,18 @@ def from_records(
     @_performance_tracking
     def _from_arrays(
         cls,
-        data: np.ndarray | cupy.ndarray,
+        data,
         index=None,
         columns=None,
         nan_as_null=False,
     ):
-        """Convert a numpy/cupy array to DataFrame.
+        """
+        Convert an object implementing an array interface to DataFrame.
 
         Parameters
         ----------
-        data : numpy/cupy array of ndim 1 or 2,
-            dimensions greater than 2 are not supported yet.
+        data : object of ndim 1 or 2,
+            Object implementing ``__array_interface__`` or ``__cuda_array_interface__``
         index : Index or array-like
             Index to use for resulting frame. Will default to
             RangeIndex if no indexing information part of input data and
@@ -5857,13 +5857,23 @@ def _from_arrays(
         -------
         DataFrame
         """
-        if data.ndim != 1 and data.ndim != 2:
+        array_data: np.ndarray | cupy.ndarray
+        if hasattr(data, "__cuda_array_interface__"):
+            array_data = cupy.asarray(data, order="F")
+        elif hasattr(data, "__array_interface__"):
+            array_data = np.asarray(data, order="F")
+        else:
             raise ValueError(
-                f"records dimension expected 1 or 2 but found: {data.ndim}"
+                "data must be an object implementing __cuda_array_interface__ or __array_interface__"
+            )
+
+        if array_data.ndim not in {1, 2}:
+            raise ValueError(
+                f"records dimension expected 1 or 2 but found: {array_data.ndim}"
             )
 
         if data.ndim == 2:
-            num_cols = data.shape[1]
+            num_cols = array_data.shape[1]
         else:
             # Since we validate ndim to be either 1 or 2 above,
             # this case can be assumed to be ndim == 1.
@@ -5881,14 +5891,14 @@ def _from_arrays(
                 raise ValueError("Duplicate column names are not allowed")
             names = columns
 
-        if data.ndim == 2:
+        if array_data.ndim == 2:
             ca_data = {
-                k: column.as_column(data[:, i], nan_as_null=nan_as_null)
+                k: column.as_column(array_data[:, i], nan_as_null=nan_as_null)
                 for i, k in enumerate(names)
             }
-        elif data.ndim == 1:
+        elif array_data.ndim == 1:
             ca_data = {
-                names[0]: column.as_column(data, nan_as_null=nan_as_null)
+                names[0]: column.as_column(array_data, nan_as_null=nan_as_null)
             }
 
         if index is not None: