Skip to content

Commit

Permalink
Improve performance of Series.to_numpy/to_cupy (#15792)
Browse files Browse the repository at this point in the history
xref #11648

Essentially refactors `Frame._to_array` to short-circuit some checks for a `Frame` with one column or `ndim == 1`.

```python
In [1]: import cudf

In [2]: s = cudf.Series(range(10000))

In [3]: %timeit s.to_cupy()
252 µs ± 3.47 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)  # PR

419 µs ± 2.21 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)  # branch 24.06
```

I needed to add `Frame.ndim`, which raises `NotImplementedError` (until `Frame` actually becomes an ABC).

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #15792
  • Loading branch information
mroeschke authored May 21, 2024
1 parent b4daa16 commit 60d5717
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 40 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def name(self):
raise NotImplementedError

@property # type: ignore
def ndim(self): # noqa: D401
def ndim(self) -> int: # noqa: D401
"""Number of dimensions of the underlying data, by definition 1."""
return 1

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,7 +1234,7 @@ def dtypes(self):
return pd.Series(self._dtypes, dtype="object")

@property
def ndim(self):
def ndim(self) -> int:
"""Dimension of the data. DataFrame ndim is always 2."""
return 2

Expand Down
85 changes: 49 additions & 36 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import itertools
import operator
import pickle
import types
import warnings
from collections import abc
from typing import (
Expand Down Expand Up @@ -91,6 +92,10 @@ def _dtypes(self):
zip(self._data.names, (col.dtype for col in self._data.columns))
)

@property
def ndim(self) -> int:
raise NotImplementedError()

@_cudf_nvtx_annotate
def serialize(self):
# TODO: See if self._data can be serialized outright
Expand Down Expand Up @@ -417,51 +422,60 @@ def __arrow_array__(self, type=None):
@_cudf_nvtx_annotate
def _to_array(
self,
get_column_values: Callable,
make_empty_matrix: Callable,
get_array: Callable,
module: types.ModuleType,
copy: bool,
dtype: Union[Dtype, None] = None,
na_value=None,
) -> Union[cupy.ndarray, np.ndarray]:
) -> Union[cupy.ndarray, numpy.ndarray]:
# Internal function to implement to_cupy and to_numpy, which are nearly
# identical except for the attribute they access to generate values.

def get_column_values_na(col):
def to_array(
col: ColumnBase, dtype: np.dtype
) -> Union[cupy.ndarray, numpy.ndarray]:
if na_value is not None:
col = col.fillna(na_value)
return get_column_values(col)
array = get_array(col)
casted_array = module.asarray(array, dtype=dtype)
if copy and casted_array is array:
# Don't double copy after asarray
casted_array = casted_array.copy()
return casted_array

# Early exit for an empty Frame.
ncol = self._num_columns
if ncol == 0:
return make_empty_matrix(
shape=(len(self), ncol), dtype=np.dtype("float64"), order="F"
return module.empty(
shape=(len(self), ncol),
dtype=numpy.dtype("float64"),
order="F",
)

if dtype is None:
dtypes = [col.dtype for col in self._data.values()]
for dtype in dtypes:
if isinstance(
dtype,
(
cudf.ListDtype,
cudf.core.dtypes.DecimalDtype,
cudf.StructDtype,
),
):
raise NotImplementedError(
f"{dtype} cannot be exposed as a cupy array"
)
dtype = find_common_type(dtypes)
if ncol == 1:
dtype = next(iter(self._data.values())).dtype
else:
dtype = find_common_type(
[col.dtype for col in self._data.values()]
)

matrix = make_empty_matrix(
shape=(len(self), ncol), dtype=dtype, order="F"
)
for i, col in enumerate(self._data.values()):
# TODO: col.values may fail if there is nullable data or an
# unsupported dtype. We may want to catch and provide a more
# suitable error.
matrix[:, i] = get_column_values_na(col)
return matrix
if not isinstance(dtype, numpy.dtype):
raise NotImplementedError(
f"{dtype} cannot be exposed as an array"
)

if self.ndim == 1:
return to_array(self._data.columns[0], dtype)
else:
matrix = module.empty(
shape=(len(self), ncol), dtype=dtype, order="F"
)
for i, col in enumerate(self._data.values()):
# TODO: col.values may fail if there is nullable data or an
# unsupported dtype. We may want to catch and provide a more
# suitable error.
matrix[:, i] = to_array(col, dtype)
return matrix

# TODO: As of now, calling cupy.asarray is _much_ faster than calling
# to_cupy. We should investigate the reasons why and whether we can provide
Expand Down Expand Up @@ -496,10 +510,9 @@ def to_cupy(
cupy.ndarray
"""
return self._to_array(
(lambda col: col.values.copy())
if copy
else (lambda col: col.values),
cupy.empty,
lambda col: col.values,
cupy,
copy,
dtype,
na_value,
)
Expand Down Expand Up @@ -536,7 +549,7 @@ def to_numpy(
)

return self._to_array(
(lambda col: col.values_host), np.empty, dtype, na_value
lambda col: col.values_host, numpy, copy, dtype, na_value
)

@_cudf_nvtx_annotate
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ def levels(self):

@property # type: ignore
@_cudf_nvtx_annotate
def ndim(self):
def ndim(self) -> int:
"""Dimension of the data. For MultiIndex ndim is always 2."""
return 2

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def name(self, value):

@property # type: ignore
@_cudf_nvtx_annotate
def ndim(self): # noqa: D401
def ndim(self) -> int: # noqa: D401
"""Number of dimensions of the underlying data, by definition 1."""
return 1

Expand Down

0 comments on commit 60d5717

Please sign in to comment.