Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement iloc-getitem using parse-don't-validate approach #13534

Merged
merged 25 commits into from
Jul 14, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
93c1d21
Add iloc-getitem benchmarks
wence- Jun 23, 2023
64b093e
Length-1 categoricals are not scalars
wence- Jun 6, 2023
6b649bd
Type annotate the frame in SeriesIlocIndexer
wence- Jun 6, 2023
5094f49
Implement iloc-getitem using parse-don't-validate approach
wence- Jun 5, 2023
8ad58a8
TypeAlias from typing_extensions for py 3.9
wence- Jun 8, 2023
b43d93a
Use _gather for scalar indexing
wence- Jun 8, 2023
a479a34
Introduce GatherMap and BooleanMask
wence- Jun 22, 2023
5e4af4a
Minor simplifications
wence- Jun 23, 2023
bc44c3f
Update benchmark usage of _apply_boolean_mask
wence- Jun 23, 2023
503f4ae
Reinstate special RangeIndex handling
wence- Jun 23, 2023
19637fa
Use Union rather than | syntax for Py 3.9 compat
wence- Jun 29, 2023
dbf56b8
Indexer dataclasses have the same field name
wence- Jun 29, 2023
ad1b21a
Merge remote-tracking branch 'upstream/branch-23.08' into wence/fea/i…
wence- Jun 30, 2023
b763ebb
Refactor GatherMap and BooleanMask construction
wence- Jul 11, 2023
12e66fc
Remove walrus
wence- Jul 11, 2023
b92638d
Adapt benchmark
wence- Jul 11, 2023
99d3da1
Minor docstring fixes
wence- Jul 11, 2023
b046539
Clarify comment and fix keep_index handling in _slice
wence- Jul 11, 2023
1ace86a
Clarify scope of pytest.raises
wence- Jul 11, 2023
e547372
Numpydoc formatting
wence- Jul 11, 2023
892ee14
Simplify clamping to range
wence- Jul 11, 2023
803fbc0
Fix some cases missed in refactor
wence- Jul 12, 2023
762eb1c
A few more small fixes
wence- Jul 12, 2023
943c58e
Don't xfail, but rather pytest.raises
wence- Jul 12, 2023
dffdc4e
Merge remote-tracking branch 'upstream/branch-23.08' into wence/fea/i…
wence- Jul 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions python/cudf/benchmarks/API/bench_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,40 @@ def bench_sample(benchmark, dataframe, axis, frac, random_state):
)


@benchmark_with_object(cls="dataframe", dtype="int")
@pytest.mark.parametrize("frac", [0, 0.25, 0.5, 0.75, 1])
def bench_iloc_getitem_indices(benchmark, dataframe, frac):
    """Benchmark ``iloc.__getitem__`` with an array of row positions.

    ``frac`` is the fraction of the frame's rows to select. Positions
    are drawn without replacement using a fixed seed so that every run
    indexes with the same values.
    """
    nrows = len(dataframe)
    rng = numpy.random.RandomState(seed=42)
    positions = rng.choice(nrows, size=int(nrows * frac), replace=False)
    benchmark(dataframe.iloc.__getitem__, positions)


@benchmark_with_object(cls="dataframe", dtype="int")
@pytest.mark.parametrize("frac", [0, 0.25, 0.5, 0.75, 1])
def bench_iloc_getitem_mask(benchmark, dataframe, frac):
    """Benchmark ``iloc.__getitem__`` with a boolean mask.

    ``frac`` is the fraction of mask entries set to True. The selected
    positions are drawn without replacement using a fixed seed.
    """
    nrows = len(dataframe)
    rng = numpy.random.RandomState(seed=42)
    selected = rng.choice(nrows, size=int(nrows * frac), replace=False)
    # Start from an all-False mask and switch on the chosen positions.
    mask = numpy.zeros(nrows, dtype=bool)
    mask[selected] = True
    benchmark(dataframe.iloc.__getitem__, mask)


@benchmark_with_object(cls="dataframe", dtype="int")
@pytest.mark.parametrize(
    "slice",
    [slice(None), slice(0, 0, 1), slice(1, None, 10), slice(None, -1, -1)],
)
def bench_iloc_getitem_slice(benchmark, dataframe, slice):
    """Benchmark ``iloc.__getitem__`` with a slice.

    The parametrised slices cover a full slice, an empty slice, a
    strided slice and a negative-step slice.
    """
    getitem = dataframe.iloc.__getitem__
    benchmark(getitem, slice)


@benchmark_with_object(cls="dataframe", dtype="int")
def bench_iloc_getitem_scalar(benchmark, dataframe):
    """Benchmark ``iloc.__getitem__`` with a single integer position.

    The position used is the middle row of the frame.
    """
    midpoint = len(dataframe) // 2
    benchmark(dataframe.iloc.__getitem__, midpoint)


@benchmark_with_object(cls="dataframe", dtype="int", nulls=False, cols=6)
@pytest.mark.parametrize(
"num_key_cols",
Expand Down
8 changes: 6 additions & 2 deletions python/cudf/benchmarks/internal/bench_dataframe_internal.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.

"""Benchmarks of internal DataFrame methods."""

from utils import benchmark_with_object, make_boolean_mask_column

from cudf.core.copy_types import as_boolean_mask


@benchmark_with_object(cls="dataframe", dtype="int")
def bench_apply_boolean_mask(benchmark, dataframe):
mask = make_boolean_mask_column(len(dataframe))
benchmark(dataframe._apply_boolean_mask, mask)
benchmark(
dataframe._apply_boolean_mask, as_boolean_mask(mask, len(dataframe))
)
8 changes: 3 additions & 5 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def is_integer(obj):
bool
"""
if isinstance(obj, cudf.Scalar):
return pd.api.types.is_integer(obj.dtype)
return pd.api.types.is_integer_dtype(obj.dtype)
return pd.api.types.is_integer(obj)


Expand Down Expand Up @@ -154,10 +154,8 @@ def _is_scalar_or_zero_d_array(val):
Return True if given object is scalar.
"""
return (
(isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0)
or (isinstance(val, pd.Categorical) and len(val) == 1)
or is_scalar(val)
)
isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0
shwina marked this conversation as resolved.
Show resolved Hide resolved
) or is_scalar(val)


# TODO: We should be able to reuse the pandas function for this, need to figure
Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pickle
import warnings
from functools import cached_property
from typing import Any, Set
from typing import Any, Set, Tuple

import pandas as pd
from typing_extensions import Self
Expand Down Expand Up @@ -71,6 +71,10 @@ class BaseIndex(Serializable):
_accessors: Set[Any] = set()
_data: ColumnAccessor

@property
def _columns(self) -> Tuple[Any, ...]:
raise NotImplementedError
wence- marked this conversation as resolved.
Show resolved Hide resolved

@cached_property
def _values(self) -> ColumnBase:
raise NotImplementedError
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import cupy as cp
import numpy as np

from cudf.core.column import as_column
from cudf.core import copy_types as ct
wence- marked this conversation as resolved.
Show resolved Hide resolved
from cudf.core.index import Index, RangeIndex
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.scalar import Scalar
Expand Down Expand Up @@ -170,7 +170,9 @@ def _index_or_values_interpolation(column, index=None):
return column

to_interp = IndexedFrame(data={None: column}, index=index)
known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))
known_x_and_y = to_interp._apply_boolean_mask(
ct.as_boolean_mask(~mask, len(to_interp))
)

known_x = known_x_and_y._index._column.values
known_y = known_x_and_y._data.columns[0].values
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,14 +548,14 @@ def element_indexing(self, index: int):

def slice(
self, start: int, stop: int, stride: Optional[int] = None
) -> ColumnBase:
) -> Self:
stride = 1 if stride is None else stride
if start < 0:
start = start + len(self)
if stop < 0 and not (stride < 0 and stop == -1):
stop = stop + len(self)
if (stride > 0 and start >= stop) or (stride < 0 and start <= stop):
return column_empty(0, self.dtype, masked=True)
return cast(Self, column_empty(0, self.dtype, masked=True))
# compute mask slice
if stride == 1:
return libcudf.copying.column_slice(self, [start, stop])[
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pandas as pd
from packaging.version import Version
from pandas.api.types import is_bool
from typing_extensions import Self

import cudf
from cudf.core import column
Expand Down Expand Up @@ -476,6 +477,13 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True):
self._data[key] = value
self._clear_cache()

def _select_by_names(self, names: abc.Sequence) -> Self:
return self.__class__(
{key: self[key] for key in names},
multiindex=self.multiindex,
level_names=self.level_names,
)

def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
# Might be a generator
key = tuple(key)
Expand Down
140 changes: 140 additions & 0 deletions python/cudf/cudf/core/copy_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
wence- marked this conversation as resolved.
Show resolved Hide resolved
from dataclasses import dataclass
from functools import cached_property
from typing import TYPE_CHECKING, Any, cast

import cudf
import cudf._lib as libcudf
from cudf._lib.types import size_type_dtype

if TYPE_CHECKING:
from cudf.core.column import NumericalColumn


@dataclass
class GatherMap:
    """Witness that a column is valid for use as a gather map.

    Holding an instance of this class is evidence that the wrapped
    column is suitable as a gather map for an object with ``nrows``
    rows.

    It provides a safe API for calling the internal Frame._gather
    method without needing to (expensively) re-verify that the
    provided column is valid for gathering.
    """

    #: The column used as the gather map
    column: "NumericalColumn"
    #: The row count this gather map has been validated against
    nrows: int
    #: Whether the validation was carried out for nullify=True
    nullify: bool


@dataclass
class BooleanMask:
    """Witness that a column is valid for use as a boolean mask.

    Holding an instance of this class is evidence that the wrapped
    column is suitable as a boolean mask for an object whose number of
    rows equals the length of the mask.
    """

    #: The column used as the boolean mask
    column: "NumericalColumn"

    @cached_property
    def nrows(self) -> int:
        """Number of rows the mask applies to (the mask's length)."""
        row_count = len(self.column)
        return row_count


def as_gather_map(
    column: Any,
    nrows: int,
    *,
    nullify: bool,
    check_bounds: bool,
) -> GatherMap:
    """Wrap a column as a validated gather map.

    The input is converted to a column and paired with the facts that
    it is usable as a gather map for ``nrows`` rows under the given
    nullification flag.

    Parameters
    ----------
    column
        The candidate gather map (anything ``as_column`` accepts).
    nrows
        The number of rows to validate against.
    nullify
        Whether the gather map will be used with out-of-bounds
        accesses nullified. No bounds check is performed when True.
    check_bounds
        Whether to actually check that the map is in bounds. Pass
        False if the map is known by construction to be in bounds.

    Returns
    -------
    GatherMap
        New object wrapping the column and bearing witness to its
        suitability as a gather map for columns with nrows.

    Raises
    ------
    IndexError
        If the column is of unsuitable dtype, or the map is not in
        bounds.
    """
    candidate = cudf.core.column.as_column(column)
    if len(candidate) == 0:
        # Any empty column is a valid gather map. The cast is needed
        # because as_column([]) defaults to float64.
        # TODO: we should fix this further up.
        # Alternately we can have an Optional[Column] and handle None
        # specially in _gather.
        empty_map = candidate.astype(size_type_dtype)
        return GatherMap(cast("NumericalColumn", empty_map), nrows, nullify)
    if candidate.dtype.kind not in {"i", "u"}:
        raise IndexError("Gather map must have integer dtype")
    if check_bounds and not nullify:
        # Valid positions are [-nrows, nrows): negative values are
        # accepted down to -nrows.
        smallest, largest = libcudf.reduce.minmax(candidate)
        if smallest.value < -nrows or largest.value >= nrows:
            raise IndexError(f"Gather map is out of bounds for [0, {nrows})")
    return GatherMap(cast("NumericalColumn", candidate), nrows, nullify)


def as_boolean_mask(column: Any, nrows: int) -> BooleanMask:
    """Wrap a column as a validated boolean mask.

    The input is converted to a column and paired with the fact that
    it is usable as a boolean mask for columns with a given number of
    rows.

    Parameters
    ----------
    column
        The candidate mask (anything ``as_column`` accepts).
    nrows
        The number of rows to validate against.

    Returns
    -------
    BooleanMask
        New object wrapping the column and bearing witness to its
        suitability as a boolean mask for columns with matching row
        count.

    Raises
    ------
    IndexError
        If the column is of unsuitable dtype, or its length differs
        from ``nrows``.
    """
    mask = cudf.core.column.as_column(column)
    if mask.dtype.kind != "b":
        raise IndexError("Boolean mask must have bool dtype")
    n = len(mask)
    if n != nrows:
        raise IndexError(
            f"Column with {n} rows not suitable "
            f"as a boolean mask for {nrows} rows"
        )
    return BooleanMask(cast("NumericalColumn", mask))
Loading