rapidsai · rapids-bot · Jul 14, 2023 · Jun 23, 2023 · Jun 6, 2023 · Jun 6, 2023
@@ -72,6 +72,40 @@ def bench_sample(benchmark, dataframe, axis, frac, random_state):
     )
 
 
+@benchmark_with_object(cls="dataframe", dtype="int")
+@pytest.mark.parametrize("frac", [0, 0.25, 0.5, 0.75, 1])
+def bench_iloc_getitem_indices(benchmark, dataframe, frac):
+    """Benchmark iloc lookup with an array of integer positions.
+
+    ``frac`` is the fraction of the frame's rows that are selected;
+    0 gives an empty selection and 1 selects every row.
+    """
+    # Fixed seed so every run draws the same positions.
+    rs = numpy.random.RandomState(seed=42)
+    n = int(len(dataframe) * frac)
+    # replace=False: the n positions are distinct.
+    values = rs.choice(len(dataframe), size=n, replace=False)
+    benchmark(dataframe.iloc.__getitem__, values)
+
+
+@benchmark_with_object(cls="dataframe", dtype="int")
+@pytest.mark.parametrize("frac", [0, 0.25, 0.5, 0.75, 1])
+def bench_iloc_getitem_mask(benchmark, dataframe, frac):
+    """Benchmark iloc lookup with a boolean mask.
+
+    ``frac`` is the fraction of mask entries set to True; the mask has
+    one entry per row of the frame.
+    """
+    # Fixed seed so every run sets the same mask entries.
+    rs = numpy.random.RandomState(seed=42)
+    n = int(len(dataframe) * frac)
+    # replace=False: exactly n distinct positions are switched on.
+    values = rs.choice(len(dataframe), size=n, replace=False)
+    mask = numpy.zeros(len(dataframe), dtype=bool)
+    mask[values] = True
+    benchmark(dataframe.iloc.__getitem__, mask)
+
+
+@benchmark_with_object(cls="dataframe", dtype="int")
+@pytest.mark.parametrize(
+    "slice",
+    [slice(None), slice(0, 0, 1), slice(1, None, 10), slice(None, -1, -1)],
+)
+def bench_iloc_getitem_slice(benchmark, dataframe, slice):
+    """Benchmark iloc lookup with a slice.
+
+    The parametrized cases are: the full slice, an empty slice, a
+    strided slice, and a slice with a negative step.
+
+    Note that the ``slice`` parameter shadows the builtin of the same
+    name inside this function body; the builtin is not needed here.
+    """
+    benchmark(dataframe.iloc.__getitem__, slice)
+
+
+@benchmark_with_object(cls="dataframe", dtype="int")
+def bench_iloc_getitem_scalar(benchmark, dataframe):
+    """Benchmark iloc lookup with a single integer (the middle position)."""
+    benchmark(dataframe.iloc.__getitem__, len(dataframe) // 2)
+
+
 @benchmark_with_object(cls="dataframe", dtype="int", nulls=False, cols=6)
 @pytest.mark.parametrize(
     "num_key_cols",

@@ -1,11 +1,15 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 """Benchmarks of internal DataFrame methods."""
 
 from utils import benchmark_with_object, make_boolean_mask_column
 
+from cudf.core.copy_types import as_boolean_mask
+
 
 @benchmark_with_object(cls="dataframe", dtype="int")
 def bench_apply_boolean_mask(benchmark, dataframe):
     mask = make_boolean_mask_column(len(dataframe))
-    benchmark(dataframe._apply_boolean_mask, mask)
+    # as_boolean_mask(...) is evaluated here, as an argument, before
+    # benchmark() is invoked with _apply_boolean_mask as the callable.
+    benchmark(
+        dataframe._apply_boolean_mask, as_boolean_mask(mask, len(dataframe))
+    )
@@ -86,7 +86,7 @@ def is_integer(obj):
     bool
     """
     if isinstance(obj, cudf.Scalar):
-        return pd.api.types.is_integer(obj.dtype)
+        return pd.api.types.is_integer_dtype(obj.dtype)
     return pd.api.types.is_integer(obj)
 
 
@@ -154,10 +154,8 @@ def _is_scalar_or_zero_d_array(val):
         Return True if given object is scalar.
     """
     return (
-        (isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0)
-        or (isinstance(val, pd.Categorical) and len(val) == 1)
-        or is_scalar(val)
-    )
+        isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0
+    ) or is_scalar(val)
 
 
 # TODO: We should be able to reuse the pandas function for this, need to figure

@@ -5,7 +5,7 @@
 import pickle
 import warnings
 from functools import cached_property
-from typing import Any, Set
+from typing import Any, Set, Tuple
 
 import pandas as pd
 from typing_extensions import Self
@@ -71,6 +71,10 @@ class BaseIndex(Serializable):
     _accessors: Set[Any] = set()
     _data: ColumnAccessor
 
+    @property
+    def _columns(self) -> Tuple[Any, ...]:
+        """Tuple-valued property; not implemented on this base class.
+
+        Always raises NotImplementedError here.
+        NOTE(review): presumably concrete index classes override this
+        to return their underlying columns -- confirm against subclasses.
+        """
+        raise NotImplementedError
+
     @cached_property
     def _values(self) -> ColumnBase:
         raise NotImplementedError

@@ -4,7 +4,7 @@
 import cupy as cp
 import numpy as np
 
-from cudf.core.column import as_column
+from cudf.core import copy_types as ct
 from cudf.core.index import Index, RangeIndex
 from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.scalar import Scalar
@@ -170,7 +170,9 @@ def _index_or_values_interpolation(column, index=None):
         return column
 
     to_interp = IndexedFrame(data={None: column}, index=index)
-    known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))
+    known_x_and_y = to_interp._apply_boolean_mask(
+        ct.as_boolean_mask(~mask, len(to_interp))
+    )
 
     known_x = known_x_and_y._index._column.values
     known_y = known_x_and_y._data.columns[0].values

@@ -548,14 +548,14 @@ def element_indexing(self, index: int):
 
     def slice(
         self, start: int, stop: int, stride: Optional[int] = None
-    ) -> ColumnBase:
+    ) -> Self:
         stride = 1 if stride is None else stride
         if start < 0:
             start = start + len(self)
         if stop < 0 and not (stride < 0 and stop == -1):
             stop = stop + len(self)
         if (stride > 0 and start >= stop) or (stride < 0 and start <= stop):
-            return column_empty(0, self.dtype, masked=True)
+            return cast(Self, column_empty(0, self.dtype, masked=True))
         # compute mask slice
         if stride == 1:
             return libcudf.copying.column_slice(self, [start, stop])[

@@ -20,6 +20,7 @@
 import pandas as pd
 from packaging.version import Version
 from pandas.api.types import is_bool
+from typing_extensions import Self
 
 import cudf
 from cudf.core import column
@@ -476,6 +477,13 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True):
         self._data[key] = value
         self._clear_cache()
 
+    def _select_by_names(self, names: abc.Sequence) -> Self:
+        """Build a new accessor holding only the entries in ``names``.
+
+        Each name is looked up with ``self[key]`` and collected into a
+        dict in the order given by ``names``. That dict is passed to
+        ``self.__class__`` together with this object's ``multiindex``
+        and ``level_names``.
+        """
+        return self.__class__(
+            {key: self[key] for key in names},
+            multiindex=self.multiindex,
+            level_names=self.level_names,
+        )
+
     def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
         # Might be a generator
         key = tuple(key)

@@ -0,0 +1,140 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+from dataclasses import dataclass
+from functools import cached_property
+from typing import TYPE_CHECKING, Any, cast
+
+import cudf
+import cudf._lib as libcudf
+from cudf._lib.types import size_type_dtype
+
+if TYPE_CHECKING:
+    from cudf.core.column import NumericalColumn
+
+
+@dataclass
+class GatherMap:
+    """A witness to the validity of a given column as a gather map.
+
+    This object witnesses that the column it carries is suitable as a
+    gather map for an object with the specified number of rows.
+
+    It is used to provide a safe API for calling the internal
+    Frame._gather method without needing to (expensively) verify that
+    the provided column is valid for gathering.
+
+    The as_gather_map function in this module performs the checks and
+    constructs instances of this class.
+    """
+
+    #: The gather map
+    column: "NumericalColumn"
+    #: The number of rows the gather map has been validated for
+    nrows: int
+    #: Whether the gather map was validated for use with nullify=True
+    nullify: bool
+
+
+@dataclass
+class BooleanMask:
+    """A witness to the validity of a given column as a boolean mask.
+
+    This object witnesses that the column it carries is suitable as a
+    boolean mask for an object with number of rows equal to the mask's
+    length.
+
+    The as_boolean_mask function in this module performs the checks
+    and constructs instances of this class.
+    """
+
+    #: The boolean mask
+    column: "NumericalColumn"
+
+    @cached_property
+    def nrows(self):
+        """Number of rows the mask applies to: the length of ``column``.
+
+        Computed on first access and cached afterwards.
+        """
+        return len(self.column)
+
+
+def as_gather_map(
+    column: Any,
+    nrows: int,
+    *,
+    nullify: bool,
+    check_bounds: bool,
+) -> GatherMap:
+    """Turn a column into a gather map.
+
+    This augments the column with the information that it is valid as
+    a gather map for the specified number of rows with the given
+    nullification flag.
+
+    Parameters
+    ----------
+    column
+        The column to verify. It is first converted with ``as_column``.
+    nrows
+        The number of rows to verify against
+    nullify
+        Whether this gather map will be used with out-of-bounds
+        accesses nullified. When True, no bounds check is performed.
+    check_bounds
+        Actually check whether the map is in bounds. Set to False if
+        you know by construction that the map is in bounds.
+
+    Returns
+    -------
+    GatherMap
+        New object wrapping the column bearing witness to its
+        suitability as a gather map for columns with nrows.
+
+    Raises
+    ------
+    IndexError
+        If the column is of unsuitable dtype, or the map is not in bounds.
+
+    Notes
+    -----
+    The bounds check accepts entries in ``[-nrows, nrows)``. An empty
+    column is always accepted, whatever its dtype, and is cast to
+    ``size_type_dtype``.
+    """
+    column = cudf.core.column.as_column(column)
+    if len(column) == 0:
+        # Any empty column is valid as a gather map
+        # This is necessary because as_column([]) defaults to float64
+        # TODO: we should fix this further up.
+        # Alternately we can have an Optional[Column] and handle None
+        # specially in _gather.
+        return GatherMap(
+            cast("NumericalColumn", column.astype(size_type_dtype)),
+            nrows,
+            nullify,
+        )
+    if column.dtype.kind not in {"i", "u"}:
+        raise IndexError("Gather map must have integer dtype")
+    # Bounds are only verified when out-of-bounds entries will not be
+    # nullified and the caller has asked for the check.
+    if not nullify and check_bounds:
+        lo, hi = libcudf.reduce.minmax(column)
+        # Negative entries down to -nrows pass this check.
+        if lo.value < -nrows or hi.value >= nrows:
+            raise IndexError(f"Gather map is out of bounds for [0, {nrows})")
+    return GatherMap(cast("NumericalColumn", column), nrows, nullify)
+
+
+def as_boolean_mask(column: Any, nrows: int) -> BooleanMask:
+    """Turn a column into a boolean mask.
+
+    This augments the column with information that it is valid as a
+    boolean mask for columns with a given number of rows.
+
+    Parameters
+    ----------
+    column
+        The column to verify. It is first converted with ``as_column``.
+    nrows
+        The number of rows to verify against
+
+    Returns
+    -------
+    BooleanMask
+        New object wrapping the column bearing witness to its
+        suitability as a boolean mask for columns with matching row
+        count.
+
+    Raises
+    ------
+    IndexError
+        If the column is of unsuitable dtype, or if its length is not
+        equal to nrows.
+    """
+    column = cudf.core.column.as_column(column)
+    if column.dtype.kind != "b":
+        raise IndexError("Boolean mask must have bool dtype")
+    # The mask must have exactly one entry per row.
+    if (n := len(column)) != nrows:
+        raise IndexError(
+            f"Column with {n} rows not suitable "
+            f"as a boolean mask for {nrows} rows"
+        )
+    return BooleanMask(cast("NumericalColumn", column))