Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ensure_index to not unnecessarily shallow copy cudf.Index #16117

Merged
merged 1 commit into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,7 +1104,11 @@ def difference(self, other, sort=None):
f"of [None, False, True]; {sort} was passed."
)

other = cudf.Index(other, name=getattr(other, "name", self.name))
if not isinstance(other, BaseIndex):
other = cudf.Index(
other,
name=getattr(other, "name", self.name),
)

if not len(other):
res = self._get_reconciled_name_object(other).unique()
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from cudf.core.column import as_column
from cudf.core.copy_types import BooleanMask
from cudf.core.index import Index, RangeIndex
from cudf.core.index import RangeIndex, ensure_index
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.scalar import Scalar
from cudf.options import get_option
Expand Down Expand Up @@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
dtype="int64" if get_option("mode.pandas_compatible") else None,
).values

return labels, cats.values if return_cupy_array else Index(cats)
return labels, cats.values if return_cupy_array else ensure_index(cats)


def _linear_interpolation(column, index=None):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def cut(
)

# we return a categorical index, as we don't have a Categorical method
categorical_index = cudf.Index(col)
categorical_index = cudf.CategoricalIndex._from_data({None: col})

if isinstance(orig_x, (pd.Series, cudf.Series)):
# if we have a series input we return a series output
Expand Down
29 changes: 17 additions & 12 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.copy_types import BooleanMask
from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template
from cudf.core.index import BaseIndex, RangeIndex, _index_from_data, as_index
from cudf.core.index import (
BaseIndex,
RangeIndex,
_index_from_data,
ensure_index,
)
from cudf.core.indexed_frame import (
IndexedFrame,
_FrameIndexer,
Expand Down Expand Up @@ -338,7 +343,7 @@ def _getitem_tuple_arg(self, arg):
range(len(tmp_arg[0]))
)
},
index=as_index(tmp_arg[0]),
index=cudf.Index(tmp_arg[0]),
)
columns_df[cantor_name] = column.as_column(
range(len(columns_df))
Expand Down Expand Up @@ -702,7 +707,7 @@ def __init__(
data = data.reindex(index)
index = data.index
else:
index = cudf.Index(index)
index = ensure_index(index)
else:
index = data.index

Expand Down Expand Up @@ -751,7 +756,7 @@ def __init__(
if index is None:
self._index = RangeIndex(0)
else:
self._index = cudf.Index(index)
self._index = ensure_index(index)
if columns is not None:
rangeindex = isinstance(
columns, (range, pd.RangeIndex, cudf.RangeIndex)
Expand Down Expand Up @@ -909,7 +914,7 @@ def _init_from_series_list(self, data, columns, index):
f"not match length of index ({index_length})"
)

final_index = cudf.Index(index)
final_index = ensure_index(index)

series_lengths = list(map(len, data))
data = numeric_normalize_types(*data)
Expand Down Expand Up @@ -977,9 +982,9 @@ def _init_from_list_like(self, data, index=None, columns=None):
if index is None:
index = RangeIndex(start=0, stop=len(data))
else:
index = cudf.Index(index)
index = ensure_index(index)

self._index = cudf.Index(index)
self._index = index
# list-of-dicts case
if len(data) > 0 and isinstance(data[0], dict):
data = DataFrame.from_pandas(pd.DataFrame(data))
Expand Down Expand Up @@ -1085,7 +1090,7 @@ def _init_from_dict_like(

self._index = RangeIndex(0, num_rows)
else:
self._index = cudf.Index(index)
self._index = ensure_index(index)

if len(data):
self._data.multiindex = True
Expand Down Expand Up @@ -1491,7 +1496,7 @@ def memory_usage(self, index=True, deep=False):
names.append("Index")
return Series._from_data(
data={None: as_column(mem_usage)},
index=as_index(names),
index=cudf.Index(names),
)

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -4033,7 +4038,7 @@ def transpose(self):
# Set the old column names as the new index
result = self.__class__._from_data(
ColumnAccessor(dict(enumerate(result_columns)), verify=False),
index=as_index(index),
index=cudf.Index(index),
)
# Set the old index as the new column names
result.columns = columns
Expand Down Expand Up @@ -5657,7 +5662,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
}

if not is_scalar(index):
new_index = cudf.Index(index)
new_index = ensure_index(index)
else:
new_index = None

Expand Down Expand Up @@ -5741,7 +5746,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
}

if index is not None:
index = cudf.Index(index)
index = ensure_index(index)

if isinstance(columns, (pd.Index, cudf.Index)):
level_names = tuple(columns.names)
Expand Down
13 changes: 12 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,17 @@
from collections.abc import Generator, Iterable


def ensure_index(index_like: Any) -> BaseIndex:
    """
    Return ``index_like`` as an index, converting only when required.

    An argument that is already a ``BaseIndex`` instance is returned as
    the very same object, which avoids the shallow copy that calling
    ``cudf.Index(...)`` on it would make. Any other argument is passed
    to ``cudf.Index`` and the result of that call is returned.
    """
    if isinstance(index_like, BaseIndex):
        # Already an index: hand the caller's object straight back.
        return index_like
    return cudf.Index(index_like)


class IndexMeta(type):
"""Custom metaclass for Index that overrides instance/subclass tests."""

Expand Down Expand Up @@ -1569,7 +1580,7 @@ def append(self, other):
to_concat.append(obj)
else:
this = self
other = cudf.Index(other)
other = ensure_index(other)

if len(this) == 0 or len(other) == 0:
# we'll filter out empties later in ._concat
Expand Down
11 changes: 5 additions & 6 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
is_list_like,
is_scalar,
)
from cudf.core._base_index import BaseIndex
from cudf.core._compat import PANDAS_LT_300
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import ColumnBase, as_column
Expand All @@ -42,7 +41,7 @@
from cudf.core.dtypes import ListDtype
from cudf.core.frame import Frame
from cudf.core.groupby.groupby import GroupBy
from cudf.core.index import Index, RangeIndex, _index_from_data
from cudf.core.index import RangeIndex, _index_from_data, ensure_index
from cudf.core.missing import NA
from cudf.core.multiindex import MultiIndex
from cudf.core.resample import _Resampler
Expand All @@ -66,6 +65,8 @@
Dtype,
NotImplementedType,
)
from cudf.core._base_index import BaseIndex


doc_reset_index_template = """
Reset the index of the {klass}, or a level of it.
Expand Down Expand Up @@ -627,9 +628,7 @@ def index(self, value):
f"new values have {len(value)} elements"
)
# avoid unnecessary cast to Index
if not isinstance(value, BaseIndex):
value = Index(value)

value = ensure_index(value)
self._index = value

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -3595,7 +3594,7 @@ def _align_to_index(
sort: bool = True,
allow_non_unique: bool = False,
) -> Self:
index = cudf.Index(index)
index = ensure_index(index)

if self.index.equals(index):
return self
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
BaseIndex,
_get_indexer_basic,
_lexsorted_equal_range,
ensure_index,
)
from cudf.core.join._join_helpers import _match_join_keys
from cudf.utils.dtypes import is_column_like
Expand Down Expand Up @@ -172,7 +173,7 @@ def __init__(
"codes and is inconsistent!"
)

levels = [cudf.Index(level) for level in levels]
levels = [ensure_index(level) for level in levels]

if len(levels) != len(codes._data):
raise ValueError(
Expand Down
12 changes: 5 additions & 7 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
from cudf.core.column.struct import StructMethods
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.groupby.groupby import SeriesGroupBy, groupby_doc_template
from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, as_index
from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, ensure_index
from cudf.core.indexed_frame import (
IndexedFrame,
_FrameIndexer,
Expand Down Expand Up @@ -588,10 +588,8 @@ def __init__(
data = data.copy(deep=True)
name_from_data = data.name
column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
if isinstance(data, pd.Series):
index_from_data = cudf.Index(data.index)
elif isinstance(data, Series):
index_from_data = data.index
if isinstance(data, (pd.Series, Series)):
index_from_data = ensure_index(data.index)
elif isinstance(data, ColumnAccessor):
raise TypeError(
"Use cudf.Series._from_data for constructing a Series from "
Expand Down Expand Up @@ -642,7 +640,7 @@ def __init__(
name = name_from_data

if index is not None:
index = cudf.Index(index)
index = ensure_index(index)

if index_from_data is not None:
first_index = index_from_data
Expand Down Expand Up @@ -3191,7 +3189,7 @@ def quantile(

return Series._from_data(
data={self.name: result},
index=as_index(np_array_q) if quant_index else None,
index=cudf.Index(np_array_q) if quant_index else None,
)

@docutils.doc_describe()
Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -11078,3 +11078,27 @@ def test_dataframe_loc_int_float(dtype1, dtype2):
expected = pdf.loc[pidx]

assert_eq(actual, expected, check_index_type=True, check_dtype=True)


@pytest.mark.parametrize(
    "data",
    [
        cudf.DataFrame(range(2)),
        None,
        [cudf.Series(range(2))],
        [[0], [1]],
        {1: range(2)},
        cupy.arange(2),
    ],
)
def test_init_with_index_no_shallow_copy(data):
    """The DataFrame constructor must keep the index object it is given."""
    original_index = cudf.RangeIndex(2)
    frame = cudf.DataFrame(data, index=original_index)
    # Identity check: the frame must expose the same object, not a copy.
    assert frame.index is original_index


def test_from_records_with_index_no_shallow_copy():
    """A DataFrame built from a record array must keep the given index."""
    structured = np.array(
        [(1.0, 2), (3.0, 4)], dtype=[("x", "<f8"), ("y", "<i8")]
    )
    records = structured.view(np.recarray)
    original_index = cudf.RangeIndex(2)
    frame = cudf.DataFrame(records, index=original_index)
    # Identity check: the frame must expose the same object, not a copy.
    assert frame.index is original_index
Loading