Skip to content

Commit

Permalink
address more comments
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 committed Jun 24, 2024
1 parent d22953f commit 604c16d
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 78 deletions.
15 changes: 10 additions & 5 deletions python/cudf/cudf/_lib/pylibcudf/io/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,7 @@ cdef class SinkInfo:
Parameters
----------
sinks : List[Union[str, os.PathLike,
io.BytesIO, io.IOBase,
io.StringIO, io.TextIOBase]]
sinks : list of str, PathLike, BytesIO, StringIO
A homogeneous list of sinks (this can be a string filename,
bytes, or one of the Python I/O classes) to read from.
Expand Down Expand Up @@ -231,10 +229,17 @@ cdef class SinkInfo:
)
data_sinks.push_back(self.sink_storage.back().get())
self.c_obj = sink_info(data_sinks)
elif isinstance(sinks[0], (basestring, os.PathLike)):
elif isinstance(sinks[0], str):
paths.reserve(len(sinks))
for s in sinks:
if not isinstance(s, (basestring, os.PathLike)):
if not isinstance(s, str):
raise ValueError("All sinks must be of the same type!")
paths.push_back(<string> s.encode())
self.c_obj = sink_info(move(paths))
elif isinstance(sinks[0], os.PathLike):
paths.reserve(len(sinks))
for s in sinks:
if not isinstance(s, os.PathLike):
raise ValueError("All sinks must be of the same type!")
paths.push_back(<string> os.path.expanduser(s).encode())
self.c_obj = sink_info(move(paths))
Expand Down
66 changes: 35 additions & 31 deletions python/cudf/cudf/pylibcudf_tests/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,22 @@
from cudf._lib import pylibcudf as plc


def metadata_from_arrow_array(
pa_array: pa.Array,
def metadata_from_arrow_type(
pa_type: pa.Array,
name: str = "",
) -> plc.interop.ColumnMetadata | None:
metadata = None
if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype):
metadata = plc.interop.ColumnMetadata(name) # None
if pa.types.is_list(pa_type) or pa.types.is_struct(pa_type):
child_meta = []
for i in range(pa_type.num_fields):
field_meta = metadata_from_arrow_type(
pa_type.field(i).type, pa_type.field(i).name
)
child_meta.append(field_meta)
metadata = plc.interop.ColumnMetadata(
"",
name,
# libcudf does not store field names, so just match pyarrow's.
[
plc.interop.ColumnMetadata(pa_array.type.field(i).name)
for i in range(pa_array.type.num_fields)
],
child_meta,
)
return metadata

Expand All @@ -35,13 +39,13 @@ def assert_column_eq(
rhs, plc.Column
):
rhs = plc.interop.to_arrow(
rhs, metadata=metadata_from_arrow_array(lhs)
rhs, metadata=metadata_from_arrow_type(lhs.type)
)
elif isinstance(lhs, plc.Column) and isinstance(
rhs, (pa.Array, pa.ChunkedArray)
):
lhs = plc.interop.to_arrow(
lhs, metadata=metadata_from_arrow_array(rhs)
lhs, metadata=metadata_from_arrow_type(rhs.type)
)
else:
raise ValueError(
Expand Down Expand Up @@ -97,21 +101,16 @@ def is_signed_integer(plc_dtype: plc.DataType):
)


def is_unsigned_integer(plc_dtype: plc.DataType):
return plc_dtype.id() in (
plc.TypeId.UINT8,
plc.TypeId.UINT16,
plc.TypeId.UINT32,
plc.TypeId.UINT64,
)


def is_integer(plc_dtype: plc.DataType):
return plc_dtype.id() in (
plc.TypeId.INT8,
plc.TypeId.INT16,
plc.TypeId.INT32,
plc.TypeId.INT64,
plc.TypeId.UINT8,
plc.TypeId.UINT16,
plc.TypeId.UINT32,
plc.TypeId.UINT64,
)


Expand All @@ -133,24 +132,29 @@ def is_string(plc_dtype: plc.DataType):
def is_fixed_width(plc_dtype: plc.DataType):
return (
is_integer(plc_dtype)
or is_unsigned_integer(plc_dtype)
or is_floating(plc_dtype)
or is_boolean(plc_dtype)
)


def is_nested_struct(pa_type: pa.DataType):
if isinstance(pa_type, pa.StructType):
for i in range(pa_type.num_fields):
if isinstance(pa_type[i].type, pa.StructType):
return True
return False
def nesting(typ) -> tuple[int, int]:
"""Return list and struct nesting of a pyarrow type."""
if isinstance(typ, pa.ListType):
list_, struct = nesting(typ.value_type)
return list_ + 1, struct
elif isinstance(typ, pa.StructType):
lists, structs = map(max, zip(*(nesting(t.type) for t in typ)))
return lists, structs + 1
else:
return 0, 0


def is_nested_struct(typ):
return nesting(typ)[1] > 1


def is_nested_list(pa_type: pa.DataType):
if isinstance(pa_type, pa.ListType):
return isinstance(pa_type.value_type, pa.ListType)
return False
def is_nested_list(typ):
return nesting(typ)[0] > 1


def sink_to_str(sink):
Expand Down
61 changes: 19 additions & 42 deletions python/cudf/cudf/pylibcudf_tests/test_copying.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
is_nested_list,
is_nested_struct,
is_string,
metadata_from_arrow_array,
metadata_from_arrow_type,
)

from cudf._lib import pylibcudf as plc
Expand All @@ -32,7 +32,7 @@ def nested_struct_xfail(request):
request.applymarker(
pytest.mark.xfail(
condition=any(
[is_nested_struct(col.type) for col in pa_table.columns]
is_nested_struct(col.type) for col in pa_table.columns
),
reason="pylibcudf interop fails with nested struct",
)
Expand All @@ -42,9 +42,9 @@ def nested_struct_xfail(request):
or "input_column" in request.fixturenames
):
# Return value is tuple of (engine, precision)
try:
if "target_column" in request.fixturenames:
pa_col, _ = request.getfixturevalue("target_column")
except pytest.FixtureLookupError:
else:
pa_col, _ = request.getfixturevalue("input_column")
request.applymarker(
pytest.mark.xfail(
Expand All @@ -63,7 +63,14 @@ def nested_list_skip(request):
"""
if "target_table" in request.fixturenames:
pa_table, _ = request.getfixturevalue("target_table")
if any([is_nested_list(col.type) for col in pa_table.columns]):
if any(is_nested_list(col.type) for col in pa_table.columns):
pytest.skip(reason="pylibcudf interop fails with nested list")
elif "target_column" or "input_column" in request.fixturenames:
if "target_column" in request.fixturenames:
pa_col, _ = request.getfixturevalue("target_column")
else:
pa_col, _ = request.getfixturevalue("input_column")
if is_nested_list(pa_col.type):
pytest.skip(reason="pylibcudf interop fails with nested list")


Expand Down Expand Up @@ -221,7 +228,6 @@ def mask(target_column):


@skip_nested_list
@xfail_nested_struct
def test_gather(target_table, index_column):
pa_target_table, plc_target_table = target_table
pa_index_column, plc_index_column = index_column
Expand Down Expand Up @@ -402,7 +408,6 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table):


@skip_nested_list
@xfail_nested_struct
def test_scatter_scalars(
source_scalar,
index_column,
Expand Down Expand Up @@ -680,16 +685,12 @@ def test_shift_type_mismatch(target_column):
plc.copying.shift(plc_target_column, 2, fill_value)


@xfail_nested_struct
@skip_nested_list
def test_slice_column(target_column):
pa_target_column, plc_target_column = target_column
bounds = list(range(6))
upper_bounds = bounds[1::2]
lower_bounds = bounds[::2]
if is_nested_list(pa_target_column.type):
pytest.skip(
reason="pylibcudf interop fails with memoryerror/segfault",
)
result = plc.copying.slice(plc_target_column, bounds)
for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result):
assert_column_eq(pa_target_column[lb:ub], slice_)
Expand All @@ -714,7 +715,6 @@ def test_slice_column_out_of_bounds(target_column):


@skip_nested_list
@xfail_nested_struct
def test_slice_table(target_table):
pa_target_table, plc_target_table = target_table
bounds = list(range(6))
Expand All @@ -725,15 +725,11 @@ def test_slice_table(target_table):
assert_table_eq(pa_target_table[lb:ub], slice_)


@xfail_nested_struct
@skip_nested_list
def test_split_column(target_column):
upper_bounds = [1, 3, 5]
lower_bounds = [0] + upper_bounds[:-1]
pa_target_column, plc_target_column = target_column
if is_nested_list(pa_target_column.type):
pytest.skip(
reason="pylibcudf interop fails with memoryerror/segfault",
)
result = plc.copying.split(plc_target_column, upper_bounds)
for lb, ub, split in zip(lower_bounds, upper_bounds, result):
assert_column_eq(pa_target_column[lb:ub], split)
Expand All @@ -752,7 +748,6 @@ def test_split_column_out_of_bounds(target_column):


@skip_nested_list
@xfail_nested_struct
def test_split_table(target_table):
pa_target_table, plc_target_table = target_table

Expand All @@ -763,7 +758,7 @@ def test_split_table(target_table):
assert_table_eq(pa_target_table[lb:ub], split)


@xfail_nested_struct
@skip_nested_list
def test_copy_if_else_column_column(target_column, mask, source_scalar):
pa_target_column, plc_target_column = target_column
pa_source_scalar, _ = source_scalar
Expand All @@ -774,11 +769,6 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar):
)
plc_other_column = plc.interop.from_arrow(pa_other_column)

if is_nested_list(pa_target_column.type):
pytest.skip(
reason="pylibcudf interop fails with memoryerror/segfault",
)

result = plc.copying.copy_if_else(
plc_target_column,
plc_other_column,
Expand Down Expand Up @@ -843,7 +833,7 @@ def test_copy_if_else_wrong_size_mask(target_column):
)


@xfail_nested_struct
@skip_nested_list
@pytest.mark.parametrize("array_left", [True, False])
def test_copy_if_else_column_scalar(
target_column,
Expand All @@ -855,11 +845,6 @@ def test_copy_if_else_column_scalar(
pa_source_scalar, plc_source_scalar = source_scalar
pa_mask, plc_mask = mask

if is_nested_list(pa_target_column.type):
pytest.skip(
reason="pylibcudf interop fails with memoryerror/segfault",
)

args = (
(plc_target_column, plc_source_scalar)
if array_left
Expand Down Expand Up @@ -1001,7 +986,7 @@ def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table):
)


@xfail_nested_struct
@skip_nested_list
def test_boolean_mask_scatter_from_scalars(
source_scalar,
target_table,
Expand All @@ -1010,10 +995,6 @@ def test_boolean_mask_scatter_from_scalars(
pa_source_scalar, plc_source_scalar = source_scalar
pa_target_table, plc_target_table = target_table
pa_mask, plc_mask = mask
if any([is_nested_list(col.type) for col in pa_target_table.columns]):
pytest.skip(
reason="pylibcudf interop fails with memoryerror/segfault",
)
result = plc.copying.boolean_mask_scatter(
[plc_source_scalar] * 3,
plc_target_table,
Expand All @@ -1029,19 +1010,15 @@ def test_boolean_mask_scatter_from_scalars(
assert_table_eq(expected, result)


@xfail_nested_struct
@skip_nested_list
def test_get_element(input_column):
index = 1
pa_input_column, plc_input_column = input_column
if is_nested_list(pa_input_column.type):
pytest.skip(
reason="pylibcudf interop fails with memoryerror/segfault",
)
result = plc.copying.get_element(plc_input_column, index)

assert (
plc.interop.to_arrow(
result, metadata_from_arrow_array(pa_input_column)
result, metadata_from_arrow_type(pa_input_column.type)
).as_py()
== pa_input_column[index].as_py()
)
Expand Down

0 comments on commit 604c16d

Please sign in to comment.