From 97518ac124c2e5992f0bd75f71ccacf06cd866a8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 12 Jun 2024 19:04:03 +0100 Subject: [PATCH 1/2] Fix typo bug in gather implementation (#16000) Pylibcudf calls the datatype accessor type(). Add tests that cover this case and check that out-of-bounds accesses raise. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16000 --- python/cudf_polars/cudf_polars/dsl/expr.py | 2 +- .../tests/expressions/test_gather.py | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 13e496136b5..377a905aed6 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -801,7 +801,7 @@ def do_evaluate( obj = plc.replace.replace_nulls( indices.obj, plc.interop.from_arrow( - pa.scalar(n, type=plc.interop.to_arrow(indices.obj.data_type())) + pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) ), ) else: diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py index df33e19a0b6..6bffa3e252c 100644 --- a/python/cudf_polars/tests/expressions/test_gather.py +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -2,8 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import pytest + import polars as pl +from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -17,3 +20,31 @@ def test_gather(): query = ldf.select(pl.col("a").gather(pl.col("b"))) assert_gpu_result_equal(query) + + +def test_gather_with_nulls(): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [0, None, 1, None, 6, 1, 0], + } + ) + + query = ldf.select(pl.col("a").gather(pl.col("b"))) + + 
assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("negative", [False, True]) +def test_gather_out_of_bounds(negative): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [0, -10 if negative else 10, 1, 2, 6, 1, 0], + } + ) + + query = ldf.select(pl.col("a").gather(pl.col("b"))) + + with pytest.raises(pl.exceptions.ComputeError): + query.collect(post_opt_callback=execute_with_cudf) From b35991c366cf81b650fb79fc27604fd79468f132 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 12 Jun 2024 22:50:52 +0100 Subject: [PATCH 2/2] Add test that diagonal concat with mismatching schemas raises (#16006) Arguably this should be determined during query optimization by polars, but for now it is raised late during compute, so we must validate on our side. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16006 --- python/cudf_polars/cudf_polars/dsl/ir.py | 4 ++-- python/cudf_polars/tests/test_union.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 0a6deb5698c..46241ab8e71 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -933,10 +933,10 @@ class Union(IR): """Optional slice to apply after concatenation.""" def __post_init__(self) -> None: - """Validated preconditions.""" + """Validate preconditions.""" schema = self.dfs[0].schema if not all(s.schema == schema for s in self.dfs[1:]): - raise ValueError("Schema mismatch") + raise NotImplementedError("Schema mismatch") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 18cf4748692..6c9122bc260 100644 --- a/python/cudf_polars/tests/test_union.py +++ 
b/python/cudf_polars/tests/test_union.py @@ -2,8 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import pytest + import polars as pl +from cudf_polars import translate_ir from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -19,6 +22,19 @@ def test_union(): assert_gpu_result_equal(query) +def test_union_schema_mismatch_raises(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select(pl.col("a").cast(pl.Float32)) + query = pl.concat([ldf, ldf2], how="diagonal") + with pytest.raises(NotImplementedError): + _ = translate_ir(query._ldf.visit()) + + def test_concat_vertical(): ldf = pl.LazyFrame( {