From ed37acea3da55df7ef33c33aaace4f1f34cf8239 Mon Sep 17 00:00:00 2001 From: brifitz <95299320+brifitz@users.noreply.github.com> Date: Thu, 2 Jan 2025 09:49:24 +0000 Subject: [PATCH 01/20] fix(rust): `slice_pushdown` optimization leading to incorrectly sliced row index on parquet file (#20508) --- .../polars-io/src/parquet/read/read_impl.rs | 29 +++++++++++++++---- py-polars/tests/unit/io/test_lazy_parquet.py | 28 ++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index eb4448eebeb1..9f5281280c51 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -671,7 +671,7 @@ fn rg_to_dfs_par_over_rg( store: &mmap::ColumnStore, row_group_start: usize, row_group_end: usize, - previous_row_count: &mut IdxSize, + rows_read: &mut IdxSize, slice: (usize, usize), file_metadata: &FileMetadata, schema: &ArrowSchemaRef, @@ -689,15 +689,34 @@ fn rg_to_dfs_par_over_rg( .sum(); let slice_end = slice.0 + slice.1; + // rows_scanned is the number of rows that have been scanned so far when checking for overlap with the slice. + // rows_read is the number of rows found to overlap with the slice, and thus the number of rows that will be + // read into a dataframe. + let mut rows_scanned: IdxSize; + + if row_group_start > 0 { + // In the case of async reads, we need to account for the fact that row_group_start may be greater than + // zero due to earlier processing. 
+ // For details, see: https://github.com/pola-rs/polars/pull/20508#discussion_r1900165649 + rows_scanned = (0..row_group_start) + .map(|i| file_metadata.row_groups[i].num_rows() as IdxSize) + .sum(); + } else { + rows_scanned = 0; + } + for i in row_group_start..row_group_end { - let row_count_start = *previous_row_count; + let row_count_start = rows_scanned; let rg_md = &file_metadata.row_groups[i]; + let n_rows_this_file = rg_md.num_rows(); let rg_slice = - split_slice_at_file(&mut n_rows_processed, rg_md.num_rows(), slice.0, slice_end); - *previous_row_count = previous_row_count - .checked_add(rg_slice.1 as IdxSize) + split_slice_at_file(&mut n_rows_processed, n_rows_this_file, slice.0, slice_end); + rows_scanned = rows_scanned + .checked_add(n_rows_this_file as IdxSize) .ok_or(ROW_COUNT_OVERFLOW_ERR)?; + *rows_read += rg_slice.1 as IdxSize; + if rg_slice.1 == 0 { continue; } diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index 05589332cc99..78ffb6b1379b 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -564,6 +564,34 @@ def trim_to_metadata(path: str | Path) -> None: ) +@pytest.mark.write_disk +def test_predicate_slice_pushdown_row_index_20485(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + file_path = tmp_path / "slice_pushdown.parquet" + row_group_size = 100000 + num_row_groups = 3 + + df = pl.select(ref=pl.int_range(num_row_groups * row_group_size)) + df.write_parquet(file_path, row_group_size=row_group_size) + + # Use a slice that starts near the end of one row group and extends into the next + # to test handling of slices that span multiple row groups. 
+ slice_start = 199995 + slice_len = 10 + ldf = pl.scan_parquet(file_path) + sliced_df = ldf.with_row_index().slice(slice_start, slice_len).collect() + sliced_df_no_pushdown = ( + ldf.with_row_index().slice(slice_start, slice_len).collect(slice_pushdown=False) + ) + + expected_index = list(range(slice_start, slice_start + slice_len)) + actual_index = list(sliced_df["index"]) + assert actual_index == expected_index + + assert_frame_equal(sliced_df, sliced_df_no_pushdown) + + @pytest.mark.write_disk @pytest.mark.parametrize("streaming", [True, False]) def test_parquet_row_groups_shift_bug_18739(tmp_path: Path, streaming: bool) -> None: From c5790a755307c3e71ce2a781c1eda0e2c830e3b0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:53:40 +0100 Subject: [PATCH 02/20] chore(python): Bump the python group in /py-polars with 3 updates (#20521) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Stijn de Gooijer --- py-polars/requirements-dev.txt | 2 +- py-polars/requirements-lint.txt | 4 ++-- py-polars/tests/unit/io/test_hive.py | 2 +- py-polars/tests/unit/operations/unique/test_unique.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index 7fb358bced22..89c081de50c3 100644 --- a/py-polars/requirements-dev.txt +++ b/py-polars/requirements-dev.txt @@ -60,7 +60,7 @@ hypothesis # ------- pytest==8.3.2 -pytest-codspeed==3.0.0 +pytest-codspeed==3.1.0 pytest-cov==6.0.0 pytest-xdist==3.6.1 diff --git a/py-polars/requirements-lint.txt b/py-polars/requirements-lint.txt index df703691f12a..5c6034674239 100644 --- a/py-polars/requirements-lint.txt +++ b/py-polars/requirements-lint.txt @@ -1,3 +1,3 @@ -mypy[faster-cache]==1.13.0 +mypy[faster-cache]==1.14.1 ruff==0.8.1 -typos==1.28.1 +typos==1.29.0 diff --git a/py-polars/tests/unit/io/test_hive.py 
b/py-polars/tests/unit/io/test_hive.py index 4a6384fb5f56..2bb0c6e7d37c 100644 --- a/py-polars/tests/unit/io/test_hive.py +++ b/py-polars/tests/unit/io/test_hive.py @@ -200,7 +200,7 @@ def test_hive_partitioned_projection_pushdown( q = pl.scan_parquet( root / "**/*.parquet", hive_partitioning=True, - parallel=parallel, # type: ignore[arg-type] + parallel=parallel, ) expected = q.collect().select("category") diff --git a/py-polars/tests/unit/operations/unique/test_unique.py b/py-polars/tests/unit/operations/unique/test_unique.py index 406a70b6e71f..595ae1db59eb 100644 --- a/py-polars/tests/unit/operations/unique/test_unique.py +++ b/py-polars/tests/unit/operations/unique/test_unique.py @@ -43,7 +43,7 @@ def test_unique_predicate_pd() -> None: for maintain_order in (True, False): for keep in ("first", "last", "any", "none"): q = ( - lf.unique("x", maintain_order=maintain_order, keep=keep) # type: ignore[arg-type] + lf.unique("x", maintain_order=maintain_order, keep=keep) .filter(pl.col("x") == "abc") .filter(pl.col("z")) ) From 90827f7f7683c5c527c09638142158d569127cfb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:53:51 +0100 Subject: [PATCH 03/20] ci: Bump crate-ci/typos from 1.28.1 to 1.29.0 in the ci group (#20520) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/lint-global.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-global.yml b/.github/workflows/lint-global.yml index ff1487ec963e..e0bf119ce624 100644 --- a/.github/workflows/lint-global.yml +++ b/.github/workflows/lint-global.yml @@ -15,4 +15,4 @@ jobs: - name: Lint Markdown and TOML uses: dprint/check@v2.2 - name: Spell Check with Typos - uses: crate-ci/typos@v1.28.1 + uses: crate-ci/typos@v1.29.0 From 70473d0eb47e2cb9376dd95bec23d8563f0f3cc8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:54:03 +0100 Subject: [PATCH 04/20] chore(python): Bump markdown-exec[ansi] from 1.9.3 to 1.10.0 in /docs in the documentation group (#20518) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/source/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/requirements.txt b/docs/source/requirements.txt index 25e87aeb64ec..2c8d162a5689 100644 --- a/docs/source/requirements.txt +++ b/docs/source/requirements.txt @@ -13,5 +13,5 @@ mkdocs-material==9.5.27 mkdocs-macros-plugin==1.3.7 mkdocs-redirects==1.2.1 material-plausible-plugin==0.2.0 -markdown-exec[ansi]==1.9.3 +markdown-exec[ansi]==1.10.0 pygithub==2.5.0 From b0ac62bff3f2d38df6e8e4e872c7b8c86eaac6c7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:54:38 +0100 Subject: [PATCH 05/20] build: Bump the rust group with 3 updates (#20519) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Stijn de Gooijer --- Cargo.lock | 225 ++++++++++++++++++--------------- Cargo.toml | 2 +- crates/polars-utils/Cargo.toml | 2 +- 3 files changed, 126 insertions(+), 103 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab877a36cc56..0753cb32d661 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,9 +95,9 @@ checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anyhow" -version = "1.0.94" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" [[package]] name = "apache-avro" @@ -206,7 +206,7 @@ checksum = 
"c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -217,7 +217,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -265,9 +265,9 @@ dependencies = [ [[package]] name = "aws-config" -version = "1.5.11" +version = "1.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5d1c2c88936a73c699225d0bc00684a534166b0cebc2659c3cdf08de8edc64c" +checksum = "649316840239f4e58df0b7f620c428f5fababbbca2d504488c641534050bd141" dependencies = [ "aws-credential-types", "aws-runtime", @@ -307,9 +307,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "300a12520b4e6d08b73f77680f12c16e8ae43250d55100e0b2be46d78da16a48" +checksum = "44f6f1124d6e19ab6daf7f2e615644305dc6cb2d706892a8a8c0b98db35de020" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -333,9 +333,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.66.0" +version = "1.67.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "154488d16ab0d627d15ab2832b57e68a16684c8c902f14cb8a75ec933fc94852" +checksum = "bbc644164269a1e38ce7f2f7373629d3fb3d310c0e3feb5573a29744288b24d3" dependencies = [ "aws-credential-types", "aws-runtime", @@ -367,9 +367,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.51.0" +version = "1.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74995133da38f109a0eb8e8c886f9e80c713b6e9f2e6e5a6a1ba4450ce2ffc46" +checksum = "cb25f7129c74d36afe33405af4517524df8f74b635af8c2c8e91c1552b8397b2" dependencies = [ "aws-credential-types", "aws-runtime", @@ -389,9 +389,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.52.0" +version = 
"1.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7062a779685cbf3b2401eb36151e2c6589fd5f3569b8a6bc2d199e5aaa1d059" +checksum = "d03a3d5ef14851625eafd89660a751776f938bf32f309308b20dcca41c44b568" dependencies = [ "aws-credential-types", "aws-runtime", @@ -411,9 +411,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.52.0" +version = "1.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "299dae7b1dc0ee50434453fa5a229dc4b22bd3ee50409ff16becf1f7346e0193" +checksum = "cf3a9f073ae3a53b54421503063dfb87ff1ea83b876f567d92e8b8d9942ba91b" dependencies = [ "aws-credential-types", "aws-runtime", @@ -463,9 +463,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.2" +version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aa8ff1492fd9fb99ae28e8467af0dbbb7c31512b16fabf1a0f10d7bb6ef78bb" +checksum = "427cb637d15d63d6f9aae26358e1c9a9c09d5aa490d64b09354c8217cfef0f28" dependencies = [ "futures-util", "pin-project-lite", @@ -546,9 +546,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.5" +version = "1.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "431a10d0e07e09091284ef04453dae4069283aa108d209974d67e77ae1caa658" +checksum = "a05dd41a70fc74051758ee75b5c4db2c0ca070ed9229c3df50e9475cda1cb985" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -590,9 +590,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.10" +version = "1.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ecbf4d5dfb169812e2b240a4350f15ad3c6b03a54074e5712818801615f2dc5" +checksum = "38ddc9bd6c28aeb303477170ddd183760a956a03e083b3902a990238a7e3792d" dependencies = [ "base64-simd", "bytes", @@ -778,22 +778,22 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.20.0" +version = 
"1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a" +checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3" dependencies = [ "bytemuck_derive", ] [[package]] name = "bytemuck_derive" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +checksum = "3fa76293b4f7bb636ab88fd78228235b5248b4d05cc589aed610f954af5d7c7a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -823,11 +823,11 @@ dependencies = [ [[package]] name = "casey" -version = "0.4.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614586263949597dcc18675da12ef9b429135e13628d92eb8b8c6fa50ca5656b" +checksum = "8e779867f62d81627d1438e0d3fb6ed7d7c9d64293ca6d87a1e88781b94ece1c" dependencies = [ - "syn 1.0.109", + "syn 2.0.94", ] [[package]] @@ -847,9 +847,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.5" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31a0499c1dc64f458ad13872de75c0eb7e3fdb0e67964610c914b034fc5956e" +checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333" dependencies = [ "jobserver", "libc", @@ -986,9 +986,9 @@ dependencies = [ [[package]] name = "compact_str" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" dependencies = [ "castaway", "cfg-if", @@ -1271,7 +1271,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -1336,7 +1336,7 @@ dependencies = 
[ "once_cell", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -1533,7 +1533,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -1607,9 +1607,9 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "group" @@ -1883,9 +1883,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.4" +version = "0.27.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6884a48c6826ec44f524c7456b163cebe9e55a18d7b5e307cb4f100371cc767" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", "http 1.2.0", @@ -2072,7 +2072,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -2688,24 +2688,25 @@ dependencies = [ [[package]] name = "object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eb4c22c6154a1e759d7099f9ffad7cc5ef8245f9efbab4a41b92623079c82f3" +checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" dependencies = [ "async-trait", "base64 0.22.1", "bytes", "chrono", "futures", + 
"httparse", "humantime", "hyper 1.5.2", "itertools 0.13.0", @@ -2761,7 +2762,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -3088,7 +3089,7 @@ dependencies = [ "serde", "serde_json", "strum_macros", - "thiserror 2.0.8", + "thiserror 2.0.9", "version_check", "xxhash-rust", ] @@ -3130,7 +3131,7 @@ dependencies = [ "polars-arrow-format", "regex", "simdutf8", - "thiserror 2.0.8", + "thiserror 2.0.9", ] [[package]] @@ -3457,7 +3458,7 @@ dependencies = [ "pyo3", "recursive", "serde_json", - "thiserror 2.0.8", + "thiserror 2.0.9", "version_check", ] @@ -3725,7 +3726,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -3737,7 +3738,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -3748,9 +3749,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" -version = "0.36.2" +version = "0.37.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe" +checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003" dependencies = [ "memchr", "serde", @@ -3780,7 +3781,7 @@ dependencies = [ "rustc-hash", "rustls 0.23.20", "socket2", - "thiserror 2.0.8", + "thiserror 2.0.9", "tokio", "tracing", ] @@ -3799,7 +3800,7 @@ dependencies = [ "rustls 0.23.20", "rustls-pki-types", "slab", - "thiserror 2.0.8", + "thiserror 2.0.9", "tinyvec", "tracing", "web-time", @@ -3821,9 +3822,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = 
"0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] @@ -3939,7 +3940,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -3968,7 +3969,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -4014,9 +4015,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "reqwest" -version = "0.12.9" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ "base64 0.22.1", "bytes", @@ -4028,7 +4029,7 @@ dependencies = [ "http-body 1.0.1", "http-body-util", "hyper 1.5.2", - "hyper-rustls 0.27.4", + "hyper-rustls 0.27.5", "hyper-tls", "hyper-util", "ipnet", @@ -4052,6 +4053,7 @@ dependencies = [ "tokio-native-tls", "tokio-rustls 0.26.1", "tokio-util", + "tower", "tower-service", "url", "wasm-bindgen", @@ -4227,9 +4229,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" @@ -4375,9 +4377,9 @@ checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" [[package]] name = "serde" -version = "1.0.216" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" +checksum = 
"02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] @@ -4393,20 +4395,20 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.216" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] name = "serde_json" -version = "1.0.133" +version = "1.0.134" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d" dependencies = [ "indexmap", "itoa", @@ -4545,7 +4547,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -4651,7 +4653,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -4673,9 +4675,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.90" +version = "2.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +checksum = "987bc0be1cdea8b10216bd06e2ca407d40b9543468fafd3ddfb02f36e77f71f3" dependencies = [ "proc-macro2", "quote", @@ -4699,14 +4701,14 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] name = "sysinfo" -version = "0.32.1" +version = "0.33.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c33cd241af0f2e9e3b5c32163b873b29956890b5342e6745b917ce9d490f4af" +checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01" dependencies = [ "core-foundation-sys", "libc", @@ -4751,11 +4753,11 
@@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.8" +version = "2.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f5383f3e0071702bf93ab5ee99b52d26936be9dedd9413067cbdcddcb6141a" +checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" dependencies = [ - "thiserror-impl 2.0.8", + "thiserror-impl 2.0.9", ] [[package]] @@ -4766,18 +4768,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] name = "thiserror-impl" -version = "2.0.8" +version = "2.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f357fcec90b3caef6623a099691be676d033b40a058ac95d2a6ade6fa0c943" +checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -4832,9 +4834,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8" dependencies = [ "tinyvec_macros", ] @@ -4870,7 +4872,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -4917,6 +4919,27 @@ dependencies = [ "tokio", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + [[package]] name = "tower-service" version = "0.3.3" @@ -4942,7 +4965,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -4987,7 +5010,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -5156,7 +5179,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", "wasm-bindgen-shared", ] @@ -5191,7 +5214,7 @@ checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5305,7 +5328,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -5316,7 +5339,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -5534,9 +5557,9 @@ checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" [[package]] name = "xxhash-rust" -version = "0.8.13" +version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08fd76779ae1883bbf1e46c2c46a75a0c4e37c445e68a24b01479d438f26ae6" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" [[package]] name = "yoke" @@ -5558,7 +5581,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", "synstructure", ] @@ -5580,7 +5603,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] @@ -5600,7 +5623,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", "synstructure", ] @@ -5629,7 +5652,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.94", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b2d1b988fc4b..727cd49af796 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ avro-schema = { version = "0.3" } base64 = "0.22.0" bincode = "1.3.3" bitflags = "2" -bytemuck = { version = "1.11", features = ["derive", "extern_crate_alloc"] } +bytemuck = { version = "1.21", features = ["derive", "extern_crate_alloc"] } bytes = { version = "1.7" } chrono = { version = "0.4.31", default-features = false, features = ["std"] } chrono-tz = "0.10" diff --git a/crates/polars-utils/Cargo.toml b/crates/polars-utils/Cargo.toml index 2fb92f49e7f7..31cdf97883e8 100644 --- a/crates/polars-utils/Cargo.toml +++ b/crates/polars-utils/Cargo.toml @@ -30,7 +30,7 @@ rayon = { workspace = true } serde = { workspace = true, optional = true } serde_json = { workspace = true, optional = true } stacker = { workspace = true } -sysinfo = { version = "0.32", default-features = false, features = ["system"], optional = true } +sysinfo = { version = "0.33", default-features = false, features = ["system"], optional = true } [dev-dependencies] rand = { workspace = true } From d3bcf0ab2049a7b56ae358a8904205cf3bb6c89f Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 2 Jan 2025 11:06:03 +0100 Subject: [PATCH 06/20] fix: Fix union (#20523) --- crates/polars-core/src/datatypes/mod.rs | 2 ++ crates/polars-core/src/datatypes/schema.rs | 22 +++++++++++++++++++ crates/polars-core/src/lib.rs | 1 + .../src/plans/conversion/dsl_to_ir.rs | 4 +++- .../tests/unit/operations/test_concat.py | 10 
+++++++++ 5 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 crates/polars-core/src/datatypes/schema.rs diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs index ed2f810f05bd..f9e4b71e5602 100644 --- a/crates/polars-core/src/datatypes/mod.rs +++ b/crates/polars-core/src/datatypes/mod.rs @@ -22,6 +22,7 @@ use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; use std::ops::{Add, AddAssign, Div, Mul, Rem, Sub, SubAssign}; +mod schema; pub use aliases::*; pub use any_value::*; pub use arrow::array::{ArrayCollectIterExt, ArrayFromIter, ArrayFromIterDtype, StaticArray}; @@ -42,6 +43,7 @@ use polars_utils::abs_diff::AbsDiff; use polars_utils::float::IsFloat; use polars_utils::min_max::MinMax; use polars_utils::nulls::IsNull; +pub use schema::SchemaExtPl; #[cfg(feature = "serde")] use serde::de::{EnumAccess, Error, Unexpected, VariantAccess, Visitor}; #[cfg(any(feature = "serde", feature = "serde-lazy"))] diff --git a/crates/polars-core/src/datatypes/schema.rs b/crates/polars-core/src/datatypes/schema.rs new file mode 100644 index 000000000000..edc3b38dee7b --- /dev/null +++ b/crates/polars-core/src/datatypes/schema.rs @@ -0,0 +1,22 @@ +use super::*; + +pub trait SchemaExtPl { + // Answers if this schema matches the given schema. + // + // Allows (nested) Null types in this schema to match any type in the schema, + // but not vice versa. In such a case Ok(true) is returned, because a cast + // is necessary. If no cast is necessary Ok(false) is returned, and an + // error is returned if the types are incompatible. 
+ fn matches_schema(&self, other: &Schema) -> PolarsResult<bool>; +} + +impl SchemaExtPl for Schema { + fn matches_schema(&self, other: &Schema) -> PolarsResult<bool> { + polars_ensure!(self.len() == other.len(), SchemaMismatch: "found different number of fields in schema's\n\nLeft schema: {} fields, right schema: {} fields.", self.len(), other.len()); + let mut cast = false; + for (a, b) in self.iter_values().zip(other.iter_values()) { + cast |= a.matches_schema_type(b)?; + } + Ok(cast) + } +} diff --git a/crates/polars-core/src/lib.rs b/crates/polars-core/src/lib.rs index b81a65674eaa..25377fbfe62e 100644 --- a/crates/polars-core/src/lib.rs +++ b/crates/polars-core/src/lib.rs @@ -31,6 +31,7 @@ mod tests; use std::sync::Mutex; use std::time::{SystemTime, UNIX_EPOCH}; +pub use datatypes::SchemaExtPl; pub use hashing::IdBuildHasher; use once_cell::sync::Lazy; use rayon::{ThreadPool, ThreadPoolBuilder}; diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 60512a9b0703..3474c8079079 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -384,8 +384,10 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let schema = ctxt.lp_arena.get(first).schema(ctxt.lp_arena); for n in &inputs[1..] 
{ let schema_i = ctxt.lp_arena.get(*n).schema(ctxt.lp_arena); - polars_ensure!(schema == schema_i, InvalidOperation: "'union'/'concat' inputs should all have the same schema,\ + // Check this input against the first input's schema; the returned "cast needed" flag is discarded here, only an incompatible schema is an error. + schema_i.matches_schema(schema.as_ref()).map_err(|_| polars_err!(InvalidOperation: "'union'/'concat' inputs should all have the same schema,\ got\n{:?} and \n{:?}", schema, schema_i) + )?; } let options = args.into(); diff --git a/py-polars/tests/unit/operations/test_concat.py b/py-polars/tests/unit/operations/test_concat.py index 6c964764c181..a2664df1b000 100644 --- a/py-polars/tests/unit/operations/test_concat.py +++ b/py-polars/tests/unit/operations/test_concat.py @@ -97,3 +97,13 @@ def test_concat_series() -> None: assert pl.concat([s, s]).len() == 6 # check if s remains unchanged assert s.len() == 3 + + +def test_concat_null_20501() -> None: + a = pl.DataFrame({"id": [1], "value": ["foo"]}) + b = pl.DataFrame({"id": [2], "value": [None]}) + + assert pl.concat([a.lazy(), b.lazy()]).collect().to_dict(as_series=False) == { + "id": [1, 2], + "value": ["foo", None], + } From 91d04b855a37aa116b6f871c8be0a1b4ab770434 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 2 Jan 2025 11:08:24 +0100 Subject: [PATCH 07/20] fix: Fix global cat unique (#20524) --- .../src/chunked_array/logical/categorical/ops/unique.rs | 7 ++++--- py-polars/tests/unit/datatypes/test_categorical.py | 7 +++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index 6d337e3570e3..076099a9c33e 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -31,10 +31,11 @@ impl CategoricalChunked { Ok(out) } } else { + let has_nulls = (self.null_count() > 0) as u32; let mut state = match cat_map.as_ref() { RevMapping::Global(map, values, _) 
=> { if self.is_enum() { - PrimitiveRangedUniqueState::new(0, values.len() as u32 + 1) + PrimitiveRangedUniqueState::new(0, values.len() as u32 + has_nulls) } else { let mut min = u32::MAX; let mut max = 0u32; @@ -44,11 +45,11 @@ impl CategoricalChunked { max = max.max(v); } - PrimitiveRangedUniqueState::new(min, max) + PrimitiveRangedUniqueState::new(min, max + has_nulls) } }, RevMapping::Local(values, _) => { - PrimitiveRangedUniqueState::new(0, values.len() as u32 + 1) + PrimitiveRangedUniqueState::new(0, values.len() as u32 + has_nulls) }, }; diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 505986422c55..64b789281a21 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -898,3 +898,10 @@ def test_perfect_group_by_19950() -> None: "y": ["b"], "x": ["a"], } + + +@StringCache() +def test_categorical_unique() -> None: + s = pl.Series(["a", "b", None], dtype=pl.Categorical) + assert s.n_unique() == 3 + assert s.unique().to_list() == ["a", "b", None] From 9d7a7d335690ff9c314d88032f6051a0ee9c7b46 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Thu, 2 Jan 2025 13:38:19 +0100 Subject: [PATCH 08/20] feat: Support arbitrary expressions in 'join_where' (#20525) --- .../polars-plan/src/plans/conversion/join.rs | 336 +----------------- .../unit/operations/test_inequality_join.py | 22 +- 2 files changed, 28 insertions(+), 330 deletions(-) diff --git a/crates/polars-plan/src/plans/conversion/join.rs b/crates/polars-plan/src/plans/conversion/join.rs index dd94ed2d7784..ac7e6e1a9ae4 100644 --- a/crates/polars-plan/src/plans/conversion/join.rs +++ b/crates/polars-plan/src/plans/conversion/join.rs @@ -389,6 +389,11 @@ fn resolve_join_where( mut options: Arc, ctxt: &mut DslConversionContext, ) -> PolarsResult<(Node, Node)> { + // If not eager, respect the flag. 
+ if ctxt.opt_flags.eager() { + ctxt.opt_flags.set(OptFlags::PREDICATE_PUSHDOWN, true); + } + ctxt.opt_flags.set(OptFlags::COLLAPSE_JOINS, true); check_join_keys(&predicates)?; let input_left = to_alp_impl(Arc::unwrap_or_clone(input_left), ctxt) .map_err(|e| e.context(failed_here!(join left)))?; @@ -403,17 +408,6 @@ fn resolve_join_where( .into_owned(); for expr in &predicates { - let mut comparison_count = 0; - for _e in expr - .into_iter() - .filter(|e| matches!(e, Expr::BinaryExpr { op, .. } if op.is_comparison())) - { - comparison_count += 1; - if comparison_count > 1 { - polars_bail!(InvalidOperation: "only one binary comparison allowed in each 'join_where' predicate; found {:?}", expr); - } - } - fn all_in_schema( schema: &Schema, other: Option<&Schema>, @@ -437,317 +431,21 @@ fn resolve_join_where( polars_ensure!( valid, InvalidOperation: "'join_where' predicate only refers to columns from a single table") } - let owned = |e: Arc| (*e).clone(); - - // We do a few things - // First we partition to: - // - IEjoin supported inequality predicates - // - equality predicates - // - remaining predicates - // And then decide to which join we dispatch. - // The remaining predicates will be applied as filter. - - // What make things a bit complicated is that duplicate join names - // are referred to in the query with the name post-join, but on joins - // we refer to the names pre-join (e.g. without suffix). So there is some - // bookkeeping. - // - // - First we determine which side of the binary expression refers to the left and right table - // and make sure that lhs of the binary expr, maps to the lhs of the join tables and vice versa. - // Next we ensure the suffixes are removed when we partition. - // - // If a predicate has to be applied as post-join filter, we put the suffixes back if needed. 
- let mut ie_left_on = vec![]; - let mut ie_right_on = vec![]; - let mut ie_op = vec![]; - - let mut eq_left_on = vec![]; - let mut eq_right_on = vec![]; - - let mut remaining_preds = vec![]; - - fn to_inequality_operator(op: &Operator) -> Option { - match op { - Operator::Lt => Some(InequalityOperator::Lt), - Operator::LtEq => Some(InequalityOperator::LtEq), - Operator::Gt => Some(InequalityOperator::Gt), - Operator::GtEq => Some(InequalityOperator::GtEq), - _ => None, - } - } - - fn rename_expr(e: Expr, old: &str, new: &str) -> Expr { - e.map_expr(|e| match e { - Expr::Column(name) if name.as_str() == old => Expr::Column(new.into()), - e => e, - }) - } - - fn determine_order_and_pre_join_names( - left: Expr, - op: Operator, - right: Expr, - schema_left: &Schema, - schema_right: &Schema, - suffix: &str, - ) -> PolarsResult<(Expr, Operator, Expr)> { - let left_names = expr_to_leaf_column_names_iter(&left).collect::>(); - let right_names = expr_to_leaf_column_names_iter(&right).collect::>(); - - // All left should be in the left schema. - let (left_names, right_names, left, op, mut right) = - if !left_names.iter().all(|n| schema_left.contains(n)) { - // If all right names are in left schema -> swap - if right_names.iter().all(|n| schema_left.contains(n)) { - (right_names, left_names, right, op.swap_operands(), left) - } else { - polars_bail!(InvalidOperation: "got ambiguous column names in 'join_where'") - } - } else { - (left_names, right_names, left, op, right) - }; - for name in &left_names { - polars_ensure!(!right_names.contains(name.as_str()), InvalidOperation: "found ambiguous column names in 'join_where'\n\n\ - Note that you should refer to the column names as they are post-join operation.") - } - - // Now we know left belongs to the left schema, rhs suffixes are dealt with. - for post_join_name in right_names { - if let Some(pre_join_name) = post_join_name.strip_suffix(suffix) { - // Name is both sides, so a suffix will be added by the join. 
- // We rename - if schema_right.contains(pre_join_name) && schema_left.contains(pre_join_name) { - right = rename_expr(right, &post_join_name, pre_join_name); - } - } - } - Ok((left, op, right)) - } - - // Make it a binary comparison and ensure the columns refer to post join names. - fn to_binary_post_join( - l: Expr, - op: Operator, - mut r: Expr, - schema_right: &Schema, - suffix: &str, - ) -> Expr { - let names = expr_to_leaf_column_names_iter(&r).collect::>(); - for pre_join_name in &names { - if !schema_right.contains(pre_join_name) { - let post_join_name = _join_suffix_name(pre_join_name, suffix); - r = rename_expr(r, pre_join_name, post_join_name.as_str()); - } - } - - Expr::BinaryExpr { - left: Arc::from(l), - op, - right: Arc::from(r), - } - } - - let suffix = options.args.suffix().clone(); - for pred in predicates.into_iter() { - let Expr::BinaryExpr { left, op, right } = pred.clone() else { - polars_bail!(InvalidOperation: "can only join on binary (in)equality expressions, found {:?}", pred) - }; - polars_ensure!(op.is_comparison(), InvalidOperation: "expected comparison in join predicate"); - let (left, op, right) = determine_order_and_pre_join_names( - owned(left), - op, - owned(right), - &schema_left, - &schema_right, - &suffix, - )?; - - if let Some(ie_op_) = to_inequality_operator(&op) { - fn is_numeric(e: &Expr, schema: &Schema) -> bool { - expr_to_leaf_column_names_iter(e).any(|name| { - if let Some(dt) = schema.get(name.as_str()) { - dt.to_physical().is_numeric() - } else { - false - } - }) - } - - // We fallback to remaining if: - // - we already have an IEjoin or Inner join - // - we already have an Inner join - // - data is not numeric (our iejoin doesn't yet implement that) - if ie_op.len() >= 2 - || !eq_right_on.is_empty() - || !is_numeric(&left, &schema_left) - || !is_numeric(&right, &schema_right) - { - remaining_preds.push(to_binary_post_join(left, op, right, &schema_right, &suffix)) - } else { - ie_left_on.push(left); - 
ie_right_on.push(right); - ie_op.push(ie_op_) - } - } else if matches!(op, Operator::Eq) { - eq_left_on.push(left); - eq_right_on.push(right); - } else { - remaining_preds.push(to_binary_post_join(left, op, right, &schema_right, &suffix)); - } - } - - // Now choose a primary join and do the remaining predicates as filters - // Add the ie predicates to the remaining predicates buffer so that they will be executed in the - // filter node. - fn ie_predicates_to_remaining( - remaining_preds: &mut Vec, - ie_left_on: Vec, - ie_right_on: Vec, - ie_op: Vec, - schema_right: &Schema, - suffix: &str, - ) { - for ((l, op), r) in ie_left_on - .into_iter() - .zip(ie_op.into_iter()) - .zip(ie_right_on.into_iter()) - { - remaining_preds.push(to_binary_post_join(l, op.into(), r, schema_right, suffix)) - } - } - - let (mut last_node, join_node) = if !eq_left_on.is_empty() { - // We found one or more equality predicates. Go into a default equi join - // as those are cheapest on avg. - let (last_node, join_node) = resolve_join( - Either::Right(input_left), - Either::Right(input_right), - eq_left_on, - eq_right_on, - vec![], - options.clone(), - ctxt, - )?; - - ie_predicates_to_remaining( - &mut remaining_preds, - ie_left_on, - ie_right_on, - ie_op, - &schema_right, - &suffix, - ); - (last_node, join_node) - } else if ie_right_on.len() >= 2 { - // Do an IEjoin. - let opts = Arc::make_mut(&mut options); - - opts.args.how = JoinType::IEJoin; - opts.options = Some(JoinTypeOptionsIR::IEJoin(IEJoinOptions { - operator1: ie_op[0], - operator2: Some(ie_op[1]), - })); - - let (last_node, join_node) = resolve_join( - Either::Right(input_left), - Either::Right(input_right), - ie_left_on[..2].to_vec(), - ie_right_on[..2].to_vec(), - vec![], - options.clone(), - ctxt, - )?; - - // The surplus ie-predicates will be added to the remaining predicates so that - // they will be applied in a filter node. 
- while ie_right_on.len() > 2 { - // Invariant: they all have equal length, so we can pop and unwrap all while len > 2. - // The first 2 predicates are used in the - let l = ie_right_on.pop().unwrap(); - let r = ie_left_on.pop().unwrap(); - let op = ie_op.pop().unwrap(); - - remaining_preds.push(to_binary_post_join(l, op.into(), r, &schema_right, &suffix)) - } - (last_node, join_node) - } else if ie_right_on.len() == 1 { - // For a single inequality comparison, we use the piecewise merge join algorithm - let opts = Arc::make_mut(&mut options); - opts.args.how = JoinType::IEJoin; - opts.options = Some(JoinTypeOptionsIR::IEJoin(IEJoinOptions { - operator1: ie_op[0], - operator2: None, - })); - - resolve_join( - Either::Right(input_left), - Either::Right(input_right), - ie_left_on, - ie_right_on, - vec![], - options.clone(), - ctxt, - )? - } else { - // No predicates found that are supported in a fast algorithm. - // Do a cross join and follow up with filters. - let opts = Arc::make_mut(&mut options); - opts.args.how = JoinType::Cross; - - resolve_join( - Either::Right(input_left), - Either::Right(input_right), - vec![], - vec![], - vec![], - options.clone(), - ctxt, - )? - }; - - let IR::Join { - input_left, - input_right, - .. - } = ctxt.lp_arena.get(join_node) - else { - unreachable!() - }; - let schema_right = ctxt - .lp_arena - .get(*input_right) - .schema(ctxt.lp_arena) - .into_owned(); + let opts = Arc::make_mut(&mut options); + opts.args.how = JoinType::Cross; - let schema_left = ctxt - .lp_arena - .get(*input_left) - .schema(ctxt.lp_arena) - .into_owned(); + let (mut last_node, join_node) = resolve_join( + Either::Right(input_left), + Either::Right(input_right), + vec![], + vec![], + vec![], + options.clone(), + ctxt, + )?; - // Ensure that the predicates use the proper suffix - for e in remaining_preds { + for e in predicates { let predicate = to_expr_ir_ignore_alias(e, ctxt.expr_arena)?; - let AExpr::BinaryExpr { mut right, .. 
} = *ctxt.expr_arena.get(predicate.node()) else { - unreachable!() - }; - - let original_right = right; - - for name in aexpr_to_leaf_names(right, ctxt.expr_arena) { - polars_ensure!(schema_right.contains(name.as_str()), ColumnNotFound: "could not find column {name} in the right table during join operation"); - if schema_left.contains(name.as_str()) { - let new_name = _join_suffix_name(name.as_str(), suffix.as_str()); - - right = rename_matching_aexpr_leaf_names( - right, - ctxt.expr_arena, - name.as_str(), - new_name, - ); - } - } - ctxt.expr_arena.swap(right, original_right); let ir = IR::Filter { input: last_node, diff --git a/py-polars/tests/unit/operations/test_inequality_join.py b/py-polars/tests/unit/operations/test_inequality_join.py index 848a4b2b7f85..2495d5b84f2d 100644 --- a/py-polars/tests/unit/operations/test_inequality_join.py +++ b/py-polars/tests/unit/operations/test_inequality_join.py @@ -461,17 +461,6 @@ def test_raise_on_ambiguous_name() -> None: df.join_where(df, pl.col("id") >= pl.col("id")) -def test_raise_on_multiple_binary_comparisons() -> None: - df = pl.DataFrame({"id": [1, 2]}) - with pytest.raises( - pl.exceptions.InvalidOperationError, - match="only one binary comparison allowed in each 'join_where' predicate; found ", - ): - df.join_where( - df, (pl.col("id") < pl.col("id")) ^ (pl.col("id") >= pl.col("id")) - ) - - def test_raise_invalid_input_join_where() -> None: df = pl.DataFrame({"id": [1, 2]}) with pytest.raises( @@ -681,3 +670,14 @@ def test_join_where_literal_20061() -> None: "value_right": [5, 5, 5, 25], "flag_right": [1, 1, 1, 1], } + + +def test_boolean_predicate_join_where() -> None: + urls = pl.LazyFrame({"url": "abcd.com/page"}) + categories = pl.LazyFrame({"base_url": "abcd.com", "category": "landing page"}) + assert ( + "NESTED LOOP JOIN" + in urls.join_where( + categories, pl.col("url").str.starts_with(pl.col("base_url")) + ).explain() + ) From 11fa6de9f9a69099dab14cd945fd4e32aa38ee40 Mon Sep 17 00:00:00 2001 From: 
Luke Manley Date: Thu, 2 Jan 2025 09:12:44 -0500 Subject: [PATCH 09/20] fix: Fix various `Int128` operations (#20515) --- crates/polars-core/src/series/mod.rs | 2 ++ crates/polars-core/src/series/ops/downcast.rs | 2 +- .../src/chunked_array/list/sum_mean.rs | 1 + .../src/frame/join/hash_join/sort_merge.rs | 4 +++ crates/polars-ops/src/series/ops/abs.rs | 2 ++ crates/polars-ops/src/series/ops/cum_agg.rs | 4 +++ .../series/ops/interpolation/interpolate.rs | 1 + .../polars-plan/src/dsl/function_expr/cum.rs | 2 ++ crates/polars-plan/src/dsl/mod.rs | 2 ++ crates/polars-python/src/expr/rolling.rs | 12 ++++++++ crates/polars-python/src/series/comparison.rs | 6 ++++ py-polars/polars/datatypes/convert.py | 3 ++ .../tests/unit/lazyframe/test_lazyframe.py | 29 ++++++++++++++----- .../tests/unit/operations/rolling/test_map.py | 13 +++++---- py-polars/tests/unit/operations/test_abs.py | 5 ++-- .../tests/unit/operations/test_interpolate.py | 1 + py-polars/tests/unit/series/test_series.py | 20 ++++++++----- 17 files changed, 86 insertions(+), 23 deletions(-) diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 01dbcf33db33..43de366857f8 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -747,6 +747,8 @@ impl Series { }, Int64 => Ok(self.i64().unwrap().prod_reduce()), UInt64 => Ok(self.u64().unwrap().prod_reduce()), + #[cfg(feature = "dtype-i128")] + Int128 => Ok(self.i128().unwrap().prod_reduce()), Float32 => Ok(self.f32().unwrap().prod_reduce()), Float64 => Ok(self.f64().unwrap().prod_reduce()), dt => { diff --git a/crates/polars-core/src/series/ops/downcast.rs b/crates/polars-core/src/series/ops/downcast.rs index f095c512eb67..732d2228a55b 100644 --- a/crates/polars-core/src/series/ops/downcast.rs +++ b/crates/polars-core/src/series/ops/downcast.rs @@ -219,7 +219,7 @@ impl Series { .ok_or_else(|| unpack_chunked_err!(self => "Int64")) } - /// Unpack to [`ChunkedArray`] of dtype 
[`DataType::Int64`] + /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int128`] #[cfg(feature = "dtype-i128")] pub fn i128(&self) -> PolarsResult<&Int128Chunked> { self.try_i128() diff --git a/crates/polars-ops/src/chunked_array/list/sum_mean.rs b/crates/polars-ops/src/chunked_array/list/sum_mean.rs index a1d73877ae99..9413318f1eac 100644 --- a/crates/polars-ops/src/chunked_array/list/sum_mean.rs +++ b/crates/polars-ops/src/chunked_array/list/sum_mean.rs @@ -161,6 +161,7 @@ pub(super) fn mean_list_numerical(ca: &ListChunked, inner_type: &DataType) -> Se Int16 => dispatch_mean::(values, offsets, arr.validity()), Int32 => dispatch_mean::(values, offsets, arr.validity()), Int64 => dispatch_mean::(values, offsets, arr.validity()), + Int128 => dispatch_mean::(values, offsets, arr.validity()), UInt8 => dispatch_mean::(values, offsets, arr.validity()), UInt16 => dispatch_mean::(values, offsets, arr.validity()), UInt32 => dispatch_mean::(values, offsets, arr.validity()), diff --git a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs index 2053db8786e8..5b2f83282a76 100644 --- a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs +++ b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs @@ -146,6 +146,10 @@ pub(super) fn par_sorted_merge_inner_no_nulls( DataType::Int64 => { par_sorted_merge_inner_impl(s_left.i64().unwrap(), s_right.i64().unwrap()) }, + #[cfg(feature = "dtype-i128")] + DataType::Int128 => { + par_sorted_merge_inner_impl(s_left.i128().unwrap(), s_right.i128().unwrap()) + }, DataType::Float32 => { par_sorted_merge_inner_impl(s_left.f32().unwrap(), s_right.f32().unwrap()) }, diff --git a/crates/polars-ops/src/series/ops/abs.rs b/crates/polars-ops/src/series/ops/abs.rs index 5a84678df591..e93e3c13c60d 100644 --- a/crates/polars-ops/src/series/ops/abs.rs +++ b/crates/polars-ops/src/series/ops/abs.rs @@ -10,6 +10,8 @@ pub fn abs(s: &Series) -> PolarsResult { Int16 => 
s.i16().unwrap().wrapping_abs().into_series(), Int32 => s.i32().unwrap().wrapping_abs().into_series(), Int64 => s.i64().unwrap().wrapping_abs().into_series(), + #[cfg(feature = "dtype-i128")] + Int128 => s.i128().unwrap().wrapping_abs().into_series(), Float32 => s.f32().unwrap().wrapping_abs().into_series(), Float64 => s.f64().unwrap().wrapping_abs().into_series(), #[cfg(feature = "dtype-decimal")] diff --git a/crates/polars-ops/src/series/ops/cum_agg.rs b/crates/polars-ops/src/series/ops/cum_agg.rs index 829c57c820d4..163aa10eb080 100644 --- a/crates/polars-ops/src/series/ops/cum_agg.rs +++ b/crates/polars-ops/src/series/ops/cum_agg.rs @@ -187,6 +187,8 @@ pub fn cum_prod(s: &Series, reverse: bool) -> PolarsResult { }, Int64 => cum_prod_numeric(s.i64()?, reverse).into_series(), UInt64 => cum_prod_numeric(s.u64()?, reverse).into_series(), + #[cfg(feature = "dtype-i128")] + Int128 => cum_prod_numeric(s.i128()?, reverse).into_series(), Float32 => cum_prod_numeric(s.f32()?, reverse).into_series(), Float64 => cum_prod_numeric(s.f64()?, reverse).into_series(), dt => polars_bail!(opq = cum_prod, dt), @@ -213,6 +215,8 @@ pub fn cum_sum(s: &Series, reverse: bool) -> PolarsResult { UInt32 => cum_sum_numeric(s.u32()?, reverse).into_series(), Int64 => cum_sum_numeric(s.i64()?, reverse).into_series(), UInt64 => cum_sum_numeric(s.u64()?, reverse).into_series(), + #[cfg(feature = "dtype-i128")] + Int128 => cum_sum_numeric(s.i128()?, reverse).into_series(), Float32 => cum_sum_numeric(s.f32()?, reverse).into_series(), Float64 => cum_sum_numeric(s.f64()?, reverse).into_series(), #[cfg(feature = "dtype-duration")] diff --git a/crates/polars-ops/src/series/ops/interpolation/interpolate.rs b/crates/polars-ops/src/series/ops/interpolation/interpolate.rs index 36d9dc12e556..095b38a6b20e 100644 --- a/crates/polars-ops/src/series/ops/interpolation/interpolate.rs +++ b/crates/polars-ops/src/series/ops/interpolation/interpolate.rs @@ -164,6 +164,7 @@ fn interpolate_linear(s: &Series) -> 
Series { | DataType::Int16 | DataType::Int32 | DataType::Int64 + | DataType::Int128 | DataType::UInt8 | DataType::UInt16 | DataType::UInt32 diff --git a/crates/polars-plan/src/dsl/function_expr/cum.rs b/crates/polars-plan/src/dsl/function_expr/cum.rs index 755199c3a2a0..02f652274907 100644 --- a/crates/polars-plan/src/dsl/function_expr/cum.rs +++ b/crates/polars-plan/src/dsl/function_expr/cum.rs @@ -38,6 +38,7 @@ pub(super) mod dtypes { match dt { Boolean => UInt32, Int32 => Int32, + Int128 => Int128, UInt32 => UInt32, UInt64 => UInt64, Float32 => Float32, @@ -56,6 +57,7 @@ pub(super) mod dtypes { match dt { Boolean => Int64, UInt64 => UInt64, + Int128 => Int128, Float32 => Float32, Float64 => Float64, _ => Int64, diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 02e276c98565..ef27dc3966b9 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -850,6 +850,8 @@ impl Expr { T::Float32 => T::Float32, T::Float64 => T::Float64, T::UInt64 => T::UInt64, + #[cfg(feature = "dtype-i128")] + T::Int128 => T::Int128, _ => T::Int64, }) }), diff --git a/crates/polars-python/src/expr/rolling.rs b/crates/polars-python/src/expr/rolling.rs index 5ef511902613..b9d38124247a 100644 --- a/crates/polars-python/src/expr/rolling.rs +++ b/crates/polars-python/src/expr/rolling.rs @@ -457,6 +457,18 @@ impl PyExpr { }) } }, + Int128 => { + if is_float { + let v = obj.extract::(py).unwrap(); + Ok(Int128Chunked::from_slice(PlSmallStr::EMPTY, &[v as i128]) + .into_series()) + } else { + obj.extract::(py).map(|v| { + Int128Chunked::from_slice(PlSmallStr::EMPTY, &[v]) + .into_series() + }) + } + }, Float32 => obj.extract::(py).map(|v| { Float32Chunked::from_slice(PlSmallStr::EMPTY, &[v]).into_series() }), diff --git a/crates/polars-python/src/series/comparison.rs b/crates/polars-python/src/series/comparison.rs index 2b7de37931f9..38afdbd1c667 100644 --- a/crates/polars-python/src/series/comparison.rs +++ 
b/crates/polars-python/src/series/comparison.rs @@ -71,6 +71,7 @@ impl_eq_num!(eq_i8, i8); impl_eq_num!(eq_i16, i16); impl_eq_num!(eq_i32, i32); impl_eq_num!(eq_i64, i64); +impl_eq_num!(eq_i128, i128); impl_eq_num!(eq_f32, f32); impl_eq_num!(eq_f64, f64); impl_eq_num!(eq_str, &str); @@ -98,6 +99,7 @@ impl_neq_num!(neq_i8, i8); impl_neq_num!(neq_i16, i16); impl_neq_num!(neq_i32, i32); impl_neq_num!(neq_i64, i64); +impl_neq_num!(neq_i128, i128); impl_neq_num!(neq_f32, f32); impl_neq_num!(neq_f64, f64); impl_neq_num!(neq_str, &str); @@ -124,6 +126,7 @@ impl_gt_num!(gt_i8, i8); impl_gt_num!(gt_i16, i16); impl_gt_num!(gt_i32, i32); impl_gt_num!(gt_i64, i64); +impl_gt_num!(gt_i128, i128); impl_gt_num!(gt_f32, f32); impl_gt_num!(gt_f64, f64); impl_gt_num!(gt_str, &str); @@ -150,6 +153,7 @@ impl_gt_eq_num!(gt_eq_i8, i8); impl_gt_eq_num!(gt_eq_i16, i16); impl_gt_eq_num!(gt_eq_i32, i32); impl_gt_eq_num!(gt_eq_i64, i64); +impl_gt_eq_num!(gt_eq_i128, i128); impl_gt_eq_num!(gt_eq_f32, f32); impl_gt_eq_num!(gt_eq_f64, f64); impl_gt_eq_num!(gt_eq_str, &str); @@ -177,6 +181,7 @@ impl_lt_num!(lt_i8, i8); impl_lt_num!(lt_i16, i16); impl_lt_num!(lt_i32, i32); impl_lt_num!(lt_i64, i64); +impl_lt_num!(lt_i128, i128); impl_lt_num!(lt_f32, f32); impl_lt_num!(lt_f64, f64); impl_lt_num!(lt_str, &str); @@ -203,6 +208,7 @@ impl_lt_eq_num!(lt_eq_i8, i8); impl_lt_eq_num!(lt_eq_i16, i16); impl_lt_eq_num!(lt_eq_i32, i32); impl_lt_eq_num!(lt_eq_i64, i64); +impl_lt_eq_num!(lt_eq_i128, i128); impl_lt_eq_num!(lt_eq_f32, f32); impl_lt_eq_num!(lt_eq_f64, f64); impl_lt_eq_num!(lt_eq_str, &str); diff --git a/py-polars/polars/datatypes/convert.py b/py-polars/polars/datatypes/convert.py index 423c687833ca..20abe6737854 100644 --- a/py-polars/polars/datatypes/convert.py +++ b/py-polars/polars/datatypes/convert.py @@ -28,6 +28,7 @@ Int16, Int32, Int64, + Int128, List, Null, Object, @@ -149,6 +150,7 @@ def DTYPE_TO_FFINAME(self) -> dict[PolarsDataType, str]: Duration: "duration", Float32: "f32", Float64: 
"f64", + Int128: "i128", Int16: "i16", Int32: "i32", Int64: "i64", @@ -177,6 +179,7 @@ def DTYPE_TO_PY_TYPE(self) -> dict[PolarsDataType, PythonDataType]: Duration: timedelta, Float32: float, Float64: float, + Int128: int, Int16: int, Int32: int, Int64: int, diff --git a/py-polars/tests/unit/lazyframe/test_lazyframe.py b/py-polars/tests/unit/lazyframe/test_lazyframe.py index 38f89ff87852..e94590a27d03 100644 --- a/py-polars/tests/unit/lazyframe/test_lazyframe.py +++ b/py-polars/tests/unit/lazyframe/test_lazyframe.py @@ -19,7 +19,7 @@ PolarsInefficientMapWarning, ) from polars.testing import assert_frame_equal, assert_series_equal -from tests.unit.conftest import FLOAT_DTYPES +from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES if TYPE_CHECKING: from _pytest.capture import CaptureFixture @@ -488,19 +488,34 @@ def test_len() -> None: assert cast(int, ldf.select(pl.col("nrs").len()).collect().item()) == 3 -def test_cum_agg() -> None: - ldf = pl.LazyFrame({"a": [1, 2, 3, 2]}) +@pytest.mark.parametrize("dtype", NUMERIC_DTYPES) +def test_cum_agg(dtype: PolarsDataType) -> None: + ldf = pl.LazyFrame({"a": [1, 2, 3, 2]}, schema={"a": dtype}) assert_series_equal( - ldf.select(pl.col("a").cum_sum()).collect()["a"], pl.Series("a", [1, 3, 6, 8]) + ldf.select(pl.col("a").cum_min()).collect()["a"], + pl.Series("a", [1, 1, 1, 1], dtype=dtype), ) assert_series_equal( - ldf.select(pl.col("a").cum_min()).collect()["a"], pl.Series("a", [1, 1, 1, 1]) + ldf.select(pl.col("a").cum_max()).collect()["a"], + pl.Series("a", [1, 2, 3, 3], dtype=dtype), + ) + + expected_dtype = ( + pl.Int64 if dtype in [pl.Int8, pl.Int16, pl.UInt8, pl.UInt16] else dtype ) assert_series_equal( - ldf.select(pl.col("a").cum_max()).collect()["a"], pl.Series("a", [1, 2, 3, 3]) + ldf.select(pl.col("a").cum_sum()).collect()["a"], + pl.Series("a", [1, 3, 6, 8], dtype=expected_dtype), + ) + + expected_dtype = ( + pl.Int64 + if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.UInt8, pl.UInt16, pl.UInt32] + else dtype 
) assert_series_equal( - ldf.select(pl.col("a").cum_prod()).collect()["a"], pl.Series("a", [1, 2, 6, 12]) + ldf.select(pl.col("a").cum_prod()).collect()["a"], + pl.Series("a", [1, 2, 6, 12], dtype=expected_dtype), ) diff --git a/py-polars/tests/unit/operations/rolling/test_map.py b/py-polars/tests/unit/operations/rolling/test_map.py index 730b76baad55..cf3b1cefbf80 100644 --- a/py-polars/tests/unit/operations/rolling/test_map.py +++ b/py-polars/tests/unit/operations/rolling/test_map.py @@ -7,6 +7,7 @@ import polars as pl from polars.testing import assert_series_equal +from tests.unit.conftest import INTEGER_DTYPES if TYPE_CHECKING: from polars._typing import PolarsDataType @@ -82,17 +83,19 @@ def test_rolling_map_std_weights(dtype: PolarsDataType) -> None: assert_series_equal(result, expected) -def test_rolling_map_sum_int() -> None: - s = pl.Series("A", [1, 2, 9, 2, 13], dtype=pl.Int32) +@pytest.mark.parametrize("dtype", INTEGER_DTYPES) +def test_rolling_map_sum_int(dtype: PolarsDataType) -> None: + s = pl.Series("A", [1, 2, 9, 2, 13], dtype=dtype) result = s.rolling_map(function=lambda s: s.sum(), window_size=3) - expected = pl.Series("A", [None, None, 12, 13, 24], dtype=pl.Int32) + expected = pl.Series("A", [None, None, 12, 13, 24], dtype=dtype) assert_series_equal(result, expected) -def test_rolling_map_sum_int_cast_to_float() -> None: - s = pl.Series("A", [1, 2, 9, None, 13], dtype=pl.Int32) +@pytest.mark.parametrize("dtype", INTEGER_DTYPES) +def test_rolling_map_sum_int_cast_to_float(dtype: PolarsDataType) -> None: + s = pl.Series("A", [1, 2, 9, None, 13], dtype=dtype) result = s.rolling_map( function=lambda s: s.sum(), window_size=3, weights=[1.0, 2.0, 3.0] diff --git a/py-polars/tests/unit/operations/test_abs.py b/py-polars/tests/unit/operations/test_abs.py index 68e4518a93f9..ad0d6eadf9c1 100644 --- a/py-polars/tests/unit/operations/test_abs.py +++ b/py-polars/tests/unit/operations/test_abs.py @@ -10,6 +10,7 @@ import polars as pl from polars.exceptions 
import InvalidOperationError from polars.testing import assert_frame_equal, assert_series_equal +from tests.unit.conftest import FLOAT_DTYPES, SIGNED_INTEGER_DTYPES if TYPE_CHECKING: from polars._typing import PolarsDataType @@ -47,9 +48,7 @@ def test_builtin_abs() -> None: assert abs(s).to_list() == [1, 0, 1, None] -@pytest.mark.parametrize( - "dtype", [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.Float32, pl.Float64] -) +@pytest.mark.parametrize("dtype", [*FLOAT_DTYPES, *SIGNED_INTEGER_DTYPES]) def test_abs_builtin(dtype: PolarsDataType) -> None: lf = pl.LazyFrame({"a": [-1, 0, 1, None]}, schema={"a": dtype}) result = lf.select(abs(pl.col("a"))) diff --git a/py-polars/tests/unit/operations/test_interpolate.py b/py-polars/tests/unit/operations/test_interpolate.py index 9f690e6ecd7b..5d39ffc751fa 100644 --- a/py-polars/tests/unit/operations/test_interpolate.py +++ b/py-polars/tests/unit/operations/test_interpolate.py @@ -22,6 +22,7 @@ (pl.Int16, pl.Float64), (pl.Int32, pl.Float64), (pl.Int64, pl.Float64), + (pl.Int128, pl.Float64), (pl.UInt8, pl.Float64), (pl.UInt16, pl.Float64), (pl.UInt32, pl.Float64), diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index fc3872ce1f3b..68b2528d5da6 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -30,6 +30,7 @@ ShapeError, ) from polars.testing import assert_frame_equal, assert_series_equal +from tests.unit.conftest import FLOAT_DTYPES, INTEGER_DTYPES from tests.unit.utils.pycapsule_utils import PyCapsuleStreamHolder if TYPE_CHECKING: @@ -1717,23 +1718,28 @@ def test_trigonometric_invalid_input() -> None: s.cosh() -def test_product() -> None: - a = pl.Series("a", [1, 2, 3]) +@pytest.mark.parametrize("dtype", INTEGER_DTYPES) +def test_product_ints(dtype: PolarsDataType) -> None: + a = pl.Series("a", [1, 2, 3], dtype=dtype) out = a.product() assert out == 6 - a = pl.Series("a", [1, 2, None]) + a = pl.Series("a", [1, 2, 
None], dtype=dtype) out = a.product() assert out == 2 - a = pl.Series("a", [None, 2, 3]) + a = pl.Series("a", [None, 2, 3], dtype=dtype) out = a.product() assert out == 6 - a = pl.Series("a", [], dtype=pl.Float32) + + +@pytest.mark.parametrize("dtype", FLOAT_DTYPES) +def test_product_floats(dtype: PolarsDataType) -> None: + a = pl.Series("a", [], dtype=dtype) out = a.product() assert out == 1 - a = pl.Series("a", [None, None], dtype=pl.Float32) + a = pl.Series("a", [None, None], dtype=dtype) out = a.product() assert out == 1 - a = pl.Series("a", [3.0, None, float("nan")]) + a = pl.Series("a", [3.0, None, float("nan")], dtype=dtype) out = a.product() assert math.isnan(out) From 5c9bb7189f220d9064034b4c03175cf841c06d77 Mon Sep 17 00:00:00 2001 From: Marshall Date: Fri, 3 Jan 2025 02:53:35 -0500 Subject: [PATCH 10/20] fix: Add `unique` fast path for empty categoricals (#20536) --- .../chunked_array/logical/categorical/ops/unique.rs | 12 ++++++++++++ .../tests/unit/operations/unique/test_unique.py | 10 ++++++++++ 2 files changed, 22 insertions(+) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index 076099a9c33e..7792fae8a544 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -7,6 +7,18 @@ use super::*; impl CategoricalChunked { pub fn unique(&self) -> PolarsResult { let cat_map = self.get_rev_map(); + if self.is_empty() { + // SAFETY: rev map is valid. 
+ unsafe { + return Ok(CategoricalChunked::from_cats_and_rev_map_unchecked( + UInt32Chunked::full_null(self.name().clone(), 0), + cat_map.clone(), + self.is_enum(), + self.get_ordering(), + )); + } + }; + if self._can_fast_unique() { let ca = match &**cat_map { RevMapping::Local(a, _) => UInt32Chunked::from_iter_values( diff --git a/py-polars/tests/unit/operations/unique/test_unique.py b/py-polars/tests/unit/operations/unique/test_unique.py index 595ae1db59eb..ff4a0cd10f32 100644 --- a/py-polars/tests/unit/operations/unique/test_unique.py +++ b/py-polars/tests/unit/operations/unique/test_unique.py @@ -154,6 +154,16 @@ def test_unique_categorical(input: list[str | None], output: list[str | None]) - assert_series_equal(result, expected) +def test_unique_categorical_global() -> None: + with pl.StringCache(): + pl.Series(["aaaa", "bbbb", "cccc"]) # pre-fill global cache + s = pl.Series(["a", "b", "c"], dtype=pl.Categorical) + s_empty = s.slice(0, 0) + + assert s_empty.unique().to_list() == [] + assert_series_equal(s_empty.cat.get_categories(), pl.Series(["a", "b", "c"])) + + def test_unique_with_null() -> None: df = pl.DataFrame( { From 5f4499773678fe3104daa44646794686d82f273e Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 3 Jan 2025 05:26:36 -0500 Subject: [PATCH 11/20] feat: Add `Int128` IO support for csv & ipc (#20535) --- .../polars-arrow/src/array/dictionary/mod.rs | 4 +++ crates/polars-arrow/src/datatypes/mod.rs | 1 + .../src/datatypes/physical_type.rs | 2 ++ crates/polars-arrow/src/io/ipc/read/schema.rs | 3 +- .../polars-arrow/src/io/ipc/write/schema.rs | 3 +- crates/polars-arrow/src/util/macros.rs | 1 + .../polars-compute/src/comparisons/array.rs | 1 + .../src/comparisons/dyn_array.rs | 1 + crates/polars-compute/src/comparisons/list.rs | 2 ++ crates/polars-io/Cargo.toml | 1 + crates/polars-io/src/csv/read/buffer.rs | 26 ++++++++++++++++ .../src/csv/write/write_impl/serializer.rs | 1 + crates/polars/Cargo.toml | 1 + py-polars/tests/unit/io/test_csv.py | 
23 ++++++++++---- py-polars/tests/unit/io/test_ipc.py | 31 ++++++++++++++++--- 15 files changed, 88 insertions(+), 13 deletions(-) diff --git a/crates/polars-arrow/src/array/dictionary/mod.rs b/crates/polars-arrow/src/array/dictionary/mod.rs index 3f44dd604980..8d31109d8f19 100644 --- a/crates/polars-arrow/src/array/dictionary/mod.rs +++ b/crates/polars-arrow/src/array/dictionary/mod.rs @@ -81,6 +81,10 @@ unsafe impl DictionaryKey for i64 { const KEY_TYPE: IntegerType = IntegerType::Int64; const MAX_USIZE_VALUE: usize = i64::MAX as usize; } +unsafe impl DictionaryKey for i128 { + const KEY_TYPE: IntegerType = IntegerType::Int128; + const MAX_USIZE_VALUE: usize = i128::MAX as usize; +} unsafe impl DictionaryKey for u8 { const KEY_TYPE: IntegerType = IntegerType::UInt8; const MAX_USIZE_VALUE: usize = u8::MAX as usize; diff --git a/crates/polars-arrow/src/datatypes/mod.rs b/crates/polars-arrow/src/datatypes/mod.rs index d3bc5417a9d8..cc7a081a81cc 100644 --- a/crates/polars-arrow/src/datatypes/mod.rs +++ b/crates/polars-arrow/src/datatypes/mod.rs @@ -455,6 +455,7 @@ impl From for ArrowDataType { IntegerType::Int16 => ArrowDataType::Int16, IntegerType::Int32 => ArrowDataType::Int32, IntegerType::Int64 => ArrowDataType::Int64, + IntegerType::Int128 => ArrowDataType::Int128, IntegerType::UInt8 => ArrowDataType::UInt8, IntegerType::UInt16 => ArrowDataType::UInt16, IntegerType::UInt32 => ArrowDataType::UInt32, diff --git a/crates/polars-arrow/src/datatypes/physical_type.rs b/crates/polars-arrow/src/datatypes/physical_type.rs index 732a129055a6..f75d8e644f4c 100644 --- a/crates/polars-arrow/src/datatypes/physical_type.rs +++ b/crates/polars-arrow/src/datatypes/physical_type.rs @@ -76,6 +76,8 @@ pub enum IntegerType { Int32, /// A signed 64-bit integer. Int64, + /// A signed 128-bit integer. + Int128, /// An unsigned 8-bit integer. UInt8, /// An unsigned 16-bit integer. 
diff --git a/crates/polars-arrow/src/io/ipc/read/schema.rs b/crates/polars-arrow/src/io/ipc/read/schema.rs index d9bb3b21828e..3ed84d3005bd 100644 --- a/crates/polars-arrow/src/io/ipc/read/schema.rs +++ b/crates/polars-arrow/src/io/ipc/read/schema.rs @@ -72,7 +72,8 @@ fn deserialize_integer(int: arrow_format::ipc::IntRef) -> PolarsResult IntegerType::UInt32, (64, true) => IntegerType::Int64, (64, false) => IntegerType::UInt64, - _ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32 or 64."), + (128, true) => IntegerType::Int128, + _ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32, 64 or 128."), }) } diff --git a/crates/polars-arrow/src/io/ipc/write/schema.rs b/crates/polars-arrow/src/io/ipc/write/schema.rs index a7e15bbdf464..bdceb58acc5d 100644 --- a/crates/polars-arrow/src/io/ipc/write/schema.rs +++ b/crates/polars-arrow/src/io/ipc/write/schema.rs @@ -327,7 +327,7 @@ pub(crate) fn serialize_dictionary( ) -> arrow_format::ipc::DictionaryEncoding { use IntegerType::*; let is_signed = match index_type { - Int8 | Int16 | Int32 | Int64 => true, + Int8 | Int16 | Int32 | Int64 | Int128 => true, UInt8 | UInt16 | UInt32 | UInt64 => false, }; @@ -336,6 +336,7 @@ pub(crate) fn serialize_dictionary( Int16 | UInt16 => 16, Int32 | UInt32 => 32, Int64 | UInt64 => 64, + Int128 => 128, }; let index_type = arrow_format::ipc::Int { diff --git a/crates/polars-arrow/src/util/macros.rs b/crates/polars-arrow/src/util/macros.rs index fb5bd61ebba0..2153d2cb3a07 100644 --- a/crates/polars-arrow/src/util/macros.rs +++ b/crates/polars-arrow/src/util/macros.rs @@ -57,6 +57,7 @@ macro_rules! match_integer_type {( Int16 => __with_ty__! { i16 }, Int32 => __with_ty__! { i32 }, Int64 => __with_ty__! { i64 }, + Int128 => __with_ty__! { i128 }, UInt8 => __with_ty__! { u8 }, UInt16 => __with_ty__! { u16 }, UInt32 => __with_ty__! 
{ u32 }, diff --git a/crates/polars-compute/src/comparisons/array.rs b/crates/polars-compute/src/comparisons/array.rs index facde12a5c37..210a8a0489aa 100644 --- a/crates/polars-compute/src/comparisons/array.rs +++ b/crates/polars-compute/src/comparisons/array.rs @@ -205,6 +205,7 @@ macro_rules! compare { PH::Dictionary(I::Int16) => call_binary!(DictionaryArray), PH::Dictionary(I::Int32) => call_binary!(DictionaryArray), PH::Dictionary(I::Int64) => call_binary!(DictionaryArray), + PH::Dictionary(I::Int128) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray), diff --git a/crates/polars-compute/src/comparisons/dyn_array.rs b/crates/polars-compute/src/comparisons/dyn_array.rs index 3ee3d802f09f..07fd4bbd9a9d 100644 --- a/crates/polars-compute/src/comparisons/dyn_array.rs +++ b/crates/polars-compute/src/comparisons/dyn_array.rs @@ -68,6 +68,7 @@ macro_rules! compare { PH::Dictionary(I::Int16) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::Int32) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::Int64) => call_binary!(DictionaryArray, lhs, rhs, $op), + PH::Dictionary(I::Int128) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray, lhs, rhs, $op), diff --git a/crates/polars-compute/src/comparisons/list.rs b/crates/polars-compute/src/comparisons/list.rs index f7e18b79c0e7..c7c1db50ed70 100644 --- a/crates/polars-compute/src/comparisons/list.rs +++ b/crates/polars-compute/src/comparisons/list.rs @@ -99,6 +99,7 @@ macro_rules! 
compare { PH::Dictionary(I::Int16) => call_binary!(DictionaryArray), PH::Dictionary(I::Int32) => call_binary!(DictionaryArray), PH::Dictionary(I::Int64) => call_binary!(DictionaryArray), + PH::Dictionary(I::Int128) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray), @@ -196,6 +197,7 @@ macro_rules! compare_broadcast { PH::Dictionary(I::Int16) => call_binary!(DictionaryArray), PH::Dictionary(I::Int32) => call_binary!(DictionaryArray), PH::Dictionary(I::Int64) => call_binary!(DictionaryArray), + PH::Dictionary(I::Int128) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray), diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index f4ff3dd431c7..4c3ed4b04a76 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -82,6 +82,7 @@ dtype-u8 = ["polars-core/dtype-u8"] dtype-u16 = ["polars-core/dtype-u16"] dtype-i8 = ["polars-core/dtype-i8"] dtype-i16 = ["polars-core/dtype-i16"] +dtype-i128 = ["polars-core/dtype-i128"] dtype-categorical = ["polars-core/dtype-categorical"] dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"] object = ["polars-core/object"] diff --git a/crates/polars-io/src/csv/read/buffer.rs b/crates/polars-io/src/csv/read/buffer.rs index 22b8b34d3676..a13ab46e585e 100644 --- a/crates/polars-io/src/csv/read/buffer.rs +++ b/crates/polars-io/src/csv/read/buffer.rs @@ -82,6 +82,13 @@ impl PrimitiveParser for Int64Type { atoi_simd::parse_skipped(bytes).ok() } } +#[cfg(feature = "dtype-i128")] +impl PrimitiveParser for Int128Type { + #[inline] + fn parse(bytes: &[u8]) -> Option { + atoi_simd::parse_skipped(bytes).ok() + } +} trait ParsedBuffer { fn parse_bytes( @@ -522,6 +529,8 @@ pub fn 
init_buffers( &DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)), &DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)), &DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)), + #[cfg(feature = "dtype-i128")] + &DataType::Int128 => Buffer::Int128(PrimitiveChunkedBuilder::new(name, capacity)), #[cfg(feature = "dtype-u8")] &DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)), #[cfg(feature = "dtype-u16")] @@ -594,6 +603,8 @@ pub enum Buffer { Int16(PrimitiveChunkedBuilder), Int32(PrimitiveChunkedBuilder), Int64(PrimitiveChunkedBuilder), + #[cfg(feature = "dtype-i128")] + Int128(PrimitiveChunkedBuilder), #[cfg(feature = "dtype-u8")] UInt8(PrimitiveChunkedBuilder), #[cfg(feature = "dtype-u16")] @@ -628,6 +639,8 @@ impl Buffer { Buffer::Int16(v) => v.finish().into_series(), Buffer::Int32(v) => v.finish().into_series(), Buffer::Int64(v) => v.finish().into_series(), + #[cfg(feature = "dtype-i128")] + Buffer::Int128(v) => v.finish().into_series(), #[cfg(feature = "dtype-u8")] Buffer::UInt8(v) => v.finish().into_series(), #[cfg(feature = "dtype-u16")] @@ -701,6 +714,8 @@ impl Buffer { Buffer::Int16(v) => v.append_null(), Buffer::Int32(v) => v.append_null(), Buffer::Int64(v) => v.append_null(), + #[cfg(feature = "dtype-i128")] + Buffer::Int128(v) => v.append_null(), #[cfg(feature = "dtype-u8")] Buffer::UInt8(v) => v.append_null(), #[cfg(feature = "dtype-u16")] @@ -745,6 +760,8 @@ impl Buffer { Buffer::Int16(_) => DataType::Int16, Buffer::Int32(_) => DataType::Int32, Buffer::Int64(_) => DataType::Int64, + #[cfg(feature = "dtype-i128")] + Buffer::Int128(_) => DataType::Int128, #[cfg(feature = "dtype-u8")] Buffer::UInt8(_) => DataType::UInt8, #[cfg(feature = "dtype-u16")] @@ -824,6 +841,15 @@ impl Buffer { missing_is_null, None, ), + #[cfg(feature = "dtype-i128")] + Int128(buf) => as ParsedBuffer>::parse_bytes( + buf, + bytes, + ignore_errors, + needs_escaping, + 
missing_is_null, + None, + ), #[cfg(feature = "dtype-u8")] UInt8(buf) => as ParsedBuffer>::parse_bytes( buf, diff --git a/crates/polars-io/src/csv/write/write_impl/serializer.rs b/crates/polars-io/src/csv/write/write_impl/serializer.rs index 6a4f964d88b3..5229455022cc 100644 --- a/crates/polars-io/src/csv/write/write_impl/serializer.rs +++ b/crates/polars-io/src/csv/write/write_impl/serializer.rs @@ -535,6 +535,7 @@ pub(super) fn serializer_for<'a>( DataType::UInt32 => quote_if_always!(integer_serializer::), DataType::Int64 => quote_if_always!(integer_serializer::), DataType::UInt64 => quote_if_always!(integer_serializer::), + DataType::Int128 => quote_if_always!(integer_serializer::), DataType::Float32 => match options.float_precision { Some(precision) => match options.float_scientific { Some(true) => { diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index eab4dd534314..25a1bcbb0489 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -330,6 +330,7 @@ dtype-i16 = [ ] dtype-i128 = [ "polars-core/dtype-i128", + "polars-io/dtype-i128", "polars-lazy?/dtype-i128", "polars-ops/dtype-i128", "polars-time?/dtype-i128", diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 43b05892d65d..8d258eb2ba5e 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -259,12 +259,12 @@ def test_csv_missing_utf8_is_empty_string() -> None: def test_csv_int_types() -> None: f = io.StringIO( - "u8,i8,u16,i16,u32,i32,u64,i64\n" - "0,0,0,0,0,0,0,0\n" - "0,-128,0,-32768,0,-2147483648,0,-9223372036854775808\n" - "255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807\n" - "01,01,01,01,01,01,01,01\n" - "01,-01,01,-01,01,-01,01,-01\n" + "u8,i8,u16,i16,u32,i32,u64,i64,i128\n" + "0,0,0,0,0,0,0,0,0\n" + "0,-128,0,-32768,0,-2147483648,0,-9223372036854775808,-170141183460469231731687303715884105728\n" + 
"255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807,170141183460469231731687303715884105727\n" + "01,01,01,01,01,01,01,01,01\n" + "01,-01,01,-01,01,-01,01,-01,01\n" ) df = pl.read_csv( f, @@ -277,6 +277,7 @@ def test_csv_int_types() -> None: "i32": pl.Int32, "u64": pl.UInt64, "i64": pl.Int64, + "i128": pl.Int128, }, ) @@ -295,6 +296,16 @@ def test_csv_int_types() -> None: [0, -9223372036854775808, 9223372036854775807, 1, -1], dtype=pl.Int64, ), + "i128": pl.Series( + [ + 0, + -170141183460469231731687303715884105728, + 170141183460469231731687303715884105727, + 1, + 1, + ], + dtype=pl.Int128, + ), } ), ) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 84e6436cb10e..bdd7a47b38fe 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -95,7 +95,8 @@ def test_select_columns_from_buffer(stream: bool) -> None: "a": [1], "b": [2], "c": [3], - } + }, + schema={"a": pl.Int64(), "b": pl.Int128(), "c": pl.UInt8()}, ) f = io.BytesIO() @@ -109,7 +110,8 @@ def test_select_columns_from_buffer(stream: bool) -> None: "b": [2], "c": [3], "a": [1], - } + }, + schema={"b": pl.Int128(), "c": pl.UInt8(), "a": pl.Int64()}, ) assert_frame_equal(expected, actual) @@ -142,14 +144,33 @@ def test_compressed_simple(compression: IpcCompression, stream: bool) -> None: @pytest.mark.parametrize("compression", COMPRESSIONS) def test_ipc_schema(compression: IpcCompression) -> None: - df = pl.DataFrame({"a": [1, 2], "b": ["a", None], "c": [True, False]}) + schema = { + "i64": pl.Int64(), + "i128": pl.Int128(), + "u8": pl.UInt8(), + "f32": pl.Float32(), + "f64": pl.Float64(), + "str": pl.String(), + "bool": pl.Boolean(), + } + df = pl.DataFrame( + { + "i64": [1, 2], + "i128": [1, 2], + "u8": [1, 2], + "f32": [1, 2], + "f64": [1, 2], + "str": ["a", None], + "bool": [True, False], + }, + schema=schema, + ) f = io.BytesIO() df.write_ipc(f, compression=compression) f.seek(0) - expected = 
{"a": pl.Int64(), "b": pl.String(), "c": pl.Boolean()} - assert pl.read_ipc_schema(f) == expected + assert pl.read_ipc_schema(f) == schema @pytest.mark.write_disk From 58c17455390da12615e5a13f68ebc69d0d554eba Mon Sep 17 00:00:00 2001 From: Prathamesh Ghatole <77586602+Prathamesh-Ghatole@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:59:50 +0530 Subject: [PATCH 12/20] docs(python): Fix typo in `DataFrame.cast` (#20532) --- py-polars/polars/dataframe/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 21cddd52a81e..b08ce16ad91b 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -7937,7 +7937,7 @@ def cast( Mapping of column names (or selector) to dtypes, or a single dtype to which all columns will be cast. strict - Raise if cast is invalid on rows after predicates are pusded down. + Raise if cast is invalid on rows after predicates are pushed down. If `False`, invalid casts will produce null values. 
Examples From ca36b66110e4c73b116c91d0f7d74c9fd0377ed4 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 3 Jan 2025 13:39:50 +0100 Subject: [PATCH 13/20] fix: Revert categorical unique code (#20540) --- .../logical/categorical/ops/unique.rs | 31 +++---------------- py-polars/polars/_utils/various.py | 1 + .../tests/unit/datatypes/test_categorical.py | 20 ++++++++++++ 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index 7792fae8a544..c46291e4382a 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -1,6 +1,4 @@ -use polars_compute::unique::{ - DictionaryRangedUniqueState, PrimitiveRangedUniqueState, RangedUniqueKernel, -}; +use polars_compute::unique::{DictionaryRangedUniqueState, RangedUniqueKernel}; use super::*; @@ -43,32 +41,11 @@ impl CategoricalChunked { Ok(out) } } else { - let has_nulls = (self.null_count() > 0) as u32; - let mut state = match cat_map.as_ref() { - RevMapping::Global(map, values, _) => { - if self.is_enum() { - PrimitiveRangedUniqueState::new(0, values.len() as u32 + has_nulls) - } else { - let mut min = u32::MAX; - let mut max = 0u32; - - for &v in map.keys() { - min = min.min(v); - max = max.max(v); - } - - PrimitiveRangedUniqueState::new(min, max + has_nulls) - } - }, - RevMapping::Local(values, _) => { - PrimitiveRangedUniqueState::new(0, values.len() as u32 + has_nulls) - }, - }; - + let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed()); for chunk in self.physical().downcast_iter() { - state.append(chunk); + state.key_state().append(chunk); } - let unique = state.finalize_unique(); + let (_, unique, _) = state.finalize_unique().take(); let ca = unsafe { UInt32Chunked::from_chunks_and_dtype_unchecked( self.physical().name().clone(), 
diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py index 126929d6d627..3d9b58dddb7e 100644 --- a/py-polars/polars/_utils/various.py +++ b/py-polars/polars/_utils/various.py @@ -652,6 +652,7 @@ def re_escape(s: str) -> str: return re.sub(f"([{re_rust_metachars}])", r"\\\1", s) +# Don't rename or move. This is used by polars cloud def display_dot_graph( *, dot: str, diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 64b789281a21..66aa7b2ba898 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -905,3 +905,23 @@ def test_categorical_unique() -> None: s = pl.Series(["a", "b", None], dtype=pl.Categorical) assert s.n_unique() == 3 assert s.unique().to_list() == ["a", "b", None] + + +@StringCache() +def test_categorical_unique_20539() -> None: + df = pl.DataFrame({"number": [1, 1, 2, 2, 3], "letter": ["a", "b", "b", "c", "c"]}) + + result = ( + df.cast({"letter": pl.Categorical}) + .group_by("number") + .agg( + unique=pl.col("letter").unique(), + unique_with_order=pl.col("letter").unique(maintain_order=True), + ) + ) + + assert result.sort("number").to_dict(as_series=False) == { + "number": [1, 2, 3], + "unique": [["a", "b"], ["b", "c"], ["c"]], + "unique_with_order": [["a", "b"], ["b", "c"], ["c"]], + } From 9ce1c070dfae7ecef9664f902fce3697712f4186 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 3 Jan 2025 16:12:43 +0100 Subject: [PATCH 14/20] ci: Report wheel sizes (#20541) --- .github/workflows/benchmark.yml | 59 ++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index afa9219231a3..34898e640697 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -73,6 +73,63 @@ jobs: working-directory: py-polars run: maturin develop --release -- -C 
codegen-units=8 -C lto=thin -C target-cpu=native + - name: Set wheel size + run: | + WHEEL_SIZE=$(ls -l py-polars/polars/polars*.so | awk '{ print $5 }') + echo "WHEEL_SIZE=$WHEEL_SIZE" >> $GITHUB_ENV + + - name: Upload wheel sizes artifact (main only) + if: github.ref_name == 'main' + uses: actions/upload-artifact@v3 + with: + name: wheel-size + path: | + echo "$GITHUB_RUN_ID $WHEEL_SIZE" > wheel_sizes.txt + wheel_sizes.txt + + - name: Download main wheel size + uses: actions/download-artifact@v3 + with: + name: wheel-size + continue-on-error: true + + - name: Extract previous wheel size + id: load_previous_size + run: | + if [[ -f wheel_sizes.txt ]]; then + PREVIOUS_WHEEL_SIZE=$(tail -n 1 wheel_sizes.txt | awk '{ print $2 }') + echo "PREVIOUS_WHEEL_SIZE=$PREVIOUS_WHEEL_SIZE" >> $GITHUB_ENV + else + echo "PREVIOUS_WHEEL_SIZE=Unknown" >> $GITHUB_ENV + fi + + - name: Comment wheel size + uses: actions/github-script@v7 + with: + script: | + const previousSize = process.env.PREVIOUS_WHEEL_SIZE || 'Unknown'; + const currentSize = process.env.WHEEL_SIZE || 'Unknown'; + + // Convert to MB + const previousSizeMB = previousSize !== 'Unknown' ? (previousSize / 1024 / 1024).toFixed(4) : 'Unknown'; + const currentSizeMB = currentSize !== 'Unknown' ? 
(currentSize / 1024 / 1024).toFixed(4) : 'Unknown'; + + let commentBody = `The previous wheel size was **${previousSizeMB} MB**.\nThe current wheel size after this PR is **${currentSizeMB} MB**.`; + + // Calculate percentage increase if both sizes are available + if (previousSize !== 'Unknown' && currentSize !== '') { + const increase = ((currentSize - previousSize) / previousSize) * 100; + commentBody += `\nThis represents a **${increase.toFixed(2)}% increase** in size.`; + } + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: commentBody + }); + + - name: Run benchmark tests uses: CodSpeedHQ/action@v3 with: @@ -87,4 +144,4 @@ jobs: working-directory: py-polars env: POLARS_AUTO_NEW_STREAMING: 1 - run: pytest -n auto --dist loadgroup -m "not may_fail_auto_streaming and not slow and not write_disk and not release and not docs and not hypothesis and not benchmark and not ci_only" \ No newline at end of file + run: pytest -n auto --dist loadgroup -m "not may_fail_auto_streaming and not slow and not write_disk and not release and not docs and not hypothesis and not benchmark and not ci_only" From 7c64640ab4ce2b669b3af15fb20793b910f91c1a Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Fri, 3 Jan 2025 11:56:57 -0500 Subject: [PATCH 15/20] fix: Update eager join doctest on multiple columns (#20542) --- crates/polars/src/docs/eager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs index d9b00886d5af..f15a2d9f1a15 100644 --- a/crates/polars/src/docs/eager.rs +++ b/crates/polars/src/docs/eager.rs @@ -407,7 +407,7 @@ //! temp.full_join(&rain, ["days"], ["days"]); //! //! // join on multiple columns -//! temp.join(&rain, vec!["days", "other"], vec!["days", "other"], JoinArgs::new(JoinType::Left)); +//! 
temp.join(&rain, vec!["days", "other"], vec!["days", "other"], JoinArgs::new(JoinType::Left), None); //! //! # Ok(()) //! # } From 15175999972720195b41205a5c30e42d824055ff Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 3 Jan 2025 18:32:07 +0100 Subject: [PATCH 16/20] fix: Fix more global categorical issues (#20547) --- .github/workflows/benchmark.yml | 2 +- .../chunked_array/comparison/categorical.rs | 30 ++++++++++++++----- .../logical/categorical/ops/unique.rs | 22 ++------------ .../tests/unit/datatypes/test_categorical.py | 21 ++++++++++++- 4 files changed, 46 insertions(+), 29 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 34898e640697..49ef91de9be4 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -114,7 +114,7 @@ jobs: const previousSizeMB = previousSize !== 'Unknown' ? (previousSize / 1024 / 1024).toFixed(4) : 'Unknown'; const currentSizeMB = currentSize !== 'Unknown' ? (currentSize / 1024 / 1024).toFixed(4) : 'Unknown'; - let commentBody = `The previous wheel size was **${previousSizeMB} MB**.\nThe current wheel size after this PR is **${currentSizeMB} MB**.`; + let commentBody = `The uncompressed binary size was **${previousSizeMB} MB**.\nThe uncompressed binary size after this PR is **${currentSizeMB} MB**.`; // Calculate percentage increase if both sizes are available if (previousSize !== 'Unknown' && currentSize !== '') { diff --git a/crates/polars-core/src/chunked_array/comparison/categorical.rs b/crates/polars-core/src/chunked_array/comparison/categorical.rs index bbcd6b6047c9..09573c5fbd32 100644 --- a/crates/polars-core/src/chunked_array/comparison/categorical.rs +++ b/crates/polars-core/src/chunked_array/comparison/categorical.rs @@ -374,13 +374,29 @@ where // Apply comparison on categories map and then do a lookup let bitmap = str_single_compare_function(lhs.get_rev_map().get_categories(), rhs); - Ok( - 
BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(|opt_idx| { - // SAFETY: indexing into bitmap with same length as original array - opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) }) - })) - .with_name(lhs.name().clone()), - ) + let mask = match lhs.get_rev_map().as_ref() { + RevMapping::Local(_, _) => { + BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map( + |opt_idx| { + // SAFETY: indexing into bitmap with same length as original array + opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) }) + }, + )) + }, + RevMapping::Global(idx_map, _, _) => { + BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map( + |opt_idx| { + // SAFETY: indexing into bitmap with same length as original array + opt_idx.map(|idx| unsafe { + let idx = *idx_map.get(&idx).unwrap(); + bitmap.get_bit_unchecked(idx as usize) + }) + }, + )) + }, + }; + + Ok(mask.with_name(lhs.name().clone())) } } diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index c46291e4382a..17752f828d8d 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -1,5 +1,3 @@ -use polars_compute::unique::{DictionaryRangedUniqueState, RangedUniqueKernel}; - use super::*; impl CategoricalChunked { @@ -41,18 +39,7 @@ impl CategoricalChunked { Ok(out) } } else { - let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed()); - for chunk in self.physical().downcast_iter() { - state.key_state().append(chunk); - } - let (_, unique, _) = state.finalize_unique().take(); - let ca = unsafe { - UInt32Chunked::from_chunks_and_dtype_unchecked( - self.physical().name().clone(), - vec![unique.to_boxed()], - DataType::UInt32, - ) - }; + let ca = self.physical().unique()?; // SAFETY: // we only removed some indexes 
so we are still in bounds unsafe { @@ -70,12 +57,7 @@ impl CategoricalChunked { if self._can_fast_unique() { Ok(self.get_rev_map().len()) } else { - let cat_map = self.get_rev_map(); - let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed()); - for chunk in self.physical().downcast_iter() { - state.key_state().append(chunk); - } - Ok(state.finalize_n_unique()) + self.physical().n_unique() } } diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 66aa7b2ba898..d5416bc47a93 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -904,7 +904,7 @@ def test_perfect_group_by_19950() -> None: def test_categorical_unique() -> None: s = pl.Series(["a", "b", None], dtype=pl.Categorical) assert s.n_unique() == 3 - assert s.unique().to_list() == ["a", "b", None] + assert s.unique().sort().to_list() == [None, "a", "b"] @StringCache() @@ -925,3 +925,22 @@ def test_categorical_unique_20539() -> None: "unique": [["a", "b"], ["b", "c"], ["c"]], "unique_with_order": [["a", "b"], ["b", "c"], ["c"]], } + + +@StringCache() +@pytest.mark.may_fail_auto_streaming +def test_categorical_prefill() -> None: + # https://github.com/pola-rs/polars/pull/20547#issuecomment-2569473443 + # prefill cache + pl.Series(["aaa", "bbb", "ccc"], dtype=pl.Categorical) # pre-fill cache + + # test_compare_categorical_single + assert (pl.Series(["a"], dtype=pl.Categorical) < "a").to_list() == [False] + + # test_unique_categorical + a = pl.Series(["a"], dtype=pl.Categorical) + assert a.unique().to_list() == ["a"] + + s = pl.Series(["1", "2", "3"], dtype=pl.Categorical) + s = s.filter([True, False, True]) + assert s.n_unique() == 2 From 409f09158bf95bbda78dfaa2624d59adcdcd3412 Mon Sep 17 00:00:00 2001 From: Marshall Date: Fri, 3 Jan 2025 13:21:40 -0500 Subject: [PATCH 17/20] chore: Increase categorical test coverage (#20514) --- 
py-polars/tests/unit/conftest.py | 22 ++++++ .../constructors/test_any_value_fallbacks.py | 24 +++--- .../tests/unit/datatypes/test_categorical.py | 42 +++++++--- py-polars/tests/unit/datatypes/test_list.py | 1 + .../functions/as_datatype/test_concat_list.py | 1 + .../unit/interchange/test_from_dataframe.py | 1 + py-polars/tests/unit/io/test_delta.py | 1 + py-polars/tests/unit/io/test_lazy_parquet.py | 2 + py-polars/tests/unit/io/test_other.py | 1 + py-polars/tests/unit/io/test_parquet.py | 1 + .../operations/namespaces/test_categorical.py | 78 +++++++------------ .../tests/unit/operations/test_filter.py | 1 + .../tests/unit/operations/test_group_by.py | 1 + .../unit/operations/unique/test_unique.py | 8 +- py-polars/tests/unit/series/test_series.py | 1 + 15 files changed, 114 insertions(+), 71 deletions(-) diff --git a/py-polars/tests/unit/conftest.py b/py-polars/tests/unit/conftest.py index 825dc161cdbf..8197872831b5 100644 --- a/py-polars/tests/unit/conftest.py +++ b/py-polars/tests/unit/conftest.py @@ -16,6 +16,9 @@ if TYPE_CHECKING: from collections.abc import Generator + from typing import Any + + FixtureRequest = Any load_profile( profile=os.environ.get("POLARS_HYPOTHESIS_PROFILE", "fast"), # type: ignore[arg-type] @@ -229,3 +232,22 @@ def memory_usage_without_pyarrow() -> Generator[MemoryUsage, Any, Any]: yield MemoryUsage() finally: tracemalloc.stop() + + +@pytest.fixture(params=[True, False]) +def test_global_and_local( + request: FixtureRequest, +) -> Generator[Any, Any, Any]: + """ + Setup fixture which runs each test with and without global string cache. + + Usage: @pytest.mark.usefixtures("test_global_and_local") + """ + use_global = request.param + if use_global: + with pl.StringCache(): + # Pre-fill some global items to ensure physical repr isn't 0..n. 
+ pl.Series(["eapioejf", "2m4lmv", "3v3v9dlf"], dtype=pl.Categorical) + yield + else: + yield diff --git a/py-polars/tests/unit/constructors/test_any_value_fallbacks.py b/py-polars/tests/unit/constructors/test_any_value_fallbacks.py index f3584b85d533..490515b89844 100644 --- a/py-polars/tests/unit/constructors/test_any_value_fallbacks.py +++ b/py-polars/tests/unit/constructors/test_any_value_fallbacks.py @@ -398,16 +398,16 @@ def test_fallback_with_dtype_strict_failure_decimal_precision() -> None: PySeries.new_from_any_values_and_dtype("", values, dtype, strict=True) +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_lit_18874() -> None: - with pl.StringCache(): - assert_frame_equal( - pl.DataFrame( - {"a": [1, 2, 3]}, - ).with_columns(b=pl.lit("foo").cast(pl.Categorical)), - pl.DataFrame( - [ - pl.Series("a", [1, 2, 3]), - pl.Series("b", ["foo"] * 3, pl.Categorical), - ] - ), - ) + assert_frame_equal( + pl.DataFrame( + {"a": [1, 2, 3]}, + ).with_columns(b=pl.lit("foo").cast(pl.Categorical)), + pl.DataFrame( + [ + pl.Series("a", [1, 2, 3]), + pl.Series("b", ["foo"] * 3, pl.Categorical), + ] + ), + ) diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index d5416bc47a93..1b37763c0d08 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -72,6 +72,7 @@ def test_categorical_full_outer_join() -> None: assert df["key_right"].cast(pl.String).to_list() == ["bar", "baz", None] +@pytest.mark.usefixtures("test_global_and_local") def test_read_csv_categorical() -> None: f = io.BytesIO() f.write(b"col1,col2,col3,col4,col5,col6\n'foo',2,3,4,5,6\n'bar',8,9,10,11,12") @@ -80,6 +81,7 @@ def test_read_csv_categorical() -> None: assert df["col1"].dtype == pl.Categorical +@pytest.mark.usefixtures("test_global_and_local") def test_cat_to_dummies() -> None: df = pl.DataFrame({"foo": [1, 2, 3, 4], "bar": ["a", "b", "a", "c"]}) df = 
df.with_columns(pl.col("bar").cast(pl.Categorical)) @@ -94,7 +96,7 @@ def test_cat_to_dummies() -> None: } -@StringCache() +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_is_in_list() -> None: # this requires type coercion to cast. # we should not cast within the function as this would be expensive within a @@ -110,7 +112,7 @@ def test_categorical_is_in_list() -> None: } -@StringCache() +@pytest.mark.usefixtures("test_global_and_local") def test_unset_sorted_on_append() -> None: df1 = pl.DataFrame( [ @@ -137,6 +139,7 @@ def test_unset_sorted_on_append() -> None: (pl.Series.eq_missing, pl.Series([True, True, True, False, False, False])), ], ) +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_equality( op: Callable[[pl.Series, pl.Series], pl.Series], expected: pl.Series ) -> None: @@ -272,6 +275,7 @@ def test_categorical_global_ordering_broadcast_lhs( (operator.gt, pl.Series([False, False, False, True, False, False])), ], ) +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_ordering( op: Callable[[pl.Series, pl.Series], pl.Series], expected: pl.Series ) -> None: @@ -289,6 +293,7 @@ def test_categorical_ordering( (operator.gt, pl.Series([None, False, False, False, False, False])), ], ) +@pytest.mark.usefixtures("test_global_and_local") def test_compare_categorical( op: Callable[[pl.Series, pl.Series], pl.Series], expected: pl.Series ) -> None: @@ -311,6 +316,7 @@ def test_compare_categorical( (pl.Series.ne_missing, pl.Series([True, True, False, True, False, True])), ], ) +@pytest.mark.usefixtures("test_global_and_local") def test_compare_categorical_single( op: Callable[[pl.Series, pl.Series], pl.Series], expected: pl.Series ) -> None: @@ -400,6 +406,7 @@ def test_categorical_error_on_local_cmp() -> None: df_cat.filter(pl.col("a_cat") == pl.col("b_cat")) +@pytest.mark.usefixtures("test_global_and_local") def test_cast_null_to_categorical() -> None: assert pl.DataFrame().with_columns( 
pl.lit(None).cast(pl.Categorical).alias("nullable_enum") @@ -454,6 +461,7 @@ def create_lazy(data: dict) -> pl.LazyFrame: # type: ignore[type-arg] assert pl.using_string_cache() is False +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_in_struct_nulls() -> None: s = pl.Series( "job", ["doctor", "waiter", None, None, None, "doctor"], pl.Categorical @@ -466,6 +474,7 @@ def test_categorical_in_struct_nulls() -> None: assert s[2] == {"job": "waiter", "count": 1} +@pytest.mark.usefixtures("test_global_and_local") def test_cast_inner_categorical() -> None: dtype = pl.List(pl.Categorical) out = pl.Series("foo", [["a"], ["a", "b"]]).cast(dtype) @@ -501,6 +510,7 @@ def test_stringcache() -> None: (pl.Categorical("lexical"), ["bar", "baz", "foo"]), ], ) +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_sort_order_by_parameter( dtype: PolarsDataType, outcome: list[str] ) -> None: @@ -557,12 +567,14 @@ def test_err_on_categorical_asof_join_by_arg() -> None: df1.join_asof(df2, on=pl.col("time").set_sorted(), by="cat") +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_list_get_item() -> None: out = pl.Series([["a"]]).cast(pl.List(pl.Categorical)).item() assert isinstance(out, pl.Series) assert out.dtype == pl.Categorical +@pytest.mark.usefixtures("test_global_and_local") def test_nested_categorical_aggregation_7848() -> None: # a double categorical aggregation assert pl.DataFrame( @@ -580,6 +592,7 @@ def test_nested_categorical_aggregation_7848() -> None: } +@pytest.mark.usefixtures("test_global_and_local") def test_nested_categorical_cast() -> None: values = [["x"], ["y"], ["x"]] dtype = pl.List(pl.Categorical) @@ -588,6 +601,7 @@ def test_nested_categorical_cast() -> None: assert s.to_list() == values +@pytest.mark.usefixtures("test_global_and_local") def test_struct_categorical_nesting() -> None: # this triggers a lot of materialization df = pl.DataFrame( @@ -610,7 +624,7 @@ def 
test_categorical_fill_null_existing_category() -> None: assert result.to_dict(as_series=False) == expected -@StringCache() +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_fill_null_stringcache() -> None: df = pl.LazyFrame( {"index": [1, 2, 3], "cat": ["a", "b", None]}, @@ -622,6 +636,7 @@ def test_categorical_fill_null_stringcache() -> None: assert a.dtypes == [pl.Categorical] +@pytest.mark.usefixtures("test_global_and_local") def test_fast_unique_flag_from_arrow() -> None: df = pl.DataFrame( { @@ -633,6 +648,7 @@ def test_fast_unique_flag_from_arrow() -> None: assert pl.from_arrow(filtered).select(pl.col("colB").n_unique()).item() == 4 # type: ignore[union-attr] +@pytest.mark.usefixtures("test_global_and_local") def test_construct_with_null() -> None: # Example from https://github.com/pola-rs/polars/issues/7188 df = pl.from_dicts([{"A": None}, {"A": "foo"}], schema={"A": pl.Categorical}) @@ -663,6 +679,7 @@ def test_list_builder_different_categorical_rev_maps() -> None: } +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_collect_11408() -> None: df = pl.DataFrame( data={"groups": ["a", "b", "c"], "cats": ["a", "b", "c"], "amount": [1, 2, 3]}, @@ -677,6 +694,7 @@ def test_categorical_collect_11408() -> None: } +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_nested_cast_unchecked() -> None: s = pl.Series("cat", [["cat"]]).cast(pl.List(pl.Categorical)) assert pl.Series([s]).to_list() == [[["cat"]]] @@ -751,6 +769,7 @@ def test_categorical_vstack_with_local_different_rev_map() -> None: assert df3.get_column("a").cast(pl.UInt32).to_list() == [0, 1, 2, 3, 4, 5] +@pytest.mark.usefixtures("test_global_and_local") def test_shift_over_13041() -> None: df = pl.DataFrame( { @@ -768,6 +787,7 @@ def test_shift_over_13041() -> None: @pytest.mark.parametrize("context", [pl.StringCache(), contextlib.nullcontext()]) @pytest.mark.parametrize("ordering", ["physical", "lexical"]) 
+@pytest.mark.usefixtures("test_global_and_local") def test_sort_categorical_retain_none( context: contextlib.AbstractContextManager, # type: ignore[type-arg] ordering: Literal["physical", "lexical"], @@ -799,6 +819,7 @@ def test_sort_categorical_retain_none( ] +@pytest.mark.usefixtures("test_global_and_local") def test_cast_from_cat_to_numeric() -> None: cat_series = pl.Series( "cat_series", @@ -811,12 +832,14 @@ def test_cast_from_cat_to_numeric() -> None: assert s.cast(pl.UInt8).sum() == 6 +@pytest.mark.usefixtures("test_global_and_local") def test_cat_preserve_lexical_ordering_on_clear() -> None: s = pl.Series("a", ["a", "b"], dtype=pl.Categorical(ordering="lexical")) s2 = s.clear() assert s.dtype == s2.dtype +@pytest.mark.usefixtures("test_global_and_local") def test_cat_preserve_lexical_ordering_on_concat() -> None: dtype = pl.Categorical(ordering="lexical") @@ -827,6 +850,7 @@ def test_cat_preserve_lexical_ordering_on_concat() -> None: # TODO: Bug see: https://github.com/pola-rs/polars/issues/20440 @pytest.mark.may_fail_auto_streaming +@pytest.mark.usefixtures("test_global_and_local") def test_cat_append_lexical_sorted_flag() -> None: df = pl.DataFrame({"x": [0, 1, 1], "y": ["B", "B", "A"]}).with_columns( pl.col("y").cast(pl.Categorical(ordering="lexical")) @@ -845,6 +869,7 @@ def test_cat_append_lexical_sorted_flag() -> None: assert not (s1.is_sorted()) +@pytest.mark.usefixtures("test_global_and_local") def test_get_cat_categories_multiple_chunks() -> None: df = pl.DataFrame( [ @@ -877,6 +902,7 @@ def test_nested_categorical_concat( pl.concat([a, b]) +@pytest.mark.usefixtures("test_global_and_local") def test_perfect_group_by_19452() -> None: n = 40 df2 = pl.DataFrame( @@ -889,6 +915,7 @@ def test_perfect_group_by_19452() -> None: assert df2.with_columns(a=(pl.col("b")).over(pl.col("a")))["a"].is_sorted() +@pytest.mark.usefixtures("test_global_and_local") def test_perfect_group_by_19950() -> None: dtype = pl.Enum(categories=["a", "b", "c"]) @@ -900,14 
+927,14 @@ def test_perfect_group_by_19950() -> None: } -@StringCache() +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_unique() -> None: s = pl.Series(["a", "b", None], dtype=pl.Categorical) assert s.n_unique() == 3 assert s.unique().sort().to_list() == [None, "a", "b"] -@StringCache() +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_unique_20539() -> None: df = pl.DataFrame({"number": [1, 1, 2, 2, 3], "letter": ["a", "b", "b", "c", "c"]}) @@ -927,13 +954,10 @@ def test_categorical_unique_20539() -> None: } -@StringCache() @pytest.mark.may_fail_auto_streaming +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_prefill() -> None: # https://github.com/pola-rs/polars/pull/20547#issuecomment-2569473443 - # prefill cache - pl.Series(["aaa", "bbb", "ccc"], dtype=pl.Categorical) # pre-fill cache - # test_compare_categorical_single assert (pl.Series(["a"], dtype=pl.Categorical) < "a").to_list() == [False] diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 54b97fdef9fc..53c401ec110e 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -65,6 +65,7 @@ def test_dtype() -> None: ] +@pytest.mark.usefixtures("test_global_and_local") def test_categorical() -> None: # https://github.com/pola-rs/polars/issues/2038 df = pl.DataFrame( diff --git a/py-polars/tests/unit/functions/as_datatype/test_concat_list.py b/py-polars/tests/unit/functions/as_datatype/test_concat_list.py index b0f70edbf51d..3ad6173ec176 100644 --- a/py-polars/tests/unit/functions/as_datatype/test_concat_list.py +++ b/py-polars/tests/unit/functions/as_datatype/test_concat_list.py @@ -91,6 +91,7 @@ def test_list_concat_supertype() -> None: ].to_list() == [[1, 10000], [2, 20000]] +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_list_concat_4762() -> None: df = pl.DataFrame({"x": "a"}) expected = {"x": [["a", "a"]]} 
diff --git a/py-polars/tests/unit/interchange/test_from_dataframe.py b/py-polars/tests/unit/interchange/test_from_dataframe.py index 35fcc595451a..c9864d481d90 100644 --- a/py-polars/tests/unit/interchange/test_from_dataframe.py +++ b/py-polars/tests/unit/interchange/test_from_dataframe.py @@ -334,6 +334,7 @@ def test_string_column_to_series_no_offsets() -> None: _string_column_to_series(col, allow_copy=True) +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_column_to_series_non_dictionary() -> None: s = pl.Series(["a", "b", None, "a"], dtype=pl.Categorical) diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py index 46931906ab3c..04361adf22d1 100644 --- a/py-polars/tests/unit/io/test_delta.py +++ b/py-polars/tests/unit/io/test_delta.py @@ -474,6 +474,7 @@ def test_unsupported_dtypes(tmp_path: Path) -> None: reason="upstream bug in delta-rs causing categorical to be written as categorical in parquet" ) @pytest.mark.write_disk +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_becomes_string(tmp_path: Path) -> None: df = pl.DataFrame({"a": ["A", "B", "A"]}, schema={"a": pl.Categorical}) df.write_delta(tmp_path) diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index 78ffb6b1379b..5a82a5304a9d 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -66,6 +66,7 @@ def test_row_index_len_16543(foods_parquet_path: Path) -> None: @pytest.mark.write_disk +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_parquet_statistics(tmp_path: Path) -> None: tmp_path.mkdir(exist_ok=True) @@ -281,6 +282,7 @@ def test_parquet_statistics(monkeypatch: Any, capfd: Any, tmp_path: Path) -> Non @pytest.mark.write_disk +@pytest.mark.usefixtures("test_global_and_local") def test_categorical(tmp_path: Path) -> None: tmp_path.mkdir(exist_ok=True) diff --git 
a/py-polars/tests/unit/io/test_other.py b/py-polars/tests/unit/io/test_other.py index 4c08250838d8..20e71c36e5a1 100644 --- a/py-polars/tests/unit/io/test_other.py +++ b/py-polars/tests/unit/io/test_other.py @@ -84,6 +84,7 @@ def test_copy() -> None: assert_series_equal(copy.deepcopy(a), a) +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_round_trip() -> None: df = pl.DataFrame({"ints": [1, 2, 3], "cat": ["a", "b", "c"]}) df = df.with_columns(pl.col("cat").cast(pl.Categorical)) diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index c70f2cfb2031..1fcbc193e933 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -2433,6 +2433,7 @@ def test_dict_masked( ) +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_sliced_20017() -> None: f = io.BytesIO() df = ( diff --git a/py-polars/tests/unit/operations/namespaces/test_categorical.py b/py-polars/tests/unit/operations/namespaces/test_categorical.py index 9f60ff4f7be9..bc596c71794f 100644 --- a/py-polars/tests/unit/operations/namespaces/test_categorical.py +++ b/py-polars/tests/unit/operations/namespaces/test_categorical.py @@ -1,36 +1,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING - import pytest import polars as pl from polars.testing import assert_frame_equal, assert_series_equal -if TYPE_CHECKING: - from collections.abc import Generator - from typing import Any - - FixtureRequest = Any - - -@pytest.fixture(params=[True, False]) -def test_global_and_local( - request: FixtureRequest, -) -> Generator[Any, Any, Any]: - """Setup fixture which runs each test with and without global string cache.""" - use_global = request.param - if use_global: - with pl.StringCache(): - # Pre-fill some global items to ensure physical repr isn't 0..n. 
- pl.Series(["a", "b", "c"], dtype=pl.Categorical) - yield - else: - yield - # @TODO: Bug, see https://github.com/pola-rs/polars/issues/20440 @pytest.mark.may_fail_auto_streaming +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_lexical_sort() -> None: df = pl.DataFrame( {"cats": ["z", "z", "k", "a", "b"], "vals": [3, 1, 2, 2, 3]} @@ -66,50 +44,51 @@ def test_categorical_lexical_sort() -> None: ] +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_lexical_ordering_after_concat() -> None: - with pl.StringCache(): - ldf1 = ( - pl.DataFrame([pl.Series("key1", [8, 5]), pl.Series("key2", ["fox", "baz"])]) - .lazy() - .with_columns(pl.col("key2").cast(pl.Categorical("lexical"))) - ) - ldf2 = ( - pl.DataFrame( - [pl.Series("key1", [6, 8, 6]), pl.Series("key2", ["fox", "foo", "bar"])] - ) - .lazy() - .with_columns(pl.col("key2").cast(pl.Categorical("lexical"))) + ldf1 = ( + pl.DataFrame([pl.Series("key1", [8, 5]), pl.Series("key2", ["fox", "baz"])]) + .lazy() + .with_columns(pl.col("key2").cast(pl.Categorical("lexical"))) + ) + ldf2 = ( + pl.DataFrame( + [pl.Series("key1", [6, 8, 6]), pl.Series("key2", ["fox", "foo", "bar"])] ) - df = pl.concat([ldf1, ldf2]).select(pl.col("key2")).collect() + .lazy() + .with_columns(pl.col("key2").cast(pl.Categorical("lexical"))) + ) + df = pl.concat([ldf1, ldf2]).select(pl.col("key2")).collect() - assert df.sort("key2").to_dict(as_series=False) == { - "key2": ["bar", "baz", "foo", "fox", "fox"] - } + assert df.sort("key2").to_dict(as_series=False) == { + "key2": ["bar", "baz", "foo", "fox", "fox"] + } @pytest.mark.may_fail_auto_streaming +@pytest.mark.usefixtures("test_global_and_local") def test_sort_categoricals_6014_internal() -> None: - with pl.StringCache(): - # create basic categorical - df = pl.DataFrame({"key": ["bbb", "aaa", "ccc"]}).with_columns( - pl.col("key").cast(pl.Categorical) - ) + # create basic categorical + df = pl.DataFrame({"key": ["bbb", "aaa", "ccc"]}).with_columns( + 
pl.col("key").cast(pl.Categorical) + ) out = df.sort("key") assert out.to_dict(as_series=False) == {"key": ["bbb", "aaa", "ccc"]} +@pytest.mark.usefixtures("test_global_and_local") def test_sort_categoricals_6014_lexical() -> None: - with pl.StringCache(): - # create lexically-ordered categorical - df = pl.DataFrame({"key": ["bbb", "aaa", "ccc"]}).with_columns( - pl.col("key").cast(pl.Categorical("lexical")) - ) + # create lexically-ordered categorical + df = pl.DataFrame({"key": ["bbb", "aaa", "ccc"]}).with_columns( + pl.col("key").cast(pl.Categorical("lexical")) + ) out = df.sort("key") assert out.to_dict(as_series=False) == {"key": ["aaa", "bbb", "ccc"]} +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_get_categories() -> None: assert pl.Series( "cats", ["foo", "bar", "foo", "foo", "ham"], dtype=pl.Categorical @@ -166,6 +145,7 @@ def test_cat_is_local() -> None: assert not s2.cat.is_local() +@pytest.mark.usefixtures("test_global_and_local") def test_cat_uses_lexical_ordering() -> None: s = pl.Series(["a", "b", None, "b"]).cast(pl.Categorical) assert s.cat.uses_lexical_ordering() is False diff --git a/py-polars/tests/unit/operations/test_filter.py b/py-polars/tests/unit/operations/test_filter.py index d49c99f5999a..08f5b13bc0ab 100644 --- a/py-polars/tests/unit/operations/test_filter.py +++ b/py-polars/tests/unit/operations/test_filter.py @@ -153,6 +153,7 @@ def test_binary_simplification_5971() -> None: ] +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_string_comparison_6283() -> None: scores = pl.DataFrame( { diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index 924781a10bef..39bb2df449f5 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -925,6 +925,7 @@ def test_group_by_multiple_null_cols_15623() -> None: @pytest.mark.release +@pytest.mark.usefixtures("test_global_and_local") def 
test_categorical_vs_str_group_by() -> None: # this triggers the perfect hash table s = pl.Series("a", np.random.randint(0, 50, 100)) diff --git a/py-polars/tests/unit/operations/unique/test_unique.py b/py-polars/tests/unit/operations/unique/test_unique.py index ff4a0cd10f32..d9fbda1f843b 100644 --- a/py-polars/tests/unit/operations/unique/test_unique.py +++ b/py-polars/tests/unit/operations/unique/test_unique.py @@ -144,15 +144,20 @@ def test_unique_null() -> None: [ ([], []), (["a", "b", "b", "c"], ["a", "b", "c"]), - (["a", "b", "b", None], ["a", "b", None]), + ([None, "a", "b", "b"], [None, "a", "b"]), ], ) +@pytest.mark.usefixtures("test_global_and_local") def test_unique_categorical(input: list[str | None], output: list[str | None]) -> None: s = pl.Series(input, dtype=pl.Categorical) result = s.unique(maintain_order=True) expected = pl.Series(output, dtype=pl.Categorical) assert_series_equal(result, expected) + result = s.unique(maintain_order=False).sort() + expected = pl.Series(output, dtype=pl.Categorical) + assert_series_equal(result, expected) + def test_unique_categorical_global() -> None: with pl.StringCache(): @@ -206,6 +211,7 @@ def test_unique_with_bad_subset( df.unique(subset=subset) +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_unique_19409() -> None: df = pl.DataFrame({"x": [str(n % 50) for n in range(127)]}).cast(pl.Categorical) uniq = df.unique() diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index 68b2528d5da6..bd8baa002cf9 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -356,6 +356,7 @@ def test_date_agg() -> None: (pl.Series(["c", "b", "a"], dtype=pl.Enum(["c", "b", "a", "d"])), "c", "a"), ], ) +@pytest.mark.usefixtures("test_global_and_local") def test_categorical_agg(s: pl.Series, min: str | None, max: str | None) -> None: assert s.min() == min assert s.max() == max From 
da0b58936ee544c92859ea77df39c6a21370dbe9 Mon Sep 17 00:00:00 2001 From: Marshall Date: Fri, 3 Jan 2025 14:10:11 -0500 Subject: [PATCH 18/20] fix: Output index type instead of u32 for `sum_horizontal` with boolean inputs (#20531) --- .../polars-ops/src/series/ops/horizontal.rs | 6 +-- .../src/dsl/function_expr/schema.rs | 7 ++- .../operations/aggregation/test_horizontal.py | 44 ++++++++++++++++--- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/crates/polars-ops/src/series/ops/horizontal.rs b/crates/polars-ops/src/series/ops/horizontal.rs index 025779e77349..6a6960480c47 100644 --- a/crates/polars-ops/src/series/ops/horizontal.rs +++ b/crates/polars-ops/src/series/ops/horizontal.rs @@ -221,9 +221,9 @@ pub fn sum_horizontal( // If we have any null columns and null strategy is not `Ignore`, we can return immediately. if !ignore_nulls && non_null_cols.len() < columns.len() { - // We must first determine the correct return dtype. + // We must determine the correct return dtype. let return_dtype = match dtypes_to_supertype(non_null_cols.iter().map(|c| c.dtype()))? { - DataType::Boolean => DataType::UInt32, + DataType::Boolean => IDX_DTYPE, dt => dt, }; return Ok(Some(Column::full_null( @@ -244,7 +244,7 @@ pub fn sum_horizontal( }, 1 => Ok(Some( apply_null_strategy(if non_null_cols[0].dtype() == &DataType::Boolean { - non_null_cols[0].cast(&DataType::UInt32)? + non_null_cols[0].cast(&IDX_DTYPE)? } else { non_null_cols[0].clone() })? diff --git a/crates/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs index d45f75c01e9d..beaacac49942 100644 --- a/crates/polars-plan/src/dsl/function_expr/schema.rs +++ b/crates/polars-plan/src/dsl/function_expr/schema.rs @@ -331,11 +331,10 @@ impl FunctionExpr { MinHorizontal => mapper.map_to_supertype(), SumHorizontal { .. } => { mapper.map_to_supertype().map(|mut f| { - match f.dtype { - // Booleans sum to UInt32. 
- DataType::Boolean => { f.dtype = DataType::UInt32; f}, - _ => f, + if f.dtype == DataType::Boolean { + f.dtype = IDX_DTYPE; } + f }) }, MeanHorizontal { .. } => { diff --git a/py-polars/tests/unit/operations/aggregation/test_horizontal.py b/py-polars/tests/unit/operations/aggregation/test_horizontal.py index 3959e15e22ed..bc557a231d75 100644 --- a/py-polars/tests/unit/operations/aggregation/test_horizontal.py +++ b/py-polars/tests/unit/operations/aggregation/test_horizontal.py @@ -319,6 +319,39 @@ def test_sum_single_col() -> None: ) +@pytest.mark.parametrize("ignore_nulls", [False, True]) +def test_sum_correct_supertype(ignore_nulls: bool) -> None: + values = [1, 2] if ignore_nulls else [None, None] # type: ignore[list-item] + lf = pl.LazyFrame( + { + "null": [None, None], + "int": pl.Series(values, dtype=pl.Int32), + "float": pl.Series(values, dtype=pl.Float32), + } + ) + + # null + int32 should produce int32 + out = lf.select(pl.sum_horizontal("null", "int", ignore_nulls=ignore_nulls)) + expected = pl.LazyFrame({"null": pl.Series(values, dtype=pl.Int32)}) + assert_frame_equal(out.collect(), expected.collect()) + assert out.collect_schema() == expected.collect_schema() + + # null + float32 should produce float32 + out = lf.select(pl.sum_horizontal("null", "float", ignore_nulls=ignore_nulls)) + expected = pl.LazyFrame({"null": pl.Series(values, dtype=pl.Float32)}) + assert_frame_equal(out.collect(), expected.collect()) + assert out.collect_schema() == expected.collect_schema() + + # null + int32 + float32 should produce float64 + values = [2, 4] if ignore_nulls else [None, None] # type: ignore[list-item] + out = lf.select( + pl.sum_horizontal("null", "int", "float", ignore_nulls=ignore_nulls) + ) + expected = pl.LazyFrame({"null": pl.Series(values, dtype=pl.Float64)}) + assert_frame_equal(out.collect(), expected.collect()) + assert out.collect_schema() == expected.collect_schema() + + def test_cum_sum_horizontal() -> None: df = pl.DataFrame( { @@ -541,8 +574,8 
@@ def test_horizontal_sum_boolean_with_null() -> None: expected_schema = pl.Schema( { - "null_first": pl.UInt32, - "bool_first": pl.UInt32, + "null_first": pl.get_index_type(), + "bool_first": pl.get_index_type(), } ) @@ -550,8 +583,8 @@ def test_horizontal_sum_boolean_with_null() -> None: expected_df = pl.DataFrame( { - "null_first": pl.Series([1, 0], dtype=pl.UInt32), - "bool_first": pl.Series([1, 0], dtype=pl.UInt32), + "null_first": pl.Series([1, 0], dtype=pl.get_index_type()), + "bool_first": pl.Series([1, 0], dtype=pl.get_index_type()), } ) @@ -563,7 +596,7 @@ def test_horizontal_sum_boolean_with_null() -> None: ("dtype_in", "dtype_out"), [ (pl.Null, pl.Null), - (pl.Boolean, pl.UInt32), + (pl.Boolean, pl.get_index_type()), (pl.UInt8, pl.UInt8), (pl.Float32, pl.Float32), (pl.Float64, pl.Float64), @@ -589,6 +622,7 @@ def test_horizontal_sum_with_null_col_ignore_strategy( values = [None, None, None] # type: ignore[list-item] expected = pl.LazyFrame(pl.Series("null", values, dtype=dtype_out)) assert_frame_equal(result, expected) + assert result.collect_schema() == expected.collect_schema() @pytest.mark.parametrize("ignore_nulls", [True, False]) From 841c387d99d7024037556c4ef79d96bf2caac397 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 3 Jan 2025 20:18:35 +0100 Subject: [PATCH 19/20] Python Polars 1.19.0 (#20552) --- Cargo.lock | 2 +- py-polars/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0753cb32d661..4b52ff4fe016 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3671,7 +3671,7 @@ dependencies = [ [[package]] name = "py-polars" -version = "1.18.0" +version = "1.19.0" dependencies = [ "jemallocator", "libc", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 7628df05b8c6..f09fe3952f6f 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "1.18.0" +version = "1.19.0" edition = "2021" [lib] From 
58d69d6c39a6d83319855c57256be378e5d854e9 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 3 Jan 2025 20:45:58 +0100 Subject: [PATCH 20/20] ci: Improve bin size info (#20551) --- .github/workflows/benchmark.yml | 46 ++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 49ef91de9be4..b4b7ad7691b8 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -78,22 +78,26 @@ jobs: WHEEL_SIZE=$(ls -l py-polars/polars/polars*.so | awk '{ print $5 }') echo "WHEEL_SIZE=$WHEEL_SIZE" >> $GITHUB_ENV + - name: Wheel size txt + if: github.ref_name == 'main' + run: | + echo "$GITHUB_RUN_ID $WHEEL_SIZE" > wheel_sizes.txt + - name: Upload wheel sizes artifact (main only) if: github.ref_name == 'main' - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: wheel-size - path: | - echo "$GITHUB_RUN_ID $WHEEL_SIZE" > wheel_sizes.txt - wheel_sizes.txt + path: wheel_sizes.txt - name: Download main wheel size - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: wheel-size continue-on-error: true - name: Extract previous wheel size + if: github.ref_name != 'main' id: load_previous_size run: | if [[ -f wheel_sizes.txt ]]; then @@ -102,9 +106,11 @@ jobs: else echo "PREVIOUS_WHEEL_SIZE=Unknown" >> $GITHUB_ENV fi + continue-on-error: true - name: Comment wheel size uses: actions/github-script@v7 + if: github.ref_name != 'main' with: script: | const previousSize = process.env.PREVIOUS_WHEEL_SIZE || 'Unknown'; @@ -114,7 +120,7 @@ jobs: const previousSizeMB = previousSize !== 'Unknown' ? (previousSize / 1024 / 1024).toFixed(4) : 'Unknown'; const currentSizeMB = currentSize !== 'Unknown' ? 
(currentSize / 1024 / 1024).toFixed(4) : 'Unknown'; - let commentBody = `The uncompressed binary size was **${previousSizeMB} MB**.\nThe uncompressed binary size after this PR is **${currentSizeMB} MB**.`; + let commentBody = `The previous uncompressed lib size was **${previousSizeMB} MB**.\nThe current uncompressed lib size after this PR is **${currentSizeMB} MB**.`; // Calculate percentage increase if both sizes are available if (previousSize !== 'Unknown' && currentSize !== '') { @@ -122,13 +128,35 @@ jobs: commentBody += `\nThis represents a **${increase.toFixed(2)}% increase** in size.`; } - github.rest.issues.createComment({ - issue_number: context.issue.number, + const { data: comments } = await github.rest.issues.listComments({ owner: context.repo.owner, repo: context.repo.repo, - body: commentBody + issue_number: context.issue.number, }); + // Look for an existing comment + const existingComment = comments.find(comment => + comment.body.includes('The previous uncompressed lib size was') + ); + + if (existingComment) { + // Update the existing comment + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existingComment.id, + body: commentBody, + }); + } else { + // Create a new comment + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: commentBody, + }); + } + continue-on-error: true - name: Run benchmark tests uses: CodSpeedHQ/action@v3