From ed37acea3da55df7ef33c33aaace4f1f34cf8239 Mon Sep 17 00:00:00 2001
From: brifitz <95299320+brifitz@users.noreply.github.com>
Date: Thu, 2 Jan 2025 09:49:24 +0000
Subject: [PATCH 01/20] fix(rust): `slice_pushdown` optimization leading to
 incorrectly sliced row index on parquet file (#20508)

---
 .../polars-io/src/parquet/read/read_impl.rs   | 29 +++++++++++++++----
 py-polars/tests/unit/io/test_lazy_parquet.py  | 28 ++++++++++++++++++
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs
index eb4448eebeb1..9f5281280c51 100644
--- a/crates/polars-io/src/parquet/read/read_impl.rs
+++ b/crates/polars-io/src/parquet/read/read_impl.rs
@@ -671,7 +671,7 @@ fn rg_to_dfs_par_over_rg(
     store: &mmap::ColumnStore,
     row_group_start: usize,
     row_group_end: usize,
-    previous_row_count: &mut IdxSize,
+    rows_read: &mut IdxSize,
     slice: (usize, usize),
     file_metadata: &FileMetadata,
     schema: &ArrowSchemaRef,
@@ -689,15 +689,34 @@ fn rg_to_dfs_par_over_rg(
         .sum();
     let slice_end = slice.0 + slice.1;
 
+    // rows_scanned is the number of rows that have been scanned so far when checking for overlap with the slice.
+    // rows_read is the number of rows found to overlap with the slice, and thus the number of rows that will be
+    // read into a dataframe.
+    let mut rows_scanned: IdxSize;
+
+    if row_group_start > 0 {
+        // In the case of async reads, we need to account for the fact that row_group_start may be greater than
+        // zero due to earlier processing.
+        // For details, see: https://github.com/pola-rs/polars/pull/20508#discussion_r1900165649
+        rows_scanned = (0..row_group_start)
+            .map(|i| file_metadata.row_groups[i].num_rows() as IdxSize)
+            .sum();
+    } else {
+        rows_scanned = 0;
+    }
+
     for i in row_group_start..row_group_end {
-        let row_count_start = *previous_row_count;
+        let row_count_start = rows_scanned;
         let rg_md = &file_metadata.row_groups[i];
+        let n_rows_this_file = rg_md.num_rows();
         let rg_slice =
-            split_slice_at_file(&mut n_rows_processed, rg_md.num_rows(), slice.0, slice_end);
-        *previous_row_count = previous_row_count
-            .checked_add(rg_slice.1 as IdxSize)
+            split_slice_at_file(&mut n_rows_processed, n_rows_this_file, slice.0, slice_end);
+        rows_scanned = rows_scanned
+            .checked_add(n_rows_this_file as IdxSize)
             .ok_or(ROW_COUNT_OVERFLOW_ERR)?;
 
+        *rows_read += rg_slice.1 as IdxSize;
+
         if rg_slice.1 == 0 {
             continue;
         }
diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py
index 05589332cc99..78ffb6b1379b 100644
--- a/py-polars/tests/unit/io/test_lazy_parquet.py
+++ b/py-polars/tests/unit/io/test_lazy_parquet.py
@@ -564,6 +564,34 @@ def trim_to_metadata(path: str | Path) -> None:
         )
 
 
+@pytest.mark.write_disk
+def test_predicate_slice_pushdown_row_index_20485(tmp_path: Path) -> None:
+    tmp_path.mkdir(exist_ok=True)
+
+    file_path = tmp_path / "slice_pushdown.parquet"
+    row_group_size = 100000
+    num_row_groups = 3
+
+    df = pl.select(ref=pl.int_range(num_row_groups * row_group_size))
+    df.write_parquet(file_path, row_group_size=row_group_size)
+
+    # Use a slice that starts near the end of one row group and extends into the next
+    # to test handling of slices that span multiple row groups.
+    slice_start = 199995  # 5 rows before the 200_000 row-group boundary
+    slice_len = 10  # slice covers rows 199_995 through 200_004
+    ldf = pl.scan_parquet(file_path)
+    sliced_df = ldf.with_row_index().slice(slice_start, slice_len).collect()
+    sliced_df_no_pushdown = (
+        ldf.with_row_index().slice(slice_start, slice_len).collect(slice_pushdown=False)
+    )
+
+    expected_index = list(range(slice_start, slice_start + slice_len))
+    actual_index = list(sliced_df["index"])
+    assert actual_index == expected_index
+
+    assert_frame_equal(sliced_df, sliced_df_no_pushdown)
+
+
 @pytest.mark.write_disk
 @pytest.mark.parametrize("streaming", [True, False])
 def test_parquet_row_groups_shift_bug_18739(tmp_path: Path, streaming: bool) -> None:

From c5790a755307c3e71ce2a781c1eda0e2c830e3b0 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 2 Jan 2025 10:53:40 +0100
Subject: [PATCH 02/20] chore(python): Bump the python group in /py-polars with
 3 updates (#20521)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Stijn de Gooijer <stijndegooijer@gmail.com>
---
 py-polars/requirements-dev.txt                        | 2 +-
 py-polars/requirements-lint.txt                       | 4 ++--
 py-polars/tests/unit/io/test_hive.py                  | 2 +-
 py-polars/tests/unit/operations/unique/test_unique.py | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt
index 7fb358bced22..89c081de50c3 100644
--- a/py-polars/requirements-dev.txt
+++ b/py-polars/requirements-dev.txt
@@ -60,7 +60,7 @@ hypothesis
 # -------
 
 pytest==8.3.2
-pytest-codspeed==3.0.0
+pytest-codspeed==3.1.0
 pytest-cov==6.0.0
 pytest-xdist==3.6.1
 
diff --git a/py-polars/requirements-lint.txt b/py-polars/requirements-lint.txt
index df703691f12a..5c6034674239 100644
--- a/py-polars/requirements-lint.txt
+++ b/py-polars/requirements-lint.txt
@@ -1,3 +1,3 @@
-mypy[faster-cache]==1.13.0
+mypy[faster-cache]==1.14.1
 ruff==0.8.1
-typos==1.28.1
+typos==1.29.0
diff --git a/py-polars/tests/unit/io/test_hive.py b/py-polars/tests/unit/io/test_hive.py
index 4a6384fb5f56..2bb0c6e7d37c 100644
--- a/py-polars/tests/unit/io/test_hive.py
+++ b/py-polars/tests/unit/io/test_hive.py
@@ -200,7 +200,7 @@ def test_hive_partitioned_projection_pushdown(
         q = pl.scan_parquet(
             root / "**/*.parquet",
             hive_partitioning=True,
-            parallel=parallel,  # type: ignore[arg-type]
+            parallel=parallel,
         )
 
         expected = q.collect().select("category")
diff --git a/py-polars/tests/unit/operations/unique/test_unique.py b/py-polars/tests/unit/operations/unique/test_unique.py
index 406a70b6e71f..595ae1db59eb 100644
--- a/py-polars/tests/unit/operations/unique/test_unique.py
+++ b/py-polars/tests/unit/operations/unique/test_unique.py
@@ -43,7 +43,7 @@ def test_unique_predicate_pd() -> None:
     for maintain_order in (True, False):
         for keep in ("first", "last", "any", "none"):
             q = (
-                lf.unique("x", maintain_order=maintain_order, keep=keep)  # type: ignore[arg-type]
+                lf.unique("x", maintain_order=maintain_order, keep=keep)
                 .filter(pl.col("x") == "abc")
                 .filter(pl.col("z"))
             )

From 90827f7f7683c5c527c09638142158d569127cfb Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 2 Jan 2025 10:53:51 +0100
Subject: [PATCH 03/20] ci: Bump crate-ci/typos from 1.28.1 to 1.29.0 in the ci
 group (#20520)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/lint-global.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lint-global.yml b/.github/workflows/lint-global.yml
index ff1487ec963e..e0bf119ce624 100644
--- a/.github/workflows/lint-global.yml
+++ b/.github/workflows/lint-global.yml
@@ -15,4 +15,4 @@ jobs:
       - name: Lint Markdown and TOML
         uses: dprint/check@v2.2
       - name: Spell Check with Typos
-        uses: crate-ci/typos@v1.28.1
+        uses: crate-ci/typos@v1.29.0

From 70473d0eb47e2cb9376dd95bec23d8563f0f3cc8 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 2 Jan 2025 10:54:03 +0100
Subject: [PATCH 04/20] chore(python): Bump markdown-exec[ansi] from 1.9.3 to
 1.10.0 in /docs in the documentation group (#20518)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/source/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/requirements.txt b/docs/source/requirements.txt
index 25e87aeb64ec..2c8d162a5689 100644
--- a/docs/source/requirements.txt
+++ b/docs/source/requirements.txt
@@ -13,5 +13,5 @@ mkdocs-material==9.5.27
 mkdocs-macros-plugin==1.3.7
 mkdocs-redirects==1.2.1
 material-plausible-plugin==0.2.0
-markdown-exec[ansi]==1.9.3
+markdown-exec[ansi]==1.10.0
 pygithub==2.5.0

From b0ac62bff3f2d38df6e8e4e872c7b8c86eaac6c7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 2 Jan 2025 10:54:38 +0100
Subject: [PATCH 05/20] build: Bump the rust group with 3 updates (#20519)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Stijn de Gooijer <stijndegooijer@gmail.com>
---
 Cargo.lock                     | 225 ++++++++++++++++++---------------
 Cargo.toml                     |   2 +-
 crates/polars-utils/Cargo.toml |   2 +-
 3 files changed, 126 insertions(+), 103 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index ab877a36cc56..0753cb32d661 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -95,9 +95,9 @@ checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
 
 [[package]]
 name = "anyhow"
-version = "1.0.94"
+version = "1.0.95"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7"
+checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
 
 [[package]]
 name = "apache-avro"
@@ -206,7 +206,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -217,7 +217,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -265,9 +265,9 @@ dependencies = [
 
 [[package]]
 name = "aws-config"
-version = "1.5.11"
+version = "1.5.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5d1c2c88936a73c699225d0bc00684a534166b0cebc2659c3cdf08de8edc64c"
+checksum = "649316840239f4e58df0b7f620c428f5fababbbca2d504488c641534050bd141"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -307,9 +307,9 @@ dependencies = [
 
 [[package]]
 name = "aws-runtime"
-version = "1.5.1"
+version = "1.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "300a12520b4e6d08b73f77680f12c16e8ae43250d55100e0b2be46d78da16a48"
+checksum = "44f6f1124d6e19ab6daf7f2e615644305dc6cb2d706892a8a8c0b98db35de020"
 dependencies = [
  "aws-credential-types",
  "aws-sigv4",
@@ -333,9 +333,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-s3"
-version = "1.66.0"
+version = "1.67.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "154488d16ab0d627d15ab2832b57e68a16684c8c902f14cb8a75ec933fc94852"
+checksum = "bbc644164269a1e38ce7f2f7373629d3fb3d310c0e3feb5573a29744288b24d3"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -367,9 +367,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sso"
-version = "1.51.0"
+version = "1.52.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74995133da38f109a0eb8e8c886f9e80c713b6e9f2e6e5a6a1ba4450ce2ffc46"
+checksum = "cb25f7129c74d36afe33405af4517524df8f74b635af8c2c8e91c1552b8397b2"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -389,9 +389,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-ssooidc"
-version = "1.52.0"
+version = "1.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7062a779685cbf3b2401eb36151e2c6589fd5f3569b8a6bc2d199e5aaa1d059"
+checksum = "d03a3d5ef14851625eafd89660a751776f938bf32f309308b20dcca41c44b568"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -411,9 +411,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sts"
-version = "1.52.0"
+version = "1.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "299dae7b1dc0ee50434453fa5a229dc4b22bd3ee50409ff16becf1f7346e0193"
+checksum = "cf3a9f073ae3a53b54421503063dfb87ff1ea83b876f567d92e8b8d9942ba91b"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -463,9 +463,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-async"
-version = "1.2.2"
+version = "1.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8aa8ff1492fd9fb99ae28e8467af0dbbb7c31512b16fabf1a0f10d7bb6ef78bb"
+checksum = "427cb637d15d63d6f9aae26358e1c9a9c09d5aa490d64b09354c8217cfef0f28"
 dependencies = [
  "futures-util",
  "pin-project-lite",
@@ -546,9 +546,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.7.5"
+version = "1.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "431a10d0e07e09091284ef04453dae4069283aa108d209974d67e77ae1caa658"
+checksum = "a05dd41a70fc74051758ee75b5c4db2c0ca070ed9229c3df50e9475cda1cb985"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -590,9 +590,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-types"
-version = "1.2.10"
+version = "1.2.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ecbf4d5dfb169812e2b240a4350f15ad3c6b03a54074e5712818801615f2dc5"
+checksum = "38ddc9bd6c28aeb303477170ddd183760a956a03e083b3902a990238a7e3792d"
 dependencies = [
  "base64-simd",
  "bytes",
@@ -778,22 +778,22 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
 
 [[package]]
 name = "bytemuck"
-version = "1.20.0"
+version = "1.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a"
+checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3"
 dependencies = [
  "bytemuck_derive",
 ]
 
 [[package]]
 name = "bytemuck_derive"
-version = "1.8.0"
+version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec"
+checksum = "3fa76293b4f7bb636ab88fd78228235b5248b4d05cc589aed610f954af5d7c7a"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -823,11 +823,11 @@ dependencies = [
 
 [[package]]
 name = "casey"
-version = "0.4.0"
+version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "614586263949597dcc18675da12ef9b429135e13628d92eb8b8c6fa50ca5656b"
+checksum = "8e779867f62d81627d1438e0d3fb6ed7d7c9d64293ca6d87a1e88781b94ece1c"
 dependencies = [
- "syn 1.0.109",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -847,9 +847,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.5"
+version = "1.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c31a0499c1dc64f458ad13872de75c0eb7e3fdb0e67964610c914b034fc5956e"
+checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333"
 dependencies = [
  "jobserver",
  "libc",
@@ -986,9 +986,9 @@ dependencies = [
 
 [[package]]
 name = "compact_str"
-version = "0.8.0"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644"
+checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32"
 dependencies = [
  "castaway",
  "cfg-if",
@@ -1271,7 +1271,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -1336,7 +1336,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -1533,7 +1533,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -1607,9 +1607,9 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
 
 [[package]]
 name = "glob"
-version = "0.3.1"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
+checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
 
 [[package]]
 name = "group"
@@ -1883,9 +1883,9 @@ dependencies = [
 
 [[package]]
 name = "hyper-rustls"
-version = "0.27.4"
+version = "0.27.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6884a48c6826ec44f524c7456b163cebe9e55a18d7b5e307cb4f100371cc767"
+checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2"
 dependencies = [
  "futures-util",
  "http 1.2.0",
@@ -2072,7 +2072,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -2688,24 +2688,25 @@ dependencies = [
 
 [[package]]
 name = "object"
-version = "0.36.5"
+version = "0.36.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e"
+checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
 dependencies = [
  "memchr",
 ]
 
 [[package]]
 name = "object_store"
-version = "0.11.1"
+version = "0.11.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6eb4c22c6154a1e759d7099f9ffad7cc5ef8245f9efbab4a41b92623079c82f3"
+checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
  "bytes",
  "chrono",
  "futures",
+ "httparse",
  "humantime",
  "hyper 1.5.2",
  "itertools 0.13.0",
@@ -2761,7 +2762,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -3088,7 +3089,7 @@ dependencies = [
  "serde",
  "serde_json",
  "strum_macros",
- "thiserror 2.0.8",
+ "thiserror 2.0.9",
  "version_check",
  "xxhash-rust",
 ]
@@ -3130,7 +3131,7 @@ dependencies = [
  "polars-arrow-format",
  "regex",
  "simdutf8",
- "thiserror 2.0.8",
+ "thiserror 2.0.9",
 ]
 
 [[package]]
@@ -3457,7 +3458,7 @@ dependencies = [
  "pyo3",
  "recursive",
  "serde_json",
- "thiserror 2.0.8",
+ "thiserror 2.0.9",
  "version_check",
 ]
 
@@ -3725,7 +3726,7 @@ dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -3737,7 +3738,7 @@ dependencies = [
  "proc-macro2",
  "pyo3-build-config",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -3748,9 +3749,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40"
 
 [[package]]
 name = "quick-xml"
-version = "0.36.2"
+version = "0.37.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe"
+checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003"
 dependencies = [
  "memchr",
  "serde",
@@ -3780,7 +3781,7 @@ dependencies = [
  "rustc-hash",
  "rustls 0.23.20",
  "socket2",
- "thiserror 2.0.8",
+ "thiserror 2.0.9",
  "tokio",
  "tracing",
 ]
@@ -3799,7 +3800,7 @@ dependencies = [
  "rustls 0.23.20",
  "rustls-pki-types",
  "slab",
- "thiserror 2.0.8",
+ "thiserror 2.0.9",
  "tinyvec",
  "tracing",
  "web-time",
@@ -3821,9 +3822,9 @@ dependencies = [
 
 [[package]]
 name = "quote"
-version = "1.0.37"
+version = "1.0.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
 dependencies = [
  "proc-macro2",
 ]
@@ -3939,7 +3940,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
 dependencies = [
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -3968,7 +3969,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -4014,9 +4015,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
 
 [[package]]
 name = "reqwest"
-version = "0.12.9"
+version = "0.12.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f"
+checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da"
 dependencies = [
  "base64 0.22.1",
  "bytes",
@@ -4028,7 +4029,7 @@ dependencies = [
  "http-body 1.0.1",
  "http-body-util",
  "hyper 1.5.2",
- "hyper-rustls 0.27.4",
+ "hyper-rustls 0.27.5",
  "hyper-tls",
  "hyper-util",
  "ipnet",
@@ -4052,6 +4053,7 @@ dependencies = [
  "tokio-native-tls",
  "tokio-rustls 0.26.1",
  "tokio-util",
+ "tower",
  "tower-service",
  "url",
  "wasm-bindgen",
@@ -4227,9 +4229,9 @@ dependencies = [
 
 [[package]]
 name = "rustversion"
-version = "1.0.18"
+version = "1.0.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248"
+checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4"
 
 [[package]]
 name = "ryu"
@@ -4375,9 +4377,9 @@ checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba"
 
 [[package]]
 name = "serde"
-version = "1.0.216"
+version = "1.0.217"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
+checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
 dependencies = [
  "serde_derive",
 ]
@@ -4393,20 +4395,20 @@ dependencies = [
 
 [[package]]
 name = "serde_derive"
-version = "1.0.216"
+version = "1.0.217"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
+checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
 name = "serde_json"
-version = "1.0.133"
+version = "1.0.134"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
+checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d"
 dependencies = [
  "indexmap",
  "itoa",
@@ -4545,7 +4547,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -4651,7 +4653,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -4673,9 +4675,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.90"
+version = "2.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
+checksum = "987bc0be1cdea8b10216bd06e2ca407d40b9543468fafd3ddfb02f36e77f71f3"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -4699,14 +4701,14 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
 name = "sysinfo"
-version = "0.32.1"
+version = "0.33.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c33cd241af0f2e9e3b5c32163b873b29956890b5342e6745b917ce9d490f4af"
+checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01"
 dependencies = [
  "core-foundation-sys",
  "libc",
@@ -4751,11 +4753,11 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "2.0.8"
+version = "2.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08f5383f3e0071702bf93ab5ee99b52d26936be9dedd9413067cbdcddcb6141a"
+checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc"
 dependencies = [
- "thiserror-impl 2.0.8",
+ "thiserror-impl 2.0.9",
 ]
 
 [[package]]
@@ -4766,18 +4768,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "2.0.8"
+version = "2.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2f357fcec90b3caef6623a099691be676d033b40a058ac95d2a6ade6fa0c943"
+checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -4832,9 +4834,9 @@ dependencies = [
 
 [[package]]
 name = "tinyvec"
-version = "1.8.0"
+version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
+checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8"
 dependencies = [
  "tinyvec_macros",
 ]
@@ -4870,7 +4872,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -4917,6 +4919,27 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "tower"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tokio",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
 [[package]]
 name = "tower-service"
 version = "0.3.3"
@@ -4942,7 +4965,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -4987,7 +5010,7 @@ checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -5156,7 +5179,7 @@ dependencies = [
  "log",
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
  "wasm-bindgen-shared",
 ]
 
@@ -5191,7 +5214,7 @@ checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -5305,7 +5328,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -5316,7 +5339,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -5534,9 +5557,9 @@ checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4"
 
 [[package]]
 name = "xxhash-rust"
-version = "0.8.13"
+version = "0.8.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a08fd76779ae1883bbf1e46c2c46a75a0c4e37c445e68a24b01479d438f26ae6"
+checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
 
 [[package]]
 name = "yoke"
@@ -5558,7 +5581,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
  "synstructure",
 ]
 
@@ -5580,7 +5603,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
@@ -5600,7 +5623,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
  "synstructure",
 ]
 
@@ -5629,7 +5652,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.90",
+ "syn 2.0.94",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index b2d1b988fc4b..727cd49af796 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,7 +28,7 @@ avro-schema = { version = "0.3" }
 base64 = "0.22.0"
 bincode = "1.3.3"
 bitflags = "2"
-bytemuck = { version = "1.11", features = ["derive", "extern_crate_alloc"] }
+bytemuck = { version = "1.21", features = ["derive", "extern_crate_alloc"] }
 bytes = { version = "1.7" }
 chrono = { version = "0.4.31", default-features = false, features = ["std"] }
 chrono-tz = "0.10"
diff --git a/crates/polars-utils/Cargo.toml b/crates/polars-utils/Cargo.toml
index 2fb92f49e7f7..31cdf97883e8 100644
--- a/crates/polars-utils/Cargo.toml
+++ b/crates/polars-utils/Cargo.toml
@@ -30,7 +30,7 @@ rayon = { workspace = true }
 serde = { workspace = true, optional = true }
 serde_json = { workspace = true, optional = true }
 stacker = { workspace = true }
-sysinfo = { version = "0.32", default-features = false, features = ["system"], optional = true }
+sysinfo = { version = "0.33", default-features = false, features = ["system"], optional = true }
 
 [dev-dependencies]
 rand = { workspace = true }

From d3bcf0ab2049a7b56ae358a8904205cf3bb6c89f Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Thu, 2 Jan 2025 11:06:03 +0100
Subject: [PATCH 06/20] fix: Fix union (#20523)

---
 crates/polars-core/src/datatypes/mod.rs       |  2 ++
 crates/polars-core/src/datatypes/schema.rs    | 22 +++++++++++++++++++
 crates/polars-core/src/lib.rs                 |  1 +
 .../src/plans/conversion/dsl_to_ir.rs         |  4 +++-
 .../tests/unit/operations/test_concat.py      | 10 +++++++++
 5 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 crates/polars-core/src/datatypes/schema.rs

diff --git a/crates/polars-core/src/datatypes/mod.rs b/crates/polars-core/src/datatypes/mod.rs
index ed2f810f05bd..f9e4b71e5602 100644
--- a/crates/polars-core/src/datatypes/mod.rs
+++ b/crates/polars-core/src/datatypes/mod.rs
@@ -22,6 +22,7 @@ use std::fmt::{Display, Formatter};
 use std::hash::{Hash, Hasher};
 use std::ops::{Add, AddAssign, Div, Mul, Rem, Sub, SubAssign};
 
+mod schema;
 pub use aliases::*;
 pub use any_value::*;
 pub use arrow::array::{ArrayCollectIterExt, ArrayFromIter, ArrayFromIterDtype, StaticArray};
@@ -42,6 +43,7 @@ use polars_utils::abs_diff::AbsDiff;
 use polars_utils::float::IsFloat;
 use polars_utils::min_max::MinMax;
 use polars_utils::nulls::IsNull;
+pub use schema::SchemaExtPl;
 #[cfg(feature = "serde")]
 use serde::de::{EnumAccess, Error, Unexpected, VariantAccess, Visitor};
 #[cfg(any(feature = "serde", feature = "serde-lazy"))]
diff --git a/crates/polars-core/src/datatypes/schema.rs b/crates/polars-core/src/datatypes/schema.rs
new file mode 100644
index 000000000000..edc3b38dee7b
--- /dev/null
+++ b/crates/polars-core/src/datatypes/schema.rs
@@ -0,0 +1,22 @@
+use super::*;
+
+pub trait SchemaExtPl {
+    /// Answers whether this schema matches the given `other` schema.
+    ///
+    /// (Nested) Null types in this schema are allowed to match any type in
+    /// `other`, but not vice versa. In such a case `Ok(true)` is returned, because
+    /// a cast is necessary. If no cast is necessary `Ok(false)` is returned, and
+    /// an error is returned if the types are incompatible.
+    fn matches_schema(&self, other: &Schema) -> PolarsResult<bool>;
+}
+
+impl SchemaExtPl for Schema {
+    fn matches_schema(&self, other: &Schema) -> PolarsResult<bool> {
+        polars_ensure!(self.len() == other.len(), SchemaMismatch: "found different number of fields in schema's\n\nLeft schema: {} fields, right schema: {} fields.", self.len(), other.len());
+        let mut cast = false; // accumulates (ORs) each pair's result
+        for (a, b) in self.iter_values().zip(other.iter_values()) {
+            cast |= a.matches_schema_type(b)?; // paired by position; `?` propagates errors
+        }
+        Ok(cast)
+    }
+}
diff --git a/crates/polars-core/src/lib.rs b/crates/polars-core/src/lib.rs
index b81a65674eaa..25377fbfe62e 100644
--- a/crates/polars-core/src/lib.rs
+++ b/crates/polars-core/src/lib.rs
@@ -31,6 +31,7 @@ mod tests;
 use std::sync::Mutex;
 use std::time::{SystemTime, UNIX_EPOCH};
 
+pub use datatypes::SchemaExtPl;
 pub use hashing::IdBuildHasher;
 use once_cell::sync::Lazy;
 use rayon::{ThreadPool, ThreadPoolBuilder};
diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
index 60512a9b0703..3474c8079079 100644
--- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
+++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
@@ -384,8 +384,10 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult
             let schema = ctxt.lp_arena.get(first).schema(ctxt.lp_arena);
             for n in &inputs[1..] {
                 let schema_i = ctxt.lp_arena.get(*n).schema(ctxt.lp_arena);
-                polars_ensure!(schema == schema_i, InvalidOperation: "'union'/'concat' inputs should all have the same schema,\
+                // Every later input is checked against the schema of the first input.
+                schema_i.matches_schema(schema.as_ref()).map_err(|_| polars_err!(InvalidOperation:  "'union'/'concat' inputs should all have the same schema,\
                     got\n{:?} and \n{:?}", schema, schema_i)
+                )?;
             }
 
             let options = args.into();
diff --git a/py-polars/tests/unit/operations/test_concat.py b/py-polars/tests/unit/operations/test_concat.py
index 6c964764c181..a2664df1b000 100644
--- a/py-polars/tests/unit/operations/test_concat.py
+++ b/py-polars/tests/unit/operations/test_concat.py
@@ -97,3 +97,13 @@ def test_concat_series() -> None:
     assert pl.concat([s, s]).len() == 6
     # check if s remains unchanged
     assert s.len() == 3
+
+
+def test_concat_null_20501() -> None:
+    a = pl.DataFrame({"id": [1], "value": ["foo"]})
+    b = pl.DataFrame({"id": [2], "value": [None]})  # "value" holds only a null
+
+    assert pl.concat([a.lazy(), b.lazy()]).collect().to_dict(as_series=False) == {
+        "id": [1, 2],
+        "value": ["foo", None],  # the null row follows the string row
+    }

From 91d04b855a37aa116b6f871c8be0a1b4ab770434 Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Thu, 2 Jan 2025 11:08:24 +0100
Subject: [PATCH 07/20] fix: Fix global cat unique (#20524)

---
 .../src/chunked_array/logical/categorical/ops/unique.rs    | 7 ++++---
 py-polars/tests/unit/datatypes/test_categorical.py         | 7 +++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
index 6d337e3570e3..076099a9c33e 100644
--- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
+++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
@@ -31,10 +31,11 @@ impl CategoricalChunked {
                 Ok(out)
             }
         } else {
+            let has_nulls = (self.null_count() > 0) as u32;
             let mut state = match cat_map.as_ref() {
                 RevMapping::Global(map, values, _) => {
                     if self.is_enum() {
-                        PrimitiveRangedUniqueState::new(0, values.len() as u32 + 1)
+                        PrimitiveRangedUniqueState::new(0, values.len() as u32 + has_nulls)
                     } else {
                         let mut min = u32::MAX;
                         let mut max = 0u32;
@@ -44,11 +45,11 @@ impl CategoricalChunked {
                             max = max.max(v);
                         }
 
-                        PrimitiveRangedUniqueState::new(min, max)
+                        PrimitiveRangedUniqueState::new(min, max + has_nulls)
                     }
                 },
                 RevMapping::Local(values, _) => {
-                    PrimitiveRangedUniqueState::new(0, values.len() as u32 + 1)
+                    PrimitiveRangedUniqueState::new(0, values.len() as u32 + has_nulls)
                 },
             };
 
diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py
index 505986422c55..64b789281a21 100644
--- a/py-polars/tests/unit/datatypes/test_categorical.py
+++ b/py-polars/tests/unit/datatypes/test_categorical.py
@@ -898,3 +898,10 @@ def test_perfect_group_by_19950() -> None:
         "y": ["b"],
         "x": ["a"],
     }
+
+
+@StringCache()
+def test_categorical_unique() -> None:
+    s = pl.Series(["a", "b", None], dtype=pl.Categorical)
+    assert s.n_unique() == 3  # the null counts as its own unique value
+    assert s.unique().to_list() == ["a", "b", None]

From 9d7a7d335690ff9c314d88032f6051a0ee9c7b46 Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Thu, 2 Jan 2025 13:38:19 +0100
Subject: [PATCH 08/20] feat: Support arbitrary expressions in 'join_where'
 (#20525)

---
 .../polars-plan/src/plans/conversion/join.rs  | 336 +-----------------
 .../unit/operations/test_inequality_join.py   |  22 +-
 2 files changed, 28 insertions(+), 330 deletions(-)

diff --git a/crates/polars-plan/src/plans/conversion/join.rs b/crates/polars-plan/src/plans/conversion/join.rs
index dd94ed2d7784..ac7e6e1a9ae4 100644
--- a/crates/polars-plan/src/plans/conversion/join.rs
+++ b/crates/polars-plan/src/plans/conversion/join.rs
@@ -389,6 +389,11 @@ fn resolve_join_where(
     mut options: Arc<JoinOptions>,
     ctxt: &mut DslConversionContext,
 ) -> PolarsResult<(Node, Node)> {
+    // In eager mode always enable predicate pushdown; otherwise keep the flag as set.
+    if ctxt.opt_flags.eager() {
+        ctxt.opt_flags.set(OptFlags::PREDICATE_PUSHDOWN, true);
+    }
+    ctxt.opt_flags.set(OptFlags::COLLAPSE_JOINS, true);
     check_join_keys(&predicates)?;
     let input_left = to_alp_impl(Arc::unwrap_or_clone(input_left), ctxt)
         .map_err(|e| e.context(failed_here!(join left)))?;
@@ -403,17 +408,6 @@ fn resolve_join_where(
         .into_owned();
 
     for expr in &predicates {
-        let mut comparison_count = 0;
-        for _e in expr
-            .into_iter()
-            .filter(|e| matches!(e, Expr::BinaryExpr { op, .. } if op.is_comparison()))
-        {
-            comparison_count += 1;
-            if comparison_count > 1 {
-                polars_bail!(InvalidOperation: "only one binary comparison allowed in each 'join_where' predicate; found {:?}", expr);
-            }
-        }
-
         fn all_in_schema(
             schema: &Schema,
             other: Option<&Schema>,
@@ -437,317 +431,21 @@ fn resolve_join_where(
         polars_ensure!( valid, InvalidOperation: "'join_where' predicate only refers to columns from a single table")
     }
 
-    let owned = |e: Arc<Expr>| (*e).clone();
-
-    // We do a few things
-    // First we partition to:
-    // - IEjoin supported inequality predicates
-    // - equality predicates
-    // - remaining predicates
-    // And then decide to which join we dispatch.
-    // The remaining predicates will be applied as filter.
-
-    // What make things a bit complicated is that duplicate join names
-    // are referred to in the query with the name post-join, but on joins
-    // we refer to the names pre-join (e.g. without suffix). So there is some
-    // bookkeeping.
-    //
-    // - First we determine which side of the binary expression refers to the left and right table
-    // and make sure that lhs of the binary expr, maps to the lhs of the join tables and vice versa.
-    // Next we ensure the suffixes are removed when we partition.
-    //
-    // If a predicate has to be applied as post-join filter, we put the suffixes back if needed.
-    let mut ie_left_on = vec![];
-    let mut ie_right_on = vec![];
-    let mut ie_op = vec![];
-
-    let mut eq_left_on = vec![];
-    let mut eq_right_on = vec![];
-
-    let mut remaining_preds = vec![];
-
-    fn to_inequality_operator(op: &Operator) -> Option<InequalityOperator> {
-        match op {
-            Operator::Lt => Some(InequalityOperator::Lt),
-            Operator::LtEq => Some(InequalityOperator::LtEq),
-            Operator::Gt => Some(InequalityOperator::Gt),
-            Operator::GtEq => Some(InequalityOperator::GtEq),
-            _ => None,
-        }
-    }
-
-    fn rename_expr(e: Expr, old: &str, new: &str) -> Expr {
-        e.map_expr(|e| match e {
-            Expr::Column(name) if name.as_str() == old => Expr::Column(new.into()),
-            e => e,
-        })
-    }
-
-    fn determine_order_and_pre_join_names(
-        left: Expr,
-        op: Operator,
-        right: Expr,
-        schema_left: &Schema,
-        schema_right: &Schema,
-        suffix: &str,
-    ) -> PolarsResult<(Expr, Operator, Expr)> {
-        let left_names = expr_to_leaf_column_names_iter(&left).collect::<PlHashSet<_>>();
-        let right_names = expr_to_leaf_column_names_iter(&right).collect::<PlHashSet<_>>();
-
-        // All left should be in the left schema.
-        let (left_names, right_names, left, op, mut right) =
-            if !left_names.iter().all(|n| schema_left.contains(n)) {
-                // If all right names are in left schema -> swap
-                if right_names.iter().all(|n| schema_left.contains(n)) {
-                    (right_names, left_names, right, op.swap_operands(), left)
-                } else {
-                    polars_bail!(InvalidOperation: "got ambiguous column names in 'join_where'")
-                }
-            } else {
-                (left_names, right_names, left, op, right)
-            };
-        for name in &left_names {
-            polars_ensure!(!right_names.contains(name.as_str()), InvalidOperation: "found ambiguous column names in 'join_where'\n\n\
-            Note that you should refer to the column names as they are post-join operation.")
-        }
-
-        // Now we know left belongs to the left schema, rhs suffixes are dealt with.
-        for post_join_name in right_names {
-            if let Some(pre_join_name) = post_join_name.strip_suffix(suffix) {
-                // Name is both sides, so a suffix will be added by the join.
-                // We rename
-                if schema_right.contains(pre_join_name) && schema_left.contains(pre_join_name) {
-                    right = rename_expr(right, &post_join_name, pre_join_name);
-                }
-            }
-        }
-        Ok((left, op, right))
-    }
-
-    // Make it a binary comparison and ensure the columns refer to post join names.
-    fn to_binary_post_join(
-        l: Expr,
-        op: Operator,
-        mut r: Expr,
-        schema_right: &Schema,
-        suffix: &str,
-    ) -> Expr {
-        let names = expr_to_leaf_column_names_iter(&r).collect::<Vec<_>>();
-        for pre_join_name in &names {
-            if !schema_right.contains(pre_join_name) {
-                let post_join_name = _join_suffix_name(pre_join_name, suffix);
-                r = rename_expr(r, pre_join_name, post_join_name.as_str());
-            }
-        }
-
-        Expr::BinaryExpr {
-            left: Arc::from(l),
-            op,
-            right: Arc::from(r),
-        }
-    }
-
-    let suffix = options.args.suffix().clone();
-    for pred in predicates.into_iter() {
-        let Expr::BinaryExpr { left, op, right } = pred.clone() else {
-            polars_bail!(InvalidOperation: "can only join on binary (in)equality expressions, found {:?}", pred)
-        };
-        polars_ensure!(op.is_comparison(), InvalidOperation: "expected comparison in join predicate");
-        let (left, op, right) = determine_order_and_pre_join_names(
-            owned(left),
-            op,
-            owned(right),
-            &schema_left,
-            &schema_right,
-            &suffix,
-        )?;
-
-        if let Some(ie_op_) = to_inequality_operator(&op) {
-            fn is_numeric(e: &Expr, schema: &Schema) -> bool {
-                expr_to_leaf_column_names_iter(e).any(|name| {
-                    if let Some(dt) = schema.get(name.as_str()) {
-                        dt.to_physical().is_numeric()
-                    } else {
-                        false
-                    }
-                })
-            }
-
-            // We fallback to remaining if:
-            // - we already have an IEjoin or Inner join
-            // - we already have an Inner join
-            // - data is not numeric (our iejoin doesn't yet implement that)
-            if ie_op.len() >= 2
-                || !eq_right_on.is_empty()
-                || !is_numeric(&left, &schema_left)
-                || !is_numeric(&right, &schema_right)
-            {
-                remaining_preds.push(to_binary_post_join(left, op, right, &schema_right, &suffix))
-            } else {
-                ie_left_on.push(left);
-                ie_right_on.push(right);
-                ie_op.push(ie_op_)
-            }
-        } else if matches!(op, Operator::Eq) {
-            eq_left_on.push(left);
-            eq_right_on.push(right);
-        } else {
-            remaining_preds.push(to_binary_post_join(left, op, right, &schema_right, &suffix));
-        }
-    }
-
-    // Now choose a primary join and do the remaining predicates as filters
-    // Add the ie predicates to the remaining predicates buffer so that they will be executed in the
-    // filter node.
-    fn ie_predicates_to_remaining(
-        remaining_preds: &mut Vec<Expr>,
-        ie_left_on: Vec<Expr>,
-        ie_right_on: Vec<Expr>,
-        ie_op: Vec<InequalityOperator>,
-        schema_right: &Schema,
-        suffix: &str,
-    ) {
-        for ((l, op), r) in ie_left_on
-            .into_iter()
-            .zip(ie_op.into_iter())
-            .zip(ie_right_on.into_iter())
-        {
-            remaining_preds.push(to_binary_post_join(l, op.into(), r, schema_right, suffix))
-        }
-    }
-
-    let (mut last_node, join_node) = if !eq_left_on.is_empty() {
-        // We found one or more  equality predicates. Go into a default equi join
-        // as those are cheapest on avg.
-        let (last_node, join_node) = resolve_join(
-            Either::Right(input_left),
-            Either::Right(input_right),
-            eq_left_on,
-            eq_right_on,
-            vec![],
-            options.clone(),
-            ctxt,
-        )?;
-
-        ie_predicates_to_remaining(
-            &mut remaining_preds,
-            ie_left_on,
-            ie_right_on,
-            ie_op,
-            &schema_right,
-            &suffix,
-        );
-        (last_node, join_node)
-    } else if ie_right_on.len() >= 2 {
-        // Do an IEjoin.
-        let opts = Arc::make_mut(&mut options);
-
-        opts.args.how = JoinType::IEJoin;
-        opts.options = Some(JoinTypeOptionsIR::IEJoin(IEJoinOptions {
-            operator1: ie_op[0],
-            operator2: Some(ie_op[1]),
-        }));
-
-        let (last_node, join_node) = resolve_join(
-            Either::Right(input_left),
-            Either::Right(input_right),
-            ie_left_on[..2].to_vec(),
-            ie_right_on[..2].to_vec(),
-            vec![],
-            options.clone(),
-            ctxt,
-        )?;
-
-        // The surplus ie-predicates will be added to the remaining predicates so that
-        // they will be applied in a filter node.
-        while ie_right_on.len() > 2 {
-            // Invariant: they all have equal length, so we can pop and unwrap all while len > 2.
-            // The first 2 predicates are used in the
-            let l = ie_right_on.pop().unwrap();
-            let r = ie_left_on.pop().unwrap();
-            let op = ie_op.pop().unwrap();
-
-            remaining_preds.push(to_binary_post_join(l, op.into(), r, &schema_right, &suffix))
-        }
-        (last_node, join_node)
-    } else if ie_right_on.len() == 1 {
-        // For a single inequality comparison, we use the piecewise merge join algorithm
-        let opts = Arc::make_mut(&mut options);
-        opts.args.how = JoinType::IEJoin;
-        opts.options = Some(JoinTypeOptionsIR::IEJoin(IEJoinOptions {
-            operator1: ie_op[0],
-            operator2: None,
-        }));
-
-        resolve_join(
-            Either::Right(input_left),
-            Either::Right(input_right),
-            ie_left_on,
-            ie_right_on,
-            vec![],
-            options.clone(),
-            ctxt,
-        )?
-    } else {
-        // No predicates found that are supported in a fast algorithm.
-        // Do a cross join and follow up with filters.
-        let opts = Arc::make_mut(&mut options);
-        opts.args.how = JoinType::Cross;
-
-        resolve_join(
-            Either::Right(input_left),
-            Either::Right(input_right),
-            vec![],
-            vec![],
-            vec![],
-            options.clone(),
-            ctxt,
-        )?
-    };
-
-    let IR::Join {
-        input_left,
-        input_right,
-        ..
-    } = ctxt.lp_arena.get(join_node)
-    else {
-        unreachable!()
-    };
-    let schema_right = ctxt
-        .lp_arena
-        .get(*input_right)
-        .schema(ctxt.lp_arena)
-        .into_owned();
+    let opts = Arc::make_mut(&mut options);
+    opts.args.how = JoinType::Cross;
 
-    let schema_left = ctxt
-        .lp_arena
-        .get(*input_left)
-        .schema(ctxt.lp_arena)
-        .into_owned();
+    let (mut last_node, join_node) = resolve_join(
+        Either::Right(input_left),
+        Either::Right(input_right),
+        vec![],
+        vec![],
+        vec![],
+        options.clone(),
+        ctxt,
+    )?;
 
-    // Ensure that the predicates use the proper suffix
-    for e in remaining_preds {
+    for e in predicates {
         let predicate = to_expr_ir_ignore_alias(e, ctxt.expr_arena)?;
-        let AExpr::BinaryExpr { mut right, .. } = *ctxt.expr_arena.get(predicate.node()) else {
-            unreachable!()
-        };
-
-        let original_right = right;
-
-        for name in aexpr_to_leaf_names(right, ctxt.expr_arena) {
-            polars_ensure!(schema_right.contains(name.as_str()), ColumnNotFound: "could not find column {name} in the right table during join operation");
-            if schema_left.contains(name.as_str()) {
-                let new_name = _join_suffix_name(name.as_str(), suffix.as_str());
-
-                right = rename_matching_aexpr_leaf_names(
-                    right,
-                    ctxt.expr_arena,
-                    name.as_str(),
-                    new_name,
-                );
-            }
-        }
-        ctxt.expr_arena.swap(right, original_right);
 
         let ir = IR::Filter {
             input: last_node,
diff --git a/py-polars/tests/unit/operations/test_inequality_join.py b/py-polars/tests/unit/operations/test_inequality_join.py
index 848a4b2b7f85..2495d5b84f2d 100644
--- a/py-polars/tests/unit/operations/test_inequality_join.py
+++ b/py-polars/tests/unit/operations/test_inequality_join.py
@@ -461,17 +461,6 @@ def test_raise_on_ambiguous_name() -> None:
         df.join_where(df, pl.col("id") >= pl.col("id"))
 
 
-def test_raise_on_multiple_binary_comparisons() -> None:
-    df = pl.DataFrame({"id": [1, 2]})
-    with pytest.raises(
-        pl.exceptions.InvalidOperationError,
-        match="only one binary comparison allowed in each 'join_where' predicate; found ",
-    ):
-        df.join_where(
-            df, (pl.col("id") < pl.col("id")) ^ (pl.col("id") >= pl.col("id"))
-        )
-
-
 def test_raise_invalid_input_join_where() -> None:
     df = pl.DataFrame({"id": [1, 2]})
     with pytest.raises(
@@ -681,3 +670,14 @@ def test_join_where_literal_20061() -> None:
         "value_right": [5, 5, 5, 25],
         "flag_right": [1, 1, 1, 1],
     }
+
+
+def test_boolean_predicate_join_where() -> None:
+    urls = pl.LazyFrame({"url": "abcd.com/page"})
+    categories = pl.LazyFrame({"base_url": "abcd.com", "category": "landing page"})
+    assert (
+        "NESTED LOOP JOIN"  # substring expected in the explained plan
+        in urls.join_where(  # predicate is a string function, not a comparison
+            categories, pl.col("url").str.starts_with(pl.col("base_url"))
+        ).explain()
+    )

From 11fa6de9f9a69099dab14cd945fd4e32aa38ee40 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Thu, 2 Jan 2025 09:12:44 -0500
Subject: [PATCH 09/20] fix: Fix various `Int128` operations (#20515)

---
 crates/polars-core/src/series/mod.rs          |  2 ++
 crates/polars-core/src/series/ops/downcast.rs |  2 +-
 .../src/chunked_array/list/sum_mean.rs        |  1 +
 .../src/frame/join/hash_join/sort_merge.rs    |  4 +++
 crates/polars-ops/src/series/ops/abs.rs       |  2 ++
 crates/polars-ops/src/series/ops/cum_agg.rs   |  4 +++
 .../series/ops/interpolation/interpolate.rs   |  1 +
 .../polars-plan/src/dsl/function_expr/cum.rs  |  2 ++
 crates/polars-plan/src/dsl/mod.rs             |  2 ++
 crates/polars-python/src/expr/rolling.rs      | 12 ++++++++
 crates/polars-python/src/series/comparison.rs |  6 ++++
 py-polars/polars/datatypes/convert.py         |  3 ++
 .../tests/unit/lazyframe/test_lazyframe.py    | 29 ++++++++++++++-----
 .../tests/unit/operations/rolling/test_map.py | 13 +++++----
 py-polars/tests/unit/operations/test_abs.py   |  5 ++--
 .../tests/unit/operations/test_interpolate.py |  1 +
 py-polars/tests/unit/series/test_series.py    | 20 ++++++++-----
 17 files changed, 86 insertions(+), 23 deletions(-)

diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs
index 01dbcf33db33..43de366857f8 100644
--- a/crates/polars-core/src/series/mod.rs
+++ b/crates/polars-core/src/series/mod.rs
@@ -747,6 +747,8 @@ impl Series {
                 },
                 Int64 => Ok(self.i64().unwrap().prod_reduce()),
                 UInt64 => Ok(self.u64().unwrap().prod_reduce()),
+                #[cfg(feature = "dtype-i128")]
+                Int128 => Ok(self.i128().unwrap().prod_reduce()),
                 Float32 => Ok(self.f32().unwrap().prod_reduce()),
                 Float64 => Ok(self.f64().unwrap().prod_reduce()),
                 dt => {
diff --git a/crates/polars-core/src/series/ops/downcast.rs b/crates/polars-core/src/series/ops/downcast.rs
index f095c512eb67..732d2228a55b 100644
--- a/crates/polars-core/src/series/ops/downcast.rs
+++ b/crates/polars-core/src/series/ops/downcast.rs
@@ -219,7 +219,7 @@ impl Series {
             .ok_or_else(|| unpack_chunked_err!(self => "Int64"))
     }
 
-    /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int64`]
+    /// Unpack to [`ChunkedArray`] of dtype [`DataType::Int128`]
     #[cfg(feature = "dtype-i128")]
     pub fn i128(&self) -> PolarsResult<&Int128Chunked> {
         self.try_i128()
diff --git a/crates/polars-ops/src/chunked_array/list/sum_mean.rs b/crates/polars-ops/src/chunked_array/list/sum_mean.rs
index a1d73877ae99..9413318f1eac 100644
--- a/crates/polars-ops/src/chunked_array/list/sum_mean.rs
+++ b/crates/polars-ops/src/chunked_array/list/sum_mean.rs
@@ -161,6 +161,7 @@ pub(super) fn mean_list_numerical(ca: &ListChunked, inner_type: &DataType) -> Se
                 Int16 => dispatch_mean::<i16, f64>(values, offsets, arr.validity()),
                 Int32 => dispatch_mean::<i32, f64>(values, offsets, arr.validity()),
                 Int64 => dispatch_mean::<i64, f64>(values, offsets, arr.validity()),
+                Int128 => dispatch_mean::<i128, f64>(values, offsets, arr.validity()),
                 UInt8 => dispatch_mean::<u8, f64>(values, offsets, arr.validity()),
                 UInt16 => dispatch_mean::<u16, f64>(values, offsets, arr.validity()),
                 UInt32 => dispatch_mean::<u32, f64>(values, offsets, arr.validity()),
diff --git a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs
index 2053db8786e8..5b2f83282a76 100644
--- a/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs
+++ b/crates/polars-ops/src/frame/join/hash_join/sort_merge.rs
@@ -146,6 +146,10 @@ pub(super) fn par_sorted_merge_inner_no_nulls(
         DataType::Int64 => {
             par_sorted_merge_inner_impl(s_left.i64().unwrap(), s_right.i64().unwrap())
         },
+        #[cfg(feature = "dtype-i128")]
+        DataType::Int128 => {
+            par_sorted_merge_inner_impl(s_left.i128().unwrap(), s_right.i128().unwrap())
+        },
         DataType::Float32 => {
             par_sorted_merge_inner_impl(s_left.f32().unwrap(), s_right.f32().unwrap())
         },
diff --git a/crates/polars-ops/src/series/ops/abs.rs b/crates/polars-ops/src/series/ops/abs.rs
index 5a84678df591..e93e3c13c60d 100644
--- a/crates/polars-ops/src/series/ops/abs.rs
+++ b/crates/polars-ops/src/series/ops/abs.rs
@@ -10,6 +10,8 @@ pub fn abs(s: &Series) -> PolarsResult<Series> {
         Int16 => s.i16().unwrap().wrapping_abs().into_series(),
         Int32 => s.i32().unwrap().wrapping_abs().into_series(),
         Int64 => s.i64().unwrap().wrapping_abs().into_series(),
+        #[cfg(feature = "dtype-i128")]
+        Int128 => s.i128().unwrap().wrapping_abs().into_series(),
         Float32 => s.f32().unwrap().wrapping_abs().into_series(),
         Float64 => s.f64().unwrap().wrapping_abs().into_series(),
         #[cfg(feature = "dtype-decimal")]
diff --git a/crates/polars-ops/src/series/ops/cum_agg.rs b/crates/polars-ops/src/series/ops/cum_agg.rs
index 829c57c820d4..163aa10eb080 100644
--- a/crates/polars-ops/src/series/ops/cum_agg.rs
+++ b/crates/polars-ops/src/series/ops/cum_agg.rs
@@ -187,6 +187,8 @@ pub fn cum_prod(s: &Series, reverse: bool) -> PolarsResult<Series> {
         },
         Int64 => cum_prod_numeric(s.i64()?, reverse).into_series(),
         UInt64 => cum_prod_numeric(s.u64()?, reverse).into_series(),
+        #[cfg(feature = "dtype-i128")]
+        Int128 => cum_prod_numeric(s.i128()?, reverse).into_series(),
         Float32 => cum_prod_numeric(s.f32()?, reverse).into_series(),
         Float64 => cum_prod_numeric(s.f64()?, reverse).into_series(),
         dt => polars_bail!(opq = cum_prod, dt),
@@ -213,6 +215,8 @@ pub fn cum_sum(s: &Series, reverse: bool) -> PolarsResult<Series> {
         UInt32 => cum_sum_numeric(s.u32()?, reverse).into_series(),
         Int64 => cum_sum_numeric(s.i64()?, reverse).into_series(),
         UInt64 => cum_sum_numeric(s.u64()?, reverse).into_series(),
+        #[cfg(feature = "dtype-i128")]
+        Int128 => cum_sum_numeric(s.i128()?, reverse).into_series(),
         Float32 => cum_sum_numeric(s.f32()?, reverse).into_series(),
         Float64 => cum_sum_numeric(s.f64()?, reverse).into_series(),
         #[cfg(feature = "dtype-duration")]
diff --git a/crates/polars-ops/src/series/ops/interpolation/interpolate.rs b/crates/polars-ops/src/series/ops/interpolation/interpolate.rs
index 36d9dc12e556..095b38a6b20e 100644
--- a/crates/polars-ops/src/series/ops/interpolation/interpolate.rs
+++ b/crates/polars-ops/src/series/ops/interpolation/interpolate.rs
@@ -164,6 +164,7 @@ fn interpolate_linear(s: &Series) -> Series {
                     | DataType::Int16
                     | DataType::Int32
                     | DataType::Int64
+                    | DataType::Int128
                     | DataType::UInt8
                     | DataType::UInt16
                     | DataType::UInt32
diff --git a/crates/polars-plan/src/dsl/function_expr/cum.rs b/crates/polars-plan/src/dsl/function_expr/cum.rs
index 755199c3a2a0..02f652274907 100644
--- a/crates/polars-plan/src/dsl/function_expr/cum.rs
+++ b/crates/polars-plan/src/dsl/function_expr/cum.rs
@@ -38,6 +38,7 @@ pub(super) mod dtypes {
             match dt {
                 Boolean => UInt32,
                 Int32 => Int32,
+                Int128 => Int128,
                 UInt32 => UInt32,
                 UInt64 => UInt64,
                 Float32 => Float32,
@@ -56,6 +57,7 @@ pub(super) mod dtypes {
         match dt {
             Boolean => Int64,
             UInt64 => UInt64,
+            Int128 => Int128,
             Float32 => Float32,
             Float64 => Float64,
             _ => Int64,
diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs
index 02e276c98565..ef27dc3966b9 100644
--- a/crates/polars-plan/src/dsl/mod.rs
+++ b/crates/polars-plan/src/dsl/mod.rs
@@ -850,6 +850,8 @@ impl Expr {
                     T::Float32 => T::Float32,
                     T::Float64 => T::Float64,
                     T::UInt64 => T::UInt64,
+                    #[cfg(feature = "dtype-i128")]
+                    T::Int128 => T::Int128,
                     _ => T::Int64,
                 })
             }),
diff --git a/crates/polars-python/src/expr/rolling.rs b/crates/polars-python/src/expr/rolling.rs
index 5ef511902613..b9d38124247a 100644
--- a/crates/polars-python/src/expr/rolling.rs
+++ b/crates/polars-python/src/expr/rolling.rs
@@ -457,6 +457,18 @@ impl PyExpr {
                                     })
                                 }
                             },
+                            Int128 => {
+                                if is_float {
+                                    let v = obj.extract::<f64>(py).unwrap();
+                                    Ok(Int128Chunked::from_slice(PlSmallStr::EMPTY, &[v as i128])
+                                        .into_series())
+                                } else {
+                                    obj.extract::<i128>(py).map(|v| {
+                                        Int128Chunked::from_slice(PlSmallStr::EMPTY, &[v])
+                                            .into_series()
+                                    })
+                                }
+                            },
                             Float32 => obj.extract::<f32>(py).map(|v| {
                                 Float32Chunked::from_slice(PlSmallStr::EMPTY, &[v]).into_series()
                             }),
diff --git a/crates/polars-python/src/series/comparison.rs b/crates/polars-python/src/series/comparison.rs
index 2b7de37931f9..38afdbd1c667 100644
--- a/crates/polars-python/src/series/comparison.rs
+++ b/crates/polars-python/src/series/comparison.rs
@@ -71,6 +71,7 @@ impl_eq_num!(eq_i8, i8);
 impl_eq_num!(eq_i16, i16);
 impl_eq_num!(eq_i32, i32);
 impl_eq_num!(eq_i64, i64);
+impl_eq_num!(eq_i128, i128);
 impl_eq_num!(eq_f32, f32);
 impl_eq_num!(eq_f64, f64);
 impl_eq_num!(eq_str, &str);
@@ -98,6 +99,7 @@ impl_neq_num!(neq_i8, i8);
 impl_neq_num!(neq_i16, i16);
 impl_neq_num!(neq_i32, i32);
 impl_neq_num!(neq_i64, i64);
+impl_neq_num!(neq_i128, i128);
 impl_neq_num!(neq_f32, f32);
 impl_neq_num!(neq_f64, f64);
 impl_neq_num!(neq_str, &str);
@@ -124,6 +126,7 @@ impl_gt_num!(gt_i8, i8);
 impl_gt_num!(gt_i16, i16);
 impl_gt_num!(gt_i32, i32);
 impl_gt_num!(gt_i64, i64);
+impl_gt_num!(gt_i128, i128);
 impl_gt_num!(gt_f32, f32);
 impl_gt_num!(gt_f64, f64);
 impl_gt_num!(gt_str, &str);
@@ -150,6 +153,7 @@ impl_gt_eq_num!(gt_eq_i8, i8);
 impl_gt_eq_num!(gt_eq_i16, i16);
 impl_gt_eq_num!(gt_eq_i32, i32);
 impl_gt_eq_num!(gt_eq_i64, i64);
+impl_gt_eq_num!(gt_eq_i128, i128);
 impl_gt_eq_num!(gt_eq_f32, f32);
 impl_gt_eq_num!(gt_eq_f64, f64);
 impl_gt_eq_num!(gt_eq_str, &str);
@@ -177,6 +181,7 @@ impl_lt_num!(lt_i8, i8);
 impl_lt_num!(lt_i16, i16);
 impl_lt_num!(lt_i32, i32);
 impl_lt_num!(lt_i64, i64);
+impl_lt_num!(lt_i128, i128);
 impl_lt_num!(lt_f32, f32);
 impl_lt_num!(lt_f64, f64);
 impl_lt_num!(lt_str, &str);
@@ -203,6 +208,7 @@ impl_lt_eq_num!(lt_eq_i8, i8);
 impl_lt_eq_num!(lt_eq_i16, i16);
 impl_lt_eq_num!(lt_eq_i32, i32);
 impl_lt_eq_num!(lt_eq_i64, i64);
+impl_lt_eq_num!(lt_eq_i128, i128);
 impl_lt_eq_num!(lt_eq_f32, f32);
 impl_lt_eq_num!(lt_eq_f64, f64);
 impl_lt_eq_num!(lt_eq_str, &str);
diff --git a/py-polars/polars/datatypes/convert.py b/py-polars/polars/datatypes/convert.py
index 423c687833ca..20abe6737854 100644
--- a/py-polars/polars/datatypes/convert.py
+++ b/py-polars/polars/datatypes/convert.py
@@ -28,6 +28,7 @@
     Int16,
     Int32,
     Int64,
+    Int128,
     List,
     Null,
     Object,
@@ -149,6 +150,7 @@ def DTYPE_TO_FFINAME(self) -> dict[PolarsDataType, str]:
             Duration: "duration",
             Float32: "f32",
             Float64: "f64",
+            Int128: "i128",
             Int16: "i16",
             Int32: "i32",
             Int64: "i64",
@@ -177,6 +179,7 @@ def DTYPE_TO_PY_TYPE(self) -> dict[PolarsDataType, PythonDataType]:
             Duration: timedelta,
             Float32: float,
             Float64: float,
+            Int128: int,
             Int16: int,
             Int32: int,
             Int64: int,
diff --git a/py-polars/tests/unit/lazyframe/test_lazyframe.py b/py-polars/tests/unit/lazyframe/test_lazyframe.py
index 38f89ff87852..e94590a27d03 100644
--- a/py-polars/tests/unit/lazyframe/test_lazyframe.py
+++ b/py-polars/tests/unit/lazyframe/test_lazyframe.py
@@ -19,7 +19,7 @@
     PolarsInefficientMapWarning,
 )
 from polars.testing import assert_frame_equal, assert_series_equal
-from tests.unit.conftest import FLOAT_DTYPES
+from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES
 
 if TYPE_CHECKING:
     from _pytest.capture import CaptureFixture
@@ -488,19 +488,34 @@ def test_len() -> None:
     assert cast(int, ldf.select(pl.col("nrs").len()).collect().item()) == 3
 
 
-def test_cum_agg() -> None:
-    ldf = pl.LazyFrame({"a": [1, 2, 3, 2]})
+@pytest.mark.parametrize("dtype", NUMERIC_DTYPES)
+def test_cum_agg(dtype: PolarsDataType) -> None:
+    ldf = pl.LazyFrame({"a": [1, 2, 3, 2]}, schema={"a": dtype})
     assert_series_equal(
-        ldf.select(pl.col("a").cum_sum()).collect()["a"], pl.Series("a", [1, 3, 6, 8])
+        ldf.select(pl.col("a").cum_min()).collect()["a"],
+        pl.Series("a", [1, 1, 1, 1], dtype=dtype),
     )
     assert_series_equal(
-        ldf.select(pl.col("a").cum_min()).collect()["a"], pl.Series("a", [1, 1, 1, 1])
+        ldf.select(pl.col("a").cum_max()).collect()["a"],
+        pl.Series("a", [1, 2, 3, 3], dtype=dtype),
+    )
+
+    expected_dtype = (
+        pl.Int64 if dtype in [pl.Int8, pl.Int16, pl.UInt8, pl.UInt16] else dtype
     )
     assert_series_equal(
-        ldf.select(pl.col("a").cum_max()).collect()["a"], pl.Series("a", [1, 2, 3, 3])
+        ldf.select(pl.col("a").cum_sum()).collect()["a"],
+        pl.Series("a", [1, 3, 6, 8], dtype=expected_dtype),
+    )
+
+    expected_dtype = (
+        pl.Int64
+        if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.UInt8, pl.UInt16, pl.UInt32]
+        else dtype
     )
     assert_series_equal(
-        ldf.select(pl.col("a").cum_prod()).collect()["a"], pl.Series("a", [1, 2, 6, 12])
+        ldf.select(pl.col("a").cum_prod()).collect()["a"],
+        pl.Series("a", [1, 2, 6, 12], dtype=expected_dtype),
     )
 
 
diff --git a/py-polars/tests/unit/operations/rolling/test_map.py b/py-polars/tests/unit/operations/rolling/test_map.py
index 730b76baad55..cf3b1cefbf80 100644
--- a/py-polars/tests/unit/operations/rolling/test_map.py
+++ b/py-polars/tests/unit/operations/rolling/test_map.py
@@ -7,6 +7,7 @@
 
 import polars as pl
 from polars.testing import assert_series_equal
+from tests.unit.conftest import INTEGER_DTYPES
 
 if TYPE_CHECKING:
     from polars._typing import PolarsDataType
@@ -82,17 +83,19 @@ def test_rolling_map_std_weights(dtype: PolarsDataType) -> None:
     assert_series_equal(result, expected)
 
 
-def test_rolling_map_sum_int() -> None:
-    s = pl.Series("A", [1, 2, 9, 2, 13], dtype=pl.Int32)
+@pytest.mark.parametrize("dtype", INTEGER_DTYPES)
+def test_rolling_map_sum_int(dtype: PolarsDataType) -> None:
+    s = pl.Series("A", [1, 2, 9, 2, 13], dtype=dtype)
 
     result = s.rolling_map(function=lambda s: s.sum(), window_size=3)
 
-    expected = pl.Series("A", [None, None, 12, 13, 24], dtype=pl.Int32)
+    expected = pl.Series("A", [None, None, 12, 13, 24], dtype=dtype)
     assert_series_equal(result, expected)
 
 
-def test_rolling_map_sum_int_cast_to_float() -> None:
-    s = pl.Series("A", [1, 2, 9, None, 13], dtype=pl.Int32)
+@pytest.mark.parametrize("dtype", INTEGER_DTYPES)
+def test_rolling_map_sum_int_cast_to_float(dtype: PolarsDataType) -> None:
+    s = pl.Series("A", [1, 2, 9, None, 13], dtype=dtype)
 
     result = s.rolling_map(
         function=lambda s: s.sum(), window_size=3, weights=[1.0, 2.0, 3.0]
diff --git a/py-polars/tests/unit/operations/test_abs.py b/py-polars/tests/unit/operations/test_abs.py
index 68e4518a93f9..ad0d6eadf9c1 100644
--- a/py-polars/tests/unit/operations/test_abs.py
+++ b/py-polars/tests/unit/operations/test_abs.py
@@ -10,6 +10,7 @@
 import polars as pl
 from polars.exceptions import InvalidOperationError
 from polars.testing import assert_frame_equal, assert_series_equal
+from tests.unit.conftest import FLOAT_DTYPES, SIGNED_INTEGER_DTYPES
 
 if TYPE_CHECKING:
     from polars._typing import PolarsDataType
@@ -47,9 +48,7 @@ def test_builtin_abs() -> None:
     assert abs(s).to_list() == [1, 0, 1, None]
 
 
-@pytest.mark.parametrize(
-    "dtype", [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.Float32, pl.Float64]
-)
+@pytest.mark.parametrize("dtype", [*FLOAT_DTYPES, *SIGNED_INTEGER_DTYPES])
 def test_abs_builtin(dtype: PolarsDataType) -> None:
     lf = pl.LazyFrame({"a": [-1, 0, 1, None]}, schema={"a": dtype})
     result = lf.select(abs(pl.col("a")))
diff --git a/py-polars/tests/unit/operations/test_interpolate.py b/py-polars/tests/unit/operations/test_interpolate.py
index 9f690e6ecd7b..5d39ffc751fa 100644
--- a/py-polars/tests/unit/operations/test_interpolate.py
+++ b/py-polars/tests/unit/operations/test_interpolate.py
@@ -22,6 +22,7 @@
         (pl.Int16, pl.Float64),
         (pl.Int32, pl.Float64),
         (pl.Int64, pl.Float64),
+        (pl.Int128, pl.Float64),
         (pl.UInt8, pl.Float64),
         (pl.UInt16, pl.Float64),
         (pl.UInt32, pl.Float64),
diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py
index fc3872ce1f3b..68b2528d5da6 100644
--- a/py-polars/tests/unit/series/test_series.py
+++ b/py-polars/tests/unit/series/test_series.py
@@ -30,6 +30,7 @@
     ShapeError,
 )
 from polars.testing import assert_frame_equal, assert_series_equal
+from tests.unit.conftest import FLOAT_DTYPES, INTEGER_DTYPES
 from tests.unit.utils.pycapsule_utils import PyCapsuleStreamHolder
 
 if TYPE_CHECKING:
@@ -1717,23 +1718,28 @@ def test_trigonometric_invalid_input() -> None:
         s.cosh()
 
 
-def test_product() -> None:
-    a = pl.Series("a", [1, 2, 3])
+@pytest.mark.parametrize("dtype", INTEGER_DTYPES)
+def test_product_ints(dtype: PolarsDataType) -> None:
+    a = pl.Series("a", [1, 2, 3], dtype=dtype)
     out = a.product()
     assert out == 6
-    a = pl.Series("a", [1, 2, None])
+    a = pl.Series("a", [1, 2, None], dtype=dtype)
     out = a.product()
     assert out == 2
-    a = pl.Series("a", [None, 2, 3])
+    a = pl.Series("a", [None, 2, 3], dtype=dtype)
     out = a.product()
     assert out == 6
-    a = pl.Series("a", [], dtype=pl.Float32)
+
+
+@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
+def test_product_floats(dtype: PolarsDataType) -> None:
+    a = pl.Series("a", [], dtype=dtype)
     out = a.product()
     assert out == 1
-    a = pl.Series("a", [None, None], dtype=pl.Float32)
+    a = pl.Series("a", [None, None], dtype=dtype)
     out = a.product()
     assert out == 1
-    a = pl.Series("a", [3.0, None, float("nan")])
+    a = pl.Series("a", [3.0, None, float("nan")], dtype=dtype)
     out = a.product()
     assert math.isnan(out)
 

From 5c9bb7189f220d9064034b4c03175cf841c06d77 Mon Sep 17 00:00:00 2001
From: Marshall <mcrumiller@users.noreply.github.com>
Date: Fri, 3 Jan 2025 02:53:35 -0500
Subject: [PATCH 10/20] fix: Add `unique` fast path for empty categoricals
 (#20536)

---
 .../chunked_array/logical/categorical/ops/unique.rs  | 12 ++++++++++++
 .../tests/unit/operations/unique/test_unique.py      | 10 ++++++++++
 2 files changed, 22 insertions(+)

diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
index 076099a9c33e..7792fae8a544 100644
--- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
+++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
@@ -7,6 +7,18 @@ use super::*;
 impl CategoricalChunked {
     pub fn unique(&self) -> PolarsResult<Self> {
         let cat_map = self.get_rev_map();
+        if self.is_empty() {
+            // SAFETY: rev map is valid.
+            unsafe {
+                return Ok(CategoricalChunked::from_cats_and_rev_map_unchecked(
+                    UInt32Chunked::full_null(self.name().clone(), 0),
+                    cat_map.clone(),
+                    self.is_enum(),
+                    self.get_ordering(),
+                ));
+            }
+        };
+
         if self._can_fast_unique() {
             let ca = match &**cat_map {
                 RevMapping::Local(a, _) => UInt32Chunked::from_iter_values(
diff --git a/py-polars/tests/unit/operations/unique/test_unique.py b/py-polars/tests/unit/operations/unique/test_unique.py
index 595ae1db59eb..ff4a0cd10f32 100644
--- a/py-polars/tests/unit/operations/unique/test_unique.py
+++ b/py-polars/tests/unit/operations/unique/test_unique.py
@@ -154,6 +154,16 @@ def test_unique_categorical(input: list[str | None], output: list[str | None]) -
     assert_series_equal(result, expected)
 
 
+def test_unique_categorical_global() -> None:
+    with pl.StringCache():
+        # Only Categorical data registers its strings in the global cache.
+        pl.Series(["aaaa", "bbbb", "cccc"], dtype=pl.Categorical)  # pre-fill cache
+        s = pl.Series(["a", "b", "c"], dtype=pl.Categorical)
+        s_empty = s.slice(0, 0)
+        assert s_empty.unique().to_list() == []
+        assert_series_equal(s_empty.cat.get_categories(), pl.Series(["a", "b", "c"]))
+
+
 def test_unique_with_null() -> None:
     df = pl.DataFrame(
         {

From 5f4499773678fe3104daa44646794686d82f273e Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Fri, 3 Jan 2025 05:26:36 -0500
Subject: [PATCH 11/20] feat: Add `Int128` IO support for csv & ipc (#20535)

---
 .../polars-arrow/src/array/dictionary/mod.rs  |  4 +++
 crates/polars-arrow/src/datatypes/mod.rs      |  1 +
 .../src/datatypes/physical_type.rs            |  2 ++
 crates/polars-arrow/src/io/ipc/read/schema.rs |  3 +-
 .../polars-arrow/src/io/ipc/write/schema.rs   |  3 +-
 crates/polars-arrow/src/util/macros.rs        |  1 +
 .../polars-compute/src/comparisons/array.rs   |  1 +
 .../src/comparisons/dyn_array.rs              |  1 +
 crates/polars-compute/src/comparisons/list.rs |  2 ++
 crates/polars-io/Cargo.toml                   |  1 +
 crates/polars-io/src/csv/read/buffer.rs       | 26 ++++++++++++++++
 .../src/csv/write/write_impl/serializer.rs    |  1 +
 crates/polars/Cargo.toml                      |  1 +
 py-polars/tests/unit/io/test_csv.py           | 23 ++++++++++----
 py-polars/tests/unit/io/test_ipc.py           | 31 ++++++++++++++++---
 15 files changed, 88 insertions(+), 13 deletions(-)

diff --git a/crates/polars-arrow/src/array/dictionary/mod.rs b/crates/polars-arrow/src/array/dictionary/mod.rs
index 3f44dd604980..8d31109d8f19 100644
--- a/crates/polars-arrow/src/array/dictionary/mod.rs
+++ b/crates/polars-arrow/src/array/dictionary/mod.rs
@@ -81,6 +81,10 @@ unsafe impl DictionaryKey for i64 {
     const KEY_TYPE: IntegerType = IntegerType::Int64;
     const MAX_USIZE_VALUE: usize = i64::MAX as usize;
 }
+unsafe impl DictionaryKey for i128 {
+    const KEY_TYPE: IntegerType = IntegerType::Int128;
+    const MAX_USIZE_VALUE: usize = i128::MAX as usize; // narrowing cast: truncates to usize::MAX
+}
 unsafe impl DictionaryKey for u8 {
     const KEY_TYPE: IntegerType = IntegerType::UInt8;
     const MAX_USIZE_VALUE: usize = u8::MAX as usize;
diff --git a/crates/polars-arrow/src/datatypes/mod.rs b/crates/polars-arrow/src/datatypes/mod.rs
index d3bc5417a9d8..cc7a081a81cc 100644
--- a/crates/polars-arrow/src/datatypes/mod.rs
+++ b/crates/polars-arrow/src/datatypes/mod.rs
@@ -455,6 +455,7 @@ impl From<IntegerType> for ArrowDataType {
             IntegerType::Int16 => ArrowDataType::Int16,
             IntegerType::Int32 => ArrowDataType::Int32,
             IntegerType::Int64 => ArrowDataType::Int64,
+            IntegerType::Int128 => ArrowDataType::Int128,
             IntegerType::UInt8 => ArrowDataType::UInt8,
             IntegerType::UInt16 => ArrowDataType::UInt16,
             IntegerType::UInt32 => ArrowDataType::UInt32,
diff --git a/crates/polars-arrow/src/datatypes/physical_type.rs b/crates/polars-arrow/src/datatypes/physical_type.rs
index 732a129055a6..f75d8e644f4c 100644
--- a/crates/polars-arrow/src/datatypes/physical_type.rs
+++ b/crates/polars-arrow/src/datatypes/physical_type.rs
@@ -76,6 +76,8 @@ pub enum IntegerType {
     Int32,
     /// A signed 64-bit integer.
     Int64,
+    /// A signed 128-bit integer.
+    Int128,
     /// An unsigned 8-bit integer.
     UInt8,
     /// An unsigned 16-bit integer.
diff --git a/crates/polars-arrow/src/io/ipc/read/schema.rs b/crates/polars-arrow/src/io/ipc/read/schema.rs
index d9bb3b21828e..3ed84d3005bd 100644
--- a/crates/polars-arrow/src/io/ipc/read/schema.rs
+++ b/crates/polars-arrow/src/io/ipc/read/schema.rs
@@ -72,7 +72,8 @@ fn deserialize_integer(int: arrow_format::ipc::IntRef) -> PolarsResult<IntegerTy
         (32, false) => IntegerType::UInt32,
         (64, true) => IntegerType::Int64,
         (64, false) => IntegerType::UInt64,
-        _ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32 or 64."),
+        (128, true) => IntegerType::Int128,
+        _ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32, 64 or 128."),
     })
 }
 
diff --git a/crates/polars-arrow/src/io/ipc/write/schema.rs b/crates/polars-arrow/src/io/ipc/write/schema.rs
index a7e15bbdf464..bdceb58acc5d 100644
--- a/crates/polars-arrow/src/io/ipc/write/schema.rs
+++ b/crates/polars-arrow/src/io/ipc/write/schema.rs
@@ -327,7 +327,7 @@ pub(crate) fn serialize_dictionary(
 ) -> arrow_format::ipc::DictionaryEncoding {
     use IntegerType::*;
     let is_signed = match index_type {
-        Int8 | Int16 | Int32 | Int64 => true,
+        Int8 | Int16 | Int32 | Int64 | Int128 => true,
         UInt8 | UInt16 | UInt32 | UInt64 => false,
     };
 
@@ -336,6 +336,7 @@ pub(crate) fn serialize_dictionary(
         Int16 | UInt16 => 16,
         Int32 | UInt32 => 32,
         Int64 | UInt64 => 64,
+        Int128 => 128,
     };
 
     let index_type = arrow_format::ipc::Int {
diff --git a/crates/polars-arrow/src/util/macros.rs b/crates/polars-arrow/src/util/macros.rs
index fb5bd61ebba0..2153d2cb3a07 100644
--- a/crates/polars-arrow/src/util/macros.rs
+++ b/crates/polars-arrow/src/util/macros.rs
@@ -57,6 +57,7 @@ macro_rules! match_integer_type {(
         Int16 => __with_ty__! { i16 },
         Int32 => __with_ty__! { i32 },
         Int64 => __with_ty__! { i64 },
+        Int128 => __with_ty__! { i128 },
         UInt8 => __with_ty__! { u8 },
         UInt16 => __with_ty__! { u16 },
         UInt32 => __with_ty__! { u32 },
diff --git a/crates/polars-compute/src/comparisons/array.rs b/crates/polars-compute/src/comparisons/array.rs
index facde12a5c37..210a8a0489aa 100644
--- a/crates/polars-compute/src/comparisons/array.rs
+++ b/crates/polars-compute/src/comparisons/array.rs
@@ -205,6 +205,7 @@ macro_rules! compare {
             PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>),
             PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>),
             PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>),
+            PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>),
             PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>),
             PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>),
             PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>),
diff --git a/crates/polars-compute/src/comparisons/dyn_array.rs b/crates/polars-compute/src/comparisons/dyn_array.rs
index 3ee3d802f09f..07fd4bbd9a9d 100644
--- a/crates/polars-compute/src/comparisons/dyn_array.rs
+++ b/crates/polars-compute/src/comparisons/dyn_array.rs
@@ -68,6 +68,7 @@ macro_rules! compare {
             PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>, lhs, rhs, $op),
             PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>, lhs, rhs, $op),
             PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>, lhs, rhs, $op),
+            PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>, lhs, rhs, $op),
             PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>, lhs, rhs, $op),
             PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>, lhs, rhs, $op),
             PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>, lhs, rhs, $op),
diff --git a/crates/polars-compute/src/comparisons/list.rs b/crates/polars-compute/src/comparisons/list.rs
index f7e18b79c0e7..c7c1db50ed70 100644
--- a/crates/polars-compute/src/comparisons/list.rs
+++ b/crates/polars-compute/src/comparisons/list.rs
@@ -99,6 +99,7 @@ macro_rules! compare {
             PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>),
             PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>),
             PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>),
+            PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>),
             PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>),
             PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>),
             PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>),
@@ -196,6 +197,7 @@ macro_rules! compare_broadcast {
             PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>),
             PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>),
             PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>),
+            PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>),
             PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>),
             PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>),
             PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>),
diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml
index f4ff3dd431c7..4c3ed4b04a76 100644
--- a/crates/polars-io/Cargo.toml
+++ b/crates/polars-io/Cargo.toml
@@ -82,6 +82,7 @@ dtype-u8 = ["polars-core/dtype-u8"]
 dtype-u16 = ["polars-core/dtype-u16"]
 dtype-i8 = ["polars-core/dtype-i8"]
 dtype-i16 = ["polars-core/dtype-i16"]
+dtype-i128 = ["polars-core/dtype-i128"]
 dtype-categorical = ["polars-core/dtype-categorical"]
 dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"]
 object = ["polars-core/object"]
diff --git a/crates/polars-io/src/csv/read/buffer.rs b/crates/polars-io/src/csv/read/buffer.rs
index 22b8b34d3676..a13ab46e585e 100644
--- a/crates/polars-io/src/csv/read/buffer.rs
+++ b/crates/polars-io/src/csv/read/buffer.rs
@@ -82,6 +82,13 @@ impl PrimitiveParser for Int64Type {
         atoi_simd::parse_skipped(bytes).ok()
     }
 }
+#[cfg(feature = "dtype-i128")]
+impl PrimitiveParser for Int128Type {
+    #[inline]
+    fn parse(bytes: &[u8]) -> Option<i128> {
+        atoi_simd::parse_skipped(bytes).ok()
+    }
+}
 
 trait ParsedBuffer {
     fn parse_bytes(
@@ -522,6 +529,8 @@ pub fn init_buffers(
                 &DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)),
                 &DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)),
                 &DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)),
+                #[cfg(feature = "dtype-i128")]
+                &DataType::Int128 => Buffer::Int128(PrimitiveChunkedBuilder::new(name, capacity)),
                 #[cfg(feature = "dtype-u8")]
                 &DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),
                 #[cfg(feature = "dtype-u16")]
@@ -594,6 +603,8 @@ pub enum Buffer {
     Int16(PrimitiveChunkedBuilder<Int16Type>),
     Int32(PrimitiveChunkedBuilder<Int32Type>),
     Int64(PrimitiveChunkedBuilder<Int64Type>),
+    #[cfg(feature = "dtype-i128")]
+    Int128(PrimitiveChunkedBuilder<Int128Type>),
     #[cfg(feature = "dtype-u8")]
     UInt8(PrimitiveChunkedBuilder<UInt8Type>),
     #[cfg(feature = "dtype-u16")]
@@ -628,6 +639,8 @@ impl Buffer {
             Buffer::Int16(v) => v.finish().into_series(),
             Buffer::Int32(v) => v.finish().into_series(),
             Buffer::Int64(v) => v.finish().into_series(),
+            #[cfg(feature = "dtype-i128")]
+            Buffer::Int128(v) => v.finish().into_series(),
             #[cfg(feature = "dtype-u8")]
             Buffer::UInt8(v) => v.finish().into_series(),
             #[cfg(feature = "dtype-u16")]
@@ -701,6 +714,8 @@ impl Buffer {
             Buffer::Int16(v) => v.append_null(),
             Buffer::Int32(v) => v.append_null(),
             Buffer::Int64(v) => v.append_null(),
+            #[cfg(feature = "dtype-i128")]
+            Buffer::Int128(v) => v.append_null(),
             #[cfg(feature = "dtype-u8")]
             Buffer::UInt8(v) => v.append_null(),
             #[cfg(feature = "dtype-u16")]
@@ -745,6 +760,8 @@ impl Buffer {
             Buffer::Int16(_) => DataType::Int16,
             Buffer::Int32(_) => DataType::Int32,
             Buffer::Int64(_) => DataType::Int64,
+            #[cfg(feature = "dtype-i128")]
+            Buffer::Int128(_) => DataType::Int128,
             #[cfg(feature = "dtype-u8")]
             Buffer::UInt8(_) => DataType::UInt8,
             #[cfg(feature = "dtype-u16")]
@@ -824,6 +841,15 @@ impl Buffer {
                 missing_is_null,
                 None,
             ),
+            #[cfg(feature = "dtype-i128")]
+            Int128(buf) => <PrimitiveChunkedBuilder<Int128Type> as ParsedBuffer>::parse_bytes(
+                buf,
+                bytes,
+                ignore_errors,
+                needs_escaping,
+                missing_is_null,
+                None,
+            ),
             #[cfg(feature = "dtype-u8")]
             UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuffer>::parse_bytes(
                 buf,
diff --git a/crates/polars-io/src/csv/write/write_impl/serializer.rs b/crates/polars-io/src/csv/write/write_impl/serializer.rs
index 6a4f964d88b3..5229455022cc 100644
--- a/crates/polars-io/src/csv/write/write_impl/serializer.rs
+++ b/crates/polars-io/src/csv/write/write_impl/serializer.rs
@@ -535,6 +535,7 @@ pub(super) fn serializer_for<'a>(
         DataType::UInt32 => quote_if_always!(integer_serializer::<u32>),
         DataType::Int64 => quote_if_always!(integer_serializer::<i64>),
         DataType::UInt64 => quote_if_always!(integer_serializer::<u64>),
+        DataType::Int128 => quote_if_always!(integer_serializer::<i128>),
         DataType::Float32 => match options.float_precision {
             Some(precision) => match options.float_scientific {
                 Some(true) => {
diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml
index eab4dd534314..25a1bcbb0489 100644
--- a/crates/polars/Cargo.toml
+++ b/crates/polars/Cargo.toml
@@ -330,6 +330,7 @@ dtype-i16 = [
 ]
 dtype-i128 = [
   "polars-core/dtype-i128",
+  "polars-io/dtype-i128",
   "polars-lazy?/dtype-i128",
   "polars-ops/dtype-i128",
   "polars-time?/dtype-i128",
diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
index 43b05892d65d..8d258eb2ba5e 100644
--- a/py-polars/tests/unit/io/test_csv.py
+++ b/py-polars/tests/unit/io/test_csv.py
@@ -259,12 +259,12 @@ def test_csv_missing_utf8_is_empty_string() -> None:
 
 def test_csv_int_types() -> None:
     f = io.StringIO(
-        "u8,i8,u16,i16,u32,i32,u64,i64\n"
-        "0,0,0,0,0,0,0,0\n"
-        "0,-128,0,-32768,0,-2147483648,0,-9223372036854775808\n"
-        "255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807\n"
-        "01,01,01,01,01,01,01,01\n"
-        "01,-01,01,-01,01,-01,01,-01\n"
+        "u8,i8,u16,i16,u32,i32,u64,i64,i128\n"
+        "0,0,0,0,0,0,0,0,0\n"
+        "0,-128,0,-32768,0,-2147483648,0,-9223372036854775808,-170141183460469231731687303715884105728\n"
+        "255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807,170141183460469231731687303715884105727\n"
+        "01,01,01,01,01,01,01,01,01\n"
+        "01,-01,01,-01,01,-01,01,-01,01\n"
     )
     df = pl.read_csv(
         f,
@@ -277,6 +277,7 @@ def test_csv_int_types() -> None:
             "i32": pl.Int32,
             "u64": pl.UInt64,
             "i64": pl.Int64,
+            "i128": pl.Int128,
         },
     )
 
@@ -295,6 +296,16 @@ def test_csv_int_types() -> None:
                     [0, -9223372036854775808, 9223372036854775807, 1, -1],
                     dtype=pl.Int64,
                 ),
+                "i128": pl.Series(
+                    [
+                        0,
+                        -170141183460469231731687303715884105728,
+                        170141183460469231731687303715884105727,
+                        1,
+                        1,
+                    ],
+                    dtype=pl.Int128,
+                ),
             }
         ),
     )
diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py
index 84e6436cb10e..bdd7a47b38fe 100644
--- a/py-polars/tests/unit/io/test_ipc.py
+++ b/py-polars/tests/unit/io/test_ipc.py
@@ -95,7 +95,8 @@ def test_select_columns_from_buffer(stream: bool) -> None:
             "a": [1],
             "b": [2],
             "c": [3],
-        }
+        },
+        schema={"a": pl.Int64(), "b": pl.Int128(), "c": pl.UInt8()},
     )
 
     f = io.BytesIO()
@@ -109,7 +110,8 @@ def test_select_columns_from_buffer(stream: bool) -> None:
             "b": [2],
             "c": [3],
             "a": [1],
-        }
+        },
+        schema={"b": pl.Int128(), "c": pl.UInt8(), "a": pl.Int64()},
     )
     assert_frame_equal(expected, actual)
 
@@ -142,14 +144,33 @@ def test_compressed_simple(compression: IpcCompression, stream: bool) -> None:
 
 @pytest.mark.parametrize("compression", COMPRESSIONS)
 def test_ipc_schema(compression: IpcCompression) -> None:
-    df = pl.DataFrame({"a": [1, 2], "b": ["a", None], "c": [True, False]})
+    schema = {
+        "i64": pl.Int64(),
+        "i128": pl.Int128(),
+        "u8": pl.UInt8(),
+        "f32": pl.Float32(),
+        "f64": pl.Float64(),
+        "str": pl.String(),
+        "bool": pl.Boolean(),
+    }
+    df = pl.DataFrame(
+        {
+            "i64": [1, 2],
+            "i128": [1, 2],
+            "u8": [1, 2],
+            "f32": [1, 2],
+            "f64": [1, 2],
+            "str": ["a", None],
+            "bool": [True, False],
+        },
+        schema=schema,
+    )
 
     f = io.BytesIO()
     df.write_ipc(f, compression=compression)
     f.seek(0)
 
-    expected = {"a": pl.Int64(), "b": pl.String(), "c": pl.Boolean()}
-    assert pl.read_ipc_schema(f) == expected
+    assert pl.read_ipc_schema(f) == schema
 
 
 @pytest.mark.write_disk

From 58c17455390da12615e5a13f68ebc69d0d554eba Mon Sep 17 00:00:00 2001
From: Prathamesh Ghatole
 <77586602+Prathamesh-Ghatole@users.noreply.github.com>
Date: Fri, 3 Jan 2025 16:59:50 +0530
Subject: [PATCH 12/20] docs(python): Fix typo in `DataFrame.cast` (#20532)

---
 py-polars/polars/dataframe/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 21cddd52a81e..b08ce16ad91b 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -7937,7 +7937,7 @@ def cast(
             Mapping of column names (or selector) to dtypes, or a single dtype
             to which all columns will be cast.
         strict
-            Raise if cast is invalid on rows after predicates are pusded down.
+            Raise if cast is invalid on rows after predicates are pushed down.
             If `False`, invalid casts will produce null values.
 
         Examples

From ca36b66110e4c73b116c91d0f7d74c9fd0377ed4 Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Fri, 3 Jan 2025 13:39:50 +0100
Subject: [PATCH 13/20] fix: Revert categorical unique code (#20540)

---
 .../logical/categorical/ops/unique.rs         | 31 +++----------------
 py-polars/polars/_utils/various.py            |  1 +
 .../tests/unit/datatypes/test_categorical.py  | 20 ++++++++++++
 3 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
index 7792fae8a544..c46291e4382a 100644
--- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
+++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
@@ -1,6 +1,4 @@
-use polars_compute::unique::{
-    DictionaryRangedUniqueState, PrimitiveRangedUniqueState, RangedUniqueKernel,
-};
+use polars_compute::unique::{DictionaryRangedUniqueState, RangedUniqueKernel};
 
 use super::*;
 
@@ -43,32 +41,11 @@ impl CategoricalChunked {
                 Ok(out)
             }
         } else {
-            let has_nulls = (self.null_count() > 0) as u32;
-            let mut state = match cat_map.as_ref() {
-                RevMapping::Global(map, values, _) => {
-                    if self.is_enum() {
-                        PrimitiveRangedUniqueState::new(0, values.len() as u32 + has_nulls)
-                    } else {
-                        let mut min = u32::MAX;
-                        let mut max = 0u32;
-
-                        for &v in map.keys() {
-                            min = min.min(v);
-                            max = max.max(v);
-                        }
-
-                        PrimitiveRangedUniqueState::new(min, max + has_nulls)
-                    }
-                },
-                RevMapping::Local(values, _) => {
-                    PrimitiveRangedUniqueState::new(0, values.len() as u32 + has_nulls)
-                },
-            };
-
+            let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed());
             for chunk in self.physical().downcast_iter() {
-                state.append(chunk);
+                state.key_state().append(chunk);
             }
-            let unique = state.finalize_unique();
+            let (_, unique, _) = state.finalize_unique().take();
             let ca = unsafe {
                 UInt32Chunked::from_chunks_and_dtype_unchecked(
                     self.physical().name().clone(),
diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py
index 126929d6d627..3d9b58dddb7e 100644
--- a/py-polars/polars/_utils/various.py
+++ b/py-polars/polars/_utils/various.py
@@ -652,6 +652,7 @@ def re_escape(s: str) -> str:
     return re.sub(f"([{re_rust_metachars}])", r"\\\1", s)
 
 
+# Don't rename or move. This is used by polars cloud
 def display_dot_graph(
     *,
     dot: str,
diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py
index 64b789281a21..66aa7b2ba898 100644
--- a/py-polars/tests/unit/datatypes/test_categorical.py
+++ b/py-polars/tests/unit/datatypes/test_categorical.py
@@ -905,3 +905,23 @@ def test_categorical_unique() -> None:
     s = pl.Series(["a", "b", None], dtype=pl.Categorical)
     assert s.n_unique() == 3
     assert s.unique().to_list() == ["a", "b", None]
+
+
+@StringCache()
+def test_categorical_unique_20539() -> None:
+    df = pl.DataFrame({"number": [1, 1, 2, 2, 3], "letter": ["a", "b", "b", "c", "c"]})
+
+    result = (
+        df.cast({"letter": pl.Categorical})
+        .group_by("number")
+        .agg(
+            unique=pl.col("letter").unique(),
+            unique_with_order=pl.col("letter").unique(maintain_order=True),
+        )
+    )
+
+    assert result.sort("number").to_dict(as_series=False) == {
+        "number": [1, 2, 3],
+        "unique": [["a", "b"], ["b", "c"], ["c"]],
+        "unique_with_order": [["a", "b"], ["b", "c"], ["c"]],
+    }

From 9ce1c070dfae7ecef9664f902fce3697712f4186 Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Fri, 3 Jan 2025 16:12:43 +0100
Subject: [PATCH 14/20] ci: Report wheel sizes (#20541)

---
 .github/workflows/benchmark.yml | 59 ++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index afa9219231a3..34898e640697 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -73,6 +73,63 @@ jobs:
         working-directory: py-polars
         run: maturin develop --release -- -C codegen-units=8 -C lto=thin -C target-cpu=native
 
+      - name: Set wheel size
+        run: |
+          WHEEL_SIZE=$(ls -l py-polars/polars/polars*.so | awk '{ print $5 }')
+          echo "WHEEL_SIZE=$WHEEL_SIZE" >> $GITHUB_ENV
+
+      - name: Upload wheel sizes artifact (main only)
+        if: github.ref_name == 'main'
+        uses: actions/upload-artifact@v3
+        with:
+          name: wheel-size
+          path: |
+            echo "$GITHUB_RUN_ID $WHEEL_SIZE" > wheel_sizes.txt
+            wheel_sizes.txt
+
+      - name: Download main wheel size
+        uses: actions/download-artifact@v3
+        with:
+          name: wheel-size
+        continue-on-error: true 
+
+      - name: Extract previous wheel size
+        id: load_previous_size
+        run: |
+          if [[ -f wheel_sizes.txt ]]; then
+            PREVIOUS_WHEEL_SIZE=$(tail -n 1 wheel_sizes.txt | awk '{ print $2 }')
+            echo "PREVIOUS_WHEEL_SIZE=$PREVIOUS_WHEEL_SIZE" >> $GITHUB_ENV
+          else
+            echo "PREVIOUS_WHEEL_SIZE=Unknown" >> $GITHUB_ENV
+          fi
+
+      - name: Comment wheel size
+        uses: actions/github-script@v7
+        with:
+          script: |
+              const previousSize = process.env.PREVIOUS_WHEEL_SIZE || 'Unknown';
+              const currentSize = process.env.WHEEL_SIZE || 'Unknown';
+
+              // Convert to MB
+              const previousSizeMB = previousSize !== 'Unknown' ? (previousSize / 1024 / 1024).toFixed(4) : 'Unknown';
+              const currentSizeMB = currentSize !== 'Unknown' ? (currentSize / 1024 / 1024).toFixed(4) : 'Unknown';
+
+              let commentBody = `The previous wheel size was **${previousSizeMB} MB**.\nThe current wheel size after this PR is **${currentSizeMB} MB**.`;
+
+              // Calculate percentage increase if both sizes are available
+              if (previousSize !== 'Unknown' && currentSize !== '') {
+                const increase = ((currentSize - previousSize) / previousSize) * 100;
+                commentBody += `\nThis represents a **${increase.toFixed(2)}% increase** in size.`;
+              }
+
+              github.rest.issues.createComment({
+                issue_number: context.issue.number,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: commentBody
+              });
+
+
       - name: Run benchmark tests
         uses: CodSpeedHQ/action@v3
         with:
@@ -87,4 +144,4 @@ jobs:
         working-directory: py-polars
         env:
           POLARS_AUTO_NEW_STREAMING: 1
-        run: pytest -n auto --dist loadgroup -m "not may_fail_auto_streaming and not slow and not write_disk and not release and not docs and not hypothesis and not benchmark and not ci_only"
\ No newline at end of file
+        run: pytest -n auto --dist loadgroup -m "not may_fail_auto_streaming and not slow and not write_disk and not release and not docs and not hypothesis and not benchmark and not ci_only"

From 7c64640ab4ce2b669b3af15fb20793b910f91c1a Mon Sep 17 00:00:00 2001
From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com>
Date: Fri, 3 Jan 2025 11:56:57 -0500
Subject: [PATCH 15/20] fix: Update eager join doctest on multiple columns
 (#20542)

---
 crates/polars/src/docs/eager.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/polars/src/docs/eager.rs b/crates/polars/src/docs/eager.rs
index d9b00886d5af..f15a2d9f1a15 100644
--- a/crates/polars/src/docs/eager.rs
+++ b/crates/polars/src/docs/eager.rs
@@ -407,7 +407,7 @@
 //! temp.full_join(&rain, ["days"], ["days"]);
 //!
 //! // join on multiple columns
-//! temp.join(&rain, vec!["days", "other"], vec!["days", "other"], JoinArgs::new(JoinType::Left));
+//! temp.join(&rain, vec!["days", "other"], vec!["days", "other"], JoinArgs::new(JoinType::Left), None);
 //!
 //! # Ok(())
 //! # }

From 15175999972720195b41205a5c30e42d824055ff Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Fri, 3 Jan 2025 18:32:07 +0100
Subject: [PATCH 16/20] fix: Fix more global categorical issues (#20547)

---
 .github/workflows/benchmark.yml               |  2 +-
 .../chunked_array/comparison/categorical.rs   | 30 ++++++++++++++-----
 .../logical/categorical/ops/unique.rs         | 22 ++------------
 .../tests/unit/datatypes/test_categorical.py  | 21 ++++++++++++-
 4 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 34898e640697..49ef91de9be4 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -114,7 +114,7 @@ jobs:
               const previousSizeMB = previousSize !== 'Unknown' ? (previousSize / 1024 / 1024).toFixed(4) : 'Unknown';
               const currentSizeMB = currentSize !== 'Unknown' ? (currentSize / 1024 / 1024).toFixed(4) : 'Unknown';
 
-              let commentBody = `The previous wheel size was **${previousSizeMB} MB**.\nThe current wheel size after this PR is **${currentSizeMB} MB**.`;
+              let commentBody = `The uncompressed binary size was **${previousSizeMB} MB**.\nThe uncompressed binary size after this PR is **${currentSizeMB} MB**.`;
 
               // Calculate percentage increase if both sizes are available
               if (previousSize !== 'Unknown' && currentSize !== '') {
diff --git a/crates/polars-core/src/chunked_array/comparison/categorical.rs b/crates/polars-core/src/chunked_array/comparison/categorical.rs
index bbcd6b6047c9..09573c5fbd32 100644
--- a/crates/polars-core/src/chunked_array/comparison/categorical.rs
+++ b/crates/polars-core/src/chunked_array/comparison/categorical.rs
@@ -374,13 +374,29 @@ where
         // Apply comparison on categories map and then do a lookup
         let bitmap = str_single_compare_function(lhs.get_rev_map().get_categories(), rhs);
 
-        Ok(
-            BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(|opt_idx| {
-                // SAFETY: indexing into bitmap with same length as original array
-                opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) })
-            }))
-            .with_name(lhs.name().clone()),
-        )
+        let mask = match lhs.get_rev_map().as_ref() {
+            RevMapping::Local(_, _) => {
+                BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(
+                    |opt_idx| {
+                        // SAFETY: indexing into bitmap with same length as original array
+                        opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) })
+                    },
+                ))
+            },
+            RevMapping::Global(idx_map, _, _) => {
+                BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(
+                    |opt_idx| {
+                        // SAFETY: indexing into bitmap with same length as original array
+                        opt_idx.map(|idx| unsafe {
+                            let idx = *idx_map.get(&idx).unwrap();
+                            bitmap.get_bit_unchecked(idx as usize)
+                        })
+                    },
+                ))
+            },
+        };
+
+        Ok(mask.with_name(lhs.name().clone()))
     }
 }
 
diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
index c46291e4382a..17752f828d8d 100644
--- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
+++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs
@@ -1,5 +1,3 @@
-use polars_compute::unique::{DictionaryRangedUniqueState, RangedUniqueKernel};
-
 use super::*;
 
 impl CategoricalChunked {
@@ -41,18 +39,7 @@ impl CategoricalChunked {
                 Ok(out)
             }
         } else {
-            let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed());
-            for chunk in self.physical().downcast_iter() {
-                state.key_state().append(chunk);
-            }
-            let (_, unique, _) = state.finalize_unique().take();
-            let ca = unsafe {
-                UInt32Chunked::from_chunks_and_dtype_unchecked(
-                    self.physical().name().clone(),
-                    vec![unique.to_boxed()],
-                    DataType::UInt32,
-                )
-            };
+            let ca = self.physical().unique()?;
             // SAFETY:
             // we only removed some indexes so we are still in bounds
             unsafe {
@@ -70,12 +57,7 @@ impl CategoricalChunked {
         if self._can_fast_unique() {
             Ok(self.get_rev_map().len())
         } else {
-            let cat_map = self.get_rev_map();
-            let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed());
-            for chunk in self.physical().downcast_iter() {
-                state.key_state().append(chunk);
-            }
-            Ok(state.finalize_n_unique())
+            self.physical().n_unique()
         }
     }
 
diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py
index 66aa7b2ba898..d5416bc47a93 100644
--- a/py-polars/tests/unit/datatypes/test_categorical.py
+++ b/py-polars/tests/unit/datatypes/test_categorical.py
@@ -904,7 +904,7 @@ def test_perfect_group_by_19950() -> None:
 def test_categorical_unique() -> None:
     s = pl.Series(["a", "b", None], dtype=pl.Categorical)
     assert s.n_unique() == 3
-    assert s.unique().to_list() == ["a", "b", None]
+    assert s.unique().sort().to_list() == [None, "a", "b"]
 
 
 @StringCache()
@@ -925,3 +925,22 @@ def test_categorical_unique_20539() -> None:
         "unique": [["a", "b"], ["b", "c"], ["c"]],
         "unique_with_order": [["a", "b"], ["b", "c"], ["c"]],
     }
+
+
+@StringCache()
+@pytest.mark.may_fail_auto_streaming
+def test_categorical_prefill() -> None:
+    # https://github.com/pola-rs/polars/pull/20547#issuecomment-2569473443
+    # prefill cache
+    pl.Series(["aaa", "bbb", "ccc"], dtype=pl.Categorical)  # pre-fill cache
+
+    # test_compare_categorical_single
+    assert (pl.Series(["a"], dtype=pl.Categorical) < "a").to_list() == [False]
+
+    # test_unique_categorical
+    a = pl.Series(["a"], dtype=pl.Categorical)
+    assert a.unique().to_list() == ["a"]
+
+    s = pl.Series(["1", "2", "3"], dtype=pl.Categorical)
+    s = s.filter([True, False, True])
+    assert s.n_unique() == 2

From 409f09158bf95bbda78dfaa2624d59adcdcd3412 Mon Sep 17 00:00:00 2001
From: Marshall <mcrumiller@users.noreply.github.com>
Date: Fri, 3 Jan 2025 13:21:40 -0500
Subject: [PATCH 17/20] chore: Increase categorical test coverage (#20514)

---
 py-polars/tests/unit/conftest.py              | 22 ++++++
 .../constructors/test_any_value_fallbacks.py  | 24 +++---
 .../tests/unit/datatypes/test_categorical.py  | 42 +++++++---
 py-polars/tests/unit/datatypes/test_list.py   |  1 +
 .../functions/as_datatype/test_concat_list.py |  1 +
 .../unit/interchange/test_from_dataframe.py   |  1 +
 py-polars/tests/unit/io/test_delta.py         |  1 +
 py-polars/tests/unit/io/test_lazy_parquet.py  |  2 +
 py-polars/tests/unit/io/test_other.py         |  1 +
 py-polars/tests/unit/io/test_parquet.py       |  1 +
 .../operations/namespaces/test_categorical.py | 78 +++++++------------
 .../tests/unit/operations/test_filter.py      |  1 +
 .../tests/unit/operations/test_group_by.py    |  1 +
 .../unit/operations/unique/test_unique.py     |  8 +-
 py-polars/tests/unit/series/test_series.py    |  1 +
 15 files changed, 114 insertions(+), 71 deletions(-)

diff --git a/py-polars/tests/unit/conftest.py b/py-polars/tests/unit/conftest.py
index 825dc161cdbf..8197872831b5 100644
--- a/py-polars/tests/unit/conftest.py
+++ b/py-polars/tests/unit/conftest.py
@@ -16,6 +16,9 @@
 
 if TYPE_CHECKING:
     from collections.abc import Generator
+    from typing import Any
+
+    FixtureRequest = Any
 
 load_profile(
     profile=os.environ.get("POLARS_HYPOTHESIS_PROFILE", "fast"),  # type: ignore[arg-type]
@@ -229,3 +232,22 @@ def memory_usage_without_pyarrow() -> Generator[MemoryUsage, Any, Any]:
         yield MemoryUsage()
     finally:
         tracemalloc.stop()
+
+
+@pytest.fixture(params=[True, False])
+def test_global_and_local(
+    request: FixtureRequest,
+) -> Generator[Any, Any, Any]:
+    """
+    Setup fixture which runs each test with and without global string cache.
+
+    Usage: @pytest.mark.usefixtures("test_global_and_local")
+    """
+    use_global = request.param
+    if use_global:
+        with pl.StringCache():
+            # Pre-fill some global items to ensure physical repr isn't 0..n.
+            pl.Series(["eapioejf", "2m4lmv", "3v3v9dlf"], dtype=pl.Categorical)
+            yield
+    else:
+        yield
diff --git a/py-polars/tests/unit/constructors/test_any_value_fallbacks.py b/py-polars/tests/unit/constructors/test_any_value_fallbacks.py
index f3584b85d533..490515b89844 100644
--- a/py-polars/tests/unit/constructors/test_any_value_fallbacks.py
+++ b/py-polars/tests/unit/constructors/test_any_value_fallbacks.py
@@ -398,16 +398,16 @@ def test_fallback_with_dtype_strict_failure_decimal_precision() -> None:
         PySeries.new_from_any_values_and_dtype("", values, dtype, strict=True)
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_lit_18874() -> None:
-    with pl.StringCache():
-        assert_frame_equal(
-            pl.DataFrame(
-                {"a": [1, 2, 3]},
-            ).with_columns(b=pl.lit("foo").cast(pl.Categorical)),
-            pl.DataFrame(
-                [
-                    pl.Series("a", [1, 2, 3]),
-                    pl.Series("b", ["foo"] * 3, pl.Categorical),
-                ]
-            ),
-        )
+    assert_frame_equal(
+        pl.DataFrame(
+            {"a": [1, 2, 3]},
+        ).with_columns(b=pl.lit("foo").cast(pl.Categorical)),
+        pl.DataFrame(
+            [
+                pl.Series("a", [1, 2, 3]),
+                pl.Series("b", ["foo"] * 3, pl.Categorical),
+            ]
+        ),
+    )
diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py
index d5416bc47a93..1b37763c0d08 100644
--- a/py-polars/tests/unit/datatypes/test_categorical.py
+++ b/py-polars/tests/unit/datatypes/test_categorical.py
@@ -72,6 +72,7 @@ def test_categorical_full_outer_join() -> None:
     assert df["key_right"].cast(pl.String).to_list() == ["bar", "baz", None]
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_read_csv_categorical() -> None:
     f = io.BytesIO()
     f.write(b"col1,col2,col3,col4,col5,col6\n'foo',2,3,4,5,6\n'bar',8,9,10,11,12")
@@ -80,6 +81,7 @@ def test_read_csv_categorical() -> None:
     assert df["col1"].dtype == pl.Categorical
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_cat_to_dummies() -> None:
     df = pl.DataFrame({"foo": [1, 2, 3, 4], "bar": ["a", "b", "a", "c"]})
     df = df.with_columns(pl.col("bar").cast(pl.Categorical))
@@ -94,7 +96,7 @@ def test_cat_to_dummies() -> None:
     }
 
 
-@StringCache()
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_is_in_list() -> None:
     # this requires type coercion to cast.
     # we should not cast within the function as this would be expensive within a
@@ -110,7 +112,7 @@ def test_categorical_is_in_list() -> None:
     }
 
 
-@StringCache()
+@pytest.mark.usefixtures("test_global_and_local")
 def test_unset_sorted_on_append() -> None:
     df1 = pl.DataFrame(
         [
@@ -137,6 +139,7 @@ def test_unset_sorted_on_append() -> None:
         (pl.Series.eq_missing, pl.Series([True, True, True, False, False, False])),
     ],
 )
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_equality(
     op: Callable[[pl.Series, pl.Series], pl.Series], expected: pl.Series
 ) -> None:
@@ -272,6 +275,7 @@ def test_categorical_global_ordering_broadcast_lhs(
         (operator.gt, pl.Series([False, False, False, True, False, False])),
     ],
 )
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_ordering(
     op: Callable[[pl.Series, pl.Series], pl.Series], expected: pl.Series
 ) -> None:
@@ -289,6 +293,7 @@ def test_categorical_ordering(
         (operator.gt, pl.Series([None, False, False, False, False, False])),
     ],
 )
+@pytest.mark.usefixtures("test_global_and_local")
 def test_compare_categorical(
     op: Callable[[pl.Series, pl.Series], pl.Series], expected: pl.Series
 ) -> None:
@@ -311,6 +316,7 @@ def test_compare_categorical(
         (pl.Series.ne_missing, pl.Series([True, True, False, True, False, True])),
     ],
 )
+@pytest.mark.usefixtures("test_global_and_local")
 def test_compare_categorical_single(
     op: Callable[[pl.Series, pl.Series], pl.Series], expected: pl.Series
 ) -> None:
@@ -400,6 +406,7 @@ def test_categorical_error_on_local_cmp() -> None:
         df_cat.filter(pl.col("a_cat") == pl.col("b_cat"))
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_cast_null_to_categorical() -> None:
     assert pl.DataFrame().with_columns(
         pl.lit(None).cast(pl.Categorical).alias("nullable_enum")
@@ -454,6 +461,7 @@ def create_lazy(data: dict) -> pl.LazyFrame:  # type: ignore[type-arg]
     assert pl.using_string_cache() is False
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_in_struct_nulls() -> None:
     s = pl.Series(
         "job", ["doctor", "waiter", None, None, None, "doctor"], pl.Categorical
@@ -466,6 +474,7 @@ def test_categorical_in_struct_nulls() -> None:
     assert s[2] == {"job": "waiter", "count": 1}
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_cast_inner_categorical() -> None:
     dtype = pl.List(pl.Categorical)
     out = pl.Series("foo", [["a"], ["a", "b"]]).cast(dtype)
@@ -501,6 +510,7 @@ def test_stringcache() -> None:
         (pl.Categorical("lexical"), ["bar", "baz", "foo"]),
     ],
 )
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_sort_order_by_parameter(
     dtype: PolarsDataType, outcome: list[str]
 ) -> None:
@@ -557,12 +567,14 @@ def test_err_on_categorical_asof_join_by_arg() -> None:
         df1.join_asof(df2, on=pl.col("time").set_sorted(), by="cat")
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_list_get_item() -> None:
     out = pl.Series([["a"]]).cast(pl.List(pl.Categorical)).item()
     assert isinstance(out, pl.Series)
     assert out.dtype == pl.Categorical
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_nested_categorical_aggregation_7848() -> None:
     # a double categorical aggregation
     assert pl.DataFrame(
@@ -580,6 +592,7 @@ def test_nested_categorical_aggregation_7848() -> None:
     }
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_nested_categorical_cast() -> None:
     values = [["x"], ["y"], ["x"]]
     dtype = pl.List(pl.Categorical)
@@ -588,6 +601,7 @@ def test_nested_categorical_cast() -> None:
     assert s.to_list() == values
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_struct_categorical_nesting() -> None:
     # this triggers a lot of materialization
     df = pl.DataFrame(
@@ -610,7 +624,7 @@ def test_categorical_fill_null_existing_category() -> None:
     assert result.to_dict(as_series=False) == expected
 
 
-@StringCache()
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_fill_null_stringcache() -> None:
     df = pl.LazyFrame(
         {"index": [1, 2, 3], "cat": ["a", "b", None]},
@@ -622,6 +636,7 @@ def test_categorical_fill_null_stringcache() -> None:
     assert a.dtypes == [pl.Categorical]
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_fast_unique_flag_from_arrow() -> None:
     df = pl.DataFrame(
         {
@@ -633,6 +648,7 @@ def test_fast_unique_flag_from_arrow() -> None:
     assert pl.from_arrow(filtered).select(pl.col("colB").n_unique()).item() == 4  # type: ignore[union-attr]
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_construct_with_null() -> None:
     # Example from https://github.com/pola-rs/polars/issues/7188
     df = pl.from_dicts([{"A": None}, {"A": "foo"}], schema={"A": pl.Categorical})
@@ -663,6 +679,7 @@ def test_list_builder_different_categorical_rev_maps() -> None:
     }
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_collect_11408() -> None:
     df = pl.DataFrame(
         data={"groups": ["a", "b", "c"], "cats": ["a", "b", "c"], "amount": [1, 2, 3]},
@@ -677,6 +694,7 @@ def test_categorical_collect_11408() -> None:
     }
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_nested_cast_unchecked() -> None:
     s = pl.Series("cat", [["cat"]]).cast(pl.List(pl.Categorical))
     assert pl.Series([s]).to_list() == [[["cat"]]]
@@ -751,6 +769,7 @@ def test_categorical_vstack_with_local_different_rev_map() -> None:
     assert df3.get_column("a").cast(pl.UInt32).to_list() == [0, 1, 2, 3, 4, 5]
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_shift_over_13041() -> None:
     df = pl.DataFrame(
         {
@@ -768,6 +787,7 @@ def test_shift_over_13041() -> None:
 
 @pytest.mark.parametrize("context", [pl.StringCache(), contextlib.nullcontext()])
 @pytest.mark.parametrize("ordering", ["physical", "lexical"])
+@pytest.mark.usefixtures("test_global_and_local")
 def test_sort_categorical_retain_none(
     context: contextlib.AbstractContextManager,  # type: ignore[type-arg]
     ordering: Literal["physical", "lexical"],
@@ -799,6 +819,7 @@ def test_sort_categorical_retain_none(
             ]
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_cast_from_cat_to_numeric() -> None:
     cat_series = pl.Series(
         "cat_series",
@@ -811,12 +832,14 @@ def test_cast_from_cat_to_numeric() -> None:
     assert s.cast(pl.UInt8).sum() == 6
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_cat_preserve_lexical_ordering_on_clear() -> None:
     s = pl.Series("a", ["a", "b"], dtype=pl.Categorical(ordering="lexical"))
     s2 = s.clear()
     assert s.dtype == s2.dtype
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_cat_preserve_lexical_ordering_on_concat() -> None:
     dtype = pl.Categorical(ordering="lexical")
 
@@ -827,6 +850,7 @@ def test_cat_preserve_lexical_ordering_on_concat() -> None:
 
 # TODO: Bug see: https://github.com/pola-rs/polars/issues/20440
 @pytest.mark.may_fail_auto_streaming
+@pytest.mark.usefixtures("test_global_and_local")
 def test_cat_append_lexical_sorted_flag() -> None:
     df = pl.DataFrame({"x": [0, 1, 1], "y": ["B", "B", "A"]}).with_columns(
         pl.col("y").cast(pl.Categorical(ordering="lexical"))
@@ -845,6 +869,7 @@ def test_cat_append_lexical_sorted_flag() -> None:
     assert not (s1.is_sorted())
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_get_cat_categories_multiple_chunks() -> None:
     df = pl.DataFrame(
         [
@@ -877,6 +902,7 @@ def test_nested_categorical_concat(
         pl.concat([a, b])
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_perfect_group_by_19452() -> None:
     n = 40
     df2 = pl.DataFrame(
@@ -889,6 +915,7 @@ def test_perfect_group_by_19452() -> None:
     assert df2.with_columns(a=(pl.col("b")).over(pl.col("a")))["a"].is_sorted()
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_perfect_group_by_19950() -> None:
     dtype = pl.Enum(categories=["a", "b", "c"])
 
@@ -900,14 +927,14 @@ def test_perfect_group_by_19950() -> None:
     }
 
 
-@StringCache()
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_unique() -> None:
     s = pl.Series(["a", "b", None], dtype=pl.Categorical)
     assert s.n_unique() == 3
     assert s.unique().sort().to_list() == [None, "a", "b"]
 
 
-@StringCache()
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_unique_20539() -> None:
     df = pl.DataFrame({"number": [1, 1, 2, 2, 3], "letter": ["a", "b", "b", "c", "c"]})
 
@@ -927,13 +954,10 @@ def test_categorical_unique_20539() -> None:
     }
 
 
-@StringCache()
 @pytest.mark.may_fail_auto_streaming
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_prefill() -> None:
     # https://github.com/pola-rs/polars/pull/20547#issuecomment-2569473443
-    # prefill cache
-    pl.Series(["aaa", "bbb", "ccc"], dtype=pl.Categorical)  # pre-fill cache
-
     # test_compare_categorical_single
     assert (pl.Series(["a"], dtype=pl.Categorical) < "a").to_list() == [False]
 
diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py
index 54b97fdef9fc..53c401ec110e 100644
--- a/py-polars/tests/unit/datatypes/test_list.py
+++ b/py-polars/tests/unit/datatypes/test_list.py
@@ -65,6 +65,7 @@ def test_dtype() -> None:
     ]
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical() -> None:
     # https://github.com/pola-rs/polars/issues/2038
     df = pl.DataFrame(
diff --git a/py-polars/tests/unit/functions/as_datatype/test_concat_list.py b/py-polars/tests/unit/functions/as_datatype/test_concat_list.py
index b0f70edbf51d..3ad6173ec176 100644
--- a/py-polars/tests/unit/functions/as_datatype/test_concat_list.py
+++ b/py-polars/tests/unit/functions/as_datatype/test_concat_list.py
@@ -91,6 +91,7 @@ def test_list_concat_supertype() -> None:
     ].to_list() == [[1, 10000], [2, 20000]]
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_list_concat_4762() -> None:
     df = pl.DataFrame({"x": "a"})
     expected = {"x": [["a", "a"]]}
diff --git a/py-polars/tests/unit/interchange/test_from_dataframe.py b/py-polars/tests/unit/interchange/test_from_dataframe.py
index 35fcc595451a..c9864d481d90 100644
--- a/py-polars/tests/unit/interchange/test_from_dataframe.py
+++ b/py-polars/tests/unit/interchange/test_from_dataframe.py
@@ -334,6 +334,7 @@ def test_string_column_to_series_no_offsets() -> None:
         _string_column_to_series(col, allow_copy=True)
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_column_to_series_non_dictionary() -> None:
     s = pl.Series(["a", "b", None, "a"], dtype=pl.Categorical)
 
diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py
index 46931906ab3c..04361adf22d1 100644
--- a/py-polars/tests/unit/io/test_delta.py
+++ b/py-polars/tests/unit/io/test_delta.py
@@ -474,6 +474,7 @@ def test_unsupported_dtypes(tmp_path: Path) -> None:
     reason="upstream bug in delta-rs causing categorical to be written as categorical in parquet"
 )
 @pytest.mark.write_disk
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_becomes_string(tmp_path: Path) -> None:
     df = pl.DataFrame({"a": ["A", "B", "A"]}, schema={"a": pl.Categorical})
     df.write_delta(tmp_path)
diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py
index 78ffb6b1379b..5a82a5304a9d 100644
--- a/py-polars/tests/unit/io/test_lazy_parquet.py
+++ b/py-polars/tests/unit/io/test_lazy_parquet.py
@@ -66,6 +66,7 @@ def test_row_index_len_16543(foods_parquet_path: Path) -> None:
 
 
 @pytest.mark.write_disk
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_parquet_statistics(tmp_path: Path) -> None:
     tmp_path.mkdir(exist_ok=True)
 
@@ -281,6 +282,7 @@ def test_parquet_statistics(monkeypatch: Any, capfd: Any, tmp_path: Path) -> Non
 
 
 @pytest.mark.write_disk
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical(tmp_path: Path) -> None:
     tmp_path.mkdir(exist_ok=True)
 
diff --git a/py-polars/tests/unit/io/test_other.py b/py-polars/tests/unit/io/test_other.py
index 4c08250838d8..20e71c36e5a1 100644
--- a/py-polars/tests/unit/io/test_other.py
+++ b/py-polars/tests/unit/io/test_other.py
@@ -84,6 +84,7 @@ def test_copy() -> None:
     assert_series_equal(copy.deepcopy(a), a)
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_round_trip() -> None:
     df = pl.DataFrame({"ints": [1, 2, 3], "cat": ["a", "b", "c"]})
     df = df.with_columns(pl.col("cat").cast(pl.Categorical))
diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py
index c70f2cfb2031..1fcbc193e933 100644
--- a/py-polars/tests/unit/io/test_parquet.py
+++ b/py-polars/tests/unit/io/test_parquet.py
@@ -2433,6 +2433,7 @@ def test_dict_masked(
     )
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_sliced_20017() -> None:
     f = io.BytesIO()
     df = (
diff --git a/py-polars/tests/unit/operations/namespaces/test_categorical.py b/py-polars/tests/unit/operations/namespaces/test_categorical.py
index 9f60ff4f7be9..bc596c71794f 100644
--- a/py-polars/tests/unit/operations/namespaces/test_categorical.py
+++ b/py-polars/tests/unit/operations/namespaces/test_categorical.py
@@ -1,36 +1,14 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
-
 import pytest
 
 import polars as pl
 from polars.testing import assert_frame_equal, assert_series_equal
 
-if TYPE_CHECKING:
-    from collections.abc import Generator
-    from typing import Any
-
-    FixtureRequest = Any
-
-
-@pytest.fixture(params=[True, False])
-def test_global_and_local(
-    request: FixtureRequest,
-) -> Generator[Any, Any, Any]:
-    """Setup fixture which runs each test with and without global string cache."""
-    use_global = request.param
-    if use_global:
-        with pl.StringCache():
-            # Pre-fill some global items to ensure physical repr isn't 0..n.
-            pl.Series(["a", "b", "c"], dtype=pl.Categorical)
-            yield
-    else:
-        yield
-
 
 # @TODO: Bug, see https://github.com/pola-rs/polars/issues/20440
 @pytest.mark.may_fail_auto_streaming
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_lexical_sort() -> None:
     df = pl.DataFrame(
         {"cats": ["z", "z", "k", "a", "b"], "vals": [3, 1, 2, 2, 3]}
@@ -66,50 +44,51 @@ def test_categorical_lexical_sort() -> None:
     ]
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_lexical_ordering_after_concat() -> None:
-    with pl.StringCache():
-        ldf1 = (
-            pl.DataFrame([pl.Series("key1", [8, 5]), pl.Series("key2", ["fox", "baz"])])
-            .lazy()
-            .with_columns(pl.col("key2").cast(pl.Categorical("lexical")))
-        )
-        ldf2 = (
-            pl.DataFrame(
-                [pl.Series("key1", [6, 8, 6]), pl.Series("key2", ["fox", "foo", "bar"])]
-            )
-            .lazy()
-            .with_columns(pl.col("key2").cast(pl.Categorical("lexical")))
+    ldf1 = (
+        pl.DataFrame([pl.Series("key1", [8, 5]), pl.Series("key2", ["fox", "baz"])])
+        .lazy()
+        .with_columns(pl.col("key2").cast(pl.Categorical("lexical")))
+    )
+    ldf2 = (
+        pl.DataFrame(
+            [pl.Series("key1", [6, 8, 6]), pl.Series("key2", ["fox", "foo", "bar"])]
         )
-        df = pl.concat([ldf1, ldf2]).select(pl.col("key2")).collect()
+        .lazy()
+        .with_columns(pl.col("key2").cast(pl.Categorical("lexical")))
+    )
+    df = pl.concat([ldf1, ldf2]).select(pl.col("key2")).collect()
 
-        assert df.sort("key2").to_dict(as_series=False) == {
-            "key2": ["bar", "baz", "foo", "fox", "fox"]
-        }
+    assert df.sort("key2").to_dict(as_series=False) == {
+        "key2": ["bar", "baz", "foo", "fox", "fox"]
+    }
 
 
 @pytest.mark.may_fail_auto_streaming
+@pytest.mark.usefixtures("test_global_and_local")
 def test_sort_categoricals_6014_internal() -> None:
-    with pl.StringCache():
-        # create basic categorical
-        df = pl.DataFrame({"key": ["bbb", "aaa", "ccc"]}).with_columns(
-            pl.col("key").cast(pl.Categorical)
-        )
+    # create basic categorical
+    df = pl.DataFrame({"key": ["bbb", "aaa", "ccc"]}).with_columns(
+        pl.col("key").cast(pl.Categorical)
+    )
 
     out = df.sort("key")
     assert out.to_dict(as_series=False) == {"key": ["bbb", "aaa", "ccc"]}
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_sort_categoricals_6014_lexical() -> None:
-    with pl.StringCache():
-        # create lexically-ordered categorical
-        df = pl.DataFrame({"key": ["bbb", "aaa", "ccc"]}).with_columns(
-            pl.col("key").cast(pl.Categorical("lexical"))
-        )
+    # create lexically-ordered categorical
+    df = pl.DataFrame({"key": ["bbb", "aaa", "ccc"]}).with_columns(
+        pl.col("key").cast(pl.Categorical("lexical"))
+    )
 
     out = df.sort("key")
     assert out.to_dict(as_series=False) == {"key": ["aaa", "bbb", "ccc"]}
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_get_categories() -> None:
     assert pl.Series(
         "cats", ["foo", "bar", "foo", "foo", "ham"], dtype=pl.Categorical
@@ -166,6 +145,7 @@ def test_cat_is_local() -> None:
     assert not s2.cat.is_local()
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_cat_uses_lexical_ordering() -> None:
     s = pl.Series(["a", "b", None, "b"]).cast(pl.Categorical)
     assert s.cat.uses_lexical_ordering() is False
diff --git a/py-polars/tests/unit/operations/test_filter.py b/py-polars/tests/unit/operations/test_filter.py
index d49c99f5999a..08f5b13bc0ab 100644
--- a/py-polars/tests/unit/operations/test_filter.py
+++ b/py-polars/tests/unit/operations/test_filter.py
@@ -153,6 +153,7 @@ def test_binary_simplification_5971() -> None:
     ]
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_string_comparison_6283() -> None:
     scores = pl.DataFrame(
         {
diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py
index 924781a10bef..39bb2df449f5 100644
--- a/py-polars/tests/unit/operations/test_group_by.py
+++ b/py-polars/tests/unit/operations/test_group_by.py
@@ -925,6 +925,7 @@ def test_group_by_multiple_null_cols_15623() -> None:
 
 
 @pytest.mark.release
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_vs_str_group_by() -> None:
     # this triggers the perfect hash table
     s = pl.Series("a", np.random.randint(0, 50, 100))
diff --git a/py-polars/tests/unit/operations/unique/test_unique.py b/py-polars/tests/unit/operations/unique/test_unique.py
index ff4a0cd10f32..d9fbda1f843b 100644
--- a/py-polars/tests/unit/operations/unique/test_unique.py
+++ b/py-polars/tests/unit/operations/unique/test_unique.py
@@ -144,15 +144,20 @@ def test_unique_null() -> None:
     [
         ([], []),
         (["a", "b", "b", "c"], ["a", "b", "c"]),
-        (["a", "b", "b", None], ["a", "b", None]),
+        ([None, "a", "b", "b"], [None, "a", "b"]),
     ],
 )
+@pytest.mark.usefixtures("test_global_and_local")
 def test_unique_categorical(input: list[str | None], output: list[str | None]) -> None:
     s = pl.Series(input, dtype=pl.Categorical)
     result = s.unique(maintain_order=True)
     expected = pl.Series(output, dtype=pl.Categorical)
     assert_series_equal(result, expected)
 
+    result = s.unique(maintain_order=False).sort()
+    expected = pl.Series(output, dtype=pl.Categorical)
+    assert_series_equal(result, expected)
+
 
 def test_unique_categorical_global() -> None:
     with pl.StringCache():
@@ -206,6 +211,7 @@ def test_unique_with_bad_subset(
         df.unique(subset=subset)
 
 
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_unique_19409() -> None:
     df = pl.DataFrame({"x": [str(n % 50) for n in range(127)]}).cast(pl.Categorical)
     uniq = df.unique()
diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py
index 68b2528d5da6..bd8baa002cf9 100644
--- a/py-polars/tests/unit/series/test_series.py
+++ b/py-polars/tests/unit/series/test_series.py
@@ -356,6 +356,7 @@ def test_date_agg() -> None:
         (pl.Series(["c", "b", "a"], dtype=pl.Enum(["c", "b", "a", "d"])), "c", "a"),
     ],
 )
+@pytest.mark.usefixtures("test_global_and_local")
 def test_categorical_agg(s: pl.Series, min: str | None, max: str | None) -> None:
     assert s.min() == min
     assert s.max() == max

From da0b58936ee544c92859ea77df39c6a21370dbe9 Mon Sep 17 00:00:00 2001
From: Marshall <mcrumiller@users.noreply.github.com>
Date: Fri, 3 Jan 2025 14:10:11 -0500
Subject: [PATCH 18/20] fix: Output index type instead of u32 for
 `sum_horizontal` with boolean inputs (#20531)

---
 .../polars-ops/src/series/ops/horizontal.rs   |  6 +--
 .../src/dsl/function_expr/schema.rs           |  7 ++-
 .../operations/aggregation/test_horizontal.py | 44 ++++++++++++++++---
 3 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/crates/polars-ops/src/series/ops/horizontal.rs b/crates/polars-ops/src/series/ops/horizontal.rs
index 025779e77349..6a6960480c47 100644
--- a/crates/polars-ops/src/series/ops/horizontal.rs
+++ b/crates/polars-ops/src/series/ops/horizontal.rs
@@ -221,9 +221,9 @@ pub fn sum_horizontal(
 
     // If we have any null columns and null strategy is not `Ignore`, we can return immediately.
     if !ignore_nulls && non_null_cols.len() < columns.len() {
-        // We must first determine the correct return dtype.
+        // We must determine the correct return dtype.
         let return_dtype = match dtypes_to_supertype(non_null_cols.iter().map(|c| c.dtype()))? {
-            DataType::Boolean => DataType::UInt32,
+            DataType::Boolean => IDX_DTYPE,
             dt => dt,
         };
         return Ok(Some(Column::full_null(
@@ -244,7 +244,7 @@ pub fn sum_horizontal(
         },
         1 => Ok(Some(
             apply_null_strategy(if non_null_cols[0].dtype() == &DataType::Boolean {
-                non_null_cols[0].cast(&DataType::UInt32)?
+                non_null_cols[0].cast(&IDX_DTYPE)?
             } else {
                 non_null_cols[0].clone()
             })?
diff --git a/crates/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs
index d45f75c01e9d..beaacac49942 100644
--- a/crates/polars-plan/src/dsl/function_expr/schema.rs
+++ b/crates/polars-plan/src/dsl/function_expr/schema.rs
@@ -331,11 +331,10 @@ impl FunctionExpr {
             MinHorizontal => mapper.map_to_supertype(),
             SumHorizontal { .. } => {
                 mapper.map_to_supertype().map(|mut f| {
-                    match f.dtype {
-                        // Booleans sum to UInt32.
-                        DataType::Boolean => { f.dtype = DataType::UInt32; f},
-                        _ => f,
+                    if f.dtype == DataType::Boolean {
+                        f.dtype = IDX_DTYPE;
                     }
+                    f
                 })
             },
             MeanHorizontal { .. } => {
diff --git a/py-polars/tests/unit/operations/aggregation/test_horizontal.py b/py-polars/tests/unit/operations/aggregation/test_horizontal.py
index 3959e15e22ed..bc557a231d75 100644
--- a/py-polars/tests/unit/operations/aggregation/test_horizontal.py
+++ b/py-polars/tests/unit/operations/aggregation/test_horizontal.py
@@ -319,6 +319,39 @@ def test_sum_single_col() -> None:
     )
 
 
+@pytest.mark.parametrize("ignore_nulls", [False, True])
+def test_sum_correct_supertype(ignore_nulls: bool) -> None:
+    values = [1, 2] if ignore_nulls else [None, None]  # type: ignore[list-item]
+    lf = pl.LazyFrame(
+        {
+            "null": [None, None],
+            "int": pl.Series(values, dtype=pl.Int32),
+            "float": pl.Series(values, dtype=pl.Float32),
+        }
+    )
+
+    # null + int32 should produce int32
+    out = lf.select(pl.sum_horizontal("null", "int", ignore_nulls=ignore_nulls))
+    expected = pl.LazyFrame({"null": pl.Series(values, dtype=pl.Int32)})
+    assert_frame_equal(out.collect(), expected.collect())
+    assert out.collect_schema() == expected.collect_schema()
+
+    # null + float32 should produce float32
+    out = lf.select(pl.sum_horizontal("null", "float", ignore_nulls=ignore_nulls))
+    expected = pl.LazyFrame({"null": pl.Series(values, dtype=pl.Float32)})
+    assert_frame_equal(out.collect(), expected.collect())
+    assert out.collect_schema() == expected.collect_schema()
+
+    # null + int32 + float32 should produce float64
+    values = [2, 4] if ignore_nulls else [None, None]  # type: ignore[list-item]
+    out = lf.select(
+        pl.sum_horizontal("null", "int", "float", ignore_nulls=ignore_nulls)
+    )
+    expected = pl.LazyFrame({"null": pl.Series(values, dtype=pl.Float64)})
+    assert_frame_equal(out.collect(), expected.collect())
+    assert out.collect_schema() == expected.collect_schema()
+
+
 def test_cum_sum_horizontal() -> None:
     df = pl.DataFrame(
         {
@@ -541,8 +574,8 @@ def test_horizontal_sum_boolean_with_null() -> None:
 
     expected_schema = pl.Schema(
         {
-            "null_first": pl.UInt32,
-            "bool_first": pl.UInt32,
+            "null_first": pl.get_index_type(),
+            "bool_first": pl.get_index_type(),
         }
     )
 
@@ -550,8 +583,8 @@ def test_horizontal_sum_boolean_with_null() -> None:
 
     expected_df = pl.DataFrame(
         {
-            "null_first": pl.Series([1, 0], dtype=pl.UInt32),
-            "bool_first": pl.Series([1, 0], dtype=pl.UInt32),
+            "null_first": pl.Series([1, 0], dtype=pl.get_index_type()),
+            "bool_first": pl.Series([1, 0], dtype=pl.get_index_type()),
         }
     )
 
@@ -563,7 +596,7 @@ def test_horizontal_sum_boolean_with_null() -> None:
     ("dtype_in", "dtype_out"),
     [
         (pl.Null, pl.Null),
-        (pl.Boolean, pl.UInt32),
+        (pl.Boolean, pl.get_index_type()),
         (pl.UInt8, pl.UInt8),
         (pl.Float32, pl.Float32),
         (pl.Float64, pl.Float64),
@@ -589,6 +622,7 @@ def test_horizontal_sum_with_null_col_ignore_strategy(
         values = [None, None, None]  # type: ignore[list-item]
     expected = pl.LazyFrame(pl.Series("null", values, dtype=dtype_out))
     assert_frame_equal(result, expected)
+    assert result.collect_schema() == expected.collect_schema()
 
 
 @pytest.mark.parametrize("ignore_nulls", [True, False])

From 841c387d99d7024037556c4ef79d96bf2caac397 Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Fri, 3 Jan 2025 20:18:35 +0100
Subject: [PATCH 19/20] Python Polars 1.19.0 (#20552)

---
 Cargo.lock           | 2 +-
 py-polars/Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0753cb32d661..4b52ff4fe016 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3671,7 +3671,7 @@ dependencies = [
 
 [[package]]
 name = "py-polars"
-version = "1.18.0"
+version = "1.19.0"
 dependencies = [
  "jemallocator",
  "libc",
diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml
index 7628df05b8c6..f09fe3952f6f 100644
--- a/py-polars/Cargo.toml
+++ b/py-polars/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-polars"
-version = "1.18.0"
+version = "1.19.0"
 edition = "2021"
 
 [lib]

From 58d69d6c39a6d83319855c57256be378e5d854e9 Mon Sep 17 00:00:00 2001
From: Ritchie Vink <ritchie46@gmail.com>
Date: Fri, 3 Jan 2025 20:45:58 +0100
Subject: [PATCH 20/20] ci: Improve bin size info (#20551)

---
 .github/workflows/benchmark.yml | 46 ++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 49ef91de9be4..b4b7ad7691b8 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -78,22 +78,26 @@ jobs:
           WHEEL_SIZE=$(ls -l py-polars/polars/polars*.so | awk '{ print $5 }')
           echo "WHEEL_SIZE=$WHEEL_SIZE" >> $GITHUB_ENV
 
+      - name: Wheel size txt
+        if: github.ref_name == 'main'
+        run: |
+          echo "$GITHUB_RUN_ID $WHEEL_SIZE" > wheel_sizes.txt
+
       - name: Upload wheel sizes artifact (main only)
         if: github.ref_name == 'main'
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: wheel-size
-          path: |
-            echo "$GITHUB_RUN_ID $WHEEL_SIZE" > wheel_sizes.txt
-            wheel_sizes.txt
+          path: wheel_sizes.txt
 
       - name: Download main wheel size
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: wheel-size
         continue-on-error: true 
 
       - name: Extract previous wheel size
+        if: github.ref_name != 'main'
         id: load_previous_size
         run: |
           if [[ -f wheel_sizes.txt ]]; then
@@ -102,9 +106,11 @@ jobs:
           else
             echo "PREVIOUS_WHEEL_SIZE=Unknown" >> $GITHUB_ENV
           fi
+        continue-on-error: true 
 
       - name: Comment wheel size
         uses: actions/github-script@v7
+        if: github.ref_name != 'main'
         with:
           script: |
               const previousSize = process.env.PREVIOUS_WHEEL_SIZE || 'Unknown';
@@ -114,7 +120,7 @@ jobs:
               const previousSizeMB = previousSize !== 'Unknown' ? (previousSize / 1024 / 1024).toFixed(4) : 'Unknown';
               const currentSizeMB = currentSize !== 'Unknown' ? (currentSize / 1024 / 1024).toFixed(4) : 'Unknown';
 
-              let commentBody = `The uncompressed binary size was **${previousSizeMB} MB**.\nThe uncompressed binary size after this PR is **${currentSizeMB} MB**.`;
+              let commentBody = `The previous uncompressed lib size was **${previousSizeMB} MB**.\nThe current uncompressed lib size after this PR is **${currentSizeMB} MB**.`;
 
               // Calculate percentage increase if both sizes are available
               if (previousSize !== 'Unknown' && currentSize !== '') {
@@ -122,13 +128,35 @@ jobs:
                 commentBody += `\nThis represents a **${increase.toFixed(2)}% increase** in size.`;
               }
 
-              github.rest.issues.createComment({
-                issue_number: context.issue.number,
+              const { data: comments } = await github.rest.issues.listComments({
                 owner: context.repo.owner,
                 repo: context.repo.repo,
-                body: commentBody
+                issue_number: context.issue.number,
               });
 
+              // Look for an existing comment
+              const existingComment = comments.find(comment =>
+                comment.body.includes('The previous uncompressed lib size was')
+              );
+
+              if (existingComment) {
+                // Update the existing comment
+                await github.rest.issues.updateComment({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  comment_id: existingComment.id,
+                  body: commentBody,
+                });
+              } else {
+                // Create a new comment
+                await github.rest.issues.createComment({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: context.issue.number,
+                  body: commentBody,
+                });
+              }
+        continue-on-error: true 
 
       - name: Run benchmark tests
         uses: CodSpeedHQ/action@v3