diff --git a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs index 53199329e20f..9e0ebc118a49 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs @@ -162,8 +162,8 @@ impl CategoricalChunkedBuilder { false, self.ordering, ) + .with_fast_unique(true) } - .with_fast_unique(true) } pub fn drain_iter_and_finish<'a, I>(mut self, i: I) -> CategoricalChunked @@ -187,8 +187,8 @@ impl CategoricalChunkedBuilder { &self.categories.into(), self.ordering, ) + .with_fast_unique(true) } - .with_fast_unique(true) } } @@ -369,7 +369,8 @@ impl CategoricalChunked { Arc::new(rev_map), true, ordering, - )) + ) + .with_fast_unique(false)) } } } diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index 2429d918e2ff..94fef8aeffa7 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -297,12 +297,18 @@ impl CategoricalChunked { } } - pub(crate) fn with_fast_unique(mut self, toggle: bool) -> Self { + /// Set `FAST_UNIQUE` metadata + /// # Safety + /// This invariant must hold `unique(categories) == unique(self)` + pub(crate) unsafe fn with_fast_unique(mut self, toggle: bool) -> Self { self.set_fast_unique(toggle); self } - pub fn _with_fast_unique(self, toggle: bool) -> Self { + /// Set `FAST_UNIQUE` metadata + /// # Safety + /// This invariant must hold `unique(categories) == unique(self)` + pub unsafe fn _with_fast_unique(self, toggle: bool) -> Self { self.with_fast_unique(toggle) } diff --git a/crates/polars-core/src/chunked_array/logical/enum_/mod.rs b/crates/polars-core/src/chunked_array/logical/enum_/mod.rs index 5279099f1cd7..e301965a511c 100644 --- a/crates/polars-core/src/chunked_array/logical/enum_/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/enum_/mod.rs @@ -94,7 +94,7 @@ impl EnumChunkedBuilder { // SAFETY: keys and values are in bounds unsafe { CategoricalChunked::from_cats_and_rev_map_unchecked(ca, self.rev, true, self.ordering) + .with_fast_unique(true) } - .with_fast_unique(true) } } diff --git a/crates/polars-core/src/frame/group_by/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs index 17a36dc4ddfd..7d1d9e761504 100644 --- a/crates/polars-core/src/frame/group_by/mod.rs +++ b/crates/polars-core/src/frame/group_by/mod.rs @@ -258,7 +258,6 @@ impl<'df> GroupBy<'df> { } else { &self.groups }; - POOL.install(|| { self.selected_keys .par_iter() diff --git a/crates/polars-core/src/frame/group_by/perfect.rs b/crates/polars-core/src/frame/group_by/perfect.rs index a110ed982471..a020aa4fb37e 100644 --- a/crates/polars-core/src/frame/group_by/perfect.rs +++ b/crates/polars-core/src/frame/group_by/perfect.rs @@ -170,6 +170,7 @@ impl CategoricalChunked { let mut out = match &**rev_map { RevMapping::Local(cached, _) => { if self._can_fast_unique() { + assert!(cached.len() <= self.len(), "invalid invariant"); if verbose() { eprintln!("grouping categoricals, run perfect hash function"); } diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index 4e7b6efe04a2..7c45baa5c054 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -215,7 +215,7 @@ impl SeriesTrait for SeriesWrap { } fn new_from_index(&self, index: usize, length: usize) -> Series { - self.with_state(true, |cats| cats.new_from_index(index, length)) + self.with_state(false, |cats| cats.new_from_index(index, length)) .into_series() } diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index b7b8d3e9f179..9b6680cc4ff6 100644 --- a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -57,24 +57,24 @@ fn map_cats( }, }); - let outvals = [ - brk_vals.finish().into_series(), + let outvals = [brk_vals.finish().into_series(), unsafe { bld.finish() ._with_fast_unique(label_has_value.iter().all(bool::clone)) - .into_series(), - ]; + .into_series() + }]; Ok(StructChunked::from_series(out_name, outvals[0].len(), outvals.iter())?.into_series()) } else { - Ok(bld - .drain_iter_and_finish(s_iter.map(|opt| { + Ok(unsafe { + bld.drain_iter_and_finish(s_iter.map(|opt| { opt.filter(|x| !x.is_nan()).map(|x| { let pt = sorted_breaks.partition_point(|v| op(&x, v)); - unsafe { *label_has_value.get_unchecked_mut(pt) = true }; - unsafe { labels.get_unchecked(pt).as_str() } + *label_has_value.get_unchecked_mut(pt) = true; + labels.get_unchecked(pt).as_str() }) })) ._with_fast_unique(label_has_value.iter().all(bool::clone)) - .into_series()) + } + .into_series()) } } diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 7847081689e6..b44255aba244 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -876,3 +876,14 @@ def test_perfect_group_by_19452() -> None: ) assert df2.with_columns(a=(pl.col("b")).over(pl.col("a")))["a"].is_sorted() + + +def test_perfect_group_by_19950() -> None: + dtype = pl.Enum(categories=["a", "b", "c"]) + + left = pl.DataFrame({"x": "a"}).cast(dtype) + right = pl.DataFrame({"x": "a", "y": "b"}).cast(dtype) + assert left.join(right, on="x").group_by("y").first().to_dict(as_series=False) == { + "y": ["b"], + "x": ["a"], + }