Skip to content

Commit

Permalink
fix: Toggle 'fast_unique' on new_from_index (#19956)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Nov 24, 2024
1 parent 3fad074 commit 84a68a7
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,8 @@ impl CategoricalChunkedBuilder {
false,
self.ordering,
)
.with_fast_unique(true)
}
.with_fast_unique(true)
}

pub fn drain_iter_and_finish<'a, I>(mut self, i: I) -> CategoricalChunked
Expand All @@ -187,8 +187,8 @@ impl CategoricalChunkedBuilder {
&self.categories.into(),
self.ordering,
)
.with_fast_unique(true)
}
.with_fast_unique(true)
}
}

Expand Down Expand Up @@ -369,7 +369,8 @@ impl CategoricalChunked {
Arc::new(rev_map),
true,
ordering,
))
)
.with_fast_unique(false))
}
}
}
Expand Down
10 changes: 8 additions & 2 deletions crates/polars-core/src/chunked_array/logical/categorical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -297,12 +297,18 @@ impl CategoricalChunked {
}
}

pub(crate) fn with_fast_unique(mut self, toggle: bool) -> Self {
/// Set `FAST_UNIQUE` metadata
/// # Safety
/// This invariant must hold `unique(categories) == unique(self)`
pub(crate) unsafe fn with_fast_unique(mut self, toggle: bool) -> Self {
self.set_fast_unique(toggle);
self
}

pub fn _with_fast_unique(self, toggle: bool) -> Self {
/// Set `FAST_UNIQUE` metadata
/// # Safety
/// This invariant must hold `unique(categories) == unique(self)`
pub unsafe fn _with_fast_unique(self, toggle: bool) -> Self {
self.with_fast_unique(toggle)
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/logical/enum_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ impl EnumChunkedBuilder {
// SAFETY: keys and values are in bounds
unsafe {
CategoricalChunked::from_cats_and_rev_map_unchecked(ca, self.rev, true, self.ordering)
.with_fast_unique(true)
}
.with_fast_unique(true)
}
}
1 change: 0 additions & 1 deletion crates/polars-core/src/frame/group_by/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,6 @@ impl<'df> GroupBy<'df> {
} else {
&self.groups
};

POOL.install(|| {
self.selected_keys
.par_iter()
Expand Down
1 change: 1 addition & 0 deletions crates/polars-core/src/frame/group_by/perfect.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ impl CategoricalChunked {
let mut out = match &**rev_map {
RevMapping::Local(cached, _) => {
if self._can_fast_unique() {
assert!(cached.len() <= self.len(), "invalid invariant");
if verbose() {
eprintln!("grouping categoricals, run perfect hash function");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ impl SeriesTrait for SeriesWrap<CategoricalChunked> {
}

fn new_from_index(&self, index: usize, length: usize) -> Series {
self.with_state(true, |cats| cats.new_from_index(index, length))
self.with_state(false, |cats| cats.new_from_index(index, length))
.into_series()
}

Expand Down
18 changes: 9 additions & 9 deletions crates/polars-ops/src/series/ops/cut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,24 +57,24 @@ fn map_cats(
},
});

let outvals = [
brk_vals.finish().into_series(),
let outvals = [brk_vals.finish().into_series(), unsafe {
bld.finish()
._with_fast_unique(label_has_value.iter().all(bool::clone))
.into_series(),
];
.into_series()
}];
Ok(StructChunked::from_series(out_name, outvals[0].len(), outvals.iter())?.into_series())
} else {
Ok(bld
.drain_iter_and_finish(s_iter.map(|opt| {
Ok(unsafe {
bld.drain_iter_and_finish(s_iter.map(|opt| {
opt.filter(|x| !x.is_nan()).map(|x| {
let pt = sorted_breaks.partition_point(|v| op(&x, v));
unsafe { *label_has_value.get_unchecked_mut(pt) = true };
unsafe { labels.get_unchecked(pt).as_str() }
*label_has_value.get_unchecked_mut(pt) = true;
labels.get_unchecked(pt).as_str()
})
}))
._with_fast_unique(label_has_value.iter().all(bool::clone))
.into_series())
}
.into_series())
}
}

Expand Down
11 changes: 11 additions & 0 deletions py-polars/tests/unit/datatypes/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -876,3 +876,14 @@ def test_perfect_group_by_19452() -> None:
)

assert df2.with_columns(a=(pl.col("b")).over(pl.col("a")))["a"].is_sorted()


def test_perfect_group_by_19950() -> None:
dtype = pl.Enum(categories=["a", "b", "c"])

left = pl.DataFrame({"x": "a"}).cast(dtype)
right = pl.DataFrame({"x": "a", "y": "b"}).cast(dtype)
assert left.join(right, on="x").group_by("y").first().to_dict(as_series=False) == {
"y": ["b"],
"x": ["a"],
}

0 comments on commit 84a68a7

Please sign in to comment.