From 399a2c13433a4f0aad64ae62ee34093b00d36368 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 24 Dec 2024 13:49:45 +0100 Subject: [PATCH 1/2] perf: Generalize the `arg_sort` fast path onto `Column`. This makes the implementation of `arg_sort` take the fast path in `Column` before ever going into a specific SeriesTrait implementor. This now makes that every type has the same fast path that gets taken when sorted. It also now allows taking the fast path when `maintain_order=True`. This PR needed to adjust some tests because they wrongly assumed things about the output order of the sorting when `maintain_order=False`. --- crates/polars-core/src/frame/column/mod.rs | 156 +++++++++++++++++- .../unit/operations/test_interpolate_by.py | 2 +- py-polars/tests/unit/operations/test_sort.py | 74 +++++---- 3 files changed, 194 insertions(+), 38 deletions(-) diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index c4484ddbfe5d..fbc735e71497 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -894,8 +894,155 @@ impl Column { } pub fn arg_sort(&self, options: SortOptions) -> IdxCa { + if self.is_empty() { + return IdxCa::from_vec(self.name().clone(), Vec::new()); + } + + if self.null_count() == self.len() { + // We might need to maintain order so just respect the descending parameter. + let values = if options.descending { + (0..self.len() as IdxSize).rev().collect() + } else { + (0..self.len() as IdxSize).collect() + }; + + return IdxCa::from_vec(self.name().clone(), values); + } + + let is_sorted = Some(self.is_sorted_flag()); + let Some(is_sorted) = is_sorted.filter(|v| !matches!(v, IsSorted::Not)) else { + return self.as_materialized_series().arg_sort(options); + }; + + // Fast path: the data is sorted. + let is_sorted_dsc = matches!(is_sorted, IsSorted::Descending); + let invert = options.descending != is_sorted_dsc; + + let mut values = Vec::with_capacity(self.len()); + + #[inline(never)] + fn extend( + start: IdxSize, + end: IdxSize, + slf: &Column, + values: &mut Vec, + is_only_nulls: bool, + invert: bool, + maintain_order: bool, + ) { + debug_assert!(start <= end); + debug_assert!(start as usize <= slf.len()); + debug_assert!(end as usize <= slf.len()); + + if !invert || is_only_nulls { + values.extend(start..end); + return; + } + + // If we don't have to maintain order but we have to invert. Just flip it around. + if !maintain_order { + values.extend((start..end).rev()); + return; + } + + // If we want to maintain order but we also needs to invert, we need to invert + // per group of items. + // + // @NOTE: Since the column is sorted, arg_unique can also take a fast path and + // just do a single traversal. + let arg_unique = slf + .slice(start as i64, (end - start) as usize) + .arg_unique() + .unwrap(); + + assert!(!arg_unique.has_nulls()); + + let num_unique = arg_unique.len(); + + // Fast path: all items are unique. + if num_unique == (end - start) as usize { + values.extend((start..end).rev()); + return; + } + + if num_unique == 1 { + values.extend(start..end); + return; + } + + let mut prev_idx = end - start; + for chunk in arg_unique.downcast_iter() { + for &idx in chunk.values().as_slice().iter().rev() { + values.extend(start + idx..start + prev_idx); + prev_idx = idx; + } + } + } + macro_rules! extend { + ($start:expr, $end:expr) => { + extend!($start, $end, is_only_nulls = false); + }; + ($start:expr, $end:expr, is_only_nulls = $is_only_nulls:expr) => { + extend( + $start, + $end, + self, + &mut values, + $is_only_nulls, + invert, + options.maintain_order, + ); + }; + } + + let length = self.len() as IdxSize; + let null_count = self.null_count() as IdxSize; + + if null_count == 0 { + extend!(0, length); + } else { + let has_nulls_last = self.get(self.len() - 1).unwrap().is_null(); + match (options.nulls_last, has_nulls_last) { + (true, true) => { + // Current: Nulls last, Wanted: Nulls last + extend!(0, length - null_count); + extend!(length - null_count, length, is_only_nulls = true); + }, + (true, false) => { + // Current: Nulls first, Wanted: Nulls last + extend!(null_count, length); + extend!(0, null_count, is_only_nulls = true); + }, + (false, true) => { + // Current: Nulls last, Wanted: Nulls first + extend!(length - null_count, length, is_only_nulls = true); + extend!(0, length - null_count); + }, + (false, false) => { + // Current: Nulls first, Wanted: Nulls first + extend!(0, null_count, is_only_nulls = true); + extend!(null_count, length); + }, + } + } + + IdxCa::from_vec(self.name().clone(), values) + } + + pub fn arg_sort_multiple( + &self, + by: &[Column], + options: &SortMultipleOptions, + ) -> PolarsResult { // @scalar-opt - self.as_materialized_series().arg_sort(options) + self.as_materialized_series().arg_sort_multiple(by, options) + } + + pub fn arg_unique(&self) -> PolarsResult { + match self { + Column::Scalar(s) => Ok(IdxCa::new_vec(s.name().clone(), vec![0])), + _ => self.as_materialized_series().arg_unique(), + } } pub fn bit_repr(&self) -> Option { @@ -986,8 +1133,11 @@ impl Column { } pub fn is_sorted_flag(&self) -> IsSorted { - // @scalar-opt - self.as_materialized_series().is_sorted_flag() + match self { + Column::Series(s) => s.is_sorted_flag(), + Column::Partitioned(s) => s.partitions().is_sorted_flag(), + Column::Scalar(_) => IsSorted::Ascending, + } } pub fn unique(&self) -> PolarsResult { diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index 98ee656fdaed..bcfaf562426c 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -112,7 +112,7 @@ def test_interpolate_by_leading_nulls() -> None: result = ( df.sort("times", descending=True) .with_columns(pl.col("values").interpolate_by("times")) - .sort("times") + .sort("times", maintain_order=True) .drop("times") ) assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/operations/test_sort.py b/py-polars/tests/unit/operations/test_sort.py index 928890da836e..6f90f6744b61 100644 --- a/py-polars/tests/unit/operations/test_sort.py +++ b/py-polars/tests/unit/operations/test_sort.py @@ -123,7 +123,7 @@ def test_sort_by( assert out["a"].to_list() == expected[2] # by can also be a single column - out = df.select(pl.col("a").sort_by("b", descending=[False])) + out = df.select(pl.col("a").sort_by("b", descending=[False], maintain_order=True)) assert out["a"].to_list() == expected[3] @@ -143,17 +143,21 @@ def test_expr_sort_by_nulls_last( df = sort_function(df) # nulls last - expected = pl.DataFrame({"a": [1, 2, 5, None, None], "b": [None, 1, None, 1, 2]}) out = df.select(pl.all().sort_by("a", nulls_last=True)) - assert_frame_equal(out, expected) + assert out["a"].to_list() == [1, 2, 5, None, None] + # We don't maintain order so there are two possibilities + assert out["b"].to_list()[:3] == [None, 1, None] + assert out["b"].to_list()[3:] in [[1, 2], [2, 1]] # nulls first (default) - expected = pl.DataFrame({"a": [None, None, 1, 2, 5], "b": [1, 2, None, 1, None]}) for out in ( df.select(pl.all().sort_by("a", nulls_last=False)), df.select(pl.all().sort_by("a")), ): - assert_frame_equal(out, expected) + assert out["a"].to_list() == [None, None, 1, 2, 5] + # We don't maintain order so there are two possibilities + assert out["b"].to_list()[2:] == [None, 1, None] + assert out["b"].to_list()[:2] in [[1, 2], [2, 1]] def test_expr_sort_by_multi_nulls_last() -> None: @@ -423,21 +427,23 @@ def test_sorted_fast_paths() -> None: ( pl.DataFrame({"Idx": [0, 1, 2, 3, 4, 5, 6], "Val": [0, 1, 2, 3, 4, 5, 6]}), ( - [0, 1, 2, 3, 4, 5, 6], - [6, 5, 4, 3, 2, 1, 0], - [0, 1, 2, 3, 4, 5, 6], - [6, 5, 4, 3, 2, 1, 0], + [[0, 1, 2, 3, 4, 5, 6]], + [[6, 5, 4, 3, 2, 1, 0]], + [[0, 1, 2, 3, 4, 5, 6]], + [[6, 5, 4, 3, 2, 1, 0]], ), ), ( pl.DataFrame( {"Idx": [0, 1, 2, 3, 4, 5, 6], "Val": [0, 1, None, 3, None, 5, 6]} ), + # We don't use maintain order here, so it might as well do anything + # with the None elements. ( - [0, 1, 3, 5, 6, 2, 4], - [6, 5, 3, 1, 0, 2, 4], - [2, 4, 0, 1, 3, 5, 6], - [2, 4, 6, 5, 3, 1, 0], + [[0, 1, 3, 5, 6, 2, 4], [0, 1, 3, 5, 6, 4, 2]], + [[6, 5, 3, 1, 0, 2, 4], [6, 5, 3, 1, 0, 4, 2]], + [[2, 4, 0, 1, 3, 5, 6], [4, 2, 0, 1, 3, 5, 6]], + [[2, 4, 6, 5, 3, 1, 0], [4, 2, 6, 5, 3, 1, 0]], ), ), ], @@ -454,7 +460,7 @@ def test_sorted_fast_paths() -> None: def test_sorted_arg_sort_fast_paths( sort_function: Callable[[pl.DataFrame], pl.DataFrame], df: pl.DataFrame, - expected: tuple[list[int], list[int], list[int], list[int]], + expected: tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]], ) -> None: # Test that an already sorted df is correctly sorted (by a single column) # In certain cases below we will not go through fast path; this test @@ -467,34 +473,34 @@ def test_sorted_arg_sort_fast_paths( # Test dataframe.sort assert ( df.sort("Val", descending=False, nulls_last=True)["Idx"].to_list() - == expected[0] + in expected[0] ) assert ( - df.sort("Val", descending=True, nulls_last=True)["Idx"].to_list() == expected[1] + df.sort("Val", descending=True, nulls_last=True)["Idx"].to_list() in expected[1] ) assert ( df.sort("Val", descending=False, nulls_last=False)["Idx"].to_list() - == expected[2] + in expected[2] ) assert ( df.sort("Val", descending=True, nulls_last=False)["Idx"].to_list() - == expected[3] + in expected[3] ) # Test series.arg_sort assert ( df["Idx"][s.arg_sort(descending=False, nulls_last=True)].to_list() - == expected[0] + in expected[0] ) assert ( - df["Idx"][s.arg_sort(descending=True, nulls_last=True)].to_list() == expected[1] + df["Idx"][s.arg_sort(descending=True, nulls_last=True)].to_list() in expected[1] ) assert ( df["Idx"][s.arg_sort(descending=False, nulls_last=False)].to_list() - == expected[2] + in expected[2] ) assert ( df["Idx"][s.arg_sort(descending=True, nulls_last=False)].to_list() - == expected[3] + in expected[3] ) @@ -896,30 +902,30 @@ def test_sort_with_null_12139( } ) df = sort_function(df) - assert df.sort("bool", descending=False, nulls_last=False).to_dict( - as_series=False - ) == { + assert df.sort( + "bool", descending=False, nulls_last=False, maintain_order=True + ).to_dict(as_series=False) == { "bool": [None, False, False, True, True], "float": [3.0, 2.0, 5.0, 1.0, 4.0], } - assert df.sort("bool", descending=False, nulls_last=True).to_dict( - as_series=False - ) == { + assert df.sort( + "bool", descending=False, nulls_last=True, maintain_order=True + ).to_dict(as_series=False) == { "bool": [False, False, True, True, None], "float": [2.0, 5.0, 1.0, 4.0, 3.0], } - assert df.sort("bool", descending=True, nulls_last=True).to_dict( - as_series=False - ) == { + assert df.sort( + "bool", descending=True, nulls_last=True, maintain_order=True + ).to_dict(as_series=False) == { "bool": [True, True, False, False, None], "float": [1.0, 4.0, 2.0, 5.0, 3.0], } - assert df.sort("bool", descending=True, nulls_last=False).to_dict( - as_series=False - ) == { + assert df.sort( + "bool", descending=True, nulls_last=False, maintain_order=True + ).to_dict(as_series=False) == { "bool": [None, True, True, False, False], "float": [3.0, 1.0, 4.0, 2.0, 5.0], } From d569b9e937260474f7cfba8d178105b61675a676 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 24 Dec 2024 15:56:42 +0100 Subject: [PATCH 2/2] fix many tests --- crates/polars-core/src/frame/column/mod.rs | 12 ++++++++++++ py-polars/tests/unit/dataframe/test_df.py | 2 +- .../tests/unit/datatypes/test_categorical.py | 2 ++ .../operations/namespaces/test_categorical.py | 2 ++ .../unit/operations/test_interpolate_by.py | 8 +++----- py-polars/tests/unit/operations/test_join.py | 17 ++++++++++++----- 6 files changed, 32 insertions(+), 11 deletions(-) diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index fbc735e71497..125fc62bb99e 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -1026,6 +1026,18 @@ impl Column { } } + // @NOTE: This can theoretically be pushed into the previous operation but it is really + // worth it... probably not... + if let Some((limit, limit_dsc)) = options.limit { + let limit = limit.min(length); + + if limit_dsc { + values = values.drain((length - limit) as usize..).collect(); + } else { + values.truncate(limit as usize); + } + } + IdxCa::from_vec(self.name().clone(), values) } diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 46e4779cdef8..9cac26d1e7df 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -1014,7 +1014,7 @@ def test_multiple_column_sort() -> None: pl.DataFrame({"a": [3, 2, 1], "b": ["b", "a", "a"]}), ) assert_frame_equal( - df.sort("b", descending=True), + df.sort("b", descending=True, maintain_order=True), pl.DataFrame({"a": [3, 1, 2], "b": ["b", "a", "a"]}), ) assert_frame_equal( diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index b44255aba244..b28dd269c19c 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -825,6 +825,8 @@ def test_cat_preserve_lexical_ordering_on_concat() -> None: assert df2["x"].dtype == dtype +# TODO: Bug see: https://github.com/pola-rs/polars/issues/20440 +@pytest.mark.may_fail_auto_streaming def test_cat_append_lexical_sorted_flag() -> None: df = pl.DataFrame({"x": [0, 1, 1], "y": ["B", "B", "A"]}).with_columns( pl.col("y").cast(pl.Categorical(ordering="lexical")) diff --git a/py-polars/tests/unit/operations/namespaces/test_categorical.py b/py-polars/tests/unit/operations/namespaces/test_categorical.py index 7bd457871b86..9f60ff4f7be9 100644 --- a/py-polars/tests/unit/operations/namespaces/test_categorical.py +++ b/py-polars/tests/unit/operations/namespaces/test_categorical.py @@ -29,6 +29,8 @@ def test_global_and_local( yield +# @TODO: Bug, see https://github.com/pola-rs/polars/issues/20440 +@pytest.mark.may_fail_auto_streaming def test_categorical_lexical_sort() -> None: df = pl.DataFrame( {"cats": ["z", "z", "k", "a", "b"], "vals": [3, 1, 2, 2, 3]} diff --git a/py-polars/tests/unit/operations/test_interpolate_by.py b/py-polars/tests/unit/operations/test_interpolate_by.py index bcfaf562426c..07bee621c505 100644 --- a/py-polars/tests/unit/operations/test_interpolate_by.py +++ b/py-polars/tests/unit/operations/test_interpolate_by.py @@ -105,17 +105,15 @@ def test_interpolate_by_leading_nulls() -> None: } ) result = df.select(pl.col("values").interpolate_by("times")) - expected = pl.DataFrame( - {"values": [None, None, None, 1.0, 1.7999999999999998, 4.6, 5.0]} - ) + expected = pl.DataFrame({"values": [None, None, None, 1.0, 1.8, 4.6, 5.0]}) assert_frame_equal(result, expected) result = ( - df.sort("times", descending=True) + df.sort("times", maintain_order=True, descending=True) .with_columns(pl.col("values").interpolate_by("times")) .sort("times", maintain_order=True) .drop("times") ) - assert_frame_equal(result, expected) + assert_frame_equal(result, expected, check_exact=False) @pytest.mark.parametrize("dataset", ["floats", "dates"]) diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index 3bc0649a72ea..fd384408e247 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -285,11 +285,18 @@ def test_join_on_cast() -> None: df_b = pl.DataFrame({"a": [-2, -3, 3, 10]}) - assert df_a.join(df_b, on=pl.col("a").cast(pl.Int64)).to_dict(as_series=False) == { - "index": [1, 2, 3, 5], - "a": [-2, 3, 3, 10], - "a_right": [-2, 3, 3, 10], - } + assert_frame_equal( + df_a.join(df_b, on=pl.col("a").cast(pl.Int64)), + pl.DataFrame( + { + "index": [1, 2, 3, 5], + "a": [-2, 3, 3, 10], + "a_right": [-2, 3, 3, 10], + } + ), + check_row_order=False, + check_dtypes=False, + ) assert df_a.lazy().join( df_b.lazy(), on=pl.col("a").cast(pl.Int64) ).collect().to_dict(as_series=False) == {