From 86e7d8b4fb61aa1c5b618e90c34327df16fedd03 Mon Sep 17 00:00:00 2001 From: ritchie Date: Fri, 29 Sep 2023 17:58:12 +0200 Subject: [PATCH 1/2] fix: count literal as leaf columns --- crates/polars-plan/src/constants.rs | 1 + crates/polars-plan/src/logical_plan/aexpr/schema.rs | 3 ++- crates/polars-plan/src/utils.rs | 10 +++++----- py-polars/tests/unit/operations/test_drop.py | 9 +++++++++ 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/crates/polars-plan/src/constants.rs b/crates/polars-plan/src/constants.rs index f260e7ffcaa2..33b27e6c1c29 100644 --- a/crates/polars-plan/src/constants.rs +++ b/crates/polars-plan/src/constants.rs @@ -1,2 +1,3 @@ pub static MAP_LIST_NAME: &str = "map_list"; pub static CSE_REPLACED: &str = "__POLARS_CSER_"; +pub static LITERAL_NAME: &str = "literal"; diff --git a/crates/polars-plan/src/logical_plan/aexpr/schema.rs b/crates/polars-plan/src/logical_plan/aexpr/schema.rs index 81eb3502c861..faca02f272c7 100644 --- a/crates/polars-plan/src/logical_plan/aexpr/schema.rs +++ b/crates/polars-plan/src/logical_plan/aexpr/schema.rs @@ -1,4 +1,5 @@ use super::*; +use crate::constants::LITERAL_NAME; fn float_type(field: &mut Field) { if field.dtype.is_numeric() && !matches!(&field.dtype, DataType::Float32) { @@ -51,7 +52,7 @@ impl AExpr { }, Literal(sv) => Ok(match sv { LiteralValue::Series(s) => s.field().into_owned(), - _ => Field::new("literal", sv.get_datatype()), + _ => Field::new(LITERAL_NAME, sv.get_datatype()), }), BinaryExpr { left, right, op } => { use DataType::*; diff --git a/crates/polars-plan/src/utils.rs b/crates/polars-plan/src/utils.rs index b8e3cd9d80c8..0958db65d8e2 100644 --- a/crates/polars-plan/src/utils.rs +++ b/crates/polars-plan/src/utils.rs @@ -6,6 +6,7 @@ use std::vec::IntoIter; use polars_core::prelude::*; use smartstring::alias::String as SmartString; +use crate::constants::LITERAL_NAME; use crate::logical_plan::iterator::ArenaExprIter; use crate::logical_plan::Context; use crate::prelude::names::COUNT; @@ -349,12 +350,11 @@ pub fn aexpr_to_leaf_names_iter( node: Node, arena: &Arena, ) -> impl Iterator> + '_ { - aexpr_to_column_nodes_iter(node, arena).map(|node| match arena.get(node) { + arena.iter(node).flat_map(|(_, ae)| match ae { // expecting only columns here, wildcards and dtypes should already be replaced - AExpr::Column(name) => name.clone(), - e => { - panic!("{e:?} not expected") - }, + AExpr::Column(name) => Some(name.clone()), + AExpr::Literal(_) => Some(Arc::from(LITERAL_NAME)), + _ => None, }) } diff --git a/py-polars/tests/unit/operations/test_drop.py b/py-polars/tests/unit/operations/test_drop.py index 374b7081c837..2680d968951c 100644 --- a/py-polars/tests/unit/operations/test_drop.py +++ b/py-polars/tests/unit/operations/test_drop.py @@ -119,3 +119,12 @@ def test_drop_nan_ignore_null_3525() -> None: 3.0, 4.0, ] + + +def test_drop_join_lit() -> None: + df = pl.LazyFrame({"date": [1, 2, 3], "symbol": [4, 5, 6]}) + dates = df.select("date").unique() + symbols = df.select("symbol").unique() + assert symbols.join(dates, left_on=pl.lit(1), right_on=pl.lit(1)).drop( + "literal" + ).collect().columns == ["symbol", "date"] From 911815091d5f7beea28e721019f05a3a57191096 Mon Sep 17 00:00:00 2001 From: ritchie Date: Fri, 29 Sep 2023 20:41:39 +0200 Subject: [PATCH 2/2] literal and column differentiator --- .../src/physical_plan/planner/expr.rs | 2 +- .../src/physical_plan/planner/lp.rs | 2 +- .../src/logical_plan/optimizer/drop_nulls.rs | 4 +-- .../optimizer/predicate_pushdown/utils.rs | 10 ++++--- .../optimizer/projection_pushdown/joins.rs | 10 +++---- .../optimizer/projection_pushdown/mod.rs | 10 +++---- .../projection_pushdown/projection.rs | 2 +- .../logical_plan/optimizer/simplify_expr.rs | 2 +- crates/polars-plan/src/utils.rs | 26 +++++++++++++++---- 9 files changed, 43 insertions(+), 25 deletions(-) diff --git a/crates/polars-lazy/src/physical_plan/planner/expr.rs b/crates/polars-lazy/src/physical_plan/planner/expr.rs index 2411539969d0..d2855cc741a9 100644 --- a/crates/polars-lazy/src/physical_plan/planner/expr.rs +++ b/crates/polars-lazy/src/physical_plan/planner/expr.rs @@ -113,7 +113,7 @@ pub(crate) fn create_physical_expr( let phys_function = create_physical_expr(function, Context::Aggregation, expr_arena, schema, state)?; let mut out_name = None; - let mut apply_columns = aexpr_to_leaf_names(function, expr_arena); + let mut apply_columns = aexpr_to_leaf_column_names(function, expr_arena); // sort and then dedup removes consecutive duplicates == all duplicates apply_columns.sort(); apply_columns.dedup(); diff --git a/crates/polars-lazy/src/physical_plan/planner/lp.rs b/crates/polars-lazy/src/physical_plan/planner/lp.rs index 6c1d27b0ee97..21bedb944f35 100644 --- a/crates/polars-lazy/src/physical_plan/planner/lp.rs +++ b/crates/polars-lazy/src/physical_plan/planner/lp.rs @@ -119,7 +119,7 @@ fn partitionable_gb( #[cfg(feature = "object")] { - for name in aexpr_to_leaf_names(*agg, expr_arena) { + for name in aexpr_to_leaf_column_names(*agg, expr_arena) { let dtype = _input_schema.get(&name).unwrap(); if let DataType::Object(_) = dtype { diff --git a/crates/polars-plan/src/logical_plan/optimizer/drop_nulls.rs b/crates/polars-plan/src/logical_plan/optimizer/drop_nulls.rs index 8683d2716a03..909a4643ddf1 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/drop_nulls.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/drop_nulls.rs @@ -4,7 +4,7 @@ use super::*; use crate::dsl::function_expr::FunctionExpr; use crate::logical_plan::functions::FunctionNode; use crate::logical_plan::iterator::*; -use crate::utils::aexpr_to_leaf_names; +use crate::utils::aexpr_to_leaf_column_names; /// If we realize that a predicate drops nulls on a subset /// we replace it with an explicit df.drop_nulls call, as this @@ -72,7 +72,7 @@ impl OptimizationRule for ReplaceDropNulls { } } if not_null_count == column_count && binary_and_count < column_count { - let subset = Arc::from(aexpr_to_leaf_names(*predicate, expr_arena)); + let subset = Arc::from(aexpr_to_leaf_column_names(*predicate, expr_arena)); Some(ALogicalPlan::MapFunction { input: *input, diff --git a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs index 58179b8aae8b..73f46c57661e 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/utils.rs @@ -4,7 +4,9 @@ use polars_core::prelude::*; use super::keys::*; use crate::logical_plan::Context; use crate::prelude::*; -use crate::utils::{aexpr_to_leaf_names, check_input_node, has_aexpr, rename_aexpr_leaf_names}; +use crate::utils::{ + aexpr_to_leaf_column_names, check_input_node, has_aexpr, rename_aexpr_leaf_names, +}; trait Dsl { fn and(self, right: Node, arena: &mut Arena) -> Node; @@ -214,7 +216,7 @@ fn rename_predicate_columns_due_to_aliased_projection( let projection_aexpr = expr_arena.get(projection_node); if let AExpr::Alias(_, alias_name) = projection_aexpr { let alias_name = alias_name.as_ref(); - let projection_leaves = aexpr_to_leaf_names(projection_node, expr_arena); + let projection_leaves = aexpr_to_leaf_column_names(projection_node, expr_arena); // this means the leaf is a literal if projection_leaves.is_empty() { @@ -362,7 +364,7 @@ pub(super) fn no_pushdown_preds( // matching expr are typically explode, shift, etc. expressions that mess up predicates when pushed down if has_aexpr(node, arena, matches) { // columns that are projected. We check if we can push down the predicates past this projection - let columns = aexpr_to_leaf_names(node, arena); + let columns = aexpr_to_leaf_column_names(node, arena); let condition = |name: Arc| columns.contains(&name); local_predicates.extend(transfer_to_local_by_name(arena, acc_predicates, condition)); @@ -382,7 +384,7 @@ where let mut remove_keys = Vec::with_capacity(acc_predicates.len()); for (key, predicate) in &*acc_predicates { - let root_names = aexpr_to_leaf_names(*predicate, expr_arena); + let root_names = aexpr_to_leaf_column_names(*predicate, expr_arena); for name in root_names { if condition(name) { remove_keys.push(key.clone()); diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs index d5c004ce8bad..bc9f386cf50c 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/joins.rs @@ -135,7 +135,7 @@ pub(super) fn process_asof_join( let mut add_local = if already_added_local_to_local_projected.is_empty() { true } else { - let name = aexpr_to_leaf_name(proj, expr_arena); + let name = aexpr_to_leaf_column_name(proj, expr_arena); !already_added_local_to_local_projected.contains(&name) }; @@ -273,7 +273,7 @@ pub(super) fn process_join( let mut add_local = if already_added_local_to_local_projected.is_empty() { true } else { - let name = aexpr_to_leaf_name(proj, expr_arena); + let name = aexpr_to_leaf_column_name(proj, expr_arena); !already_added_local_to_local_projected.contains(&name) }; @@ -363,7 +363,7 @@ fn process_projection( // this branch tries to pushdown the column without suffix { // Column name of the projection without any alias. - let leaf_column_name = aexpr_to_leaf_names(proj, expr_arena).pop().unwrap(); + let leaf_column_name = aexpr_to_leaf_column_names(proj, expr_arena).pop().unwrap(); let suffix = options.args.suffix(); // If _right suffix exists we need to push a projection down without this @@ -407,7 +407,7 @@ pub(super) fn process_alias( mut add_local: bool, ) -> bool { if let AExpr::Alias(expr, name) = expr_arena.get(proj).clone() { - for root_name in aexpr_to_leaf_names(expr, expr_arena) { + for root_name in aexpr_to_leaf_column_names(expr, expr_arena) { let node = expr_arena.add(AExpr::Column(root_name)); let proj = expr_arena.add(AExpr::Alias(node, name.clone())); local_projection.push(proj) @@ -448,7 +448,7 @@ fn resolve_join_suffixes( let schema_after_join = alp.schema(lp_arena); for proj in local_projection { - for name in aexpr_to_leaf_names(*proj, expr_arena) { + for name in aexpr_to_leaf_column_names(*proj, expr_arena) { if name.contains(suffix) && schema_after_join.get(&name).is_none() { let new_name = &name.as_ref()[..name.len() - suffix.len()]; diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs index 2e208daf5530..f0afbea7eb12 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/mod.rs @@ -24,7 +24,7 @@ use crate::prelude::optimizer::projection_pushdown::projection::process_projecti use crate::prelude::optimizer::projection_pushdown::rename::process_rename; use crate::prelude::*; use crate::utils::{ - aexpr_assign_renamed_leaf, aexpr_to_column_nodes, aexpr_to_leaf_names, check_input_node, + aexpr_assign_renamed_leaf, aexpr_to_column_nodes, aexpr_to_leaf_column_names, check_input_node, expr_is_projected_upstream, }; @@ -45,7 +45,7 @@ fn get_scan_columns( if !acc_projections.is_empty() { let mut columns = Vec::with_capacity(acc_projections.len()); for expr in acc_projections { - for name in aexpr_to_leaf_names(*expr, expr_arena) { + for name in aexpr_to_leaf_column_names(*expr, expr_arena) { // we shouldn't project the row-count column, as that is generated // in the scan let push = match row_count { @@ -86,7 +86,7 @@ fn split_acc_projections( .partition(|expr| check_input_node(*expr, down_schema, expr_arena)); let mut names = init_set(); for proj in &acc_projections { - for name in aexpr_to_leaf_names(*proj, expr_arena) { + for name in aexpr_to_leaf_column_names(*proj, expr_arena) { names.insert(name); } } @@ -132,7 +132,7 @@ fn update_scan_schema( let mut new_schema = Schema::with_capacity(acc_projections.len()); let mut new_cols = Vec::with_capacity(acc_projections.len()); for node in acc_projections.iter() { - for name in aexpr_to_leaf_names(*node, expr_arena) { + for name in aexpr_to_leaf_column_names(*node, expr_arena) { let item = schema.get_full(&name).ok_or_else(|| { polars_err!(ComputeError: "column '{}' not available in schema {:?}", name, schema) })?; @@ -218,7 +218,7 @@ impl ProjectionPushDown { ) -> (bool, bool) { let mut pushed_at_least_one = false; let mut already_projected = false; - let names = aexpr_to_leaf_names(proj, expr_arena); + let names = aexpr_to_leaf_column_names(proj, expr_arena); let root_projections = aexpr_to_column_nodes(proj, expr_arena); for (name, root_projection) in names.into_iter().zip(root_projections) { diff --git a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs index e49ba5cb5642..aca4a9107702 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/projection_pushdown/projection.rs @@ -61,7 +61,7 @@ pub(super) fn process_projection( if let AExpr::Alias(_, name) = ae { if projected_names.remove(name) { acc_projections.retain(|expr| { - !aexpr_to_leaf_names(*expr, expr_arena).contains(name) + !aexpr_to_leaf_column_names(*expr, expr_arena).contains(name) }); } } diff --git a/crates/polars-plan/src/logical_plan/optimizer/simplify_expr.rs b/crates/polars-plan/src/logical_plan/optimizer/simplify_expr.rs index 92d8621090c4..fc94139571c8 100644 --- a/crates/polars-plan/src/logical_plan/optimizer/simplify_expr.rs +++ b/crates/polars-plan/src/logical_plan/optimizer/simplify_expr.rs @@ -248,7 +248,7 @@ impl OptimizationRule for SimplifyBooleanRule { AExpr::Literal(LiteralValue::Boolean(false)) ) => { - let names = aexpr_to_leaf_names(*truthy, expr_arena); + let names = aexpr_to_leaf_column_names(*truthy, expr_arena); let name = names.get(0).map(Arc::clone).unwrap_or_else(|| "".into()); Some(AExpr::Alias(*falsy, name)) }, diff --git a/crates/polars-plan/src/utils.rs b/crates/polars-plan/src/utils.rs index 0958db65d8e2..3724b1dae638 100644 --- a/crates/polars-plan/src/utils.rs +++ b/crates/polars-plan/src/utils.rs @@ -346,6 +346,27 @@ pub fn expressions_to_schema( .collect() } +pub fn aexpr_to_leaf_column_names_iter( + node: Node, + arena: &Arena, +) -> impl Iterator> + '_ { + aexpr_to_column_nodes_iter(node, arena).map(|node| match arena.get(node) { + // expecting only columns here, wildcards and dtypes should already be replaced + AExpr::Column(name) => name.clone(), + e => { + panic!("{e:?} not expected") + }, + }) +} + +pub fn aexpr_to_leaf_column_names(node: Node, arena: &Arena) -> Vec> { + aexpr_to_leaf_column_names_iter(node, arena).collect() +} + +pub fn aexpr_to_leaf_column_name(node: Node, arena: &Arena) -> Arc { + aexpr_to_leaf_column_names_iter(node, arena).next().unwrap() +} + pub fn aexpr_to_leaf_names_iter( node: Node, arena: &Arena, @@ -357,11 +378,6 @@ pub fn aexpr_to_leaf_names_iter( _ => None, }) } - -pub fn aexpr_to_leaf_names(node: Node, arena: &Arena) -> Vec> { - aexpr_to_leaf_names_iter(node, arena).collect() -} - pub fn aexpr_to_leaf_name(node: Node, arena: &Arena) -> Arc { aexpr_to_leaf_names_iter(node, arena).next().unwrap() }