diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index a1e51e07422e..63bf72cf226c 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -20,10 +20,12 @@ pub use super::Operator; use crate::error::{DataFusionError, Result}; +use crate::execution::context::ExecutionProps; use crate::field_util::get_indexed_field; use crate::logical_plan::{ plan::Aggregate, window_frames, DFField, DFSchema, LogicalPlan, }; +use crate::optimizer::simplify_expressions::{ConstEvaluator, Simplifier}; use crate::physical_plan::functions::Volatility; use crate::physical_plan::{ aggregates, expressions::binary_operator_data_type, functions, udf::ScalarUDF, @@ -971,6 +973,58 @@ impl Expr { Ok(expr) } } + + /// Simplifies this [`Expr`]`s as much as possible, evaluating + /// constants and applying algebraic simplifications + /// + /// # Example: + /// `b > 2 AND b > 2` + /// can be written to + /// `b > 2` + /// + /// ``` + /// use datafusion::logical_plan::*; + /// use datafusion::error::Result; + /// use datafusion::execution::context::ExecutionProps; + /// + /// /// Simple implementation that provides `Simplifier` the information it needs + /// #[derive(Default)] + /// struct Info { + /// execution_props: ExecutionProps, + /// }; + /// + /// impl SimplifyInfo for Info { + /// fn is_boolean_type(&self, expr: &Expr) -> Result { + /// Ok(false) + /// } + /// fn nullable(&self, expr: &Expr) -> Result { + /// Ok(true) + /// } + /// fn execution_props(&self) -> &ExecutionProps { + /// &self.execution_props + /// } + /// } + /// + /// // b < 2 + /// let b_lt_2 = col("b").gt(lit(2)); + /// + /// // (b < 2) OR (b < 2) + /// let expr = b_lt_2.clone().or(b_lt_2.clone()); + /// + /// // (b < 2) OR (b < 2) --> (b < 2) + /// let expr = expr.simplify(&Info::default()).unwrap(); + /// assert_eq!(expr, b_lt_2); + /// ``` + pub fn simplify(self, info: &S) -> Result { + let mut rewriter = Simplifier::new(info); + let mut const_evaluator = ConstEvaluator::new(info.execution_props()); + + // TODO iterate until no changes are made during rewrite + // (evaluating constants can enable new simplifications and + // simplifications can enable new constant evaluation) + // https://github.com/apache/arrow-datafusion/issues/1160 + self.rewrite(&mut const_evaluator)?.rewrite(&mut rewriter) + } } impl Not for Expr { @@ -1092,6 +1146,20 @@ pub trait ExprRewriter: Sized { fn mutate(&mut self, expr: Expr) -> Result; } +/// The information necessary to apply algebraic simplification to an +/// [Expr]. See [SimplifyContext] for one implementation +pub trait SimplifyInfo { + /// returns true if this Expr has boolean type + fn is_boolean_type(&self, expr: &Expr) -> Result; + + /// returns true of this expr is nullable (could possibly be NULL) + fn nullable(&self, expr: &Expr) -> Result; + + /// Returns details needed for partial expression evaluation + fn execution_props(&self) -> &ExecutionProps; +} + +/// Helper struct for building [Expr::Case] pub struct CaseBuilder { expr: Option>, when_expr: Vec, diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index 06c6bf90c790..25714514d78a 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -47,6 +47,7 @@ pub use expr::{ signum, sin, split_part, sqrt, starts_with, strpos, substr, sum, tan, to_hex, translate, trim, trunc, unalias, unnormalize_col, unnormalize_cols, upper, when, Column, Expr, ExprRewriter, ExpressionVisitor, Literal, Recursion, RewriteRecursion, + SimplifyInfo, }; pub use extension::UserDefinedLogicalNode; pub use operators::Operator; diff --git a/datafusion/src/optimizer/simplify_expressions.rs b/datafusion/src/optimizer/simplify_expressions.rs index 00739ccff5ac..c000bdbc2bea 100644 --- a/datafusion/src/optimizer/simplify_expressions.rs +++ b/datafusion/src/optimizer/simplify_expressions.rs @@ -23,8 +23,10 @@ use arrow::record_batch::RecordBatch; use crate::error::DataFusionError; use crate::execution::context::ExecutionProps; -use crate::logical_plan::{lit, DFSchemaRef, Expr}; -use crate::logical_plan::{DFSchema, ExprRewriter, LogicalPlan, RewriteRecursion}; +use crate::logical_plan::{ + lit, DFSchema, DFSchemaRef, Expr, ExprRewriter, LogicalPlan, RewriteRecursion, + SimplifyInfo, +}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; use crate::physical_plan::functions::Volatility; @@ -32,8 +34,57 @@ use crate::physical_plan::planner::create_physical_expr; use crate::scalar::ScalarValue; use crate::{error::Result, logical_plan::Operator}; -/// Simplifies plans by rewriting [`Expr`]`s evaluating constants -/// and applying algebraic simplifications +/// Provides simplification information based on schema and properties +struct SimplifyContext<'a, 'b> { + schemas: Vec<&'a DFSchemaRef>, + props: &'b ExecutionProps, +} + +impl<'a, 'b> SimplifyContext<'a, 'b> { + /// Create a new SimplifyContext + pub fn new(schemas: Vec<&'a DFSchemaRef>, props: &'b ExecutionProps) -> Self { + Self { schemas, props } + } +} + +impl<'a, 'b> SimplifyInfo for SimplifyContext<'a, 'b> { + /// returns true if this Expr has boolean type + fn is_boolean_type(&self, expr: &Expr) -> Result { + for schema in &self.schemas { + if let Ok(DataType::Boolean) = expr.get_type(schema) { + return Ok(true); + } + } + + Ok(false) + } + /// Returns true if expr is nullable + fn nullable(&self, expr: &Expr) -> Result { + self.schemas + .iter() + .find_map(|schema| { + // expr may be from another input, so ignore errors + // by converting to None to keep trying + expr.nullable(schema.as_ref()).ok() + }) + .ok_or_else(|| { + // This means we weren't able to compute `Expr::nullable` with + // *any* input schemas, signalling a problem + DataFusionError::Internal(format!( + "Could not find find columns in '{}' during simplify", + expr + )) + }) + } + + fn execution_props(&self) -> &ExecutionProps { + self.props + } +} + +/// Optimizer Pass that simplifies [`LogicalPlan`]s by rewriting +/// [`Expr`]`s evaluating constants and applying algebraic +/// simplifications /// /// # Introduction /// It uses boolean algebra laws to simplify or reduce the number of terms in expressions. @@ -44,7 +95,7 @@ use crate::{error::Result, logical_plan::Operator}; /// `Filter: b > 2` /// #[derive(Default)] -pub struct SimplifyExpressions {} +pub(crate) struct SimplifyExpressions {} /// returns true if `needle` is found in a chain of search_op /// expressions. Such as: (A AND B) AND C @@ -150,9 +201,7 @@ impl OptimizerRule for SimplifyExpressions { // projected columns. With just the projected schema, it's not possible to infer types for // expressions that references non-projected columns within the same project plan or its // children plans. - let mut simplifier = Simplifier::new(plan.all_schemas()); - - let mut const_evaluator = ConstEvaluator::new(execution_props); + let info = SimplifyContext::new(plan.all_schemas(), execution_props); let new_inputs = plan .inputs() @@ -168,15 +217,8 @@ impl OptimizerRule for SimplifyExpressions { // Constant folding should not change expression name. let name = &e.name(plan.schema()); - // TODO iterate until no changes are made - // during rewrite (evaluating constants can - // enable new simplifications and - // simplifications can enable new constant - // evaluation) - let new_e = e - // fold constants and then simplify - .rewrite(&mut const_evaluator)? - .rewrite(&mut simplifier)?; + // Apply the actual simplification logic + let new_e = e.simplify(&info)?; let new_name = &new_e.name(plan.schema()); @@ -389,52 +431,23 @@ impl<'a> ConstEvaluator<'a> { /// * `false = true` and `true = false` to `false` /// * `!!expr` to `expr` /// * `expr = null` and `expr != null` to `null` -pub(crate) struct Simplifier<'a> { - /// input schemas - schemas: Vec<&'a DFSchemaRef>, +pub(crate) struct Simplifier<'a, S> { + info: &'a S, } -impl<'a> Simplifier<'a> { - pub fn new(schemas: Vec<&'a DFSchemaRef>) -> Self { - Self { schemas } - } - - fn is_boolean_type(&self, expr: &Expr) -> bool { - for schema in &self.schemas { - if let Ok(DataType::Boolean) = expr.get_type(schema) { - return true; - } - } - - false - } - - /// Returns true if expr is nullable - fn nullable(&self, expr: &Expr) -> Result { - self.schemas - .iter() - .find_map(|schema| { - // expr may be from another input, so ignore errors - // by converting to None to keep trying - expr.nullable(schema.as_ref()).ok() - }) - .ok_or_else(|| { - // This means we weren't able to compute `Expr::nullable` with - // *any* input schemas, signalling a problem - DataFusionError::Internal(format!( - "Could not find find columns in '{}' during simplify", - expr - )) - }) +impl<'a, S> Simplifier<'a, S> { + pub fn new(info: &'a S) -> Self { + Self { info } } } -impl<'a> ExprRewriter for Simplifier<'a> { +impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> { /// rewrite the expression simplifying any constant expressions fn mutate(&mut self, expr: Expr) -> Result { use Expr::*; use Operator::{And, Divide, Eq, Multiply, NotEq, Or}; + let info = self.info; let new_expr = match expr { // // Rules for Eq @@ -447,7 +460,7 @@ impl<'a> ExprRewriter for Simplifier<'a> { left, op: Eq, right, - } if is_bool_lit(&left) && self.is_boolean_type(&right) => { + } if is_bool_lit(&left) && info.is_boolean_type(&right)? => { match as_bool_lit(*left) { Some(true) => *right, Some(false) => Not(right), @@ -461,7 +474,7 @@ impl<'a> ExprRewriter for Simplifier<'a> { left, op: Eq, right, - } if is_bool_lit(&right) && self.is_boolean_type(&left) => { + } if is_bool_lit(&right) && info.is_boolean_type(&left)? => { match as_bool_lit(*right) { Some(true) => *left, Some(false) => Not(left), @@ -480,7 +493,7 @@ impl<'a> ExprRewriter for Simplifier<'a> { left, op: NotEq, right, - } if is_bool_lit(&left) && self.is_boolean_type(&right) => { + } if is_bool_lit(&left) && info.is_boolean_type(&right)? => { match as_bool_lit(*left) { Some(true) => Not(right), Some(false) => *right, @@ -494,7 +507,7 @@ impl<'a> ExprRewriter for Simplifier<'a> { left, op: NotEq, right, - } if is_bool_lit(&right) && self.is_boolean_type(&left) => { + } if is_bool_lit(&right) && info.is_boolean_type(&left)? => { match as_bool_lit(*right) { Some(true) => Not(left), Some(false) => *left, @@ -547,13 +560,13 @@ impl<'a> ExprRewriter for Simplifier<'a> { left, op: Or, right, - } if !self.nullable(&right)? && is_op_with(And, &right, &left) => *left, + } if !info.nullable(&right)? && is_op_with(And, &right, &left) => *left, // (A AND B) OR A --> A (if B not null) BinaryExpr { left, op: Or, right, - } if !self.nullable(&left)? && is_op_with(And, &left, &right) => *right, + } if !info.nullable(&left)? && is_op_with(And, &left, &right) => *right, // // Rules for AND @@ -600,13 +613,13 @@ impl<'a> ExprRewriter for Simplifier<'a> { left, op: And, right, - } if !self.nullable(&right)? && is_op_with(Or, &right, &left) => *left, + } if !info.nullable(&right)? && is_op_with(Or, &right, &left) => *left, // (A OR B) AND A --> A (if B not null) BinaryExpr { left, op: And, right, - } if !self.nullable(&left)? && is_op_with(Or, &left, &right) => *right, + } if !info.nullable(&left)? && is_op_with(Or, &left, &right) => *right, // // Rules for Multiply @@ -643,7 +656,7 @@ impl<'a> ExprRewriter for Simplifier<'a> { left, op: Divide, right, - } if !self.nullable(&left)? && left == right => lit(1), + } if !info.nullable(&left)? && left == right => lit(1), // // Rules for Not @@ -676,7 +689,7 @@ impl<'a> ExprRewriter for Simplifier<'a> { else_expr, } if !when_then_expr.is_empty() && when_then_expr.len() < 3 // The rewrite is O(n!) so limit to small number - && self.is_boolean_type(&when_then_expr[0].1) => + && info.is_boolean_type(&when_then_expr[0].1)? => { // The disjunction of all the when predicates encountered so far let mut filter_expr = lit(false); @@ -1208,17 +1221,9 @@ mod tests { fn simplify(expr: Expr) -> Expr { let schema = expr_test_schema(); - let mut rewriter = Simplifier::new(vec![&schema]); - let execution_props = ExecutionProps::new(); - let mut const_evaluator = ConstEvaluator::new(&execution_props); - - expr.rewrite(&mut rewriter) - .expect("expected to simplify") - .rewrite(&mut const_evaluator) - .expect("expected to const evaluate") - .rewrite(&mut rewriter) - .expect("expected to simplify") + let info = SimplifyContext::new(vec![&schema], &execution_props); + expr.simplify(&info).unwrap() } fn expr_test_schema() -> DFSchemaRef { @@ -1357,30 +1362,36 @@ mod tests { // CASE WHERE c2 THEN true ELSE c2 // --> // c2 + // + // Need to call simplify 2x due to + // https://github.com/apache/arrow-datafusion/issues/1160 assert_eq!( - simplify(Expr::Case { + simplify(simplify(Expr::Case { expr: None, when_then_expr: vec![( Box::new(col("c2").not_eq(lit(false))), Box::new(lit("ok").eq(lit("ok"))), )], else_expr: Some(Box::new(col("c2").eq(lit(true)))), - }), + })), col("c2").or(col("c2").not().and(col("c2"))) // #1716 ); // CASE WHERE ISNULL(c2) THEN true ELSE c2 // --> // ISNULL(c2) OR c2 + // + // Need to call simplify 2x due to + // https://github.com/apache/arrow-datafusion/issues/1160 assert_eq!( - simplify(Expr::Case { + simplify(simplify(Expr::Case { expr: None, when_then_expr: vec![( Box::new(col("c2").is_null()), Box::new(lit(true)), )], else_expr: Some(Box::new(col("c2"))), - }), + })), col("c2") .is_null() .or(col("c2").is_null().not().and(col("c2"))) @@ -1390,15 +1401,18 @@ mod tests { // --> c1 OR (NOT(c1) AND c2 AND FALSE) OR (NOT(c1 OR c2) AND TRUE) // --> c1 OR (NOT(c1 OR c2)) // --> NOT(c1) AND c2 + // + // Need to call simplify 2x due to + // https://github.com/apache/arrow-datafusion/issues/1160 assert_eq!( - simplify(Expr::Case { + simplify(simplify(Expr::Case { expr: None, when_then_expr: vec![ (Box::new(col("c1")), Box::new(lit(true)),), (Box::new(col("c2")), Box::new(lit(false)),) ], else_expr: Some(Box::new(lit(true))), - }), + })), col("c1").or(col("c1").or(col("c2")).not()) ); @@ -1406,15 +1420,18 @@ mod tests { // --> c1 OR (NOT(c1) AND c2 AND TRUE) OR (NOT(c1 OR c2) AND FALSE) // --> c1 OR (NOT(c1) AND c2) // --> c1 OR c2 + // + // Need to call simplify 2x due to + // https://github.com/apache/arrow-datafusion/issues/1160 assert_eq!( - simplify(Expr::Case { + simplify(simplify(Expr::Case { expr: None, when_then_expr: vec![ (Box::new(col("c1")), Box::new(lit(true)),), (Box::new(col("c2")), Box::new(lit(false)),) ], else_expr: Some(Box::new(lit(true))), - }), + })), col("c1").or(col("c1").or(col("c2")).not()) ); } diff --git a/datafusion/tests/simplification.rs b/datafusion/tests/simplification.rs new file mode 100644 index 000000000000..5edf43f5ccb2 --- /dev/null +++ b/datafusion/tests/simplification.rs @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This program demonstrates the DataFusion expression simplification API. + +use arrow::datatypes::{DataType, Field, Schema}; +use datafusion::{ + error::Result, + execution::context::ExecutionProps, + logical_plan::{DFSchema, Expr, SimplifyInfo}, + prelude::*, +}; + +/// In order to simplify expressions, DataFusion must have information +/// about the expressions. +/// +/// You can provide that information using DataFusion [DFSchema] +/// objects or from some other implemention +struct MyInfo { + /// The input schema + schema: DFSchema, + + /// Execution specific details needed for constant evaluation such + /// as the current time for `now()` and [VariableProviders] + execution_props: ExecutionProps, +} + +impl SimplifyInfo for MyInfo { + fn is_boolean_type(&self, expr: &Expr) -> Result { + Ok(matches!(expr.get_type(&self.schema)?, DataType::Boolean)) + } + + fn nullable(&self, expr: &Expr) -> Result { + expr.nullable(&self.schema) + } + + fn execution_props(&self) -> &ExecutionProps { + &self.execution_props + } +} + +impl From for MyInfo { + fn from(schema: DFSchema) -> Self { + Self { + schema, + execution_props: ExecutionProps::new(), + } + } +} + +/// A schema like: +/// +/// a: Int32 (possibly with nulls) +/// b: Int32 +/// s: Utf8 +fn schema() -> DFSchema { + Schema::new(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Int32, false), + Field::new("s", DataType::Utf8, false), + ]) + .try_into() + .unwrap() +} + +#[test] +fn basic() { + let info: MyInfo = schema().into(); + + // The `Expr` is a core concept in DataFusion, and DataFusion can + // help simplify it. + + // For example 'a < (2 + 3)' can be rewritten into the easier to + // optimize form `a < 5` automatically + let expr = col("a").lt(lit(2i32) + lit(3i32)); + + let simplified = expr.simplify(&info).unwrap(); + assert_eq!(simplified, col("a").lt(lit(5i32))); +} + +#[test] +fn fold_and_simplify() { + let info: MyInfo = schema().into(); + + // What will it do with the expression `concat('foo', 'bar') == 'foobar')`? + let expr = concat(&[lit("foo"), lit("bar")]).eq(lit("foobar")); + + // Since datafusion applies both simplification *and* rewriting + // some expressions can be entirely simplified + let simplified = expr.simplify(&info).unwrap(); + assert_eq!(simplified, lit(true)) +}