From bcd2bd1d0436e62e12d08b6f31391e5fa006a403 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 2 Mar 2024 20:47:30 -0500 Subject: [PATCH 01/17] Move date_part, date_trunc, date_bin functions to datafusion-functions --- datafusion-cli/Cargo.lock | 3 + datafusion/expr/src/built_in_function.rs | 143 +- datafusion/expr/src/expr_fn.rs | 6 - datafusion/functions/src/datetime/date_bin.rs | 765 +++++++++ .../functions/src/datetime/date_part.rs | 269 +++ .../functions/src/datetime/date_trunc.rs | 825 +++++++++ datafusion/functions/src/datetime/mod.rs | 35 +- datafusion/physical-expr/Cargo.toml | 1 + .../physical-expr/src/datetime_expressions.rs | 1496 +---------------- .../src/equivalence/projection.rs | 59 +- datafusion/physical-expr/src/functions.rs | 3 - datafusion/proto/proto/datafusion.proto | 6 +- datafusion/proto/src/generated/pbjson.rs | 9 - datafusion/proto/src/generated/prost.rs | 12 +- .../proto/src/logical_plan/from_proto.rs | 20 +- datafusion/proto/src/logical_plan/to_proto.rs | 3 - datafusion/sql/Cargo.toml | 2 + datafusion/sql/src/expr/mod.rs | 9 +- datafusion/sql/tests/sql_integration.rs | 63 +- 19 files changed, 2017 insertions(+), 1712 deletions(-) create mode 100644 datafusion/functions/src/datetime/date_bin.rs create mode 100644 datafusion/functions/src/datetime/date_part.rs create mode 100644 datafusion/functions/src/datetime/date_trunc.rs diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 2379a30ce10f..8ef6c6acbbcc 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1288,6 +1288,7 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "half", "hashbrown 0.14.3", "hex", @@ -1340,7 +1341,9 @@ dependencies = [ "arrow", "arrow-schema", "datafusion-common", + "datafusion-execution", "datafusion-expr", + "datafusion-functions", "log", "sqlparser", ] diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 
91e3acd0f7bb..51e2ee79bc21 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -30,7 +30,7 @@ use crate::{ conditional_expressions, FuncMonotonicity, Signature, TypeSignature, Volatility, }; -use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit}; +use arrow::datatypes::{DataType, Field, Fields, TimeUnit}; use datafusion_common::{exec_err, plan_err, DataFusionError, Result}; use strum::IntoEnumIterator; @@ -200,12 +200,6 @@ pub enum BuiltinScalarFunction { Concat, /// concat_ws ConcatWithSeparator, - /// date_part - DatePart, - /// date_trunc - DateTrunc, - /// date_bin - DateBin, /// ends_with EndsWith, /// initcap @@ -418,9 +412,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Chr => Volatility::Immutable, BuiltinScalarFunction::Concat => Volatility::Immutable, BuiltinScalarFunction::ConcatWithSeparator => Volatility::Immutable, - BuiltinScalarFunction::DatePart => Volatility::Immutable, - BuiltinScalarFunction::DateTrunc => Volatility::Immutable, - BuiltinScalarFunction::DateBin => Volatility::Immutable, BuiltinScalarFunction::EndsWith => Volatility::Immutable, BuiltinScalarFunction::InitCap => Volatility::Immutable, BuiltinScalarFunction::InStr => Volatility::Immutable, @@ -655,27 +646,6 @@ impl BuiltinScalarFunction { } BuiltinScalarFunction::Concat => Ok(Utf8), BuiltinScalarFunction::ConcatWithSeparator => Ok(Utf8), - BuiltinScalarFunction::DatePart => Ok(Float64), - BuiltinScalarFunction::DateBin | BuiltinScalarFunction::DateTrunc => { - match &input_expr_types[1] { - Timestamp(Nanosecond, None) | Utf8 | Null => { - Ok(Timestamp(Nanosecond, None)) - } - Timestamp(Nanosecond, tz_opt) => { - Ok(Timestamp(Nanosecond, tz_opt.clone())) - } - Timestamp(Microsecond, tz_opt) => { - Ok(Timestamp(Microsecond, tz_opt.clone())) - } - Timestamp(Millisecond, tz_opt) => { - Ok(Timestamp(Millisecond, tz_opt.clone())) - } - Timestamp(Second, tz_opt) => Ok(Timestamp(Second, tz_opt.clone())), - _ => 
plan_err!( - "The {self} function can only accept timestamp as the second arg." - ), - } - } BuiltinScalarFunction::InitCap => { utf8_to_str_type(&input_expr_types[0], "initcap") } @@ -862,7 +832,6 @@ impl BuiltinScalarFunction { /// Return the argument [`Signature`] supported by this function pub fn signature(&self) -> Signature { use DataType::*; - use IntervalUnit::*; use TimeUnit::*; use TypeSignature::*; // note: the physical expression must accept the type returned by this function or the execution panics. @@ -1040,108 +1009,6 @@ impl BuiltinScalarFunction { ], self.volatility(), ), - BuiltinScalarFunction::DateTrunc => Signature::one_of( - vec![ - Exact(vec![Utf8, Timestamp(Nanosecond, None)]), - Exact(vec![ - Utf8, - Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![Utf8, Timestamp(Microsecond, None)]), - Exact(vec![ - Utf8, - Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![Utf8, Timestamp(Millisecond, None)]), - Exact(vec![ - Utf8, - Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![Utf8, Timestamp(Second, None)]), - Exact(vec![ - Utf8, - Timestamp(Second, Some(TIMEZONE_WILDCARD.into())), - ]), - ], - self.volatility(), - ), - BuiltinScalarFunction::DateBin => { - let base_sig = |array_type: TimeUnit| { - vec![ - Exact(vec![ - Interval(MonthDayNano), - Timestamp(array_type.clone(), None), - Timestamp(Nanosecond, None), - ]), - Exact(vec![ - Interval(MonthDayNano), - Timestamp(array_type.clone(), Some(TIMEZONE_WILDCARD.into())), - Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![ - Interval(DayTime), - Timestamp(array_type.clone(), None), - Timestamp(Nanosecond, None), - ]), - Exact(vec![ - Interval(DayTime), - Timestamp(array_type.clone(), Some(TIMEZONE_WILDCARD.into())), - Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![ - Interval(MonthDayNano), - Timestamp(array_type.clone(), None), - ]), - Exact(vec![ - Interval(MonthDayNano), 
- Timestamp(array_type.clone(), Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![ - Interval(DayTime), - Timestamp(array_type.clone(), None), - ]), - Exact(vec![ - Interval(DayTime), - Timestamp(array_type, Some(TIMEZONE_WILDCARD.into())), - ]), - ] - }; - - let full_sig = [Nanosecond, Microsecond, Millisecond, Second] - .into_iter() - .map(base_sig) - .collect::>() - .concat(); - - Signature::one_of(full_sig, self.volatility()) - } - BuiltinScalarFunction::DatePart => Signature::one_of( - vec![ - Exact(vec![Utf8, Timestamp(Nanosecond, None)]), - Exact(vec![ - Utf8, - Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![Utf8, Timestamp(Millisecond, None)]), - Exact(vec![ - Utf8, - Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![Utf8, Timestamp(Microsecond, None)]), - Exact(vec![ - Utf8, - Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![Utf8, Timestamp(Second, None)]), - Exact(vec![ - Utf8, - Timestamp(Second, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![Utf8, Date64]), - Exact(vec![Utf8, Date32]), - ], - self.volatility(), - ), BuiltinScalarFunction::SplitPart => Signature::one_of( vec![ Exact(vec![Utf8, Utf8, Int64]), @@ -1351,11 +1218,6 @@ impl BuiltinScalarFunction { | BuiltinScalarFunction::Pi ) { Some(vec![Some(true)]) - } else if matches!( - &self, - BuiltinScalarFunction::DateTrunc | BuiltinScalarFunction::DateBin - ) { - Some(vec![None, Some(true)]) } else if *self == BuiltinScalarFunction::Log { Some(vec![Some(true), Some(false)]) } else { @@ -1453,9 +1315,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::CurrentDate => &["current_date", "today"], BuiltinScalarFunction::CurrentTime => &["current_time"], BuiltinScalarFunction::MakeDate => &["make_date"], - BuiltinScalarFunction::DateBin => &["date_bin"], - BuiltinScalarFunction::DateTrunc => &["date_trunc", "datetrunc"], - BuiltinScalarFunction::DatePart => &["date_part", "datepart"], BuiltinScalarFunction::ToChar => 
&["to_char", "date_format"], BuiltinScalarFunction::FromUnixtime => &["from_unixtime"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 157b8b0989df..585679d50706 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -872,9 +872,6 @@ nary_scalar_expr!( ); // date functions -scalar_expr!(DatePart, date_part, part date, "extracts a subfield from the date"); -scalar_expr!(DateTrunc, date_trunc, part date, "truncates the date to a specified level of precision"); -scalar_expr!(DateBin, date_bin, stride source origin, "coerces an arbitrary timestamp to the start of the nearest specified interval"); scalar_expr!( ToChar, to_char, @@ -1380,9 +1377,6 @@ mod test { test_scalar_expr!(Trim, trim, string); test_scalar_expr!(Upper, upper, string); - test_scalar_expr!(DatePart, date_part, part, date); - test_scalar_expr!(DateTrunc, date_trunc, part, date); - test_scalar_expr!(DateBin, date_bin, stride, source, origin); test_scalar_expr!(FromUnixtime, from_unixtime, unixtime); test_scalar_expr!(ArrayAppend, array_append, array, element); diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs new file mode 100644 index 000000000000..69297a92aa8f --- /dev/null +++ b/datafusion/functions/src/datetime/date_bin.rs @@ -0,0 +1,765 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::DataType::{Null, Timestamp, Utf8}; +use arrow::datatypes::IntervalUnit::{DayTime, MonthDayNano}; +use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; +use arrow::datatypes::{DataType, TimeUnit}; +use arrow_array::temporal_conversions::NANOSECONDS; +use arrow_array::types::{ + ArrowTimestampType, IntervalDayTimeType, IntervalMonthDayNanoType, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, +}; +use arrow_array::{ArrayRef, PrimitiveArray}; +use chrono::{DateTime, Datelike, Duration, Months, NaiveDateTime, Utc}; + +use datafusion_common::cast::as_primitive_array; +use datafusion_common::{exec_err, not_impl_err, plan_err, Result, ScalarValue}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ + ColumnarValue, FuncMonotonicity, ScalarUDFImpl, Signature, Volatility, + TIMEZONE_WILDCARD, +}; + +#[derive(Debug)] +pub(super) struct DateBinFunc { + signature: Signature, +} + +impl DateBinFunc { + pub fn new() -> Self { + let base_sig = |array_type: TimeUnit| { + vec![ + Exact(vec![ + DataType::Interval(MonthDayNano), + Timestamp(array_type.clone(), None), + Timestamp(Nanosecond, None), + ]), + Exact(vec![ + DataType::Interval(MonthDayNano), + Timestamp(array_type.clone(), Some(TIMEZONE_WILDCARD.into())), + Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![ + DataType::Interval(DayTime), + Timestamp(array_type.clone(), None), + Timestamp(Nanosecond, None), + ]), + 
Exact(vec![ + DataType::Interval(DayTime), + Timestamp(array_type.clone(), Some(TIMEZONE_WILDCARD.into())), + Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![ + DataType::Interval(MonthDayNano), + Timestamp(array_type.clone(), None), + ]), + Exact(vec![ + DataType::Interval(MonthDayNano), + Timestamp(array_type.clone(), Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![ + DataType::Interval(DayTime), + Timestamp(array_type.clone(), None), + ]), + Exact(vec![ + DataType::Interval(DayTime), + Timestamp(array_type, Some(TIMEZONE_WILDCARD.into())), + ]), + ] + }; + + let full_sig = [Nanosecond, Microsecond, Millisecond, Second] + .into_iter() + .map(base_sig) + .collect::>() + .concat(); + + Self { + signature: Signature::one_of(full_sig, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for DateBinFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "date_bin" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + match &arg_types[1] { + Timestamp(Nanosecond, None) | Utf8 | Null => Ok(Timestamp(Nanosecond, None)), + Timestamp(Nanosecond, tz_opt) => Ok(Timestamp(Nanosecond, tz_opt.clone())), + Timestamp(Microsecond, tz_opt) => Ok(Timestamp(Microsecond, tz_opt.clone())), + Timestamp(Millisecond, tz_opt) => Ok(Timestamp(Millisecond, tz_opt.clone())), + Timestamp(Second, tz_opt) => Ok(Timestamp(Second, tz_opt.clone())), + _ => plan_err!( + "The date_bin function can only accept timestamp as the second arg." 
+ ), + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.len() == 2 { + // Default to unix EPOCH + let origin = ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( + Some(0), + Some("+00:00".into()), + )); + date_bin_impl(&args[0], &args[1], &origin) + } else if args.len() == 3 { + date_bin_impl(&args[0], &args[1], &args[2]) + } else { + exec_err!("DATE_BIN expected two or three arguments") + } + } + + fn monotonicity(&self) -> Result> { + Ok(Some(vec![None, Some(true)])) + } +} + +enum Interval { + Nanoseconds(i64), + Months(i64), +} + +impl Interval { + /// Returns (`stride_nanos`, `fn`) where + /// + /// 1. `stride_nanos` is a width, in nanoseconds + /// 2. `fn` is a function that takes (stride_nanos, source, origin) + /// + /// `source` is the timestamp being binned + /// + /// `origin` is the time, in nanoseconds, where windows are measured from + fn bin_fn(&self) -> (i64, fn(i64, i64, i64) -> i64) { + match self { + Interval::Nanoseconds(nanos) => (*nanos, date_bin_nanos_interval), + Interval::Months(months) => (*months, date_bin_months_interval), + } + } +} + +// return time in nanoseconds that the source timestamp falls into based on the stride and origin +fn date_bin_nanos_interval(stride_nanos: i64, source: i64, origin: i64) -> i64 { + let time_diff = source - origin; + + // distance from origin to bin + let time_delta = compute_distance(time_diff, stride_nanos); + + origin + time_delta +} + +// distance from origin to bin +fn compute_distance(time_diff: i64, stride: i64) -> i64 { + let time_delta = time_diff - (time_diff % stride); + + if time_diff < 0 && stride > 1 { + // The origin is later than the source timestamp, round down to the previous bin + time_delta - stride + } else { + time_delta + } +} + +// return time in nanoseconds that the source timestamp falls into based on the stride and origin +fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> i64 { + // convert source and origin to DateTime + 
let source_date = to_utc_date_time(source); + let origin_date = to_utc_date_time(origin); + + // calculate the number of months between the source and origin + let month_diff = (source_date.year() - origin_date.year()) * 12 + + source_date.month() as i32 + - origin_date.month() as i32; + + // distance from origin to bin + let month_delta = compute_distance(month_diff as i64, stride_months); + + let mut bin_time = if month_delta < 0 { + origin_date - Months::new(month_delta.unsigned_abs() as u32) + } else { + origin_date + Months::new(month_delta as u32) + }; + + // If origin is not midnight of first date of the month, the bin_time may be larger than the source + // In this case, we need to move back to previous bin + if bin_time > source_date { + let month_delta = month_delta - stride_months; + bin_time = if month_delta < 0 { + origin_date - Months::new(month_delta.unsigned_abs() as u32) + } else { + origin_date + Months::new(month_delta as u32) + }; + } + + bin_time.timestamp_nanos_opt().unwrap() +} + +fn to_utc_date_time(nanos: i64) -> DateTime { + let secs = nanos / 1_000_000_000; + let nsec = (nanos % 1_000_000_000) as u32; + let date = NaiveDateTime::from_timestamp_opt(secs, nsec).unwrap(); + DateTime::::from_naive_utc_and_offset(date, Utc) +} + +// Supported intervals: +// 1. IntervalDayTime: this means that the stride is in days, hours, minutes, seconds and milliseconds +// We will assume month interval won't be converted into this type +// TODO (my next PR): without `INTERVAL` keyword, the stride was converted into ScalarValue::IntervalDayTime somewhere +// for month interval. I need to find that and make it ScalarValue::IntervalMonthDayNano instead +// 2. 
IntervalMonthDayNano +fn date_bin_impl( + stride: &ColumnarValue, + array: &ColumnarValue, + origin: &ColumnarValue, +) -> Result { + let stride = match stride { + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(v))) => { + let (days, ms) = IntervalDayTimeType::to_parts(*v); + let nanos = (Duration::days(days as i64) + Duration::milliseconds(ms as i64)) + .num_nanoseconds(); + + match nanos { + Some(v) => Interval::Nanoseconds(v), + _ => return exec_err!("DATE_BIN stride argument is too large"), + } + } + ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(v))) => { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(*v); + + // If interval is months, its origin must be midnight of first date of the month + if months != 0 { + // Return error if days or nanos is not zero + if days != 0 || nanos != 0 { + return not_impl_err!( + "DATE_BIN stride does not support combination of month, day and nanosecond intervals" + ); + } else { + Interval::Months(months as i64) + } + } else { + let nanos = (Duration::days(days as i64) + Duration::nanoseconds(nanos)) + .num_nanoseconds(); + match nanos { + Some(v) => Interval::Nanoseconds(v), + _ => return exec_err!("DATE_BIN stride argument is too large"), + } + } + } + ColumnarValue::Scalar(v) => { + return exec_err!( + "DATE_BIN expects stride argument to be an INTERVAL but got {}", + v.data_type() + ); + } + ColumnarValue::Array(_) => { + return not_impl_err!( + "DATE_BIN only supports literal values for the stride argument, not arrays" + ); + } + }; + + let origin = match origin { + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(v), _)) => *v, + ColumnarValue::Scalar(v) => { + return exec_err!( + "DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision but got {}", + v.data_type() + ); + } + ColumnarValue::Array(_) => { + return not_impl_err!( + "DATE_BIN only supports literal values for the origin argument, not arrays" + ); + } + }; + + let (stride, stride_fn) = 
stride.bin_fn(); + + // Return error if stride is 0 + if stride == 0 { + return exec_err!("DATE_BIN stride must be non-zero"); + } + + fn stride_map_fn( + origin: i64, + stride: i64, + stride_fn: fn(i64, i64, i64) -> i64, + ) -> impl Fn(Option) -> Option { + let scale = match T::UNIT { + Nanosecond => 1, + Microsecond => NANOSECONDS / 1_000_000, + Millisecond => NANOSECONDS / 1_000, + Second => NANOSECONDS, + }; + move |x: Option| x.map(|x| stride_fn(stride, x * scale, origin) / scale) + } + + Ok(match array { + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(v, tz_opt)) => { + let apply_stride_fn = + stride_map_fn::(origin, stride, stride_fn); + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( + apply_stride_fn(*v), + tz_opt.clone(), + )) + } + ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(v, tz_opt)) => { + let apply_stride_fn = + stride_map_fn::(origin, stride, stride_fn); + ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond( + apply_stride_fn(*v), + tz_opt.clone(), + )) + } + ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(v, tz_opt)) => { + let apply_stride_fn = + stride_map_fn::(origin, stride, stride_fn); + ColumnarValue::Scalar(ScalarValue::TimestampMillisecond( + apply_stride_fn(*v), + tz_opt.clone(), + )) + } + ColumnarValue::Scalar(ScalarValue::TimestampSecond(v, tz_opt)) => { + let apply_stride_fn = + stride_map_fn::(origin, stride, stride_fn); + ColumnarValue::Scalar(ScalarValue::TimestampSecond( + apply_stride_fn(*v), + tz_opt.clone(), + )) + } + + ColumnarValue::Array(array) => { + fn transform_array_with_stride( + origin: i64, + stride: i64, + stride_fn: fn(i64, i64, i64) -> i64, + array: &ArrayRef, + tz_opt: &Option>, + ) -> Result + where + T: ArrowTimestampType, + { + let array = as_primitive_array::(array)?; + let apply_stride_fn = stride_map_fn::(origin, stride, stride_fn); + let array = array + .iter() + .map(apply_stride_fn) + .collect::>() + .with_timezone_opt(tz_opt.clone()); + + 
Ok(ColumnarValue::Array(Arc::new(array))) + } + match array.data_type() { + Timestamp(Nanosecond, tz_opt) => { + transform_array_with_stride::( + origin, stride, stride_fn, array, tz_opt, + )? + } + Timestamp(Microsecond, tz_opt) => { + transform_array_with_stride::( + origin, stride, stride_fn, array, tz_opt, + )? + } + Timestamp(Millisecond, tz_opt) => { + transform_array_with_stride::( + origin, stride, stride_fn, array, tz_opt, + )? + } + Timestamp(Second, tz_opt) => { + transform_array_with_stride::( + origin, stride, stride_fn, array, tz_opt, + )? + } + _ => { + return exec_err!( + "DATE_BIN expects source argument to be a TIMESTAMP but got {}", + array.data_type() + ); + } + } + } + _ => { + return exec_err!( + "DATE_BIN expects source argument to be a TIMESTAMP scalar or array" + ); + } + }) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; + use arrow::datatypes::{DataType, TimeUnit}; + use arrow_array::types::TimestampNanosecondType; + use arrow_array::{IntervalDayTimeArray, TimestampNanosecondArray}; + + use datafusion_common::ScalarValue; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::datetime::date_bin::{date_bin_nanos_interval, DateBinFunc}; + + #[test] + fn test_date_bin() { + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert!(res.is_ok()); + + let timestamps = Arc::new((1..6).map(Some).collect::()); + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), + ColumnarValue::Array(timestamps), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert!(res.is_ok()); + + let res = DateBinFunc::new().invoke(&[ + 
ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert!(res.is_ok()); + + // stride supports month-day-nano + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert!(res.is_ok()); + + // + // Fallible test cases + // + + // invalid number of arguments + let res = DateBinFunc::new() + .invoke(&[ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1)))]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "Execution error: DATE_BIN expected two or three arguments" + ); + + // stride: invalid type + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalYearMonth(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "Execution error: DATE_BIN expects stride argument to be an INTERVAL but got Interval(YearMonth)" + ); + + // stride: invalid value + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(0))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "Execution error: DATE_BIN stride must be non-zero" + ); + + // stride: overflow of day-time interval + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(i64::MAX))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert_eq!( + 
res.err().unwrap().strip_backtrace(), + "Execution error: DATE_BIN stride argument is too large" + ); + + // stride: overflow of month-day-nano interval + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::new_interval_mdn(0, i32::MAX, 1)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "Execution error: DATE_BIN stride argument is too large" + ); + + // stride: month intervals + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::new_interval_mdn(1, 1, 1)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "This feature is not implemented: DATE_BIN stride does not support combination of month, day and nanosecond intervals" + ); + + // origin: invalid type + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(Some(1), None)), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "Execution error: DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision but got Timestamp(Microsecond, None)" + ); + + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert!(res.is_ok()); + + // unsupported array type for stride + let intervals = Arc::new((1..6).map(Some).collect::()); + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Array(intervals), + 
ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "This feature is not implemented: DATE_BIN only supports literal values for the stride argument, not arrays" + ); + + // unsupported array type for origin + let timestamps = Arc::new((1..6).map(Some).collect::()); + let res = DateBinFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Array(timestamps), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "This feature is not implemented: DATE_BIN only supports literal values for the origin argument, not arrays" + ); + } + + #[test] + fn test_date_bin_timezones() { + let cases = vec![ + ( + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + Some("+00".into()), + "1970-01-01T00:00:00Z", + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + None, + "1970-01-01T00:00:00Z", + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + Some("-02".into()), + "1970-01-01T00:00:00Z", + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:00:00+05", + "2020-09-08T01:00:00+05", + "2020-09-08T02:00:00+05", + 
"2020-09-08T03:00:00+05", + "2020-09-08T04:00:00+05", + ], + Some("+05".into()), + "1970-01-01T00:00:00+05", + vec![ + "2020-09-08T00:00:00+05", + "2020-09-08T00:00:00+05", + "2020-09-08T00:00:00+05", + "2020-09-08T00:00:00+05", + "2020-09-08T00:00:00+05", + ], + ), + ( + vec![ + "2020-09-08T00:00:00+08", + "2020-09-08T01:00:00+08", + "2020-09-08T02:00:00+08", + "2020-09-08T03:00:00+08", + "2020-09-08T04:00:00+08", + ], + Some("+08".into()), + "1970-01-01T00:00:00+08", + vec![ + "2020-09-08T00:00:00+08", + "2020-09-08T00:00:00+08", + "2020-09-08T00:00:00+08", + "2020-09-08T00:00:00+08", + "2020-09-08T00:00:00+08", + ], + ), + ]; + + cases + .iter() + .for_each(|(original, tz_opt, origin, expected)| { + let input = original + .iter() + .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) + .collect::() + .with_timezone_opt(tz_opt.clone()); + let right = expected + .iter() + .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) + .collect::() + .with_timezone_opt(tz_opt.clone()); + let result = DateBinFunc::new() + .invoke(&[ + ColumnarValue::Scalar(ScalarValue::new_interval_dt(1, 0)), + ColumnarValue::Array(Arc::new(input)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( + Some(string_to_timestamp_nanos(origin).unwrap()), + tz_opt.clone(), + )), + ]) + .unwrap(); + if let ColumnarValue::Array(result) = result { + assert_eq!( + result.data_type(), + &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone()) + ); + let left = arrow_array::cast::as_primitive_array::< + TimestampNanosecondType, + >(&result); + assert_eq!(left, &right); + } else { + panic!("unexpected column type"); + } + }); + } + + #[test] + fn test_date_bin_single() { + use chrono::Duration; + + let cases = vec![ + ( + ( + Duration::minutes(15), + "2004-04-09T02:03:04.123456789Z", + "2001-01-01T00:00:00", + ), + "2004-04-09T02:00:00Z", + ), + ( + ( + Duration::minutes(15), + "2004-04-09T02:03:04.123456789Z", + "2001-01-01T00:02:30", + ), + "2004-04-09T02:02:30Z", + ), + ( + ( + 
Duration::minutes(15), + "2004-04-09T02:03:04.123456789Z", + "2005-01-01T00:02:30", + ), + "2004-04-09T02:02:30Z", + ), + ( + ( + Duration::hours(1), + "2004-04-09T02:03:04.123456789Z", + "2001-01-01T00:00:00", + ), + "2004-04-09T02:00:00Z", + ), + ( + ( + Duration::seconds(10), + "2004-04-09T02:03:11.123456789Z", + "2001-01-01T00:00:00", + ), + "2004-04-09T02:03:10Z", + ), + ]; + + cases + .iter() + .for_each(|((stride, source, origin), expected)| { + let stride1 = stride.num_nanoseconds().unwrap(); + let source1 = string_to_timestamp_nanos(source).unwrap(); + let origin1 = string_to_timestamp_nanos(origin).unwrap(); + + let expected1 = string_to_timestamp_nanos(expected).unwrap(); + let result = date_bin_nanos_interval(stride1, source1, origin1); + assert_eq!(result, expected1, "{source} = {expected}"); + }) + } +} diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs new file mode 100644 index 000000000000..200dcff6d46e --- /dev/null +++ b/datafusion/functions/src/datetime/date_part.rs @@ -0,0 +1,269 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::sync::Arc; + +use arrow::compute::cast; +use arrow::compute::kernels::temporal; +use arrow::datatypes::DataType::{Date32, Date64, Float64, Timestamp, Utf8}; +use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; +use arrow::datatypes::{DataType, TimeUnit}; +use arrow_array::types::ArrowTemporalType; +use arrow_array::{Array, ArrayRef, ArrowNumericType, Float64Array, PrimitiveArray}; + +use datafusion_common::cast::{ + as_date32_array, as_date64_array, as_timestamp_microsecond_array, + as_timestamp_millisecond_array, as_timestamp_nanosecond_array, + as_timestamp_second_array, +}; +use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ + ColumnarValue, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD, +}; + +#[derive(Debug)] +pub(super) struct DatePartFunc { + signature: Signature, + aliases: Vec, +} + +impl DatePartFunc { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Timestamp(Nanosecond, None)]), + Exact(vec![ + Utf8, + Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![Utf8, Timestamp(Millisecond, None)]), + Exact(vec![ + Utf8, + Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![Utf8, Timestamp(Microsecond, None)]), + Exact(vec![ + Utf8, + Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![Utf8, Timestamp(Second, None)]), + Exact(vec![ + Utf8, + Timestamp(Second, Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![Utf8, Date64]), + Exact(vec![Utf8, Date32]), + ], + Volatility::Immutable, + ), + aliases: vec![String::from("datepart")], + } + } +} + +macro_rules! extract_date_part { + ($ARRAY: expr, $FN:expr) => { + match $ARRAY.data_type() { + DataType::Date32 => { + let array = as_date32_array($ARRAY)?; + Ok($FN(array) + .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) 
+ } + DataType::Date64 => { + let array = as_date64_array($ARRAY)?; + Ok($FN(array) + .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) + } + DataType::Timestamp(time_unit, _) => match time_unit { + TimeUnit::Second => { + let array = as_timestamp_second_array($ARRAY)?; + Ok($FN(array) + .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) + } + TimeUnit::Millisecond => { + let array = as_timestamp_millisecond_array($ARRAY)?; + Ok($FN(array) + .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) + } + TimeUnit::Microsecond => { + let array = as_timestamp_microsecond_array($ARRAY)?; + Ok($FN(array) + .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) + } + TimeUnit::Nanosecond => { + let array = as_timestamp_nanosecond_array($ARRAY)?; + Ok($FN(array) + .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) + } + }, + datatype => exec_err!("Extract does not support datatype {:?}", datatype), + } + }; +} + +impl ScalarUDFImpl for DatePartFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "date_part" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Float64) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.len() != 2 { + return exec_err!("Expected two arguments in DATE_PART"); + } + let (date_part, array) = (&args[0], &args[1]); + + let date_part = + if let ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) = date_part { + v + } else { + return exec_err!( + "First argument of `DATE_PART` must be non-null scalar Utf8" + ); + }; + + let is_scalar = matches!(array, ColumnarValue::Scalar(_)); + + let array = match array { + ColumnarValue::Array(array) => array.clone(), + ColumnarValue::Scalar(scalar) => scalar.to_array()?, + }; + + let arr = match date_part.to_lowercase().as_str() { + "year" => extract_date_part!(&array, temporal::year), + "quarter" => 
extract_date_part!(&array, temporal::quarter), + "month" => extract_date_part!(&array, temporal::month), + "week" => extract_date_part!(&array, temporal::week), + "day" => extract_date_part!(&array, temporal::day), + "doy" => extract_date_part!(&array, temporal::doy), + "dow" => extract_date_part!(&array, temporal::num_days_from_sunday), + "hour" => extract_date_part!(&array, temporal::hour), + "minute" => extract_date_part!(&array, temporal::minute), + "second" => extract_date_part!(&array, seconds), + "millisecond" => extract_date_part!(&array, millis), + "microsecond" => extract_date_part!(&array, micros), + "nanosecond" => extract_date_part!(&array, nanos), + "epoch" => extract_date_part!(&array, epoch), + _ => exec_err!("Date part '{date_part}' not supported"), + }?; + + Ok(if is_scalar { + ColumnarValue::Scalar(ScalarValue::try_from_array(&arr?, 0)?) + } else { + ColumnarValue::Array(arr?) + }) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +fn to_ticks(array: &PrimitiveArray, frac: i32) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + let zipped = temporal::second(array)? 
+ .values() + .iter() + .zip(temporal::nanosecond(array)?.values().iter()) + .map(|o| (*o.0 as f64 + (*o.1 as f64) / 1_000_000_000.0) * (frac as f64)) + .collect::>(); + + Ok(Float64Array::from(zipped)) +} + +fn seconds(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + to_ticks(array, 1) +} + +fn millis(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + to_ticks(array, 1_000) +} + +fn micros(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + to_ticks(array, 1_000_000) +} + +fn nanos(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + to_ticks(array, 1_000_000_000) +} + +fn epoch(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + let b = match array.data_type() { + Timestamp(tu, _) => { + let scale = match tu { + Second => 1, + Millisecond => 1_000, + Microsecond => 1_000_000, + Nanosecond => 1_000_000_000, + } as f64; + array.unary(|n| { + let n: i64 = n.into(); + n as f64 / scale + }) + } + Date32 => { + let seconds_in_a_day = 86400_f64; + array.unary(|n| { + let n: i64 = n.into(); + n as f64 * seconds_in_a_day + }) + } + Date64 => array.unary(|n| { + let n: i64 = n.into(); + n as f64 / 1_000_f64 + }), + _ => return exec_err!("Can not convert {:?} to epoch", array.data_type()), + }; + Ok(b) +} diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs new file mode 100644 index 000000000000..4ece175abfb2 --- /dev/null +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -0,0 +1,825 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::ops::{Add, Sub}; +use std::str::FromStr; +use std::sync::Arc; + +use arrow::datatypes::DataType::{Null, Timestamp, Utf8}; +use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; +use arrow::datatypes::{DataType, TimeUnit}; +use arrow_array::temporal_conversions::{ + as_datetime_with_timezone, timestamp_ns_to_datetime, +}; +use arrow_array::timezone::Tz; +use arrow_array::types::{ + ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, +}; +use arrow_array::{Array, PrimitiveArray}; +use chrono::{ + DateTime, Datelike, Duration, LocalResult, NaiveDateTime, Offset, Timelike, +}; + +use datafusion_common::cast::as_primitive_array; +use datafusion_common::{exec_err, plan_err, DataFusionError, Result, ScalarValue}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ + ColumnarValue, FuncMonotonicity, ScalarUDFImpl, Signature, Volatility, + TIMEZONE_WILDCARD, +}; + +#[derive(Debug)] +pub(super) struct DateTruncFunc { + signature: Signature, + aliases: Vec, +} + +impl DateTruncFunc { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Timestamp(Nanosecond, None)]), + Exact(vec![ + Utf8, + Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![Utf8, Timestamp(Microsecond, None)]), + Exact(vec![ + 
Utf8, + Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![Utf8, Timestamp(Millisecond, None)]), + Exact(vec![ + Utf8, + Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())), + ]), + Exact(vec![Utf8, Timestamp(Second, None)]), + Exact(vec![ + Utf8, + Timestamp(Second, Some(TIMEZONE_WILDCARD.into())), + ]), + ], + Volatility::Immutable, + ), + aliases: vec![String::from("datetrunc")], + } + } +} + +impl ScalarUDFImpl for DateTruncFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "date_trunc" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + match &arg_types[1] { + Timestamp(Nanosecond, None) | Utf8 | Null => Ok(Timestamp(Nanosecond, None)), + Timestamp(Nanosecond, tz_opt) => Ok(Timestamp(Nanosecond, tz_opt.clone())), + Timestamp(Microsecond, tz_opt) => Ok(Timestamp(Microsecond, tz_opt.clone())), + Timestamp(Millisecond, tz_opt) => Ok(Timestamp(Millisecond, tz_opt.clone())), + Timestamp(Second, tz_opt) => Ok(Timestamp(Second, tz_opt.clone())), + _ => plan_err!( + "The date_trunc function can only accept timestamp as the second arg." + ), + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let (granularity, array) = (&args[0], &args[1]); + + let granularity = if let ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) = + granularity + { + v.to_lowercase() + } else { + return exec_err!("Granularity of `date_trunc` must be non-null scalar Utf8"); + }; + + fn process_array( + array: &dyn Array, + granularity: String, + tz_opt: &Option>, + ) -> Result { + let parsed_tz = parse_tz(tz_opt)?; + let array = as_primitive_array::(array)?; + let array = array + .iter() + .map(|x| general_date_trunc(T::UNIT, &x, parsed_tz, granularity.as_str())) + .collect::>>()? 
+ .with_timezone_opt(tz_opt.clone()); + Ok(ColumnarValue::Array(Arc::new(array))) + } + + fn process_scalar( + v: &Option, + granularity: String, + tz_opt: &Option>, + ) -> Result { + let parsed_tz = parse_tz(tz_opt)?; + let value = general_date_trunc(T::UNIT, v, parsed_tz, granularity.as_str())?; + let value = ScalarValue::new_timestamp::(value, tz_opt.clone()); + Ok(ColumnarValue::Scalar(value)) + } + + Ok(match array { + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(v, tz_opt)) => { + process_scalar::(v, granularity, tz_opt)? + } + ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(v, tz_opt)) => { + process_scalar::(v, granularity, tz_opt)? + } + ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(v, tz_opt)) => { + process_scalar::(v, granularity, tz_opt)? + } + ColumnarValue::Scalar(ScalarValue::TimestampSecond(v, tz_opt)) => { + process_scalar::(v, granularity, tz_opt)? + } + ColumnarValue::Array(array) => { + let array_type = array.data_type(); + match array_type { + Timestamp(Second, tz_opt) => { + process_array::(array, granularity, tz_opt)? 
+ } + Timestamp(Millisecond, tz_opt) => process_array::< + TimestampMillisecondType, + >( + array, granularity, tz_opt + )?, + Timestamp(Microsecond, tz_opt) => process_array::< + TimestampMicrosecondType, + >( + array, granularity, tz_opt + )?, + Timestamp(Nanosecond, tz_opt) => process_array::< + TimestampNanosecondType, + >( + array, granularity, tz_opt + )?, + _ => process_array::( + array, + granularity, + &None, + )?, + } + } + _ => { + return exec_err!( + "second argument of `date_trunc` must be nanosecond timestamp scalar or array" + ); + } + }) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } + + fn monotonicity(&self) -> Result> { + Ok(Some(vec![None, Some(true)])) + } +} + +fn _date_trunc_coarse(granularity: &str, value: Option) -> Result> +where + T: Datelike + Timelike + Sub + Copy, +{ + let value = match granularity { + "millisecond" => value, + "microsecond" => value, + "second" => value.and_then(|d| d.with_nanosecond(0)), + "minute" => value + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)), + "hour" => value + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)), + "day" => value + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)), + "week" => value + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)) + .map(|d| d - Duration::seconds(60 * 60 * 24 * d.weekday() as i64)), + "month" => value + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)) + .and_then(|d| d.with_day0(0)), + "quarter" => value + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)) + .and_then(|d| d.with_day0(0)) + .and_then(|d| d.with_month(quarter_month(&d))), + 
"year" => value + .and_then(|d| d.with_nanosecond(0)) + .and_then(|d| d.with_second(0)) + .and_then(|d| d.with_minute(0)) + .and_then(|d| d.with_hour(0)) + .and_then(|d| d.with_day0(0)) + .and_then(|d| d.with_month0(0)), + unsupported => { + return exec_err!("Unsupported date_trunc granularity: {unsupported}"); + } + }; + Ok(value) +} + +fn quarter_month(date: &T) -> u32 +where + T: Datelike, +{ + 1 + 3 * ((date.month() - 1) / 3) +} + +fn _date_trunc_coarse_with_tz( + granularity: &str, + value: Option>, +) -> Result> { + if let Some(value) = value { + let local = value.naive_local(); + let truncated = _date_trunc_coarse::(granularity, Some(local))?; + let truncated = truncated.and_then(|truncated| { + match truncated.and_local_timezone(value.timezone()) { + LocalResult::None => { + // This can happen if the date_trunc operation moves the time into + // an hour that doesn't exist due to daylight savings. On known example where + // this can happen is with historic dates in the America/Sao_Paulo time zone. + // To account for this adjust the time by a few hours, convert to local time, + // and then adjust the time back. + truncated + .sub(Duration::hours(3)) + .and_local_timezone(value.timezone()) + .single() + .map(|v| v.add(Duration::hours(3))) + } + LocalResult::Single(datetime) => Some(datetime), + LocalResult::Ambiguous(datetime1, datetime2) => { + // Because we are truncating from an equally or more specific time + // the original time must have been within the ambiguous local time + // period. Therefore the offset of one of these times should match the + // offset of the original time. 
+ if datetime1.offset().fix() == value.offset().fix() { + Some(datetime1) + } else { + Some(datetime2) + } + } + } + }); + Ok(truncated.and_then(|value| value.timestamp_nanos_opt())) + } else { + _date_trunc_coarse::(granularity, None)?; + Ok(None) + } +} + +fn _date_trunc_coarse_without_tz( + granularity: &str, + value: Option, +) -> Result> { + let value = _date_trunc_coarse::(granularity, value)?; + Ok(value.and_then(|value| value.timestamp_nanos_opt())) +} + +/// Truncates the single `value`, expressed in nanoseconds since the +/// epoch, for granularities greater than 1 second, in taking into +/// account that some granularities are not uniform durations of time +/// (e.g. months are not always the same lengths, leap seconds, etc) +fn date_trunc_coarse(granularity: &str, value: i64, tz: Option) -> Result { + let value = match tz { + Some(tz) => { + // Use chrono DateTime to clear the various fields because need to clear per timezone, + // and NaiveDateTime (ISO 8601) has no concept of timezones + let value = as_datetime_with_timezone::(value, tz) + .ok_or(DataFusionError::Execution(format!( + "Timestamp {value} out of range" + )))?; + _date_trunc_coarse_with_tz(granularity, Some(value)) + } + None => { + // Use chrono NaiveDateTime to clear the various fields, if we don't have a timezone. 
+ let value = timestamp_ns_to_datetime(value).ok_or_else(|| { + DataFusionError::Execution(format!("Timestamp {value} out of range")) + })?; + _date_trunc_coarse_without_tz(granularity, Some(value)) + } + }?; + + // `with_x(0)` are infallible because `0` are always a valid + Ok(value.unwrap()) +} + +// truncates a single value with the given timeunit to the specified granularity +fn general_date_trunc( + tu: TimeUnit, + value: &Option, + tz: Option, + granularity: &str, +) -> Result, DataFusionError> { + let scale = match tu { + Second => 1_000_000_000, + Millisecond => 1_000_000, + Microsecond => 1_000, + Nanosecond => 1, + }; + + let Some(value) = value else { + return Ok(None); + }; + + // convert to nanoseconds + let nano = date_trunc_coarse(granularity, scale * value, tz)?; + + let result = match tu { + Second => match granularity { + "minute" => Some(nano / 1_000_000_000 / 60 * 60), + _ => Some(nano / 1_000_000_000), + }, + Millisecond => match granularity { + "minute" => Some(nano / 1_000_000 / 1_000 / 60 * 1_000 * 60), + "second" => Some(nano / 1_000_000 / 1_000 * 1_000), + _ => Some(nano / 1_000_000), + }, + Microsecond => match granularity { + "minute" => Some(nano / 1_000 / 1_000_000 / 60 * 60 * 1_000_000), + "second" => Some(nano / 1_000 / 1_000_000 * 1_000_000), + "millisecond" => Some(nano / 1_000 / 1_000 * 1_000), + _ => Some(nano / 1_000), + }, + _ => match granularity { + "minute" => Some(nano / 1_000_000_000 / 60 * 1_000_000_000 * 60), + "second" => Some(nano / 1_000_000_000 * 1_000_000_000), + "millisecond" => Some(nano / 1_000_000 * 1_000_000), + "microsecond" => Some(nano / 1_000 * 1_000), + _ => Some(nano), + }, + }; + Ok(result) +} + +fn parse_tz(tz: &Option>) -> Result> { + tz.as_ref() + .map(|tz| { + Tz::from_str(tz).map_err(|op| { + DataFusionError::Execution(format!("failed on timezone {tz}: {:?}", op)) + }) + }) + .transpose() +} + +#[cfg(test)] +mod tests { + use crate::datetime::date_trunc::{date_trunc_coarse, DateTruncFunc}; + use 
arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; + use arrow::datatypes::{DataType, TimeUnit}; + use arrow_array::cast::as_primitive_array; + use arrow_array::types::TimestampNanosecondType; + use arrow_array::TimestampNanosecondArray; + use datafusion_common::ScalarValue; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + use std::sync::Arc; + + #[test] + fn date_trunc_test() { + let cases = vec![ + ( + "2020-09-08T13:42:29.190855Z", + "second", + "2020-09-08T13:42:29.000000Z", + ), + ( + "2020-09-08T13:42:29.190855Z", + "minute", + "2020-09-08T13:42:00.000000Z", + ), + ( + "2020-09-08T13:42:29.190855Z", + "hour", + "2020-09-08T13:00:00.000000Z", + ), + ( + "2020-09-08T13:42:29.190855Z", + "day", + "2020-09-08T00:00:00.000000Z", + ), + ( + "2020-09-08T13:42:29.190855Z", + "week", + "2020-09-07T00:00:00.000000Z", + ), + ( + "2020-09-08T13:42:29.190855Z", + "month", + "2020-09-01T00:00:00.000000Z", + ), + ( + "2020-09-08T13:42:29.190855Z", + "year", + "2020-01-01T00:00:00.000000Z", + ), + // week + ( + "2021-01-01T13:42:29.190855Z", + "week", + "2020-12-28T00:00:00.000000Z", + ), + ( + "2020-01-01T13:42:29.190855Z", + "week", + "2019-12-30T00:00:00.000000Z", + ), + // quarter + ( + "2020-01-01T13:42:29.190855Z", + "quarter", + "2020-01-01T00:00:00.000000Z", + ), + ( + "2020-02-01T13:42:29.190855Z", + "quarter", + "2020-01-01T00:00:00.000000Z", + ), + ( + "2020-03-01T13:42:29.190855Z", + "quarter", + "2020-01-01T00:00:00.000000Z", + ), + ( + "2020-04-01T13:42:29.190855Z", + "quarter", + "2020-04-01T00:00:00.000000Z", + ), + ( + "2020-08-01T13:42:29.190855Z", + "quarter", + "2020-07-01T00:00:00.000000Z", + ), + ( + "2020-11-01T13:42:29.190855Z", + "quarter", + "2020-10-01T00:00:00.000000Z", + ), + ( + "2020-12-01T13:42:29.190855Z", + "quarter", + "2020-10-01T00:00:00.000000Z", + ), + ]; + + cases.iter().for_each(|(original, granularity, expected)| { + let left = string_to_timestamp_nanos(original).unwrap(); + let right = 
string_to_timestamp_nanos(expected).unwrap(); + let result = date_trunc_coarse(granularity, left, None).unwrap(); + assert_eq!(result, right, "{original} = {expected}"); + }); + } + + #[test] + fn test_date_trunc_timezones() { + let cases = vec![ + ( + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + Some("+00".into()), + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + None, + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + "2020-09-08T00:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + Some("-02".into()), + vec![ + "2020-09-07T02:00:00Z", + "2020-09-07T02:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T02:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:00:00+05", + "2020-09-08T01:00:00+05", + "2020-09-08T02:00:00+05", + "2020-09-08T03:00:00+05", + "2020-09-08T04:00:00+05", + ], + Some("+05".into()), + vec![ + "2020-09-08T00:00:00+05", + "2020-09-08T00:00:00+05", + "2020-09-08T00:00:00+05", + "2020-09-08T00:00:00+05", + "2020-09-08T00:00:00+05", + ], + ), + ( + vec![ + "2020-09-08T00:00:00+08", + "2020-09-08T01:00:00+08", + "2020-09-08T02:00:00+08", + "2020-09-08T03:00:00+08", + "2020-09-08T04:00:00+08", + ], + Some("+08".into()), + vec![ + "2020-09-08T00:00:00+08", + "2020-09-08T00:00:00+08", + "2020-09-08T00:00:00+08", + "2020-09-08T00:00:00+08", + "2020-09-08T00:00:00+08", + ], + ), + ( + vec![ + "2024-10-26T23:00:00Z", + "2024-10-27T00:00:00Z", + "2024-10-27T01:00:00Z", + "2024-10-27T02:00:00Z", + ], + 
Some("Europe/Berlin".into()), + vec![ + "2024-10-27T00:00:00+02", + "2024-10-27T00:00:00+02", + "2024-10-27T00:00:00+02", + "2024-10-27T00:00:00+02", + ], + ), + ( + vec![ + "2018-02-18T00:00:00Z", + "2018-02-18T01:00:00Z", + "2018-02-18T02:00:00Z", + "2018-02-18T03:00:00Z", + "2018-11-04T01:00:00Z", + "2018-11-04T02:00:00Z", + "2018-11-04T03:00:00Z", + "2018-11-04T04:00:00Z", + ], + Some("America/Sao_Paulo".into()), + vec![ + "2018-02-17T00:00:00-02", + "2018-02-17T00:00:00-02", + "2018-02-17T00:00:00-02", + "2018-02-18T00:00:00-03", + "2018-11-03T00:00:00-03", + "2018-11-03T00:00:00-03", + "2018-11-04T01:00:00-02", + "2018-11-04T01:00:00-02", + ], + ), + ]; + + cases.iter().for_each(|(original, tz_opt, expected)| { + let input = original + .iter() + .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) + .collect::() + .with_timezone_opt(tz_opt.clone()); + let right = expected + .iter() + .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) + .collect::() + .with_timezone_opt(tz_opt.clone()); + let result = DateTruncFunc::new() + .invoke(&[ + ColumnarValue::Scalar(ScalarValue::from("day")), + ColumnarValue::Array(Arc::new(input)), + ]) + .unwrap(); + if let ColumnarValue::Array(result) = result { + assert_eq!( + result.data_type(), + &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone()) + ); + let left = as_primitive_array::(&result); + assert_eq!(left, &right); + } else { + panic!("unexpected column type"); + } + }); + } + + #[test] + fn test_date_trunc_hour_timezones() { + let cases = vec![ + ( + vec![ + "2020-09-08T00:30:00Z", + "2020-09-08T01:30:00Z", + "2020-09-08T02:30:00Z", + "2020-09-08T03:30:00Z", + "2020-09-08T04:30:00Z", + ], + Some("+00".into()), + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:30:00Z", + "2020-09-08T01:30:00Z", + "2020-09-08T02:30:00Z", + "2020-09-08T03:30:00Z", + "2020-09-08T04:30:00Z", + ], + None, + 
vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:30:00Z", + "2020-09-08T01:30:00Z", + "2020-09-08T02:30:00Z", + "2020-09-08T03:30:00Z", + "2020-09-08T04:30:00Z", + ], + Some("-02".into()), + vec![ + "2020-09-08T00:00:00Z", + "2020-09-08T01:00:00Z", + "2020-09-08T02:00:00Z", + "2020-09-08T03:00:00Z", + "2020-09-08T04:00:00Z", + ], + ), + ( + vec![ + "2020-09-08T00:30:00+05", + "2020-09-08T01:30:00+05", + "2020-09-08T02:30:00+05", + "2020-09-08T03:30:00+05", + "2020-09-08T04:30:00+05", + ], + Some("+05".into()), + vec![ + "2020-09-08T00:00:00+05", + "2020-09-08T01:00:00+05", + "2020-09-08T02:00:00+05", + "2020-09-08T03:00:00+05", + "2020-09-08T04:00:00+05", + ], + ), + ( + vec![ + "2020-09-08T00:30:00+08", + "2020-09-08T01:30:00+08", + "2020-09-08T02:30:00+08", + "2020-09-08T03:30:00+08", + "2020-09-08T04:30:00+08", + ], + Some("+08".into()), + vec![ + "2020-09-08T00:00:00+08", + "2020-09-08T01:00:00+08", + "2020-09-08T02:00:00+08", + "2020-09-08T03:00:00+08", + "2020-09-08T04:00:00+08", + ], + ), + ( + vec![ + "2024-10-26T23:30:00Z", + "2024-10-27T00:30:00Z", + "2024-10-27T01:30:00Z", + "2024-10-27T02:30:00Z", + ], + Some("Europe/Berlin".into()), + vec![ + "2024-10-27T01:00:00+02", + "2024-10-27T02:00:00+02", + "2024-10-27T02:00:00+01", + "2024-10-27T03:00:00+01", + ], + ), + ( + vec![ + "2018-02-18T00:30:00Z", + "2018-02-18T01:30:00Z", + "2018-02-18T02:30:00Z", + "2018-02-18T03:30:00Z", + "2018-11-04T01:00:00Z", + "2018-11-04T02:00:00Z", + "2018-11-04T03:00:00Z", + "2018-11-04T04:00:00Z", + ], + Some("America/Sao_Paulo".into()), + vec![ + "2018-02-17T22:00:00-02", + "2018-02-17T23:00:00-02", + "2018-02-17T23:00:00-03", + "2018-02-18T00:00:00-03", + "2018-11-03T22:00:00-03", + "2018-11-03T23:00:00-03", + "2018-11-04T01:00:00-02", + "2018-11-04T02:00:00-02", + ], + ), + ]; + + cases.iter().for_each(|(original, tz_opt, expected)| { + let input 
= original + .iter() + .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) + .collect::() + .with_timezone_opt(tz_opt.clone()); + let right = expected + .iter() + .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) + .collect::() + .with_timezone_opt(tz_opt.clone()); + let result = DateTruncFunc::new() + .invoke(&[ + ColumnarValue::Scalar(ScalarValue::from("hour")), + ColumnarValue::Array(Arc::new(input)), + ]) + .unwrap(); + if let ColumnarValue::Array(result) = result { + assert_eq!( + result.data_type(), + &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone()) + ); + let left = as_primitive_array::(&result); + assert_eq!(left, &right); + } else { + panic!("unexpected column type"); + } + }); + } +} diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index 233e8b2cdbb4..bf8406c47273 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -22,10 +22,16 @@ use std::sync::Arc; use datafusion_expr::ScalarUDF; mod common; +mod date_bin; +mod date_part; +mod date_trunc; mod to_date; mod to_timestamp; // create UDFs +make_udf_function!(date_bin::DateBinFunc, DATE_BIN, date_bin); +make_udf_function!(date_part::DatePartFunc, DATE_PART, date_part); +make_udf_function!(date_trunc::DateTruncFunc, DATE_TRUNC, date_trunc); make_udf_function!(to_date::ToDateFunc, TO_DATE, to_date); make_udf_function!(to_timestamp::ToTimestampFunc, TO_TIMESTAMP, to_timestamp); make_udf_function!( @@ -53,7 +59,23 @@ make_udf_function!( // functions with varargs currently pub mod expr_fn { - use datafusion_expr::Expr; + use datafusion_expr::{Expr, ScalarUDF}; + use std::sync::Arc; + + #[doc = "coerces an arbitrary timestamp to the start of the nearest specified interval"] + pub fn date_bin(args: Vec) -> Expr { + super::date_bin().call(args) + } + + #[doc = "extracts a subfield from the date"] + pub fn date_part(args: Vec) -> Expr { + super::date_part().call(args) + } + + #[doc = "truncates the date 
to a specified level of precision"] + pub fn date_trunc(args: Vec) -> Expr { + super::date_trunc().call(args) + } /// ```ignore /// # use std::sync::Arc; @@ -129,11 +151,22 @@ pub mod expr_fn { pub fn to_timestamp_nanos(args: Vec) -> Expr { super::to_timestamp_nanos().call(args) } + + pub fn _date_bin_scalar_udf() -> Arc { + super::date_bin() + } + + pub fn _date_part_scalar_udf() -> Arc { + super::date_part() + } } /// Return a list of all functions in this package pub fn functions() -> Vec> { vec![ + date_bin(), + date_part(), + date_trunc(), to_date(), to_timestamp(), to_timestamp_seconds(), diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 0ff7bd595c5b..7e59d617b3f1 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -61,6 +61,7 @@ chrono = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } +datafusion-functions = { workspace = true } half = { workspace = true } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", optional = true } diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index 3b322ae2692f..a7133dcbedd4 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -17,40 +17,22 @@ //! 
DateTime expressions -use std::ops::{Add, Sub}; -use std::str::FromStr; use std::sync::Arc; -use arrow::compute::cast; +use arrow::datatypes::TimeUnit; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; use arrow::{ - array::{Array, ArrayRef, Float64Array, PrimitiveArray}, - datatypes::{ - ArrowNumericType, ArrowTemporalType, DataType, IntervalDayTimeType, - IntervalMonthDayNanoType, TimestampMicrosecondType, TimestampMillisecondType, - TimestampNanosecondType, TimestampSecondType, - }, -}; -use arrow::{ - compute::kernels::temporal, - datatypes::TimeUnit, - temporal_conversions::{as_datetime_with_timezone, timestamp_ns_to_datetime}, + array::{Array, ArrayRef, PrimitiveArray}, + datatypes::DataType, }; use arrow_array::builder::PrimitiveBuilder; use arrow_array::cast::AsArray; -use arrow_array::temporal_conversions::NANOSECONDS; -use arrow_array::timezone::Tz; -use arrow_array::types::{ArrowTimestampType, Date32Type, Int32Type}; +use arrow_array::types::{Date32Type, Int32Type}; use arrow_array::StringArray; use chrono::prelude::*; -use chrono::{Duration, LocalResult, Months, NaiveDate}; +use chrono::NaiveDate; -use datafusion_common::cast::{ - as_date32_array, as_date64_array, as_primitive_array, as_timestamp_microsecond_array, - as_timestamp_millisecond_array, as_timestamp_nanosecond_array, - as_timestamp_second_array, -}; -use datafusion_common::{exec_err, not_impl_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::ColumnarValue; /// Create an implementation of `now()` that always returns the @@ -419,728 +401,6 @@ pub fn make_date(args: &[ColumnarValue]) -> Result { } } -fn quarter_month(date: &T) -> u32 -where - T: Datelike, -{ - 1 + 3 * ((date.month() - 1) / 3) -} - -fn _date_trunc_coarse(granularity: &str, value: Option) -> Result> -where - T: Datelike + Timelike + Sub + Copy, -{ - let value = match granularity { - "millisecond" => value, - "microsecond" => value, - 
"second" => value.and_then(|d| d.with_nanosecond(0)), - "minute" => value - .and_then(|d| d.with_nanosecond(0)) - .and_then(|d| d.with_second(0)), - "hour" => value - .and_then(|d| d.with_nanosecond(0)) - .and_then(|d| d.with_second(0)) - .and_then(|d| d.with_minute(0)), - "day" => value - .and_then(|d| d.with_nanosecond(0)) - .and_then(|d| d.with_second(0)) - .and_then(|d| d.with_minute(0)) - .and_then(|d| d.with_hour(0)), - "week" => value - .and_then(|d| d.with_nanosecond(0)) - .and_then(|d| d.with_second(0)) - .and_then(|d| d.with_minute(0)) - .and_then(|d| d.with_hour(0)) - .map(|d| d - Duration::seconds(60 * 60 * 24 * d.weekday() as i64)), - "month" => value - .and_then(|d| d.with_nanosecond(0)) - .and_then(|d| d.with_second(0)) - .and_then(|d| d.with_minute(0)) - .and_then(|d| d.with_hour(0)) - .and_then(|d| d.with_day0(0)), - "quarter" => value - .and_then(|d| d.with_nanosecond(0)) - .and_then(|d| d.with_second(0)) - .and_then(|d| d.with_minute(0)) - .and_then(|d| d.with_hour(0)) - .and_then(|d| d.with_day0(0)) - .and_then(|d| d.with_month(quarter_month(&d))), - "year" => value - .and_then(|d| d.with_nanosecond(0)) - .and_then(|d| d.with_second(0)) - .and_then(|d| d.with_minute(0)) - .and_then(|d| d.with_hour(0)) - .and_then(|d| d.with_day0(0)) - .and_then(|d| d.with_month0(0)), - unsupported => { - return exec_err!("Unsupported date_trunc granularity: {unsupported}"); - } - }; - Ok(value) -} - -fn _date_trunc_coarse_with_tz( - granularity: &str, - value: Option>, -) -> Result> { - if let Some(value) = value { - let local = value.naive_local(); - let truncated = _date_trunc_coarse::(granularity, Some(local))?; - let truncated = truncated.and_then(|truncated| { - match truncated.and_local_timezone(value.timezone()) { - LocalResult::None => { - // This can happen if the date_trunc operation moves the time into - // an hour that doesn't exist due to daylight savings. 
On known example where - // this can happen is with historic dates in the America/Sao_Paulo time zone. - // To account for this adjust the time by a few hours, convert to local time, - // and then adjust the time back. - truncated - .sub(Duration::hours(3)) - .and_local_timezone(value.timezone()) - .single() - .map(|v| v.add(Duration::hours(3))) - } - LocalResult::Single(datetime) => Some(datetime), - LocalResult::Ambiguous(datetime1, datetime2) => { - // Because we are truncating from an equally or more specific time - // the original time must have been within the ambiguous local time - // period. Therefore the offset of one of these times should match the - // offset of the original time. - if datetime1.offset().fix() == value.offset().fix() { - Some(datetime1) - } else { - Some(datetime2) - } - } - } - }); - Ok(truncated.and_then(|value| value.timestamp_nanos_opt())) - } else { - _date_trunc_coarse::(granularity, None)?; - Ok(None) - } -} - -fn _date_trunc_coarse_without_tz( - granularity: &str, - value: Option, -) -> Result> { - let value = _date_trunc_coarse::(granularity, value)?; - Ok(value.and_then(|value| value.timestamp_nanos_opt())) -} - -/// Tuncates the single `value`, expressed in nanoseconds since the -/// epoch, for granularities greater than 1 second, in taking into -/// account that some granularities are not uniform durations of time -/// (e.g. 
months are not always the same lengths, leap seconds, etc) -fn date_trunc_coarse(granularity: &str, value: i64, tz: Option) -> Result { - let value = match tz { - Some(tz) => { - // Use chrono DateTime to clear the various fields because need to clear per timezone, - // and NaiveDateTime (ISO 8601) has no concept of timezones - let value = as_datetime_with_timezone::(value, tz) - .ok_or(DataFusionError::Execution(format!( - "Timestamp {value} out of range" - )))?; - _date_trunc_coarse_with_tz(granularity, Some(value)) - } - None => { - // Use chrono NaiveDateTime to clear the various fields, if we don't have a timezone. - let value = timestamp_ns_to_datetime(value).ok_or_else(|| { - DataFusionError::Execution(format!("Timestamp {value} out of range")) - })?; - _date_trunc_coarse_without_tz(granularity, Some(value)) - } - }?; - - // `with_x(0)` are infallible because `0` are always a valid - Ok(value.unwrap()) -} - -// truncates a single value with the given timeunit to the specified granularity -fn general_date_trunc( - tu: TimeUnit, - value: &Option, - tz: Option, - granularity: &str, -) -> Result, DataFusionError> { - let scale = match tu { - TimeUnit::Second => 1_000_000_000, - TimeUnit::Millisecond => 1_000_000, - TimeUnit::Microsecond => 1_000, - TimeUnit::Nanosecond => 1, - }; - - let Some(value) = value else { - return Ok(None); - }; - - // convert to nanoseconds - let nano = date_trunc_coarse(granularity, scale * value, tz)?; - - let result = match tu { - TimeUnit::Second => match granularity { - "minute" => Some(nano / 1_000_000_000 / 60 * 60), - _ => Some(nano / 1_000_000_000), - }, - TimeUnit::Millisecond => match granularity { - "minute" => Some(nano / 1_000_000 / 1_000 / 60 * 1_000 * 60), - "second" => Some(nano / 1_000_000 / 1_000 * 1_000), - _ => Some(nano / 1_000_000), - }, - TimeUnit::Microsecond => match granularity { - "minute" => Some(nano / 1_000 / 1_000_000 / 60 * 60 * 1_000_000), - "second" => Some(nano / 1_000 / 1_000_000 * 1_000_000), - 
"millisecond" => Some(nano / 1_000 / 1_000 * 1_000), - _ => Some(nano / 1_000), - }, - _ => match granularity { - "minute" => Some(nano / 1_000_000_000 / 60 * 1_000_000_000 * 60), - "second" => Some(nano / 1_000_000_000 * 1_000_000_000), - "millisecond" => Some(nano / 1_000_000 * 1_000_000), - "microsecond" => Some(nano / 1_000 * 1_000), - _ => Some(nano), - }, - }; - Ok(result) -} - -fn parse_tz(tz: &Option>) -> Result> { - tz.as_ref() - .map(|tz| { - Tz::from_str(tz).map_err(|op| { - DataFusionError::Execution(format!("failed on timezone {tz}: {:?}", op)) - }) - }) - .transpose() -} - -/// date_trunc SQL function -pub fn date_trunc(args: &[ColumnarValue]) -> Result { - let (granularity, array) = (&args[0], &args[1]); - - let granularity = - if let ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) = granularity { - v.to_lowercase() - } else { - return exec_err!("Granularity of `date_trunc` must be non-null scalar Utf8"); - }; - - fn process_array( - array: &dyn Array, - granularity: String, - tz_opt: &Option>, - ) -> Result { - let parsed_tz = parse_tz(tz_opt)?; - let array = as_primitive_array::(array)?; - let array = array - .iter() - .map(|x| general_date_trunc(T::UNIT, &x, parsed_tz, granularity.as_str())) - .collect::>>()? - .with_timezone_opt(tz_opt.clone()); - Ok(ColumnarValue::Array(Arc::new(array))) - } - - fn process_scalar( - v: &Option, - granularity: String, - tz_opt: &Option>, - ) -> Result { - let parsed_tz = parse_tz(tz_opt)?; - let value = general_date_trunc(T::UNIT, v, parsed_tz, granularity.as_str())?; - let value = ScalarValue::new_timestamp::(value, tz_opt.clone()); - Ok(ColumnarValue::Scalar(value)) - } - - Ok(match array { - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(v, tz_opt)) => { - process_scalar::(v, granularity, tz_opt)? - } - ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(v, tz_opt)) => { - process_scalar::(v, granularity, tz_opt)? 
- } - ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(v, tz_opt)) => { - process_scalar::(v, granularity, tz_opt)? - } - ColumnarValue::Scalar(ScalarValue::TimestampSecond(v, tz_opt)) => { - process_scalar::(v, granularity, tz_opt)? - } - ColumnarValue::Array(array) => { - let array_type = array.data_type(); - match array_type { - DataType::Timestamp(TimeUnit::Second, tz_opt) => { - process_array::(array, granularity, tz_opt)? - } - DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => { - process_array::(array, granularity, tz_opt)? - } - DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => { - process_array::(array, granularity, tz_opt)? - } - DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => { - process_array::(array, granularity, tz_opt)? - } - _ => process_array::(array, granularity, &None)?, - } - } - _ => { - return exec_err!( - "second argument of `date_trunc` must be nanosecond timestamp scalar or array" - ); - } - }) -} - -// return time in nanoseconds that the source timestamp falls into based on the stride and origin -fn date_bin_nanos_interval(stride_nanos: i64, source: i64, origin: i64) -> i64 { - let time_diff = source - origin; - - // distance from origin to bin - let time_delta = compute_distance(time_diff, stride_nanos); - - origin + time_delta -} - -// distance from origin to bin -fn compute_distance(time_diff: i64, stride: i64) -> i64 { - let time_delta = time_diff - (time_diff % stride); - - if time_diff < 0 && stride > 1 { - // The origin is later than the source timestamp, round down to the previous bin - time_delta - stride - } else { - time_delta - } -} - -// return time in nanoseconds that the source timestamp falls into based on the stride and origin -fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> i64 { - // convert source and origin to DateTime - let source_date = to_utc_date_time(source); - let origin_date = to_utc_date_time(origin); - - // calculate the number of months between the source and 
origin - let month_diff = (source_date.year() - origin_date.year()) * 12 - + source_date.month() as i32 - - origin_date.month() as i32; - - // distance from origin to bin - let month_delta = compute_distance(month_diff as i64, stride_months); - - let mut bin_time = if month_delta < 0 { - origin_date - Months::new(month_delta.unsigned_abs() as u32) - } else { - origin_date + Months::new(month_delta as u32) - }; - - // If origin is not midnight of first date of the month, the bin_time may be larger than the source - // In this case, we need to move back to previous bin - if bin_time > source_date { - let month_delta = month_delta - stride_months; - bin_time = if month_delta < 0 { - origin_date - Months::new(month_delta.unsigned_abs() as u32) - } else { - origin_date + Months::new(month_delta as u32) - }; - } - - bin_time.timestamp_nanos_opt().unwrap() -} - -fn to_utc_date_time(nanos: i64) -> DateTime { - let secs = nanos / 1_000_000_000; - let nsec = (nanos % 1_000_000_000) as u32; - let date = NaiveDateTime::from_timestamp_opt(secs, nsec).unwrap(); - DateTime::::from_naive_utc_and_offset(date, Utc) -} - -/// DATE_BIN sql function -pub fn date_bin(args: &[ColumnarValue]) -> Result { - if args.len() == 2 { - // Default to unix EPOCH - let origin = ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( - Some(0), - Some("+00:00".into()), - )); - date_bin_impl(&args[0], &args[1], &origin) - } else if args.len() == 3 { - date_bin_impl(&args[0], &args[1], &args[2]) - } else { - exec_err!("DATE_BIN expected two or three arguments") - } -} - -enum Interval { - Nanoseconds(i64), - Months(i64), -} - -impl Interval { - /// Returns (`stride_nanos`, `fn`) where - /// - /// 1. `stride_nanos` is a width, in nanoseconds - /// 2. 
`fn` is a function that takes (stride_nanos, source, origin) - /// - /// `source` is the timestamp being binned - /// - /// `origin` is the time, in nanoseconds, where windows are measured from - fn bin_fn(&self) -> (i64, fn(i64, i64, i64) -> i64) { - match self { - Interval::Nanoseconds(nanos) => (*nanos, date_bin_nanos_interval), - Interval::Months(months) => (*months, date_bin_months_interval), - } - } -} - -// Supported intervals: -// 1. IntervalDayTime: this means that the stride is in days, hours, minutes, seconds and milliseconds -// We will assume month interval won't be converted into this type -// TODO (my next PR): without `INTERVAL` keyword, the stride was converted into ScalarValue::IntervalDayTime somwhere -// for month interval. I need to find that and make it ScalarValue::IntervalMonthDayNano instead -// 2. IntervalMonthDayNano -fn date_bin_impl( - stride: &ColumnarValue, - array: &ColumnarValue, - origin: &ColumnarValue, -) -> Result { - let stride = match stride { - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(v))) => { - let (days, ms) = IntervalDayTimeType::to_parts(*v); - let nanos = (Duration::days(days as i64) + Duration::milliseconds(ms as i64)) - .num_nanoseconds(); - - match nanos { - Some(v) => Interval::Nanoseconds(v), - _ => return exec_err!("DATE_BIN stride argument is too large"), - } - } - ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(v))) => { - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(*v); - - // If interval is months, its origin must be midnight of first date of the month - if months != 0 { - // Return error if days or nanos is not zero - if days != 0 || nanos != 0 { - return not_impl_err!( - "DATE_BIN stride does not support combination of month, day and nanosecond intervals" - ); - } else { - Interval::Months(months as i64) - } - } else { - let nanos = (Duration::days(days as i64) + Duration::nanoseconds(nanos)) - .num_nanoseconds(); - match nanos { - Some(v) => 
Interval::Nanoseconds(v), - _ => return exec_err!("DATE_BIN stride argument is too large"), - } - } - } - ColumnarValue::Scalar(v) => { - return exec_err!( - "DATE_BIN expects stride argument to be an INTERVAL but got {}", - v.data_type() - ) - } - ColumnarValue::Array(_) => { - return not_impl_err!( - "DATE_BIN only supports literal values for the stride argument, not arrays" - ) - } - }; - - let origin = match origin { - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(v), _)) => *v, - ColumnarValue::Scalar(v) => { - return exec_err!( - "DATE_BIN expects origin argument to be a TIMESTAMP with nanosececond precision but got {}", - v.data_type() - ) - } - ColumnarValue::Array(_) => return not_impl_err!( - "DATE_BIN only supports literal values for the origin argument, not arrays" - ), - }; - - let (stride, stride_fn) = stride.bin_fn(); - - // Return error if stride is 0 - if stride == 0 { - return exec_err!("DATE_BIN stride must be non-zero"); - } - - fn stride_map_fn( - origin: i64, - stride: i64, - stride_fn: fn(i64, i64, i64) -> i64, - ) -> impl Fn(Option) -> Option { - let scale = match T::UNIT { - TimeUnit::Nanosecond => 1, - TimeUnit::Microsecond => NANOSECONDS / 1_000_000, - TimeUnit::Millisecond => NANOSECONDS / 1_000, - TimeUnit::Second => NANOSECONDS, - }; - move |x: Option| x.map(|x| stride_fn(stride, x * scale, origin) / scale) - } - - Ok(match array { - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(v, tz_opt)) => { - let apply_stride_fn = - stride_map_fn::(origin, stride, stride_fn); - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( - apply_stride_fn(*v), - tz_opt.clone(), - )) - } - ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(v, tz_opt)) => { - let apply_stride_fn = - stride_map_fn::(origin, stride, stride_fn); - ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond( - apply_stride_fn(*v), - tz_opt.clone(), - )) - } - ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(v, tz_opt)) => { - let 
apply_stride_fn = - stride_map_fn::(origin, stride, stride_fn); - ColumnarValue::Scalar(ScalarValue::TimestampMillisecond( - apply_stride_fn(*v), - tz_opt.clone(), - )) - } - ColumnarValue::Scalar(ScalarValue::TimestampSecond(v, tz_opt)) => { - let apply_stride_fn = - stride_map_fn::(origin, stride, stride_fn); - ColumnarValue::Scalar(ScalarValue::TimestampSecond( - apply_stride_fn(*v), - tz_opt.clone(), - )) - } - - ColumnarValue::Array(array) => { - fn transform_array_with_stride( - origin: i64, - stride: i64, - stride_fn: fn(i64, i64, i64) -> i64, - array: &ArrayRef, - tz_opt: &Option>, - ) -> Result - where - T: ArrowTimestampType, - { - let array = as_primitive_array::(array)?; - let apply_stride_fn = stride_map_fn::(origin, stride, stride_fn); - let array = array - .iter() - .map(apply_stride_fn) - .collect::>() - .with_timezone_opt(tz_opt.clone()); - - Ok(ColumnarValue::Array(Arc::new(array))) - } - match array.data_type() { - DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => { - transform_array_with_stride::( - origin, stride, stride_fn, array, tz_opt, - )? - } - DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => { - transform_array_with_stride::( - origin, stride, stride_fn, array, tz_opt, - )? - } - DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => { - transform_array_with_stride::( - origin, stride, stride_fn, array, tz_opt, - )? - } - DataType::Timestamp(TimeUnit::Second, tz_opt) => { - transform_array_with_stride::( - origin, stride, stride_fn, array, tz_opt, - )? - } - _ => { - return exec_err!( - "DATE_BIN expects source argument to be a TIMESTAMP but got {}", - array.data_type() - ) - } - } - } - _ => { - return exec_err!( - "DATE_BIN expects source argument to be a TIMESTAMP scalar or array" - ); - } - }) -} - -macro_rules! 
extract_date_part { - ($ARRAY: expr, $FN:expr) => { - match $ARRAY.data_type() { - DataType::Date32 => { - let array = as_date32_array($ARRAY)?; - Ok($FN(array) - .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) - } - DataType::Date64 => { - let array = as_date64_array($ARRAY)?; - Ok($FN(array) - .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) - } - DataType::Timestamp(time_unit, _) => match time_unit { - TimeUnit::Second => { - let array = as_timestamp_second_array($ARRAY)?; - Ok($FN(array) - .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) - } - TimeUnit::Millisecond => { - let array = as_timestamp_millisecond_array($ARRAY)?; - Ok($FN(array) - .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) - } - TimeUnit::Microsecond => { - let array = as_timestamp_microsecond_array($ARRAY)?; - Ok($FN(array) - .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) - } - TimeUnit::Nanosecond => { - let array = as_timestamp_nanosecond_array($ARRAY)?; - Ok($FN(array) - .map(|v| cast(&(Arc::new(v) as ArrayRef), &DataType::Float64))?) 
- } - }, - datatype => exec_err!("Extract does not support datatype {:?}", datatype), - } - }; -} - -/// DATE_PART SQL function -pub fn date_part(args: &[ColumnarValue]) -> Result { - if args.len() != 2 { - return exec_err!("Expected two arguments in DATE_PART"); - } - let (date_part, array) = (&args[0], &args[1]); - - let date_part = if let ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) = date_part { - v - } else { - return exec_err!("First argument of `DATE_PART` must be non-null scalar Utf8"); - }; - - let is_scalar = matches!(array, ColumnarValue::Scalar(_)); - - let array = match array { - ColumnarValue::Array(array) => array.clone(), - ColumnarValue::Scalar(scalar) => scalar.to_array()?, - }; - - let arr = match date_part.to_lowercase().as_str() { - "year" => extract_date_part!(&array, temporal::year), - "quarter" => extract_date_part!(&array, temporal::quarter), - "month" => extract_date_part!(&array, temporal::month), - "week" => extract_date_part!(&array, temporal::week), - "day" => extract_date_part!(&array, temporal::day), - "doy" => extract_date_part!(&array, temporal::doy), - "dow" => extract_date_part!(&array, temporal::num_days_from_sunday), - "hour" => extract_date_part!(&array, temporal::hour), - "minute" => extract_date_part!(&array, temporal::minute), - "second" => extract_date_part!(&array, seconds), - "millisecond" => extract_date_part!(&array, millis), - "microsecond" => extract_date_part!(&array, micros), - "nanosecond" => extract_date_part!(&array, nanos), - "epoch" => extract_date_part!(&array, epoch), - _ => exec_err!("Date part '{date_part}' not supported"), - }?; - - Ok(if is_scalar { - ColumnarValue::Scalar(ScalarValue::try_from_array(&arr?, 0)?) - } else { - ColumnarValue::Array(arr?) - }) -} - -fn to_ticks(array: &PrimitiveArray, frac: i32) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - let zipped = temporal::second(array)? 
- .values() - .iter() - .zip(temporal::nanosecond(array)?.values().iter()) - .map(|o| ((*o.0 as f64 + (*o.1 as f64) / 1_000_000_000.0) * (frac as f64))) - .collect::>(); - - Ok(Float64Array::from(zipped)) -} - -fn seconds(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - to_ticks(array, 1) -} - -fn millis(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - to_ticks(array, 1_000) -} - -fn micros(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - to_ticks(array, 1_000_000) -} - -fn nanos(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - to_ticks(array, 1_000_000_000) -} - -fn epoch(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - let b = match array.data_type() { - DataType::Timestamp(tu, _) => { - let scale = match tu { - TimeUnit::Second => 1, - TimeUnit::Millisecond => 1_000, - TimeUnit::Microsecond => 1_000_000, - TimeUnit::Nanosecond => 1_000_000_000, - } as f64; - array.unary(|n| { - let n: i64 = n.into(); - n as f64 / scale - }) - } - DataType::Date32 => { - let seconds_in_a_day = 86400_f64; - array.unary(|n| { - let n: i64 = n.into(); - n as f64 * seconds_in_a_day - }) - } - DataType::Date64 => array.unary(|n| { - let n: i64 = n.into(); - n as f64 / 1_000_f64 - }), - _ => return exec_err!("Can not convert {:?} to epoch", array.data_type()), - }; - Ok(b) -} - /// from_unixtime() SQL function implementation pub fn from_unixtime_invoke(args: &[ColumnarValue]) -> Result { if args.len() != 1 { @@ -1167,758 +427,18 @@ pub fn from_unixtime_invoke(args: &[ColumnarValue]) -> Result { mod tests { use std::sync::Arc; - use arrow::array::{as_primitive_array, ArrayRef, Int64Array, IntervalDayTimeArray}; - use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; + use arrow::array::{ArrayRef, Int64Array}; use 
arrow_array::{ Date32Array, Date64Array, Int32Array, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt32Array, }; + use datafusion_common::ScalarValue; use super::*; - #[test] - fn date_trunc_test() { - let cases = vec![ - ( - "2020-09-08T13:42:29.190855Z", - "second", - "2020-09-08T13:42:29.000000Z", - ), - ( - "2020-09-08T13:42:29.190855Z", - "minute", - "2020-09-08T13:42:00.000000Z", - ), - ( - "2020-09-08T13:42:29.190855Z", - "hour", - "2020-09-08T13:00:00.000000Z", - ), - ( - "2020-09-08T13:42:29.190855Z", - "day", - "2020-09-08T00:00:00.000000Z", - ), - ( - "2020-09-08T13:42:29.190855Z", - "week", - "2020-09-07T00:00:00.000000Z", - ), - ( - "2020-09-08T13:42:29.190855Z", - "month", - "2020-09-01T00:00:00.000000Z", - ), - ( - "2020-09-08T13:42:29.190855Z", - "year", - "2020-01-01T00:00:00.000000Z", - ), - // week - ( - "2021-01-01T13:42:29.190855Z", - "week", - "2020-12-28T00:00:00.000000Z", - ), - ( - "2020-01-01T13:42:29.190855Z", - "week", - "2019-12-30T00:00:00.000000Z", - ), - // quarter - ( - "2020-01-01T13:42:29.190855Z", - "quarter", - "2020-01-01T00:00:00.000000Z", - ), - ( - "2020-02-01T13:42:29.190855Z", - "quarter", - "2020-01-01T00:00:00.000000Z", - ), - ( - "2020-03-01T13:42:29.190855Z", - "quarter", - "2020-01-01T00:00:00.000000Z", - ), - ( - "2020-04-01T13:42:29.190855Z", - "quarter", - "2020-04-01T00:00:00.000000Z", - ), - ( - "2020-08-01T13:42:29.190855Z", - "quarter", - "2020-07-01T00:00:00.000000Z", - ), - ( - "2020-11-01T13:42:29.190855Z", - "quarter", - "2020-10-01T00:00:00.000000Z", - ), - ( - "2020-12-01T13:42:29.190855Z", - "quarter", - "2020-10-01T00:00:00.000000Z", - ), - ]; - - cases.iter().for_each(|(original, granularity, expected)| { - let left = string_to_timestamp_nanos(original).unwrap(); - let right = string_to_timestamp_nanos(expected).unwrap(); - let result = 
date_trunc_coarse(granularity, left, None).unwrap(); - assert_eq!(result, right, "{original} = {expected}"); - }); - } - - #[test] - fn test_date_trunc_timezones() { - let cases = vec![ - ( - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T01:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T03:00:00Z", - "2020-09-08T04:00:00Z", - ], - Some("+00".into()), - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - ], - ), - ( - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T01:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T03:00:00Z", - "2020-09-08T04:00:00Z", - ], - None, - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - ], - ), - ( - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T01:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T03:00:00Z", - "2020-09-08T04:00:00Z", - ], - Some("-02".into()), - vec![ - "2020-09-07T02:00:00Z", - "2020-09-07T02:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T02:00:00Z", - ], - ), - ( - vec![ - "2020-09-08T00:00:00+05", - "2020-09-08T01:00:00+05", - "2020-09-08T02:00:00+05", - "2020-09-08T03:00:00+05", - "2020-09-08T04:00:00+05", - ], - Some("+05".into()), - vec![ - "2020-09-08T00:00:00+05", - "2020-09-08T00:00:00+05", - "2020-09-08T00:00:00+05", - "2020-09-08T00:00:00+05", - "2020-09-08T00:00:00+05", - ], - ), - ( - vec![ - "2020-09-08T00:00:00+08", - "2020-09-08T01:00:00+08", - "2020-09-08T02:00:00+08", - "2020-09-08T03:00:00+08", - "2020-09-08T04:00:00+08", - ], - Some("+08".into()), - vec![ - "2020-09-08T00:00:00+08", - "2020-09-08T00:00:00+08", - "2020-09-08T00:00:00+08", - "2020-09-08T00:00:00+08", - "2020-09-08T00:00:00+08", - ], - ), - ( - vec![ - "2024-10-26T23:00:00Z", - "2024-10-27T00:00:00Z", - "2024-10-27T01:00:00Z", - "2024-10-27T02:00:00Z", - ], - Some("Europe/Berlin".into()), - vec![ - "2024-10-27T00:00:00+02", - 
"2024-10-27T00:00:00+02", - "2024-10-27T00:00:00+02", - "2024-10-27T00:00:00+02", - ], - ), - ( - vec![ - "2018-02-18T00:00:00Z", - "2018-02-18T01:00:00Z", - "2018-02-18T02:00:00Z", - "2018-02-18T03:00:00Z", - "2018-11-04T01:00:00Z", - "2018-11-04T02:00:00Z", - "2018-11-04T03:00:00Z", - "2018-11-04T04:00:00Z", - ], - Some("America/Sao_Paulo".into()), - vec![ - "2018-02-17T00:00:00-02", - "2018-02-17T00:00:00-02", - "2018-02-17T00:00:00-02", - "2018-02-18T00:00:00-03", - "2018-11-03T00:00:00-03", - "2018-11-03T00:00:00-03", - "2018-11-04T01:00:00-02", - "2018-11-04T01:00:00-02", - ], - ), - ]; - - cases.iter().for_each(|(original, tz_opt, expected)| { - let input = original - .iter() - .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) - .collect::() - .with_timezone_opt(tz_opt.clone()); - let right = expected - .iter() - .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) - .collect::() - .with_timezone_opt(tz_opt.clone()); - let result = date_trunc(&[ - ColumnarValue::Scalar(ScalarValue::from("day")), - ColumnarValue::Array(Arc::new(input)), - ]) - .unwrap(); - if let ColumnarValue::Array(result) = result { - assert_eq!( - result.data_type(), - &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone()) - ); - let left = as_primitive_array::(&result); - assert_eq!(left, &right); - } else { - panic!("unexpected column type"); - } - }); - } - - #[test] - fn test_date_trunc_hour_timezones() { - let cases = vec![ - ( - vec![ - "2020-09-08T00:30:00Z", - "2020-09-08T01:30:00Z", - "2020-09-08T02:30:00Z", - "2020-09-08T03:30:00Z", - "2020-09-08T04:30:00Z", - ], - Some("+00".into()), - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T01:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T03:00:00Z", - "2020-09-08T04:00:00Z", - ], - ), - ( - vec![ - "2020-09-08T00:30:00Z", - "2020-09-08T01:30:00Z", - "2020-09-08T02:30:00Z", - "2020-09-08T03:30:00Z", - "2020-09-08T04:30:00Z", - ], - None, - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T01:00:00Z", - "2020-09-08T02:00:00Z", - 
"2020-09-08T03:00:00Z", - "2020-09-08T04:00:00Z", - ], - ), - ( - vec![ - "2020-09-08T00:30:00Z", - "2020-09-08T01:30:00Z", - "2020-09-08T02:30:00Z", - "2020-09-08T03:30:00Z", - "2020-09-08T04:30:00Z", - ], - Some("-02".into()), - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T01:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T03:00:00Z", - "2020-09-08T04:00:00Z", - ], - ), - ( - vec![ - "2020-09-08T00:30:00+05", - "2020-09-08T01:30:00+05", - "2020-09-08T02:30:00+05", - "2020-09-08T03:30:00+05", - "2020-09-08T04:30:00+05", - ], - Some("+05".into()), - vec![ - "2020-09-08T00:00:00+05", - "2020-09-08T01:00:00+05", - "2020-09-08T02:00:00+05", - "2020-09-08T03:00:00+05", - "2020-09-08T04:00:00+05", - ], - ), - ( - vec![ - "2020-09-08T00:30:00+08", - "2020-09-08T01:30:00+08", - "2020-09-08T02:30:00+08", - "2020-09-08T03:30:00+08", - "2020-09-08T04:30:00+08", - ], - Some("+08".into()), - vec![ - "2020-09-08T00:00:00+08", - "2020-09-08T01:00:00+08", - "2020-09-08T02:00:00+08", - "2020-09-08T03:00:00+08", - "2020-09-08T04:00:00+08", - ], - ), - ( - vec![ - "2024-10-26T23:30:00Z", - "2024-10-27T00:30:00Z", - "2024-10-27T01:30:00Z", - "2024-10-27T02:30:00Z", - ], - Some("Europe/Berlin".into()), - vec![ - "2024-10-27T01:00:00+02", - "2024-10-27T02:00:00+02", - "2024-10-27T02:00:00+01", - "2024-10-27T03:00:00+01", - ], - ), - ( - vec![ - "2018-02-18T00:30:00Z", - "2018-02-18T01:30:00Z", - "2018-02-18T02:30:00Z", - "2018-02-18T03:30:00Z", - "2018-11-04T01:00:00Z", - "2018-11-04T02:00:00Z", - "2018-11-04T03:00:00Z", - "2018-11-04T04:00:00Z", - ], - Some("America/Sao_Paulo".into()), - vec![ - "2018-02-17T22:00:00-02", - "2018-02-17T23:00:00-02", - "2018-02-17T23:00:00-03", - "2018-02-18T00:00:00-03", - "2018-11-03T22:00:00-03", - "2018-11-03T23:00:00-03", - "2018-11-04T01:00:00-02", - "2018-11-04T02:00:00-02", - ], - ), - ]; - - cases.iter().for_each(|(original, tz_opt, expected)| { - let input = original - .iter() - .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) - 
.collect::() - .with_timezone_opt(tz_opt.clone()); - let right = expected - .iter() - .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) - .collect::() - .with_timezone_opt(tz_opt.clone()); - let result = date_trunc(&[ - ColumnarValue::Scalar(ScalarValue::from("hour")), - ColumnarValue::Array(Arc::new(input)), - ]) - .unwrap(); - if let ColumnarValue::Array(result) = result { - assert_eq!( - result.data_type(), - &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone()) - ); - let left = as_primitive_array::(&result); - assert_eq!(left, &right); - } else { - panic!("unexpected column type"); - } - }); - } - - #[test] - fn test_date_bin_single() { - use chrono::Duration; - - let cases = vec![ - ( - ( - Duration::minutes(15), - "2004-04-09T02:03:04.123456789Z", - "2001-01-01T00:00:00", - ), - "2004-04-09T02:00:00Z", - ), - ( - ( - Duration::minutes(15), - "2004-04-09T02:03:04.123456789Z", - "2001-01-01T00:02:30", - ), - "2004-04-09T02:02:30Z", - ), - ( - ( - Duration::minutes(15), - "2004-04-09T02:03:04.123456789Z", - "2005-01-01T00:02:30", - ), - "2004-04-09T02:02:30Z", - ), - ( - ( - Duration::hours(1), - "2004-04-09T02:03:04.123456789Z", - "2001-01-01T00:00:00", - ), - "2004-04-09T02:00:00Z", - ), - ( - ( - Duration::seconds(10), - "2004-04-09T02:03:11.123456789Z", - "2001-01-01T00:00:00", - ), - "2004-04-09T02:03:10Z", - ), - ]; - - cases - .iter() - .for_each(|((stride, source, origin), expected)| { - let stride1 = stride.num_nanoseconds().unwrap(); - let source1 = string_to_timestamp_nanos(source).unwrap(); - let origin1 = string_to_timestamp_nanos(origin).unwrap(); - - let expected1 = string_to_timestamp_nanos(expected).unwrap(); - let result = date_bin_nanos_interval(stride1, source1, origin1); - assert_eq!(result, expected1, "{source} = {expected}"); - }) - } - - #[test] - fn test_date_bin() { - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - 
ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert!(res.is_ok()); - - let timestamps = Arc::new((1..6).map(Some).collect::()); - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), - ColumnarValue::Array(timestamps), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert!(res.is_ok()); - - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert!(res.is_ok()); - - // stride supports month-day-nano - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert!(res.is_ok()); - - // - // Fallible test cases - // - - // invalid number of arguments - let res = - date_bin(&[ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1)))]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Execution error: DATE_BIN expected two or three arguments" - ); - - // stride: invalid type - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalYearMonth(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Execution error: DATE_BIN expects stride argument to be an INTERVAL but got Interval(YearMonth)" - ); - - // stride: invalid value - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(0))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Execution error: DATE_BIN stride must be non-zero" - ); - - // 
stride: overflow of day-time interval - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(i64::MAX))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Execution error: DATE_BIN stride argument is too large" - ); - - // stride: overflow of month-day-nano interval - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::new_interval_mdn(0, i32::MAX, 1)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Execution error: DATE_BIN stride argument is too large" - ); - - // stride: month intervals - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::new_interval_mdn(1, 1, 1)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "This feature is not implemented: DATE_BIN stride does not support combination of month, day and nanosecond intervals" - ); - - // origin: invalid type - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(Some(1), None)), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Execution error: DATE_BIN expects origin argument to be a TIMESTAMP with nanosececond precision but got Timestamp(Microsecond, None)" - ); - - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - 
assert!(res.is_ok()); - - // unsupported array type for stride - let intervals = Arc::new((1..6).map(Some).collect::()); - let res = date_bin(&[ - ColumnarValue::Array(intervals), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "This feature is not implemented: DATE_BIN only supports literal values for the stride argument, not arrays" - ); - - // unsupported array type for origin - let timestamps = Arc::new((1..6).map(Some).collect::()); - let res = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Array(timestamps), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "This feature is not implemented: DATE_BIN only supports literal values for the origin argument, not arrays" - ); - } - - #[test] - fn test_date_bin_timezones() { - let cases = vec![ - ( - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T01:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T03:00:00Z", - "2020-09-08T04:00:00Z", - ], - Some("+00".into()), - "1970-01-01T00:00:00Z", - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - ], - ), - ( - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T01:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T03:00:00Z", - "2020-09-08T04:00:00Z", - ], - None, - "1970-01-01T00:00:00Z", - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - ], - ), - ( - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T01:00:00Z", - "2020-09-08T02:00:00Z", - "2020-09-08T03:00:00Z", - "2020-09-08T04:00:00Z", - ], - Some("-02".into()), - "1970-01-01T00:00:00Z", - vec![ - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", 
- "2020-09-08T00:00:00Z", - "2020-09-08T00:00:00Z", - ], - ), - ( - vec![ - "2020-09-08T00:00:00+05", - "2020-09-08T01:00:00+05", - "2020-09-08T02:00:00+05", - "2020-09-08T03:00:00+05", - "2020-09-08T04:00:00+05", - ], - Some("+05".into()), - "1970-01-01T00:00:00+05", - vec![ - "2020-09-08T00:00:00+05", - "2020-09-08T00:00:00+05", - "2020-09-08T00:00:00+05", - "2020-09-08T00:00:00+05", - "2020-09-08T00:00:00+05", - ], - ), - ( - vec![ - "2020-09-08T00:00:00+08", - "2020-09-08T01:00:00+08", - "2020-09-08T02:00:00+08", - "2020-09-08T03:00:00+08", - "2020-09-08T04:00:00+08", - ], - Some("+08".into()), - "1970-01-01T00:00:00+08", - vec![ - "2020-09-08T00:00:00+08", - "2020-09-08T00:00:00+08", - "2020-09-08T00:00:00+08", - "2020-09-08T00:00:00+08", - "2020-09-08T00:00:00+08", - ], - ), - ]; - - cases - .iter() - .for_each(|(original, tz_opt, origin, expected)| { - let input = original - .iter() - .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) - .collect::() - .with_timezone_opt(tz_opt.clone()); - let right = expected - .iter() - .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) - .collect::() - .with_timezone_opt(tz_opt.clone()); - let result = date_bin(&[ - ColumnarValue::Scalar(ScalarValue::new_interval_dt(1, 0)), - ColumnarValue::Array(Arc::new(input)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( - Some(string_to_timestamp_nanos(origin).unwrap()), - tz_opt.clone(), - )), - ]) - .unwrap(); - if let ColumnarValue::Array(result) = result { - assert_eq!( - result.data_type(), - &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone()) - ); - let left = as_primitive_array::(&result); - assert_eq!(left, &right); - } else { - panic!("unexpected column type"); - } - }); - } - #[test] fn test_make_date() { let res = make_date(&[ diff --git a/datafusion/physical-expr/src/equivalence/projection.rs b/datafusion/physical-expr/src/equivalence/projection.rs index 0f92b2c2f431..a598eac101b7 100644 --- a/datafusion/physical-expr/src/equivalence/projection.rs 
+++ b/datafusion/physical-expr/src/equivalence/projection.rs @@ -121,34 +121,44 @@ mod tests { use crate::PhysicalSortExpr; use arrow::datatypes::{DataType, Field, Schema}; use arrow_schema::{SortOptions, TimeUnit}; - use datafusion_common::{Result, ScalarValue}; - use datafusion_expr::{BuiltinScalarFunction, Operator}; + use datafusion_common::{Result, ScalarValue, ToDFSchema}; + use datafusion_expr::{BuiltinScalarFunction, Expr, Operator}; + use datafusion_functions::expr_fn::_date_bin_scalar_udf; use itertools::Itertools; use std::sync::Arc; #[test] fn project_orderings() -> Result<()> { - let schema = Arc::new(Schema::new(vec![ + let s = Schema::new(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Int32, true), Field::new("c", DataType::Int32, true), Field::new("d", DataType::Int32, true), Field::new("e", DataType::Int32, true), Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true), - ])); + ]); + let df_schema = s.clone().to_dfschema()?; + let schema = Arc::new(s); let col_a = &col("a", &schema)?; let col_b = &col("b", &schema)?; let col_c = &col("c", &schema)?; let col_d = &col("d", &schema)?; let col_e = &col("e", &schema)?; let col_ts = &col("ts", &schema)?; - let interval = Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(2)))) - as Arc; - let date_bin_func = &create_physical_expr( - &BuiltinScalarFunction::DateBin, - &[interval, col_ts.clone()], - &schema, - &ExecutionProps::default(), + let expr_col_ts = datafusion_common::Column::from_qualified_name("ts"); + let interval_value = ScalarValue::IntervalDayTime(Some(2)); + let expr_interval = Expr::Literal(interval_value.clone()); + let interval = Arc::new(Literal::new(interval_value)) as Arc; + let date_bin_udf = _date_bin_scalar_udf(); + let date_bin_func = &crate::udf::create_physical_expr( + &date_bin_udf, + &[interval.clone(), col_ts.clone()], + date_bin_udf + .return_type_from_exprs( + &[expr_interval, Expr::Column(expr_col_ts)], + &df_schema, + ) + 
.unwrap(), )?; let a_plus_b = Arc::new(BinaryExpr::new( col_a.clone(), @@ -641,29 +651,38 @@ mod tests { #[test] fn project_orderings2() -> Result<()> { - let schema = Arc::new(Schema::new(vec![ + let s = Schema::new(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Int32, true), Field::new("c", DataType::Int32, true), Field::new("d", DataType::Int32, true), Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true), - ])); + ]); + let df_schema = s.clone().to_dfschema()?; + let schema = Arc::new(s.clone()); let col_a = &col("a", &schema)?; let col_b = &col("b", &schema)?; let col_c = &col("c", &schema)?; let col_ts = &col("ts", &schema)?; + let expr_col_ts = datafusion_common::Column::from_qualified_name("ts"); let a_plus_b = Arc::new(BinaryExpr::new( col_a.clone(), Operator::Plus, col_b.clone(), )) as Arc; - let interval = Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(2)))) - as Arc; - let date_bin_ts = &create_physical_expr( - &BuiltinScalarFunction::DateBin, - &[interval, col_ts.clone()], - &schema, - &ExecutionProps::default(), + let interval_value = ScalarValue::IntervalDayTime(Some(2)); + let expr_interval = Expr::Literal(interval_value.clone()); + let interval = Arc::new(Literal::new(interval_value)) as Arc; + let date_bin_udf = _date_bin_scalar_udf(); + let date_bin_ts = &crate::udf::create_physical_expr( + &date_bin_udf, + &[interval.clone(), col_ts.clone()], + date_bin_udf + .return_type_from_exprs( + &[expr_interval, Expr::Column(expr_col_ts)], + &df_schema, + ) + .unwrap(), )?; let round_c = &create_physical_expr( diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 56ad92082d9f..f3f37b20e3d2 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -482,9 +482,6 @@ pub fn create_physical_fun( BuiltinScalarFunction::ConcatWithSeparator => Arc::new(|args| { 
make_scalar_function_inner(string_expressions::concat_ws)(args) }), - BuiltinScalarFunction::DatePart => Arc::new(datetime_expressions::date_part), - BuiltinScalarFunction::DateTrunc => Arc::new(datetime_expressions::date_trunc), - BuiltinScalarFunction::DateBin => Arc::new(datetime_expressions::date_bin), BuiltinScalarFunction::Now => { // bind value for now at plan time Arc::new(datetime_expressions::make_now( diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 1ad7a2c3afaf..41dc6fef1924 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -576,8 +576,8 @@ enum ScalarFunction { Chr = 25; Concat = 26; ConcatWithSeparator = 27; - DatePart = 28; - DateTrunc = 29; + // 28 was DatePart + // 29 was DateTrunc InitCap = 30; Left = 31; Lpad = 32; @@ -616,7 +616,7 @@ enum ScalarFunction { StructFun = 65; FromUnixtime = 66; Atan2 = 67; - DateBin = 68; + // 68 was DateBin ArrowTypeof = 69; CurrentDate = 70; CurrentTime = 71; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 33ebdf310ae0..0def1e3b3586 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22346,8 +22346,6 @@ impl serde::Serialize for ScalarFunction { Self::Chr => "Chr", Self::Concat => "Concat", Self::ConcatWithSeparator => "ConcatWithSeparator", - Self::DatePart => "DatePart", - Self::DateTrunc => "DateTrunc", Self::InitCap => "InitCap", Self::Left => "Left", Self::Lpad => "Lpad", @@ -22381,7 +22379,6 @@ impl serde::Serialize for ScalarFunction { Self::StructFun => "StructFun", Self::FromUnixtime => "FromUnixtime", Self::Atan2 => "Atan2", - Self::DateBin => "DateBin", Self::ArrowTypeof => "ArrowTypeof", Self::CurrentDate => "CurrentDate", Self::CurrentTime => "CurrentTime", @@ -22480,8 +22477,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Chr", "Concat", "ConcatWithSeparator", - "DatePart", - 
"DateTrunc", "InitCap", "Left", "Lpad", @@ -22515,7 +22510,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "StructFun", "FromUnixtime", "Atan2", - "DateBin", "ArrowTypeof", "CurrentDate", "CurrentTime", @@ -22643,8 +22637,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Chr" => Ok(ScalarFunction::Chr), "Concat" => Ok(ScalarFunction::Concat), "ConcatWithSeparator" => Ok(ScalarFunction::ConcatWithSeparator), - "DatePart" => Ok(ScalarFunction::DatePart), - "DateTrunc" => Ok(ScalarFunction::DateTrunc), "InitCap" => Ok(ScalarFunction::InitCap), "Left" => Ok(ScalarFunction::Left), "Lpad" => Ok(ScalarFunction::Lpad), @@ -22678,7 +22670,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "StructFun" => Ok(ScalarFunction::StructFun), "FromUnixtime" => Ok(ScalarFunction::FromUnixtime), "Atan2" => Ok(ScalarFunction::Atan2), - "DateBin" => Ok(ScalarFunction::DateBin), "ArrowTypeof" => Ok(ScalarFunction::ArrowTypeof), "CurrentDate" => Ok(ScalarFunction::CurrentDate), "CurrentTime" => Ok(ScalarFunction::CurrentTime), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 2d21f15570dd..b24fcf6680f8 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2663,8 +2663,8 @@ pub enum ScalarFunction { Chr = 25, Concat = 26, ConcatWithSeparator = 27, - DatePart = 28, - DateTrunc = 29, + /// 28 was DatePart + /// 29 was DateTrunc InitCap = 30, Left = 31, Lpad = 32, @@ -2703,7 +2703,7 @@ pub enum ScalarFunction { StructFun = 65, FromUnixtime = 66, Atan2 = 67, - DateBin = 68, + /// 68 was DateBin ArrowTypeof = 69, CurrentDate = 70, CurrentTime = 71, @@ -2804,8 +2804,6 @@ impl ScalarFunction { ScalarFunction::Chr => "Chr", ScalarFunction::Concat => "Concat", ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator", - ScalarFunction::DatePart => "DatePart", - ScalarFunction::DateTrunc => "DateTrunc", ScalarFunction::InitCap => "InitCap", ScalarFunction::Left => 
"Left", ScalarFunction::Lpad => "Lpad", @@ -2839,7 +2837,6 @@ impl ScalarFunction { ScalarFunction::StructFun => "StructFun", ScalarFunction::FromUnixtime => "FromUnixtime", ScalarFunction::Atan2 => "Atan2", - ScalarFunction::DateBin => "DateBin", ScalarFunction::ArrowTypeof => "ArrowTypeof", ScalarFunction::CurrentDate => "CurrentDate", ScalarFunction::CurrentTime => "CurrentTime", @@ -2932,8 +2929,6 @@ impl ScalarFunction { "Chr" => Some(Self::Chr), "Concat" => Some(Self::Concat), "ConcatWithSeparator" => Some(Self::ConcatWithSeparator), - "DatePart" => Some(Self::DatePart), - "DateTrunc" => Some(Self::DateTrunc), "InitCap" => Some(Self::InitCap), "Left" => Some(Self::Left), "Lpad" => Some(Self::Lpad), @@ -2967,7 +2962,6 @@ impl ScalarFunction { "StructFun" => Some(Self::StructFun), "FromUnixtime" => Some(Self::FromUnixtime), "Atan2" => Some(Self::Atan2), - "DateBin" => Some(Self::DateBin), "ArrowTypeof" => Some(Self::ArrowTypeof), "CurrentDate" => Some(Self::CurrentDate), "CurrentTime" => Some(Self::CurrentTime), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index ab7065cfbd85..66d52adce71d 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -54,8 +54,8 @@ use datafusion_expr::{ array_repeat, array_replace, array_replace_all, array_replace_n, array_resize, array_slice, array_sort, array_union, arrow_typeof, ascii, asinh, atan, atan2, atanh, bit_length, btrim, cardinality, cbrt, ceil, character_length, chr, coalesce, - concat_expr, concat_ws_expr, cos, cosh, cot, current_date, current_time, date_bin, - date_part, date_trunc, degrees, digest, ends_with, exp, + concat_expr, concat_ws_expr, cos, cosh, cot, current_date, current_time, degrees, + digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, flatten, floor, from_unixtime, gcd, initcap, instr, iszero, lcm, left, levenshtein, ln, log, log10, 
log2, @@ -509,9 +509,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::ArrayResize => Self::ArrayResize, ScalarFunction::Cardinality => Self::Cardinality, ScalarFunction::Array => Self::MakeArray, - ScalarFunction::DatePart => Self::DatePart, - ScalarFunction::DateTrunc => Self::DateTrunc, - ScalarFunction::DateBin => Self::DateBin, ScalarFunction::Md5 => Self::MD5, ScalarFunction::Sha224 => Self::SHA224, ScalarFunction::Sha256 => Self::SHA256, @@ -1530,19 +1527,6 @@ pub fn parse_expr( ScalarFunction::Trim => Ok(trim(parse_expr(&args[0], registry)?)), ScalarFunction::Ltrim => Ok(ltrim(parse_expr(&args[0], registry)?)), ScalarFunction::Rtrim => Ok(rtrim(parse_expr(&args[0], registry)?)), - ScalarFunction::DatePart => Ok(date_part( - parse_expr(&args[0], registry)?, - parse_expr(&args[1], registry)?, - )), - ScalarFunction::DateTrunc => Ok(date_trunc( - parse_expr(&args[0], registry)?, - parse_expr(&args[1], registry)?, - )), - ScalarFunction::DateBin => Ok(date_bin( - parse_expr(&args[0], registry)?, - parse_expr(&args[1], registry)?, - parse_expr(&args[2], registry)?, - )), ScalarFunction::Sha224 => Ok(sha224(parse_expr(&args[0], registry)?)), ScalarFunction::Sha256 => Ok(sha256(parse_expr(&args[0], registry)?)), ScalarFunction::Sha384 => Ok(sha384(parse_expr(&args[0], registry)?)), diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index c913119ff9ed..591ee796173f 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1489,9 +1489,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::ArrayUnion => Self::ArrayUnion, BuiltinScalarFunction::Cardinality => Self::Cardinality, BuiltinScalarFunction::MakeArray => Self::Array, - BuiltinScalarFunction::DatePart => Self::DatePart, - BuiltinScalarFunction::DateTrunc => Self::DateTrunc, - BuiltinScalarFunction::DateBin => 
Self::DateBin, BuiltinScalarFunction::MD5 => Self::Md5, BuiltinScalarFunction::SHA224 => Self::Sha224, BuiltinScalarFunction::SHA256 => Self::Sha256, diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index fb300e2c8791..c055c3f611de 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -40,7 +40,9 @@ unicode_expressions = [] arrow = { workspace = true } arrow-schema = { workspace = true } datafusion-common = { workspace = true, default-features = true } +datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } +datafusion-functions = { workspace = true } log = { workspace = true } sqlparser = { workspace = true } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index b058fb79b4a1..3e7a62e398c3 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -40,6 +40,7 @@ use datafusion_expr::{ col, expr, lit, AggregateFunction, Between, BinaryExpr, BuiltinScalarFunction, Cast, Expr, ExprSchemable, GetFieldAccess, GetIndexedField, Like, Operator, TryCast, }; +use datafusion_functions::expr_fn::_date_part_scalar_udf; use sqlparser::ast::{ArrayAgg, Expr as SQLExpr, JsonOperator, TrimWhereField, Value}; use sqlparser::parser::ParserError::ParserError; @@ -169,8 +170,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { self.parse_value(value, planner_context.prepare_param_data_types()) } SQLExpr::Extract { field, expr } => { - Ok(Expr::ScalarFunction(ScalarFunction::new( - BuiltinScalarFunction::DatePart, + Ok(Expr::ScalarFunction(ScalarFunction::new_udf( + _date_part_scalar_udf(), vec![ Expr::Literal(ScalarValue::from(format!("{field}"))), self.sql_expr_to_logical_expr(*expr, schema, planner_context)?, @@ -885,8 +886,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { #[cfg(test)] mod tests { - use super::*; - use std::collections::HashMap; use std::sync::Arc; @@ -900,6 +899,8 @@ mod tests { use crate::TableReference; + use super::*; + struct 
TestContextProvider { options: ConfigOptions, tables: HashMap>, diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 55551d1d25a3..688e9f341bf2 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -18,14 +18,15 @@ use std::any::Any; #[cfg(test)] use std::collections::HashMap; +use std::collections::HashSet; use std::{sync::Arc, vec}; use arrow_schema::*; use sqlparser::dialect::{Dialect, GenericDialect, HiveDialect, MySqlDialect}; use datafusion_common::{ - assert_contains, config::ConfigOptions, DataFusionError, Result, ScalarValue, - TableReference, + assert_contains, config::ConfigOptions, exec_err, not_impl_err, DataFusionError, + Result, ScalarValue, TableReference, }; use datafusion_common::{plan_err, ParamValues}; use datafusion_expr::{ @@ -38,6 +39,7 @@ use datafusion_sql::{ planner::{ContextProvider, ParserOptions, SqlToRel}, }; +use datafusion_execution::FunctionRegistry; use rstest::rstest; #[test] @@ -2661,7 +2663,8 @@ fn logical_plan_with_options(sql: &str, options: ParserOptions) -> Result Result { - let context = MockContextProvider::default(); + let mut context = MockContextProvider::default(); + datafusion_functions::register_all(&mut context)?; let planner = SqlToRel::new(&context); let result = DFParser::parse_sql_with_dialect(sql, dialect); let mut ast = result?; @@ -2673,11 +2676,12 @@ fn logical_plan_with_dialect_and_options( dialect: &dyn Dialect, options: ParserOptions, ) -> Result { - let context = MockContextProvider::default().with_udf(make_udf( + let mut context = MockContextProvider::default().with_udf(make_udf( "nullif", vec![DataType::Int32, DataType::Int32], DataType::Int32, )); + datafusion_functions::register_all(&mut context)?; let planner = SqlToRel::new_with_options(&context, options); let result = DFParser::parse_sql_with_dialect(sql, dialect); @@ -2731,8 +2735,7 @@ impl ScalarUDFImpl for DummyUDF { /// Create logical plan, write 
with formatter, compare to expected output fn quick_test(sql: &str, expected: &str) { - let plan = logical_plan(sql).unwrap(); - assert_eq!(format!("{plan:?}"), expected); + quick_test_with_options(sql, expected, ParserOptions::default()) } fn quick_test_with_options(sql: &str, expected: &str, options: ParserOptions) { @@ -2909,6 +2912,54 @@ impl ContextProvider for MockContextProvider { } } +impl FunctionRegistry for MockContextProvider { + fn udfs(&self) -> HashSet { + self.udfs + .keys() + .map(|s| s.to_string()) + .collect::>() + } + + fn udf(&self, name: &str) -> Result> { + let opt = self.udfs.get(name); + match opt { + Some(udf) => Ok(udf.clone()), + None => exec_err!("Not found"), + } + } + + fn udaf(&self, name: &str) -> Result> { + let opt = self.udafs.get(name); + match opt { + Some(udaf) => Ok(udaf.clone()), + None => exec_err!("Not found"), + } + } + + fn udwf(&self, _name: &str) -> Result> { + not_impl_err!("not implemented") + } + + fn register_udf(&mut self, udf: Arc) -> Result>> { + Ok(self.udfs.insert(String::from(udf.name()), udf)) + } + + fn register_udaf( + &mut self, + udaf: Arc, + ) -> Result>> { + Ok(self.udafs.insert(String::from(udaf.name()), udaf)) + } + + fn deregister_udf(&mut self, name: &str) -> Result>> { + Ok(self.udfs.remove(name)) + } + + fn deregister_udaf(&mut self, name: &str) -> Result>> { + Ok(self.udafs.remove(name)) + } +} + #[test] fn select_partially_qualified_column() { let sql = r#"SELECT person.first_name FROM public.person"#; From 2a4196021a13e56453dd0251f2b55ca498579c00 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 3 Mar 2024 12:22:15 -0500 Subject: [PATCH 02/17] I do not understand why the logical plan changed but updating the explain text to reflect the change. The physical plan is unchanged. 
--- .../sqllogictest/test_files/tpch/q8.slt.part | 57 ++++++++++--------- .../sqllogictest/test_files/tpch/q9.slt.part | 35 ++++++------ 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/datafusion/sqllogictest/test_files/tpch/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/q8.slt.part index 760b40ad1ae8..621a132129e8 100644 --- a/datafusion/sqllogictest/test_files/tpch/q8.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q8.slt.part @@ -61,34 +61,35 @@ Sort: all_nations.o_year ASC NULLS LAST ----Aggregate: groupBy=[[all_nations.o_year]], aggr=[[SUM(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Decimal128(Some(0),38,4) END) AS SUM(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), SUM(all_nations.volume)]] ------SubqueryAlias: all_nations --------Projection: date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation -----------Inner Join: n1.n_regionkey = region.r_regionkey -------------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name ---------------Inner Join: supplier.s_nationkey = n2.n_nationkey -----------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_orderdate, n1.n_regionkey -------------------Inner Join: customer.c_nationkey = n1.n_nationkey ---------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_orderdate, customer.c_nationkey -----------------------Inner Join: orders.o_custkey = customer.c_custkey -------------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_custkey, orders.o_orderdate ---------------------------Inner Join: lineitem.l_orderkey = orders.o_orderkey -----------------------------Projection: lineitem.l_orderkey, lineitem.l_extendedprice, 
lineitem.l_discount, supplier.s_nationkey -------------------------------Inner Join: lineitem.l_suppkey = supplier.s_suppkey ---------------------------------Projection: lineitem.l_orderkey, lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount -----------------------------------Inner Join: part.p_partkey = lineitem.l_partkey -------------------------------------Projection: part.p_partkey ---------------------------------------Filter: part.p_type = Utf8("ECONOMY ANODIZED STEEL") -----------------------------------------TableScan: part projection=[p_partkey, p_type], partial_filters=[part.p_type = Utf8("ECONOMY ANODIZED STEEL")] -------------------------------------TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] ---------------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] -----------------------------Filter: orders.o_orderdate >= Date32("9131") AND orders.o_orderdate <= Date32("9861") -------------------------------TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("9131"), orders.o_orderdate <= Date32("9861")] -------------------------TableScan: customer projection=[c_custkey, c_nationkey] ---------------------SubqueryAlias: n1 -----------------------TableScan: nation projection=[n_nationkey, n_regionkey] -----------------SubqueryAlias: n2 -------------------TableScan: nation projection=[n_nationkey, n_name] -------------Projection: region.r_regionkey ---------------Filter: region.r_name = Utf8("AMERICA") -----------------TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("AMERICA")] +----------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n2.n_name +------------Inner Join: n1.n_regionkey = region.r_regionkey +--------------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name 
+----------------Inner Join: supplier.s_nationkey = n2.n_nationkey +------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_orderdate, n1.n_regionkey +--------------------Inner Join: customer.c_nationkey = n1.n_nationkey +----------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_orderdate, customer.c_nationkey +------------------------Inner Join: orders.o_custkey = customer.c_custkey +--------------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_custkey, orders.o_orderdate +----------------------------Inner Join: lineitem.l_orderkey = orders.o_orderkey +------------------------------Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey +--------------------------------Inner Join: lineitem.l_suppkey = supplier.s_suppkey +----------------------------------Projection: lineitem.l_orderkey, lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount +------------------------------------Inner Join: part.p_partkey = lineitem.l_partkey +--------------------------------------Projection: part.p_partkey +----------------------------------------Filter: part.p_type = Utf8("ECONOMY ANODIZED STEEL") +------------------------------------------TableScan: part projection=[p_partkey, p_type], partial_filters=[part.p_type = Utf8("ECONOMY ANODIZED STEEL")] +--------------------------------------TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] +----------------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] +------------------------------Filter: orders.o_orderdate >= Date32("9131") AND orders.o_orderdate <= Date32("9861") +--------------------------------TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("9131"), orders.o_orderdate <= 
Date32("9861")] +--------------------------TableScan: customer projection=[c_custkey, c_nationkey] +----------------------SubqueryAlias: n1 +------------------------TableScan: nation projection=[n_nationkey, n_regionkey] +------------------SubqueryAlias: n2 +--------------------TableScan: nation projection=[n_nationkey, n_name] +--------------Projection: region.r_regionkey +----------------Filter: region.r_name = Utf8("AMERICA") +------------------TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("AMERICA")] physical_plan SortPreservingMergeExec: [o_year@0 ASC NULLS LAST] --SortExec: expr=[o_year@0 ASC NULLS LAST] diff --git a/datafusion/sqllogictest/test_files/tpch/q9.slt.part b/datafusion/sqllogictest/test_files/tpch/q9.slt.part index 5db97f79bdb1..ecd0056d17b4 100644 --- a/datafusion/sqllogictest/test_files/tpch/q9.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q9.slt.part @@ -58,23 +58,24 @@ Limit: skip=0, fetch=10 ------Aggregate: groupBy=[[profit.nation, profit.o_year]], aggr=[[SUM(profit.amount)]] --------SubqueryAlias: profit ----------Projection: nation.n_name AS nation, date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount -------------Inner Join: supplier.s_nationkey = nation.n_nationkey ---------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost, orders.o_orderdate -----------------Inner Join: lineitem.l_orderkey = orders.o_orderkey -------------------Projection: lineitem.l_orderkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost ---------------------Inner Join: lineitem.l_suppkey = partsupp.ps_suppkey, lineitem.l_partkey = partsupp.ps_partkey -----------------------Projection: lineitem.l_orderkey, lineitem.l_partkey, 
lineitem.l_suppkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey -------------------------Inner Join: lineitem.l_suppkey = supplier.s_suppkey ---------------------------Projection: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount -----------------------------Inner Join: part.p_partkey = lineitem.l_partkey -------------------------------Projection: part.p_partkey ---------------------------------Filter: part.p_name LIKE Utf8("%green%") -----------------------------------TableScan: part projection=[p_partkey, p_name], partial_filters=[part.p_name LIKE Utf8("%green%")] -------------------------------TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] ---------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] -----------------------TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] -------------------TableScan: orders projection=[o_orderkey, o_orderdate] ---------------TableScan: nation projection=[n_nationkey, n_name] +------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, partsupp.ps_supplycost, orders.o_orderdate, nation.n_name +--------------Inner Join: supplier.s_nationkey = nation.n_nationkey +----------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost, orders.o_orderdate +------------------Inner Join: lineitem.l_orderkey = orders.o_orderkey +--------------------Projection: lineitem.l_orderkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost +----------------------Inner Join: lineitem.l_suppkey = partsupp.ps_suppkey, lineitem.l_partkey = partsupp.ps_partkey +------------------------Projection: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_suppkey, 
lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey +--------------------------Inner Join: lineitem.l_suppkey = supplier.s_suppkey +----------------------------Projection: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount +------------------------------Inner Join: part.p_partkey = lineitem.l_partkey +--------------------------------Projection: part.p_partkey +----------------------------------Filter: part.p_name LIKE Utf8("%green%") +------------------------------------TableScan: part projection=[p_partkey, p_name], partial_filters=[part.p_name LIKE Utf8("%green%")] +--------------------------------TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] +----------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] +------------------------TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] +--------------------TableScan: orders projection=[o_orderkey, o_orderdate] +----------------TableScan: nation projection=[n_nationkey, n_name] physical_plan GlobalLimitExec: skip=0, fetch=10 --SortPreservingMergeExec: [nation@0 ASC NULLS LAST,o_year@1 DESC], fetch=10 From 618cc40cabeeef0bdac30b9f4259347e874deec7 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 3 Mar 2024 12:40:50 -0500 Subject: [PATCH 03/17] Fix fmt --- datafusion/proto/src/logical_plan/from_proto.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index e21a5717aa7f..8befb7e91690 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -54,8 +54,7 @@ use datafusion_expr::{ array_replace_all, array_replace_n, array_resize, array_slice, array_sort, array_union, arrow_typeof, ascii, asinh, atan, atan2, atanh, bit_length, btrim, 
cbrt, ceil, character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, - current_date, current_time, degrees, - digest, ends_with, exp, + current_date, current_time, degrees, digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, flatten, floor, from_unixtime, gcd, initcap, iszero, lcm, left, levenshtein, ln, log, log10, log2, From 98d5ff7acc10fec0005d52a90d9f9a4ef54a3063 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Mon, 4 Mar 2024 20:53:54 -0500 Subject: [PATCH 04/17] Improvements to remove datafusion-functions dependency from sq and physical-expr --- datafusion-cli/Cargo.lock | 3 - datafusion/functions/src/datetime/mod.rs | 11 +-- datafusion/physical-expr/Cargo.toml | 1 - .../src/equivalence/projection.rs | 63 ++------------- datafusion/sql/Cargo.toml | 2 - datafusion/sql/src/expr/mod.rs | 22 ++++-- datafusion/sql/tests/sql_integration.rs | 76 ++++--------------- 7 files changed, 39 insertions(+), 139 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 683f4e320530..46484be0e195 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1296,7 +1296,6 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", - "datafusion-functions", "half", "hashbrown 0.14.3", "hex", @@ -1350,9 +1349,7 @@ dependencies = [ "arrow", "arrow-schema", "datafusion-common", - "datafusion-execution", "datafusion-expr", - "datafusion-functions", "log", "sqlparser", ] diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index bf8406c47273..529789acf105 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -59,8 +59,7 @@ make_udf_function!( // functions with varargs currently pub mod expr_fn { - use datafusion_expr::{Expr, ScalarUDF}; - use std::sync::Arc; + use datafusion_expr::Expr; #[doc = "coerces an arbitrary timestamp to the start of the nearest specified 
interval"] pub fn date_bin(args: Vec) -> Expr { @@ -151,14 +150,6 @@ pub mod expr_fn { pub fn to_timestamp_nanos(args: Vec) -> Expr { super::to_timestamp_nanos().call(args) } - - pub fn _date_bin_scalar_udf() -> Arc { - super::date_bin() - } - - pub fn _date_part_scalar_udf() -> Arc { - super::date_part() - } } /// Return a list of all functions in this package diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 7e59d617b3f1..0ff7bd595c5b 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -61,7 +61,6 @@ chrono = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } -datafusion-functions = { workspace = true } half = { workspace = true } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", optional = true } diff --git a/datafusion/physical-expr/src/equivalence/projection.rs b/datafusion/physical-expr/src/equivalence/projection.rs index 9458fb608fa6..11867337d00e 100644 --- a/datafusion/physical-expr/src/equivalence/projection.rs +++ b/datafusion/physical-expr/src/equivalence/projection.rs @@ -119,50 +119,33 @@ mod tests { }; use crate::equivalence::EquivalenceProperties; use crate::execution_props::ExecutionProps; - use crate::expressions::{col, BinaryExpr, Literal}; + use crate::expressions::{col, BinaryExpr}; use crate::functions::create_physical_expr; use crate::PhysicalSortExpr; use arrow::datatypes::{DataType, Field, Schema}; use arrow_schema::{SortOptions, TimeUnit}; - use datafusion_common::{Result, ScalarValue}; + use datafusion_common::Result; use datafusion_expr::{BuiltinScalarFunction, Operator}; use itertools::Itertools; #[test] fn project_orderings() -> Result<()> { - let s = Schema::new(vec![ + let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Int32, true), Field::new("c", 
DataType::Int32, true), Field::new("d", DataType::Int32, true), Field::new("e", DataType::Int32, true), Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true), - ]); - let df_schema = s.clone().to_dfschema()?; - let schema = Arc::new(s); + ])); let col_a = &col("a", &schema)?; let col_b = &col("b", &schema)?; let col_c = &col("c", &schema)?; let col_d = &col("d", &schema)?; let col_e = &col("e", &schema)?; let col_ts = &col("ts", &schema)?; - let expr_col_ts = datafusion_common::Column::from_qualified_name("ts"); - let interval_value = ScalarValue::IntervalDayTime(Some(2)); - let expr_interval = Expr::Literal(interval_value.clone()); - let interval = Arc::new(Literal::new(interval_value)) as Arc; - let date_bin_udf = _date_bin_scalar_udf(); - let date_bin_func = &crate::udf::create_physical_expr( - &date_bin_udf, - &[interval.clone(), col_ts.clone()], - date_bin_udf - .return_type_from_exprs( - &[expr_interval, Expr::Column(expr_col_ts)], - &df_schema, - ) - .unwrap(), - )?; let a_plus_b = Arc::new(BinaryExpr::new( col_a.clone(), Operator::Plus, @@ -234,12 +217,9 @@ mod tests { (col_b, "b_new".to_string()), (col_a, "a_new".to_string()), (col_ts, "ts_new".to_string()), - (date_bin_func, "date_bin_res".to_string()), ], // expected vec![ - // [date_bin_res ASC] - vec![("date_bin_res", option_asc)], // [ts_new ASC] vec![("ts_new", option_asc)], ], @@ -258,18 +238,13 @@ mod tests { (col_b, "b_new".to_string()), (col_a, "a_new".to_string()), (col_ts, "ts_new".to_string()), - (date_bin_func, "date_bin_res".to_string()), ], // expected vec![ // [a_new ASC, ts_new ASC] vec![("a_new", option_asc), ("ts_new", option_asc)], - // [a_new ASC, date_bin_res ASC] - vec![("a_new", option_asc), ("date_bin_res", option_asc)], // [b_new ASC, ts_new ASC] vec![("b_new", option_asc), ("ts_new", option_asc)], - // [b_new ASC, date_bin_res ASC] - vec![("b_new", option_asc), ("date_bin_res", option_asc)], ], ), // ---------- TEST CASE 5 ------------ @@ -654,39 +629,22 @@ mod 
tests { #[test] fn project_orderings2() -> Result<()> { - let s = Schema::new(vec![ + let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Int32, true), Field::new("c", DataType::Int32, true), Field::new("d", DataType::Int32, true), Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true), - ]); - let df_schema = s.clone().to_dfschema()?; - let schema = Arc::new(s.clone()); + ])); let col_a = &col("a", &schema)?; let col_b = &col("b", &schema)?; let col_c = &col("c", &schema)?; let col_ts = &col("ts", &schema)?; - let expr_col_ts = datafusion_common::Column::from_qualified_name("ts"); let a_plus_b = Arc::new(BinaryExpr::new( col_a.clone(), Operator::Plus, col_b.clone(), )) as Arc; - let interval_value = ScalarValue::IntervalDayTime(Some(2)); - let expr_interval = Expr::Literal(interval_value.clone()); - let interval = Arc::new(Literal::new(interval_value)) as Arc; - let date_bin_udf = _date_bin_scalar_udf(); - let date_bin_ts = &crate::udf::create_physical_expr( - &date_bin_udf, - &[interval.clone(), col_ts.clone()], - date_bin_udf - .return_type_from_exprs( - &[expr_interval, Expr::Column(expr_col_ts)], - &df_schema, - ) - .unwrap(), - )?; let round_c = &create_physical_expr( &BuiltinScalarFunction::Round, @@ -704,7 +662,6 @@ mod tests { (col_b, "b_new".to_string()), (col_a, "a_new".to_string()), (col_c, "c_new".to_string()), - (date_bin_ts, "date_bin_res".to_string()), (round_c, "round_c_res".to_string()), ]; let proj_exprs = proj_exprs @@ -717,7 +674,6 @@ mod tests { let col_a_new = &col("a_new", &output_schema)?; let col_b_new = &col("b_new", &output_schema)?; let col_c_new = &col("c_new", &output_schema)?; - let col_date_bin_res = &col("date_bin_res", &output_schema)?; let col_round_c_res = &col("round_c_res", &output_schema)?; let a_new_plus_b_new = Arc::new(BinaryExpr::new( col_a_new.clone(), @@ -762,7 +718,7 @@ mod tests { // expected vec![ // [a_new ASC, date_bin_res ASC] - 
vec![(col_a_new, option_asc), (col_date_bin_res, option_asc)], + vec![(col_a_new, option_asc)], ], ), // ---------- TEST CASE 4 ------------ @@ -779,10 +735,7 @@ mod tests { // expected vec![ // [a_new ASC, date_bin_res ASC] - // Please note that result is not [a_new ASC, date_bin_res ASC, b_new ASC] - // because, datebin_res may not be 1-1 function. Hence without introducing ts - // dependency we cannot guarantee any ordering after date_bin_res column. - vec![(col_a_new, option_asc), (col_date_bin_res, option_asc)], + vec![(col_a_new, option_asc)], ], ), // ---------- TEST CASE 5 ------------ diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index c055c3f611de..fb300e2c8791 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -40,9 +40,7 @@ unicode_expressions = [] arrow = { workspace = true } arrow-schema = { workspace = true } datafusion-common = { workspace = true, default-features = true } -datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } -datafusion-functions = { workspace = true } log = { workspace = true } sqlparser = { workspace = true } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 3a24a5418783..be82a59f08bc 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -31,7 +31,8 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use arrow_schema::DataType; use arrow_schema::TimeUnit; use datafusion_common::{ - internal_err, not_impl_err, plan_err, Column, DFSchema, Result, ScalarValue, + internal_datafusion_err, internal_err, not_impl_err, plan_err, Column, DFSchema, + Result, ScalarValue, }; use datafusion_expr::expr::AggregateFunctionDefinition; use datafusion_expr::expr::InList; @@ -40,7 +41,6 @@ use datafusion_expr::{ col, expr, lit, AggregateFunction, Between, BinaryExpr, BuiltinScalarFunction, Cast, Expr, ExprSchemable, GetFieldAccess, GetIndexedField, Like, Operator, TryCast, }; -use 
datafusion_functions::expr_fn::_date_part_scalar_udf; use sqlparser::ast::{ArrayAgg, Expr as SQLExpr, JsonOperator, TrimWhereField, Value}; use sqlparser::parser::ParserError::ParserError; @@ -170,12 +170,20 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { self.parse_value(value, planner_context.prepare_param_data_types()) } SQLExpr::Extract { field, expr } => { + let date_part = self + .context_provider + .get_function_meta("date_part") + .ok_or_else(|| { + internal_datafusion_err!( + "Unable to find expected 'date_part' function" + ) + })?; + let args = vec![ + Expr::Literal(ScalarValue::from(format!("{field}"))), + self.sql_expr_to_logical_expr(*expr, schema, planner_context)?, + ]; Ok(Expr::ScalarFunction(ScalarFunction::new_udf( - _date_part_scalar_udf(), - vec![ - Expr::Literal(ScalarValue::from(format!("{field}"))), - self.sql_expr_to_logical_expr(*expr, schema, planner_context)?, - ], + date_part, args, ))) } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 570476dc89d6..90e22b4c75dc 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -18,15 +18,15 @@ use std::any::Any; #[cfg(test)] use std::collections::HashMap; -use std::collections::HashSet; use std::{sync::Arc, vec}; +use arrow_schema::TimeUnit::Nanosecond; use arrow_schema::*; use sqlparser::dialect::{Dialect, GenericDialect, HiveDialect, MySqlDialect}; use datafusion_common::{ - assert_contains, config::ConfigOptions, exec_err, not_impl_err, DataFusionError, - Result, ScalarValue, TableReference, + assert_contains, config::ConfigOptions, DataFusionError, Result, ScalarValue, + TableReference, }; use datafusion_common::{plan_err, ParamValues}; use datafusion_expr::{ @@ -39,7 +39,6 @@ use datafusion_sql::{ planner::{ContextProvider, ParserOptions, SqlToRel}, }; -use datafusion_execution::FunctionRegistry; use rstest::rstest; #[test] @@ -2663,8 +2662,7 @@ fn logical_plan_with_options(sql: &str, options: 
ParserOptions) -> Result Result { - let mut context = MockContextProvider::default(); - datafusion_functions::register_all(&mut context)?; + let context = MockContextProvider::default(); let planner = SqlToRel::new(&context); let result = DFParser::parse_sql_with_dialect(sql, dialect); let mut ast = result?; @@ -2676,13 +2674,17 @@ fn logical_plan_with_dialect_and_options( dialect: &dyn Dialect, options: ParserOptions, ) -> Result { - let mut context = MockContextProvider::default().with_udf(make_udf( - "nullif", - vec![DataType::Int32, DataType::Int32], - DataType::Int32, - )); - datafusion_functions::register_all(&mut context)?; - + let context = MockContextProvider::default() + .with_udf(make_udf( + "nullif", + vec![DataType::Int32, DataType::Int32], + DataType::Int32, + )) + .with_udf(make_udf( + "date_trunc", + vec![DataType::Utf8, DataType::Timestamp(Nanosecond, None)], + DataType::Int32, + )); let planner = SqlToRel::new_with_options(&context, options); let result = DFParser::parse_sql_with_dialect(sql, dialect); let mut ast = result?; @@ -2912,54 +2914,6 @@ impl ContextProvider for MockContextProvider { } } -impl FunctionRegistry for MockContextProvider { - fn udfs(&self) -> HashSet { - self.udfs - .keys() - .map(|s| s.to_string()) - .collect::>() - } - - fn udf(&self, name: &str) -> Result> { - let opt = self.udfs.get(name); - match opt { - Some(udf) => Ok(udf.clone()), - None => exec_err!("Not found"), - } - } - - fn udaf(&self, name: &str) -> Result> { - let opt = self.udafs.get(name); - match opt { - Some(udaf) => Ok(udaf.clone()), - None => exec_err!("Not found"), - } - } - - fn udwf(&self, _name: &str) -> Result> { - not_impl_err!("not implemented") - } - - fn register_udf(&mut self, udf: Arc) -> Result>> { - Ok(self.udfs.insert(String::from(udf.name()), udf)) - } - - fn register_udaf( - &mut self, - udaf: Arc, - ) -> Result>> { - Ok(self.udafs.insert(String::from(udaf.name()), udaf)) - } - - fn deregister_udf(&mut self, name: &str) -> Result>> { - 
Ok(self.udfs.remove(name)) - } - - fn deregister_udaf(&mut self, name: &str) -> Result>> { - Ok(self.udafs.remove(name)) - } -} - #[test] fn select_partially_qualified_column() { let sql = r#"SELECT person.first_name FROM public.person"#; From 4c84f0889d549295c77d2ae8e352768bd16a843a Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 5 Mar 2024 18:13:53 -0500 Subject: [PATCH 05/17] WIP --- datafusion/expr/src/built_in_function.rs | 18 +---- .../functions/src/datetime/from_unixtime.rs | 75 +++++++++++++++++++ datafusion/functions/src/datetime/mod.rs | 8 ++ datafusion/functions/src/datetime/now.rs | 58 ++++++++++++++ .../physical-expr/src/datetime_expressions.rs | 22 ------ datafusion/physical-expr/src/functions.rs | 3 - datafusion/proto/proto/datafusion.proto | 2 +- 7 files changed, 143 insertions(+), 43 deletions(-) create mode 100644 datafusion/functions/src/datetime/from_unixtime.rs create mode 100644 datafusion/functions/src/datetime/now.rs diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 45f66ac14e28..a7cd106f4b66 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -247,10 +247,6 @@ pub enum BuiltinScalarFunction { Substr, /// to_hex ToHex, - /// from_unixtime - FromUnixtime, - ///now - Now, ///current_date CurrentDate, /// current_time @@ -432,7 +428,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Trim => Volatility::Immutable, BuiltinScalarFunction::Upper => Volatility::Immutable, BuiltinScalarFunction::Struct => Volatility::Immutable, - BuiltinScalarFunction::FromUnixtime => Volatility::Immutable, BuiltinScalarFunction::ArrowTypeof => Volatility::Immutable, BuiltinScalarFunction::OverLay => Volatility::Immutable, BuiltinScalarFunction::Levenshtein => Volatility::Immutable, @@ -440,7 +435,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::FindInSet => Volatility::Immutable, // Stable builtin functions - BuiltinScalarFunction::Now => 
Volatility::Stable, BuiltinScalarFunction::CurrentDate => Volatility::Stable, BuiltinScalarFunction::CurrentTime => Volatility::Stable, @@ -707,10 +701,6 @@ impl BuiltinScalarFunction { utf8_to_int_type(&input_expr_types[0], "find_in_set") } BuiltinScalarFunction::ToChar => Ok(Utf8), - BuiltinScalarFunction::FromUnixtime => Ok(Timestamp(Second, None)), - BuiltinScalarFunction::Now => { - Ok(Timestamp(Nanosecond, Some("+00:00".into()))) - } BuiltinScalarFunction::CurrentDate => Ok(Date32), BuiltinScalarFunction::CurrentTime => Ok(Time64(Nanosecond)), BuiltinScalarFunction::MakeDate => Ok(Date32), @@ -962,9 +952,6 @@ impl BuiltinScalarFunction { ], self.volatility(), ), - BuiltinScalarFunction::FromUnixtime => { - Signature::uniform(1, vec![Int64], self.volatility()) - } BuiltinScalarFunction::Digest => Signature::one_of( vec![ Exact(vec![Utf8, Utf8]), @@ -1127,8 +1114,7 @@ impl BuiltinScalarFunction { // will be as good as the number of digits in the number Signature::uniform(1, vec![Float64, Float32], self.volatility()) } - BuiltinScalarFunction::Now - | BuiltinScalarFunction::CurrentDate + BuiltinScalarFunction::CurrentDate | BuiltinScalarFunction::CurrentTime => { Signature::uniform(0, vec![], self.volatility()) } @@ -1264,12 +1250,10 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::RegexpReplace => &["regexp_replace"], // time/date functions - BuiltinScalarFunction::Now => &["now"], BuiltinScalarFunction::CurrentDate => &["current_date", "today"], BuiltinScalarFunction::CurrentTime => &["current_time"], BuiltinScalarFunction::MakeDate => &["make_date"], BuiltinScalarFunction::ToChar => &["to_char", "date_format"], - BuiltinScalarFunction::FromUnixtime => &["from_unixtime"], // hashing functions BuiltinScalarFunction::Digest => &["digest"], diff --git a/datafusion/functions/src/datetime/from_unixtime.rs b/datafusion/functions/src/datetime/from_unixtime.rs new file mode 100644 index 000000000000..f0d5016c0db9 --- /dev/null +++ 
b/datafusion/functions/src/datetime/from_unixtime.rs @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; + +use arrow::datatypes::DataType; +use arrow::datatypes::DataType::{Int64, Timestamp}; +use arrow::datatypes::TimeUnit::Second; + +use datafusion_common::{exec_err, Result}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct FromUnixtimeFunc { + signature: Signature, +} + +impl FromUnixtimeFunc { + pub fn new() -> Self { + Self { + signature: Signature::uniform(1, vec![Int64], Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for FromUnixtimeFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "from_unixtime" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Timestamp(Second, None)) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.len() != 1 { + return exec_err!( + "from_unixtime function requires 1 argument, got {}", + args.len() + ); + } + + match args[0].data_type() { + Int64 => args[0].cast_to(&Timestamp(Second, None), None), + other => { + exec_err!( + "Unsupported data 
type {:?} for function from_unixtime", + other + ) + } + } + } +} diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index 529789acf105..58f238bd3f0b 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -25,13 +25,16 @@ mod common; mod date_bin; mod date_part; mod date_trunc; +mod from_unixtime; mod to_date; mod to_timestamp; +mod now; // create UDFs make_udf_function!(date_bin::DateBinFunc, DATE_BIN, date_bin); make_udf_function!(date_part::DatePartFunc, DATE_PART, date_part); make_udf_function!(date_trunc::DateTruncFunc, DATE_TRUNC, date_trunc); +make_udf_function!(from_unixtime::FromUnixtimeFunc, FROM_UNIXTIME, from_unixtime); make_udf_function!(to_date::ToDateFunc, TO_DATE, to_date); make_udf_function!(to_timestamp::ToTimestampFunc, TO_TIMESTAMP, to_timestamp); make_udf_function!( @@ -76,6 +79,11 @@ pub mod expr_fn { super::date_trunc().call(args) } + #[doc = "todo -fixme"] + pub fn from_unixtime(args: Vec) -> Expr { + super::from_unixtime().call(args) + } + /// ```ignore /// # use std::sync::Arc; /// diff --git a/datafusion/functions/src/datetime/now.rs b/datafusion/functions/src/datetime/now.rs new file mode 100644 index 000000000000..075c81e7d184 --- /dev/null +++ b/datafusion/functions/src/datetime/now.rs @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; + +use arrow::datatypes::DataType; +use arrow::datatypes::DataType::{Int64, Timestamp}; +use arrow::datatypes::TimeUnit::{Nanosecond, Second}; + +use datafusion_common::{exec_err, Result}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct NowFunc { + signature: Signature, +} + +impl NowFunc { + pub fn new() -> Self { + Self { + signature: Signature::uniform(0, vec![], Volatility::Stable), + } + } +} + +impl ScalarUDFImpl for NowFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "now" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Timestamp(Nanosecond, Some("+00:00".into()))) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result {} +} diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index a7133dcbedd4..75af63f62eab 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -401,28 +401,6 @@ pub fn make_date(args: &[ColumnarValue]) -> Result { } } -/// from_unixtime() SQL function implementation -pub fn from_unixtime_invoke(args: &[ColumnarValue]) -> Result { - if args.len() != 1 { - return exec_err!( - "from_unixtime function requires 1 argument, got {}", - args.len() - ); - } - - match args[0].data_type() { - DataType::Int64 => { - args[0].cast_to(&DataType::Timestamp(TimeUnit::Second, None), None) - } 
- other => { - exec_err!( - "Unsupported data type {:?} for function from_unixtime", - other - ) - } - } -} - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index b7afd4c3bca0..1d7e07c7042c 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -485,9 +485,6 @@ pub fn create_physical_fun( } BuiltinScalarFunction::MakeDate => Arc::new(datetime_expressions::make_date), BuiltinScalarFunction::ToChar => Arc::new(datetime_expressions::to_char), - BuiltinScalarFunction::FromUnixtime => { - Arc::new(datetime_expressions::from_unixtime_invoke) - } BuiltinScalarFunction::InitCap => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::initcap::)(args) diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 61c9e2cd9c9b..ed2d7146ad91 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -614,7 +614,7 @@ enum ScalarFunction { Coalesce = 63; Power = 64; StructFun = 65; - FromUnixtime = 66; + // 66 was FromUnixtime Atan2 = 67; // 68 was DateBin ArrowTypeof = 69; From 9675dcdb7c78b177bc565bec828c17ea9ec02d20 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 5 Mar 2024 19:50:00 -0500 Subject: [PATCH 06/17] Fix function arguments for date_bin, date_trunc and date_part. 
--- datafusion/functions/src/datetime/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index 529789acf105..06ba307aa22a 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -62,18 +62,18 @@ pub mod expr_fn { use datafusion_expr::Expr; #[doc = "coerces an arbitrary timestamp to the start of the nearest specified interval"] - pub fn date_bin(args: Vec) -> Expr { - super::date_bin().call(args) + pub fn date_bin(stride: Expr, source: Expr, origin: Expr) -> Expr { + super::date_bin().call(vec![stride, source, origin]) } #[doc = "extracts a subfield from the date"] - pub fn date_part(args: Vec) -> Expr { - super::date_part().call(args) + pub fn date_part(part: Expr, date: Expr) -> Expr { + super::date_part().call(vec![part, date]) } #[doc = "truncates the date to a specified level of precision"] - pub fn date_trunc(args: Vec) -> Expr { - super::date_trunc().call(args) + pub fn date_trunc(part: Expr, date: Expr) -> Expr { + super::date_trunc().call(vec![part, date]) } /// ```ignore From 8840d5004b70fd2f41005e6ffafaed6db3d14c40 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 5 Mar 2024 22:06:51 -0500 Subject: [PATCH 07/17] WIP --- datafusion/expr/src/built_in_function.rs | 3 +- datafusion/expr/src/expr_fn.rs | 7 ---- datafusion/functions/src/datetime/mod.rs | 24 ++++++++++---- datafusion/functions/src/datetime/now.rs | 33 ++++++++++++++++--- .../physical-expr/src/datetime_expressions.rs | 7 ---- datafusion/physical-expr/src/functions.rs | 7 ---- datafusion/proto/proto/datafusion.proto | 2 +- datafusion/proto/src/generated/pbjson.rs | 6 ---- datafusion/proto/src/generated/prost.rs | 8 ++--- .../proto/src/logical_plan/from_proto.rs | 16 +++------ datafusion/proto/src/logical_plan/to_proto.rs | 2 -- 11 files changed, 55 insertions(+), 60 deletions(-) diff --git 
a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 450e5a7969b3..ea9ef083a7aa 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -1099,8 +1099,7 @@ impl BuiltinScalarFunction { // will be as good as the number of digits in the number Signature::uniform(1, vec![Float64, Float32], self.volatility()) } - BuiltinScalarFunction::CurrentDate - | BuiltinScalarFunction::CurrentTime => { + BuiltinScalarFunction::CurrentDate | BuiltinScalarFunction::CurrentTime => { Signature::uniform(0, vec![], self.volatility()) } BuiltinScalarFunction::MakeDate => Signature::uniform( diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index d18697d11277..2ff64cc5ea60 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -861,14 +861,7 @@ scalar_expr!( datetime format, "converts a date, time, timestamp or duration to a string based on the provided format" ); -scalar_expr!( - FromUnixtime, - from_unixtime, - unixtime, - "returns the unix time in format" -); scalar_expr!(CurrentDate, current_date, ,"returns current UTC date as a [`DataType::Date32`] value"); -scalar_expr!(Now, now, ,"returns current timestamp in nanoseconds, using the same value for all instances of now() in same statement"); scalar_expr!(CurrentTime, current_time, , "returns current UTC time as a [`DataType::Time64`] value"); scalar_expr!(MakeDate, make_date, year month day, "make a date from year, month and day component parts"); scalar_expr!(Nanvl, nanvl, x y, "returns x if x is not NaN otherwise returns y"); diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index 58f238bd3f0b..5bf68b22edd4 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -26,15 +26,20 @@ mod date_bin; mod date_part; mod date_trunc; mod from_unixtime; +mod now; mod to_date; mod to_timestamp; -mod now; // 
create UDFs make_udf_function!(date_bin::DateBinFunc, DATE_BIN, date_bin); make_udf_function!(date_part::DatePartFunc, DATE_PART, date_part); make_udf_function!(date_trunc::DateTruncFunc, DATE_TRUNC, date_trunc); -make_udf_function!(from_unixtime::FromUnixtimeFunc, FROM_UNIXTIME, from_unixtime); +make_udf_function!( + from_unixtime::FromUnixtimeFunc, + FROM_UNIXTIME, + from_unixtime +); +make_udf_function!(now::NowFunc, NOW, now); make_udf_function!(to_date::ToDateFunc, TO_DATE, to_date); make_udf_function!(to_timestamp::ToTimestampFunc, TO_TIMESTAMP, to_timestamp); make_udf_function!( @@ -79,11 +84,16 @@ pub mod expr_fn { super::date_trunc().call(args) } - #[doc = "todo -fixme"] - pub fn from_unixtime(args: Vec) -> Expr { - super::from_unixtime().call(args) + #[doc = "converts an integer to RFC3339 timestamp format"] + pub fn from_unixtime(unixtime: Expr) -> Expr { + super::from_unixtime().call(vec![unixtime]) } - + + #[doc = "returns the current timestamp in nanoseconds, using the same value for all instances of now() in same statement"] + pub fn now() -> Expr { + super::now().call(vec![]) + } + /// ```ignore /// # use std::sync::Arc; /// @@ -166,6 +176,8 @@ pub fn functions() -> Vec> { date_bin(), date_part(), date_trunc(), + from_unixtime(), + now(), to_date(), to_timestamp(), to_timestamp_seconds(), diff --git a/datafusion/functions/src/datetime/now.rs b/datafusion/functions/src/datetime/now.rs index 075c81e7d184..8ceda64e0ba9 100644 --- a/datafusion/functions/src/datetime/now.rs +++ b/datafusion/functions/src/datetime/now.rs @@ -18,11 +18,12 @@ use std::any::Any; use arrow::datatypes::DataType; -use arrow::datatypes::DataType::{Int64, Timestamp}; -use arrow::datatypes::TimeUnit::{Nanosecond, Second}; +use arrow::datatypes::DataType::Timestamp; +use arrow::datatypes::TimeUnit::Nanosecond; -use datafusion_common::{exec_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_common::{Result, ScalarValue}; +use 
datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; +use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] pub(super) struct NowFunc { @@ -37,6 +38,12 @@ impl NowFunc { } } +/// Create an implementation of `now()` that always returns the +/// specified timestamp. +/// +/// The semantics of `now()` require it to return the same value +/// wherever it appears within a single statement. This value is +/// chosen during planning time. impl ScalarUDFImpl for NowFunc { fn as_any(&self) -> &dyn Any { self @@ -54,5 +61,21 @@ impl ScalarUDFImpl for NowFunc { Ok(Timestamp(Nanosecond, Some("+00:00".into()))) } - fn invoke(&self, args: &[ColumnarValue]) -> Result {} + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + todo!() + } + + fn simplify( + &self, + _args: Vec, + info: &dyn SimplifyInfo, + ) -> Result { + let now_ts = info + .execution_props() + .query_execution_start_time + .timestamp_nanos_opt(); + Ok(ExprSimplifyResult::Simplified(Expr::Literal( + ScalarValue::TimestampNanosecond(now_ts, Some("+00:00".into())), + ))) + } } diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index 75af63f62eab..22a67e6fc850 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -19,7 +19,6 @@ use std::sync::Arc; -use arrow::datatypes::TimeUnit; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; use arrow::{ array::{Array, ArrayRef, PrimitiveArray}, @@ -35,12 +34,6 @@ use chrono::NaiveDate; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::ColumnarValue; -/// Create an implementation of `now()` that always returns the -/// specified timestamp. -/// -/// The semantics of `now()` require it to return the same value -/// wherever it appears within a single statement. This value is -/// chosen during planning time. 
pub fn make_now( now_ts: DateTime, ) -> impl Fn(&[ColumnarValue]) -> Result { diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index d8aa434d76c5..12fd49dceca3 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -457,12 +457,6 @@ pub fn create_physical_fun( BuiltinScalarFunction::ConcatWithSeparator => Arc::new(|args| { make_scalar_function_inner(string_expressions::concat_ws)(args) }), - BuiltinScalarFunction::Now => { - // bind value for now at plan time - Arc::new(datetime_expressions::make_now( - execution_props.query_execution_start_time, - )) - } BuiltinScalarFunction::CurrentDate => { // bind value for current_date at plan time Arc::new(datetime_expressions::make_current_date( @@ -2839,7 +2833,6 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let funs = [ - BuiltinScalarFunction::Now, BuiltinScalarFunction::Pi, BuiltinScalarFunction::Random, BuiltinScalarFunction::Uuid, diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 59ee1155d34b..81eab272d8c8 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -607,7 +607,7 @@ enum ScalarFunction { // 56 was ToTimestampMillis // 57 was ToTimestampMicros // 58 was ToTimestampSeconds - Now = 59; + // 59 was Now Translate = 60; Trim = 61; Upper = 62; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 4bb4ab22962c..a77d7857097a 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22369,14 +22369,12 @@ impl serde::Serialize for ScalarFunction { Self::Strpos => "Strpos", Self::Substr => "Substr", Self::ToHex => "ToHex", - Self::Now => "Now", Self::Translate => "Translate", Self::Trim => "Trim", Self::Upper => "Upper", Self::Coalesce => "Coalesce", Self::Power => "Power", 
Self::StructFun => "StructFun", - Self::FromUnixtime => "FromUnixtime", Self::Atan2 => "Atan2", Self::ArrowTypeof => "ArrowTypeof", Self::CurrentDate => "CurrentDate", @@ -22494,14 +22492,12 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Strpos", "Substr", "ToHex", - "Now", "Translate", "Trim", "Upper", "Coalesce", "Power", "StructFun", - "FromUnixtime", "Atan2", "ArrowTypeof", "CurrentDate", @@ -22648,14 +22644,12 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Strpos" => Ok(ScalarFunction::Strpos), "Substr" => Ok(ScalarFunction::Substr), "ToHex" => Ok(ScalarFunction::ToHex), - "Now" => Ok(ScalarFunction::Now), "Translate" => Ok(ScalarFunction::Translate), "Trim" => Ok(ScalarFunction::Trim), "Upper" => Ok(ScalarFunction::Upper), "Coalesce" => Ok(ScalarFunction::Coalesce), "Power" => Ok(ScalarFunction::Power), "StructFun" => Ok(ScalarFunction::StructFun), - "FromUnixtime" => Ok(ScalarFunction::FromUnixtime), "Atan2" => Ok(ScalarFunction::Atan2), "ArrowTypeof" => Ok(ScalarFunction::ArrowTypeof), "CurrentDate" => Ok(ScalarFunction::CurrentDate), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 4e0212ea8f1b..a22ff8a4fe4a 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2696,14 +2696,14 @@ pub enum ScalarFunction { /// 56 was ToTimestampMillis /// 57 was ToTimestampMicros /// 58 was ToTimestampSeconds - Now = 59, + /// 59 was Now Translate = 60, Trim = 61, Upper = 62, Coalesce = 63, Power = 64, StructFun = 65, - FromUnixtime = 66, + /// 66 was FromUnixtime Atan2 = 67, /// 68 was DateBin ArrowTypeof = 69, @@ -2830,14 +2830,12 @@ impl ScalarFunction { ScalarFunction::Strpos => "Strpos", ScalarFunction::Substr => "Substr", ScalarFunction::ToHex => "ToHex", - ScalarFunction::Now => "Now", ScalarFunction::Translate => "Translate", ScalarFunction::Trim => "Trim", ScalarFunction::Upper => "Upper", ScalarFunction::Coalesce => "Coalesce", 
ScalarFunction::Power => "Power", ScalarFunction::StructFun => "StructFun", - ScalarFunction::FromUnixtime => "FromUnixtime", ScalarFunction::Atan2 => "Atan2", ScalarFunction::ArrowTypeof => "ArrowTypeof", ScalarFunction::CurrentDate => "CurrentDate", @@ -2949,14 +2947,12 @@ impl ScalarFunction { "Strpos" => Some(Self::Strpos), "Substr" => Some(Self::Substr), "ToHex" => Some(Self::ToHex), - "Now" => Some(Self::Now), "Translate" => Some(Self::Translate), "Trim" => Some(Self::Trim), "Upper" => Some(Self::Upper), "Coalesce" => Some(Self::Coalesce), "Power" => Some(Self::Power), "StructFun" => Some(Self::StructFun), - "FromUnixtime" => Some(Self::FromUnixtime), "Atan2" => Some(Self::Atan2), "ArrowTypeof" => Some(Self::ArrowTypeof), "CurrentDate" => Some(Self::CurrentDate), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 17aae61478f3..da63e1fc5293 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -56,12 +56,12 @@ use datafusion_expr::{ ceil, character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, current_date, current_time, degrees, digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, - factorial, find_in_set, flatten, floor, from_unixtime, gcd, initcap, iszero, lcm, - left, levenshtein, ln, log, log10, log2, + factorial, find_in_set, flatten, floor, gcd, initcap, iszero, lcm, left, levenshtein, + ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, - lower, lpad, ltrim, md5, nanvl, now, octet_length, overlay, pi, power, radians, - random, repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, sha384, - sha512, signum, sin, sinh, split_part, sqrt, starts_with, string_to_array, strpos, + lower, lpad, ltrim, md5, nanvl, octet_length, overlay, pi, power, radians, random, + repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, sha384, sha512, + signum, sin, sinh, 
split_part, sqrt, starts_with, string_to_array, strpos, struct_fun, substr, substr_index, substring, tan, tanh, to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField, @@ -538,7 +538,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Substr => Self::Substr, ScalarFunction::ToHex => Self::ToHex, ScalarFunction::ToChar => Self::ToChar, - ScalarFunction::Now => Self::Now, ScalarFunction::CurrentDate => Self::CurrentDate, ScalarFunction::CurrentTime => Self::CurrentTime, ScalarFunction::MakeDate => Self::MakeDate, @@ -548,7 +547,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Pi => Self::Pi, ScalarFunction::Power => Self::Power, ScalarFunction::StructFun => Self::Struct, - ScalarFunction::FromUnixtime => Self::FromUnixtime, ScalarFunction::Atan2 => Self::Atan2, ScalarFunction::Nanvl => Self::Nanvl, ScalarFunction::Iszero => Self::Iszero, @@ -1751,7 +1749,6 @@ pub fn parse_expr( args, ))) } - ScalarFunction::Now => Ok(now()), ScalarFunction::Translate => Ok(translate( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, @@ -1772,9 +1769,6 @@ pub fn parse_expr( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, )), - ScalarFunction::FromUnixtime => { - Ok(from_unixtime(parse_expr(&args[0], registry, codec)?)) - } ScalarFunction::Atan2 => Ok(atan2( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 55a19f471613..8fd74adbbade 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1517,7 +1517,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Strpos => Self::Strpos, 
BuiltinScalarFunction::Substr => Self::Substr, BuiltinScalarFunction::ToHex => Self::ToHex, - BuiltinScalarFunction::Now => Self::Now, BuiltinScalarFunction::CurrentDate => Self::CurrentDate, BuiltinScalarFunction::CurrentTime => Self::CurrentTime, BuiltinScalarFunction::MakeDate => Self::MakeDate, @@ -1526,7 +1525,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Pi => Self::Pi, BuiltinScalarFunction::Power => Self::Power, BuiltinScalarFunction::Struct => Self::StructFun, - BuiltinScalarFunction::FromUnixtime => Self::FromUnixtime, BuiltinScalarFunction::Atan2 => Self::Atan2, BuiltinScalarFunction::Nanvl => Self::Nanvl, BuiltinScalarFunction::Iszero => Self::Iszero, From c0ce36286c38e2d72fd9da3a0feaa6f1e520f568 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 6 Mar 2024 10:55:59 +0300 Subject: [PATCH 08/17] Fix projection change. Add new test date_bin monotonicity --- datafusion/expr/src/expr.rs | 5 ++ .../optimizer/src/optimize_projections.rs | 11 ++-- .../sqllogictest/test_files/group_by.slt | 32 +++++++++++ .../sqllogictest/test_files/tpch/q8.slt.part | 57 +++++++++---------- .../sqllogictest/test_files/tpch/q9.slt.part | 35 ++++++------ 5 files changed, 87 insertions(+), 53 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index e83d2f1a65f6..0da05d96f67e 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -419,6 +419,11 @@ impl ScalarFunction { args, } } + + /// Create a new ScalarFunction expression with a user-defined function (UDF) + pub fn new_func_def(func_def: ScalarFunctionDefinition, args: Vec) -> Self { + Self { func_def, args } + } } /// Access a sub field of a nested type, such as `Field` or `List` diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index d8d7f71d7143..08ee38f64abd 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ 
b/datafusion/optimizer/src/optimize_projections.rs @@ -33,7 +33,7 @@ use arrow::datatypes::SchemaRef; use datafusion_common::{ get_required_group_by_exprs_indices, Column, DFSchema, DFSchemaRef, JoinType, Result, }; -use datafusion_expr::expr::{Alias, ScalarFunction, ScalarFunctionDefinition}; +use datafusion_expr::expr::{Alias, ScalarFunction}; use datafusion_expr::{ logical_plan::LogicalPlan, projection_schema, Aggregate, BinaryExpr, Cast, Distinct, Expr, Projection, TableScan, Window, @@ -558,17 +558,16 @@ fn rewrite_expr(expr: &Expr, input: &Projection) -> Result> { Expr::Cast(Cast::new(Box::new(new_expr), cast.data_type.clone())) } Expr::ScalarFunction(scalar_fn) => { - // TODO: Support UDFs. - let ScalarFunctionDefinition::BuiltIn(fun) = scalar_fn.func_def else { - return Ok(None); - }; return Ok(scalar_fn .args .iter() .map(|expr| rewrite_expr(expr, input)) .collect::>>()? .map(|new_args| { - Expr::ScalarFunction(ScalarFunction::new(fun, new_args)) + Expr::ScalarFunction(ScalarFunction::new_func_def( + scalar_fn.func_def.clone(), + new_args, + )) })); } // Unsupported type for consecutive projection merge analysis. diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 906926a5a9ab..1f80a47548dc 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4306,6 +4306,38 @@ SELECT extract(month from ts) as months 12 11 +# create an unbounded table that contains name, timestamp. +# where table is ordered by name DESC, ts DESC +statement ok +CREATE UNBOUNDED EXTERNAL TABLE unbounded_csv_with_timestamps2 ( + name VARCHAR, + ts TIMESTAMP +) +STORED AS CSV +WITH ORDER (name DESC, ts DESC) +LOCATION '../core/tests/data/timestamps.csv' + +# result shouldn't have SortExec(sort_exprs=[name DESC, time_chunks DESC]) in the result. 
+# datafusion should deduce that given ordering: [name DESC, ts DESC] is satisfied +# ordering: [name DESC, date_bin('15 minutes', ts) DESC] is also valid. +query TT +EXPLAIN SELECT name, date_bin('15 minutes', ts) as time_chunks + FROM unbounded_csv_with_timestamps2 + ORDER BY name DESC, time_chunks DESC + LIMIT 5; +---- +logical_plan +Limit: skip=0, fetch=5 +--Sort: unbounded_csv_with_timestamps2.name DESC NULLS FIRST, time_chunks DESC NULLS FIRST, fetch=5 +----Projection: unbounded_csv_with_timestamps2.name, date_bin(IntervalMonthDayNano("900000000000"), unbounded_csv_with_timestamps2.ts) AS time_chunks +------TableScan: unbounded_csv_with_timestamps2 projection=[name, ts] +physical_plan +GlobalLimitExec: skip=0, fetch=5 +--SortPreservingMergeExec: [name@0 DESC,time_chunks@1 DESC], fetch=5 +----ProjectionExec: expr=[name@0 as name, date_bin(900000000000, ts@1) as time_chunks] +------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 +--------StreamingTableExec: partition_sizes=1, projection=[name, ts], infinite_source=true, output_ordering=[name@0 DESC, ts@1 DESC] + statement ok drop table t1 diff --git a/datafusion/sqllogictest/test_files/tpch/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/q8.slt.part index 621a132129e8..760b40ad1ae8 100644 --- a/datafusion/sqllogictest/test_files/tpch/q8.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q8.slt.part @@ -61,35 +61,34 @@ Sort: all_nations.o_year ASC NULLS LAST ----Aggregate: groupBy=[[all_nations.o_year]], aggr=[[SUM(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Decimal128(Some(0),38,4) END) AS SUM(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), SUM(all_nations.volume)]] ------SubqueryAlias: all_nations --------Projection: date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation -----------Projection: 
lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n2.n_name -------------Inner Join: n1.n_regionkey = region.r_regionkey ---------------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name -----------------Inner Join: supplier.s_nationkey = n2.n_nationkey -------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_orderdate, n1.n_regionkey ---------------------Inner Join: customer.c_nationkey = n1.n_nationkey -----------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_orderdate, customer.c_nationkey -------------------------Inner Join: orders.o_custkey = customer.c_custkey ---------------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_custkey, orders.o_orderdate -----------------------------Inner Join: lineitem.l_orderkey = orders.o_orderkey -------------------------------Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey ---------------------------------Inner Join: lineitem.l_suppkey = supplier.s_suppkey -----------------------------------Projection: lineitem.l_orderkey, lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount -------------------------------------Inner Join: part.p_partkey = lineitem.l_partkey ---------------------------------------Projection: part.p_partkey -----------------------------------------Filter: part.p_type = Utf8("ECONOMY ANODIZED STEEL") -------------------------------------------TableScan: part projection=[p_partkey, p_type], partial_filters=[part.p_type = Utf8("ECONOMY ANODIZED STEEL")] ---------------------------------------TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] -----------------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] -------------------------------Filter: 
orders.o_orderdate >= Date32("9131") AND orders.o_orderdate <= Date32("9861") ---------------------------------TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("9131"), orders.o_orderdate <= Date32("9861")] ---------------------------TableScan: customer projection=[c_custkey, c_nationkey] -----------------------SubqueryAlias: n1 -------------------------TableScan: nation projection=[n_nationkey, n_regionkey] -------------------SubqueryAlias: n2 ---------------------TableScan: nation projection=[n_nationkey, n_name] ---------------Projection: region.r_regionkey -----------------Filter: region.r_name = Utf8("AMERICA") -------------------TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("AMERICA")] +----------Inner Join: n1.n_regionkey = region.r_regionkey +------------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name +--------------Inner Join: supplier.s_nationkey = n2.n_nationkey +----------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_orderdate, n1.n_regionkey +------------------Inner Join: customer.c_nationkey = n1.n_nationkey +--------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_orderdate, customer.c_nationkey +----------------------Inner Join: orders.o_custkey = customer.c_custkey +------------------------Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_custkey, orders.o_orderdate +--------------------------Inner Join: lineitem.l_orderkey = orders.o_orderkey +----------------------------Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey +------------------------------Inner Join: lineitem.l_suppkey = supplier.s_suppkey +--------------------------------Projection: lineitem.l_orderkey, lineitem.l_suppkey, 
lineitem.l_extendedprice, lineitem.l_discount +----------------------------------Inner Join: part.p_partkey = lineitem.l_partkey +------------------------------------Projection: part.p_partkey +--------------------------------------Filter: part.p_type = Utf8("ECONOMY ANODIZED STEEL") +----------------------------------------TableScan: part projection=[p_partkey, p_type], partial_filters=[part.p_type = Utf8("ECONOMY ANODIZED STEEL")] +------------------------------------TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] +--------------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] +----------------------------Filter: orders.o_orderdate >= Date32("9131") AND orders.o_orderdate <= Date32("9861") +------------------------------TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("9131"), orders.o_orderdate <= Date32("9861")] +------------------------TableScan: customer projection=[c_custkey, c_nationkey] +--------------------SubqueryAlias: n1 +----------------------TableScan: nation projection=[n_nationkey, n_regionkey] +----------------SubqueryAlias: n2 +------------------TableScan: nation projection=[n_nationkey, n_name] +------------Projection: region.r_regionkey +--------------Filter: region.r_name = Utf8("AMERICA") +----------------TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8("AMERICA")] physical_plan SortPreservingMergeExec: [o_year@0 ASC NULLS LAST] --SortExec: expr=[o_year@0 ASC NULLS LAST] diff --git a/datafusion/sqllogictest/test_files/tpch/q9.slt.part b/datafusion/sqllogictest/test_files/tpch/q9.slt.part index ecd0056d17b4..5db97f79bdb1 100644 --- a/datafusion/sqllogictest/test_files/tpch/q9.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q9.slt.part @@ -58,24 +58,23 @@ Limit: skip=0, fetch=10 ------Aggregate: groupBy=[[profit.nation, profit.o_year]], aggr=[[SUM(profit.amount)]] 
--------SubqueryAlias: profit ----------Projection: nation.n_name AS nation, date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount -------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, partsupp.ps_supplycost, orders.o_orderdate, nation.n_name ---------------Inner Join: supplier.s_nationkey = nation.n_nationkey -----------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost, orders.o_orderdate -------------------Inner Join: lineitem.l_orderkey = orders.o_orderkey ---------------------Projection: lineitem.l_orderkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost -----------------------Inner Join: lineitem.l_suppkey = partsupp.ps_suppkey, lineitem.l_partkey = partsupp.ps_partkey -------------------------Projection: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey ---------------------------Inner Join: lineitem.l_suppkey = supplier.s_suppkey -----------------------------Projection: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount -------------------------------Inner Join: part.p_partkey = lineitem.l_partkey ---------------------------------Projection: part.p_partkey -----------------------------------Filter: part.p_name LIKE Utf8("%green%") -------------------------------------TableScan: part projection=[p_partkey, p_name], partial_filters=[part.p_name LIKE Utf8("%green%")] ---------------------------------TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] -----------------------------TableScan: supplier 
projection=[s_suppkey, s_nationkey] -------------------------TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] ---------------------TableScan: orders projection=[o_orderkey, o_orderdate] -----------------TableScan: nation projection=[n_nationkey, n_name] +------------Inner Join: supplier.s_nationkey = nation.n_nationkey +--------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost, orders.o_orderdate +----------------Inner Join: lineitem.l_orderkey = orders.o_orderkey +------------------Projection: lineitem.l_orderkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost +--------------------Inner Join: lineitem.l_suppkey = partsupp.ps_suppkey, lineitem.l_partkey = partsupp.ps_partkey +----------------------Projection: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey +------------------------Inner Join: lineitem.l_suppkey = supplier.s_suppkey +--------------------------Projection: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount +----------------------------Inner Join: part.p_partkey = lineitem.l_partkey +------------------------------Projection: part.p_partkey +--------------------------------Filter: part.p_name LIKE Utf8("%green%") +----------------------------------TableScan: part projection=[p_partkey, p_name], partial_filters=[part.p_name LIKE Utf8("%green%")] +------------------------------TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] +--------------------------TableScan: supplier projection=[s_suppkey, s_nationkey] +----------------------TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] +------------------TableScan: orders 
projection=[o_orderkey, o_orderdate] +--------------TableScan: nation projection=[n_nationkey, n_name] physical_plan GlobalLimitExec: skip=0, fetch=10 --SortPreservingMergeExec: [nation@0 ASC NULLS LAST,o_year@1 DESC], fetch=10 From e574abf4db5e5eee9e68a404a70ecb1582456764 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Wed, 6 Mar 2024 17:20:40 -0500 Subject: [PATCH 09/17] Move now, current_date and current_time functions to datafusion-functions --- .../tests/optimizer_integration.rs | 20 +++- datafusion/core/tests/simplification.rs | 34 +++++-- datafusion/expr/src/built_in_function.rs | 16 ---- datafusion/expr/src/expr_fn.rs | 4 - datafusion/expr/src/signature.rs | 2 +- .../functions/src/datetime/current_date.rs | 92 +++++++++++++++++++ .../functions/src/datetime/current_time.rs | 81 ++++++++++++++++ datafusion/functions/src/datetime/mod.rs | 18 +++- datafusion/functions/src/datetime/now.rs | 4 +- .../simplify_expressions/simplify_exprs.rs | 23 ----- .../src/simplify_expressions/utils.rs | 9 -- .../physical-expr/src/datetime_expressions.rs | 31 ------- datafusion/physical-expr/src/functions.rs | 14 +-- datafusion/proto/proto/datafusion.proto | 4 +- datafusion/proto/src/generated/pbjson.rs | 6 -- datafusion/proto/src/generated/prost.rs | 8 +- .../proto/src/logical_plan/from_proto.rs | 6 +- datafusion/proto/src/logical_plan/to_proto.rs | 2 - 18 files changed, 242 insertions(+), 132 deletions(-) rename datafusion/{optimizer => core}/tests/optimizer_integration.rs (96%) create mode 100644 datafusion/functions/src/datetime/current_date.rs create mode 100644 datafusion/functions/src/datetime/current_time.rs diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/core/tests/optimizer_integration.rs similarity index 96% rename from datafusion/optimizer/tests/optimizer_integration.rs rename to datafusion/core/tests/optimizer_integration.rs index fe1234de5ab8..f8afd1b63dd5 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ 
b/datafusion/core/tests/optimizer_integration.rs @@ -33,6 +33,7 @@ use datafusion_sql::sqlparser::parser::Parser; use datafusion_sql::TableReference; use chrono::{DateTime, NaiveDateTime, Utc}; +use datafusion_functions::datetime; #[cfg(test)] #[ctor::ctor] @@ -342,7 +343,12 @@ fn test_sql(sql: &str) -> Result { let statement = &ast[0]; // create a logical query plan - let context_provider = MyContextProvider::default(); + let now_udf = datetime::functions() + .iter() + .find(|f| f.name() == "now") + .unwrap() + .to_owned(); + let context_provider = MyContextProvider::default().with_udf(now_udf); let sql_to_rel = SqlToRel::new(&context_provider); let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap(); @@ -362,6 +368,14 @@ fn test_sql(sql: &str) -> Result { #[derive(Default)] struct MyContextProvider { options: ConfigOptions, + udfs: HashMap>, +} + +impl MyContextProvider { + fn with_udf(mut self, udf: Arc) -> Self { + self.udfs.insert(udf.name().to_string(), udf); + self + } } impl ContextProvider for MyContextProvider { @@ -399,8 +413,8 @@ impl ContextProvider for MyContextProvider { } } - fn get_function_meta(&self, _name: &str) -> Option> { - None + fn get_function_meta(&self, name: &str) -> Option> { + self.udfs.get(name).cloned() } fn get_aggregate_meta(&self, _name: &str) -> Option> { diff --git a/datafusion/core/tests/simplification.rs b/datafusion/core/tests/simplification.rs index 41457df02cfc..25f994d320c1 100644 --- a/datafusion/core/tests/simplification.rs +++ b/datafusion/core/tests/simplification.rs @@ -185,10 +185,6 @@ fn make_udf_add(volatility: Volatility) -> Arc { )) } -fn now_expr() -> Expr { - call_fn("now", vec![]).unwrap() -} - fn cast_to_int64_expr(expr: Expr) -> Expr { Expr::Cast(Cast::new(expr.into(), DataType::Int64)) } @@ -255,7 +251,7 @@ fn now_less_than_timestamp() -> Result<()> { // cast(now() as int) < cast(to_timestamp(...) 
as int) + 50000_i64 let plan = LogicalPlanBuilder::from(table_scan) .filter( - cast_to_int64_expr(now_expr()) + cast_to_int64_expr(now()) .lt(cast_to_int64_expr(to_timestamp_expr(ts_string)) + lit(50000_i64)), )? .build()?; @@ -368,14 +364,14 @@ fn test_const_evaluator_now() { let time = chrono::Utc.timestamp_nanos(ts_nanos); let ts_string = "2020-09-08T12:05:00+00:00"; // now() --> ts - test_evaluate_with_start_time(now_expr(), lit_timestamp_nano(ts_nanos), &time); + test_evaluate_with_start_time(now(), lit_timestamp_nano(ts_nanos), &time); // CAST(now() as int64) + 100_i64 --> ts + 100_i64 - let expr = cast_to_int64_expr(now_expr()) + lit(100_i64); + let expr = cast_to_int64_expr(now()) + lit(100_i64); test_evaluate_with_start_time(expr, lit(ts_nanos + 100), &time); // CAST(now() as int64) < cast(to_timestamp(...) as int64) + 50000_i64 ---> true - let expr = cast_to_int64_expr(now_expr()) + let expr = cast_to_int64_expr(now()) .lt(cast_to_int64_expr(to_timestamp_expr(ts_string)) + lit(50000i64)); test_evaluate_with_start_time(expr, lit(true), &time); } @@ -413,3 +409,25 @@ fn test_evaluator_udfs() { )); test_evaluate(expr, expected_expr); } + +#[test] +fn multiple_now() -> Result<()> { + let table_scan = test_table_scan(); + let time = Utc::now(); + let proj = vec![now(), now().alias("t2")]; + let plan = LogicalPlanBuilder::from(table_scan) + .project(proj)? 
+ .build()?; + + // expect the same timestamp appears in both exprs + let actual = get_optimized_plan_formatted(&plan, &time); + let expected = format!( + "Projection: TimestampNanosecond({}, Some(\"+00:00\")) AS now(), TimestampNanosecond({}, Some(\"+00:00\")) AS t2\ + \n TableScan: test", + time.timestamp_nanos_opt().unwrap(), + time.timestamp_nanos_opt().unwrap() + ); + + assert_eq!(expected, actual); + Ok(()) +} diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index ea9ef083a7aa..4f66c19b180b 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -244,10 +244,6 @@ pub enum BuiltinScalarFunction { Substr, /// to_hex ToHex, - ///current_date - CurrentDate, - /// current_time - CurrentTime, /// make_date MakeDate, /// translate @@ -430,10 +426,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::SubstrIndex => Volatility::Immutable, BuiltinScalarFunction::FindInSet => Volatility::Immutable, - // Stable builtin functions - BuiltinScalarFunction::CurrentDate => Volatility::Stable, - BuiltinScalarFunction::CurrentTime => Volatility::Stable, - // Volatile builtin functions BuiltinScalarFunction::Random => Volatility::Volatile, BuiltinScalarFunction::Uuid => Volatility::Volatile, @@ -469,7 +461,6 @@ impl BuiltinScalarFunction { /// 2. Deduce the output `DataType` based on the provided `input_expr_types`. pub fn return_type(self, input_expr_types: &[DataType]) -> Result { use DataType::*; - use TimeUnit::*; // Note that this function *must* return the same type that the respective physical expression returns // or the execution panics. 
@@ -694,8 +685,6 @@ impl BuiltinScalarFunction { utf8_to_int_type(&input_expr_types[0], "find_in_set") } BuiltinScalarFunction::ToChar => Ok(Utf8), - BuiltinScalarFunction::CurrentDate => Ok(Date32), - BuiltinScalarFunction::CurrentTime => Ok(Time64(Nanosecond)), BuiltinScalarFunction::MakeDate => Ok(Date32), BuiltinScalarFunction::Translate => { utf8_to_str_type(&input_expr_types[0], "translate") @@ -1099,9 +1088,6 @@ impl BuiltinScalarFunction { // will be as good as the number of digits in the number Signature::uniform(1, vec![Float64, Float32], self.volatility()) } - BuiltinScalarFunction::CurrentDate | BuiltinScalarFunction::CurrentTime => { - Signature::uniform(0, vec![], self.volatility()) - } BuiltinScalarFunction::MakeDate => Signature::uniform( 3, vec![Int32, Int64, UInt32, UInt64, Utf8], @@ -1231,8 +1217,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::FindInSet => &["find_in_set"], // time/date functions - BuiltinScalarFunction::CurrentDate => &["current_date", "today"], - BuiltinScalarFunction::CurrentTime => &["current_time"], BuiltinScalarFunction::MakeDate => &["make_date"], BuiltinScalarFunction::ToChar => &["to_char", "date_format"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 2ff64cc5ea60..1debb74523f7 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -861,8 +861,6 @@ scalar_expr!( datetime format, "converts a date, time, timestamp or duration to a string based on the provided format" ); -scalar_expr!(CurrentDate, current_date, ,"returns current UTC date as a [`DataType::Date32`] value"); -scalar_expr!(CurrentTime, current_time, , "returns current UTC time as a [`DataType::Time64`] value"); scalar_expr!(MakeDate, make_date, year month day, "make a date from year, month and day component parts"); scalar_expr!(Nanvl, nanvl, x y, "returns x if x is not NaN otherwise returns y"); scalar_expr!( @@ -1335,8 +1333,6 @@ mod test { test_scalar_expr!(Trim, trim, string); 
test_scalar_expr!(Upper, upper, string); - test_scalar_expr!(FromUnixtime, from_unixtime, unixtime); - test_scalar_expr!(ArrayAppend, array_append, array, element); test_scalar_expr!(ArraySort, array_sort, array, desc, null_first); test_scalar_expr!(ArrayPopFront, array_pop_front, array); diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index ad29fe0724a4..0d65c068c4a0 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -44,7 +44,7 @@ pub enum Volatility { Immutable, /// A stable function may return different values given the same input across different /// queries but must return the same value for a given input within a query. An example of - /// this is [super::BuiltinScalarFunction::Now]. DataFusion + /// this is the `Now` function. DataFusion /// will attempt to inline `Stable` functions during planning, when possible. /// For query `select col1, now() from t1`, it might take a while to execute but /// `now()` column will be the same for each output row, which is evaluated diff --git a/datafusion/functions/src/datetime/current_date.rs b/datafusion/functions/src/datetime/current_date.rs new file mode 100644 index 000000000000..5338234a8e49 --- /dev/null +++ b/datafusion/functions/src/datetime/current_date.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; + +use arrow::datatypes::DataType; +use arrow::datatypes::DataType::Date32; +use chrono::{Datelike, NaiveDate}; + +use datafusion_common::{internal_err, Result, ScalarValue}; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; +use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct CurrentDateFunc { + signature: Signature, + aliases: Vec, +} + +impl CurrentDateFunc { + pub fn new() -> Self { + Self { + signature: Signature::uniform(0, vec![], Volatility::Stable), + aliases: vec![String::from("today")], + } + } +} + +/// Create an implementation of `current_date()` that always returns the +/// specified current date. +/// +/// The semantics of `current_date()` require it to return the same value +/// wherever it appears within a single statement. This value is +/// chosen during planning time. 
+impl ScalarUDFImpl for CurrentDateFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "current_date" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Date32) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + internal_err!( + "invoke should not be called on a simplified current_date() function" + ) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } + + fn simplify( + &self, + _args: Vec, + info: &dyn SimplifyInfo, + ) -> Result { + let now_ts = info.execution_props().query_execution_start_time; + let days = Some( + now_ts.num_days_from_ce() + - NaiveDate::from_ymd_opt(1970, 1, 1) + .unwrap() + .num_days_from_ce(), + ); + Ok(ExprSimplifyResult::Simplified(Expr::Literal( + ScalarValue::Date32(days), + ))) + } +} diff --git a/datafusion/functions/src/datetime/current_time.rs b/datafusion/functions/src/datetime/current_time.rs new file mode 100644 index 000000000000..b8a8aa2acb53 --- /dev/null +++ b/datafusion/functions/src/datetime/current_time.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; + +use arrow::datatypes::DataType; +use arrow::datatypes::DataType::Time64; +use arrow::datatypes::TimeUnit::Nanosecond; + +use datafusion_common::{internal_err, Result, ScalarValue}; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; +use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct CurrentTimeFunc { + signature: Signature, +} + +impl CurrentTimeFunc { + pub fn new() -> Self { + Self { + signature: Signature::uniform(0, vec![], Volatility::Stable), + } + } +} + +/// Create an implementation of `current_time()` that always returns the +/// specified current time. +/// +/// The semantics of `current_time()` require it to return the same value +/// wherever it appears within a single statement. This value is +/// chosen during planning time. +impl ScalarUDFImpl for CurrentTimeFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "current_time" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Time64(Nanosecond)) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + internal_err!( + "invoke should not be called on a simplified current_time() function" + ) + } + + fn simplify( + &self, + _args: Vec, + info: &dyn SimplifyInfo, + ) -> Result { + let now_ts = info.execution_props().query_execution_start_time; + let nano = now_ts.timestamp_nanos_opt().map(|ts| ts % 86400000000000); + Ok(ExprSimplifyResult::Simplified(Expr::Literal( + ScalarValue::Time64Nanosecond(nano), + ))) + } +} diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index e71b3ed97e12..d4d4120fb9f0 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -22,6 +22,8 @@ use std::sync::Arc; use datafusion_expr::ScalarUDF; mod common; +mod current_date; +mod current_time; mod date_bin; mod 
date_part; mod date_trunc; @@ -31,6 +33,8 @@ mod to_date; mod to_timestamp; // create UDFs +make_udf_function!(current_date::CurrentDateFunc, CURRENT_DATE, current_date); +make_udf_function!(current_time::CurrentTimeFunc, CURRENT_TIME, current_time); make_udf_function!(date_bin::DateBinFunc, DATE_BIN, date_bin); make_udf_function!(date_part::DatePartFunc, DATE_PART, date_part); make_udf_function!(date_trunc::DateTruncFunc, DATE_TRUNC, date_trunc); @@ -69,6 +73,16 @@ make_udf_function!( pub mod expr_fn { use datafusion_expr::Expr; + #[doc = "returns current UTC date as a Date32 value"] + pub fn current_date() -> Expr { + super::current_date().call(vec![]) + } + + #[doc = "returns current UTC time as a Time64 value"] + pub fn current_time() -> Expr { + super::current_time().call(vec![]) + } + #[doc = "coerces an arbitrary timestamp to the start of the nearest specified interval"] pub fn date_bin(stride: Expr, source: Expr, origin: Expr) -> Expr { super::date_bin().call(vec![stride, source, origin]) @@ -84,7 +98,7 @@ pub mod expr_fn { super::date_trunc().call(vec![part, date]) } - #[doc = "converts an integer to RFC3339 timestamp format"] + #[doc = "converts an integer to RFC3339 timestamp format string"] pub fn from_unixtime(unixtime: Expr) -> Expr { super::from_unixtime().call(vec![unixtime]) } @@ -173,6 +187,8 @@ pub mod expr_fn { /// Return a list of all functions in this package pub fn functions() -> Vec> { vec![ + current_date(), + current_time(), date_bin(), date_part(), date_trunc(), diff --git a/datafusion/functions/src/datetime/now.rs b/datafusion/functions/src/datetime/now.rs index 8ceda64e0ba9..cc7979df0d86 100644 --- a/datafusion/functions/src/datetime/now.rs +++ b/datafusion/functions/src/datetime/now.rs @@ -21,7 +21,7 @@ use arrow::datatypes::DataType; use arrow::datatypes::DataType::Timestamp; use arrow::datatypes::TimeUnit::Nanosecond; -use datafusion_common::{Result, ScalarValue}; +use datafusion_common::{internal_err, Result, ScalarValue}; use 
datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; @@ -62,7 +62,7 @@ impl ScalarUDFImpl for NowFunc { } fn invoke(&self, _args: &[ColumnarValue]) -> Result { - todo!() + internal_err!("invoke should not be called on a simplified now() function") } fn simplify( diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index 00d60d0a80dc..70b163acc208 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -146,7 +146,6 @@ mod tests { }; use datafusion_expr::{call_fn, or, BinaryExpr, Cast, Operator}; - use crate::simplify_expressions::utils::for_test::now_expr; use crate::test::{assert_fields_eq, test_table_scan_with_name}; use crate::OptimizerContext; @@ -446,28 +445,6 @@ mod tests { Ok(()) } - #[test] - fn multiple_now_expr() -> Result<()> { - let table_scan = test_table_scan(); - let time = Utc::now(); - let proj = vec![now_expr(), now_expr().alias("t2")]; - let plan = LogicalPlanBuilder::from(table_scan) - .project(proj)? 
- .build()?; - - // expect the same timestamp appears in both exprs - let actual = get_optimized_plan_formatted(&plan, &time); - let expected = format!( - "Projection: TimestampNanosecond({}, Some(\"+00:00\")) AS now(), TimestampNanosecond({}, Some(\"+00:00\")) AS t2\ - \n TableScan: test", - time.timestamp_nanos_opt().unwrap(), - time.timestamp_nanos_opt().unwrap() - ); - - assert_eq!(expected, actual); - Ok(()) - } - #[test] fn simplify_and_eval() -> Result<()> { // demonstrate a case where the evaluation needs to run prior diff --git a/datafusion/optimizer/src/simplify_expressions/utils.rs b/datafusion/optimizer/src/simplify_expressions/utils.rs index 8952d5d79856..1dd3a6162894 100644 --- a/datafusion/optimizer/src/simplify_expressions/utils.rs +++ b/datafusion/optimizer/src/simplify_expressions/utils.rs @@ -530,12 +530,3 @@ pub fn simpl_concat_ws(delimiter: &Expr, args: &[Expr]) -> Result { )), } } - -#[cfg(test)] -pub mod for_test { - use datafusion_expr::{call_fn, Expr}; - - pub fn now_expr() -> Expr { - call_fn("now", vec![]).unwrap() - } -} diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index 22a67e6fc850..87a9ca1766b6 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -46,37 +46,6 @@ pub fn make_now( } } -/// Create an implementation of `current_date()` that always returns the -/// specified current date. -/// -/// The semantics of `current_date()` require it to return the same value -/// wherever it appears within a single statement. This value is -/// chosen during planning time. 
-pub fn make_current_date( - now_ts: DateTime, -) -> impl Fn(&[ColumnarValue]) -> Result { - let days = Some( - now_ts.num_days_from_ce() - - NaiveDate::from_ymd_opt(1970, 1, 1) - .unwrap() - .num_days_from_ce(), - ); - move |_arg| Ok(ColumnarValue::Scalar(ScalarValue::Date32(days))) -} - -/// Create an implementation of `current_time()` that always returns the -/// specified current time. -/// -/// The semantics of `current_time()` require it to return the same value -/// wherever it appears within a single statement. This value is -/// chosen during planning time. -pub fn make_current_time( - now_ts: DateTime, -) -> impl Fn(&[ColumnarValue]) -> Result { - let nano = now_ts.timestamp_nanos_opt().map(|ts| ts % 86400000000000); - move |_arg| Ok(ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(nano))) -} - /// Returns a string representation of a date, time, timestamp or duration based /// on a Chrono pattern. /// diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 12fd49dceca3..8983e4a44c50 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -240,7 +240,7 @@ where /// Create a physical scalar function. 
pub fn create_physical_fun( fun: &BuiltinScalarFunction, - execution_props: &ExecutionProps, + _execution_props: &ExecutionProps, ) -> Result { Ok(match fun { // math functions @@ -457,18 +457,6 @@ pub fn create_physical_fun( BuiltinScalarFunction::ConcatWithSeparator => Arc::new(|args| { make_scalar_function_inner(string_expressions::concat_ws)(args) }), - BuiltinScalarFunction::CurrentDate => { - // bind value for current_date at plan time - Arc::new(datetime_expressions::make_current_date( - execution_props.query_execution_start_time, - )) - } - BuiltinScalarFunction::CurrentTime => { - // bind value for current_time at plan time - Arc::new(datetime_expressions::make_current_time( - execution_props.query_execution_start_time, - )) - } BuiltinScalarFunction::MakeDate => Arc::new(datetime_expressions::make_date), BuiltinScalarFunction::ToChar => Arc::new(datetime_expressions::to_char), BuiltinScalarFunction::InitCap => Arc::new(|args| match args[0].data_type() { diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 81eab272d8c8..4c09eeb58e3a 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -618,8 +618,8 @@ enum ScalarFunction { Atan2 = 67; // 68 was DateBin ArrowTypeof = 69; - CurrentDate = 70; - CurrentTime = 71; + // 70 was CurrentDate + // 71 was CurrentTime Uuid = 72; Cbrt = 73; Acosh = 74; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index a77d7857097a..ed2e2987a35e 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22377,8 +22377,6 @@ impl serde::Serialize for ScalarFunction { Self::StructFun => "StructFun", Self::Atan2 => "Atan2", Self::ArrowTypeof => "ArrowTypeof", - Self::CurrentDate => "CurrentDate", - Self::CurrentTime => "CurrentTime", Self::Uuid => "Uuid", Self::Cbrt => "Cbrt", Self::Acosh => "Acosh", @@ -22500,8 +22498,6 @@ impl<'de> 
serde::Deserialize<'de> for ScalarFunction { "StructFun", "Atan2", "ArrowTypeof", - "CurrentDate", - "CurrentTime", "Uuid", "Cbrt", "Acosh", @@ -22652,8 +22648,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "StructFun" => Ok(ScalarFunction::StructFun), "Atan2" => Ok(ScalarFunction::Atan2), "ArrowTypeof" => Ok(ScalarFunction::ArrowTypeof), - "CurrentDate" => Ok(ScalarFunction::CurrentDate), - "CurrentTime" => Ok(ScalarFunction::CurrentTime), "Uuid" => Ok(ScalarFunction::Uuid), "Cbrt" => Ok(ScalarFunction::Cbrt), "Acosh" => Ok(ScalarFunction::Acosh), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index a22ff8a4fe4a..20e805c03892 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2707,8 +2707,8 @@ pub enum ScalarFunction { Atan2 = 67, /// 68 was DateBin ArrowTypeof = 69, - CurrentDate = 70, - CurrentTime = 71, + /// 70 was CurrentDate + /// 71 was CurrentTime Uuid = 72, Cbrt = 73, Acosh = 74, @@ -2838,8 +2838,6 @@ impl ScalarFunction { ScalarFunction::StructFun => "StructFun", ScalarFunction::Atan2 => "Atan2", ScalarFunction::ArrowTypeof => "ArrowTypeof", - ScalarFunction::CurrentDate => "CurrentDate", - ScalarFunction::CurrentTime => "CurrentTime", ScalarFunction::Uuid => "Uuid", ScalarFunction::Cbrt => "Cbrt", ScalarFunction::Acosh => "Acosh", @@ -2955,8 +2953,6 @@ impl ScalarFunction { "StructFun" => Some(Self::StructFun), "Atan2" => Some(Self::Atan2), "ArrowTypeof" => Some(Self::ArrowTypeof), - "CurrentDate" => Some(Self::CurrentDate), - "CurrentTime" => Some(Self::CurrentTime), "Uuid" => Some(Self::Uuid), "Cbrt" => Some(Self::Cbrt), "Acosh" => Some(Self::Acosh), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index da63e1fc5293..4df202859ae9 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -54,7 +54,7 @@ use 
datafusion_expr::{ array_replace_all, array_replace_n, array_resize, array_slice, array_sort, array_union, arrow_typeof, ascii, asinh, atan, atan2, atanh, bit_length, btrim, cbrt, ceil, character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, - current_date, current_time, degrees, digest, ends_with, exp, + degrees, digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, flatten, floor, gcd, initcap, iszero, lcm, left, levenshtein, ln, log, log10, log2, @@ -538,8 +538,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Substr => Self::Substr, ScalarFunction::ToHex => Self::ToHex, ScalarFunction::ToChar => Self::ToChar, - ScalarFunction::CurrentDate => Self::CurrentDate, - ScalarFunction::CurrentTime => Self::CurrentTime, ScalarFunction::MakeDate => Self::MakeDate, ScalarFunction::Uuid => Self::Uuid, ScalarFunction::Translate => Self::Translate, @@ -1773,8 +1771,6 @@ pub fn parse_expr( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, )), - ScalarFunction::CurrentDate => Ok(current_date()), - ScalarFunction::CurrentTime => Ok(current_time()), ScalarFunction::Cot => Ok(cot(parse_expr(&args[0], registry, codec)?)), ScalarFunction::Nanvl => Ok(nanvl( parse_expr(&args[0], registry, codec)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 8fd74adbbade..2c7ddb11f978 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1517,8 +1517,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Strpos => Self::Strpos, BuiltinScalarFunction::Substr => Self::Substr, BuiltinScalarFunction::ToHex => Self::ToHex, - BuiltinScalarFunction::CurrentDate => Self::CurrentDate, - BuiltinScalarFunction::CurrentTime => Self::CurrentTime, BuiltinScalarFunction::MakeDate => Self::MakeDate, 
BuiltinScalarFunction::Translate => Self::Translate, BuiltinScalarFunction::Coalesce => Self::Coalesce, From ea89f715bfa9081cc50c79c31bd20980d7d70dc2 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 9 Mar 2024 15:17:38 -0500 Subject: [PATCH 10/17] Force exact version of chrono --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 48e555bd5527..9038baf8425f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,7 +68,7 @@ arrow-string = { version = "50.0.0", default-features = false } async-trait = "0.1.73" bigdecimal = "=0.4.1" bytes = "1.4" -chrono = { version = "0.4.34", default-features = false } +chrono = { version = "=0.4.34", default-features = false } ctor = "0.2.0" dashmap = "5.4.0" datafusion = { path = "datafusion/core", version = "36.0.0", default-features = false } From 8abb99a3eadfc2c0331761e539421627e609ffa7 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 9 Mar 2024 15:18:22 -0500 Subject: [PATCH 11/17] Merge updates. 
--- datafusion-cli/Cargo.lock | 4 ++-- datafusion/proto/src/logical_plan/from_proto.rs | 13 ++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 5e3c8648fc25..7d64a45eccbd 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -873,9 +873,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.35" +version = "0.4.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaf5903dcbc0a39312feb77df2ff4c76387d591b9fc7b04a238dcf8bb62639a" +checksum = "5bc015644b92d5890fab7489e49d21f879d5c990186827d42ec511919404f38b" dependencies = [ "android-tzdata", "iana-time-zone", diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 828a081528b7..4f4b6b76a61a 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -48,13 +48,12 @@ use datafusion_expr::expr::Unnest; use datafusion_expr::window_frame::{check_window_frame, regularize_window_order_by}; use datafusion_expr::{ acosh, array, array_append, array_concat, array_distinct, array_element, - array_except, array_intersect, - array_pop_back, array_pop_front, array_position, array_positions, array_prepend, - array_remove, array_remove_all, array_remove_n, array_repeat, array_replace, - array_replace_all, array_replace_n, array_resize, array_slice, array_sort, - array_union, arrow_typeof, ascii, asinh, atan, atan2, atanh, bit_length, btrim, cbrt, - ceil, character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, - degrees, digest, ends_with, exp, + array_except, array_intersect, array_pop_back, array_pop_front, array_position, + array_positions, array_prepend, array_remove, array_remove_all, array_remove_n, + array_repeat, array_replace, array_replace_all, array_replace_n, array_resize, + 
array_slice, array_sort, array_union, arrow_typeof, ascii, asinh, atan, atan2, atanh, + bit_length, btrim, cbrt, ceil, character_length, chr, coalesce, concat_expr, + concat_ws_expr, cos, cosh, cot, degrees, digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, flatten, floor, gcd, initcap, iszero, lcm, left, levenshtein, ln, log, log10, log2, From 15c50ebe56ae4b6d4d080f45a2c9024b8007f90c Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 9 Mar 2024 16:16:01 -0500 Subject: [PATCH 12/17] Updates for chrono changes --- Cargo.toml | 2 +- datafusion/functions/src/datetime/date_bin.rs | 28 +++++++++---------- .../functions/src/datetime/date_trunc.rs | 12 ++++---- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9038baf8425f..48e555bd5527 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,7 +68,7 @@ arrow-string = { version = "50.0.0", default-features = false } async-trait = "0.1.73" bigdecimal = "=0.4.1" bytes = "1.4" -chrono = { version = "=0.4.34", default-features = false } +chrono = { version = "0.4.34", default-features = false } ctor = "0.2.0" dashmap = "5.4.0" datafusion = { path = "datafusion/core", version = "36.0.0", default-features = false } diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs index 69297a92aa8f..9887e83b63d3 100644 --- a/datafusion/functions/src/datetime/date_bin.rs +++ b/datafusion/functions/src/datetime/date_bin.rs @@ -29,7 +29,7 @@ use arrow_array::types::{ TimestampSecondType, }; use arrow_array::{ArrayRef, PrimitiveArray}; -use chrono::{DateTime, Datelike, Duration, Months, NaiveDateTime, Utc}; +use chrono::{DateTime, Datelike, Duration, Months, TimeDelta, Utc}; use datafusion_common::cast::as_primitive_array; use datafusion_common::{exec_err, not_impl_err, plan_err, Result, ScalarValue}; @@ -226,8 +226,7 @@ fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> i64 fn 
to_utc_date_time(nanos: i64) -> DateTime { let secs = nanos / 1_000_000_000; let nsec = (nanos % 1_000_000_000) as u32; - let date = NaiveDateTime::from_timestamp_opt(secs, nsec).unwrap(); - DateTime::::from_naive_utc_and_offset(date, Utc) + DateTime::from_timestamp(secs, nsec).unwrap() } // Supported intervals: @@ -244,8 +243,9 @@ fn date_bin_impl( let stride = match stride { ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(v))) => { let (days, ms) = IntervalDayTimeType::to_parts(*v); - let nanos = (Duration::days(days as i64) + Duration::milliseconds(ms as i64)) - .num_nanoseconds(); + let nanos = (TimeDelta::try_days(days as i64).unwrap() + + TimeDelta::try_milliseconds(ms as i64).unwrap()) + .num_nanoseconds(); match nanos { Some(v) => Interval::Nanoseconds(v), @@ -266,8 +266,9 @@ fn date_bin_impl( Interval::Months(months as i64) } } else { - let nanos = (Duration::days(days as i64) + Duration::nanoseconds(nanos)) - .num_nanoseconds(); + let nanos = (TimeDelta::try_days(days as i64).unwrap() + + Duration::nanoseconds(nanos)) + .num_nanoseconds(); match nanos { Some(v) => Interval::Nanoseconds(v), _ => return exec_err!("DATE_BIN stride argument is too large"), @@ -423,6 +424,7 @@ mod tests { use arrow::datatypes::{DataType, TimeUnit}; use arrow_array::types::TimestampNanosecondType; use arrow_array::{IntervalDayTimeArray, TimestampNanosecondArray}; + use chrono::TimeDelta; use datafusion_common::ScalarValue; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; @@ -705,12 +707,10 @@ mod tests { #[test] fn test_date_bin_single() { - use chrono::Duration; - let cases = vec![ ( ( - Duration::minutes(15), + TimeDelta::try_minutes(15).unwrap(), "2004-04-09T02:03:04.123456789Z", "2001-01-01T00:00:00", ), @@ -718,7 +718,7 @@ mod tests { ), ( ( - Duration::minutes(15), + TimeDelta::try_minutes(15).unwrap(), "2004-04-09T02:03:04.123456789Z", "2001-01-01T00:02:30", ), @@ -726,7 +726,7 @@ mod tests { ), ( ( - Duration::minutes(15), + 
TimeDelta::try_minutes(15).unwrap(), "2004-04-09T02:03:04.123456789Z", "2005-01-01T00:02:30", ), @@ -734,7 +734,7 @@ mod tests { ), ( ( - Duration::hours(1), + TimeDelta::try_hours(1).unwrap(), "2004-04-09T02:03:04.123456789Z", "2001-01-01T00:00:00", ), @@ -742,7 +742,7 @@ mod tests { ), ( ( - Duration::seconds(10), + TimeDelta::try_seconds(10).unwrap(), "2004-04-09T02:03:11.123456789Z", "2001-01-01T00:00:00", ), diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index 4ece175abfb2..307510712686 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -33,7 +33,7 @@ use arrow_array::types::{ }; use arrow_array::{Array, PrimitiveArray}; use chrono::{ - DateTime, Datelike, Duration, LocalResult, NaiveDateTime, Offset, Timelike, + DateTime, Datelike, Duration, LocalResult, NaiveDateTime, Offset, TimeDelta, Timelike, }; use datafusion_common::cast::as_primitive_array; @@ -229,7 +229,9 @@ where .and_then(|d| d.with_second(0)) .and_then(|d| d.with_minute(0)) .and_then(|d| d.with_hour(0)) - .map(|d| d - Duration::seconds(60 * 60 * 24 * d.weekday() as i64)), + .map(|d| { + d - TimeDelta::try_seconds(60 * 60 * 24 * d.weekday() as i64).unwrap() + }), "month" => value .and_then(|d| d.with_nanosecond(0)) .and_then(|d| d.with_second(0)) @@ -280,10 +282,10 @@ fn _date_trunc_coarse_with_tz( // To account for this adjust the time by a few hours, convert to local time, // and then adjust the time back. 
truncated - .sub(Duration::hours(3)) + .sub(TimeDelta::try_hours(3).unwrap()) .and_local_timezone(value.timezone()) .single() - .map(|v| v.add(Duration::hours(3))) + .map(|v| v.add(TimeDelta::try_hours(3).unwrap())) } LocalResult::Single(datetime) => Some(datetime), LocalResult::Ambiguous(datetime1, datetime2) => { @@ -311,7 +313,7 @@ fn _date_trunc_coarse_without_tz( value: Option, ) -> Result> { let value = _date_trunc_coarse::(granularity, value)?; - Ok(value.and_then(|value| value.timestamp_nanos_opt())) + Ok(value.and_then(|value| value.and_utc().timestamp_nanos_opt())) } /// Truncates the single `value`, expressed in nanoseconds since the From a58a851bea765d3ee40eb2405a13463c478dcd3e Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 10 Mar 2024 09:25:31 -0400 Subject: [PATCH 13/17] Merge fixes --- .../physical-expr/src/datetime_expressions.rs | 1 - datafusion/physical-expr/src/functions.rs | 18 ------------------ .../proto/src/logical_plan/from_proto.rs | 4 ++-- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index 574f73ebc6a8..07c29fefdebb 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -19,7 +19,6 @@ use std::sync::Arc; -use arrow::datatypes::TimeUnit; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; use arrow::{ array::{Array, ArrayRef, PrimitiveArray}, diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 82e8f2e86e4d..84aa0c94a22d 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -439,24 +439,6 @@ pub fn create_physical_fun( BuiltinScalarFunction::ConcatWithSeparator => Arc::new(|args| { make_scalar_function_inner(string_expressions::concat_ws)(args) }), - BuiltinScalarFunction::Now => { - // bind value for now at 
plan time - Arc::new(datetime_expressions::make_now( - execution_props.query_execution_start_time, - )) - } - BuiltinScalarFunction::CurrentDate => { - // bind value for current_date at plan time - Arc::new(datetime_expressions::make_current_date( - execution_props.query_execution_start_time, - )) - } - BuiltinScalarFunction::CurrentTime => { - // bind value for current_time at plan time - Arc::new(datetime_expressions::make_current_time( - execution_props.query_execution_start_time, - )) - } BuiltinScalarFunction::MakeDate => Arc::new(datetime_expressions::make_date), BuiltinScalarFunction::ToChar => Arc::new(datetime_expressions::to_char), BuiltinScalarFunction::InitCap => Arc::new(|args| match args[0].data_type() { diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 6ef56ced4758..1af661ad8e5f 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -55,8 +55,8 @@ use datafusion_expr::{ bit_length, btrim, cbrt, ceil, character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, degrees, digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, - factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, - levenshtein, ln, log, log10, log2, + factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, levenshtein, ln, log, + log10, log2, logical_plan::{PlanType, StringifiedPlan}, lower, lpad, ltrim, md5, nanvl, octet_length, overlay, pi, power, radians, random, repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, sha384, sha512, From b63feb3ef8aae5fc00cbafdced1a06df9f75705f Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 10 Mar 2024 09:33:05 -0400 Subject: [PATCH 14/17] Removed make_now from incorrect merge. 
--- datafusion/physical-expr/src/datetime_expressions.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index 07c29fefdebb..d4b342153162 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -34,17 +34,6 @@ use chrono::NaiveDate; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::ColumnarValue; -pub fn make_now( - now_ts: DateTime, -) -> impl Fn(&[ColumnarValue]) -> Result { - let now_ts = now_ts.timestamp_nanos_opt(); - move |_arg| { - Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( - now_ts, - Some("+00:00".into()), - ))) - } -} /// Returns a string representation of a date, time, timestamp or duration based /// on a Chrono pattern. From 3986baa6daa292109a3fc083dc5f50c08a97ecec Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 10 Mar 2024 10:30:51 -0400 Subject: [PATCH 15/17] fmt fix. --- datafusion/physical-expr/src/datetime_expressions.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index d4b342153162..e0e86e7bd44b 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -34,7 +34,6 @@ use chrono::NaiveDate; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::ColumnarValue; - /// Returns a string representation of a date, time, timestamp or duration based /// on a Chrono pattern. /// From 92a13708134ae2fce034d1936a0a6f020d2a5cfa Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Wed, 13 Mar 2024 10:47:49 -0400 Subject: [PATCH 16/17] Updates after correcting merge conflicts. 
--- datafusion/expr/src/built_in_function.rs | 2 +- datafusion/proto/src/generated/pbjson.rs | 12 ------------ datafusion/proto/src/generated/prost.rs | 16 ++++------------ datafusion/proto/src/logical_plan/from_proto.rs | 16 ++++++++-------- 4 files changed, 13 insertions(+), 33 deletions(-) diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index b881af18d92c..0593ed4703bf 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -361,7 +361,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Translate => Volatility::Immutable, BuiltinScalarFunction::Trim => Volatility::Immutable, BuiltinScalarFunction::Upper => Volatility::Immutable, - BuiltinScalarFunction::OverLay => Volatility::Immutable, + BuiltinScalarFunction::OverLay => Volatility::Immutable, BuiltinScalarFunction::Levenshtein => Volatility::Immutable, BuiltinScalarFunction::SubstrIndex => Volatility::Immutable, BuiltinScalarFunction::FindInSet => Volatility::Immutable, diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 37cc1a45785b..be26ccee18c4 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -23157,16 +23157,12 @@ impl serde::Serialize for ScalarFunction { Self::Strpos => "Strpos", Self::Substr => "Substr", Self::ToHex => "ToHex", - Self::Now => "Now", Self::Translate => "Translate", Self::Trim => "Trim", Self::Upper => "Upper", Self::Coalesce => "Coalesce", Self::Power => "Power", - Self::FromUnixtime => "FromUnixtime", Self::Atan2 => "Atan2", - Self::CurrentDate => "CurrentDate", - Self::CurrentTime => "CurrentTime", Self::Uuid => "Uuid", Self::Cbrt => "Cbrt", Self::Acosh => "Acosh", @@ -23264,16 +23260,12 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Strpos", "Substr", "ToHex", - "Now", "Translate", "Trim", "Upper", "Coalesce", "Power", - "FromUnixtime", "Atan2", - "CurrentDate", - 
"CurrentTime", "Uuid", "Cbrt", "Acosh", @@ -23400,16 +23392,12 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Strpos" => Ok(ScalarFunction::Strpos), "Substr" => Ok(ScalarFunction::Substr), "ToHex" => Ok(ScalarFunction::ToHex), - "Now" => Ok(ScalarFunction::Now), "Translate" => Ok(ScalarFunction::Translate), "Trim" => Ok(ScalarFunction::Trim), "Upper" => Ok(ScalarFunction::Upper), "Coalesce" => Ok(ScalarFunction::Coalesce), "Power" => Ok(ScalarFunction::Power), - "FromUnixtime" => Ok(ScalarFunction::FromUnixtime), "Atan2" => Ok(ScalarFunction::Atan2), - "CurrentDate" => Ok(ScalarFunction::CurrentDate), - "CurrentTime" => Ok(ScalarFunction::CurrentTime), "Uuid" => Ok(ScalarFunction::Uuid), "Cbrt" => Ok(ScalarFunction::Cbrt), "Acosh" => Ok(ScalarFunction::Acosh), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index c557fb48b191..54d3bffae198 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2902,19 +2902,19 @@ pub enum ScalarFunction { /// 56 was ToTimestampMillis /// 57 was ToTimestampMicros /// 58 was ToTimestampSeconds - Now = 59, + /// 59 was Now Translate = 60, Trim = 61, Upper = 62, Coalesce = 63, Power = 64, /// 65 was StructFun - FromUnixtime = 66, + /// 66 was FromUnixtime Atan2 = 67, /// 68 was DateBin /// 69 was ArrowTypeof - CurrentDate = 70, - CurrentTime = 71, + /// 70 was CurrentDate + /// 71 was CurrentTime Uuid = 72, Cbrt = 73, Acosh = 74, @@ -3035,16 +3035,12 @@ impl ScalarFunction { ScalarFunction::Strpos => "Strpos", ScalarFunction::Substr => "Substr", ScalarFunction::ToHex => "ToHex", - ScalarFunction::Now => "Now", ScalarFunction::Translate => "Translate", ScalarFunction::Trim => "Trim", ScalarFunction::Upper => "Upper", ScalarFunction::Coalesce => "Coalesce", ScalarFunction::Power => "Power", - ScalarFunction::FromUnixtime => "FromUnixtime", ScalarFunction::Atan2 => "Atan2", - ScalarFunction::CurrentDate => "CurrentDate", - 
ScalarFunction::CurrentTime => "CurrentTime", ScalarFunction::Uuid => "Uuid", ScalarFunction::Cbrt => "Cbrt", ScalarFunction::Acosh => "Acosh", @@ -3136,16 +3132,12 @@ impl ScalarFunction { "Strpos" => Some(Self::Strpos), "Substr" => Some(Self::Substr), "ToHex" => Some(Self::ToHex), - "Now" => Some(Self::Now), "Translate" => Some(Self::Translate), "Trim" => Some(Self::Trim), "Upper" => Some(Self::Upper), "Coalesce" => Some(Self::Coalesce), "Power" => Some(Self::Power), - "FromUnixtime" => Some(Self::FromUnixtime), "Atan2" => Some(Self::Atan2), - "CurrentDate" => Some(Self::CurrentDate), - "CurrentTime" => Some(Self::CurrentTime), "Uuid" => Some(Self::Uuid), "Cbrt" => Some(Self::Cbrt), "Acosh" => Some(Self::Acosh), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 4b9bd45fd55b..fb7e82d51c52 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -50,19 +50,19 @@ use datafusion_expr::{ acosh, array_element, array_except, array_intersect, array_pop_back, array_pop_front, array_position, array_positions, array_remove, array_remove_all, array_remove_n, array_replace, array_replace_all, array_replace_n, array_resize, array_slice, - array_union, ascii, asinh, atan, atan2, atanh, - bit_length, btrim, cbrt, ceil, character_length, chr, coalesce, concat_expr, - concat_ws_expr, cos, cosh, cot, degrees, digest, ends_with, exp, + array_union, ascii, asinh, atan, atan2, atanh, bit_length, btrim, cbrt, ceil, + character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, + degrees, digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, levenshtein, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, lower, lpad, ltrim, md5, nanvl, octet_length, overlay, pi, power, radians, random, repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, 
sha384, sha512, - signum, sin, sinh, split_part, sqrt, starts_with, strpos, substr, - substr_index, substring, to_hex, translate, trim, trunc, upper, uuid, - AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, - Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet, + signum, sin, sinh, split_part, sqrt, starts_with, strpos, substr, substr_index, + substring, to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, Between, + BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, + GetFieldAccess, GetIndexedField, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -525,7 +525,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Coalesce => Self::Coalesce, ScalarFunction::Pi => Self::Pi, ScalarFunction::Power => Self::Power, - ScalarFunction::Atan2 => Self::Atan2, + ScalarFunction::Atan2 => Self::Atan2, ScalarFunction::Nanvl => Self::Nanvl, ScalarFunction::Iszero => Self::Iszero, ScalarFunction::OverLay => Self::OverLay, From 189190cbaeb27adb81443b32461c2a7772ebf116 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Wed, 13 Mar 2024 10:48:52 -0400 Subject: [PATCH 17/17] Only move the tests using now() function from optimizer_integration.rs to the core/tests folder, leave the rest in place. 
--- .../core/tests/optimizer_integration.rs | 262 ----------- .../optimizer/tests/optimizer_integration.rs | 406 ++++++++++++++++++ 2 files changed, 406 insertions(+), 262 deletions(-) create mode 100644 datafusion/optimizer/tests/optimizer_integration.rs diff --git a/datafusion/core/tests/optimizer_integration.rs b/datafusion/core/tests/optimizer_integration.rs index 4d9eae4f4125..f9696955769e 100644 --- a/datafusion/core/tests/optimizer_integration.rs +++ b/datafusion/core/tests/optimizer_integration.rs @@ -42,199 +42,6 @@ fn init() { let _ = env_logger::try_init(); } -#[test] -fn case_when() -> Result<()> { - let sql = "SELECT CASE WHEN col_int32 > 0 THEN 1 ELSE 0 END FROM test"; - let plan = test_sql(sql)?; - let expected = - "Projection: CASE WHEN test.col_int32 > Int32(0) THEN Int64(1) ELSE Int64(0) END AS CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END\ - \n TableScan: test projection=[col_int32]"; - assert_eq!(expected, format!("{plan:?}")); - - let sql = "SELECT CASE WHEN col_uint32 > 0 THEN 1 ELSE 0 END FROM test"; - let plan = test_sql(sql)?; - let expected = "Projection: CASE WHEN test.col_uint32 > UInt32(0) THEN Int64(1) ELSE Int64(0) END AS CASE WHEN test.col_uint32 > Int64(0) THEN Int64(1) ELSE Int64(0) END\ - \n TableScan: test projection=[col_uint32]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn subquery_filter_with_cast() -> Result<()> { - // regression test for https://github.com/apache/arrow-datafusion/issues/3760 - let sql = "SELECT col_int32 FROM test \ - WHERE col_int32 > (\ - SELECT AVG(col_int32) FROM test \ - WHERE col_utf8 BETWEEN '2002-05-08' \ - AND (cast('2002-05-08' as date) + interval '5 days')\ - )"; - let plan = test_sql(sql)?; - let expected = "Projection: test.col_int32\ - \n Inner Join: Filter: CAST(test.col_int32 AS Float64) > __scalar_sq_1.AVG(test.col_int32)\ - \n TableScan: test projection=[col_int32]\ - \n SubqueryAlias: __scalar_sq_1\ - \n Aggregate: groupBy=[[]], 
aggr=[[AVG(CAST(test.col_int32 AS Float64))]]\ - \n Projection: test.col_int32\ - \n Filter: test.col_utf8 >= Utf8(\"2002-05-08\") AND test.col_utf8 <= Utf8(\"2002-05-13\")\ - \n TableScan: test projection=[col_int32, col_utf8]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn case_when_aggregate() -> Result<()> { - let sql = "SELECT col_utf8, SUM(CASE WHEN col_int32 > 0 THEN 1 ELSE 0 END) AS n FROM test GROUP BY col_utf8"; - let plan = test_sql(sql)?; - let expected = "Projection: test.col_utf8, SUM(CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END) AS n\ - \n Aggregate: groupBy=[[test.col_utf8]], aggr=[[SUM(CASE WHEN test.col_int32 > Int32(0) THEN Int64(1) ELSE Int64(0) END) AS SUM(CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END)]]\ - \n TableScan: test projection=[col_int32, col_utf8]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn unsigned_target_type() -> Result<()> { - let sql = "SELECT col_utf8 FROM test WHERE col_uint32 > 0"; - let plan = test_sql(sql)?; - let expected = "Projection: test.col_utf8\ - \n Filter: test.col_uint32 > UInt32(0)\ - \n TableScan: test projection=[col_uint32, col_utf8]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn distribute_by() -> Result<()> { - // regression test for https://github.com/apache/arrow-datafusion/issues/3234 - let sql = "SELECT col_int32, col_utf8 FROM test DISTRIBUTE BY (col_utf8)"; - let plan = test_sql(sql)?; - let expected = "Repartition: DistributeBy(col_utf8)\ - \n TableScan: test projection=[col_int32, col_utf8]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn semi_join_with_join_filter() -> Result<()> { - // regression test for https://github.com/apache/arrow-datafusion/issues/2888 - let sql = "SELECT col_utf8 FROM test WHERE EXISTS (\ - SELECT col_utf8 FROM test t2 WHERE test.col_int32 = t2.col_int32 \ - AND test.col_uint32 != t2.col_uint32)"; - let plan = 
test_sql(sql)?; - let expected = "Projection: test.col_utf8\ - \n LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32\ - \n TableScan: test projection=[col_int32, col_uint32, col_utf8]\ - \n SubqueryAlias: __correlated_sq_1\ - \n SubqueryAlias: t2\ - \n TableScan: test projection=[col_int32, col_uint32]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn anti_join_with_join_filter() -> Result<()> { - // regression test for https://github.com/apache/arrow-datafusion/issues/2888 - let sql = "SELECT col_utf8 FROM test WHERE NOT EXISTS (\ - SELECT col_utf8 FROM test t2 WHERE test.col_int32 = t2.col_int32 \ - AND test.col_uint32 != t2.col_uint32)"; - let plan = test_sql(sql)?; - let expected = "Projection: test.col_utf8\ - \n LeftAnti Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32\ - \n TableScan: test projection=[col_int32, col_uint32, col_utf8]\ - \n SubqueryAlias: __correlated_sq_1\ - \n SubqueryAlias: t2\ - \n TableScan: test projection=[col_int32, col_uint32]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn where_exists_distinct() -> Result<()> { - let sql = "SELECT col_int32 FROM test WHERE EXISTS (\ - SELECT DISTINCT col_int32 FROM test t2 WHERE test.col_int32 = t2.col_int32)"; - let plan = test_sql(sql)?; - let expected = "LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32\ - \n TableScan: test projection=[col_int32]\ - \n SubqueryAlias: __correlated_sq_1\ - \n Aggregate: groupBy=[[t2.col_int32]], aggr=[[]]\ - \n SubqueryAlias: t2\ - \n TableScan: test projection=[col_int32]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn intersect() -> Result<()> { - let sql = "SELECT col_int32, col_utf8 FROM test \ - INTERSECT SELECT col_int32, col_utf8 FROM test \ - INTERSECT SELECT col_int32, col_utf8 FROM test"; - let plan = test_sql(sql)?; - let expected = - "LeftSemi 
Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8\ - \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ - \n LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8\ - \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ - \n TableScan: test projection=[col_int32, col_utf8]\ - \n TableScan: test projection=[col_int32, col_utf8]\ - \n TableScan: test projection=[col_int32, col_utf8]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn between_date32_plus_interval() -> Result<()> { - let sql = "SELECT count(1) FROM test \ - WHERE col_date32 between '1998-03-18' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; - let plan = test_sql(sql)?; - let expected = - "Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]]\ - \n Projection: \ - \n Filter: test.col_date32 >= Date32(\"10303\") AND test.col_date32 <= Date32(\"10393\")\ - \n TableScan: test projection=[col_date32]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn between_date64_plus_interval() -> Result<()> { - let sql = "SELECT count(1) FROM test \ - WHERE col_date64 between '1998-03-18T00:00:00' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; - let plan = test_sql(sql)?; - let expected = - "Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]]\ - \n Projection: \ - \n Filter: test.col_date64 >= Date64(\"890179200000\") AND test.col_date64 <= Date64(\"897955200000\")\ - \n TableScan: test projection=[col_date64]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - -#[test] -fn concat_literals() -> Result<()> { - let sql = "SELECT concat(true, col_int32, false, null, 'hello', col_utf8, 12, 3.4) \ - AS col - FROM test"; - let plan = test_sql(sql)?; - let expected = - "Projection: concat(Utf8(\"true\"), CAST(test.col_int32 AS Utf8), Utf8(\"falsehello\"), test.col_utf8, Utf8(\"123.4\")) AS col\ - \n TableScan: test projection=[col_int32, col_utf8]"; - assert_eq!(expected, format!("{plan:?}")); 
- Ok(()) -} - -#[test] -fn concat_ws_literals() -> Result<()> { - let sql = "SELECT concat_ws('-', true, col_int32, false, null, 'hello', col_utf8, 12, '', 3.4) \ - AS col - FROM test"; - let plan = test_sql(sql)?; - let expected = - "Projection: concat_ws(Utf8(\"-\"), Utf8(\"true\"), CAST(test.col_int32 AS Utf8), Utf8(\"false-hello\"), test.col_utf8, Utf8(\"12--3.4\")) AS col\ - \n TableScan: test projection=[col_int32, col_utf8]"; - assert_eq!(expected, format!("{plan:?}")); - Ok(()) -} - #[test] fn timestamp_nano_ts_none_predicates() -> Result<()> { let sql = "SELECT col_int32 @@ -267,75 +74,6 @@ fn timestamp_nano_ts_utc_predicates() { assert_eq!(expected, format!("{plan:?}")); } -#[test] -fn propagate_empty_relation() { - let sql = "SELECT test.col_int32 FROM test JOIN ( SELECT col_int32 FROM test WHERE false ) AS ta1 ON test.col_int32 = ta1.col_int32;"; - let plan = test_sql(sql).unwrap(); - // when children exist EmptyRelation, it will bottom-up propagate. - let expected = "EmptyRelation"; - assert_eq!(expected, format!("{plan:?}")); -} - -#[test] -fn join_keys_in_subquery_alias() { - let sql = "SELECT * FROM test AS A, ( SELECT col_int32 as key FROM test ) AS B where A.col_int32 = B.key;"; - let plan = test_sql(sql).unwrap(); - let expected = "Inner Join: a.col_int32 = b.key\ - \n SubqueryAlias: a\ - \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\ - \n SubqueryAlias: b\ - \n Projection: test.col_int32 AS key\ - \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32]"; - - assert_eq!(expected, format!("{plan:?}")); -} - -#[test] -fn join_keys_in_subquery_alias_1() { - let sql = "SELECT * FROM test AS A, ( SELECT test.col_int32 AS key FROM test JOIN test AS C on test.col_int32 = C.col_int32 ) AS B where A.col_int32 = B.key;"; - let plan = test_sql(sql).unwrap(); - let expected = "Inner Join: a.col_int32 = b.key\ - \n 
SubqueryAlias: a\ - \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\ - \n SubqueryAlias: b\ - \n Projection: test.col_int32 AS key\ - \n Inner Join: test.col_int32 = c.col_int32\ - \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32]\ - \n SubqueryAlias: c\ - \n Filter: test.col_int32 IS NOT NULL\ - \n TableScan: test projection=[col_int32]"; - assert_eq!(expected, format!("{plan:?}")); -} - -#[test] -fn push_down_filter_groupby_expr_contains_alias() { - let sql = "SELECT * FROM (SELECT (col_int32 + col_uint32) AS c, count(*) FROM test GROUP BY 1) where c > 3"; - let plan = test_sql(sql).unwrap(); - let expected = "Projection: test.col_int32 + test.col_uint32 AS c, COUNT(*)\ - \n Aggregate: groupBy=[[test.col_int32 + CAST(test.col_uint32 AS Int32)]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]]\ - \n Filter: test.col_int32 + CAST(test.col_uint32 AS Int32) > Int32(3)\ - \n TableScan: test projection=[col_int32, col_uint32]"; - assert_eq!(expected, format!("{plan:?}")); -} - -#[test] -// issue: https://github.com/apache/arrow-datafusion/issues/5334 -fn test_same_name_but_not_ambiguous() { - let sql = "SELECT t1.col_int32 AS col_int32 FROM test t1 intersect SELECT col_int32 FROM test t2"; - let plan = test_sql(sql).unwrap(); - let expected = "LeftSemi Join: t1.col_int32 = t2.col_int32\ - \n Aggregate: groupBy=[[t1.col_int32]], aggr=[[]]\ - \n SubqueryAlias: t1\ - \n TableScan: test projection=[col_int32]\ - \n SubqueryAlias: t2\ - \n TableScan: test projection=[col_int32]"; - assert_eq!(expected, format!("{plan:?}")); -} - fn test_sql(sql: &str) -> Result { // parse the SQL let dialect = GenericDialect {}; // or AnsiDialect, or your own dialect ... 
diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs new file mode 100644 index 000000000000..acafc0bafaf4 --- /dev/null +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -0,0 +1,406 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::collections::HashMap; +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use datafusion_common::config::ConfigOptions; +use datafusion_common::{plan_err, Result}; +use datafusion_expr::{AggregateUDF, LogicalPlan, ScalarUDF, TableSource, WindowUDF}; +use datafusion_optimizer::analyzer::Analyzer; +use datafusion_optimizer::optimizer::Optimizer; +use datafusion_optimizer::{OptimizerConfig, OptimizerContext}; +use datafusion_sql::planner::{ContextProvider, SqlToRel}; +use datafusion_sql::sqlparser::ast::Statement; +use datafusion_sql::sqlparser::dialect::GenericDialect; +use datafusion_sql::sqlparser::parser::Parser; +use datafusion_sql::TableReference; + +#[cfg(test)] +#[ctor::ctor] +fn init() { + // enable logging so RUST_LOG works + let _ = env_logger::try_init(); +} + +#[test] +fn case_when() -> Result<()> { + let sql = "SELECT CASE WHEN col_int32 > 0 THEN 1 ELSE 0 END FROM test"; + let plan = test_sql(sql)?; + let expected = + "Projection: CASE WHEN test.col_int32 > Int32(0) THEN Int64(1) ELSE Int64(0) END AS CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END\ + \n TableScan: test projection=[col_int32]"; + assert_eq!(expected, format!("{plan:?}")); + + let sql = "SELECT CASE WHEN col_uint32 > 0 THEN 1 ELSE 0 END FROM test"; + let plan = test_sql(sql)?; + let expected = "Projection: CASE WHEN test.col_uint32 > UInt32(0) THEN Int64(1) ELSE Int64(0) END AS CASE WHEN test.col_uint32 > Int64(0) THEN Int64(1) ELSE Int64(0) END\ + \n TableScan: test projection=[col_uint32]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn subquery_filter_with_cast() -> Result<()> { + // regression test for https://github.com/apache/arrow-datafusion/issues/3760 + let sql = "SELECT col_int32 FROM test \ + WHERE col_int32 > (\ + SELECT AVG(col_int32) FROM test \ + WHERE col_utf8 BETWEEN '2002-05-08' \ + AND (cast('2002-05-08' as date) + interval '5 days')\ + )"; + let plan = 
test_sql(sql)?; + let expected = "Projection: test.col_int32\ + \n Inner Join: Filter: CAST(test.col_int32 AS Float64) > __scalar_sq_1.AVG(test.col_int32)\ + \n TableScan: test projection=[col_int32]\ + \n SubqueryAlias: __scalar_sq_1\ + \n Aggregate: groupBy=[[]], aggr=[[AVG(CAST(test.col_int32 AS Float64))]]\ + \n Projection: test.col_int32\ + \n Filter: test.col_utf8 >= Utf8(\"2002-05-08\") AND test.col_utf8 <= Utf8(\"2002-05-13\")\ + \n TableScan: test projection=[col_int32, col_utf8]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn case_when_aggregate() -> Result<()> { + let sql = "SELECT col_utf8, SUM(CASE WHEN col_int32 > 0 THEN 1 ELSE 0 END) AS n FROM test GROUP BY col_utf8"; + let plan = test_sql(sql)?; + let expected = "Projection: test.col_utf8, SUM(CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END) AS n\ + \n Aggregate: groupBy=[[test.col_utf8]], aggr=[[SUM(CASE WHEN test.col_int32 > Int32(0) THEN Int64(1) ELSE Int64(0) END) AS SUM(CASE WHEN test.col_int32 > Int64(0) THEN Int64(1) ELSE Int64(0) END)]]\ + \n TableScan: test projection=[col_int32, col_utf8]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn unsigned_target_type() -> Result<()> { + let sql = "SELECT col_utf8 FROM test WHERE col_uint32 > 0"; + let plan = test_sql(sql)?; + let expected = "Projection: test.col_utf8\ + \n Filter: test.col_uint32 > UInt32(0)\ + \n TableScan: test projection=[col_uint32, col_utf8]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn distribute_by() -> Result<()> { + // regression test for https://github.com/apache/arrow-datafusion/issues/3234 + let sql = "SELECT col_int32, col_utf8 FROM test DISTRIBUTE BY (col_utf8)"; + let plan = test_sql(sql)?; + let expected = "Repartition: DistributeBy(col_utf8)\ + \n TableScan: test projection=[col_int32, col_utf8]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn semi_join_with_join_filter() -> Result<()> { + // 
regression test for https://github.com/apache/arrow-datafusion/issues/2888 + let sql = "SELECT col_utf8 FROM test WHERE EXISTS (\ + SELECT col_utf8 FROM test t2 WHERE test.col_int32 = t2.col_int32 \ + AND test.col_uint32 != t2.col_uint32)"; + let plan = test_sql(sql)?; + let expected = "Projection: test.col_utf8\ + \n LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32\ + \n TableScan: test projection=[col_int32, col_uint32, col_utf8]\ + \n SubqueryAlias: __correlated_sq_1\ + \n SubqueryAlias: t2\ + \n TableScan: test projection=[col_int32, col_uint32]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn anti_join_with_join_filter() -> Result<()> { + // regression test for https://github.com/apache/arrow-datafusion/issues/2888 + let sql = "SELECT col_utf8 FROM test WHERE NOT EXISTS (\ + SELECT col_utf8 FROM test t2 WHERE test.col_int32 = t2.col_int32 \ + AND test.col_uint32 != t2.col_uint32)"; + let plan = test_sql(sql)?; + let expected = "Projection: test.col_utf8\ + \n LeftAnti Join: test.col_int32 = __correlated_sq_1.col_int32 Filter: test.col_uint32 != __correlated_sq_1.col_uint32\ + \n TableScan: test projection=[col_int32, col_uint32, col_utf8]\ + \n SubqueryAlias: __correlated_sq_1\ + \n SubqueryAlias: t2\ + \n TableScan: test projection=[col_int32, col_uint32]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn where_exists_distinct() -> Result<()> { + let sql = "SELECT col_int32 FROM test WHERE EXISTS (\ + SELECT DISTINCT col_int32 FROM test t2 WHERE test.col_int32 = t2.col_int32)"; + let plan = test_sql(sql)?; + let expected = "LeftSemi Join: test.col_int32 = __correlated_sq_1.col_int32\ + \n TableScan: test projection=[col_int32]\ + \n SubqueryAlias: __correlated_sq_1\ + \n Aggregate: groupBy=[[t2.col_int32]], aggr=[[]]\ + \n SubqueryAlias: t2\ + \n TableScan: test projection=[col_int32]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + 
+#[test] +fn intersect() -> Result<()> { + let sql = "SELECT col_int32, col_utf8 FROM test \ + INTERSECT SELECT col_int32, col_utf8 FROM test \ + INTERSECT SELECT col_int32, col_utf8 FROM test"; + let plan = test_sql(sql)?; + let expected = + "LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8\ + \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ + \n LeftSemi Join: test.col_int32 = test.col_int32, test.col_utf8 = test.col_utf8\ + \n Aggregate: groupBy=[[test.col_int32, test.col_utf8]], aggr=[[]]\ + \n TableScan: test projection=[col_int32, col_utf8]\ + \n TableScan: test projection=[col_int32, col_utf8]\ + \n TableScan: test projection=[col_int32, col_utf8]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn between_date32_plus_interval() -> Result<()> { + let sql = "SELECT count(1) FROM test \ + WHERE col_date32 between '1998-03-18' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; + let plan = test_sql(sql)?; + let expected = + "Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]]\ + \n Projection: \ + \n Filter: test.col_date32 >= Date32(\"10303\") AND test.col_date32 <= Date32(\"10393\")\ + \n TableScan: test projection=[col_date32]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn between_date64_plus_interval() -> Result<()> { + let sql = "SELECT count(1) FROM test \ + WHERE col_date64 between '1998-03-18T00:00:00' AND cast('1998-03-18' as date) + INTERVAL '90 days'"; + let plan = test_sql(sql)?; + let expected = + "Aggregate: groupBy=[[]], aggr=[[COUNT(Int64(1))]]\ + \n Projection: \ + \n Filter: test.col_date64 >= Date64(\"890179200000\") AND test.col_date64 <= Date64(\"897955200000\")\ + \n TableScan: test projection=[col_date64]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn concat_literals() -> Result<()> { + let sql = "SELECT concat(true, col_int32, false, null, 'hello', col_utf8, 12, 3.4) \ + AS col + FROM test"; + let plan = 
test_sql(sql)?; + let expected = + "Projection: concat(Utf8(\"true\"), CAST(test.col_int32 AS Utf8), Utf8(\"falsehello\"), test.col_utf8, Utf8(\"123.4\")) AS col\ + \n TableScan: test projection=[col_int32, col_utf8]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn concat_ws_literals() -> Result<()> { + let sql = "SELECT concat_ws('-', true, col_int32, false, null, 'hello', col_utf8, 12, '', 3.4) \ + AS col + FROM test"; + let plan = test_sql(sql)?; + let expected = + "Projection: concat_ws(Utf8(\"-\"), Utf8(\"true\"), CAST(test.col_int32 AS Utf8), Utf8(\"false-hello\"), test.col_utf8, Utf8(\"12--3.4\")) AS col\ + \n TableScan: test projection=[col_int32, col_utf8]"; + assert_eq!(expected, format!("{plan:?}")); + Ok(()) +} + +#[test] +fn propagate_empty_relation() { + let sql = "SELECT test.col_int32 FROM test JOIN ( SELECT col_int32 FROM test WHERE false ) AS ta1 ON test.col_int32 = ta1.col_int32;"; + let plan = test_sql(sql).unwrap(); + // when children exist EmptyRelation, it will bottom-up propagate. 
+ let expected = "EmptyRelation"; + assert_eq!(expected, format!("{plan:?}")); +} + +#[test] +fn join_keys_in_subquery_alias() { + let sql = "SELECT * FROM test AS A, ( SELECT col_int32 as key FROM test ) AS B where A.col_int32 = B.key;"; + let plan = test_sql(sql).unwrap(); + let expected = "Inner Join: a.col_int32 = b.key\ + \n SubqueryAlias: a\ + \n Filter: test.col_int32 IS NOT NULL\ + \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\ + \n SubqueryAlias: b\ + \n Projection: test.col_int32 AS key\ + \n Filter: test.col_int32 IS NOT NULL\ + \n TableScan: test projection=[col_int32]"; + + assert_eq!(expected, format!("{plan:?}")); +} + +#[test] +fn join_keys_in_subquery_alias_1() { + let sql = "SELECT * FROM test AS A, ( SELECT test.col_int32 AS key FROM test JOIN test AS C on test.col_int32 = C.col_int32 ) AS B where A.col_int32 = B.key;"; + let plan = test_sql(sql).unwrap(); + let expected = "Inner Join: a.col_int32 = b.key\ + \n SubqueryAlias: a\ + \n Filter: test.col_int32 IS NOT NULL\ + \n TableScan: test projection=[col_int32, col_uint32, col_utf8, col_date32, col_date64, col_ts_nano_none, col_ts_nano_utc]\ + \n SubqueryAlias: b\ + \n Projection: test.col_int32 AS key\ + \n Inner Join: test.col_int32 = c.col_int32\ + \n Filter: test.col_int32 IS NOT NULL\ + \n TableScan: test projection=[col_int32]\ + \n SubqueryAlias: c\ + \n Filter: test.col_int32 IS NOT NULL\ + \n TableScan: test projection=[col_int32]"; + assert_eq!(expected, format!("{plan:?}")); +} + +#[test] +fn push_down_filter_groupby_expr_contains_alias() { + let sql = "SELECT * FROM (SELECT (col_int32 + col_uint32) AS c, count(*) FROM test GROUP BY 1) where c > 3"; + let plan = test_sql(sql).unwrap(); + let expected = "Projection: test.col_int32 + test.col_uint32 AS c, COUNT(*)\ + \n Aggregate: groupBy=[[test.col_int32 + CAST(test.col_uint32 AS Int32)]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]]\ + \n Filter: test.col_int32 + 
CAST(test.col_uint32 AS Int32) > Int32(3)\ + \n TableScan: test projection=[col_int32, col_uint32]"; + assert_eq!(expected, format!("{plan:?}")); +} + +#[test] +// issue: https://github.com/apache/arrow-datafusion/issues/5334 +fn test_same_name_but_not_ambiguous() { + let sql = "SELECT t1.col_int32 AS col_int32 FROM test t1 intersect SELECT col_int32 FROM test t2"; + let plan = test_sql(sql).unwrap(); + let expected = "LeftSemi Join: t1.col_int32 = t2.col_int32\ + \n Aggregate: groupBy=[[t1.col_int32]], aggr=[[]]\ + \n SubqueryAlias: t1\ + \n TableScan: test projection=[col_int32]\ + \n SubqueryAlias: t2\ + \n TableScan: test projection=[col_int32]"; + assert_eq!(expected, format!("{plan:?}")); +} + +fn test_sql(sql: &str) -> Result { + // parse the SQL + let dialect = GenericDialect {}; // or AnsiDialect, or your own dialect ... + let ast: Vec = Parser::parse_sql(&dialect, sql).unwrap(); + let statement = &ast[0]; + let context_provider = MyContextProvider::default(); + let sql_to_rel = SqlToRel::new(&context_provider); + let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap(); + + let config = OptimizerContext::new().with_skip_failing_rules(false); + let analyzer = Analyzer::new(); + let optimizer = Optimizer::new(); + // analyze and optimize the logical plan + let plan = analyzer.execute_and_check(&plan, config.options(), |_, _| {})?; + optimizer.optimize(&plan, &config, |_, _| {}) +} + +#[derive(Default)] +struct MyContextProvider { + options: ConfigOptions, +} + +impl ContextProvider for MyContextProvider { + fn get_table_source(&self, name: TableReference) -> Result> { + let table_name = name.table(); + if table_name.starts_with("test") { + let schema = Schema::new_with_metadata( + vec![ + Field::new("col_int32", DataType::Int32, true), + Field::new("col_uint32", DataType::UInt32, true), + Field::new("col_utf8", DataType::Utf8, true), + Field::new("col_date32", DataType::Date32, true), + Field::new("col_date64", DataType::Date64, true), + // 
timestamp with no timezone + Field::new( + "col_ts_nano_none", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + // timestamp with UTC timezone + Field::new( + "col_ts_nano_utc", + DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())), + true, + ), + ], + HashMap::new(), + ); + + Ok(Arc::new(MyTableSource { + schema: Arc::new(schema), + })) + } else { + plan_err!("table does not exist") + } + } + + fn get_function_meta(&self, _name: &str) -> Option> { + None + } + + fn get_aggregate_meta(&self, _name: &str) -> Option> { + None + } + + fn get_variable_type(&self, _variable_names: &[String]) -> Option { + None + } + + fn get_window_meta(&self, _name: &str) -> Option> { + None + } + + fn options(&self) -> &ConfigOptions { + &self.options + } + + fn udfs_names(&self) -> Vec { + Vec::new() + } + + fn udafs_names(&self) -> Vec { + Vec::new() + } + + fn udwfs_names(&self) -> Vec { + Vec::new() + } +} + +struct MyTableSource { + schema: SchemaRef, +} + +impl TableSource for MyTableSource { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +}