diff --git a/datafusion/functions/benches/to_timestamp.rs b/datafusion/functions/benches/to_timestamp.rs index e734b6832f29..5a87b34caf47 100644 --- a/datafusion/functions/benches/to_timestamp.rs +++ b/datafusion/functions/benches/to_timestamp.rs @@ -20,27 +20,123 @@ extern crate criterion; use std::sync::Arc; use arrow::array::builder::StringBuilder; -use arrow::array::ArrayRef; +use arrow::array::{ArrayRef, StringArray}; +use arrow::compute::cast; +use arrow::datatypes::DataType; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use datafusion_expr::ColumnarValue; use datafusion_functions::datetime::to_timestamp; +fn data() -> StringArray { + let data: Vec<&str> = vec![ + "1997-01-31T09:26:56.123Z", + "1997-01-31T09:26:56.123-05:00", + "1997-01-31 09:26:56.123-05:00", + "2023-01-01 04:05:06.789 -08", + "1997-01-31T09:26:56.123", + "1997-01-31 09:26:56.123", + "1997-01-31 09:26:56", + "1997-01-31 13:26:56", + "1997-01-31 13:26:56+04:00", + "1997-01-31", + ]; + + StringArray::from(data) +} + +fn data_with_formats() -> (StringArray, StringArray, StringArray, StringArray) { + let mut inputs = StringBuilder::new(); + let mut format1_builder = StringBuilder::with_capacity(2, 10); + let mut format2_builder = StringBuilder::with_capacity(2, 10); + let mut format3_builder = StringBuilder::with_capacity(2, 10); + + inputs.append_value("1997-01-31T09:26:56.123Z"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z"); + + inputs.append_value("1997-01-31T09:26:56.123-05:00"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z"); + + inputs.append_value("1997-01-31 09:26:56.123-05:00"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z"); + + inputs.append_value("2023-01-01 04:05:06.789 -08"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z"); + + inputs.append_value("1997-01-31T09:26:56.123"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f"); + + inputs.append_value("1997-01-31 09:26:56.123"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f"); + + inputs.append_value("1997-01-31 09:26:56"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S"); + + inputs.append_value("1997-01-31 092656"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H%M%S"); + + inputs.append_value("1997-01-31 092656+04:00"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H%M%S%:z"); + + inputs.append_value("Sun Jul 8 00:34:60 2001"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d 00:00:00"); + + ( + inputs.finish(), + format1_builder.finish(), + format2_builder.finish(), + format3_builder.finish(), + ) +} fn criterion_benchmark(c: &mut Criterion) { - c.bench_function("to_timestamp_no_formats", |b| { - let mut inputs = StringBuilder::new(); - inputs.append_value("1997-01-31T09:26:56.123Z"); - inputs.append_value("1997-01-31T09:26:56.123-05:00"); - 
inputs.append_value("1997-01-31 09:26:56.123-05:00"); - inputs.append_value("2023-01-01 04:05:06.789 -08"); - inputs.append_value("1997-01-31T09:26:56.123"); - inputs.append_value("1997-01-31 09:26:56.123"); - inputs.append_value("1997-01-31 09:26:56"); - inputs.append_value("1997-01-31 13:26:56"); - inputs.append_value("1997-01-31 13:26:56+04:00"); - inputs.append_value("1997-01-31"); - - let string_array = ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef); + c.bench_function("to_timestamp_no_formats_utf8", |b| { + let string_array = ColumnarValue::Array(Arc::new(data()) as ArrayRef); + + b.iter(|| { + black_box( + to_timestamp() + .invoke(&[string_array.clone()]) + .expect("to_timestamp should work on valid values"), + ) + }) + }); + + c.bench_function("to_timestamp_no_formats_largeutf8", |b| { + let data = cast(&data(), &DataType::LargeUtf8).unwrap(); + let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); + + b.iter(|| { + black_box( + to_timestamp() + .invoke(&[string_array.clone()]) + .expect("to_timestamp should work on valid values"), + ) + }) + }); + + c.bench_function("to_timestamp_no_formats_utf8view", |b| { + let data = cast(&data(), &DataType::Utf8View).unwrap(); + let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); b.iter(|| { black_box( @@ -51,67 +147,66 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("to_timestamp_with_formats", |b| { - let mut inputs = StringBuilder::new(); - let mut format1_builder = StringBuilder::with_capacity(2, 10); - let mut format2_builder = StringBuilder::with_capacity(2, 10); - let mut format3_builder = StringBuilder::with_capacity(2, 10); - - inputs.append_value("1997-01-31T09:26:56.123Z"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z"); - - inputs.append_value("1997-01-31T09:26:56.123-05:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z"); - - inputs.append_value("1997-01-31 09:26:56.123-05:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z"); - - inputs.append_value("2023-01-01 04:05:06.789 -08"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z"); - - inputs.append_value("1997-01-31T09:26:56.123"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f"); - - inputs.append_value("1997-01-31 09:26:56.123"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f"); - - inputs.append_value("1997-01-31 09:26:56"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S"); - - inputs.append_value("1997-01-31 092656"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H%M%S"); - - inputs.append_value("1997-01-31 092656+04:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H%M%S%:z"); - - inputs.append_value("Sun Jul 8 00:34:60 2001"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d 00:00:00"); + 
c.bench_function("to_timestamp_with_formats_utf8", |b| { + let (inputs, format1, format2, format3) = data_with_formats(); + + let args = [ + ColumnarValue::Array(Arc::new(inputs) as ArrayRef), + ColumnarValue::Array(Arc::new(format1) as ArrayRef), + ColumnarValue::Array(Arc::new(format2) as ArrayRef), + ColumnarValue::Array(Arc::new(format3) as ArrayRef), + ]; + b.iter(|| { + black_box( + to_timestamp() + .invoke(&args.clone()) + .expect("to_timestamp should work on valid values"), + ) + }) + }); + + c.bench_function("to_timestamp_with_formats_largeutf8", |b| { + let (inputs, format1, format2, format3) = data_with_formats(); + + let args = [ + ColumnarValue::Array( + Arc::new(cast(&inputs, &DataType::LargeUtf8).unwrap()) as ArrayRef + ), + ColumnarValue::Array( + Arc::new(cast(&format1, &DataType::LargeUtf8).unwrap()) as ArrayRef + ), + ColumnarValue::Array( + Arc::new(cast(&format2, &DataType::LargeUtf8).unwrap()) as ArrayRef + ), + ColumnarValue::Array( + Arc::new(cast(&format3, &DataType::LargeUtf8).unwrap()) as ArrayRef + ), + ]; + b.iter(|| { + black_box( + to_timestamp() + .invoke(&args.clone()) + .expect("to_timestamp should work on valid values"), + ) + }) + }); + + c.bench_function("to_timestamp_with_formats_utf8view", |b| { + let (inputs, format1, format2, format3) = data_with_formats(); let args = [ - ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), - ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), + ColumnarValue::Array( + Arc::new(cast(&inputs, &DataType::Utf8View).unwrap()) as ArrayRef + ), + ColumnarValue::Array( + Arc::new(cast(&format1, &DataType::Utf8View).unwrap()) as ArrayRef + ), + ColumnarValue::Array( + Arc::new(cast(&format2, &DataType::Utf8View).unwrap()) as ArrayRef + ), + ColumnarValue::Array( + Arc::new(cast(&format3, &DataType::Utf8View).unwrap()) as ArrayRef + ), ]; b.iter(|| { black_box( diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs index 89b40a3534d3..6e3106a5bce6 100644 --- a/datafusion/functions/src/datetime/common.rs +++ b/datafusion/functions/src/datetime/common.rs @@ -18,15 +18,16 @@ use std::sync::Arc; use arrow::array::{ - Array, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray, + Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray, + StringViewArray, }; use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; use arrow::datatypes::DataType; use chrono::format::{parse, Parsed, StrftimeItems}; use chrono::LocalResult::Single; use chrono::{DateTime, TimeZone, Utc}; -use itertools::Either; +use crate::strings::StringArrayType; use datafusion_common::cast::as_generic_string_array; use datafusion_common::{ exec_err, unwrap_or_internal_err, DataFusionError, Result, ScalarType, ScalarValue, @@ -41,14 +42,15 @@ pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result { string_to_timestamp_nanos(s).map_err(|e| e.into()) } -/// Checks that all the arguments from the second are of type [Utf8] or [LargeUtf8] +/// Checks that all the arguments from the second are of type [Utf8], [LargeUtf8] or [Utf8View] /// /// [Utf8]: DataType::Utf8 /// [LargeUtf8]: DataType::LargeUtf8 +/// [Utf8View]: DataType::Utf8View pub(crate) fn validate_data_types(args: &[ColumnarValue], name: &str) -> Result<()> { for (idx, a) in args.iter().skip(1).enumerate() { match a.data_type() { - 
DataType::Utf8 | DataType::LargeUtf8 => {
+            DataType::Utf8View | DataType::LargeUtf8 | DataType::Utf8 => {
                 // all good
             }
             _ => {
@@ -178,26 +180,43 @@ pub(crate) fn string_to_timestamp_millis_formatted(s: &str, format: &str) -> Res
         .timestamp_millis())
 }
 
-pub(crate) fn handle<'a, O, F, S>(
-    args: &'a [ColumnarValue],
+pub(crate) fn handle<O, F, S>(
+    args: &[ColumnarValue],
     op: F,
     name: &str,
 ) -> Result<ColumnarValue>
 where
     O: ArrowPrimitiveType,
     S: ScalarType<O::Native>,
-    F: Fn(&'a str) -> Result<O::Native>,
+    F: Fn(&str) -> Result<O::Native>,
 {
     match &args[0] {
         ColumnarValue::Array(a) => match a.data_type() {
-            DataType::Utf8 | DataType::LargeUtf8 => Ok(ColumnarValue::Array(Arc::new(
-                unary_string_to_primitive_function::<i32, O, _>(&[a.as_ref()], op, name)?,
+            DataType::Utf8View => Ok(ColumnarValue::Array(Arc::new(
+                unary_string_to_primitive_function::<&StringViewArray, O, _>(
+                    a.as_ref().as_string_view(),
+                    op,
+                )?,
+            ))),
+            DataType::LargeUtf8 => Ok(ColumnarValue::Array(Arc::new(
+                unary_string_to_primitive_function::<&GenericStringArray<i64>, O, _>(
+                    a.as_ref().as_string::<i64>(),
+                    op,
+                )?,
+            ))),
+            DataType::Utf8 => Ok(ColumnarValue::Array(Arc::new(
+                unary_string_to_primitive_function::<&GenericStringArray<i32>, O, _>(
+                    a.as_ref().as_string::<i32>(),
+                    op,
+                )?,
             ))),
             other => exec_err!("Unsupported data type {other:?} for function {name}"),
         },
         ColumnarValue::Scalar(scalar) => match scalar {
-            ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => {
-                let result = a.as_ref().map(|x| (op)(x)).transpose()?;
+            ScalarValue::Utf8View(a)
+            | ScalarValue::LargeUtf8(a)
+            | ScalarValue::Utf8(a) => {
+                let result = a.as_ref().map(|x| op(x)).transpose()?;
                 Ok(ColumnarValue::Scalar(S::scalar(result)))
             }
             other => exec_err!("Unsupported data type {other:?} for function {name}"),
@@ -205,11 +224,11 @@ where
     }
 }
 
-// given an function that maps a `&str`, `&str` to an arrow native type,
+// Given a function that maps a `&str`, `&str` to an arrow native type,
 // returns a `ColumnarValue` where the function is applied to either a `ArrayRef` or `ScalarValue`
 // depending on the `args`'s variant.
-pub(crate) fn handle_multiple<'a, O, F, S, M>(
-    args: &'a [ColumnarValue],
+pub(crate) fn handle_multiple<O, F, S, M>(
+    args: &[ColumnarValue],
     op: F,
     op2: M,
     name: &str,
@@ -217,24 +236,24 @@ where
     O: ArrowPrimitiveType,
     S: ScalarType<O::Native>,
-    F: Fn(&'a str, &'a str) -> Result<O::Native>,
+    F: Fn(&str, &str) -> Result<O::Native>,
     M: Fn(O::Native) -> O::Native,
 {
     match &args[0] {
         ColumnarValue::Array(a) => match a.data_type() {
-            DataType::Utf8 | DataType::LargeUtf8 => {
+            DataType::Utf8View | DataType::LargeUtf8 | DataType::Utf8 => {
                 // validate the column types
                 for (pos, arg) in args.iter().enumerate() {
                     match arg {
                         ColumnarValue::Array(arg) => match arg.data_type() {
-                            DataType::Utf8 | DataType::LargeUtf8 => {
+                            DataType::Utf8View | DataType::LargeUtf8 | DataType::Utf8 => {
                                 // all good
                             }
                             other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"),
                         },
                         ColumnarValue::Scalar(arg) => {
                             match arg.data_type() {
-                                DataType::Utf8 | DataType::LargeUtf8 => {
+                                DataType::Utf8View | DataType::LargeUtf8 | DataType::Utf8 => {
                                     // all good
                                 }
                                 other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"),
@@ -244,7 +263,7 @@ where
                 }
 
                 Ok(ColumnarValue::Array(Arc::new(
-                    strings_to_primitive_function::<i32, O, _, _>(args, op, op2, name)?,
+                    strings_to_primitive_function::<O, _, _>(args, op, op2, name)?,
                 )))
             }
             other => {
@@ -253,7 +272,9 @@ where
         },
         // if the first argument is a scalar utf8 all arguments are expected to be scalar utf8
         ColumnarValue::Scalar(scalar) => match scalar {
-            ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => {
+            ScalarValue::Utf8View(a)
+            | ScalarValue::LargeUtf8(a)
+            | ScalarValue::Utf8(a) => {
                 let a = a.as_ref();
                 // ASK: Why do we trust `a` to be non-null at this point?
                 let a = unwrap_or_internal_err!(a);
@@ -262,7 +283,9 @@ where
                 for (pos, v) in args.iter().enumerate().skip(1) {
                     let ColumnarValue::Scalar(
-                        ScalarValue::Utf8(x) | ScalarValue::LargeUtf8(x),
+                        ScalarValue::Utf8View(x)
+                        | ScalarValue::LargeUtf8(x)
+                        | ScalarValue::Utf8(x),
                     ) = v
                     else {
                         return exec_err!("Unsupported data type {v:?} for function {name}, arg # {pos}");
@@ -299,18 +322,16 @@ where
 /// # Errors
 /// This function errors iff:
 /// * the number of arguments is not > 1 or
-/// * the array arguments are not castable to a `GenericStringArray` or
 /// * the function `op` errors for all input
-pub(crate) fn strings_to_primitive_function<'a, T, O, F, F2>(
-    args: &'a [ColumnarValue],
+pub(crate) fn strings_to_primitive_function<O, F, F2>(
+    args: &[ColumnarValue],
     op: F,
     op2: F2,
     name: &str,
 ) -> Result<PrimitiveArray<O>>
 where
     O: ArrowPrimitiveType,
-    T: OffsetSizeTrait,
-    F: Fn(&'a str, &'a str) -> Result<O::Native>,
+    F: Fn(&str, &str) -> Result<O::Native>,
     F2: Fn(O::Native) -> O::Native,
 {
     if args.len() < 2 {
@@ -321,50 +342,90 @@ where
         );
     }
 
-    // this will throw the error if any of the array args are not castable to GenericStringArray
-    let data = args
-        .iter()
-        .map(|a| match a {
-            ColumnarValue::Array(a) => {
-                Ok(Either::Left(as_generic_string_array::<T>(a.as_ref())?))
+    match &args[0] {
+        ColumnarValue::Array(a) => match a.data_type() {
+            DataType::Utf8View => {
+                let string_array = a.as_string_view();
+                handle_array_op::<O, &StringViewArray, F, F2>(
+                    &string_array,
+                    &args[1..],
+                    op,
+                    op2,
+                )
             }
-            ColumnarValue::Scalar(s) => match s {
-                ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => Ok(Either::Right(a)),
-                other => exec_err!(
-                    "Unexpected scalar type encountered '{other}' for function '{name}'"
-                ),
-            },
-        })
-        .collect::<Result<Vec<Either<&GenericStringArray<T>, &Option<String>>>>>()?;
-
-    let first_arg = &data.first().unwrap().left().unwrap();
+            DataType::LargeUtf8 => {
+                let string_array = as_generic_string_array::<i64>(&a)?;
+                handle_array_op::<O, &GenericStringArray<i64>, F, F2>(
+                    &string_array,
+                    &args[1..],
+                    op,
+                    op2,
+                )
+            }
+            DataType::Utf8 => {
+                let string_array = as_generic_string_array::<i32>(&a)?;
+                handle_array_op::<O, &GenericStringArray<i32>, F, F2>(
+                    &string_array,
+                    &args[1..],
+                    op,
+                    op2,
+                )
+            }
+            other => exec_err!(
+                "Unsupported data type {other:?} for function {name}, \
+                expected Utf8View, Utf8 or LargeUtf8."
+            ),
+        },
+        other => exec_err!(
+            "Received {} data type, expected only array",
+            other.data_type()
+        ),
+    }
+}
 
-    first_arg
+fn handle_array_op<'a, O, V, F, F2>(
+    first: &V,
+    args: &[ColumnarValue],
+    op: F,
+    op2: F2,
+) -> Result<PrimitiveArray<O>>
+where
+    V: StringArrayType<'a>,
+    O: ArrowPrimitiveType,
+    F: Fn(&str, &str) -> Result<O::Native>,
+    F2: Fn(O::Native) -> O::Native,
+{
+    first
         .iter()
         .enumerate()
         .map(|(pos, x)| {
             let mut val = None;
             if let Some(x) = x {
-                let param_args = data.iter().skip(1);
-
-                // go through the args and find the first successful result. Only the last
-                // failure will be returned if no successful result was received.
-                for param_arg in param_args {
-                    // param_arg is an array, use the corresponding index into the array as the arg
-                    // we're currently parsing
-                    let p = *param_arg;
-                    let r = if p.is_left() {
-                        let p = p.left().unwrap();
-                        op(x, p.value(pos))
-                    }
-                    // args is a scalar, use it directly
-                    else if let Some(p) = p.right().unwrap() {
-                        op(x, p.as_str())
-                    } else {
-                        continue;
-                    };
+                for arg in args {
+                    let v = match arg {
+                        ColumnarValue::Array(a) => match a.data_type() {
+                            DataType::Utf8View => Ok(a.as_string_view().value(pos)),
+                            DataType::LargeUtf8 => Ok(a.as_string::<i64>().value(pos)),
+                            DataType::Utf8 => Ok(a.as_string::<i32>().value(pos)),
+                            other => exec_err!("Unexpected type encountered '{other}'"),
+                        },
+                        ColumnarValue::Scalar(s) => match s {
+                            ScalarValue::Utf8View(a)
+                            | ScalarValue::LargeUtf8(a)
+                            | ScalarValue::Utf8(a) => {
+                                if let Some(v) = a {
+                                    Ok(v.as_str())
+                                } else {
+                                    continue;
+                                }
+                            }
+                            other => {
+                                exec_err!("Unexpected scalar type encountered '{other}'")
+                            }
+                        },
+                    }?;
+                    let r = op(x, v);
 
                     if r.is_ok() {
                         val = Some(Ok(op2(r.unwrap())));
                         break;
@@ -385,28 +446,16 @@ where
 /// # Errors
 /// This function errors iff:
 /// * the number of arguments is not 1 or
-/// * the first argument is not castable to a `GenericStringArray` or
 /// * the function `op` errors
-fn unary_string_to_primitive_function<'a, T, O, F>(
-    args: &[&'a dyn Array],
+fn unary_string_to_primitive_function<'a, StringArrType, O, F>(
+    array: StringArrType,
     op: F,
-    name: &str,
 ) -> Result<PrimitiveArray<O>>
 where
+    StringArrType: StringArrayType<'a>,
     O: ArrowPrimitiveType,
-    T: OffsetSizeTrait,
     F: Fn(&'a str) -> Result<O::Native>,
 {
-    if args.len() != 1 {
-        return exec_err!(
-            "{:?} args were supplied but {} takes exactly one argument",
-            args.len(),
-            name
-        );
-    }
-
-    let array = as_generic_string_array::<T>(args[0])?;
-
     // first map is the iterator, second is for the `Option<_>`
     array.iter().map(|x| x.map(&op).transpose()).collect()
 }
diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs
index 2803fd042b99..b21fe995cea6 100644
--- a/datafusion/functions/src/datetime/to_date.rs
+++ b/datafusion/functions/src/datetime/to_date.rs
@@ -17,7 +17,7 @@
 use crate::datetime::common::*;
 use arrow::datatypes::DataType;
-use arrow::datatypes::DataType::Date32;
+use arrow::datatypes::DataType::*;
 use arrow::error::ArrowError::ParseError;
 use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser};
 use datafusion_common::error::DataFusionError;
@@ -151,13 +151,10 @@
impl ScalarUDFImpl for ToDateFunc { } match args[0].data_type() { - DataType::Int32 - | DataType::Int64 - | DataType::Null - | DataType::Float64 - | DataType::Date32 - | DataType::Date64 => args[0].cast_to(&DataType::Date32, None), - DataType::Utf8 => self.to_date(args), + Int32 | Int64 | Null | Float64 | Date32 | Date64 => { + args[0].cast_to(&Date32, None) + } + Utf8View | LargeUtf8 | Utf8 => self.to_date(args), other => { exec_err!("Unsupported data type {:?} for function to_date", other) } @@ -171,9 +168,11 @@ impl ScalarUDFImpl for ToDateFunc { #[cfg(test)] mod tests { + use arrow::array::{Array, Date32Array, GenericStringArray, StringViewArray}; use arrow::{compute::kernels::cast_utils::Parser, datatypes::Date32Type}; use datafusion_common::ScalarValue; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + use std::sync::Arc; use super::ToDateFunc; @@ -204,9 +203,17 @@ mod tests { ]; for tc in &test_cases { - let date_scalar = ScalarValue::Utf8(Some(tc.date_str.to_string())); - let to_date_result = - ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]); + test_scalar(ScalarValue::Utf8(Some(tc.date_str.to_string())), tc); + test_scalar(ScalarValue::LargeUtf8(Some(tc.date_str.to_string())), tc); + test_scalar(ScalarValue::Utf8View(Some(tc.date_str.to_string())), tc); + + test_array::>(tc); + test_array::>(tc); + test_array::(tc); + } + + fn test_scalar(sv: ScalarValue, tc: &TestCase) { + let to_date_result = ToDateFunc::new().invoke(&[ColumnarValue::Scalar(sv)]); match to_date_result { Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => { @@ -220,6 +227,33 @@ mod tests { _ => panic!("Could not convert '{}' to Date", tc.date_str), } } + + fn test_array(tc: &TestCase) + where + A: From> + Array + 'static, + { + let date_array = A::from(vec![tc.date_str]); + let to_date_result = + ToDateFunc::new().invoke(&[ColumnarValue::Array(Arc::new(date_array))]); + + match to_date_result { + Ok(ColumnarValue::Array(a)) => { + assert_eq!(a.len(), 1); + + let expected = Date32Type::parse_formatted(tc.date_str, "%Y-%m-%d"); + let mut builder = Date32Array::builder(4); + builder.append_value(expected.unwrap()); + + assert_eq!( + &builder.finish() as &dyn Array, + a.as_ref(), + "{}: to_date created wrong value", + tc.name + ); + } + _ => panic!("Could not convert '{}' to Date", tc.date_str), + } + } } #[test] @@ -271,12 +305,26 @@ mod tests { ]; for tc in &test_cases { - let formatted_date_scalar = - ScalarValue::Utf8(Some(tc.formatted_date.to_string())); + test_scalar(ScalarValue::Utf8(Some(tc.formatted_date.to_string())), tc); + test_scalar( + ScalarValue::LargeUtf8(Some(tc.formatted_date.to_string())), + tc, + ); + test_scalar( + ScalarValue::Utf8View(Some(tc.formatted_date.to_string())), + tc, + ); + + test_array::>(tc); + test_array::>(tc); + test_array::(tc); + } + + fn test_scalar(sv: ScalarValue, tc: &TestCase) { let format_scalar = ScalarValue::Utf8(Some(tc.format_str.to_string())); let to_date_result = ToDateFunc::new().invoke(&[ - ColumnarValue::Scalar(formatted_date_scalar), + ColumnarValue::Scalar(sv), ColumnarValue::Scalar(format_scalar), ]); @@ -291,6 +339,41 @@ mod tests { ), } } + + fn test_array(tc: &TestCase) + where + A: From> + Array + 'static, + { + let date_array = A::from(vec![tc.formatted_date]); + let format_array = A::from(vec![tc.format_str]); + + let to_date_result = ToDateFunc::new().invoke(&[ + ColumnarValue::Array(Arc::new(date_array)), + ColumnarValue::Array(Arc::new(format_array)), + ]); + + match to_date_result { + Ok(ColumnarValue::Array(a)) => { + 
assert_eq!(a.len(), 1); + + let expected = Date32Type::parse_formatted(tc.date_str, "%Y-%m-%d"); + let mut builder = Date32Array::builder(4); + builder.append_value(expected.unwrap()); + + assert_eq!( + &builder.finish() as &dyn Array, a.as_ref(), + "{}: to_date created wrong value for date '{}' with format string '{}'", + tc.name, + tc.formatted_date, + tc.format_str + ); + } + _ => panic!( + "Could not convert '{}' with format string '{}'to Date: {:?}", + tc.formatted_date, tc.format_str, to_date_result + ), + } + } } #[test] diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs index cbb6f37603d2..b17c9a005d1f 100644 --- a/datafusion/functions/src/datetime/to_timestamp.rs +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ -18,7 +18,7 @@ use std::any::Any; use std::sync::Arc; -use arrow::datatypes::DataType::Timestamp; +use arrow::datatypes::DataType::*; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::datatypes::{ ArrowTimestampType, DataType, TimeUnit, TimestampMicrosecondType, @@ -162,16 +162,16 @@ impl ScalarUDFImpl for ToTimestampFunc { } match args[0].data_type() { - DataType::Int32 | DataType::Int64 => args[0] + Int32 | Int64 => args[0] .cast_to(&Timestamp(Second, None), None)? .cast_to(&Timestamp(Nanosecond, None), None), - DataType::Null | DataType::Float64 | Timestamp(_, None) => { + Null | Float64 | Timestamp(_, None) => { args[0].cast_to(&Timestamp(Nanosecond, None), None) } - DataType::Timestamp(_, Some(tz)) => { + Timestamp(_, Some(tz)) => { args[0].cast_to(&Timestamp(Nanosecond, Some(tz)), None) } - DataType::Utf8 => { + Utf8View | LargeUtf8 | Utf8 => { to_timestamp_impl::(args, "to_timestamp") } other => { @@ -215,13 +215,11 @@ impl ScalarUDFImpl for ToTimestampSecondsFunc { } match args[0].data_type() { - DataType::Null | DataType::Int32 | DataType::Int64 | Timestamp(_, None) => { + Null | Int32 | Int64 | Timestamp(_, None) => { args[0].cast_to(&Timestamp(Second, None), None) } - DataType::Timestamp(_, Some(tz)) => { - args[0].cast_to(&Timestamp(Second, Some(tz)), None) - } - DataType::Utf8 => { + Timestamp(_, Some(tz)) => args[0].cast_to(&Timestamp(Second, Some(tz)), None), + Utf8View | LargeUtf8 | Utf8 => { to_timestamp_impl::(args, "to_timestamp_seconds") } other => { @@ -265,13 +263,13 @@ impl ScalarUDFImpl for ToTimestampMillisFunc { } match args[0].data_type() { - DataType::Null | DataType::Int32 | DataType::Int64 | Timestamp(_, None) => { + Null | Int32 | Int64 | Timestamp(_, None) => { args[0].cast_to(&Timestamp(Millisecond, None), None) } - DataType::Timestamp(_, Some(tz)) => { + Timestamp(_, Some(tz)) => { args[0].cast_to(&Timestamp(Millisecond, Some(tz)), None) } - DataType::Utf8 => { + Utf8View | LargeUtf8 | Utf8 => { to_timestamp_impl::(args, "to_timestamp_millis") } other => { @@ -315,13 +313,13 @@ impl ScalarUDFImpl for ToTimestampMicrosFunc { } match args[0].data_type() { - DataType::Null | DataType::Int32 | DataType::Int64 | Timestamp(_, None) => { + Null | Int32 | Int64 | Timestamp(_, None) => { args[0].cast_to(&Timestamp(Microsecond, None), None) } - DataType::Timestamp(_, Some(tz)) => { + Timestamp(_, Some(tz)) => { args[0].cast_to(&Timestamp(Microsecond, Some(tz)), None) } - DataType::Utf8 => { + Utf8View | LargeUtf8 | Utf8 => { to_timestamp_impl::(args, "to_timestamp_micros") } other => { @@ -365,13 +363,13 @@ impl ScalarUDFImpl for ToTimestampNanosFunc { } match args[0].data_type() { - DataType::Null | DataType::Int32 | DataType::Int64 | 
Timestamp(_, None) => { + Null | Int32 | Int64 | Timestamp(_, None) => { args[0].cast_to(&Timestamp(Nanosecond, None), None) } - DataType::Timestamp(_, Some(tz)) => { + Timestamp(_, Some(tz)) => { args[0].cast_to(&Timestamp(Nanosecond, Some(tz)), None) } - DataType::Utf8 => { + Utf8View | LargeUtf8 | Utf8 => { to_timestamp_impl::(args, "to_timestamp_nanos") } other => { diff --git a/datafusion/sqllogictest/test_files/dates.slt b/datafusion/sqllogictest/test_files/dates.slt index 1ef56b1a7e11..4425eee33373 100644 --- a/datafusion/sqllogictest/test_files/dates.slt +++ b/datafusion/sqllogictest/test_files/dates.slt @@ -194,6 +194,14 @@ create table ts_utf8_data(ts varchar(100), format varchar(100)) as values ('1926632005', '%s'), ('2000-01-01T01:01:01+07:00', '%+'); +statement ok +create table ts_largeutf8_data as +select arrow_cast(ts, 'LargeUtf8') as ts, arrow_cast(format, 'LargeUtf8') as format from ts_utf8_data; + +statement ok +create table ts_utf8view_data as +select arrow_cast(ts, 'Utf8View') as ts, arrow_cast(format, 'Utf8View') as format from ts_utf8_data; + # verify date data using tables with formatting options query D SELECT to_date(t.ts, t.format) from ts_utf8_data as t @@ -204,6 +212,24 @@ SELECT to_date(t.ts, t.format) from ts_utf8_data as t 2031-01-19 1999-12-31 +query D +SELECT to_date(t.ts, t.format) from ts_largeutf8_data as t +---- +2020-09-08 +2031-01-19 +2020-09-08 +2031-01-19 +1999-12-31 + +query D +SELECT to_date(t.ts, t.format) from ts_utf8view_data as t +---- +2020-09-08 +2031-01-19 +2020-09-08 +2031-01-19 +1999-12-31 + # verify date data using tables with formatting options query D SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t @@ -214,6 +240,24 @@ SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') 2031-01-19 1999-12-31 +query D +SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') from ts_largeutf8_data as t +---- +2020-09-08 +2031-01-19 +2020-09-08 +2031-01-19 +1999-12-31 + +query D +SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8view_data as t +---- +2020-09-08 +2031-01-19 +2020-09-08 +2031-01-19 +1999-12-31 + # verify date data using tables with formatting options where at least one column cannot be parsed query error Error parsing timestamp from '1926632005' using format '%d-%m-%Y %H:%M:%S%#z': input contains invalid characters SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t @@ -228,6 +272,24 @@ SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', 2031-01-19 1999-12-31 +query D +SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+') from ts_largeutf8_data as t +---- +2020-09-08 +2031-01-19 +2020-09-08 +2031-01-19 +1999-12-31 + +query D +SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+') from ts_utf8view_data as t +---- +2020-09-08 +2031-01-19 +2020-09-08 +2031-01-19 +1999-12-31 + # timestamp data using tables with formatting options in an array is not supported at this time query error function unsupported data type at index 1: SELECT to_date(t.ts, make_array('%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+')) from ts_utf8_data as t diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 2f2a81eb17c7..997dca719147 100644 --- 
a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -901,6 +901,26 @@ logical_plan 01)Projection: find_in_set(test.column1_utf8view, Utf8View("a,b,c,d")) AS c 02)--TableScan: test projection=[column1_utf8view] +## Ensure no casts for to_date +query TT +EXPLAIN SELECT + to_date(column1_utf8view, 'a,b,c,d') as c +FROM test; +---- +logical_plan +01)Projection: to_date(test.column1_utf8view, Utf8("a,b,c,d")) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for to_timestamp +query TT +EXPLAIN SELECT + to_timestamp(column1_utf8view, 'a,b,c,d') as c +FROM test; +---- +logical_plan +01)Projection: to_timestamp(test.column1_utf8view, Utf8("a,b,c,d")) AS c +02)--TableScan: test projection=[column1_utf8view] + ## Ensure no casts for binary operators # `~` operator (regex match) query TT diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 7a7a8a8703ec..a680e0db522d 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -2191,6 +2191,14 @@ create table ts_utf8_data(ts varchar(100), format varchar(100)) as values ('1926632005', '%s'), ('2000-01-01T01:01:01+07:00', '%+'); +statement ok +create table ts_largeutf8_data as +select arrow_cast(ts, 'LargeUtf8') as ts, arrow_cast(format, 'LargeUtf8') as format from ts_utf8_data; + +statement ok +create table ts_utf8view_data as +select arrow_cast(ts, 'Utf8View') as ts, arrow_cast(format, 'Utf8View') as format from ts_utf8_data; + # verify timestamp data using tables with formatting options query P SELECT to_timestamp(t.ts, t.format) from ts_utf8_data as t @@ -2201,9 +2209,84 @@ SELECT to_timestamp(t.ts, t.format) from ts_utf8_data as t 2031-01-19T23:33:25 1999-12-31T18:01:01 +query PPPPP +SELECT to_timestamp(t.ts, t.format), + to_timestamp_seconds(t.ts, t.format), + to_timestamp_millis(t.ts, t.format), + to_timestamp_micros(t.ts, t.format), + to_timestamp_nanos(t.ts, t.format) + from ts_largeutf8_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, t.format), + to_timestamp_seconds(t.ts, t.format), + to_timestamp_millis(t.ts, t.format), + to_timestamp_micros(t.ts, t.format), + to_timestamp_nanos(t.ts, t.format) + from ts_utf8view_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + # verify timestamp data using tables with formatting options +query PPPPP +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + 
to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') + from ts_utf8_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') + from ts_largeutf8_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z'), + to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') + from ts_utf8view_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +# verify timestamp data using tables with formatting options where at least one column cannot be parsed +query error Error parsing timestamp from '1926632005' using format '%d-%m-%Y %H:%M:%S%#z': input contains invalid characters +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t + +# verify timestamp data using tables with formatting options where one of the formats is invalid query P -SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+') from ts_utf8_data as 
t ---- 2020-09-08T12:00:00 2031-01-19T18:33:25 @@ -2211,13 +2294,17 @@ SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S 2031-01-19T23:33:25 1999-12-31T18:01:01 -# verify timestamp data using tables with formatting options where at least one column cannot be parsed -query error Error parsing timestamp from '1926632005' using format '%d-%m-%Y %H:%M:%S%#z': input contains invalid characters -SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t +query P +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+') from ts_largeutf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 -# verify timestamp data using tables with formatting options where one of the formats is invalid query P -SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+') from ts_utf8_data as t +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+') from ts_utf8view_data as t ---- 2020-09-08T12:00:00 2031-01-19T18:33:25