Skip to content

Commit

Permalink
Update TO_DATE, TO_TIMESTAMP scalar functions to support LargeUtf8, Utf8View (#12929)
Browse files Browse the repository at this point in the history

* Update the to_date and to_timestamp* UDFs to support LargeUtf8 and Utf8View; the benchmark is updated as well

* datetime depends on string expressions until #12898 lands

* Update to reflect the StringArrayType move to a common path

* Update datafusion/functions/src/datetime/common.rs

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
  • Loading branch information
Omega359 and alamb authored Oct 16, 2024
1 parent 90720c0 commit 435f959
Show file tree
Hide file tree
Showing 7 changed files with 584 additions and 190 deletions.
243 changes: 169 additions & 74 deletions datafusion/functions/benches/to_timestamp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,123 @@ extern crate criterion;
use std::sync::Arc;

use arrow::array::builder::StringBuilder;
use arrow::array::ArrayRef;
use arrow::array::{ArrayRef, StringArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use criterion::{black_box, criterion_group, criterion_main, Criterion};

use datafusion_expr::ColumnarValue;
use datafusion_functions::datetime::to_timestamp;

/// Builds the `Utf8` input column shared by the `to_timestamp_no_formats_*`
/// benchmarks.
///
/// The ten samples mix `T` and space separators, optional fractional
/// seconds, optional UTC offsets, and one date-only value.
fn data() -> StringArray {
    const SAMPLES: [&str; 10] = [
        "1997-01-31T09:26:56.123Z",
        "1997-01-31T09:26:56.123-05:00",
        "1997-01-31 09:26:56.123-05:00",
        "2023-01-01 04:05:06.789 -08",
        "1997-01-31T09:26:56.123",
        "1997-01-31 09:26:56.123",
        "1997-01-31 09:26:56",
        "1997-01-31 13:26:56",
        "1997-01-31 13:26:56+04:00",
        "1997-01-31",
    ];

    StringArray::from(SAMPLES.to_vec())
}

/// Builds the four `Utf8` columns shared by the `to_timestamp_with_formats_*`
/// benchmarks.
///
/// Returns `(inputs, format1, format2, format3)`, each holding ten rows:
/// * `inputs`  - the timestamp strings to parse,
/// * `format1` - `"%+"` on every row,
/// * `format2` - `"%c"` on every row,
/// * `format3` - a row-specific chrono-style format string.
fn data_with_formats() -> (StringArray, StringArray, StringArray, StringArray) {
    // Each entry pairs an input value with its row-specific third format.
    const ROWS: [(&str, &str); 10] = [
        ("1997-01-31T09:26:56.123Z", "%Y-%m-%dT%H:%M:%S%.f%Z"),
        ("1997-01-31T09:26:56.123-05:00", "%Y-%m-%dT%H:%M:%S%.f%z"),
        ("1997-01-31 09:26:56.123-05:00", "%Y-%m-%d %H:%M:%S%.f%Z"),
        ("2023-01-01 04:05:06.789 -08", "%Y-%m-%d %H:%M:%S%.f %#z"),
        ("1997-01-31T09:26:56.123", "%Y-%m-%dT%H:%M:%S%.f"),
        ("1997-01-31 09:26:56.123", "%Y-%m-%d %H:%M:%S%.f"),
        ("1997-01-31 09:26:56", "%Y-%m-%d %H:%M:%S"),
        ("1997-01-31 092656", "%Y-%m-%d %H%M%S"),
        ("1997-01-31 092656+04:00", "%Y-%m-%d %H%M%S%:z"),
        ("Sun Jul 8 00:34:60 2001", "%Y-%m-%d 00:00:00"),
    ];

    let mut values = StringBuilder::new();
    let mut first_formats = StringBuilder::with_capacity(2, 10);
    let mut second_formats = StringBuilder::with_capacity(2, 10);
    let mut third_formats = StringBuilder::with_capacity(2, 10);

    // Append one row to every column per iteration so the columns stay aligned.
    for &(value, third_format) in &ROWS {
        values.append_value(value);
        first_formats.append_value("%+");
        second_formats.append_value("%c");
        third_formats.append_value(third_format);
    }

    let inputs = values.finish();
    let format1 = first_formats.finish();
    let format2 = second_formats.finish();
    let format3 = third_formats.finish();

    (inputs, format1, format2, format3)
}
fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("to_timestamp_no_formats", |b| {
let mut inputs = StringBuilder::new();
inputs.append_value("1997-01-31T09:26:56.123Z");
inputs.append_value("1997-01-31T09:26:56.123-05:00");
inputs.append_value("1997-01-31 09:26:56.123-05:00");
inputs.append_value("2023-01-01 04:05:06.789 -08");
inputs.append_value("1997-01-31T09:26:56.123");
inputs.append_value("1997-01-31 09:26:56.123");
inputs.append_value("1997-01-31 09:26:56");
inputs.append_value("1997-01-31 13:26:56");
inputs.append_value("1997-01-31 13:26:56+04:00");
inputs.append_value("1997-01-31");

let string_array = ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef);
c.bench_function("to_timestamp_no_formats_utf8", |b| {
let string_array = ColumnarValue::Array(Arc::new(data()) as ArrayRef);

b.iter(|| {
black_box(
to_timestamp()
.invoke(&[string_array.clone()])
.expect("to_timestamp should work on valid values"),
)
})
});

c.bench_function("to_timestamp_no_formats_largeutf8", |b| {
let data = cast(&data(), &DataType::LargeUtf8).unwrap();
let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef);

b.iter(|| {
black_box(
to_timestamp()
.invoke(&[string_array.clone()])
.expect("to_timestamp should work on valid values"),
)
})
});

c.bench_function("to_timestamp_no_formats_utf8view", |b| {
let data = cast(&data(), &DataType::Utf8View).unwrap();
let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef);

b.iter(|| {
black_box(
Expand All @@ -51,67 +147,66 @@ fn criterion_benchmark(c: &mut Criterion) {
})
});

c.bench_function("to_timestamp_with_formats", |b| {
let mut inputs = StringBuilder::new();
let mut format1_builder = StringBuilder::with_capacity(2, 10);
let mut format2_builder = StringBuilder::with_capacity(2, 10);
let mut format3_builder = StringBuilder::with_capacity(2, 10);

inputs.append_value("1997-01-31T09:26:56.123Z");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z");

inputs.append_value("1997-01-31T09:26:56.123-05:00");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z");

inputs.append_value("1997-01-31 09:26:56.123-05:00");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z");

inputs.append_value("2023-01-01 04:05:06.789 -08");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z");

inputs.append_value("1997-01-31T09:26:56.123");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f");

inputs.append_value("1997-01-31 09:26:56.123");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f");

inputs.append_value("1997-01-31 09:26:56");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%d %H:%M:%S");

inputs.append_value("1997-01-31 092656");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%d %H%M%S");

inputs.append_value("1997-01-31 092656+04:00");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%d %H%M%S%:z");

inputs.append_value("Sun Jul 8 00:34:60 2001");
format1_builder.append_value("%+");
format2_builder.append_value("%c");
format3_builder.append_value("%Y-%m-%d 00:00:00");
c.bench_function("to_timestamp_with_formats_utf8", |b| {
let (inputs, format1, format2, format3) = data_with_formats();

let args = [
ColumnarValue::Array(Arc::new(inputs) as ArrayRef),
ColumnarValue::Array(Arc::new(format1) as ArrayRef),
ColumnarValue::Array(Arc::new(format2) as ArrayRef),
ColumnarValue::Array(Arc::new(format3) as ArrayRef),
];
b.iter(|| {
black_box(
to_timestamp()
.invoke(&args.clone())
.expect("to_timestamp should work on valid values"),
)
})
});

c.bench_function("to_timestamp_with_formats_largeutf8", |b| {
let (inputs, format1, format2, format3) = data_with_formats();

let args = [
ColumnarValue::Array(
Arc::new(cast(&inputs, &DataType::LargeUtf8).unwrap()) as ArrayRef
),
ColumnarValue::Array(
Arc::new(cast(&format1, &DataType::LargeUtf8).unwrap()) as ArrayRef
),
ColumnarValue::Array(
Arc::new(cast(&format2, &DataType::LargeUtf8).unwrap()) as ArrayRef
),
ColumnarValue::Array(
Arc::new(cast(&format3, &DataType::LargeUtf8).unwrap()) as ArrayRef
),
];
b.iter(|| {
black_box(
to_timestamp()
.invoke(&args.clone())
.expect("to_timestamp should work on valid values"),
)
})
});

c.bench_function("to_timestamp_with_formats_utf8view", |b| {
let (inputs, format1, format2, format3) = data_with_formats();

let args = [
ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef),
ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef),
ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef),
ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef),
ColumnarValue::Array(
Arc::new(cast(&inputs, &DataType::Utf8View).unwrap()) as ArrayRef
),
ColumnarValue::Array(
Arc::new(cast(&format1, &DataType::Utf8View).unwrap()) as ArrayRef
),
ColumnarValue::Array(
Arc::new(cast(&format2, &DataType::Utf8View).unwrap()) as ArrayRef
),
ColumnarValue::Array(
Arc::new(cast(&format3, &DataType::Utf8View).unwrap()) as ArrayRef
),
];
b.iter(|| {
black_box(
Expand Down
Loading

0 comments on commit 435f959

Please sign in to comment.