From 5abef4141390dc7e8b89f4c82f7a72c21ab1a340 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 18 Sep 2024 22:17:31 +0800 Subject: [PATCH] Improve benchmark for ltrim (#12513) * complete benchmark for ltrim. * improve benchmarks. * remove unused param. * fix bench. * refactor to remove repeated codes. * fix clippy. * Update datafusion/functions/benches/ltrim.rs Co-authored-by: Andrew Lamb * improve codes and add more comments. * fix clippy. --------- Co-authored-by: Andrew Lamb --- datafusion/functions/benches/ltrim.rs | 223 ++++++++++++++++++++++++-- 1 file changed, 206 insertions(+), 17 deletions(-) diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index 01acb9de3381..b3fa5ef4fdff 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -17,32 +17,221 @@ extern crate criterion; -use arrow::array::{ArrayRef, StringArray}; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray}; +use criterion::{ + black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup, + Criterion, SamplingMode, +}; use datafusion_common::ScalarValue; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, ScalarUDF}; use datafusion_functions::string; -use std::sync::Arc; +use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng}; +use std::{fmt, sync::Arc}; -fn create_args(size: usize, characters: &str) -> Vec { - let iter = - std::iter::repeat(format!("{}datafusion{}", characters, characters)).take(size); - let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef; +pub fn seedable_rng() -> StdRng { + StdRng::seed_from_u64(42) +} + +#[derive(Clone, Copy)] +pub enum StringArrayType { + Utf8View, + Utf8, + LargeUtf8, +} + +impl fmt::Display for StringArrayType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + StringArrayType::Utf8View => f.write_str("string_view"), + StringArrayType::Utf8 => f.write_str("string"), + StringArrayType::LargeUtf8 => f.write_str("large_string"), + } + } +} + +/// returns an array of strings, and `characters` as a ScalarValue +pub fn create_string_array_and_characters( + size: usize, + characters: &str, + trimmed: &str, + remaining_len: usize, + string_array_type: StringArrayType, +) -> (ArrayRef, ScalarValue) { + let rng = &mut seedable_rng(); + + // Create `size` rows: + // - 10% rows will be `None` + // - Other 90% will be strings with same `remaining_len` lengths + // We will build the string array on it later. + let string_iter = (0..size).map(|_| { + if rng.gen::() < 0.1 { + None + } else { + let mut value = trimmed.as_bytes().to_vec(); + let generated = rng.sample_iter(&Alphanumeric).take(remaining_len); + value.extend(generated); + Some(String::from_utf8(value).unwrap()) + } + }); + + // Build the target `string array` and `characters` according to `string_array_type` + match string_array_type { + StringArrayType::Utf8View => ( + Arc::new(string_iter.collect::()), + ScalarValue::Utf8View(Some(characters.to_string())), + ), + StringArrayType::Utf8 => ( + Arc::new(string_iter.collect::()), + ScalarValue::Utf8(Some(characters.to_string())), + ), + StringArrayType::LargeUtf8 => ( + Arc::new(string_iter.collect::()), + ScalarValue::LargeUtf8(Some(characters.to_string())), + ), + } +} + +/// Create args for the ltrim benchmark +/// Inputs: +/// - size: rows num of the test array +/// - characters: the characters we need to trim +/// - trimmed: the part in the testing string that will be trimmed +/// - remaining_len: the len of the remaining part of testing string after trimming +/// - string_array_type: the method used to store the testing strings +/// +/// Outputs: +/// - testing string array +/// - trimmed characters +/// +fn create_args( + size: usize, + characters: &str, + trimmed: &str, + remaining_len: usize, + string_array_type: StringArrayType, +) -> Vec { + let (string_array, pattern) = create_string_array_and_characters( + size, + characters, + trimmed, + remaining_len, + string_array_type, + ); vec![ - ColumnarValue::Array(array), - ColumnarValue::Scalar(ScalarValue::Utf8(Some(characters.to_string()))), + ColumnarValue::Array(string_array), + ColumnarValue::Scalar(pattern), ] } +#[allow(clippy::too_many_arguments)] +fn run_with_string_type( + group: &mut BenchmarkGroup<'_, M>, + ltrim: &ScalarUDF, + size: usize, + len: usize, + characters: &str, + trimmed: &str, + remaining_len: usize, + string_type: StringArrayType, +) { + let args = create_args(size, characters, trimmed, remaining_len, string_type); + group.bench_function( + format!( + "{string_type} [size={size}, len_before={len}, len_after={remaining_len}]", + ), + |b| b.iter(|| black_box(ltrim.invoke(&args))), + ); +} + +#[allow(clippy::too_many_arguments)] +fn run_one_group( + c: &mut Criterion, + group_name: &str, + ltrim: &ScalarUDF, + string_types: &[StringArrayType], + size: usize, + len: usize, + characters: &str, + trimmed: &str, + remaining_len: usize, +) { + let mut group = c.benchmark_group(group_name); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + for string_type in string_types { + run_with_string_type( + &mut group, + ltrim, + size, + len, + characters, + trimmed, + remaining_len, + *string_type, + ); + } + + group.finish(); +} + fn criterion_benchmark(c: &mut Criterion) { let ltrim = string::ltrim(); - for char in ["\"", "Header:"] { - for size in [1024, 4096, 8192] { - let args = create_args(size, char); - c.bench_function(&format!("ltrim {}: {}", char, size), |b| { - b.iter(|| black_box(ltrim.invoke(&args))) - }); - } + let characters = ",!()"; + + let string_types = [ + StringArrayType::Utf8View, + StringArrayType::Utf8, + StringArrayType::LargeUtf8, + ]; + for size in [1024, 4096, 8192] { + // len=12, trimmed_len=4, len_after_ltrim=8 + let len = 12; + let trimmed = characters; + let remaining_len = len - trimmed.len(); + run_one_group( + c, + "INPUT LEN <= 12", + <rim, + &string_types, + size, + len, + characters, + trimmed, + remaining_len, + ); + + // len=64, trimmed_len=4, len_after_ltrim=60 + let len = 64; + let trimmed = characters; + let remaining_len = len - trimmed.len(); + run_one_group( + c, + "INPUT LEN > 12, OUTPUT LEN > 12", + <rim, + &string_types, + size, + len, + characters, + trimmed, + remaining_len, + ); + + // len=64, trimmed_len=56, len_after_ltrim=8 + let len = 64; + let trimmed = characters.repeat(15); + let remaining_len = len - trimmed.len(); + run_one_group( + c, + "INPUT LEN > 12, OUTPUT LEN <= 12", + <rim, + &string_types, + size, + len, + characters, + &trimmed, + remaining_len, + ); } }