From a23f50768deea4757593233056ad10cf847c35ff Mon Sep 17 00:00:00 2001 From: Val Lorentz Date: Sun, 31 Mar 2024 13:40:21 +0200 Subject: [PATCH] parquet: Add tests for page pruning on unsigned integers (#9888) --- datafusion/core/tests/parquet/mod.rs | 43 ++++++- datafusion/core/tests/parquet/page_pruning.rs | 114 ++++++++++++++++++ 2 files changed, 155 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 368637d024e6..1da86a0363a5 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -22,7 +22,7 @@ use arrow::{ Array, ArrayRef, BinaryArray, Date32Array, Date64Array, FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - TimestampSecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, @@ -65,6 +65,7 @@ enum Scenario { Dates, Int, Int32Range, + UInt, Float64, Decimal, DecimalBloomFilterInt32, @@ -387,7 +388,7 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch { .unwrap() } -/// Return record batch with i32 sequence +/// Return record batch with i8, i16, i32, and i64 sequences /// /// Columns are named /// "i8" -> Int8Array @@ -417,6 +418,36 @@ fn make_int_batches(start: i8, end: i8) -> RecordBatch { .unwrap() } +/// Return record batch with u8, u16, u32, and u64 sequences +/// +/// Columns are named +/// "u8" -> UInt8Array +/// "u16" -> UInt16Array +/// "u32" -> UInt32Array +/// "u64" -> UInt64Array +fn make_uint_batches(start: u8, end: u8) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("u8", DataType::UInt8, true), + Field::new("u16", DataType::UInt16, true), + Field::new("u32", DataType::UInt32, true), + Field::new("u64", DataType::UInt64, true), + ])); + let v8: Vec<u8> = (start..end).collect(); + let 
v16: Vec<u16> = (start as _..end as _).collect(); + let v32: Vec<u32> = (start as _..end as _).collect(); + let v64: Vec<u64> = (start as _..end as _).collect(); + RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt8Array::from(v8)) as ArrayRef, + Arc::new(UInt16Array::from(v16)) as ArrayRef, + Arc::new(UInt32Array::from(v32)) as ArrayRef, + Arc::new(UInt64Array::from(v64)) as ArrayRef, + ], + ) + .unwrap() +} + fn make_int32_range(start: i32, end: i32) -> RecordBatch { let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); let v = vec![start, end]; @@ -620,6 +651,14 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> { Scenario::Int32Range => { vec![make_int32_range(0, 10), make_int32_range(200000, 300000)] } + Scenario::UInt => { + vec![ + make_uint_batches(0, 5), + make_uint_batches(1, 6), + make_uint_batches(5, 10), + make_uint_batches(250, 255), + ] + } Scenario::Float64 => { vec![ make_f64_batch(vec![-5.0, -4.0, -3.0, -2.0, -1.0]), diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index e9e99cd3f88e..da9617f13ee9 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -515,6 +515,120 @@ int_tests!(16); int_tests!(32); int_tests!(64); +macro_rules! uint_tests { + ($bits:expr) => { + paste::item! 
{ + #[tokio::test] + // null count min max + // page-0 0 0 4 + // page-1 0 1 5 + // page-2 0 5 9 + // page-3 0 250 254 + async fn []() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} < 6", $bits), + Some(0), + Some(5), + 11, + ) + .await; + } + + #[tokio::test] + async fn []() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} > 253", $bits), + Some(0), + Some(15), + 1, + ) + .await; + } + + #[tokio::test] + async fn []() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} = 6", $bits), + Some(0), + Some(15), + 1, + ) + .await; + } + + #[tokio::test] + async fn []() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6", $bits, $bits), + Some(0), + Some(15), + 1, + ) + .await; + } + + #[tokio::test] + async fn []() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where power(u{}, 2) = 25", $bits), + Some(0), + Some(0), + 2, + ) + .await; + } + + #[tokio::test] + async fn []() { + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{}+1 = 6", $bits), + Some(0), + Some(0), + 2, + ) + .await; + } + + #[tokio::test] + async fn []() { + // result of sql "SELECT * FROM t where in (6)" + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} in (6)", $bits), + Some(0), + Some(15), + 1, + ) + .await; + } + + #[tokio::test] + async fn []() { + // result of sql "SELECT * FROM t where not in (6)" prune nothing + test_prune( + Scenario::UInt, + &format!("SELECT * FROM t where u{} not in (6)", $bits), + Some(0), + Some(0), + 19, + ) + .await; + } + } + } +} + +uint_tests!(8); +uint_tests!(16); +uint_tests!(32); +uint_tests!(64); + #[tokio::test] // null count min max // page-0 0 -5.0 -1.0