diff --git a/rust/src/delta_datafusion.rs b/rust/src/delta_datafusion.rs
index c45648a772..465347d2a6 100644
--- a/rust/src/delta_datafusion.rs
+++ b/rust/src/delta_datafusion.rs
@@ -57,7 +57,7 @@ use object_store::{path::Path, ObjectMeta};
 use url::Url;
 
 use crate::builder::ensure_table_uri;
-use crate::{action, open_table, open_table_with_storage_options};
+use crate::{action, open_table, open_table_with_storage_options, SchemaDataType};
 use crate::{schema, DeltaTableBuilder};
 use crate::{DeltaResult, Invariant};
 use crate::{DeltaTable, DeltaTableError};
@@ -242,6 +242,13 @@ fn get_prune_stats(table: &DeltaTable, column: &Column, get_max: bool) -> Option
         .ok()
         .map(|s| s.get_field_with_name(&column.name).ok())??;
 
+    // See issue #1214. The binary type has no natural order, which DataFusion requires for pruning.
+    if let SchemaDataType::primitive(t) = &field.get_type() {
+        if t == "binary" {
+            return None;
+        }
+    }
+
     let data_type = field.get_type().try_into().ok()?;
     let partition_columns = &table.get_metadata().ok()?.partition_columns;
 
diff --git a/rust/tests/datafusion_test.rs b/rust/tests/datafusion_test.rs
index c70d56a813..23fd805afe 100644
--- a/rust/tests/datafusion_test.rs
+++ b/rust/tests/datafusion_test.rs
@@ -485,7 +485,10 @@ async fn test_files_scanned() -> Result<()> {
             non_existent_value,
         } = test.to_owned();
         let column = column.to_owned();
-        //TODO: The following types don't have proper stats written.
+        // TODO: The following types don't have proper stats written.
+        // See issue #1208 for the decimal type.
+        // See issue #1209 for dates.
+        // Min and max are not calculated for binary columns; this matches the Spark writer.
         if column == "decimal" || column == "date" || column == "binary" {
             continue;
         }
@@ -539,14 +542,10 @@ async fn test_files_scanned() -> Result<()> {
             file3_value,
             non_existent_value,
         } = test;
-        //TODO: Float, timestamp, decimal, date, binary partitions are not supported by the writer
-        if column == "float32"
-            || column == "float64"
-            || column == "timestamp"
-            || column == "decimal"
-            || column == "date"
-            || column == "binary"
-        {
+        // TODO: Float and decimal partitions are not supported by the writer.
+        // Binary fails because Arrow does not implement a natural order for binary values,
+        // so the current DataFusion pruning implementation cannot prune binary columns. See issue #1214.
+        if column == "float32" || column == "float64" || column == "decimal" || column == "binary" {
             continue;
         }
         println!("test {}", column);