diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 95b78fcc6d24..62b3185314fd 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -274,6 +274,7 @@ message PartitionedFile { string path = 1; uint64 size = 2; uint64 last_modified_ns = 3; + repeated ScalarValue partition_values = 4; } message CsvFormat { @@ -294,7 +295,7 @@ message ListingTableScanNode { ProjectionColumns projection = 4; Schema schema = 5; repeated LogicalExprNode filters = 6; - repeated string partitions = 7; + repeated string table_partition_cols = 7; bool collect_stat = 8; uint32 target_partitions = 9; oneof FileFormatType { @@ -613,33 +614,28 @@ message ScanLimit { uint32 limit = 1; } -message ParquetScanExecNode { +message FileScanExecConf { repeated FileGroup file_groups = 1; Schema schema = 2; - uint32 batch_size = 4; - repeated uint32 projection = 6; - ScanLimit limit = 7; - Statistics statistics = 8; + uint32 batch_size = 3; + repeated uint32 projection = 4; + ScanLimit limit = 5; + Statistics statistics = 6; + repeated string table_partition_cols = 7; +} + +message ParquetScanExecNode { + FileScanExecConf base_conf = 1; } message CsvScanExecNode { - repeated FileGroup file_groups = 1; - Schema schema = 2; - bool has_header = 3; - uint32 batch_size = 4; - string delimiter = 5; - repeated uint32 projection = 6; - ScanLimit limit = 7; - Statistics statistics = 8; + FileScanExecConf base_conf = 1; + bool has_header = 2; + string delimiter = 3; } message AvroScanExecNode { - repeated FileGroup file_groups = 1; - Schema schema = 2; - uint32 batch_size = 4; - repeated uint32 projection = 6; - ScanLimit limit = 7; - Statistics statistics = 8; + FileScanExecConf base_conf = 1; } enum PartitionMode { diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 26231c5e25c7..259fcb3482a7 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -191,7 +191,7 @@ impl TryInto for &protobuf::LogicalPlanNode { let options = ListingOptions { file_extension: scan.file_extension.clone(), format: file_format, - partitions: scan.partitions.clone(), + table_partition_cols: scan.table_partition_cols.clone(), collect_stat: scan.collect_stat, target_partitions: scan.target_partitions as usize, }; diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index ae25d72d57f9..1d1d48e8a4a3 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -755,8 +755,11 @@ impl TryInto for &LogicalPlan { .options() .file_extension .clone(), - partitions: listing_table.options().partitions.clone(), - path: listing_table.path().to_owned(), + table_partition_cols: listing_table + .options() + .table_partition_cols + .clone(), + path: listing_table.table_path().to_owned(), schema: Some(schema), projection, filters, diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index dce354ac69fa..99d2de03258f 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -30,7 +30,7 @@ use crate::serde::protobuf::ShuffleReaderPartition; use crate::serde::scheduler::PartitionLocation; use crate::serde::{from_proto_binary_op, proto_error, 
protobuf, str_to_byte}; use crate::{convert_box_required, convert_required, into_required}; -use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; +use chrono::{TimeZone, Utc}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::catalog::catalog::{ CatalogList, CatalogProvider, MemoryCatalogList, MemoryCatalogProvider, @@ -46,7 +46,9 @@ use datafusion::logical_plan::{ }; use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateFunction}; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; -use datafusion::physical_plan::file_format::{AvroExec, CsvExec, ParquetExec}; +use datafusion::physical_plan::file_format::{ + AvroExec, CsvExec, ParquetExec, PhysicalPlanConfig, +}; use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::PartitionMode; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; @@ -118,64 +120,21 @@ impl TryInto> for &protobuf::PhysicalPlanNode { .try_into()?; Ok(Arc::new(FilterExec::try_new(predicate, input)?)) } - PhysicalPlanType::CsvScan(scan) => { - let schema = Arc::new(convert_required!(scan.schema)?); - let projection = scan.projection.iter().map(|i| *i as usize).collect(); - let statistics = convert_required!(scan.statistics)?; - - Ok(Arc::new(CsvExec::new( - Arc::new(LocalFileSystem {}), - scan.file_groups - .iter() - .map(|p| p.into()) - .collect::>>(), - statistics, - schema, - scan.has_header, - str_to_byte(&scan.delimiter)?, - Some(projection), - scan.batch_size as usize, - scan.limit.as_ref().map(|sl| sl.limit as usize), - ))) - } + PhysicalPlanType::CsvScan(scan) => Ok(Arc::new(CsvExec::new( + scan.base_conf.as_ref().unwrap().try_into()?, + scan.has_header, + str_to_byte(&scan.delimiter)?, + ))), PhysicalPlanType::ParquetScan(scan) => { - let schema = Arc::new(convert_required!(scan.schema)?); - let projection = scan.projection.iter().map(|i| *i as usize).collect(); - let statistics = convert_required!(scan.statistics)?; - Ok(Arc::new(ParquetExec::new( - Arc::new(LocalFileSystem {}), - scan.file_groups - .iter() - .map(|p| p.into()) - .collect::>>(), - statistics, - schema, - Some(projection), + scan.base_conf.as_ref().unwrap().try_into()?, // TODO predicate should be de-serialized None, - scan.batch_size as usize, - scan.limit.as_ref().map(|sl| sl.limit as usize), - ))) - } - PhysicalPlanType::AvroScan(scan) => { - let schema = Arc::new(convert_required!(scan.schema)?); - let projection = scan.projection.iter().map(|i| *i as usize).collect(); - let statistics = convert_required!(scan.statistics)?; - - Ok(Arc::new(AvroExec::new( - Arc::new(LocalFileSystem {}), - scan.file_groups - .iter() - .map(|p| p.into()) - .collect::>>(), - statistics, - schema, - Some(projection), - scan.batch_size as usize, - scan.limit.as_ref().map(|sl| sl.limit as usize), ))) } + PhysicalPlanType::AvroScan(scan) => Ok(Arc::new(AvroExec::new( + scan.base_conf.as_ref().unwrap().try_into()?, + ))), PhysicalPlanType::CoalesceBatches(coalesce_batches) => { let input: Arc = convert_box_required!(coalesce_batches.input)?; @@ -738,9 +697,11 @@ pub fn parse_protobuf_hash_partitioning( } } -impl From<&protobuf::PartitionedFile> for PartitionedFile { - fn from(val: &protobuf::PartitionedFile) -> Self { - PartitionedFile { +impl TryFrom<&protobuf::PartitionedFile> for PartitionedFile { + type Error = BallistaError; + + fn try_from(val: &protobuf::PartitionedFile) -> Result { + Ok(PartitionedFile { file_meta: FileMeta { sized_file: SizedFile { 
path: val.path.clone(), @@ -752,13 +713,23 @@ impl From<&protobuf::PartitionedFile> for PartitionedFile { Some(Utc.timestamp_nanos(val.last_modified_ns as i64)) }, }, - } + partition_values: val + .partition_values + .iter() + .map(|v| v.try_into()) + .collect::, _>>()?, + }) } } -impl From<&protobuf::FileGroup> for Vec { - fn from(val: &protobuf::FileGroup) -> Self { - val.files.iter().map(|f| f.into()).collect() +impl TryFrom<&protobuf::FileGroup> for Vec { + type Error = BallistaError; + + fn try_from(val: &protobuf::FileGroup) -> Result { + val.files + .iter() + .map(|f| f.try_into()) + .collect::, _>>() } } @@ -795,3 +766,37 @@ impl TryInto for &protobuf::Statistics { }) } } + +impl TryInto for &protobuf::FileScanExecConf { + type Error = BallistaError; + + fn try_into(self) -> Result { + let schema = Arc::new(convert_required!(self.schema)?); + let projection = self + .projection + .iter() + .map(|i| *i as usize) + .collect::>(); + let projection = if projection.is_empty() { + None + } else { + Some(projection) + }; + let statistics = convert_required!(self.statistics)?; + + Ok(PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema: schema, + file_groups: self + .file_groups + .iter() + .map(|f| f.try_into()) + .collect::, _>>()?, + statistics, + projection, + batch_size: self.batch_size as usize, + limit: self.limit.as_ref().map(|sl| sl.limit as usize), + table_partition_cols: vec![], + }) + } +} diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 52285eea0a9c..afbb02a4f216 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -26,7 +26,6 @@ use std::{ sync::Arc, }; -use datafusion::physical_plan::hash_aggregate::AggregateMode; use datafusion::physical_plan::hash_join::{HashJoinExec, PartitionMode}; use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion::physical_plan::projection::ProjectionExec; @@ -43,6 +42,9 @@ use datafusion::physical_plan::{ file_format::ParquetExec, }; use datafusion::physical_plan::{file_format::AvroExec, filter::FilterExec}; +use datafusion::physical_plan::{ + file_format::PhysicalPlanConfig, hash_aggregate::AggregateMode, +}; use datafusion::{ datasource::PartitionedFile, physical_plan::coalesce_batches::CoalesceBatchesExec, }; @@ -244,90 +246,29 @@ impl TryInto for Arc { ))), }) } else if let Some(exec) = plan.downcast_ref::() { - let file_groups = exec - .file_groups() - .iter() - .map(|p| p.as_slice().into()) - .collect(); Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::CsvScan( protobuf::CsvScanExecNode { - file_groups, - statistics: Some((&exec.statistics()).into()), - limit: exec - .limit() - .map(|l| protobuf::ScanLimit { limit: l as u32 }), - projection: exec - .projection() - .as_ref() - .ok_or_else(|| { - BallistaError::General( - "projection in CsvExec does not exist.".to_owned(), - ) - })? 
- .iter() - .map(|n| *n as u32) - .collect(), - schema: Some(exec.file_schema().as_ref().into()), + base_conf: Some(exec.base_config().try_into()?), has_header: exec.has_header(), delimiter: byte_to_string(exec.delimiter())?, - batch_size: exec.batch_size() as u32, }, )), }) } else if let Some(exec) = plan.downcast_ref::() { - let file_groups = exec - .file_groups() - .iter() - .map(|p| p.as_slice().into()) - .collect(); - Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::ParquetScan( protobuf::ParquetScanExecNode { - file_groups, - statistics: Some((&exec.statistics()).into()), - limit: exec - .limit() - .map(|l| protobuf::ScanLimit { limit: l as u32 }), - schema: Some(exec.schema().as_ref().into()), - projection: exec - .projection() - .as_ref() - .iter() - .map(|n| *n as u32) - .collect(), - batch_size: exec.batch_size() as u32, + base_conf: Some(exec.base_config().try_into()?), + // TODO serialize predicates }, )), }) } else if let Some(exec) = plan.downcast_ref::() { - let file_groups = exec - .file_groups() - .iter() - .map(|p| p.as_slice().into()) - .collect(); Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::AvroScan( protobuf::AvroScanExecNode { - file_groups, - statistics: Some((&exec.statistics()).into()), - limit: exec - .limit() - .map(|l| protobuf::ScanLimit { limit: l as u32 }), - projection: exec - .projection() - .as_ref() - .ok_or_else(|| { - BallistaError::General( - "projection in AvroExec does not exist.".to_owned(), - ) - })? - .iter() - .map(|n| *n as u32) - .collect(), - schema: Some(exec.file_schema().as_ref().into()), - batch_size: exec.batch_size() as u32, + base_conf: Some(exec.base_config().try_into()?), }, )), }) @@ -674,9 +615,11 @@ fn try_parse_when_then_expr( }) } -impl From<&PartitionedFile> for protobuf::PartitionedFile { - fn from(pf: &PartitionedFile) -> protobuf::PartitionedFile { - protobuf::PartitionedFile { +impl TryFrom<&PartitionedFile> for protobuf::PartitionedFile { + type Error = BallistaError; + + fn try_from(pf: &PartitionedFile) -> Result { + Ok(protobuf::PartitionedFile { path: pf.file_meta.path().to_owned(), size: pf.file_meta.size(), last_modified_ns: pf @@ -684,15 +627,25 @@ impl From<&PartitionedFile> for protobuf::PartitionedFile { .last_modified .map(|ts| ts.timestamp_nanos() as u64) .unwrap_or(0), - } + partition_values: pf + .partition_values + .iter() + .map(|v| v.try_into()) + .collect::, _>>()?, + }) } } -impl From<&[PartitionedFile]> for protobuf::FileGroup { - fn from(gr: &[PartitionedFile]) -> protobuf::FileGroup { - protobuf::FileGroup { - files: gr.iter().map(|f| f.into()).collect(), - } +impl TryFrom<&[PartitionedFile]> for protobuf::FileGroup { + type Error = BallistaError; + + fn try_from(gr: &[PartitionedFile]) -> Result { + Ok(protobuf::FileGroup { + files: gr + .iter() + .map(|f| f.try_into()) + .collect::, _>>()?, + }) } } @@ -722,3 +675,32 @@ impl From<&Statistics> for protobuf::Statistics { } } } + +impl TryFrom<&PhysicalPlanConfig> for protobuf::FileScanExecConf { + type Error = BallistaError; + fn try_from( + conf: &PhysicalPlanConfig, + ) -> Result { + let file_groups = conf + .file_groups + .iter() + .map(|p| p.as_slice().try_into()) + .collect::, _>>()?; + + Ok(protobuf::FileScanExecConf { + file_groups, + statistics: Some((&conf.statistics).into()), + limit: conf.limit.map(|l| protobuf::ScanLimit { limit: l as u32 }), + projection: conf + .projection + .as_ref() + .unwrap_or(&vec![]) + .iter() + .map(|n| *n as u32) + .collect(), + schema: 
Some(conf.file_schema.as_ref().into()), + batch_size: conf.batch_size as u32, + table_partition_cols: conf.table_partition_cols.to_vec(), + }) + } +} diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index bfe87efb9f74..7bc6510ac2ed 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -496,7 +496,7 @@ fn get_table( file_extension: extension.to_owned(), target_partitions, collect_stat: true, - partitions: vec![], + table_partition_cols: vec![], }; Ok(Arc::new(ListingTable::new( diff --git a/datafusion/src/datasource/file_format/avro.rs b/datafusion/src/datasource/file_format/avro.rs index c6326962e34a..515584b16c03 100644 --- a/datafusion/src/datasource/file_format/avro.rs +++ b/datafusion/src/datasource/file_format/avro.rs @@ -25,11 +25,12 @@ use arrow::{self, datatypes::SchemaRef}; use async_trait::async_trait; use futures::StreamExt; -use super::{FileFormat, PhysicalPlanConfig}; +use super::FileFormat; use crate::avro_to_arrow::read_avro_schema_from_reader; use crate::datasource::object_store::{ObjectReader, ObjectReaderStream}; use crate::error::Result; -use crate::physical_plan::file_format::AvroExec; +use crate::logical_plan::Expr; +use crate::physical_plan::file_format::{AvroExec, PhysicalPlanConfig}; use crate::physical_plan::ExecutionPlan; use crate::physical_plan::Statistics; @@ -61,16 +62,9 @@ impl FileFormat for AvroFormat { async fn create_physical_plan( &self, conf: PhysicalPlanConfig, + _filters: &[Expr], ) -> Result> { - let exec = AvroExec::new( - conf.object_store, - conf.files, - conf.statistics, - conf.schema, - conf.projection, - conf.batch_size, - conf.limit, - ); + let exec = AvroExec::new(conf); Ok(Arc::new(exec)) } } @@ -79,12 +73,9 @@ impl FileFormat for AvroFormat { #[cfg(feature = "avro")] mod tests { use crate::{ - datasource::{ - object_store::local::{ - local_file_meta, local_object_reader, local_object_reader_stream, - LocalFileSystem, - }, - PartitionedFile, + datasource::object_store::local::{ + local_object_reader, local_object_reader_stream, local_unpartitioned_file, + LocalFileSystem, }, physical_plan::collect, }; @@ -349,7 +340,7 @@ mod tests { let testdata = crate::test_util::arrow_test_data(); let filename = format!("{}/avro/{}", testdata, file_name); let format = AvroFormat {}; - let schema = format + let file_schema = format .infer_schema(local_object_reader_stream(vec![filename.clone()])) .await .expect("Schema inference"); @@ -357,20 +348,21 @@ mod tests { .infer_stats(local_object_reader(filename.clone())) .await .expect("Stats inference"); - let files = vec![vec![PartitionedFile { - file_meta: local_file_meta(filename.to_owned()), - }]]; + let file_groups = vec![vec![local_unpartitioned_file(filename.to_owned())]]; let exec = format - .create_physical_plan(PhysicalPlanConfig { - object_store: Arc::new(LocalFileSystem {}), - schema, - files, - statistics, - projection: projection.clone(), - batch_size, - filters: vec![], - limit, - }) + .create_physical_plan( + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema, + file_groups, + statistics, + projection: projection.clone(), + batch_size, + limit, + table_partition_cols: vec![], + }, + &[], + ) .await?; Ok(exec) } diff --git a/datafusion/src/datasource/file_format/csv.rs b/datafusion/src/datasource/file_format/csv.rs index f9959943a2e4..337511316c51 100644 --- a/datafusion/src/datasource/file_format/csv.rs +++ b/datafusion/src/datasource/file_format/csv.rs @@ -25,10 +25,11 @@ use arrow::{self, datatypes::SchemaRef}; use 
async_trait::async_trait; use futures::StreamExt; -use super::{FileFormat, PhysicalPlanConfig}; +use super::FileFormat; use crate::datasource::object_store::{ObjectReader, ObjectReaderStream}; use crate::error::Result; -use crate::physical_plan::file_format::CsvExec; +use crate::logical_plan::Expr; +use crate::physical_plan::file_format::{CsvExec, PhysicalPlanConfig}; use crate::physical_plan::ExecutionPlan; use crate::physical_plan::Statistics; @@ -123,18 +124,9 @@ impl FileFormat for CsvFormat { async fn create_physical_plan( &self, conf: PhysicalPlanConfig, + _filters: &[Expr], ) -> Result> { - let exec = CsvExec::new( - conf.object_store, - conf.files, - conf.statistics, - conf.schema, - self.has_header, - self.delimiter, - conf.projection, - conf.batch_size, - conf.limit, - ); + let exec = CsvExec::new(conf, self.has_header, self.delimiter); Ok(Arc::new(exec)) } } @@ -148,10 +140,9 @@ mod tests { datasource::{ file_format::PhysicalPlanConfig, object_store::local::{ - local_file_meta, local_object_reader, local_object_reader_stream, - LocalFileSystem, + local_object_reader, local_object_reader_stream, + local_unpartitioned_file, LocalFileSystem, }, - PartitionedFile, }, physical_plan::collect, }; @@ -261,7 +252,7 @@ mod tests { let testdata = crate::test_util::arrow_test_data(); let filename = format!("{}/csv/{}", testdata, file_name); let format = CsvFormat::default(); - let schema = format + let file_schema = format .infer_schema(local_object_reader_stream(vec![filename.clone()])) .await .expect("Schema inference"); @@ -269,20 +260,21 @@ mod tests { .infer_stats(local_object_reader(filename.clone())) .await .expect("Stats inference"); - let files = vec![vec![PartitionedFile { - file_meta: local_file_meta(filename.to_owned()), - }]]; + let file_groups = vec![vec![local_unpartitioned_file(filename.to_owned())]]; let exec = format - .create_physical_plan(PhysicalPlanConfig { - object_store: Arc::new(LocalFileSystem {}), - schema, - files, - statistics, - projection: projection.clone(), - batch_size, - filters: vec![], - limit, - }) + .create_physical_plan( + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema, + file_groups, + statistics, + projection: projection.clone(), + batch_size, + limit, + table_partition_cols: vec![], + }, + &[], + ) .await?; Ok(exec) } diff --git a/datafusion/src/datasource/file_format/json.rs b/datafusion/src/datasource/file_format/json.rs index a579831c7241..72bbee665a61 100644 --- a/datafusion/src/datasource/file_format/json.rs +++ b/datafusion/src/datasource/file_format/json.rs @@ -32,6 +32,7 @@ use super::FileFormat; use super::PhysicalPlanConfig; use crate::datasource::object_store::{ObjectReader, ObjectReaderStream}; use crate::error::Result; +use crate::logical_plan::Expr; use crate::physical_plan::file_format::NdJsonExec; use crate::physical_plan::ExecutionPlan; use crate::physical_plan::Statistics; @@ -93,16 +94,9 @@ impl FileFormat for JsonFormat { async fn create_physical_plan( &self, conf: PhysicalPlanConfig, + _filters: &[Expr], ) -> Result> { - let exec = NdJsonExec::new( - conf.object_store, - conf.files, - conf.statistics, - conf.schema, - conf.projection, - conf.batch_size, - conf.limit, - ); + let exec = NdJsonExec::new(conf); Ok(Arc::new(exec)) } } @@ -116,10 +110,9 @@ mod tests { datasource::{ file_format::PhysicalPlanConfig, object_store::local::{ - local_file_meta, local_object_reader, local_object_reader_stream, - LocalFileSystem, + local_object_reader, local_object_reader_stream, + local_unpartitioned_file, 
LocalFileSystem, }, - PartitionedFile, }, physical_plan::collect, }; @@ -212,7 +205,7 @@ mod tests { ) -> Result> { let filename = "tests/jsons/2.json"; let format = JsonFormat::default(); - let schema = format + let file_schema = format .infer_schema(local_object_reader_stream(vec![filename.to_owned()])) .await .expect("Schema inference"); @@ -220,20 +213,21 @@ mod tests { .infer_stats(local_object_reader(filename.to_owned())) .await .expect("Stats inference"); - let files = vec![vec![PartitionedFile { - file_meta: local_file_meta(filename.to_owned()), - }]]; + let file_groups = vec![vec![local_unpartitioned_file(filename.to_owned())]]; let exec = format - .create_physical_plan(PhysicalPlanConfig { - object_store: Arc::new(LocalFileSystem {}), - schema, - files, - statistics, - projection: projection.clone(), - batch_size, - filters: vec![], - limit, - }) + .create_physical_plan( + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema, + file_groups, + statistics, + projection: projection.clone(), + batch_size, + limit, + table_partition_cols: vec![], + }, + &[], + ) .await?; Ok(exec) } diff --git a/datafusion/src/datasource/file_format/mod.rs b/datafusion/src/datasource/file_format/mod.rs index d545596f6e5c..54491615fc4c 100644 --- a/datafusion/src/datasource/file_format/mod.rs +++ b/datafusion/src/datasource/file_format/mod.rs @@ -29,33 +29,12 @@ use std::sync::Arc; use crate::arrow::datatypes::SchemaRef; use crate::error::Result; use crate::logical_plan::Expr; +use crate::physical_plan::file_format::PhysicalPlanConfig; use crate::physical_plan::{ExecutionPlan, Statistics}; use async_trait::async_trait; -use super::object_store::{ObjectReader, ObjectReaderStream, ObjectStore}; -use super::PartitionedFile; - -/// The configurations to be passed when creating a physical plan for -/// a given file format. -pub struct PhysicalPlanConfig { - /// Store from which the `files` should be fetched - pub object_store: Arc, - /// Schema before projection - pub schema: SchemaRef, - /// List of files to be processed, grouped into partitions - pub files: Vec>, - /// Estimated overall statistics of the plan, taking `filters` into account - pub statistics: Statistics, - /// Columns on which to project the data - pub projection: Option>, - /// The maximum number of records per arrow column - pub batch_size: usize, - /// The filters that were pushed down to this execution plan - pub filters: Vec, - /// The minimum number of records required from this source plan - pub limit: Option, -} +use super::object_store::{ObjectReader, ObjectReaderStream}; /// This trait abstracts all the file format specific implementations /// from the `TableProvider`. 
This helps code re-utilization accross @@ -81,5 +60,6 @@ pub trait FileFormat: Send + Sync + fmt::Debug { async fn create_physical_plan( &self, conf: PhysicalPlanConfig, + filters: &[Expr], ) -> Result>; } diff --git a/datafusion/src/datasource/file_format/parquet.rs b/datafusion/src/datasource/file_format/parquet.rs index 424a2985a3f7..819f37448636 100644 --- a/datafusion/src/datasource/file_format/parquet.rs +++ b/datafusion/src/datasource/file_format/parquet.rs @@ -42,6 +42,7 @@ use crate::datasource::{create_max_min_accs, get_col_stats}; use crate::error::DataFusionError; use crate::error::Result; use crate::logical_plan::combine_filters; +use crate::logical_plan::Expr; use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; use crate::physical_plan::file_format::ParquetExec; use crate::physical_plan::ExecutionPlan; @@ -104,26 +105,18 @@ impl FileFormat for ParquetFormat { async fn create_physical_plan( &self, conf: PhysicalPlanConfig, + filters: &[Expr], ) -> Result> { // If enable pruning then combine the filters to build the predicate. // If disable pruning then set the predicate to None, thus readers // will not prune data based on the statistics. let predicate = if self.enable_pruning { - combine_filters(&conf.filters) + combine_filters(filters) } else { None }; - Ok(Arc::new(ParquetExec::new( - conf.object_store, - conf.files, - conf.statistics, - conf.schema, - conf.projection, - predicate, - conf.batch_size, - conf.limit, - ))) + Ok(Arc::new(ParquetExec::new(conf, predicate))) } } @@ -330,12 +323,9 @@ impl ChunkReader for ChunkObjectReader { #[cfg(test)] mod tests { use crate::{ - datasource::{ - object_store::local::{ - local_file_meta, local_object_reader, local_object_reader_stream, - LocalFileSystem, - }, - PartitionedFile, + datasource::object_store::local::{ + local_object_reader, local_object_reader_stream, local_unpartitioned_file, + LocalFileSystem, }, physical_plan::collect, }; @@ -595,7 +585,7 @@ mod tests { let testdata = crate::test_util::parquet_test_data(); let filename = format!("{}/{}", testdata, file_name); let format = ParquetFormat::default(); - let schema = format + let file_schema = format .infer_schema(local_object_reader_stream(vec![filename.clone()])) .await .expect("Schema inference"); @@ -603,20 +593,21 @@ mod tests { .infer_stats(local_object_reader(filename.clone())) .await .expect("Stats inference"); - let files = vec![vec![PartitionedFile { - file_meta: local_file_meta(filename.clone()), - }]]; + let file_groups = vec![vec![local_unpartitioned_file(filename.clone())]]; let exec = format - .create_physical_plan(PhysicalPlanConfig { - object_store: Arc::new(LocalFileSystem {}), - schema, - files, - statistics, - projection: projection.clone(), - batch_size, - filters: vec![], - limit, - }) + .create_physical_plan( + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema, + file_groups, + statistics, + projection: projection.clone(), + batch_size, + limit, + table_partition_cols: vec![], + }, + &[], + ) .await?; Ok(exec) } diff --git a/datafusion/src/datasource/listing/helpers.rs b/datafusion/src/datasource/listing/helpers.rs new file mode 100644 index 000000000000..912179c36f06 --- /dev/null +++ b/datafusion/src/datasource/listing/helpers.rs @@ -0,0 +1,723 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Helper functions for the table implementation + +use std::sync::Arc; + +use arrow::{ + array::{ + Array, ArrayBuilder, ArrayRef, Date64Array, Date64Builder, StringArray, + StringBuilder, UInt64Array, UInt64Builder, + }, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use chrono::{TimeZone, Utc}; +use futures::{ + stream::{self}, + StreamExt, TryStreamExt, +}; +use log::debug; + +use crate::{ + error::Result, + execution::context::ExecutionContext, + logical_plan::{self, Expr, ExpressionVisitor, Recursion}, + physical_plan::functions::Volatility, + scalar::ScalarValue, +}; + +use crate::datasource::{ + object_store::{FileMeta, ObjectStore, SizedFile}, + MemTable, PartitionedFile, PartitionedFileStream, +}; + +const FILE_SIZE_COLUMN_NAME: &str = "_df_part_file_size_"; +const FILE_PATH_COLUMN_NAME: &str = "_df_part_file_path_"; +const FILE_MODIFIED_COLUMN_NAME: &str = "_df_part_file_modified_"; + +/// The `ExpressionVisitor` for `expr_applicable_for_cols`. Walks the tree to +/// validate that the given expression is applicable with only the `col_names` +/// set of columns. +struct ApplicabilityVisitor<'a> { + col_names: &'a [String], + is_applicable: &'a mut bool, +} + +impl ApplicabilityVisitor<'_> { + fn visit_volatility(self, volatility: Volatility) -> Recursion { + match volatility { + Volatility::Immutable => Recursion::Continue(self), + // TODO: Stable functions could be `applicable`, but that would require access to the context + Volatility::Stable | Volatility::Volatile => { + *self.is_applicable = false; + Recursion::Stop(self) + } + } + } +} + +impl ExpressionVisitor for ApplicabilityVisitor<'_> { + fn pre_visit(self, expr: &Expr) -> Result> { + let rec = match expr { + Expr::Column(logical_plan::Column { ref name, .. }) => { + *self.is_applicable &= self.col_names.contains(name); + Recursion::Stop(self) // leaf node anyway + } + Expr::Literal(_) + | Expr::Alias(_, _) + | Expr::ScalarVariable(_) + | Expr::Not(_) + | Expr::IsNotNull(_) + | Expr::IsNull(_) + | Expr::Negative(_) + | Expr::Cast { .. } + | Expr::TryCast { .. } + | Expr::BinaryExpr { .. } + | Expr::Between { .. } + | Expr::InList { .. } + | Expr::GetIndexedField { .. } + | Expr::Case { .. } => Recursion::Continue(self), + + Expr::ScalarFunction { fun, .. } => self.visit_volatility(fun.volatility()), + Expr::ScalarUDF { fun, .. } => { + self.visit_volatility(fun.signature.volatility) + } + + // TODO other expressions are not handled yet: + // - AGGREGATE, WINDOW and SORT should not end up in filter conditions, except maybe in some edge cases + // - Can `Wildcard` be considered as a `Literal`? + // - ScalarVariable could be `applicable`, but that would require access to the context + Expr::AggregateUDF { .. } + | Expr::AggregateFunction { .. } + | Expr::Sort { .. } + | Expr::WindowFunction { .. 
} + | Expr::Wildcard => { + *self.is_applicable = false; + Recursion::Stop(self) + } + }; + Ok(rec) + } +} + +/// Check whether the given expression can be resolved using only the columns `col_names`. +/// This means that if this function returns true: +/// - the table provider can filter the table partition values with this expression +/// - the expression can be marked as `TableProviderFilterPushDown::Exact` once this filtering +/// was performed +pub fn expr_applicable_for_cols(col_names: &[String], expr: &Expr) -> bool { + let mut is_applicable = true; + expr.accept(ApplicabilityVisitor { + col_names, + is_applicable: &mut is_applicable, + }) + .unwrap(); + is_applicable +} + +/// Partition the list of files into `n` groups +pub fn split_files( + partitioned_files: Vec, + n: usize, +) -> Vec> { + if partitioned_files.is_empty() { + return vec![]; + } + // effectively this is div with rounding up instead of truncating + let chunk_size = (partitioned_files.len() + n - 1) / n; + partitioned_files + .chunks(chunk_size) + .map(|c| c.to_vec()) + .collect() +} + +/// Discover the partitions on the given path and prune out files +/// that belong to irrelevant partitions using `filters` expressions. +/// `filters` might contain expressions that can be resolved only at the +/// file level (e.g. Parquet row group pruning). +/// +/// TODO for tables with many files (10k+), it will usually more efficient +/// to first list the folders relative to the first partition dimension, +/// prune those, then list only the contain of the remaining folders. +pub async fn pruned_partition_list( + store: &dyn ObjectStore, + table_path: &str, + filters: &[Expr], + file_extension: &str, + table_partition_cols: &[String], +) -> Result { + // if no partition col => simply list all the files + if table_partition_cols.is_empty() { + return Ok(Box::pin( + store + .list_file_with_suffix(table_path, file_extension) + .await? + .map(|f| { + Ok(PartitionedFile { + partition_values: vec![], + file_meta: f?, + }) + }), + )); + } + + let applicable_filters: Vec<_> = filters + .iter() + .filter(|f| expr_applicable_for_cols(table_partition_cols, f)) + .collect(); + let stream_path = table_path.to_owned(); + if applicable_filters.is_empty() { + // Parse the partition values while listing all the files + // Note: We might avoid parsing the partition values if they are not used in any projection, + // but the cost of parsing will likely be far dominated by the time to fetch the listing from + // the object store. + let table_partition_cols_stream = table_partition_cols.to_vec(); + Ok(Box::pin( + store + .list_file_with_suffix(table_path, file_extension) + .await? + .filter_map(move |f| { + let stream_path = stream_path.clone(); + let table_partition_cols_stream = table_partition_cols_stream.clone(); + async move { + let file_meta = match f { + Ok(fm) => fm, + Err(err) => return Some(Err(err)), + }; + let parsed_path = parse_partitions_for_path( + &stream_path, + file_meta.path(), + &table_partition_cols_stream, + ) + .map(|p| { + p.iter() + .map(|&pn| ScalarValue::Utf8(Some(pn.to_owned()))) + .collect() + }); + + parsed_path.map(|partition_values| { + Ok(PartitionedFile { + partition_values, + file_meta, + }) + }) + } + }), + )) + } else { + // parse the partition values and serde them as a RecordBatch to filter them + // TODO avoid collecting but have a streaming memory table instead + let batches: Vec = store + .list_file_with_suffix(table_path, file_extension) + .await? 
+ // TODO we set an arbitrary high batch size here, it does not matter as we list + // all the files anyway. This number will need to be adjusted according to the object + // store if we switch to a streaming-stlye pruning of the files. For instance S3 lists + // 1000 items at a time so batches of 1000 would be ideal with S3 as store. + .chunks(1024) + .map(|v| v.into_iter().collect::>>()) + .map(move |metas| paths_to_batch(table_partition_cols, &stream_path, &metas?)) + .try_collect() + .await?; + + let mem_table = MemTable::try_new(batches[0].schema(), vec![batches])?; + + // Filter the partitions using a local datafusion context + // TODO having the external context would allow us to resolve `Volatility::Stable` + // scalar functions (`ScalarFunction` & `ScalarUDF`) and `ScalarVariable`s + let mut ctx = ExecutionContext::new(); + let mut df = ctx.read_table(Arc::new(mem_table))?; + for filter in applicable_filters { + df = df.filter(filter.clone())?; + } + let filtered_batches = df.collect().await?; + + Ok(Box::pin(stream::iter( + batches_to_paths(&filtered_batches).into_iter().map(Ok), + ))) + } +} + +/// convert the paths of the files to a record batch with the following columns: +/// - one column for the file size named `_df_part_file_size_` +/// - one column for with the original path named `_df_part_file_path_` +/// - one column for with the last modified date named `_df_part_file_modified_` +/// - ... one column by partition ... +/// +/// Note: For the last modified date, this looses precisions higher than millisecond. +fn paths_to_batch( + table_partition_cols: &[String], + table_path: &str, + metas: &[FileMeta], +) -> Result { + let mut key_builder = StringBuilder::new(metas.len()); + let mut length_builder = UInt64Builder::new(metas.len()); + let mut modified_builder = Date64Builder::new(metas.len()); + let mut partition_builders = table_partition_cols + .iter() + .map(|_| StringBuilder::new(metas.len())) + .collect::>(); + for file_meta in metas { + if let Some(partition_values) = + parse_partitions_for_path(table_path, file_meta.path(), table_partition_cols) + { + key_builder.append_value(file_meta.path())?; + length_builder.append_value(file_meta.size())?; + match file_meta.last_modified { + Some(lm) => modified_builder.append_value(lm.timestamp_millis())?, + None => modified_builder.append_null()?, + } + for (i, part_val) in partition_values.iter().enumerate() { + partition_builders[i].append_value(part_val)?; + } + } else { + debug!("No partitioning for path {}", file_meta.path()); + } + } + + // finish all builders + let mut col_arrays: Vec = vec![ + ArrayBuilder::finish(&mut key_builder), + ArrayBuilder::finish(&mut length_builder), + ArrayBuilder::finish(&mut modified_builder), + ]; + for mut partition_builder in partition_builders { + col_arrays.push(ArrayBuilder::finish(&mut partition_builder)); + } + + // put the schema together + let mut fields = vec![ + Field::new(FILE_PATH_COLUMN_NAME, DataType::Utf8, false), + Field::new(FILE_SIZE_COLUMN_NAME, DataType::UInt64, false), + Field::new(FILE_MODIFIED_COLUMN_NAME, DataType::Date64, false), + ]; + for pn in table_partition_cols { + fields.push(Field::new(pn, DataType::Utf8, false)); + } + + let batch = RecordBatch::try_new(Arc::new(Schema::new(fields)), col_arrays)?; + Ok(batch) +} + +/// convert a set of record batches created by `paths_to_batch()` back to partitioned files. 
+fn batches_to_paths(batches: &[RecordBatch]) -> Vec { + batches + .iter() + .flat_map(|batch| { + let key_array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let length_array = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + let modified_array = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + + (0..batch.num_rows()).map(move |row| PartitionedFile { + file_meta: FileMeta { + last_modified: match modified_array.is_null(row) { + false => Some(Utc.timestamp_millis(modified_array.value(row))), + true => None, + }, + sized_file: SizedFile { + path: key_array.value(row).to_owned(), + size: length_array.value(row), + }, + }, + partition_values: (3..batch.columns().len()) + .map(|col| { + ScalarValue::try_from_array(batch.column(col), row).unwrap() + }) + .collect(), + }) + }) + .collect() +} + +/// Extract the partition values for the given `file_path` (in the given `table_path`) +/// associated to the partitions defined by `table_partition_cols` +fn parse_partitions_for_path<'a>( + table_path: &str, + file_path: &'a str, + table_partition_cols: &[String], +) -> Option> { + let subpath = file_path.strip_prefix(table_path)?; + + // ignore whether table_path ended with "/" or not + let subpath = match subpath.strip_prefix('/') { + Some(subpath) => subpath, + None => subpath, + }; + + let mut part_values = vec![]; + for (path, pn) in subpath.split('/').zip(table_partition_cols) { + match path.split_once('=') { + Some((name, val)) if name == pn => part_values.push(val), + _ => return None, + } + } + Some(part_values) +} + +#[cfg(test)] +mod tests { + use crate::{ + logical_plan::{case, col, lit}, + test::object_store::TestObjectStore, + }; + + use super::*; + + #[test] + fn test_split_files() { + let new_partitioned_file = |path: &str| PartitionedFile::new(path.to_owned(), 10); + let files = vec![ + new_partitioned_file("a"), + new_partitioned_file("b"), + new_partitioned_file("c"), + new_partitioned_file("d"), + new_partitioned_file("e"), + ]; + + let chunks = split_files(files.clone(), 1); + assert_eq!(1, chunks.len()); + assert_eq!(5, chunks[0].len()); + + let chunks = split_files(files.clone(), 2); + assert_eq!(2, chunks.len()); + assert_eq!(3, chunks[0].len()); + assert_eq!(2, chunks[1].len()); + + let chunks = split_files(files.clone(), 5); + assert_eq!(5, chunks.len()); + assert_eq!(1, chunks[0].len()); + assert_eq!(1, chunks[1].len()); + assert_eq!(1, chunks[2].len()); + assert_eq!(1, chunks[3].len()); + assert_eq!(1, chunks[4].len()); + + let chunks = split_files(files, 123); + assert_eq!(5, chunks.len()); + assert_eq!(1, chunks[0].len()); + assert_eq!(1, chunks[1].len()); + assert_eq!(1, chunks[2].len()); + assert_eq!(1, chunks[3].len()); + assert_eq!(1, chunks[4].len()); + + let chunks = split_files(vec![], 2); + assert_eq!(0, chunks.len()); + } + + #[tokio::test] + async fn test_pruned_partition_list_empty() { + let store = TestObjectStore::new_arc(&[ + ("tablepath/mypartition=val1/notparquetfile", 100), + ("tablepath/file.parquet", 100), + ]); + let filter = Expr::eq(col("mypartition"), lit("val1")); + let pruned = pruned_partition_list( + store.as_ref(), + "tablepath/", + &[filter], + ".parquet", + &[String::from("mypartition")], + ) + .await + .expect("partition pruning failed") + .collect::>() + .await; + + assert_eq!(pruned.len(), 0); + } + + #[tokio::test] + async fn test_pruned_partition_list() { + let store = TestObjectStore::new_arc(&[ + ("tablepath/mypartition=val1/file.parquet", 100), + 
("tablepath/mypartition=val2/file.parquet", 100), + ("tablepath/mypartition=val1/other=val3/file.parquet", 100), + ]); + let filter = Expr::eq(col("mypartition"), lit("val1")); + let pruned = pruned_partition_list( + store.as_ref(), + "tablepath/", + &[filter], + ".parquet", + &[String::from("mypartition")], + ) + .await + .expect("partition pruning failed") + .collect::>() + .await; + + assert_eq!(pruned.len(), 2); + let f1 = pruned[0].as_ref().expect("first item not an error"); + assert_eq!( + &f1.file_meta.sized_file.path, + "tablepath/mypartition=val1/file.parquet" + ); + assert_eq!( + &f1.partition_values, + &[ScalarValue::Utf8(Some(String::from("val1"))),] + ); + let f2 = pruned[1].as_ref().expect("second item not an error"); + assert_eq!( + &f2.file_meta.sized_file.path, + "tablepath/mypartition=val1/other=val3/file.parquet" + ); + assert_eq!( + &f2.partition_values, + &[ScalarValue::Utf8(Some(String::from("val1"))),] + ); + } + + #[tokio::test] + async fn test_pruned_partition_list_multi() { + let store = TestObjectStore::new_arc(&[ + ("tablepath/part1=p1v1/file.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file1.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v3/part2=p2v1/file2.parquet", 100), + ("tablepath/part1=p1v2/part2=p2v2/file2.parquet", 100), + ]); + let filter1 = Expr::eq(col("part1"), lit("p1v2")); + let filter2 = Expr::eq(col("part2"), lit("p2v1")); + // filter3 cannot be resolved at partition pruning + let filter3 = Expr::eq(col("part2"), col("other")); + let pruned = pruned_partition_list( + store.as_ref(), + "tablepath/", + &[filter1, filter2, filter3], + ".parquet", + &[String::from("part1"), String::from("part2")], + ) + .await + .expect("partition pruning failed") + .collect::>() + .await; + + assert_eq!(pruned.len(), 2); + let f1 = pruned[0].as_ref().expect("first item not an error"); + assert_eq!( + &f1.file_meta.sized_file.path, + "tablepath/part1=p1v2/part2=p2v1/file1.parquet" + ); + assert_eq!( + &f1.partition_values, + &[ + ScalarValue::Utf8(Some(String::from("p1v2"))), + ScalarValue::Utf8(Some(String::from("p2v1"))) + ] + ); + let f2 = pruned[1].as_ref().expect("second item not an error"); + assert_eq!( + &f2.file_meta.sized_file.path, + "tablepath/part1=p1v2/part2=p2v1/file2.parquet" + ); + assert_eq!( + &f2.partition_values, + &[ + ScalarValue::Utf8(Some(String::from("p1v2"))), + ScalarValue::Utf8(Some(String::from("p2v1"))) + ] + ); + } + + #[test] + fn test_parse_partitions_for_path() { + assert_eq!( + Some(vec![]), + parse_partitions_for_path("bucket/mytable", "bucket/mytable/file.csv", &[]) + ); + assert_eq!( + None, + parse_partitions_for_path( + "bucket/othertable", + "bucket/mytable/file.csv", + &[] + ) + ); + assert_eq!( + None, + parse_partitions_for_path( + "bucket/mytable", + "bucket/mytable/file.csv", + &[String::from("mypartition")] + ) + ); + assert_eq!( + Some(vec!["v1"]), + parse_partitions_for_path( + "bucket/mytable", + "bucket/mytable/mypartition=v1/file.csv", + &[String::from("mypartition")] + ) + ); + assert_eq!( + Some(vec!["v1"]), + parse_partitions_for_path( + "bucket/mytable/", + "bucket/mytable/mypartition=v1/file.csv", + &[String::from("mypartition")] + ) + ); + // Only hive style partitioning supported for now: + assert_eq!( + None, + parse_partitions_for_path( + "bucket/mytable", + "bucket/mytable/v1/file.csv", + &[String::from("mypartition")] + ) + ); + assert_eq!( + Some(vec!["v1", "v2"]), + parse_partitions_for_path( + "bucket/mytable", + 
"bucket/mytable/mypartition=v1/otherpartition=v2/file.csv", + &[String::from("mypartition"), String::from("otherpartition")] + ) + ); + assert_eq!( + Some(vec!["v1"]), + parse_partitions_for_path( + "bucket/mytable", + "bucket/mytable/mypartition=v1/otherpartition=v2/file.csv", + &[String::from("mypartition")] + ) + ); + } + + #[test] + fn test_path_batch_roundtrip_no_partiton() { + let files = vec![ + FileMeta { + sized_file: SizedFile { + path: String::from("mybucket/tablepath/part1=val1/file.parquet"), + size: 100, + }, + last_modified: Some(Utc.timestamp_millis(1634722979123)), + }, + FileMeta { + sized_file: SizedFile { + path: String::from("mybucket/tablepath/part1=val2/file.parquet"), + size: 100, + }, + last_modified: None, + }, + ]; + + let batches = paths_to_batch(&[], "mybucket/tablepath", &files) + .expect("Serialization of file list to batch failed"); + + let parsed_files = batches_to_paths(&[batches]); + assert_eq!(parsed_files.len(), 2); + assert_eq!(&parsed_files[0].partition_values, &[]); + assert_eq!(&parsed_files[1].partition_values, &[]); + + let parsed_metas = parsed_files + .into_iter() + .map(|pf| pf.file_meta) + .collect::>(); + assert_eq!(parsed_metas, files); + } + + #[test] + fn test_path_batch_roundtrip_with_partition() { + let files = vec![ + FileMeta { + sized_file: SizedFile { + path: String::from("mybucket/tablepath/part1=val1/file.parquet"), + size: 100, + }, + last_modified: Some(Utc.timestamp_millis(1634722979123)), + }, + FileMeta { + sized_file: SizedFile { + path: String::from("mybucket/tablepath/part1=val2/file.parquet"), + size: 100, + }, + last_modified: None, + }, + ]; + + let batches = + paths_to_batch(&[String::from("part1")], "mybucket/tablepath", &files) + .expect("Serialization of file list to batch failed"); + + let parsed_files = batches_to_paths(&[batches]); + assert_eq!(parsed_files.len(), 2); + assert_eq!( + &parsed_files[0].partition_values, + &[ScalarValue::Utf8(Some(String::from("val1")))] + ); + assert_eq!( + &parsed_files[1].partition_values, + &[ScalarValue::Utf8(Some(String::from("val2")))] + ); + + let parsed_metas = parsed_files + .into_iter() + .map(|pf| pf.file_meta) + .collect::>(); + assert_eq!(parsed_metas, files); + } + + #[test] + fn test_expr_applicable_for_cols() { + assert!(expr_applicable_for_cols( + &[String::from("c1")], + &Expr::eq(col("c1"), lit("value")) + )); + assert!(!expr_applicable_for_cols( + &[String::from("c1")], + &Expr::eq(col("c2"), lit("value")) + )); + assert!(!expr_applicable_for_cols( + &[String::from("c1")], + &Expr::eq(col("c1"), col("c2")) + )); + assert!(expr_applicable_for_cols( + &[String::from("c1"), String::from("c2")], + &Expr::eq(col("c1"), col("c2")) + )); + assert!(expr_applicable_for_cols( + &[String::from("c1"), String::from("c2")], + &(Expr::eq(col("c1"), col("c2").alias("c2_alias"))).not() + )); + assert!(expr_applicable_for_cols( + &[String::from("c1"), String::from("c2")], + &(case(col("c1")) + .when(lit("v1"), lit(true)) + .otherwise(lit(false)) + .expect("valid case expr")) + )); + // static expression not relvant in this context but we + // test it as an edge case anyway in case we want to generalize + // this helper function + assert!(expr_applicable_for_cols(&[], &lit(true))); + } +} diff --git a/datafusion/src/datasource/listing/mod.rs b/datafusion/src/datasource/listing/mod.rs new file mode 100644 index 000000000000..c8b92418ba2f --- /dev/null +++ b/datafusion/src/datasource/listing/mod.rs @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or 
more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A table that uses the `ObjectStore` listing capability +//! to get the list of files to process. + +mod helpers; +mod table; + +pub use table::{ListingOptions, ListingTable}; diff --git a/datafusion/src/datasource/listing.rs b/datafusion/src/datasource/listing/table.rs similarity index 63% rename from datafusion/src/datasource/listing.rs rename to datafusion/src/datasource/listing/table.rs index 4af82d0c185a..aadc340b46c9 100644 --- a/datafusion/src/datasource/listing.rs +++ b/datafusion/src/datasource/listing/table.rs @@ -15,30 +15,31 @@ // specific language governing permissions and limitations // under the License. -//! A table that uses the `ObjectStore` listing capability -//! to get the list of files to process. +//! The table implementation. use std::{any::Any, sync::Arc}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::datatypes::{Field, Schema, SchemaRef}; use async_trait::async_trait; use futures::StreamExt; use crate::{ - datasource::PartitionedFile, - error::{DataFusionError, Result}, + error::Result, logical_plan::Expr, - physical_plan::{ExecutionPlan, Statistics}, + physical_plan::{ + empty::EmptyExec, + file_format::{PhysicalPlanConfig, DEFAULT_PARTITION_COLUMN_DATATYPE}, + ExecutionPlan, Statistics, + }, }; -use super::{ - datasource::TableProviderFilterPushDown, - file_format::{FileFormat, PhysicalPlanConfig}, - get_statistics_with_limit, - object_store::ObjectStore, - PartitionedFileStream, TableProvider, +use crate::datasource::{ + datasource::TableProviderFilterPushDown, file_format::FileFormat, + get_statistics_with_limit, object_store::ObjectStore, PartitionedFile, TableProvider, }; +use super::helpers::{expr_applicable_for_cols, pruned_partition_list, split_files}; + /// Options for creating a `ListingTable` pub struct ListingOptions { /// A suffix on which files should be filtered (leave empty to @@ -51,9 +52,9 @@ pub struct ListingOptions { /// partitioning expected should be named "a" and "b": /// - If there is a third level of partitioning it will be ignored. /// - Files that don't follow this partitioning will be ignored. - /// Note that only `DataType::Utf8` is supported for the column type. - /// TODO implement case where partitions.len() > 0 - pub partitions: Vec, + /// Note that only `DEFAULT_PARTITION_COLUMN_DATATYPE` is currently + /// supported for the column type. + pub table_partition_cols: Vec, /// Set true to try to guess statistics from the files. /// This can add a lot of overhead as it will usually require files /// to be opened and at least partially parsed. 
@@ -74,14 +75,14 @@ impl ListingOptions { Self { file_extension: String::new(), format, - partitions: vec![], + table_partition_cols: vec![], collect_stat: false, target_partitions: 1, } } /// Infer the schema of the files at the given path on the provided object store. - /// The inferred schema should include the partitioning columns. + /// The inferred schema does not include the partitioning columns. /// /// This method will not be called by the table itself but before creating it. /// This way when creating the logical plan we can decide to resolve the schema @@ -96,12 +97,7 @@ impl ListingOptions { .await? .map(move |file_meta| object_store.file_reader(file_meta?.sized_file)); let file_schema = self.format.infer_schema(Box::pin(file_stream)).await?; - // Add the partition columns to the file schema - let mut fields = file_schema.fields().clone(); - for part in &self.partitions { - fields.push(Field::new(part, DataType::Utf8, false)); - } - Ok(Arc::new(Schema::new(fields))) + Ok(file_schema) } } @@ -109,24 +105,40 @@ impl ListingOptions { /// or file system listing capability to get the list of files. pub struct ListingTable { object_store: Arc, - path: String, - schema: SchemaRef, + table_path: String, + /// File fields only + file_schema: SchemaRef, + /// File fields + partition columns + table_schema: SchemaRef, options: ListingOptions, } impl ListingTable { /// Create new table that lists the FS to get the files to scan. + /// The provided `schema` must be resolved before creating the table + /// and should contain the fields of the file without the table + /// partitioning columns. pub fn new( object_store: Arc, - path: String, - // the schema must be resolved before creating the table - schema: SchemaRef, + table_path: String, + file_schema: SchemaRef, options: ListingOptions, ) -> Self { + // Add the partition columns to the file schema + let mut table_fields = file_schema.fields().clone(); + for part in &options.table_partition_cols { + table_fields.push(Field::new( + part, + DEFAULT_PARTITION_COLUMN_DATATYPE.clone(), + false, + )); + } + Self { object_store, - path, - schema, + table_path, + file_schema, + table_schema: Arc::new(Schema::new(table_fields)), options, } } @@ -136,8 +148,8 @@ impl ListingTable { &self.object_store } /// Get path ref - pub fn path(&self) -> &str { - &self.path + pub fn table_path(&self) -> &str { + &self.table_path } /// Get options ref pub fn options(&self) -> &ListingOptions { @@ -152,7 +164,7 @@ impl TableProvider for ListingTable { } fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) + Arc::clone(&self.table_schema) } async fn scan( @@ -162,58 +174,76 @@ impl TableProvider for ListingTable { filters: &[Expr], limit: Option, ) -> Result> { - // TODO object_store_registry should be provided as param here - let (partitioned_file_lists, statistics) = self - .list_files_for_scan( - Arc::clone(&self.object_store), - &self.path, - filters, - limit, - ) - .await?; + let (partitioned_file_lists, statistics) = + self.list_files_for_scan(filters, limit).await?; + + // if no files need to be read, return an `EmptyExec` + if partitioned_file_lists.is_empty() { + let schema = self.schema(); + let projected_schema = match &projection { + None => schema, + Some(p) => Arc::new(Schema::new( + p.iter().map(|i| schema.field(*i).clone()).collect(), + )), + }; + return Ok(Arc::new(EmptyExec::new(false, projected_schema))); + } + // create the execution plan self.options .format - .create_physical_plan(PhysicalPlanConfig { - object_store: 
Arc::clone(&self.object_store), - schema: self.schema(), - files: partitioned_file_lists, - statistics, - projection: projection.clone(), - batch_size, - filters: filters.to_vec(), - limit, - }) + .create_physical_plan( + PhysicalPlanConfig { + object_store: Arc::clone(&self.object_store), + file_schema: Arc::clone(&self.file_schema), + file_groups: partitioned_file_lists, + statistics, + projection: projection.clone(), + batch_size, + limit, + table_partition_cols: self.options.table_partition_cols.clone(), + }, + filters, + ) .await } fn supports_filter_pushdown( &self, - _filter: &Expr, + filter: &Expr, ) -> Result { - Ok(TableProviderFilterPushDown::Inexact) + if expr_applicable_for_cols(&self.options.table_partition_cols, filter) { + // if filter can be handled by partiton pruning, it is exact + Ok(TableProviderFilterPushDown::Exact) + } else { + // otherwise, we still might be able to handle the filter with file + // level mechanisms such as Parquet row group pruning. + Ok(TableProviderFilterPushDown::Inexact) + } } } impl ListingTable { + /// Get the list of files for a scan as well as the file level statistics. + /// The list is grouped to let the execution plan know how the files should + /// be distributed to different threads / executors. async fn list_files_for_scan<'a>( &'a self, - object_store: Arc, - path: &'a str, filters: &'a [Expr], limit: Option, ) -> Result<(Vec>, Statistics)> { // list files (with partitions) let file_list = pruned_partition_list( - object_store.as_ref(), - path, + self.object_store.as_ref(), + &self.table_path, filters, &self.options.file_extension, - &self.options.partitions, + &self.options.table_partition_cols, ) .await?; // collect the statistics if required by the config + let object_store = Arc::clone(&self.object_store); let files = file_list.then(move |part_file| { let object_store = object_store.clone(); async move { @@ -232,13 +262,6 @@ impl ListingTable { let (files, statistics) = get_statistics_with_limit(files, self.schema(), limit).await?; - if files.is_empty() { - return Err(DataFusionError::Plan(format!( - "No files found at {} with file extension {}", - self.path, self.options.file_extension, - ))); - } - Ok(( split_files(files, self.options.target_partitions), statistics, @@ -246,98 +269,21 @@ impl ListingTable { } } -/// Discover the partitions on the given path and prune out files -/// relative to irrelevant partitions using `filters` expressions -async fn pruned_partition_list( - store: &dyn ObjectStore, - path: &str, - _filters: &[Expr], - file_extension: &str, - partition_names: &[String], -) -> Result { - if partition_names.is_empty() { - Ok(Box::pin( - store - .list_file_with_suffix(path, file_extension) - .await? - .map(|f| Ok(PartitionedFile { file_meta: f? 
})), - )) - } else { - todo!("use filters to prune partitions") - } -} - -fn split_files( - partitioned_files: Vec, - n: usize, -) -> Vec> { - let mut chunk_size = partitioned_files.len() / n; - if partitioned_files.len() % n > 0 { - chunk_size += 1; - } - partitioned_files - .chunks(chunk_size) - .map(|c| c.to_vec()) - .collect() -} - #[cfg(test)] mod tests { + use arrow::datatypes::DataType; + use crate::{ datasource::{ file_format::{avro::AvroFormat, parquet::ParquetFormat}, - object_store::{local::LocalFileSystem, FileMeta, ObjectStore, SizedFile}, + object_store::local::LocalFileSystem, }, - test::object_store::TestObjectStore, + logical_plan::{col, lit}, + test::{columns, object_store::TestObjectStore}, }; use super::*; - #[test] - fn test_split_files() { - let new_partitioned_file = |path: &str| PartitionedFile { - file_meta: FileMeta { - sized_file: SizedFile { - path: path.to_owned(), - size: 10, - }, - last_modified: None, - }, - }; - let files = vec![ - new_partitioned_file("a"), - new_partitioned_file("b"), - new_partitioned_file("c"), - new_partitioned_file("d"), - new_partitioned_file("e"), - ]; - - let chunks = split_files(files.clone(), 1); - assert_eq!(1, chunks.len()); - assert_eq!(5, chunks[0].len()); - - let chunks = split_files(files.clone(), 2); - assert_eq!(2, chunks.len()); - assert_eq!(3, chunks[0].len()); - assert_eq!(2, chunks[1].len()); - - let chunks = split_files(files.clone(), 5); - assert_eq!(5, chunks.len()); - assert_eq!(1, chunks[0].len()); - assert_eq!(1, chunks[1].len()); - assert_eq!(1, chunks[2].len()); - assert_eq!(1, chunks[3].len()); - assert_eq!(1, chunks[4].len()); - - let chunks = split_files(files, 123); - assert_eq!(5, chunks.len()); - assert_eq!(1, chunks[0].len()); - assert_eq!(1, chunks[1].len()); - assert_eq!(1, chunks[2].len()); - assert_eq!(1, chunks[3].len()); - assert_eq!(1, chunks[4].len()); - } - #[tokio::test] async fn read_single_file() -> Result<()> { let table = load_table("alltypes_plain.parquet").await?; @@ -358,9 +304,47 @@ mod tests { } #[tokio::test] - async fn file_listings() -> Result<()> { + async fn read_empty_table() -> Result<()> { + let store = TestObjectStore::new_arc(&[("table/p1=v1/file.avro", 100)]); + + let opt = ListingOptions { + file_extension: ".avro".to_owned(), + format: Arc::new(AvroFormat {}), + table_partition_cols: vec![String::from("p1")], + target_partitions: 4, + collect_stat: true, + }; + + let file_schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]); + + let table = + ListingTable::new(store, "table/".to_owned(), Arc::new(file_schema), opt); + assert_eq!( + columns(&table.schema()), + vec!["a".to_owned(), "p1".to_owned()] + ); + + // this will filter out the only file in the store + let filter = Expr::not_eq(col("p1"), lit("v1")); + + let scan = table + .scan(&None, 1024, &[filter], None) + .await + .expect("Empty execution plan"); + + assert!(scan.as_any().is::()); + assert_eq!( + columns(&scan.schema()), + vec!["a".to_owned(), "p1".to_owned()] + ); + + Ok(()) + } + + #[tokio::test] + async fn test_assert_list_files_for_scan_grouping() -> Result<()> { // more expected partitions than files - assert_partitioning( + assert_list_files_for_scan_grouping( &[ "bucket/key-prefix/file0", "bucket/key-prefix/file1", @@ -375,7 +359,7 @@ mod tests { .await?; // as many expected partitions as files - assert_partitioning( + assert_list_files_for_scan_grouping( &[ "bucket/key-prefix/file0", "bucket/key-prefix/file1", @@ -389,7 +373,7 @@ mod tests { .await?; // more files as expected partitions 
- assert_partitioning( + assert_list_files_for_scan_grouping( &[ "bucket/key-prefix/file0", "bucket/key-prefix/file1", @@ -403,13 +387,11 @@ mod tests { ) .await?; - // no files - assert_partitioning(&[], "bucket/key-prefix/", 2, 0) - .await - .expect_err("no files"); + // no files => no groups + assert_list_files_for_scan_grouping(&[], "bucket/key-prefix/", 2, 0).await?; // files that don't match the prefix - assert_partitioning( + assert_list_files_for_scan_grouping( &[ "bucket/key-prefix/file0", "bucket/key-prefix/file1", @@ -429,7 +411,7 @@ mod tests { let opt = ListingOptions { file_extension: "parquet".to_owned(), format: Arc::new(ParquetFormat::default()), - partitions: vec![], + table_partition_cols: vec![], target_partitions: 2, collect_stat: true, }; @@ -445,13 +427,13 @@ mod tests { /// Check that the files listed by the table match the specified `output_partitioning` /// when the object store contains `files`. - async fn assert_partitioning( + async fn assert_list_files_for_scan_grouping( files: &[&str], table_prefix: &str, target_partitions: usize, output_partitioning: usize, ) -> Result<()> { - let mock_store: Arc = + let mock_store = TestObjectStore::new_arc(&files.iter().map(|f| (*f, 10)).collect::>()); let format = AvroFormat {}; @@ -459,23 +441,17 @@ mod tests { let opt = ListingOptions { file_extension: "".to_owned(), format: Arc::new(format), - partitions: vec![], + table_partition_cols: vec![], target_partitions, collect_stat: true, }; let schema = Schema::new(vec![Field::new("a", DataType::Boolean, false)]); - let table = ListingTable::new( - Arc::clone(&mock_store), - table_prefix.to_owned(), - Arc::new(schema), - opt, - ); + let table = + ListingTable::new(mock_store, table_prefix.to_owned(), Arc::new(schema), opt); - let (file_list, _) = table - .list_files_for_scan(mock_store, table_prefix, &[], None) - .await?; + let (file_list, _) = table.list_files_for_scan(&[], None).await?; assert_eq!(file_list.len(), output_partitioning); diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index 2e5330f16cb7..9f4f77f7ea28 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -33,24 +33,25 @@ use crate::arrow::datatypes::{Schema, SchemaRef}; use crate::error::Result; use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; use crate::physical_plan::{Accumulator, ColumnStatistics, Statistics}; +use crate::scalar::ScalarValue; use futures::StreamExt; use std::pin::Pin; -/// Get all files as well as the summary statistic -/// if the optional `limit` is provided, includes only sufficient files -/// needed to read up to `limit` number of rows +/// Get all files as well as the file level summary statistics (no statistic for partition columns). +/// If the optional `limit` is provided, includes only sufficient files. +/// Needed to read up to `limit` number of rows. 
/// TODO fix case where `num_rows` and `total_byte_size` are not defined (stat should be None instead of Some(0)) pub async fn get_statistics_with_limit( all_files: impl Stream>, - schema: SchemaRef, + file_schema: SchemaRef, limit: Option, ) -> Result<(Vec, Statistics)> { let mut result_files = vec![]; let mut total_byte_size = 0; - let mut null_counts = vec![0; schema.fields().len()]; + let mut null_counts = vec![0; file_schema.fields().len()]; let mut has_statistics = false; - let (mut max_values, mut min_values) = create_max_min_accs(&schema); + let (mut max_values, mut min_values) = create_max_min_accs(&file_schema); let mut num_rows = 0; let mut is_exact = true; @@ -103,7 +104,7 @@ pub async fn get_statistics_with_limit( let column_stats = if has_statistics { Some(get_col_stats( - &*schema, + &*file_schema, null_counts, &mut max_values, &mut min_values, @@ -128,8 +129,8 @@ pub async fn get_statistics_with_limit( pub struct PartitionedFile { /// Path for the file (e.g. URL, filesystem path, etc) pub file_meta: FileMeta, - // Values of partition columns to be appended to each row - // pub partition_value: Option>, + /// Values of partition columns to be appended to each row + pub partition_values: Vec, // We may include row group range here for a more fine-grained parallel execution } @@ -141,6 +142,7 @@ impl PartitionedFile { sized_file: SizedFile { path, size }, last_modified: None, }, + partition_values: vec![], } } } diff --git a/datafusion/src/datasource/object_store/local.rs b/datafusion/src/datasource/object_store/local.rs index 4f4dbefbca49..b2a2ddfa950b 100644 --- a/datafusion/src/datasource/object_store/local.rs +++ b/datafusion/src/datasource/object_store/local.rs @@ -27,6 +27,7 @@ use futures::{stream, AsyncRead, StreamExt}; use crate::datasource::object_store::{ FileMeta, FileMetaStream, ListEntryStream, ObjectReader, ObjectStore, }; +use crate::datasource::PartitionedFile; use crate::error::DataFusionError; use crate::error::Result; @@ -161,19 +162,22 @@ pub fn local_object_reader_stream(files: Vec) -> ObjectReaderStream { /// Helper method to convert a file location to a `LocalFileReader` pub fn local_object_reader(file: String) -> Arc { LocalFileSystem - .file_reader(local_file_meta(file).sized_file) + .file_reader(local_unpartitioned_file(file).file_meta.sized_file) .expect("File not found") } /// Helper method to fetch the file size and date at given path and create a `FileMeta` -pub fn local_file_meta(file: String) -> FileMeta { +pub fn local_unpartitioned_file(file: String) -> PartitionedFile { let metadata = fs::metadata(&file).expect("Local file metadata"); - FileMeta { - sized_file: SizedFile { - size: metadata.len(), - path: file, + PartitionedFile { + file_meta: FileMeta { + sized_file: SizedFile { + size: metadata.len(), + path: file, + }, + last_modified: metadata.modified().map(chrono::DateTime::from).ok(), }, - last_modified: metadata.modified().map(chrono::DateTime::from).ok(), + partition_values: vec![], } } diff --git a/datafusion/src/datasource/object_store/mod.rs b/datafusion/src/datasource/object_store/mod.rs index 61bc47dc462c..59e184103d2a 100644 --- a/datafusion/src/datasource/object_store/mod.rs +++ b/datafusion/src/datasource/object_store/mod.rs @@ -70,7 +70,7 @@ pub enum ListEntry { } /// The path and size of the file. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct SizedFile { /// Path of the file. It is relative to the current object /// store (it does not specify the `xx://` scheme). 
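A minimal illustrative sketch (not part of the patch) of how the new `partition_values` field on `PartitionedFile` can be populated, assuming the public module paths shown elsewhere in this diff; the object-store path, size, and helper name below are made up for illustration:

use datafusion::datasource::object_store::{FileMeta, SizedFile};
use datafusion::datasource::PartitionedFile;
use datafusion::scalar::ScalarValue;

// Hypothetical example: a file found under a Hive-style partition directory,
// carrying the resolved value of its `date` partition column alongside the
// file metadata.
fn example_partitioned_file() -> PartitionedFile {
    PartitionedFile {
        file_meta: FileMeta {
            sized_file: SizedFile {
                // hypothetical object-store key, relative to the store
                path: "table/date=2021-10-26/part-0.parquet".to_owned(),
                size: 1024,
            },
            last_modified: None,
        },
        // one value per column declared in `ListingOptions::table_partition_cols`
        partition_values: vec![ScalarValue::Utf8(Some("2021-10-26".to_owned()))],
    }
}

In the patch, the listing side presumably fills these values from the partition directories it discovers, and `FileStream` then passes them to `PartitionColumnProjector::project` so that each record batch read from the file is extended with the corresponding partition columns.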
@@ -82,7 +82,7 @@ pub struct SizedFile { /// Description of a file as returned by the listing command of a /// given object store. The resulting path is relative to the /// object store that generated it. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct FileMeta { /// The path and size of the file. pub sized_file: SizedFile, diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 01484568616b..9be5038f47c9 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -221,7 +221,7 @@ impl ExecutionContext { .unwrap() .config .target_partitions, - partitions: vec![], + table_partition_cols: vec![], }; // TODO make schema in CreateExternalTable optional instead of empty @@ -442,7 +442,7 @@ impl ExecutionContext { collect_stat: true, file_extension: DEFAULT_PARQUET_EXTENSION.to_owned(), target_partitions, - partitions: vec![], + table_partition_cols: vec![], }; self.register_listing_table(name, uri, listing_options, None) diff --git a/datafusion/src/execution/options.rs b/datafusion/src/execution/options.rs index f0ed6f24c325..c6b5ff646ea3 100644 --- a/datafusion/src/execution/options.rs +++ b/datafusion/src/execution/options.rs @@ -108,7 +108,7 @@ impl<'a> CsvReadOptions<'a> { collect_stat: false, file_extension: self.file_extension.to_owned(), target_partitions, - partitions: vec![], + table_partition_cols: vec![], } } } @@ -143,7 +143,7 @@ impl<'a> AvroReadOptions<'a> { collect_stat: false, file_extension: self.file_extension.to_owned(), target_partitions, - partitions: vec![], + table_partition_cols: vec![], } } } diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 09c3a14513e5..693bf78fbe0e 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -277,7 +277,7 @@ impl LogicalPlanBuilder { collect_stat: true, file_extension: DEFAULT_PARQUET_EXTENSION.to_owned(), target_partitions, - partitions: vec![], + table_partition_cols: vec![], }; let path: String = path.into(); diff --git a/datafusion/src/physical_optimizer/repartition.rs b/datafusion/src/physical_optimizer/repartition.rs index ea7de7f39839..8ac9dadd9548 100644 --- a/datafusion/src/physical_optimizer/repartition.rs +++ b/datafusion/src/physical_optimizer/repartition.rs @@ -109,26 +109,28 @@ mod tests { use arrow::datatypes::Schema; use super::*; - use crate::datasource::object_store::local::LocalFileSystem; use crate::datasource::PartitionedFile; - use crate::physical_plan::file_format::ParquetExec; + use crate::physical_plan::file_format::{ParquetExec, PhysicalPlanConfig}; use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::Statistics; use crate::test::object_store::TestObjectStore; #[test] fn added_repartition_to_single_partition() -> Result<()> { - let schema = Arc::new(Schema::empty()); + let file_schema = Arc::new(Schema::empty()); let parquet_project = ProjectionExec::try_new( vec![], Arc::new(ParquetExec::new( - TestObjectStore::new_arc(&[("x", 100)]), - vec![vec![PartitionedFile::new("x".to_string(), 100)]], - Statistics::default(), - schema, - None, - None, - 2048, + PhysicalPlanConfig { + object_store: TestObjectStore::new_arc(&[("x", 100)]), + file_schema, + file_groups: vec![vec![PartitionedFile::new("x".to_string(), 100)]], + statistics: Statistics::default(), + projection: None, + batch_size: 2048, + limit: None, + table_partition_cols: vec![], + }, None, )), )?; @@ -152,19 +154,25 @@ mod tests { #[test] fn 
repartition_deepest_node() -> Result<()> { - let schema = Arc::new(Schema::empty()); + let file_schema = Arc::new(Schema::empty()); let parquet_project = ProjectionExec::try_new( vec![], Arc::new(ProjectionExec::try_new( vec![], Arc::new(ParquetExec::new( - Arc::new(LocalFileSystem {}), - vec![vec![PartitionedFile::new("x".to_string(), 100)]], - Statistics::default(), - schema, - None, - None, - 2048, + PhysicalPlanConfig { + object_store: TestObjectStore::new_arc(&[("x", 100)]), + file_schema, + file_groups: vec![vec![PartitionedFile::new( + "x".to_string(), + 100, + )]], + statistics: Statistics::default(), + projection: None, + batch_size: 2048, + limit: None, + table_partition_cols: vec![], + }, None, )), )?), diff --git a/datafusion/src/physical_plan/coalesce_partitions.rs b/datafusion/src/physical_plan/coalesce_partitions.rs index 1fd18d2c4f37..9c133def8209 100644 --- a/datafusion/src/physical_plan/coalesce_partitions.rs +++ b/datafusion/src/physical_plan/coalesce_partitions.rs @@ -207,7 +207,7 @@ mod tests { use super::*; use crate::datasource::object_store::local::LocalFileSystem; - use crate::physical_plan::file_format::CsvExec; + use crate::physical_plan::file_format::{CsvExec, PhysicalPlanConfig}; use crate::physical_plan::{collect, common}; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; use crate::test::{self, assert_is_pending}; @@ -220,15 +220,18 @@ mod tests { let (_, files) = test::create_partitioned_csv("aggregate_test_100.csv", num_partitions)?; let csv = CsvExec::new( - Arc::new(LocalFileSystem {}), - files, - Statistics::default(), - schema, + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema: schema, + file_groups: files, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, ); // input should have 4 partitions diff --git a/datafusion/src/physical_plan/file_format/avro.rs b/datafusion/src/physical_plan/file_format/avro.rs index 2420040c08af..b50c0a082686 100644 --- a/datafusion/src/physical_plan/file_format/avro.rs +++ b/datafusion/src/physical_plan/file_format/avro.rs @@ -18,13 +18,11 @@ //! 
Execution plan for reading line-delimited Avro files #[cfg(feature = "avro")] use crate::avro_to_arrow; -use crate::datasource::object_store::ObjectStore; -use crate::datasource::PartitionedFile; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, Statistics, }; -use arrow::datatypes::{Schema, SchemaRef}; +use arrow::datatypes::SchemaRef; #[cfg(feature = "avro")] use arrow::error::ArrowError; @@ -34,68 +32,30 @@ use std::sync::Arc; #[cfg(feature = "avro")] use super::file_stream::{BatchIter, FileStream}; +use super::PhysicalPlanConfig; /// Execution plan for scanning Avro data source #[derive(Debug, Clone)] pub struct AvroExec { - object_store: Arc, - file_groups: Vec>, - statistics: Statistics, - file_schema: SchemaRef, - projection: Option>, + base_config: PhysicalPlanConfig, + projected_statistics: Statistics, projected_schema: SchemaRef, - batch_size: usize, - limit: Option, } impl AvroExec { - /// Create a new Avro reader execution plan provided file list and schema - pub fn new( - object_store: Arc, - file_groups: Vec>, - statistics: Statistics, - file_schema: SchemaRef, - projection: Option>, - batch_size: usize, - limit: Option, - ) -> Self { - let projected_schema = match &projection { - None => Arc::clone(&file_schema), - Some(p) => Arc::new(Schema::new( - p.iter().map(|i| file_schema.field(*i).clone()).collect(), - )), - }; + /// Create a new Avro reader execution plan provided base configurations + pub fn new(base_config: PhysicalPlanConfig) -> Self { + let (projected_schema, projected_statistics) = base_config.project(); Self { - object_store, - file_groups, - statistics, - file_schema, - projection, + base_config, projected_schema, - batch_size, - limit, + projected_statistics, } } - /// List of data files - pub fn file_groups(&self) -> &[Vec] { - &self.file_groups - } - /// The schema before projection - pub fn file_schema(&self) -> &SchemaRef { - &self.file_schema - } - /// Optional projection for which columns to load - pub fn projection(&self) -> &Option> { - &self.projection - } - /// Batch size - pub fn batch_size(&self) -> usize { - self.batch_size - } - /// Limit in nr. of rows - pub fn limit(&self) -> Option { - self.limit + /// Ref to the base configs + pub fn base_config(&self) -> &PhysicalPlanConfig { + &self.base_config } } @@ -110,7 +70,7 @@ impl ExecutionPlan for AvroExec { } fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.file_groups.len()) + Partitioning::UnknownPartitioning(self.base_config.file_groups.len()) } fn children(&self) -> Vec> { @@ -140,15 +100,10 @@ impl ExecutionPlan for AvroExec { #[cfg(feature = "avro")] async fn execute(&self, partition: usize) -> Result { - let proj = self.projection.as_ref().map(|p| { - p.iter() - .map(|col_idx| self.file_schema.field(*col_idx).name()) - .cloned() - .collect() - }); + let proj = self.base_config.projected_file_column_names(); - let batch_size = self.batch_size; - let file_schema = Arc::clone(&self.file_schema); + let batch_size = self.base_config.batch_size; + let file_schema = Arc::clone(&self.base_config.file_schema); // The avro reader cannot limit the number of records, so `remaining` is ignored. 
let fun = move |file, _remaining: &Option| { @@ -167,11 +122,12 @@ impl ExecutionPlan for AvroExec { }; Ok(Box::pin(FileStream::new( - Arc::clone(&self.object_store), - self.file_groups[partition].clone(), + Arc::clone(&self.base_config.object_store), + self.base_config.file_groups[partition].clone(), fun, Arc::clone(&self.projected_schema), - self.limit, + self.base_config.limit, + self.base_config.table_partition_cols.clone(), ))) } @@ -185,16 +141,16 @@ impl ExecutionPlan for AvroExec { write!( f, "AvroExec: files={}, batch_size={}, limit={:?}", - super::FileGroupsDisplay(&self.file_groups), - self.batch_size, - self.limit, + super::FileGroupsDisplay(&self.base_config.file_groups), + self.base_config.batch_size, + self.base_config.limit, ) } } } fn statistics(&self) -> Statistics { - self.statistics.clone() + self.projected_statistics.clone() } } @@ -202,45 +158,59 @@ impl ExecutionPlan for AvroExec { #[cfg(feature = "avro")] mod tests { + use crate::datasource::file_format::{avro::AvroFormat, FileFormat}; use crate::datasource::object_store::local::{ - local_file_meta, local_object_reader_stream, LocalFileSystem, + local_object_reader_stream, local_unpartitioned_file, LocalFileSystem, }; + use crate::scalar::ScalarValue; + use futures::StreamExt; use super::*; #[tokio::test] - async fn test() -> Result<()> { - use futures::StreamExt; - - use crate::datasource::file_format::{avro::AvroFormat, FileFormat}; - + async fn avro_exec_without_partition() -> Result<()> { let testdata = crate::test_util::arrow_test_data(); let filename = format!("{}/avro/alltypes_plain.avro", testdata); - let avro_exec = AvroExec::new( - Arc::new(LocalFileSystem {}), - vec![vec![PartitionedFile { - file_meta: local_file_meta(filename.clone()), - }]], - Statistics::default(), - AvroFormat {} + let avro_exec = AvroExec::new(PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_groups: vec![vec![local_unpartitioned_file(filename.clone())]], + file_schema: AvroFormat {} .infer_schema(local_object_reader_stream(vec![filename])) .await?, - Some(vec![0, 1, 2]), - 1024, - None, - ); + statistics: Statistics::default(), + projection: Some(vec![0, 1, 2]), + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }); assert_eq!(avro_exec.output_partitioning().partition_count(), 1); - let mut results = avro_exec.execute(0).await?; - let batch = results.next().await.unwrap()?; + let mut results = avro_exec.execute(0).await.expect("plan execution failed"); + let batch = results + .next() + .await + .expect("plan iterator empty") + .expect("plan iterator returned an error"); - assert_eq!(8, batch.num_rows()); - assert_eq!(3, batch.num_columns()); + let expected = vec![ + "+----+----------+-------------+", + "| id | bool_col | tinyint_col |", + "+----+----------+-------------+", + "| 4 | true | 0 |", + "| 5 | false | 1 |", + "| 6 | true | 0 |", + "| 7 | false | 1 |", + "| 2 | true | 0 |", + "| 3 | false | 1 |", + "| 0 | true | 0 |", + "| 1 | false | 1 |", + "+----+----------+-------------+", + ]; - let schema = batch.schema(); - let field_names: Vec<&str> = - schema.fields().iter().map(|f| f.name().as_str()).collect(); - assert_eq!(vec!["id", "bool_col", "tinyint_col"], field_names); + crate::assert_batches_eq!(expected, &[batch]); + + let batch = results.next().await; + assert!(batch.is_none()); let batch = results.next().await; assert!(batch.is_none()); @@ -248,6 +218,57 @@ mod tests { let batch = results.next().await; assert!(batch.is_none()); + Ok(()) + } + + #[tokio::test] + async fn 
avro_exec_with_partition() -> Result<()> { + let testdata = crate::test_util::arrow_test_data(); + let filename = format!("{}/avro/alltypes_plain.avro", testdata); + let mut partitioned_file = local_unpartitioned_file(filename.clone()); + partitioned_file.partition_values = + vec![ScalarValue::Utf8(Some("2021-10-26".to_owned()))]; + let file_schema = AvroFormat {} + .infer_schema(local_object_reader_stream(vec![filename])) + .await?; + + let avro_exec = AvroExec::new(PhysicalPlanConfig { + // select specific columns of the files as well as the partitioning + // column which is supposed to be the last column in the table schema. + projection: Some(vec![0, 1, file_schema.fields().len(), 2]), + object_store: Arc::new(LocalFileSystem {}), + file_groups: vec![vec![partitioned_file]], + file_schema: file_schema, + statistics: Statistics::default(), + batch_size: 1024, + limit: None, + table_partition_cols: vec!["date".to_owned()], + }); + assert_eq!(avro_exec.output_partitioning().partition_count(), 1); + + let mut results = avro_exec.execute(0).await.expect("plan execution failed"); + let batch = results + .next() + .await + .expect("plan iterator empty") + .expect("plan iterator returned an error"); + + let expected = vec![ + "+----+----------+------------+-------------+", + "| id | bool_col | date | tinyint_col |", + "+----+----------+------------+-------------+", + "| 4 | true | 2021-10-26 | 0 |", + "| 5 | false | 2021-10-26 | 1 |", + "| 6 | true | 2021-10-26 | 0 |", + "| 7 | false | 2021-10-26 | 1 |", + "| 2 | true | 2021-10-26 | 0 |", + "| 3 | false | 2021-10-26 | 1 |", + "| 0 | true | 2021-10-26 | 0 |", + "| 1 | false | 2021-10-26 | 1 |", + "+----+----------+------------+-------------+", + ]; + crate::assert_batches_eq!(expected, &[batch]); + let batch = results.next().await; assert!(batch.is_none()); diff --git a/datafusion/src/physical_plan/file_format/csv.rs b/datafusion/src/physical_plan/file_format/csv.rs index fc82c8fd272e..0057e9e811ab 100644 --- a/datafusion/src/physical_plan/file_format/csv.rs +++ b/datafusion/src/physical_plan/file_format/csv.rs @@ -17,81 +17,48 @@ //! 
Execution plan for reading CSV files -use crate::datasource::object_store::ObjectStore; -use crate::datasource::PartitionedFile; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, Statistics, }; use arrow::csv; -use arrow::datatypes::{Schema, SchemaRef}; +use arrow::datatypes::SchemaRef; use std::any::Any; use std::sync::Arc; use async_trait::async_trait; use super::file_stream::{BatchIter, FileStream}; +use super::PhysicalPlanConfig; /// Execution plan for scanning a CSV file #[derive(Debug, Clone)] pub struct CsvExec { - object_store: Arc, - file_groups: Vec>, - /// Schema representing the CSV file - file_schema: SchemaRef, - /// Schema after the projection has been applied + base_config: PhysicalPlanConfig, + projected_statistics: Statistics, projected_schema: SchemaRef, - statistics: Statistics, has_header: bool, delimiter: u8, - projection: Option>, - batch_size: usize, - limit: Option, } impl CsvExec { - /// Create a new CSV reader execution plan provided file list and schema - #[allow(clippy::too_many_arguments)] - pub fn new( - object_store: Arc, - file_groups: Vec>, - statistics: Statistics, - file_schema: SchemaRef, - has_header: bool, - delimiter: u8, - projection: Option>, - batch_size: usize, - limit: Option, - ) -> Self { - let projected_schema = match &projection { - None => Arc::clone(&file_schema), - Some(p) => Arc::new(Schema::new( - p.iter().map(|i| file_schema.field(*i).clone()).collect(), - )), - }; + /// Create a new CSV reader execution plan provided base and specific configurations + pub fn new(base_config: PhysicalPlanConfig, has_header: bool, delimiter: u8) -> Self { + let (projected_schema, projected_statistics) = base_config.project(); Self { - object_store, - file_groups, - file_schema, - statistics, + base_config, + projected_schema, + projected_statistics, has_header, delimiter, - projection, - projected_schema, - batch_size, - limit, } } - /// List of data files - pub fn file_groups(&self) -> &[Vec] { - &self.file_groups - } - /// The schema before projection - pub fn file_schema(&self) -> &SchemaRef { - &self.file_schema + /// Ref to the base configs + pub fn base_config(&self) -> &PhysicalPlanConfig { + &self.base_config } /// true if the first line of each file is a header pub fn has_header(&self) -> bool { @@ -101,18 +68,6 @@ impl CsvExec { pub fn delimiter(&self) -> u8 { self.delimiter } - /// Optional projection for which columns to load - pub fn projection(&self) -> &Option> { - &self.projection - } - /// Batch size - pub fn batch_size(&self) -> usize { - self.batch_size - } - /// Limit in nr. 
of rows - pub fn limit(&self) -> Option { - self.limit - } } #[async_trait] @@ -129,7 +84,7 @@ impl ExecutionPlan for CsvExec { /// Get the output partitioning of this plan fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.file_groups.len()) + Partitioning::UnknownPartitioning(self.base_config.file_groups.len()) } fn children(&self) -> Vec> { @@ -152,9 +107,9 @@ impl ExecutionPlan for CsvExec { } async fn execute(&self, partition: usize) -> Result { - let batch_size = self.batch_size; - let file_schema = Arc::clone(&self.file_schema); - let projection = self.projection.clone(); + let batch_size = self.base_config.batch_size; + let file_schema = Arc::clone(&self.base_config.file_schema); + let file_projection = self.base_config.file_column_projection_indices(); let has_header = self.has_header; let delimiter = self.delimiter; let start_line = if has_header { 1 } else { 0 }; @@ -168,16 +123,17 @@ impl ExecutionPlan for CsvExec { Some(delimiter), batch_size, bounds, - projection.clone(), + file_projection.clone(), )) as BatchIter }; Ok(Box::pin(FileStream::new( - Arc::clone(&self.object_store), - self.file_groups[partition].clone(), + Arc::clone(&self.base_config.object_store), + self.base_config.file_groups[partition].clone(), fun, Arc::clone(&self.projected_schema), - self.limit, + self.base_config.limit, + self.base_config.table_partition_cols.clone(), ))) } @@ -191,17 +147,17 @@ impl ExecutionPlan for CsvExec { write!( f, "CsvExec: files={}, has_header={}, batch_size={}, limit={:?}", - super::FileGroupsDisplay(&self.file_groups), + super::FileGroupsDisplay(&self.base_config.file_groups), self.has_header, - self.batch_size, - self.limit, + self.base_config.batch_size, + self.base_config.limit, ) } } } fn statistics(&self) -> Statistics { - self.statistics.clone() + self.projected_statistics.clone() } } @@ -209,74 +165,153 @@ impl ExecutionPlan for CsvExec { mod tests { use super::*; use crate::{ - datasource::object_store::local::{local_file_meta, LocalFileSystem}, + datasource::object_store::local::{local_unpartitioned_file, LocalFileSystem}, + scalar::ScalarValue, test::aggr_test_schema, }; use futures::StreamExt; #[tokio::test] async fn csv_exec_with_projection() -> Result<()> { - let schema = aggr_test_schema(); + let file_schema = aggr_test_schema(); let testdata = crate::test_util::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let csv = CsvExec::new( - Arc::new(LocalFileSystem {}), - vec![vec![PartitionedFile { - file_meta: local_file_meta(path), - }]], - Statistics::default(), - schema, + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema, + file_groups: vec![vec![local_unpartitioned_file(path)]], + statistics: Statistics::default(), + projection: Some(vec![0, 2, 4]), + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - Some(vec![0, 2, 4]), - 1024, - None, ); - assert_eq!(13, csv.file_schema.fields().len()); + assert_eq!(13, csv.base_config.file_schema.fields().len()); assert_eq!(3, csv.projected_schema.fields().len()); assert_eq!(3, csv.schema().fields().len()); + let mut stream = csv.execute(0).await?; let batch = stream.next().await.unwrap()?; assert_eq!(3, batch.num_columns()); - let batch_schema = batch.schema(); - assert_eq!(3, batch_schema.fields().len()); - assert_eq!("c1", batch_schema.field(0).name()); - assert_eq!("c3", batch_schema.field(1).name()); - assert_eq!("c5", batch_schema.field(2).name()); + 
assert_eq!(100, batch.num_rows()); + + // slice of the first 5 lines + let expected = vec![ + "+----+-----+------------+", + "| c1 | c3 | c5 |", + "+----+-----+------------+", + "| c | 1 | 2033001162 |", + "| d | -40 | 706441268 |", + "| b | 29 | 994303988 |", + "| a | -85 | 1171968280 |", + "| b | -82 | 1824882165 |", + "+----+-----+------------+", + ]; + + crate::assert_batches_eq!(expected, &[batch.slice(0, 5)]); Ok(()) } #[tokio::test] - async fn csv_exec_without_projection() -> Result<()> { - let schema = aggr_test_schema(); + async fn csv_exec_with_limit() -> Result<()> { + let file_schema = aggr_test_schema(); let testdata = crate::test_util::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let csv = CsvExec::new( - Arc::new(LocalFileSystem {}), - vec![vec![PartitionedFile { - file_meta: local_file_meta(path), - }]], - Statistics::default(), - schema, + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema, + file_groups: vec![vec![local_unpartitioned_file(path)]], + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: Some(5), + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, ); - assert_eq!(13, csv.file_schema.fields().len()); + assert_eq!(13, csv.base_config.file_schema.fields().len()); assert_eq!(13, csv.projected_schema.fields().len()); assert_eq!(13, csv.schema().fields().len()); + let mut it = csv.execute(0).await?; let batch = it.next().await.unwrap()?; assert_eq!(13, batch.num_columns()); - let batch_schema = batch.schema(); - assert_eq!(13, batch_schema.fields().len()); - assert_eq!("c1", batch_schema.field(0).name()); - assert_eq!("c2", batch_schema.field(1).name()); - assert_eq!("c3", batch_schema.field(2).name()); + assert_eq!(5, batch.num_rows()); + + let expected = vec![ + "+----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+", + "| c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 | c9 | c10 | c11 | c12 | c13 |", + "+----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+", + "| c | 2 | 1 | 18109 | 2033001162 | -6513304855495910254 | 25 | 43062 | 1491205016 | 5863949479783605708 | 0.110830784 | 0.9294097332465232 | 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW |", + "| d | 5 | -40 | 22614 | 706441268 | -7542719935673075327 | 155 | 14337 | 3373581039 | 11720144131976083864 | 0.69632107 | 0.3114712539863804 | C2GT5KVyOPZpgKVl110TyZO0NcJ434 |", + "| b | 1 | 29 | -18218 | 994303988 | 5983957848665088916 | 204 | 9489 | 3275293996 | 14857091259186476033 | 0.53840446 | 0.17909035118828576 | AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz |", + "| a | 1 | -85 | -15154 | 1171968280 | 1919439543497968449 | 77 | 52286 | 774637006 | 12101411955859039553 | 0.12285209 | 0.6864391962767343 | 0keZ5G8BffGwgF2RwQD59TFzMStxCB |", + "| b | 5 | -82 | 22080 | 1824882165 | 7373730676428214987 | 208 | 34331 | 3342719438 | 3330177516592499461 | 0.82634634 | 0.40975383525297016 | Ig1QcuKsjHXkproePdERo2w0mYzIqd |", + "+----+----+-----+--------+------------+----------------------+-----+-------+------------+----------------------+-------------+---------------------+--------------------------------+", + ]; + + crate::assert_batches_eq!(expected, &[batch]); + + Ok(()) + } + + #[tokio::test] + async fn csv_exec_with_partition() -> Result<()> { + let 
file_schema = aggr_test_schema(); + let testdata = crate::test_util::arrow_test_data(); + let filename = "aggregate_test_100.csv"; + // we don't have `/date=xx/` in the path but that is ok because + // partitions are resolved during scan anyway + let path = format!("{}/csv/{}", testdata, filename); + let mut partitioned_file = local_unpartitioned_file(path); + partitioned_file.partition_values = + vec![ScalarValue::Utf8(Some("2021-10-26".to_owned()))]; + let csv = CsvExec::new( + PhysicalPlanConfig { + // we should be able to project on the partition column + // wich is supposed to be after the file fields + projection: Some(vec![0, file_schema.fields().len()]), + object_store: Arc::new(LocalFileSystem {}), + file_schema, + file_groups: vec![vec![partitioned_file]], + statistics: Statistics::default(), + batch_size: 1024, + limit: None, + table_partition_cols: vec!["date".to_owned()], + }, + true, + b',', + ); + assert_eq!(13, csv.base_config.file_schema.fields().len()); + assert_eq!(2, csv.projected_schema.fields().len()); + assert_eq!(2, csv.schema().fields().len()); + + let mut it = csv.execute(0).await?; + let batch = it.next().await.unwrap()?; + assert_eq!(2, batch.num_columns()); + assert_eq!(100, batch.num_rows()); + + // slice of the first 5 lines + let expected = vec![ + "+----+------------+", + "| c1 | date |", + "+----+------------+", + "| c | 2021-10-26 |", + "| d | 2021-10-26 |", + "| b | 2021-10-26 |", + "| a | 2021-10-26 |", + "| b | 2021-10-26 |", + "+----+------------+", + ]; + crate::assert_batches_eq!(expected, &[batch.slice(0, 5)]); Ok(()) } } diff --git a/datafusion/src/physical_plan/file_format/file_stream.rs b/datafusion/src/physical_plan/file_format/file_stream.rs index 55a66f46cf48..958b1721bb39 100644 --- a/datafusion/src/physical_plan/file_format/file_stream.rs +++ b/datafusion/src/physical_plan/file_format/file_stream.rs @@ -23,8 +23,8 @@ use crate::{ datasource::{object_store::ObjectStore, PartitionedFile}, - error::Result as DataFusionResult, physical_plan::RecordBatchStream, + scalar::ScalarValue, }; use arrow::{ datatypes::SchemaRef, @@ -40,8 +40,9 @@ use std::{ task::{Context, Poll}, }; -pub type FileIter = - Box>> + Send + Sync>; +use super::PartitionColumnProjector; + +pub type FileIter = Box + Send + Sync>; pub type BatchIter = Box> + Send + Sync>; /// A closure that creates a file format reader (iterator over `RecordBatch`) from a `Read` object @@ -63,10 +64,13 @@ impl FormatReaderOpener for T where pub struct FileStream { /// An iterator over record batches of the last file returned by file_iter batch_iter: BatchIter, - /// An iterator over input files + /// Partitioning column values for the current batch_iter + partition_values: Vec, + /// An iterator over input files. file_iter: FileIter, - /// The stream schema (file schema after projection) - schema: SchemaRef, + /// The stream schema (file schema including partition columns and after + /// projection). + projected_schema: SchemaRef, /// The remaining number of records to parse, None if no limit remain: Option, /// A closure that takes a reader and an optional remaining number of lines @@ -74,6 +78,10 @@ pub struct FileStream { /// is not capable of limiting the number of records in the last batch, the file /// stream will take care of truncating it. file_reader: F, + /// The partition column projector + pc_projector: PartitionColumnProjector, + /// the store from which to source the files. 
+ object_store: Arc, } impl FileStream { @@ -81,34 +89,48 @@ impl FileStream { object_store: Arc, files: Vec, file_reader: F, - schema: SchemaRef, + projected_schema: SchemaRef, limit: Option, + table_partition_cols: Vec, ) -> Self { - let read_iter = files.into_iter().map(move |f| -> DataFusionResult<_> { - object_store - .file_reader(f.file_meta.sized_file)? - .sync_reader() - }); + let pc_projector = PartitionColumnProjector::new( + Arc::clone(&projected_schema), + &table_partition_cols, + ); Self { - file_iter: Box::new(read_iter), + file_iter: Box::new(files.into_iter()), batch_iter: Box::new(iter::empty()), + partition_values: vec![], remain: limit, - schema, + projected_schema, file_reader, + pc_projector, + object_store, } } - /// Acts as a flat_map of record batches over files. + /// Acts as a flat_map of record batches over files. Adds the partitioning + /// Columns to the returned record batches. fn next_batch(&mut self) -> Option> { match self.batch_iter.next() { - Some(batch) => Some(batch), + Some(Ok(batch)) => { + Some(self.pc_projector.project(batch, &self.partition_values)) + } + Some(Err(e)) => Some(Err(e)), None => match self.file_iter.next() { - Some(Ok(f)) => { - self.batch_iter = (self.file_reader)(f, &self.remain); - self.next_batch() + Some(f) => { + self.partition_values = f.partition_values; + self.object_store + .file_reader(f.file_meta.sized_file) + .and_then(|r| r.sync_reader()) + .map_err(|e| ArrowError::ExternalError(Box::new(e))) + .and_then(|f| { + self.batch_iter = (self.file_reader)(f, &self.remain); + self.next_batch().transpose() + }) + .transpose() } - Some(Err(e)) => Some(Err(ArrowError::ExternalError(Box::new(e)))), None => None, }, } @@ -157,7 +179,7 @@ impl Stream for FileStream { impl RecordBatchStream for FileStream { fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) + Arc::clone(&self.projected_schema) } } @@ -191,6 +213,7 @@ mod tests { reader, source_schema, limit, + vec![], ); file_stream diff --git a/datafusion/src/physical_plan/file_format/json.rs b/datafusion/src/physical_plan/file_format/json.rs index f9dde67fea2d..9032eb9d5e5d 100644 --- a/datafusion/src/physical_plan/file_format/json.rs +++ b/datafusion/src/physical_plan/file_format/json.rs @@ -18,61 +18,34 @@ //! 
Execution plan for reading line-delimited JSON files use async_trait::async_trait; -use crate::datasource::object_store::ObjectStore; -use crate::datasource::PartitionedFile; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, Statistics, }; -use arrow::{ - datatypes::{Schema, SchemaRef}, - json, -}; +use arrow::{datatypes::SchemaRef, json}; use std::any::Any; use std::sync::Arc; use super::file_stream::{BatchIter, FileStream}; +use super::PhysicalPlanConfig; /// Execution plan for scanning NdJson data source #[derive(Debug, Clone)] pub struct NdJsonExec { - object_store: Arc, - file_groups: Vec>, - statistics: Statistics, - file_schema: SchemaRef, - projection: Option>, + base_config: PhysicalPlanConfig, + projected_statistics: Statistics, projected_schema: SchemaRef, - batch_size: usize, - limit: Option, } impl NdJsonExec { - /// Create a new JSON reader execution plan provided file list and schema - pub fn new( - object_store: Arc, - file_groups: Vec>, - statistics: Statistics, - file_schema: SchemaRef, - projection: Option>, - batch_size: usize, - limit: Option, - ) -> Self { - let projected_schema = match &projection { - None => Arc::clone(&file_schema), - Some(p) => Arc::new(Schema::new( - p.iter().map(|i| file_schema.field(*i).clone()).collect(), - )), - }; + /// Create a new JSON reader execution plan provided base configurations + pub fn new(base_config: PhysicalPlanConfig) -> Self { + let (projected_schema, projected_statistics) = base_config.project(); Self { - object_store, - file_groups, - statistics, - file_schema, - projection, + base_config, projected_schema, - batch_size, - limit, + projected_statistics, } } } @@ -88,7 +61,7 @@ impl ExecutionPlan for NdJsonExec { } fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.file_groups.len()) + Partitioning::UnknownPartitioning(self.base_config.file_groups.len()) } fn children(&self) -> Vec> { @@ -110,15 +83,10 @@ impl ExecutionPlan for NdJsonExec { } async fn execute(&self, partition: usize) -> Result { - let proj = self.projection.as_ref().map(|p| { - p.iter() - .map(|col_idx| self.file_schema.field(*col_idx).name()) - .cloned() - .collect() - }); + let proj = self.base_config.projected_file_column_names(); - let batch_size = self.batch_size; - let file_schema = Arc::clone(&self.file_schema); + let batch_size = self.base_config.batch_size; + let file_schema = Arc::clone(&self.base_config.file_schema); // The json reader cannot limit the number of records, so `remaining` is ignored. 
let fun = move |file, _remaining: &Option| { @@ -131,11 +99,12 @@ impl ExecutionPlan for NdJsonExec { }; Ok(Box::pin(FileStream::new( - Arc::clone(&self.object_store), - self.file_groups[partition].clone(), + Arc::clone(&self.base_config.object_store), + self.base_config.file_groups[partition].clone(), fun, Arc::clone(&self.projected_schema), - self.limit, + self.base_config.limit, + self.base_config.table_partition_cols.clone(), ))) } @@ -149,16 +118,16 @@ impl ExecutionPlan for NdJsonExec { write!( f, "JsonExec: batch_size={}, limit={:?}, files={}", - self.batch_size, - self.limit, - super::FileGroupsDisplay(&self.file_groups), + self.base_config.batch_size, + self.base_config.limit, + super::FileGroupsDisplay(&self.base_config.file_groups), ) } } } fn statistics(&self) -> Statistics { - self.statistics.clone() + self.projected_statistics.clone() } } @@ -169,7 +138,7 @@ mod tests { use crate::datasource::{ file_format::{json::JsonFormat, FileFormat}, object_store::local::{ - local_file_meta, local_object_reader_stream, LocalFileSystem, + local_object_reader_stream, local_unpartitioned_file, LocalFileSystem, }, }; @@ -187,17 +156,16 @@ mod tests { async fn nd_json_exec_file_without_projection() -> Result<()> { use arrow::datatypes::DataType; let path = format!("{}/1.json", TEST_DATA_BASE); - let exec = NdJsonExec::new( - Arc::new(LocalFileSystem {}), - vec![vec![PartitionedFile { - file_meta: local_file_meta(path.clone()), - }]], - Default::default(), - infer_schema(path).await?, - None, - 1024, - Some(3), - ); + let exec = NdJsonExec::new(PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_groups: vec![vec![local_unpartitioned_file(path.clone())]], + file_schema: infer_schema(path).await?, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: Some(3), + table_partition_cols: vec![], + }); // TODO: this is not where schema inference should be tested @@ -242,17 +210,16 @@ mod tests { #[tokio::test] async fn nd_json_exec_file_projection() -> Result<()> { let path = format!("{}/1.json", TEST_DATA_BASE); - let exec = NdJsonExec::new( - Arc::new(LocalFileSystem {}), - vec![vec![PartitionedFile { - file_meta: local_file_meta(path.clone()), - }]], - Default::default(), - infer_schema(path).await?, - Some(vec![0, 2]), - 1024, - None, - ); + let exec = NdJsonExec::new(PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_groups: vec![vec![local_unpartitioned_file(path.clone())]], + file_schema: infer_schema(path).await?, + statistics: Statistics::default(), + projection: Some(vec![0, 2]), + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }); let inferred_schema = exec.schema(); assert_eq!(inferred_schema.fields().len(), 2); diff --git a/datafusion/src/physical_plan/file_format/mod.rs b/datafusion/src/physical_plan/file_format/mod.rs index b0b690519eca..d460e9830fe5 100644 --- a/datafusion/src/physical_plan/file_format/mod.rs +++ b/datafusion/src/physical_plan/file_format/mod.rs @@ -24,19 +24,134 @@ mod json; mod parquet; pub use self::parquet::ParquetExec; +use arrow::{ + array::{ArrayData, ArrayRef, DictionaryArray, UInt8BufferBuilder}, + buffer::Buffer, + datatypes::{DataType, Field, Schema, SchemaRef, UInt8Type}, + error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch, +}; pub use avro::AvroExec; pub use csv::CsvExec; pub use json::NdJsonExec; -use crate::datasource::PartitionedFile; -use std::fmt::{Display, Formatter, Result}; +use crate::{ + datasource::{object_store::ObjectStore, 
PartitionedFile}, + scalar::ScalarValue, +}; +use std::{ + collections::HashMap, + fmt::{Display, Formatter, Result as FmtResult}, + sync::Arc, + vec, +}; + +use super::{ColumnStatistics, Statistics}; + +lazy_static! { + /// The datatype used for all partitioning columns for now + pub static ref DEFAULT_PARTITION_COLUMN_DATATYPE: DataType = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)); +} + +/// The base configurations to provide when creating a physical plan for +/// any given file format. +#[derive(Debug, Clone)] +pub struct PhysicalPlanConfig { + /// Store from which the `files` should be fetched + pub object_store: Arc, + /// Schema before projection. It contains the columns that are expected + /// to be in the files without the table partition columns. + pub file_schema: SchemaRef, + /// List of files to be processed, grouped into partitions + pub file_groups: Vec>, + /// Estimated overall statistics of the files, taking `filters` into account. + pub statistics: Statistics, + /// Columns on which to project the data. Indexes that are higher than the + /// number of columns of `file_schema` refer to `table_partition_cols`. + pub projection: Option>, + /// The maximum number of records per arrow column + pub batch_size: usize, + /// The minimum number of records required from this source plan + pub limit: Option, + /// The partitioning column names + pub table_partition_cols: Vec, +} + +impl PhysicalPlanConfig { + /// Project the schema and the statistics on the given column indices + fn project(&self) -> (SchemaRef, Statistics) { + if self.projection.is_none() && self.table_partition_cols.is_empty() { + return (Arc::clone(&self.file_schema), self.statistics.clone()); + } + + let proj_iter: Box> = match &self.projection { + Some(proj) => Box::new(proj.iter().copied()), + None => Box::new( + 0..(self.file_schema.fields().len() + self.table_partition_cols.len()), + ), + }; + + let mut table_fields = vec![]; + let mut table_cols_stats = vec![]; + for idx in proj_iter { + if idx < self.file_schema.fields().len() { + table_fields.push(self.file_schema.field(idx).clone()); + if let Some(file_cols_stats) = &self.statistics.column_statistics { + table_cols_stats.push(file_cols_stats[idx].clone()) + } else { + table_cols_stats.push(ColumnStatistics::default()) + } + } else { + let partition_idx = idx - self.file_schema.fields().len(); + table_fields.push(Field::new( + &self.table_partition_cols[partition_idx], + DEFAULT_PARTITION_COLUMN_DATATYPE.clone(), + false, + )); + // TODO provide accurate stat for partition column (#1186) + table_cols_stats.push(ColumnStatistics::default()) + } + } + + let table_stats = Statistics { + num_rows: self.statistics.num_rows, + is_exact: self.statistics.is_exact, + // TODO correct byte size? 
+ total_byte_size: None, + column_statistics: Some(table_cols_stats), + }; + + let table_schema = Arc::new(Schema::new(table_fields)); + + (table_schema, table_stats) + } + + fn projected_file_column_names(&self) -> Option> { + self.projection.as_ref().map(|p| { + p.iter() + .filter(|col_idx| **col_idx < self.file_schema.fields().len()) + .map(|col_idx| self.file_schema.field(*col_idx).name()) + .cloned() + .collect() + }) + } + + fn file_column_projection_indices(&self) -> Option> { + self.projection.as_ref().map(|p| { + p.iter() + .filter(|col_idx| **col_idx < self.file_schema.fields().len()) + .copied() + .collect() + }) + } +} /// A wrapper to customize partitioned file display #[derive(Debug)] struct FileGroupsDisplay<'a>(&'a [Vec]); impl<'a> Display for FileGroupsDisplay<'a> { - fn fmt(&self, f: &mut Formatter) -> Result { + fn fmt(&self, f: &mut Formatter) -> FmtResult { let parts: Vec<_> = self .0 .iter() @@ -50,3 +165,324 @@ impl<'a> Display for FileGroupsDisplay<'a> { write!(f, "[{}]", parts.join(", ")) } } + +/// A helper that projects partition columns into the file record batches. +/// +/// One interesting trick is the usage of a cache for the key buffers of the partition column +/// dictionaries. Indeed, the partition columns are constant, so the dictionaries that represent them +/// have all their keys equal to 0. This enables us to re-use the same "all-zero" buffer across batches, +/// which makes the space consumption of the partition columns O(batch_size) instead of O(record_count). +struct PartitionColumnProjector { + /// An Arrow buffer initialized to zeros that represents the key array of all partition + /// columns (partition columns are materialized by dictionary arrays with only one + /// value in the dictionary, thus all the keys are equal to zero). + key_buffer_cache: Option, + /// Mapping between the indexes in the list of partition columns and the target + /// schema. Sorted by index in the target schema so that we can iterate on it to + /// insert the partition columns in the target record batch. + projected_partition_indexes: Vec<(usize, usize)>, + /// The schema of the table once the projection was applied. 
+ projected_schema: SchemaRef, +} + +impl PartitionColumnProjector { + // Create a projector to insert the partitioning columns into batches read from files + // - projected_schema: the target schema with both file and partitioning columns + // - table_partition_cols: all the partitioning column names + fn new(projected_schema: SchemaRef, table_partition_cols: &[String]) -> Self { + let mut idx_map = HashMap::new(); + for (partition_idx, partition_name) in table_partition_cols.iter().enumerate() { + if let Ok(schema_idx) = projected_schema.index_of(partition_name) { + idx_map.insert(partition_idx, schema_idx); + } + } + + let mut projected_partition_indexes: Vec<_> = idx_map.into_iter().collect(); + projected_partition_indexes.sort_by(|(_, a), (_, b)| a.cmp(b)); + + Self { + projected_partition_indexes, + key_buffer_cache: None, + projected_schema, + } + } + + // Transform the batch read from the file by inserting the partitioning columns + // to the right positions as deduced from `projected_schema` + // - file_batch: batch read from the file, with internal projection applied + // - partition_values: the list of partition values, one for each partition column + fn project( + &mut self, + file_batch: RecordBatch, + partition_values: &[ScalarValue], + ) -> ArrowResult { + let expected_cols = + self.projected_schema.fields().len() - self.projected_partition_indexes.len(); + + if file_batch.columns().len() != expected_cols { + return Err(ArrowError::SchemaError(format!( + "Unexpected batch schema from file, expected {} cols but got {}", + expected_cols, + file_batch.columns().len() + ))); + } + + let mut cols = file_batch.columns().to_vec(); + for &(pidx, sidx) in &self.projected_partition_indexes { + cols.insert( + sidx, + create_dict_array( + &mut self.key_buffer_cache, + &partition_values[pidx], + file_batch.num_rows(), + ), + ) + } + RecordBatch::try_new(Arc::clone(&self.projected_schema), cols) + } +} + +fn create_dict_array( + key_buffer_cache: &mut Option, + val: &ScalarValue, + len: usize, +) -> ArrayRef { + // build value dictionary + let dict_vals = val.to_array(); + + // build keys array + let sliced_key_buffer = match key_buffer_cache { + Some(buf) if buf.len() >= len => buf.slice(buf.len() - len), + _ => { + let mut key_buffer_builder = UInt8BufferBuilder::new(len); + key_buffer_builder.advance(len); // keys are all 0 + key_buffer_cache.insert(key_buffer_builder.finish()).clone() + } + }; + + // create data type + let data_type = + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(val.get_datatype())); + + debug_assert_eq!(data_type, *DEFAULT_PARTITION_COLUMN_DATATYPE); + + // assemble pieces together + let mut builder = ArrayData::builder(data_type) + .len(len) + .add_buffer(sliced_key_buffer); + builder = builder.add_child_data(dict_vals.data().clone()); + Arc::new(DictionaryArray::::from(builder.build().unwrap())) +} + +#[cfg(test)] +mod tests { + use crate::test::{ + aggr_test_schema, build_table_i32, columns, object_store::TestObjectStore, + }; + + use super::*; + + #[test] + fn physical_plan_config_no_projection() { + let file_schema = aggr_test_schema(); + let conf = config_for_projection( + Arc::clone(&file_schema), + None, + Statistics::default(), + vec!["date".to_owned()], + ); + + let (proj_schema, proj_statistics) = conf.project(); + assert_eq!(proj_schema.fields().len(), file_schema.fields().len() + 1); + assert_eq!( + proj_schema.field(file_schema.fields().len()).name(), + "date", + "partition columns are the last columns" + ); + assert_eq!( + proj_statistics 
+ .column_statistics + .expect("projection creates column statistics") + .len(), + file_schema.fields().len() + 1 + ); + // TODO implement tests for partition column statistics once implemented + + let col_names = conf.projected_file_column_names(); + assert_eq!(col_names, None); + + let col_indices = conf.file_column_projection_indices(); + assert_eq!(col_indices, None); + } + + #[test] + fn physical_plan_config_with_projection() { + let file_schema = aggr_test_schema(); + let conf = config_for_projection( + Arc::clone(&file_schema), + Some(vec![file_schema.fields().len(), 0]), + Statistics { + num_rows: Some(10), + // assign the column index to distinct_count to help assert + // the source statistic after the projection + column_statistics: Some( + (0..file_schema.fields().len()) + .map(|i| ColumnStatistics { + distinct_count: Some(i), + ..Default::default() + }) + .collect(), + ), + ..Default::default() + }, + vec!["date".to_owned()], + ); + + let (proj_schema, proj_statistics) = conf.project(); + assert_eq!( + columns(&proj_schema), + vec!["date".to_owned(), "c1".to_owned()] + ); + let proj_stat_cols = proj_statistics + .column_statistics + .expect("projection creates column statistics"); + assert_eq!(proj_stat_cols.len(), 2); + // TODO implement tests for proj_stat_cols[0] once partition column + // statistics are implemented + assert_eq!(proj_stat_cols[1].distinct_count, Some(0)); + + let col_names = conf.projected_file_column_names(); + assert_eq!(col_names, Some(vec!["c1".to_owned()])); + + let col_indices = conf.file_column_projection_indices(); + assert_eq!(col_indices, Some(vec![0])); + } + + #[test] + fn partition_column_projector() { + let file_batch = build_table_i32( + ("a", &vec![0, 1, 2]), + ("b", &vec![-2, -1, 0]), + ("c", &vec![10, 11, 12]), + ); + let partition_cols = + vec!["year".to_owned(), "month".to_owned(), "day".to_owned()]; + // create a projected schema + let conf = config_for_projection( + file_batch.schema(), + // keep all cols from file and 2 from partitioning + Some(vec![ + 0, + 1, + 2, + file_batch.schema().fields().len(), + file_batch.schema().fields().len() + 2, + ]), + Statistics::default(), + partition_cols.clone(), + ); + let (proj_schema, _) = conf.project(); + // created a projector for that projected schema + let mut proj = PartitionColumnProjector::new(proj_schema, &partition_cols); + + // project first batch + let projected_batch = proj + .project( + // file_batch is ok here because we kept all the file cols in the projection + file_batch, + &[ + ScalarValue::Utf8(Some("2021".to_owned())), + ScalarValue::Utf8(Some("10".to_owned())), + ScalarValue::Utf8(Some("26".to_owned())), + ], + ) + .expect("Projection of partition columns into record batch failed"); + let expected = vec![ + "+---+----+----+------+-----+", + "| a | b | c | year | day |", + "+---+----+----+------+-----+", + "| 0 | -2 | 10 | 2021 | 26 |", + "| 1 | -1 | 11 | 2021 | 26 |", + "| 2 | 0 | 12 | 2021 | 26 |", + "+---+----+----+------+-----+", + ]; + crate::assert_batches_eq!(expected, &[projected_batch]); + + // project another batch that is larger than the previous one + let file_batch = build_table_i32( + ("a", &vec![5, 6, 7, 8, 9]), + ("b", &vec![-10, -9, -8, -7, -6]), + ("c", &vec![12, 13, 14, 15, 16]), + ); + let projected_batch = proj + .project( + // file_batch is ok here because we kept all the file cols in the projection + file_batch, + &[ + ScalarValue::Utf8(Some("2021".to_owned())), + ScalarValue::Utf8(Some("10".to_owned())), + ScalarValue::Utf8(Some("27".to_owned())), + ], 
+ ) + .expect("Projection of partition columns into record batch failed"); + let expected = vec![ + "+---+-----+----+------+-----+", + "| a | b | c | year | day |", + "+---+-----+----+------+-----+", + "| 5 | -10 | 12 | 2021 | 27 |", + "| 6 | -9 | 13 | 2021 | 27 |", + "| 7 | -8 | 14 | 2021 | 27 |", + "| 8 | -7 | 15 | 2021 | 27 |", + "| 9 | -6 | 16 | 2021 | 27 |", + "+---+-----+----+------+-----+", + ]; + crate::assert_batches_eq!(expected, &[projected_batch]); + + // project another batch that is smaller than the previous one + let file_batch = build_table_i32( + ("a", &vec![0, 1, 3]), + ("b", &vec![2, 3, 4]), + ("c", &vec![4, 5, 6]), + ); + let projected_batch = proj + .project( + // file_batch is ok here because we kept all the file cols in the projection + file_batch, + &[ + ScalarValue::Utf8(Some("2021".to_owned())), + ScalarValue::Utf8(Some("10".to_owned())), + ScalarValue::Utf8(Some("28".to_owned())), + ], + ) + .expect("Projection of partition columns into record batch failed"); + let expected = vec![ + "+---+---+---+------+-----+", + "| a | b | c | year | day |", + "+---+---+---+------+-----+", + "| 0 | 2 | 4 | 2021 | 28 |", + "| 1 | 3 | 5 | 2021 | 28 |", + "| 3 | 4 | 6 | 2021 | 28 |", + "+---+---+---+------+-----+", + ]; + crate::assert_batches_eq!(expected, &[projected_batch]); + } + + // sets default for configs that play no role in projections + fn config_for_projection( + file_schema: SchemaRef, + projection: Option>, + statistics: Statistics, + table_partition_cols: Vec, + ) -> PhysicalPlanConfig { + PhysicalPlanConfig { + batch_size: 1024, + file_schema, + file_groups: vec![vec![]], + limit: None, + object_store: TestObjectStore::new_arc(&[]), + projection, + statistics, + table_partition_cols, + } + } +} diff --git a/datafusion/src/physical_plan/file_format/parquet.rs b/datafusion/src/physical_plan/file_format/parquet.rs index d07d2a945e8e..e7980d9aa6d3 100644 --- a/datafusion/src/physical_plan/file_format/parquet.rs +++ b/datafusion/src/physical_plan/file_format/parquet.rs @@ -29,6 +29,7 @@ use crate::{ logical_plan::{Column, Expr}, physical_optimizer::pruning::{PruningPredicate, PruningStatistics}, physical_plan::{ + file_format::PhysicalPlanConfig, metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}, stream::RecordBatchReceiverStream, DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, @@ -60,27 +61,18 @@ use tokio::{ use async_trait::async_trait; +use super::PartitionColumnProjector; + /// Execution plan for scanning one or more Parquet partitions #[derive(Debug, Clone)] pub struct ParquetExec { - object_store: Arc, - /// Grouped list of files. Each group will be processed together by one - /// partition of the `ExecutionPlan`. - file_groups: Vec>, - /// Schema after projection is applied - schema: SchemaRef, - /// Projection for which columns to load - projection: Vec, - /// Batch size - batch_size: usize, - /// Statistics for the data set (sum of statistics for all partitions) - statistics: Statistics, + base_config: PhysicalPlanConfig, + projected_statistics: Statistics, + projected_schema: SchemaRef, /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Optional predicate builder predicate_builder: Option, - /// Optional limit of the number of rows - limit: Option, } /// Stores metrics about the parquet execution for a particular parquet file @@ -95,26 +87,19 @@ struct ParquetFileMetrics { impl ParquetExec { /// Create a new Parquet reader execution plan provided file list and schema. 
/// Even if `limit` is set, ParquetExec rounds up the number of records to the next `batch_size`. - #[allow(clippy::too_many_arguments)] - pub fn new( - object_store: Arc, - file_groups: Vec>, - statistics: Statistics, - schema: SchemaRef, - projection: Option>, - predicate: Option, - batch_size: usize, - limit: Option, - ) -> Self { + pub fn new(base_config: PhysicalPlanConfig, predicate: Option) -> Self { debug!("Creating ParquetExec, files: {:?}, projection {:?}, predicate: {:?}, limit: {:?}", - file_groups, projection, predicate, limit); + base_config.file_groups, base_config.projection, predicate, base_config.limit); let metrics = ExecutionPlanMetricsSet::new(); let predicate_creation_errors = MetricBuilder::new(&metrics).global_counter("num_predicate_creation_errors"); let predicate_builder = predicate.and_then(|predicate_expr| { - match PruningPredicate::try_new(&predicate_expr, schema.clone()) { + match PruningPredicate::try_new( + &predicate_expr, + base_config.file_schema.clone(), + ) { Ok(predicate_builder) => Some(predicate_builder), Err(e) => { debug!( @@ -127,73 +112,20 @@ impl ParquetExec { } }); - let projection = match projection { - Some(p) => p, - None => (0..schema.fields().len()).collect(), - }; - - let (projected_schema, projected_statistics) = - Self::project(&projection, schema, statistics); + let (projected_schema, projected_statistics) = base_config.project(); Self { - object_store, - file_groups, - schema: projected_schema, - projection, + base_config, + projected_schema, + projected_statistics, metrics, predicate_builder, - batch_size, - statistics: projected_statistics, - limit, } } - fn project( - projection: &[usize], - schema: SchemaRef, - statistics: Statistics, - ) -> (SchemaRef, Statistics) { - let projected_schema = Schema::new( - projection - .iter() - .map(|i| schema.field(*i).clone()) - .collect(), - ); - - let new_column_statistics = statistics.column_statistics.map(|stats| { - let mut projected_stats = Vec::with_capacity(projection.len()); - for proj in projection { - projected_stats.push(stats[*proj].clone()); - } - projected_stats - }); - - let statistics = Statistics { - num_rows: statistics.num_rows, - total_byte_size: statistics.total_byte_size, - column_statistics: new_column_statistics, - is_exact: statistics.is_exact, - }; - - (Arc::new(projected_schema), statistics) - } - - /// List of data files - pub fn file_groups(&self) -> &[Vec] { - &self.file_groups - } - /// Optional projection for which columns to load - pub fn projection(&self) -> &[usize] { - &self.projection - } - /// Batch size - pub fn batch_size(&self) -> usize { - self.batch_size - } - - /// Limit in nr. 
of rows - pub fn limit(&self) -> Option { - self.limit + /// Ref to the base configs + pub fn base_config(&self) -> &PhysicalPlanConfig { + &self.base_config } } @@ -227,7 +159,7 @@ impl ExecutionPlan for ParquetExec { } fn schema(&self) -> SchemaRef { - self.schema.clone() + Arc::clone(&self.projected_schema) } fn children(&self) -> Vec> { @@ -237,7 +169,7 @@ impl ExecutionPlan for ParquetExec { /// Get the output partitioning of this plan fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.file_groups.len()) + Partitioning::UnknownPartitioning(self.base_config.file_groups.len()) } fn with_new_children( @@ -262,13 +194,20 @@ impl ExecutionPlan for ParquetExec { Receiver>, ) = channel(2); - let partition = self.file_groups[partition_index].clone(); + let partition = self.base_config.file_groups[partition_index].clone(); let metrics = self.metrics.clone(); - let projection = self.projection.clone(); + let projection = match self.base_config.file_column_projection_indices() { + Some(proj) => proj, + None => (0..self.base_config.file_schema.fields().len()).collect(), + }; let predicate_builder = self.predicate_builder.clone(); - let batch_size = self.batch_size; - let limit = self.limit; - let object_store = Arc::clone(&self.object_store); + let batch_size = self.base_config.batch_size; + let limit = self.base_config.limit; + let object_store = Arc::clone(&self.base_config.object_store); + let partition_col_proj = PartitionColumnProjector::new( + Arc::clone(&self.projected_schema), + &self.base_config.table_partition_cols, + ); let join_handle = task::spawn_blocking(move || { if let Err(e) = read_partition( @@ -281,13 +220,14 @@ impl ExecutionPlan for ParquetExec { batch_size, response_tx, limit, + partition_col_proj, ) { println!("Parquet reader thread terminated due to error: {:?}", e); } }); Ok(RecordBatchReceiverStream::create( - &self.schema, + &self.projected_schema, response_rx, join_handle, )) @@ -303,9 +243,9 @@ impl ExecutionPlan for ParquetExec { write!( f, "ParquetExec: batch_size={}, limit={:?}, partitions={}", - self.batch_size, - self.limit, - super::FileGroupsDisplay(&self.file_groups) + self.base_config.batch_size, + self.base_config.limit, + super::FileGroupsDisplay(&self.base_config.file_groups) ) } } @@ -316,7 +256,7 @@ impl ExecutionPlan for ParquetExec { } fn statistics(&self) -> Statistics { - self.statistics.clone() + self.projected_statistics.clone() } } @@ -456,6 +396,7 @@ fn read_partition( batch_size: usize, response_tx: Sender>, limit: Option, + mut partition_column_projector: PartitionColumnProjector, ) -> Result<()> { let mut total_rows = 0; 'outer: for partitioned_file in partition { @@ -483,7 +424,10 @@ fn read_partition( match batch_reader.next() { Some(Ok(batch)) => { total_rows += batch.num_rows(); - send_result(&response_tx, Ok(batch))?; + let proj_batch = partition_column_projector + .project(batch, &partitioned_file.partition_values); + + send_result(&response_tx, proj_batch)?; if limit.map(|l| total_rows >= l).unwrap_or(false) { break 'outer; } @@ -519,7 +463,7 @@ mod tests { use crate::datasource::{ file_format::{parquet::ParquetFormat, FileFormat}, object_store::local::{ - local_file_meta, local_object_reader_stream, LocalFileSystem, + local_object_reader_stream, local_unpartitioned_file, LocalFileSystem, }, }; @@ -533,21 +477,22 @@ mod tests { }; #[tokio::test] - async fn test() -> Result<()> { + async fn parquet_exec_with_projection() -> Result<()> { let testdata = crate::test_util::parquet_test_data(); let 
filename = format!("{}/alltypes_plain.parquet", testdata); let parquet_exec = ParquetExec::new( - Arc::new(LocalFileSystem {}), - vec![vec![PartitionedFile { - file_meta: local_file_meta(filename.clone()), - }]], - Statistics::default(), - ParquetFormat::default() - .infer_schema(local_object_reader_stream(vec![filename])) - .await?, - Some(vec![0, 1, 2]), - None, - 1024, + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_groups: vec![vec![local_unpartitioned_file(filename.clone())]], + file_schema: ParquetFormat::default() + .infer_schema(local_object_reader_stream(vec![filename])) + .await?, + statistics: Statistics::default(), + projection: Some(vec![0, 1, 2]), + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, None, ); assert_eq!(parquet_exec.output_partitioning().partition_count(), 1); @@ -575,6 +520,62 @@ mod tests { Ok(()) } + #[tokio::test] + async fn parquet_exec_with_partition() -> Result<()> { + let testdata = crate::test_util::parquet_test_data(); + let filename = format!("{}/alltypes_plain.parquet", testdata); + let mut partitioned_file = local_unpartitioned_file(filename.clone()); + partitioned_file.partition_values = vec![ + ScalarValue::Utf8(Some("2021".to_owned())), + ScalarValue::Utf8(Some("10".to_owned())), + ScalarValue::Utf8(Some("26".to_owned())), + ]; + let parquet_exec = ParquetExec::new( + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_groups: vec![vec![partitioned_file]], + file_schema: ParquetFormat::default() + .infer_schema(local_object_reader_stream(vec![filename])) + .await?, + statistics: Statistics::default(), + // file has 10 cols so index 12 should be month + projection: Some(vec![0, 1, 2, 12]), + batch_size: 1024, + limit: None, + table_partition_cols: vec![ + "year".to_owned(), + "month".to_owned(), + "day".to_owned(), + ], + }, + None, + ); + assert_eq!(parquet_exec.output_partitioning().partition_count(), 1); + + let mut results = parquet_exec.execute(0).await?; + let batch = results.next().await.unwrap()?; + let expected = vec![ + "+----+----------+-------------+-------+", + "| id | bool_col | tinyint_col | month |", + "+----+----------+-------------+-------+", + "| 4 | true | 0 | 10 |", + "| 5 | false | 1 | 10 |", + "| 6 | true | 0 | 10 |", + "| 7 | false | 1 | 10 |", + "| 2 | true | 0 | 10 |", + "| 3 | false | 1 | 10 |", + "| 0 | true | 0 | 10 |", + "| 1 | false | 1 | 10 |", + "+----+----------+-------------+-------+", + ]; + crate::assert_batches_eq!(expected, &[batch]); + + let batch = results.next().await; + assert!(batch.is_none()); + + Ok(()) + } + fn parquet_file_metrics() -> ParquetFileMetrics { let metrics = Arc::new(ExecutionPlanMetricsSet::new()); ParquetFileMetrics::new(0, "file.parquet", &metrics) diff --git a/datafusion/src/physical_plan/filter.rs b/datafusion/src/physical_plan/filter.rs index 79b5ebc508f5..fe0f10313451 100644 --- a/datafusion/src/physical_plan/filter.rs +++ b/datafusion/src/physical_plan/filter.rs @@ -224,10 +224,10 @@ mod tests { use super::*; use crate::datasource::object_store::local::LocalFileSystem; use crate::physical_plan::expressions::*; - use crate::physical_plan::file_format::CsvExec; + use crate::physical_plan::file_format::{CsvExec, PhysicalPlanConfig}; use crate::physical_plan::ExecutionPlan; use crate::scalar::ScalarValue; - use crate::test::{self, aggr_test_schema}; + use crate::test::{self}; use crate::{logical_plan::Operator, physical_plan::collect}; use std::iter::Iterator; @@ -240,15 +240,18 @@ mod tests { 
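[Editor's aside, not part of the patch] The `parquet_exec_with_partition` test above relies on a simple indexing convention: a projection index smaller than the width of the file schema selects a file column, while a larger index selects an entry of `table_partition_cols`, offset by that width; the file-side indices are what `file_column_projection_indices` returns. The standalone sketch below illustrates that convention only; `split_projection` is a hypothetical name used for illustration and is not an API introduced by this patch.

// Hypothetical helper illustrating the projection-index convention used by
// PhysicalPlanConfig: indices below `file_cols` select file columns, the rest
// select table partition columns (offset by `file_cols`).
fn split_projection(
    file_cols: usize,
    projection: &[usize],
) -> (Vec<usize>, Vec<usize>) {
    let mut file_indices = Vec::new();
    let mut partition_indices = Vec::new();
    for &i in projection {
        if i < file_cols {
            file_indices.push(i);
        } else {
            partition_indices.push(i - file_cols);
        }
    }
    (file_indices, partition_indices)
}

fn main() {
    // e.g. a file schema with 3 columns and partition cols ["year", "month", "day"]:
    // projection [0, 2, 4] keeps two file columns plus the "month" partition column
    let (file, part) = split_projection(3, &[0, 2, 4]);
    assert_eq!(file, vec![0, 2]);
    assert_eq!(part, vec![1]);
}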
test::create_partitioned_csv("aggregate_test_100.csv", partitions)?; let csv = CsvExec::new( - Arc::new(LocalFileSystem {}), - files, - Statistics::default(), - aggr_test_schema(), + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema: Arc::clone(&schema), + file_groups: files, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, ); let predicate: Arc = binary( diff --git a/datafusion/src/physical_plan/limit.rs b/datafusion/src/physical_plan/limit.rs index bd48e4d2e5d4..f9c392a9056a 100644 --- a/datafusion/src/physical_plan/limit.rs +++ b/datafusion/src/physical_plan/limit.rs @@ -387,7 +387,7 @@ mod tests { use crate::datasource::object_store::local::LocalFileSystem; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::common; - use crate::physical_plan::file_format::CsvExec; + use crate::physical_plan::file_format::{CsvExec, PhysicalPlanConfig}; use crate::test; #[tokio::test] @@ -399,15 +399,18 @@ mod tests { test::create_partitioned_csv("aggregate_test_100.csv", num_partitions)?; let csv = CsvExec::new( - Arc::new(LocalFileSystem {}), - files, - Statistics::default(), - schema, + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema: schema, + file_groups: files, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, ); // input should have 4 partitions diff --git a/datafusion/src/physical_plan/projection.rs b/datafusion/src/physical_plan/projection.rs index 794d9a2ec68e..eb335c2100ac 100644 --- a/datafusion/src/physical_plan/projection.rs +++ b/datafusion/src/physical_plan/projection.rs @@ -261,9 +261,9 @@ mod tests { use super::*; use crate::datasource::object_store::local::LocalFileSystem; use crate::physical_plan::expressions::{self, col}; - use crate::physical_plan::file_format::CsvExec; + use crate::physical_plan::file_format::{CsvExec, PhysicalPlanConfig}; use crate::scalar::ScalarValue; - use crate::test::{self, aggr_test_schema}; + use crate::test::{self}; use futures::future; #[tokio::test] @@ -275,15 +275,18 @@ mod tests { test::create_partitioned_csv("aggregate_test_100.csv", partitions)?; let csv = CsvExec::new( - Arc::new(LocalFileSystem {}), - files, - Statistics::default(), - aggr_test_schema(), + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema: Arc::clone(&schema), + file_groups: files, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, ); // pick column c1 and name it column c1 in the output schema diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 499d1f743844..a606906e8680 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -314,10 +314,13 @@ mod tests { use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::expressions::col; use crate::physical_plan::memory::MemoryExec; - use crate::physical_plan::{collect, file_format::CsvExec}; + use crate::physical_plan::{ + collect, + file_format::{CsvExec, PhysicalPlanConfig}, + }; use crate::test::assert_is_pending; use crate::test::exec::assert_strong_count_converges_to_zero; - use crate::test::{self, aggr_test_schema, 
exec::BlockingExec}; + use crate::test::{self, exec::BlockingExec}; use arrow::array::*; use arrow::datatypes::*; use futures::FutureExt; @@ -330,15 +333,18 @@ mod tests { test::create_partitioned_csv("aggregate_test_100.csv", partitions)?; let csv = CsvExec::new( - Arc::new(LocalFileSystem {}), - files, - Statistics::default(), - aggr_test_schema(), + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema: Arc::clone(&schema), + file_groups: files, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, ); let sort_exec = Arc::new(SortExec::try_new( diff --git a/datafusion/src/physical_plan/sort_preserving_merge.rs b/datafusion/src/physical_plan/sort_preserving_merge.rs index 5aaf9789f699..62f4b941f7f7 100644 --- a/datafusion/src/physical_plan/sort_preserving_merge.rs +++ b/datafusion/src/physical_plan/sort_preserving_merge.rs @@ -667,7 +667,7 @@ mod tests { use crate::assert_batches_eq; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::expressions::col; - use crate::physical_plan::file_format::CsvExec; + use crate::physical_plan::file_format::{CsvExec, PhysicalPlanConfig}; use crate::physical_plan::memory::MemoryExec; use crate::physical_plan::sort::SortExec; use crate::physical_plan::{collect, common}; @@ -936,15 +936,18 @@ mod tests { test::create_partitioned_csv("aggregate_test_100.csv", partitions).unwrap(); let csv = Arc::new(CsvExec::new( - Arc::new(LocalFileSystem {}), - files, - Statistics::default(), - Arc::clone(&schema), + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema: Arc::clone(&schema), + file_groups: files, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, )); let sort = vec![ @@ -1016,15 +1019,18 @@ mod tests { test::create_partitioned_csv("aggregate_test_100.csv", partitions).unwrap(); let csv = Arc::new(CsvExec::new( - Arc::new(LocalFileSystem {}), - files, - Statistics::default(), - schema, + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema: schema, + file_groups: files, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, )); let sorted = basic_sort(csv, sort).await; diff --git a/datafusion/src/physical_plan/union.rs b/datafusion/src/physical_plan/union.rs index 43e23850b19e..418be630bed9 100644 --- a/datafusion/src/physical_plan/union.rs +++ b/datafusion/src/physical_plan/union.rs @@ -220,8 +220,12 @@ mod tests { use super::*; use crate::datasource::object_store::{local::LocalFileSystem, ObjectStore}; use crate::test; + use crate::{ - physical_plan::{collect, file_format::CsvExec}, + physical_plan::{ + collect, + file_format::{CsvExec, PhysicalPlanConfig}, + }, scalar::ScalarValue, }; use arrow::record_batch::RecordBatch; @@ -236,27 +240,33 @@ mod tests { let (_, files2) = test::create_partitioned_csv("aggregate_test_100.csv", 5)?; let csv = CsvExec::new( - Arc::clone(&fs), - files, - Statistics::default(), - Arc::clone(&schema), + PhysicalPlanConfig { + object_store: Arc::clone(&fs), + file_schema: Arc::clone(&schema), + file_groups: files, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - 
None, ); let csv2 = CsvExec::new( - Arc::clone(&fs), - files2, - Statistics::default(), - schema, + PhysicalPlanConfig { + object_store: Arc::clone(&fs), + file_schema: Arc::clone(&schema), + file_groups: files2, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, ); let union_exec = Arc::new(UnionExec::new(vec![Arc::new(csv), Arc::new(csv2)])); diff --git a/datafusion/src/physical_plan/windows/mod.rs b/datafusion/src/physical_plan/windows/mod.rs index ef420b2c8351..28bf40293612 100644 --- a/datafusion/src/physical_plan/windows/mod.rs +++ b/datafusion/src/physical_plan/windows/mod.rs @@ -178,7 +178,7 @@ mod tests { use crate::datasource::object_store::local::LocalFileSystem; use crate::physical_plan::aggregates::AggregateFunction; use crate::physical_plan::expressions::col; - use crate::physical_plan::file_format::CsvExec; + use crate::physical_plan::file_format::{CsvExec, PhysicalPlanConfig}; use crate::physical_plan::{collect, Statistics}; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; use crate::test::{self, aggr_test_schema, assert_is_pending}; @@ -192,15 +192,18 @@ mod tests { let (_, files) = test::create_partitioned_csv("aggregate_test_100.csv", partitions)?; let csv = CsvExec::new( - Arc::new(LocalFileSystem {}), - files, - Statistics::default(), - aggr_test_schema(), + PhysicalPlanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema: aggr_test_schema(), + file_groups: files, + statistics: Statistics::default(), + projection: None, + batch_size: 1024, + limit: None, + table_partition_cols: vec![], + }, true, b',', - None, - 1024, - None, ); let input = Arc::new(csv); diff --git a/datafusion/src/test/mod.rs b/datafusion/src/test/mod.rs index f673eb065aaf..c13df55c05df 100644 --- a/datafusion/src/test/mod.rs +++ b/datafusion/src/test/mod.rs @@ -17,7 +17,7 @@ //! Common unit test utility methods -use crate::datasource::object_store::local::local_file_meta; +use crate::datasource::object_store::local::local_unpartitioned_file; use crate::datasource::{MemTable, PartitionedFile, TableProvider}; use crate::error::Result; use crate::logical_plan::{LogicalPlan, LogicalPlanBuilder}; @@ -98,11 +98,7 @@ pub fn create_partitioned_csv( let groups = files .into_iter() - .map(|f| { - vec![PartitionedFile { - file_meta: local_file_meta(f.to_str().unwrap().to_owned()), - }] - }) + .map(|f| vec![local_unpartitioned_file(f.to_str().unwrap().to_owned())]) .collect::>(); Ok((tmp_dir.into_path().to_str().unwrap().to_string(), groups)) diff --git a/datafusion/src/test/object_store.rs b/datafusion/src/test/object_store.rs index 4020b999f7d0..e93b4cd2d410 100644 --- a/datafusion/src/test/object_store.rs +++ b/datafusion/src/test/object_store.rs @@ -14,7 +14,6 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - //! Object store implem used for testing use std::{ @@ -34,14 +33,14 @@ use futures::{stream, AsyncRead, StreamExt}; #[derive(Debug)] /// An object store implem that is useful for testing. -/// The `ObjectReader`s are filled with zero bytes. +/// `ObjectReader`s are filled with zero bytes. 
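[Editor's aside, not part of the patch] The test helper above now builds file groups with `local_unpartitioned_file` instead of `local_file_meta`. Its definition is not shown in this diff; based on the `PartitionedFile` and `FileMeta` fields that do appear in it, a plausible shape is sketched below. The body is an assumption and the real helper in `datafusion::datasource::object_store::local` may obtain the file size and modification time differently.

// Hedged sketch: a PartitionedFile for a plain local file, i.e. one with no
// hive-style partition values attached.
use datafusion::datasource::object_store::{FileMeta, SizedFile};
use datafusion::datasource::PartitionedFile;

fn local_unpartitioned_file(path: String) -> PartitionedFile {
    // assumption: size read from the filesystem, missing files treated as empty
    let size = std::fs::metadata(&path).map(|m| m.len()).unwrap_or(0);
    PartitionedFile {
        file_meta: FileMeta {
            sized_file: SizedFile { path, size },
            last_modified: None,
        },
        // an unpartitioned file carries no partition values
        partition_values: vec![],
    }
}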
pub struct TestObjectStore { /// The `(path,size)` of the files that "exist" in the store - pub files: Vec<(String, u64)>, + files: Vec<(String, u64)>, } impl TestObjectStore { - pub fn new_arc(files: &[(&str, u64)]) -> Arc { + pub fn new_arc(files: &[(&str, u64)]) -> Arc { Arc::new(Self { files: files.iter().map(|f| (f.0.to_owned(), f.1)).collect(), }) diff --git a/datafusion/tests/common.rs b/datafusion/tests/common.rs new file mode 100644 index 000000000000..3490db5e091f --- /dev/null +++ b/datafusion/tests/common.rs @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! methods that are common to multiple integration test setups + +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; + +pub fn aggr_test_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Utf8, false), + Field::new("c2", DataType::UInt32, false), + Field::new("c3", DataType::Int8, false), + Field::new("c4", DataType::Int16, false), + Field::new("c5", DataType::Int32, false), + Field::new("c6", DataType::Int64, false), + Field::new("c7", DataType::UInt8, false), + Field::new("c8", DataType::UInt16, false), + Field::new("c9", DataType::UInt32, false), + Field::new("c10", DataType::UInt64, false), + Field::new("c11", DataType::Float32, false), + Field::new("c12", DataType::Float64, false), + Field::new("c13", DataType::Utf8, false), + ])) +} diff --git a/datafusion/tests/path_partition.rs b/datafusion/tests/path_partition.rs new file mode 100644 index 000000000000..789511065fc8 --- /dev/null +++ b/datafusion/tests/path_partition.rs @@ -0,0 +1,392 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Test queries on partitioned datasets + +use std::{fs, io, sync::Arc}; + +use async_trait::async_trait; +use datafusion::{ + assert_batches_sorted_eq, + datasource::{ + file_format::{csv::CsvFormat, parquet::ParquetFormat}, + listing::{ListingOptions, ListingTable}, + object_store::{ + local::LocalFileSystem, FileMeta, FileMetaStream, ListEntryStream, + ObjectReader, ObjectStore, SizedFile, + }, + }, + error::{DataFusionError, Result}, + physical_plan::ColumnStatistics, + prelude::ExecutionContext, + test_util::{arrow_test_data, parquet_test_data}, +}; +use futures::{stream, StreamExt}; + +mod common; + +#[tokio::test] +async fn csv_filter_with_file_col() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + register_partitioned_aggregate_csv( + &mut ctx, + &[ + "mytable/date=2021-10-27/file.csv", + "mytable/date=2021-10-28/file.csv", + ], + &["date"], + "mytable", + ); + + let result = ctx + .sql("SELECT c1, c2 FROM t WHERE date='2021-10-27' and date!=c1 LIMIT 5") + .await? + .collect() + .await?; + + let expected = vec![ + "+----+----+", + "| c1 | c2 |", + "+----+----+", + "| a | 1 |", + "| b | 1 |", + "| b | 5 |", + "| c | 2 |", + "| d | 5 |", + "+----+----+", + ]; + assert_batches_sorted_eq!(expected, &result); + + Ok(()) +} + +#[tokio::test] +async fn csv_projection_on_partition() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + register_partitioned_aggregate_csv( + &mut ctx, + &[ + "mytable/date=2021-10-27/file.csv", + "mytable/date=2021-10-28/file.csv", + ], + &["date"], + "mytable", + ); + + let result = ctx + .sql("SELECT c1, date FROM t WHERE date='2021-10-27' LIMIT 5") + .await? + .collect() + .await?; + + let expected = vec![ + "+----+------------+", + "| c1 | date |", + "+----+------------+", + "| a | 2021-10-27 |", + "| b | 2021-10-27 |", + "| b | 2021-10-27 |", + "| c | 2021-10-27 |", + "| d | 2021-10-27 |", + "+----+------------+", + ]; + assert_batches_sorted_eq!(expected, &result); + + Ok(()) +} + +#[tokio::test] +async fn csv_grouping_by_partition() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + register_partitioned_aggregate_csv( + &mut ctx, + &[ + "mytable/date=2021-10-26/file.csv", + "mytable/date=2021-10-27/file.csv", + "mytable/date=2021-10-28/file.csv", + ], + &["date"], + "mytable", + ); + + let result = ctx + .sql("SELECT date, count(*), count(distinct(c1)) FROM t WHERE date<='2021-10-27' GROUP BY date") + .await? + .collect() + .await?; + + let expected = vec![ + "+------------+-----------------+----------------------+", + "| date | COUNT(UInt8(1)) | COUNT(DISTINCT t.c1) |", + "+------------+-----------------+----------------------+", + "| 2021-10-26 | 100 | 5 |", + "| 2021-10-27 | 100 | 5 |", + "+------------+-----------------+----------------------+", + ]; + assert_batches_sorted_eq!(expected, &result); + + Ok(()) +} + +#[tokio::test] +async fn parquet_multiple_partitions() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + register_partitioned_alltypes_parquet( + &mut ctx, + &[ + "year=2021/month=09/day=09/file.parquet", + "year=2021/month=10/day=09/file.parquet", + "year=2021/month=10/day=28/file.parquet", + ], + &["year", "month", "day"], + "", + "alltypes_plain.parquet", + ) + .await; + + let result = ctx + .sql("SELECT id, day FROM t WHERE day=month ORDER BY id") + .await? 
+ .collect() + .await?; + + let expected = vec![ + "+----+-----+", + "| id | day |", + "+----+-----+", + "| 0 | 09 |", + "| 1 | 09 |", + "| 2 | 09 |", + "| 3 | 09 |", + "| 4 | 09 |", + "| 5 | 09 |", + "| 6 | 09 |", + "| 7 | 09 |", + "+----+-----+", + ]; + assert_batches_sorted_eq!(expected, &result); + + Ok(()) +} + +#[tokio::test] +async fn parquet_statistics() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + register_partitioned_alltypes_parquet( + &mut ctx, + &[ + "year=2021/month=09/day=09/file.parquet", + "year=2021/month=10/day=09/file.parquet", + "year=2021/month=10/day=28/file.parquet", + ], + &["year", "month", "day"], + "", + // This is the only file we found in the test set with + // actual stats. It has 1 column / 1 row. + "single_nan.parquet", + ) + .await; + + //// NO PROJECTION //// + let logical_plan = ctx.sql("SELECT * FROM t").await?.to_logical_plan(); + + let physical_plan = ctx.create_physical_plan(&logical_plan).await?; + assert_eq!(physical_plan.schema().fields().len(), 4); + + let stat_cols = physical_plan + .statistics() + .column_statistics + .expect("col stats should be defined"); + assert_eq!(stat_cols.len(), 4); + // stats for the first col are read from the parquet file + assert_eq!(stat_cols[0].null_count, Some(3)); + // TODO assert partition column (1,2,3) stats once implemented (#1186) + assert_eq!(stat_cols[1], ColumnStatistics::default()); + assert_eq!(stat_cols[2], ColumnStatistics::default()); + assert_eq!(stat_cols[3], ColumnStatistics::default()); + + //// WITH PROJECTION //// + let logical_plan = ctx + .sql("SELECT mycol, day FROM t WHERE day='28'") + .await? + .to_logical_plan(); + + let physical_plan = ctx.create_physical_plan(&logical_plan).await?; + assert_eq!(physical_plan.schema().fields().len(), 2); + + let stat_cols = physical_plan + .statistics() + .column_statistics + .expect("col stats should be defined"); + assert_eq!(stat_cols.len(), 2); + // stats for the first col are read from the parquet file + assert_eq!(stat_cols[0].null_count, Some(1)); + // TODO assert partition column stats once implemented (#1186) + assert_eq!(stat_cols[1], ColumnStatistics::default()); + + Ok(()) +} + +#[tokio::test] +async fn parquet_overlapping_columns() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + // `id` is both a column of the file and a partitioning col + register_partitioned_alltypes_parquet( + &mut ctx, + &[ + "id=1/file.parquet", + "id=2/file.parquet", + "id=3/file.parquet", + ], + &["id"], + "", + "alltypes_plain.parquet", + ) + .await; + + let result = ctx.sql("SELECT id FROM t WHERE id=1 ORDER BY id").await; + + assert!( + result.is_err(), + "Duplicate qualified name should raise error" + ); + Ok(()) +} + +fn register_partitioned_aggregate_csv( + ctx: &mut ExecutionContext, + store_paths: &[&str], + partition_cols: &[&str], + table_path: &str, +) { + let testdata = arrow_test_data(); + let csv_file_path = format!("{}/csv/aggregate_test_100.csv", testdata); + let file_schema = common::aggr_test_schema(); + let object_store = MirroringObjectStore::new_arc(csv_file_path, store_paths); + + let mut options = ListingOptions::new(Arc::new(CsvFormat::default())); + options.table_partition_cols = partition_cols.iter().map(|&s| s.to_owned()).collect(); + + let table = + ListingTable::new(object_store, table_path.to_owned(), file_schema, options); + + ctx.register_table("t", Arc::new(table)) + .expect("registering listing table failed"); +} + +async fn register_partitioned_alltypes_parquet( + ctx: &mut ExecutionContext, +
store_paths: &[&str], + partition_cols: &[&str], + table_path: &str, + source_file: &str, +) { + let testdata = parquet_test_data(); + let parquet_file_path = format!("{}/{}", testdata, source_file); + let object_store = + MirroringObjectStore::new_arc(parquet_file_path.clone(), store_paths); + + let mut options = ListingOptions::new(Arc::new(ParquetFormat::default())); + options.table_partition_cols = partition_cols.iter().map(|&s| s.to_owned()).collect(); + options.collect_stat = true; + + let file_schema = options + .infer_schema(Arc::clone(&object_store), store_paths[0]) + .await + .expect("Parquet schema inference failed"); + + let table = + ListingTable::new(object_store, table_path.to_owned(), file_schema, options); + + ctx.register_table("t", Arc::new(table)) + .expect("registering listing table failed"); +} + +#[derive(Debug)] +/// An object store implem that mirrors a given file to multiple paths. +pub struct MirroringObjectStore { + /// The paths of the files that "exist" in the store + files: Vec<String>, + /// The file that will be read at all paths + mirrored_file: String, + /// Size of the mirrored file + file_size: u64, +} + +impl MirroringObjectStore { + pub fn new_arc(mirrored_file: String, paths: &[&str]) -> Arc<Self> { + let metadata = fs::metadata(&mirrored_file).expect("Local file metadata"); + Arc::new(Self { + files: paths.iter().map(|&f| f.to_owned()).collect(), + mirrored_file, + file_size: metadata.len(), + }) + } +} + +#[async_trait] +impl ObjectStore for MirroringObjectStore { + async fn list_file(&self, prefix: &str) -> Result<FileMetaStream> { + let prefix = prefix.to_owned(); + let size = self.file_size; + Ok(Box::pin( + stream::iter( + self.files + .clone() + .into_iter() + .filter(move |f| f.starts_with(&prefix)), + ) + .map(move |f| { + Ok(FileMeta { + sized_file: SizedFile { path: f, size }, + last_modified: None, + }) + }), + )) + } + + async fn list_dir( + &self, + _prefix: &str, + _delimiter: Option<String>, + ) -> Result<ListEntryStream> { + unimplemented!() + } + + fn file_reader(&self, file: SizedFile) -> Result<Arc<dyn ObjectReader>> { + assert_eq!( + self.file_size, file.size, + "Requested files should have the same size as the mirrored file" + ); + match self.files.iter().find(|&item| &file.path == item) { + Some(_) => Ok(LocalFileSystem {}.file_reader(SizedFile { + path: self.mirrored_file.clone(), + size: self.file_size, + })?), + None => Err(DataFusionError::IoError(io::Error::new( + io::ErrorKind::NotFound, + "not in provided test list", + ))), + } + } +} diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index f1e988814add..cf099193085e 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -49,6 +49,8 @@ use datafusion::{ }; use datafusion::{execution::context::ExecutionContext, physical_plan::displayable}; +mod common; + #[tokio::test] async fn nyc() -> Result<()> { // schema for nyxtaxi csv files @@ -3195,24 +3197,6 @@ async fn explain_analyze_runs_optimizers() { assert_contains!(actual, expected); } -fn aggr_test_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::UInt32, false), - Field::new("c3", DataType::Int8, false), - Field::new("c4", DataType::Int16, false), - Field::new("c5", DataType::Int32, false), - Field::new("c6", DataType::Int64, false), - Field::new("c7", DataType::UInt8, false), - Field::new("c8", DataType::UInt16, false), - Field::new("c9", DataType::UInt32, false), - Field::new("c10", DataType::UInt64, false), - Field::new("c11", DataType::Float32, false), - Field::new("c12",
DataType::Float64, false), - Field::new("c13", DataType::Utf8, false), - ])) -} - async fn register_aggregate_csv_by_sql(ctx: &mut ExecutionContext) { let testdata = datafusion::test_util::arrow_test_data(); @@ -3256,7 +3240,7 @@ async fn register_aggregate_csv_by_sql(ctx: &mut ExecutionContext) { async fn register_aggregate_csv(ctx: &mut ExecutionContext) -> Result<()> { let testdata = datafusion::test_util::arrow_test_data(); - let schema = aggr_test_schema(); + let schema = common::aggr_test_schema(); ctx.register_csv( "aggregate_test_100", &format!("{}/csv/aggregate_test_100.csv", testdata),