diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 30ab9a339b54..9ba66dd6098c 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -571,6 +571,9 @@ config_namespace! {
         /// when an exact selectivity cannot be determined. Valid values are
         /// between 0 (no selectivity) and 100 (all rows are selected).
         pub default_filter_selectivity: u8, default = 20
+
+        /// When set to true, the optimizer will not attempt to convert Union to Interleave
+        pub prefer_existing_union: bool, default = false
     }
 }
@@ -1364,12 +1367,31 @@ impl TableOptions {
 /// Options that control how Parquet files are read, including global options
 /// that apply to all columns and optional column-specific overrides
+///
+/// Closely tied to [`ParquetWriterOptions`](crate::file_options::parquet_writer::ParquetWriterOptions).
+/// Properties not included in [`TableParquetOptions`] may not be configurable at the external API
+/// (e.g. sorting_columns).
 #[derive(Clone, Default, Debug, PartialEq)]
 pub struct TableParquetOptions {
     /// Global Parquet options that propagates to all columns.
     pub global: ParquetOptions,
     /// Column specific options. Default usage is parquet.XX::column.
     pub column_specific_options: HashMap<String, ColumnOptions>,
+    /// Additional file-level metadata to include. Inserted into the key_value_metadata
+    /// for the written [`FileMetaData`](https://docs.rs/parquet/latest/parquet/file/metadata/struct.FileMetaData.html).
+    ///
+    /// Multiple entries are permitted
+    /// ```sql
+    /// OPTIONS (
+    ///    'format.metadata::key1' '',
+    ///    'format.metadata::key2' 'value',
+    ///    'format.metadata::key3' 'value has spaces',
+    ///    'format.metadata::key4' 'value has special chars :: :',
+    ///    'format.metadata::key_dupe' 'original will be overwritten',
+    ///    'format.metadata::key_dupe' 'final'
+    /// )
+    /// ```
+    pub key_value_metadata: HashMap<String, Option<String>>,
 }
 
 impl ConfigField for TableParquetOptions {
@@ -1380,8 +1402,24 @@ impl ConfigField for TableParquetOptions {
     }
 
     fn set(&mut self, key: &str, value: &str) -> Result<()> {
-        // Determine the key if it's a global or column-specific setting
-        if key.contains("::") {
+        // Determine if the key is a global, metadata, or column-specific setting
+        if key.starts_with("metadata::") {
+            let k =
+                match key.split("::").collect::<Vec<_>>()[..] {
+                    [_meta] | [_meta, ""] => return Err(DataFusionError::Configuration(
+                        "Invalid metadata key provided, missing key in metadata::"
+                            .to_string(),
+                    )),
+                    [_meta, k] => k.into(),
+                    _ => {
+                        return Err(DataFusionError::Configuration(format!(
+                            "Invalid metadata key provided, found too many '::' in \"{key}\""
+                        )))
+                    }
+                };
+            self.key_value_metadata.insert(k, Some(value.into()));
+            Ok(())
+        } else if key.contains("::") {
             self.column_specific_options.set(key, value)
         } else {
             self.global.set(key, value)
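As a sanity check on the slice-pattern match above: exactly one non-empty segment may follow the `metadata::` prefix. A standalone sketch of that parsing rule (the helper name and error strings are illustrative, not part of the patch):

```rust
// Hypothetical sketch of the key-parsing rule introduced above:
// exactly one non-empty segment may follow the "metadata::" prefix.
fn parse_metadata_key(key: &str) -> Result<&str, String> {
    match key.split("::").collect::<Vec<_>>()[..] {
        // "metadata" alone or "metadata::" with an empty key is rejected
        [_meta] | [_meta, ""] => Err("missing key in metadata::".to_string()),
        // exactly one segment after the prefix is the metadata key
        [_meta, k] => Ok(k),
        // any further "::" is rejected
        _ => Err(format!("too many '::' in \"{key}\"")),
    }
}

fn main() {
    assert_eq!(parse_metadata_key("metadata::key1"), Ok("key1"));
    assert!(parse_metadata_key("metadata::").is_err());
    assert!(parse_metadata_key("metadata::a::b").is_err());
}
```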
@@ -1773,4 +1811,38 @@ mod tests {
             .iter()
             .any(|item| item.key == "format.bloom_filter_enabled::col1"))
     }
+
+    #[cfg(feature = "parquet")]
+    #[test]
+    fn parquet_table_options_config_metadata_entry() {
+        let mut table_config = TableOptions::new();
+        table_config.set_file_format(FileType::PARQUET);
+        table_config.set("format.metadata::key1", "").unwrap();
+        table_config.set("format.metadata::key2", "value2").unwrap();
+        table_config
+            .set("format.metadata::key3", "value with spaces ")
+            .unwrap();
+        table_config
+            .set("format.metadata::key4", "value with special chars :: :")
+            .unwrap();
+
+        let parsed_metadata = table_config.parquet.key_value_metadata.clone();
+        assert_eq!(parsed_metadata.get("should not exist1"), None);
+        assert_eq!(parsed_metadata.get("key1"), Some(&Some("".into())));
+        assert_eq!(parsed_metadata.get("key2"), Some(&Some("value2".into())));
+        assert_eq!(
+            parsed_metadata.get("key3"),
+            Some(&Some("value with spaces ".into()))
+        );
+        assert_eq!(
+            parsed_metadata.get("key4"),
+            Some(&Some("value with special chars :: :".into()))
+        );
+
+        // duplicate keys are overwritten
+        table_config.set("format.metadata::key_dupe", "A").unwrap();
+        table_config.set("format.metadata::key_dupe", "B").unwrap();
+        let parsed_metadata = table_config.parquet.key_value_metadata;
+        assert_eq!(parsed_metadata.get("key_dupe"), Some(&Some("B".into())));
+    }
 }
diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs
index eb1ce1b364fd..a760619a7ba8 100644
--- a/datafusion/common/src/file_options/mod.rs
+++ b/datafusion/common/src/file_options/mod.rs
@@ -124,6 +124,10 @@ mod tests {
             123
         );
 
+        // properties which remain as default on WriterProperties
+        assert_eq!(properties.key_value_metadata(), None);
+        assert_eq!(properties.sorting_columns(), None);
+
         Ok(())
     }
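Putting the pieces together, a minimal usage sketch mirroring the `parquet_table_options_config_metadata_entry` test above (assumes the `parquet` feature is enabled and that `FileType` is re-exported at the `datafusion_common` crate root):

```rust
// A minimal sketch of the new option surface, following the unit test above.
use datafusion_common::config::TableOptions;
use datafusion_common::FileType;

fn main() -> datafusion_common::Result<()> {
    let mut table_config = TableOptions::new();
    table_config.set_file_format(FileType::PARQUET);
    // "format.metadata::<key>" routes through the new metadata branch of set()
    table_config.set("format.metadata::owner", "analytics team")?;

    // The parsed entry lands in TableParquetOptions::key_value_metadata
    assert_eq!(
        table_config.parquet.key_value_metadata.get("owner"),
        Some(&Some("analytics team".into()))
    );
    Ok(())
}
```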
diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs
index 28e73ba48f53..8ac6bcaa7adf 100644
--- a/datafusion/common/src/file_options/parquet_writer.rs
+++ b/datafusion/common/src/file_options/parquet_writer.rs
@@ -17,11 +17,17 @@
 
 //! Options related to how parquet files should be written
 
-use crate::{config::TableParquetOptions, DataFusionError, Result};
+use crate::{
+    config::{ParquetOptions, TableParquetOptions},
+    DataFusionError, Result,
+};
 
 use parquet::{
     basic::{BrotliLevel, GzipLevel, ZstdLevel},
-    file::properties::{EnabledStatistics, WriterProperties, WriterVersion},
+    file::{
+        metadata::KeyValue,
+        properties::{EnabledStatistics, WriterProperties, WriterVersion},
+    },
     schema::types::ColumnPath,
 };
@@ -47,53 +53,87 @@ impl TryFrom<&TableParquetOptions> for ParquetWriterOptions {
     type Error = DataFusionError;
 
     fn try_from(parquet_options: &TableParquetOptions) -> Result<Self> {
-        let parquet_session_options = &parquet_options.global;
-        let mut builder = WriterProperties::builder()
-            .set_data_page_size_limit(parquet_session_options.data_pagesize_limit)
-            .set_write_batch_size(parquet_session_options.write_batch_size)
-            .set_writer_version(parse_version_string(
-                &parquet_session_options.writer_version,
-            )?)
-            .set_dictionary_page_size_limit(
-                parquet_session_options.dictionary_page_size_limit,
-            )
-            .set_max_row_group_size(parquet_session_options.max_row_group_size)
-            .set_created_by(parquet_session_options.created_by.clone())
-            .set_column_index_truncate_length(
-                parquet_session_options.column_index_truncate_length,
+        let ParquetOptions {
+            data_pagesize_limit,
+            write_batch_size,
+            writer_version,
+            dictionary_page_size_limit,
+            max_row_group_size,
+            created_by,
+            column_index_truncate_length,
+            data_page_row_count_limit,
+            bloom_filter_enabled,
+            encoding,
+            dictionary_enabled,
+            compression,
+            statistics_enabled,
+            max_statistics_size,
+            bloom_filter_fpp,
+            bloom_filter_ndv,
+            // below is not part of ParquetWriterOptions
+            enable_page_index: _,
+            pruning: _,
+            skip_metadata: _,
+            metadata_size_hint: _,
+            pushdown_filters: _,
+            reorder_filters: _,
+            allow_single_file_parallelism: _,
+            maximum_parallel_row_group_writers: _,
+            maximum_buffered_record_batches_per_stream: _,
+        } = &parquet_options.global;
+
+        let key_value_metadata = if !parquet_options.key_value_metadata.is_empty() {
+            Some(
+                parquet_options
+                    .key_value_metadata
+                    .clone()
+                    .drain()
+                    .map(|(key, value)| KeyValue { key, value })
+                    .collect::<Vec<_>>(),
             )
-            .set_data_page_row_count_limit(
-                parquet_session_options.data_page_row_count_limit,
-            )
-            .set_bloom_filter_enabled(parquet_session_options.bloom_filter_enabled);
+        } else {
+            None
+        };
 
-        if let Some(encoding) = &parquet_session_options.encoding {
+        let mut builder = WriterProperties::builder()
+            .set_data_page_size_limit(*data_pagesize_limit)
+            .set_write_batch_size(*write_batch_size)
+            .set_writer_version(parse_version_string(writer_version.as_str())?)
+            .set_dictionary_page_size_limit(*dictionary_page_size_limit)
+            .set_max_row_group_size(*max_row_group_size)
+            .set_created_by(created_by.clone())
+            .set_column_index_truncate_length(*column_index_truncate_length)
+            .set_data_page_row_count_limit(*data_page_row_count_limit)
+            .set_bloom_filter_enabled(*bloom_filter_enabled)
+            .set_key_value_metadata(key_value_metadata);
+
+        if let Some(encoding) = &encoding {
             builder = builder.set_encoding(parse_encoding_string(encoding)?);
         }
 
-        if let Some(enabled) = parquet_session_options.dictionary_enabled {
-            builder = builder.set_dictionary_enabled(enabled);
+        if let Some(enabled) = dictionary_enabled {
+            builder = builder.set_dictionary_enabled(*enabled);
         }
 
-        if let Some(compression) = &parquet_session_options.compression {
+        if let Some(compression) = &compression {
             builder = builder.set_compression(parse_compression_string(compression)?);
         }
 
-        if let Some(statistics) = &parquet_session_options.statistics_enabled {
+        if let Some(statistics) = &statistics_enabled {
             builder =
                 builder.set_statistics_enabled(parse_statistics_string(statistics)?);
         }
 
-        if let Some(size) = parquet_session_options.max_statistics_size {
-            builder = builder.set_max_statistics_size(size);
+        if let Some(size) = max_statistics_size {
+            builder = builder.set_max_statistics_size(*size);
         }
 
-        if let Some(fpp) = parquet_session_options.bloom_filter_fpp {
-            builder = builder.set_bloom_filter_fpp(fpp);
+        if let Some(fpp) = bloom_filter_fpp {
+            builder = builder.set_bloom_filter_fpp(*fpp);
         }
 
-        if let Some(ndv) = parquet_session_options.bloom_filter_ndv {
-            builder = builder.set_bloom_filter_ndv(ndv);
+        if let Some(ndv) = bloom_filter_ndv {
+            builder = builder.set_bloom_filter_ndv(*ndv);
         }
 
         for (column, options) in &parquet_options.column_specific_options {
@@ -141,6 +181,8 @@ impl TryFrom<&TableParquetOptions> for ParquetWriterOptions {
                 builder.set_column_max_statistics_size(path, max_statistics_size);
             }
         }
+
+        // ParquetWriterOptions will have defaults for the remaining fields (e.g. sorting_columns)
         Ok(ParquetWriterOptions {
             writer_options: builder.build(),
         })
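For context, a sketch of how the rewritten `TryFrom` surfaces the metadata on the built `WriterProperties` (the `writer_options()` accessor on `ParquetWriterOptions` is an assumption; only the field construction is shown in the patch):

```rust
// Sketch: converting table-level options into parquet WriterProperties and
// reading the metadata back off the built properties.
use datafusion_common::config::TableParquetOptions;
use datafusion_common::file_options::parquet_writer::ParquetWriterOptions;

fn main() -> datafusion_common::Result<()> {
    let mut table_opts = TableParquetOptions::default();
    table_opts
        .key_value_metadata
        .insert("owner".to_string(), Some("analytics team".to_string()));

    let writer_opts = ParquetWriterOptions::try_from(&table_opts)?;
    // One KeyValue entry per map entry; None when the map is empty
    // (writer_options() is an assumed accessor for the private field).
    let props = writer_opts.writer_options();
    assert_eq!(props.key_value_metadata().map(|kv| kv.len()), Some(1));
    Ok(())
}
```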
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 731f25b5dbb1..440624eb6f8a 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -1136,7 +1136,7 @@ mod tests {
     };
     use parquet::arrow::arrow_reader::ArrowReaderOptions;
     use parquet::arrow::ParquetRecordBatchStreamBuilder;
-    use parquet::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex};
+    use parquet::file::metadata::{KeyValue, ParquetColumnIndex, ParquetOffsetIndex};
     use parquet::file::page_index::index::Index;
     use tokio::fs::File;
     use tokio::io::AsyncWrite;
@@ -1865,7 +1865,13 @@ mod tests {
         };
         let parquet_sink = Arc::new(ParquetSink::new(
             file_sink_config,
-            TableParquetOptions::default(),
+            TableParquetOptions {
+                key_value_metadata: std::collections::HashMap::from([
+                    ("my-data".to_string(), Some("stuff".to_string())),
+                    ("my-data-bool-key".to_string(), None),
+                ]),
+                ..Default::default()
+            },
         ));
 
         // create data
@@ -1899,7 +1905,10 @@ mod tests {
         let (
             path,
             FileMetaData {
-                num_rows, schema, ..
+                num_rows,
+                schema,
+                key_value_metadata,
+                ..
             },
         ) = written.take(1).next().unwrap();
         let path_parts = path.parts().collect::<Vec<_>>();
@@ -1915,6 +1924,20 @@ mod tests {
             "output file metadata should contain col b"
         );
 
+        let mut key_value_metadata = key_value_metadata.unwrap();
+        key_value_metadata.sort_by(|a, b| a.key.cmp(&b.key));
+        let expected_metadata = vec![
+            KeyValue {
+                key: "my-data".to_string(),
+                value: Some("stuff".to_string()),
+            },
+            KeyValue {
+                key: "my-data-bool-key".to_string(),
+                value: None,
+            },
+        ];
+        assert_eq!(key_value_metadata, expected_metadata);
+
         Ok(())
     }
diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs
index eacc842c342d..7ea4164963cd 100644
--- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs
+++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs
@@ -1192,7 +1192,11 @@ fn ensure_distribution(
         .collect::<Result<Vec<_>>>()?;
     let children_plans = children.iter().map(|c| c.plan.clone()).collect::<Vec<_>>();
-    plan = if plan.as_any().is::<UnionExec>() && can_interleave(children_plans.iter()) {
+
+    plan = if plan.as_any().is::<UnionExec>()
+        && !config.optimizer.prefer_existing_union
+        && can_interleave(children_plans.iter())
+    {
         // Add a special case for [`UnionExec`] since we want to "bubble up"
         // hash-partitioned data. So instead of
         //
@@ -1731,16 +1735,25 @@ pub(crate) mod tests {
     /// * `TARGET_PARTITIONS` (optional) - number of partitions to repartition to
     /// * `REPARTITION_FILE_SCANS` (optional) - if true, will repartition file scans
     /// * `REPARTITION_FILE_MIN_SIZE` (optional) - minimum file size to repartition
+    /// * `PREFER_EXISTING_UNION` (optional) - if true, will not attempt to convert Union to Interleave
     macro_rules! assert_optimized {
         ($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr) => {
-            assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, false, 10, false, 1024);
+            assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, false, 10, false, 1024, false);
         };
 
         ($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr) => {
-            assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, $PREFER_EXISTING_SORT, 10, false, 1024);
+            assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, $PREFER_EXISTING_SORT, 10, false, 1024, false);
+        };
+
+        ($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr, $PREFER_EXISTING_UNION: expr) => {
+            assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, $PREFER_EXISTING_SORT, 10, false, 1024, $PREFER_EXISTING_UNION);
         };
 
         ($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr, $TARGET_PARTITIONS: expr, $REPARTITION_FILE_SCANS: expr, $REPARTITION_FILE_MIN_SIZE: expr) => {
+            assert_optimized!($EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, $PREFER_EXISTING_SORT, $TARGET_PARTITIONS, $REPARTITION_FILE_SCANS, $REPARTITION_FILE_MIN_SIZE, false);
+        };
+
+        ($EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr, $TARGET_PARTITIONS: expr, $REPARTITION_FILE_SCANS: expr, $REPARTITION_FILE_MIN_SIZE: expr, $PREFER_EXISTING_UNION: expr) => {
             let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect();
 
             let mut config = ConfigOptions::new();
@@ -1748,6 +1761,7 @@ pub(crate) mod tests {
             config.optimizer.repartition_file_scans = $REPARTITION_FILE_SCANS;
             config.optimizer.repartition_file_min_size = $REPARTITION_FILE_MIN_SIZE;
             config.optimizer.prefer_existing_sort = $PREFER_EXISTING_SORT;
+            config.optimizer.prefer_existing_union = $PREFER_EXISTING_UNION;
 
             // NOTE: These tests verify the joint `EnforceDistribution` + `EnforceSorting` cascade
             // because they were written prior to the separation of `BasicEnforcement` into
@@ -3107,7 +3121,67 @@ pub(crate) mod tests {
             "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]",
         ];
         assert_optimized!(expected, plan.clone(), true);
-        assert_optimized!(expected, plan, false);
+        assert_optimized!(expected, plan.clone(), false);
+
+        Ok(())
+    }
+
+    #[test]
+    fn union_not_to_interleave() -> Result<()> {
+        // group by (a as a1)
+        let left = aggregate_exec_with_alias(
+            parquet_exec(),
+            vec![("a".to_string(), "a1".to_string())],
+        );
+        // group by (a as a1)
+        let right = aggregate_exec_with_alias(
+            parquet_exec(),
+            vec![("a".to_string(), "a1".to_string())],
+        );
+
+        //  Union
+        let plan = Arc::new(UnionExec::new(vec![left, right]));
+
+        // final agg
+        let plan =
+            aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]);
+
+        // Only two RepartitionExecs added, no final RepartitionExec required
+        let expected = &[
+            "AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]",
+            "RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20",
+            "AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]",
+            "UnionExec",
+            "AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]",
+            "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10",
+            "AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]",
+            "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
+            "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]",
+            "AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]",
+            "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10",
+            "AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]",
+            "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
+            "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]",
+        ];
+        // no sort in the plan but since we need it as a parameter, make it default false
+        let prefer_existing_sort = false;
+        let first_enforce_distribution = true;
+        let prefer_existing_union = true;
+
+        assert_optimized!(
+            expected,
+            plan.clone(),
+            first_enforce_distribution,
+            prefer_existing_sort,
+            prefer_existing_union
+        );
+        assert_optimized!(
+            expected,
+            plan,
+            !first_enforce_distribution,
+            prefer_existing_sort,
+            prefer_existing_union
+        );
 
         Ok(())
     }
@@ -3661,7 +3735,8 @@ pub(crate) mod tests {
             true,
             target_partitions,
             true,
-            repartition_size
+            repartition_size,
+            false
         );
 
         let expected = [
@@ -3678,7 +3753,8 @@ pub(crate) mod tests {
            true,
            target_partitions,
            true,
-            repartition_size
+            repartition_size,
+            false
        );
 
         Ok(())
@@ -3741,7 +3817,7 @@ pub(crate) mod tests {
             )),
             vec![("a".to_string(), "a".to_string())],
         );
-            assert_optimized!(expected, plan, true, false, 2, true, 10);
+            assert_optimized!(expected, plan, true, false, 2, true, 10, false);
         }
         Ok(())
     }
diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs
index 4b8a48062381..f33d30152e83 100644
--- a/datafusion/proto/src/physical_plan/from_proto.rs
+++ b/datafusion/proto/src/physical_plan/from_proto.rs
@@ -970,6 +970,7 @@ impl TryFrom<&protobuf::TableParquetOptions> for TableParquetOptions {
                 .unwrap()
                 .unwrap(),
             column_specific_options,
+            key_value_metadata: Default::default(),
         })
     }
 }
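A sketch of how a caller opts out of the Union-to-Interleave rewrite, mirroring what the `assert_optimized!` macro wires into `EnforceDistribution` above:

```rust
// Sketch: opting out of the Union -> Interleave rewrite via the new flag.
use datafusion_common::config::ConfigOptions;

fn main() {
    let mut config = ConfigOptions::new();
    // Default is false: eligible UnionExec inputs are interleaved.
    assert!(!config.optimizer.prefer_existing_union);

    // With the flag set, ensure_distribution() keeps the UnionExec as-is.
    config.optimizer.prefer_existing_union = true;
}
```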
diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index 3f2134bf7e9b..4e8f809c5a29 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -467,7 +467,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                 expr,
                 substring_from,
                 substring_for,
-                special: false,
+                special: _,
             } => self.sql_substring_to_expr(
                 expr,
                 substring_from,
diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt
index 502dfd4fa6bb..d695e8514b07 100644
--- a/datafusion/sqllogictest/test_files/copy.slt
+++ b/datafusion/sqllogictest/test_files/copy.slt
@@ -283,11 +283,73 @@ OPTIONS (
 'format.statistics_enabled::col2' none,
 'format.max_statistics_size' 123,
 'format.bloom_filter_fpp' 0.001,
-'format.bloom_filter_ndv' 100
+'format.bloom_filter_ndv' 100,
+'format.metadata::key' 'value'
 )
 ----
 2
 
+# valid vs invalid metadata
+
+# accepts map with a single entry
+statement ok
+COPY source_table
+TO 'test_files/scratch/copy/table_with_metadata/'
+STORED AS PARQUET
+OPTIONS (
+'format.metadata::key' 'value'
+)
+
+# accepts multiple entries (on different keys)
+statement ok
+COPY source_table
+TO 'test_files/scratch/copy/table_with_metadata/'
+STORED AS PARQUET
+OPTIONS (
+'format.metadata::key1' '',
+'format.metadata::key2' 'value',
+'format.metadata::key3' 'value with spaces',
+'format.metadata::key4' 'value with special chars :: :'
+)
+
+# accepts multiple entries with the same key (will overwrite)
+statement ok
+COPY source_table
+TO 'test_files/scratch/copy/table_with_metadata/'
+STORED AS PARQUET
+OPTIONS (
+'format.metadata::key1' 'value',
+'format.metadata::key1' 'value'
+)
+
+# errors if key is missing
+statement error DataFusion error: Invalid or Unsupported Configuration: Invalid metadata key provided, missing key in metadata::
+COPY source_table
+TO 'test_files/scratch/copy/table_with_metadata/'
+STORED AS PARQUET
+OPTIONS (
+'format.metadata::' 'value'
+)
+
+# errors if key contains internal '::'
+statement error DataFusion error: Invalid or Unsupported Configuration: Invalid metadata key provided, found too many '::' in "metadata::key::extra"
+COPY source_table
+TO 'test_files/scratch/copy/table_with_metadata/'
+STORED AS PARQUET
+OPTIONS (
+'format.metadata::key::extra' 'value'
+)
+
+# errors for invalid property (not starting with `format.metadata`)
+statement error DataFusion error: Invalid or Unsupported Configuration: Config value "wrong-metadata" not found on ColumnOptions
+COPY source_table
+TO 'test_files/scratch/copy/table_with_metadata/'
+STORED AS PARQUET
+OPTIONS (
+'format.wrong-metadata::key' 'value'
+)
+
+
 # validate multiple parquet file output with all options set
 statement ok
 CREATE EXTERNAL TABLE validate_parquet_with_options STORED AS PARQUET LOCATION 'test_files/scratch/copy/table_with_options/';
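To see what the COPY statements above actually produce, a sketch that reads the key/value metadata back with the parquet crate's file reader (the file name is illustrative; COPY chooses its own output names):

```rust
// Sketch: verifying metadata written by the COPY statements above.
use parquet::file::reader::{FileReader, SerializedFileReader};
use std::fs::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical output file under the scratch directory used above.
    let file =
        File::open("test_files/scratch/copy/table_with_metadata/part-0.parquet")?;
    let reader = SerializedFileReader::new(file)?;

    // key_value_metadata() returns Option<&Vec<KeyValue>> on FileMetaData.
    let kv = reader.metadata().file_metadata().key_value_metadata();
    // Expect the 'key' -> 'value' entry from the OPTIONS clause.
    assert!(kv
        .map(|kv| kv.iter().any(|e| e.key == "key"))
        .unwrap_or(false));
    Ok(())
}
```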
diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt
index adc577f12f91..ff63416b3a10 100644
--- a/datafusion/sqllogictest/test_files/expr.slt
+++ b/datafusion/sqllogictest/test_files/expr.slt
@@ -1871,6 +1871,17 @@ SELECT digest('','blake3');
 ----
 af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262
 
+
+query T
+SELECT substring('alphabet', 1)
+----
+alphabet
+
+query T
+SELECT substring('alphabet', 3, 2)
+----
+ph
+
 query T
 SELECT substring('alphabet' from 2 for 1);
 ----
@@ -1886,6 +1897,24 @@ SELECT substring('alphabet' for 1);
 ----
 a
 
+# The 'from' and 'for' parameters don't support string types, because they should be treated as
+# regular expressions, which we have not implemented yet.
+query error DataFusion error: Error during planning: No function matches the given name and argument types
+SELECT substring('alphabet' FROM '3')
+
+query error DataFusion error: Error during planning: No function matches the given name and argument types
+SELECT substring('alphabet' FROM '3' FOR '2')
+
+query error DataFusion error: Error during planning: No function matches the given name and argument types
+SELECT substring('alphabet' FROM '3' FOR 2)
+
+query error DataFusion error: Error during planning: No function matches the given name and argument types
+SELECT substring('alphabet' FROM 3 FOR '2')
+
+query error DataFusion error: Error during planning: No function matches the given name and argument types
+SELECT substring('alphabet' FOR '2')
+
+
 ##### csv_query_nullif_divide_by_0
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
index 8f4b1a3816a3..c64279ede02c 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -216,6 +216,7 @@ datafusion.optimizer.hash_join_single_partition_threshold 1048576
 datafusion.optimizer.hash_join_single_partition_threshold_rows 131072
 datafusion.optimizer.max_passes 3
 datafusion.optimizer.prefer_existing_sort false
+datafusion.optimizer.prefer_existing_union false
 datafusion.optimizer.prefer_hash_join true
 datafusion.optimizer.repartition_aggregations true
 datafusion.optimizer.repartition_file_min_size 10485760
@@ -294,6 +295,7 @@ datafusion.optimizer.hash_join_single_partition_threshold 1048576 The maximum es
 datafusion.optimizer.hash_join_single_partition_threshold_rows 131072 The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition
 datafusion.optimizer.max_passes 3 Number of times that the optimizer will attempt to optimize the plan
 datafusion.optimizer.prefer_existing_sort false When true, DataFusion will opportunistically remove sorts when the data is already sorted, (i.e. setting `preserve_order` to true on `RepartitionExec` and using `SortPreservingMergeExec`) When false, DataFusion will maximize plan parallelism using `RepartitionExec` even if this requires subsequently resorting data using a `SortExec`.
+datafusion.optimizer.prefer_existing_union false When set to true, the optimizer will not attempt to convert Union to Interleave
 datafusion.optimizer.prefer_hash_join true When set to true, the physical plan optimizer will prefer HashJoin over SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but consumes more memory
 datafusion.optimizer.repartition_aggregations true Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level
 datafusion.optimizer.repartition_file_min_size 10485760 Minimum total files size in bytes to perform file scan repartitioning.
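The new entry can also be inspected at runtime through `information_schema`, as the slt output above shows. A sketch using the Rust API (setup details are illustrative):

```rust
// Sketch: inspecting the new setting through information_schema.df_settings.
use datafusion::prelude::{SessionConfig, SessionContext};

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let config = SessionConfig::new()
        .with_information_schema(true)
        .set_bool("datafusion.optimizer.prefer_existing_union", true);
    let ctx = SessionContext::new_with_config(config);

    // Prints the name/value row for the new optimizer flag.
    ctx.sql(
        "SELECT name, value FROM information_schema.df_settings \
         WHERE name = 'datafusion.optimizer.prefer_existing_union'",
    )
    .await?
    .show()
    .await?;
    Ok(())
}
```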
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 3ee3778177c4..a90f03306268 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -103,6 +103,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
 | datafusion.optimizer.hash_join_single_partition_threshold | 1048576 | The maximum estimated size in bytes for one input side of a HashJoin will be collected into a single partition |
 | datafusion.optimizer.hash_join_single_partition_threshold_rows | 131072 | The maximum estimated size in rows for one input side of a HashJoin will be collected into a single partition |
 | datafusion.optimizer.default_filter_selectivity | 20 | The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). |
+| datafusion.optimizer.prefer_existing_union | false | When set to true, the optimizer will not attempt to convert Union to Interleave |
 | datafusion.explain.logical_plan_only | false | When set to true, the explain statement will only print logical plans |
 | datafusion.explain.physical_plan_only | false | When set to true, the explain statement will only print physical plans |
 | datafusion.explain.show_statistics | false | When set to true, the explain statement will print operator statistics for physical plans |