Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(11367): parquet writer defaults #34

Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
261 changes: 260 additions & 1 deletion datafusion/common/src/file_options/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,13 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result<EnabledStatis
#[cfg(feature = "parquet")]
#[cfg(test)]
mod tests {
use parquet::{basic::Compression, file::properties::EnabledStatistics};
use parquet::{
basic::Compression,
file::properties::{
BloomFilterProperties, EnabledStatistics, DEFAULT_BLOOM_FILTER_FPP,
DEFAULT_BLOOM_FILTER_NDV,
},
};
use std::collections::HashMap;

use crate::config::{ColumnOptions, ParquetOptions};
Expand Down Expand Up @@ -566,4 +572,257 @@ mod tests {
"the writer_props should have the same configuration as the session's TableParquetOptions",
);
}

#[test]
fn test_defaults_match() {
wiedld marked this conversation as resolved.
Show resolved Hide resolved
// ensure the global settings are the same
let default_table_writer_opts = TableParquetOptions::default();
let default_parquet_opts = ParquetOptions::default();
assert_eq!(
default_table_writer_opts.global,
default_parquet_opts,
"should have matching defaults for TableParquetOptions.global and ParquetOptions",
);

// confirm the TableParquetOptions::default matches the WriterProperties::default, once both converted to WriterProperties
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.build();

// Expected: how the defaults should not match
assert_ne!(
default_writer_props.created_by(),
from_datafusion_defaults.created_by(),
"should have different created_by sources",
);
assert!(
default_writer_props.created_by().starts_with("parquet-rs version"),
"should indicate that writer_props defaults came from the extern parquet crate",
);
assert!(
default_table_writer_opts
.global
.created_by
.starts_with("datafusion version"),
"should indicate that table_parquet_opts defaults came from datafusion",
);

// TODO: in a followup PR, start the discussion of which of these defaults should be changed,
wiedld marked this conversation as resolved.
Show resolved Hide resolved
// and if the change should be in datafusion's default or the extern parquet's default.
// refer to https://github.com/apache/datafusion/issues/11367

// TODO: does not match
assert_eq!(
default_writer_props.compression(&"default".into()),
Compression::UNCOMPRESSED,
"extern parquet's default is None"
);
assert!(
matches!(
from_datafusion_defaults.compression(&"default".into()),
Compression::ZSTD(_)
),
"datafusion's default is zstd"
);

// TODO: does not match
wiedld marked this conversation as resolved.
Show resolved Hide resolved
assert_eq!(
default_writer_props.data_page_row_count_limit(),
20_000,
"extern parquet's default data_page_row_count_limit is 20_000"
);
assert_eq!(
from_datafusion_defaults.data_page_row_count_limit(),
usize::MAX,
"datafusion's default is usize::MAX"
);

// TODO: does not match
assert_eq!(
default_writer_props.column_index_truncate_length(),
Some(64),
"extern parquet's default is 64"
);
assert_eq!(
from_datafusion_defaults.column_index_truncate_length(),
None,
"datafusion's default is None"
);

// TODO: matches once create WriterProps, but only due to parquet's override
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this comment -- is there any action item here?

Copy link
Collaborator Author

@wiedld wiedld Jul 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added another commit to re-work the code comments for this test. Hopefully that makes the misalignments more clear.

Please let me know @alamb if I should change anything else before moving this over into an upstream PR.

assert!(
default_writer_props.dictionary_enabled(&"default".into()),
"extern parquet's default is true"
);
assert_eq!(
default_table_writer_opts.global.dictionary_enabled, None,
"datafusion's has no default"
);
assert!(
from_datafusion_defaults.dictionary_enabled(&"default".into()),
"should see the extern parquet's default over-riding datafusion's None",
);

// TODO: matches once create WriterProps, but only due to parquet's override
assert_eq!(
default_writer_props.statistics_enabled(&"default".into()),
EnabledStatistics::Page,
"extern parquet's default is page"
);
assert_eq!(
default_table_writer_opts.global.statistics_enabled, None,
"datafusion's has no default"
);
assert_eq!(
from_datafusion_defaults.statistics_enabled(&"default".into()),
EnabledStatistics::Page,
"should see the extern parquet's default over-riding datafusion's None",
);

// TODO: matches once create WriterProps, but only due to parquet's override
assert_eq!(
default_writer_props.max_statistics_size(&"default".into()),
4096,
"extern parquet's default is 4096"
);
assert_eq!(
default_table_writer_opts.global.max_statistics_size, None,
"datafusion's has no default"
);
assert_eq!(
default_writer_props.max_statistics_size(&"default".into()),
4096,
"should see the extern parquet's default over-riding datafusion's None",
);

// TODO: temporarily set the extern parquet's defaults back to the datafusion defaults.
let mut from_extern_parquet =
session_config_from_writer_props(&default_writer_props);
from_extern_parquet.global.compression = Some("zstd(3)".into());
from_extern_parquet.global.data_page_row_count_limit = usize::MAX;
from_extern_parquet.global.column_index_truncate_length = None;
from_extern_parquet.global.dictionary_enabled = None;
from_extern_parquet.global.statistics_enabled = None;
from_extern_parquet.global.max_statistics_size = None;

// Expected: the remaining should match
let same_created_by = default_table_writer_opts.global.created_by.clone(); // we expect these to be different
from_extern_parquet.global.created_by = same_created_by; // we expect these to be different
assert_eq!(
default_table_writer_opts,
from_extern_parquet,
"the writer_props should have the same configuration as the session's TableParquetOptions",
);
}

#[test]
fn test_bloom_filter_defaults() {
// the TableParquetOptions::default, with only the bloom filter turned on
let mut default_table_writer_opts = TableParquetOptions::default();
default_table_writer_opts.global.bloom_filter_on_write = true;

// the WriterProperties::default, with only the bloom filter turned on
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.set_bloom_filter_enabled(true)
.build();

// TODO: should have same behavior in either.
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
default_writer_props.bloom_filter_properties(&"default".into()),
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
"parquet and datafusion props, will not have the same bloom filter props",
);
assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
None,
"extern parquet's default remains None"
);
assert_eq!(
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
Some(&BloomFilterProperties::default()),
"datafusion's has BloomFilterProperties::default",
);
}

#[test]
fn test_bloom_filter_set_fpp_only() {
// the TableParquetOptions::default, with only fpp set
let mut default_table_writer_opts = TableParquetOptions::default();
default_table_writer_opts.global.bloom_filter_on_write = true;
default_table_writer_opts.global.bloom_filter_fpp = Some(0.42);

// the WriterProperties::default, with only fpp set
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.set_bloom_filter_enabled(true)
.set_bloom_filter_fpp(0.42)
.build();

// TODO: should have same behavior in either.
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
default_writer_props.bloom_filter_properties(&"default".into()),
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
"parquet and datafusion props, will not have the same bloom filter props",
);
assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
None,
"extern parquet's default remains None"
);
assert_eq!(
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
Some(&BloomFilterProperties {
fpp: 0.42,
ndv: DEFAULT_BLOOM_FILTER_NDV
}),
"datafusion's has BloomFilterProperties",
);
}

#[test]
fn test_bloom_filter_set_ndv_only() {
// the TableParquetOptions::default, with only ndv set
let mut default_table_writer_opts = TableParquetOptions::default();
default_table_writer_opts.global.bloom_filter_on_write = true;
default_table_writer_opts.global.bloom_filter_ndv = Some(42);

// the WriterProperties::default, with only ndv set
let default_writer_props = WriterProperties::new();
let from_datafusion_defaults =
WriterPropertiesBuilder::try_from(&default_table_writer_opts)
.unwrap()
.set_bloom_filter_enabled(true)
.set_bloom_filter_ndv(42)
.build();

// TODO: should have same behavior in either.
// refer to https://github.com/apache/datafusion/issues/11367
assert_ne!(
default_writer_props.bloom_filter_properties(&"default".into()),
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
"parquet and datafusion props, will not have the same bloom filter props",
);
assert_eq!(
default_writer_props.bloom_filter_properties(&"default".into()),
None,
"extern parquet's default remains None"
);
assert_eq!(
from_datafusion_defaults.bloom_filter_properties(&"default".into()),
Some(&BloomFilterProperties {
fpp: DEFAULT_BLOOM_FILTER_FPP,
ndv: 42
}),
"datafusion's has BloomFilterProperties",
);
}
}