From a67d73bfcd1da703be58dbf1982ef6aa9f690527 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Sat, 15 Jun 2024 13:53:32 -0400 Subject: [PATCH 01/20] wip create and register ext file types with session --- datafusion/common/src/config.rs | 2 ++ .../common/src/file_options/file_type.rs | 3 +++ .../core/src/execution/session_state.rs | 24 +++++++++++++++++-- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 1c431d04cd35..0a7681ab95b8 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1658,6 +1658,8 @@ config_namespace! { } } +pub trait FormatOptionsExt: Display {} + #[derive(Debug, Clone, PartialEq)] #[allow(clippy::large_enum_variant)] pub enum FormatOptions { diff --git a/datafusion/common/src/file_options/file_type.rs b/datafusion/common/src/file_options/file_type.rs index fc0bb7445645..cf7195ebe972 100644 --- a/datafusion/common/src/file_options/file_type.rs +++ b/datafusion/common/src/file_options/file_type.rs @@ -40,6 +40,9 @@ pub trait GetExt { fn get_ext(&self) -> String; } +/// Externally Defined FileType +pub trait ExternalFileType: GetExt + Display + Send + Sync {} + /// Readable file type #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum FileType { diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index fed101bd239b..fa312728a8ec 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -41,10 +41,11 @@ use chrono::{DateTime, Utc}; use datafusion_common::alias::AliasGenerator; use datafusion_common::config::{ConfigExtension, ConfigOptions, TableOptions}; use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; +use datafusion_common::file_options::file_type::ExternalFileType; use datafusion_common::tree_node::TreeNode; use datafusion_common::{ - not_impl_err, plan_datafusion_err, DFSchema, DataFusionError, ResolvedTableReference, - TableReference, + config_err, not_impl_err, plan_datafusion_err, DFSchema, DataFusionError, + ResolvedTableReference, TableReference, }; use datafusion_execution::config::SessionConfig; use datafusion_execution::object_store::ObjectStoreUrl; @@ -111,6 +112,8 @@ pub struct SessionState { window_functions: HashMap>, /// Deserializer registry for extensions. serializer_registry: Arc, + /// Holds registered external FileFormat implementations + file_types: HashMap>, /// Session configuration config: SessionConfig, /// Table options @@ -232,6 +235,7 @@ impl SessionState { aggregate_functions: HashMap::new(), window_functions: HashMap::new(), serializer_registry: Arc::new(EmptySerializerRegistry), + file_types: HashMap::new(), table_options: TableOptions::default_from_session_config(config.options()), config, execution_props: ExecutionProps::new(), @@ -830,6 +834,22 @@ impl SessionState { self.table_options.extensions.insert(extension) } + /// Adds or updates a [ExternalFileType] which can be used with COPY TO or CREATE EXTERNAL TABLE statements for reading + /// and writing files of custom formats. 
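    ///
    /// A minimal usage sketch (`MyFileType` is a hypothetical [ExternalFileType]
    /// implementation and `state` an assumed mutable SessionState; neither is
    /// defined in this patch):
    ///
    /// ```ignore
    /// state.register_file_type(Arc::new(MyFileType {}), false)?;
    /// // Re-registering the same extension must opt in to overwrite:
    /// assert!(state.register_file_type(Arc::new(MyFileType {}), false).is_err());
    /// ```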
+ pub fn register_file_type( + &mut self, + file_type: Arc, + overwrite: bool, + ) -> Result<(), DataFusionError> { + let ext = file_type.get_ext(); + match (self.file_types.entry(ext.clone()), overwrite){ + (Entry::Vacant(e), _) => {e.insert(file_type);}, + (Entry::Occupied(mut e), true) => {e.insert(file_type);}, + (Entry::Occupied(_), false) => return config_err!("File type already registered for extension {ext}. Set overwrite to true to replace this extension."), + }; + Ok(()) + } + /// Get a new TaskContext to run in this session pub fn task_ctx(&self) -> Arc { Arc::new(TaskContext::from(self)) From 85e64ba4ea786b0dc4f95443776c750c72d6a238 Mon Sep 17 00:00:00 2001 From: Lordworms <48054792+Lordworms@users.noreply.github.com> Date: Sat, 15 Jun 2024 09:45:57 -0700 Subject: [PATCH 02/20] Add contains function, and support in datafusion substrait consumer (#10879) * adding new function contains * adding substrait test * adding doc * adding doc * Update docs/source/user-guide/sql/scalar_functions.md Co-authored-by: Alex Huang * adding entry --------- Co-authored-by: Alex Huang --- datafusion/functions/src/string/contains.rs | 143 ++++++++++++++++++ datafusion/functions/src/string/mod.rs | 8 +- .../sqllogictest/test_files/functions.slt | 18 +++ .../substrait/tests/cases/function_test.rs | 58 +++++++ datafusion/substrait/tests/cases/mod.rs | 1 + .../testdata/contains_plan.substrait.json | 133 ++++++++++++++++ .../source/user-guide/sql/scalar_functions.md | 14 ++ 7 files changed, 373 insertions(+), 2 deletions(-) create mode 100644 datafusion/functions/src/string/contains.rs create mode 100644 datafusion/substrait/tests/cases/function_test.rs create mode 100644 datafusion/substrait/tests/testdata/contains_plan.substrait.json diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs new file mode 100644 index 000000000000..faf979f80614 --- /dev/null +++ b/datafusion/functions/src/string/contains.rs @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
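+
+//! `contains(string, search_string)`: a scalar UDF returning true when
+//! `search_string` is found in `string`. Note that the implementation below
+//! delegates to `regexp_is_match`, so the search string is evaluated as a
+//! regular expression rather than as a literal substring.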
+
+use crate::utils::make_scalar_function;
+use arrow::array::{ArrayRef, OffsetSizeTrait};
+use arrow::datatypes::DataType;
+use arrow::datatypes::DataType::Boolean;
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::DataFusionError;
+use datafusion_common::Result;
+use datafusion_common::{arrow_datafusion_err, exec_err};
+use datafusion_expr::ScalarUDFImpl;
+use datafusion_expr::TypeSignature::Exact;
+use datafusion_expr::{ColumnarValue, Signature, Volatility};
+use std::any::Any;
+use std::sync::Arc;
+#[derive(Debug)]
+pub struct ContainsFunc {
+    signature: Signature,
+}
+
+impl Default for ContainsFunc {
+    fn default() -> Self {
+        ContainsFunc::new()
+    }
+}
+
+impl ContainsFunc {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for ContainsFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "contains"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _: &[DataType]) -> Result<DataType> {
+        Ok(Boolean)
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        match args[0].data_type() {
+            DataType::Utf8 => make_scalar_function(contains::<i32>, vec![])(args),
+            DataType::LargeUtf8 => make_scalar_function(contains::<i64>, vec![])(args),
+            other => {
+                exec_err!("unsupported data type {other:?} for function contains")
+            }
+        }
+    }
+}
+
+/// Uses `regexp_is_match_utf8` to do the calculation for contains, so the
+/// search string is treated as a regular expression
+pub fn contains<OffsetSize: OffsetSizeTrait>(
+    args: &[ArrayRef],
+) -> Result<ArrayRef, DataFusionError> {
+    let mod_str = as_generic_string_array::<OffsetSize>(&args[0])?;
+    let match_str = as_generic_string_array::<OffsetSize>(&args[1])?;
+    let res = arrow::compute::kernels::comparison::regexp_is_match_utf8(
+        mod_str, match_str, None,
+    )
+    .map_err(|e| arrow_datafusion_err!(e))?;
+
+    Ok(Arc::new(res) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::string::contains::ContainsFunc;
+    use crate::utils::test::test_function;
+    use arrow::array::Array;
+    use arrow::{array::BooleanArray, datatypes::DataType::Boolean};
+    use datafusion_common::Result;
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::ColumnarValue;
+    use datafusion_expr::ScalarUDFImpl;
+    #[test]
+    fn test_functions() -> Result<()> {
+        test_function!(
+            ContainsFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from("alph")),
+            ],
+            Ok(Some(true)),
+            bool,
+            Boolean,
+            BooleanArray
+        );
+        test_function!(
+            ContainsFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from("dddddd")),
+            ],
+            Ok(Some(false)),
+            bool,
+            Boolean,
+            BooleanArray
+        );
+        test_function!(
+            ContainsFunc::new(),
+            &[
+                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
+                ColumnarValue::Scalar(ScalarValue::from("pha")),
+            ],
+            Ok(Some(true)),
+            bool,
+            Boolean,
+            BooleanArray
+        );
+        Ok(())
+    }
+}
diff --git a/datafusion/functions/src/string/mod.rs b/datafusion/functions/src/string/mod.rs
index 219ef8b5a50f..5bf372c29f2d 100644
--- a/datafusion/functions/src/string/mod.rs
+++ b/datafusion/functions/src/string/mod.rs
@@ -28,6 +28,7 @@ pub mod chr;
 pub mod common;
 pub mod concat;
 pub mod concat_ws;
+pub mod contains;
 pub mod ends_with;
 pub mod initcap;
 pub mod levenshtein;
@@ -43,7 +44,6 @@ pub mod starts_with;
 pub mod to_hex;
 pub mod upper;
 pub mod uuid;
-
 // create UDFs
 make_udf_function!(ascii::AsciiFunc, ASCII, ascii);
 make_udf_function!(bit_length::BitLengthFunc, BIT_LENGTH, bit_length);
@@ -66,7 +66,7 @@ make_udf_function!(split_part::SplitPartFunc, SPLIT_PART, split_part);
 make_udf_function!(to_hex::ToHexFunc, TO_HEX, to_hex);
 make_udf_function!(upper::UpperFunc, UPPER, upper);
 make_udf_function!(uuid::UuidFunc, UUID, uuid);
-
+make_udf_function!(contains::ContainsFunc, CONTAINS, contains);
 pub mod expr_fn {
     use datafusion_expr::Expr;

@@ -149,6 +149,9 @@ pub mod expr_fn {
     ),(
         uuid,
         "returns uuid v4 as a string value",
+    ), (
+        contains,
+        "Return true if search_string is found within string; the search string is evaluated as a regular expression (RLIKE-style match)",
     ));

     #[doc = "Removes all characters, spaces by default, from both sides of a string"]
@@ -188,5 +191,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
         to_hex(),
         upper(),
         uuid(),
+        contains(),
     ]
 }
diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt
index df6295d63b81..c3dd791f6ca8 100644
--- a/datafusion/sqllogictest/test_files/functions.slt
+++ b/datafusion/sqllogictest/test_files/functions.slt
@@ -1158,3 +1158,21 @@ drop table uuid_table

 statement ok
 drop table t
+
+
+# test for contains
+
+query B
+select contains('alphabet', 'pha');
+----
+true
+
+query B
+select contains('alphabet', 'dddd');
+----
+false
+
+query B
+select contains('', '');
+----
+true
diff --git a/datafusion/substrait/tests/cases/function_test.rs b/datafusion/substrait/tests/cases/function_test.rs
new file mode 100644
index 000000000000..b4c5659a3a49
--- /dev/null
+++ b/datafusion/substrait/tests/cases/function_test.rs
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//!
Tests for Function Compatibility + +#[cfg(test)] +mod tests { + use datafusion::common::Result; + use datafusion::prelude::{CsvReadOptions, SessionContext}; + use datafusion_substrait::logical_plan::consumer::from_substrait_plan; + use std::fs::File; + use std::io::BufReader; + use substrait::proto::Plan; + + #[tokio::test] + async fn contains_function_test() -> Result<()> { + let ctx = create_context().await?; + + let path = "tests/testdata/contains_plan.substrait.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + + let plan_str = format!("{:?}", plan); + + assert_eq!( + plan_str, + "Projection: nation.b AS n_name\ + \n Filter: contains(nation.b, Utf8(\"IA\"))\ + \n TableScan: nation projection=[a, b, c, d, e, f]" + ); + Ok(()) + } + + async fn create_context() -> datafusion::common::Result { + let ctx = SessionContext::new(); + ctx.register_csv("nation", "tests/testdata/data.csv", CsvReadOptions::new()) + .await?; + Ok(ctx) + } +} diff --git a/datafusion/substrait/tests/cases/mod.rs b/datafusion/substrait/tests/cases/mod.rs index a31f93087d83..d3ea7695e4b9 100644 --- a/datafusion/substrait/tests/cases/mod.rs +++ b/datafusion/substrait/tests/cases/mod.rs @@ -16,6 +16,7 @@ // under the License. mod consumer_integration; +mod function_test; mod logical_plans; mod roundtrip_logical_plan; mod roundtrip_physical_plan; diff --git a/datafusion/substrait/tests/testdata/contains_plan.substrait.json b/datafusion/substrait/tests/testdata/contains_plan.substrait.json new file mode 100644 index 000000000000..76edde34e3b0 --- /dev/null +++ b/datafusion/substrait/tests/testdata/contains_plan.substrait.json @@ -0,0 +1,133 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 1, + "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_string.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 1, + "name": "contains:str_str" + } + } + ], + "relations": [ + { + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 4 + ] + } + }, + "input": { + "filter": { + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "n_nationkey", + "n_name", + "n_regionkey", + "n_comment" + ], + "struct": { + "types": [ + { + "i32": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "string": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "string": { + "nullability": "NULLABILITY_REQUIRED" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": [ + "nation" + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 1, + "outputType": { + "bool": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "literal": { + "string": "IA" + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": {} + } + } + ] + } + }, + "names": [ + "n_name" + ] + } + } + ], + "version": { + "minorNumber": 38, + "producer": "ibis-substrait" + } +} \ No newline at end of file diff --git a/docs/source/user-guide/sql/scalar_functions.md 
b/docs/source/user-guide/sql/scalar_functions.md
index 10c52bc5de9e..ec34dbf9ba6c 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -681,6 +681,7 @@ _Alias of [nvl](#nvl)._
 - [substr_index](#substr_index)
 - [find_in_set](#find_in_set)
 - [position](#position)
+- [contains](#contains)

 ### `ascii`

@@ -1443,6 +1444,19 @@ position(substr in origstr)
 - **substr**: The pattern string.
 - **origstr**: The model string.

+### `contains`
+
+Return true if `search_string` is found within `string`. The search string is
+evaluated as a regular expression.
+
+```
+contains(string, search_string)
+```
+
+#### Arguments
+
+- **string**: The string to search in.
+- **search_string**: The pattern string to search for.
+
 ## Time and Date Functions

 - [now](#now)

From 05d329b391aea3571ae02c75460cc07d74461d8e Mon Sep 17 00:00:00 2001
From: Devin D'Angelo
Date: Sat, 15 Jun 2024 21:56:56 -0400
Subject: [PATCH 03/20] logical planning updated

---
 .../common/src/file_options/file_type.rs      | 16 +++++
 datafusion/core/src/dataframe/mod.rs          | 13 ++---
 datafusion/core/src/dataframe/parquet.rs      |  9 ++-
 .../core/src/execution/session_state.rs       | 13 +++++
 datafusion/core/src/physical_planner.rs       |  2 +-
 datafusion/expr/src/logical_plan/builder.rs   |  6 +-
 datafusion/expr/src/logical_plan/display.rs   |  4 +-
 datafusion/expr/src/logical_plan/dml.rs       |  6 +-
 datafusion/expr/src/logical_plan/plan.rs      |  8 +--
 datafusion/expr/src/logical_plan/tree_node.rs |  4 +-
 datafusion/sql/src/planner.rs                 |  6 ++
 datafusion/sql/src/statement.rs               | 58 ++++++++++---------
 12 files changed, 93 insertions(+), 52 deletions(-)

diff --git a/datafusion/common/src/file_options/file_type.rs b/datafusion/common/src/file_options/file_type.rs
index cf7195ebe972..ffbde0394cd4 100644
--- a/datafusion/common/src/file_options/file_type.rs
+++ b/datafusion/common/src/file_options/file_type.rs
@@ -43,6 +43,22 @@ pub trait GetExt {
 /// Externally Defined FileType
 pub trait ExternalFileType: GetExt + Display + Send + Sync {}

+pub struct ExternalCSV {}
+
+impl GetExt for ExternalCSV {
+    fn get_ext(&self) -> String {
+        "csv".to_string()
+    }
+}
+
+impl Display for ExternalCSV {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "csv")
+    }
+}
+
+impl ExternalFileType for ExternalCSV {}
+
 /// Readable file type
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum FileType {
diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index b5c58eff577c..19e1067c1a09 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -44,7 +44,8 @@ use arrow::array::{Array, ArrayRef, Int64Array, StringArray};
 use arrow::compute::{cast, concat};
 use arrow::datatypes::{DataType, Field};
 use arrow_schema::{Schema, SchemaRef};
-use datafusion_common::config::{CsvOptions, FormatOptions, JsonOptions};
+use datafusion_common::config::{CsvOptions, JsonOptions};
+use datafusion_common::file_options::file_type::ExternalCSV;
 use datafusion_common::{
     plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions,
 };
@@ -1236,13 +1237,11 @@ impl DataFrame {
                 "Overwrites are not implemented for DataFrame::write_csv.".to_owned(),
             ));
         }
-        let props = writer_options
-            .unwrap_or_else(|| self.session_state.default_table_options().csv);

         let plan = LogicalPlanBuilder::copy_to(
             self.plan,
             path.into(),
-            FormatOptions::CSV(props),
+            Arc::new(ExternalCSV {}),
             HashMap::new(),
             options.partition_by,
         )?
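A quick sketch of how the rewired `write_csv` path above is driven (a sketch
only; the `ctx: SessionContext` value and the surrounding async context are
assumed, not part of this patch):

    // Writes a query result as CSV through the new
    // Arc<dyn ExternalFileType> plumbing carried by CopyTo.
    let df = ctx.sql("SELECT 1 AS a").await?;
    df.write_csv("/tmp/out/", DataFrameWriteOptions::new(), None)
        .await?;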
@@ -1291,13 +1290,11 @@ impl DataFrame { )); } - let props = writer_options - .unwrap_or_else(|| self.session_state.default_table_options().json); - let plan = LogicalPlanBuilder::copy_to( self.plan, path.into(), - FormatOptions::JSON(props), + // TODO! update for json + Arc::new(ExternalCSV {}), Default::default(), options.partition_by, )? diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index 0ec46df0ae5d..64bdde51ff94 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -15,11 +15,15 @@ // specific language governing permissions and limitations // under the License. +use std::sync::Arc; + use super::{ DataFrame, DataFrameWriteOptions, DataFusionError, LogicalPlanBuilder, RecordBatch, }; -use datafusion_common::config::{FormatOptions, TableParquetOptions}; +use datafusion_common::{ + config::TableParquetOptions, file_options::file_type::ExternalCSV, +}; impl DataFrame { /// Execute the `DataFrame` and write the results to Parquet file(s). @@ -63,7 +67,8 @@ impl DataFrame { let plan = LogicalPlanBuilder::copy_to( self.plan, path.into(), - FormatOptions::PARQUET(props), + // TODO! update for parquet + Arc::new(ExternalCSV {}), Default::default(), options.partition_by, )? diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index fa312728a8ec..a134b7dadaec 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -992,6 +992,19 @@ impl<'a> ContextProvider for SessionContextProvider<'a> { fn udwf_names(&self) -> Vec { self.state.window_functions().keys().cloned().collect() } + + fn get_file_type( + &self, + ext: &str, + ) -> datafusion_common::Result> { + self.state + .file_types + .get(ext) + .ok_or(plan_datafusion_err!( + "There is no registered file type with ext {ext}" + )) + .map(|file_type| file_type.clone()) + } } impl FunctionRegistry for SessionState { diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 4f9187595018..031dc64acfa8 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -759,7 +759,7 @@ impl DefaultPhysicalPlanner { LogicalPlan::Copy(CopyTo { input, output_url, - format_options, + file_type, partition_by, options: source_option_tuples, }) => { diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 2f1ece32ab15..9a588c4db39b 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -48,8 +48,8 @@ use crate::{ }; use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; -use datafusion_common::config::FormatOptions; use datafusion_common::display::ToStringifiedPlan; +use datafusion_common::file_options::file_type::ExternalFileType; use datafusion_common::{ get_target_functional_dependencies, internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, @@ -271,14 +271,14 @@ impl LogicalPlanBuilder { pub fn copy_to( input: LogicalPlan, output_url: String, - format_options: FormatOptions, + file_type: Arc, options: HashMap, partition_by: Vec, ) -> Result { Ok(Self::from(LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url, - format_options, + file_type, options, partition_by, }))) diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs 
index 707cff8ab5f1..81fd03555abb 100644 --- a/datafusion/expr/src/logical_plan/display.rs +++ b/datafusion/expr/src/logical_plan/display.rs @@ -425,7 +425,7 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> { LogicalPlan::Copy(CopyTo { input: _, output_url, - format_options, + file_type, partition_by: _, options, }) => { @@ -437,7 +437,7 @@ impl<'a, 'b> PgJsonVisitor<'a, 'b> { json!({ "Node Type": "CopyTo", "Output URL": output_url, - "Format Options": format!("{}", format_options), + "File Type": format!("{}", file_type.get_ext()), "Options": op_str }) } diff --git a/datafusion/expr/src/logical_plan/dml.rs b/datafusion/expr/src/logical_plan/dml.rs index 13f3759ab8c0..fa7011763525 100644 --- a/datafusion/expr/src/logical_plan/dml.rs +++ b/datafusion/expr/src/logical_plan/dml.rs @@ -21,7 +21,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema}; -use datafusion_common::config::FormatOptions; +use datafusion_common::file_options::file_type::ExternalFileType; use datafusion_common::{DFSchemaRef, TableReference}; use crate::LogicalPlan; @@ -35,8 +35,8 @@ pub struct CopyTo { pub output_url: String, /// Determines which, if any, columns should be used for hive-style partitioned writes pub partition_by: Vec, - /// File format options. - pub format_options: FormatOptions, + /// File type trait + pub file_type: Arc, /// SQL Options that can affect the formats pub options: HashMap, } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 02378ab3fc1b..bf8a65c39aec 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -848,13 +848,13 @@ impl LogicalPlan { LogicalPlan::Copy(CopyTo { input: _, output_url, - format_options, + file_type, options, partition_by, }) => Ok(LogicalPlan::Copy(CopyTo { input: Arc::new(inputs.swap_remove(0)), output_url: output_url.clone(), - format_options: format_options.clone(), + file_type: file_type.clone(), options: options.clone(), partition_by: partition_by.clone(), })), @@ -1750,7 +1750,7 @@ impl LogicalPlan { LogicalPlan::Copy(CopyTo { input: _, output_url, - format_options, + file_type, options, .. 
}) => { @@ -1760,7 +1760,7 @@ impl LogicalPlan { .collect::>() .join(", "); - write!(f, "CopyTo: format={format_options} output_url={output_url} options: ({op_str})") + write!(f, "CopyTo: file_type={} output_url={output_url} options: ({op_str})", file_type.get_ext()) } LogicalPlan::Ddl(ddl) => { write!(f, "{}", ddl.display()) diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 86c0cffd80a1..a47906f20322 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -256,14 +256,14 @@ impl TreeNode for LogicalPlan { input, output_url, partition_by, - format_options, + file_type, options, }) => rewrite_arc(input, f)?.update_data(|input| { LogicalPlan::Copy(CopyTo { input, output_url, partition_by, - format_options, + file_type, options, }) }), diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index a92e64597e82..401d8163db92 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use std::vec; use arrow_schema::*; +use datafusion_common::file_options::file_type::ExternalFileType; use datafusion_common::{ field_not_found, internal_err, plan_datafusion_err, SchemaError, }; @@ -48,6 +49,11 @@ use crate::utils::make_decimal_type; pub trait ContextProvider { /// Getter for a datasource fn get_table_source(&self, name: TableReference) -> Result>; + + fn get_file_type(&self, _ext: &str) -> Result> { + not_impl_err!("Registered file types are not supported") + } + /// Getter for a table function fn get_table_function_source( &self, diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index d10956efb66c..d921586787da 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -34,8 +34,8 @@ use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ exec_err, not_impl_err, plan_datafusion_err, plan_err, schema_err, unqualified_field_not_found, Column, Constraints, DFSchema, DFSchemaRef, - DataFusionError, FileType, Result, ScalarValue, SchemaError, SchemaReference, - TableReference, ToDFSchema, + DataFusionError, Result, ScalarValue, SchemaError, SchemaReference, TableReference, + ToDFSchema, }; use datafusion_expr::dml::CopyTo; use datafusion_expr::expr_rewriter::normalize_col_with_schemas_and_ambiguity_check; @@ -899,31 +899,35 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } - let file_type = if let Some(file_type) = statement.stored_as { - FileType::from_str(&file_type).map_err(|_| { - DataFusionError::Configuration(format!("Unknown FileType {}", file_type)) - })? + let maybe_file_type = if let Some(stored_as) = &statement.stored_as { + if let Ok(ext_file_type) = self.context_provider.get_file_type(stored_as) { + Some(ext_file_type) + } else { + None + } } else { - let e = || { - DataFusionError::Configuration( - "Format not explicitly set and unable to get file extension! Use STORED AS to define file format." - .to_string(), - ) - }; - // try to infer file format from file extension - let extension: &str = &Path::new(&statement.target) - .extension() - .ok_or_else(e)? - .to_str() - .ok_or_else(e)? - .to_lowercase(); - - FileType::from_str(extension).map_err(|e| { - DataFusionError::Configuration(format!( - "{}. Use STORED AS to define file format.", - e - )) - })? 
+ None + }; + + let file_type = match maybe_file_type { + Some(ft) => ft, + None => { + let e = || { + DataFusionError::Configuration( + "Format not explicitly set and unable to get file extension! Use STORED AS to define file format." + .to_string(), + ) + }; + // try to infer file format from file extension + let extension: &str = &Path::new(&statement.target) + .extension() + .ok_or_else(e)? + .to_str() + .ok_or_else(e)? + .to_lowercase(); + + self.context_provider.get_file_type(extension)? + } }; let partition_by = statement @@ -938,7 +942,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: statement.target, - format_options: file_type.into(), + file_type, partition_by, options, })) From 027131bf9609e6adf6bb76569e5eb83e0eee3e34 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Sun, 16 Jun 2024 17:17:20 -0400 Subject: [PATCH 04/20] compiling --- .../common/src/file_options/file_type.rs | 14 +++- .../core/src/datasource/file_format/mod.rs | 56 +++++++++++++++- .../core/src/execution/session_state.rs | 23 +++---- datafusion/core/src/physical_planner.rs | 64 +++++++++---------- 4 files changed, 108 insertions(+), 49 deletions(-) diff --git a/datafusion/common/src/file_options/file_type.rs b/datafusion/common/src/file_options/file_type.rs index ffbde0394cd4..7bcaca11a5da 100644 --- a/datafusion/common/src/file_options/file_type.rs +++ b/datafusion/common/src/file_options/file_type.rs @@ -17,6 +17,7 @@ //! File type abstraction +use std::any::Any; use std::fmt::{self, Display}; use std::str::FromStr; @@ -41,7 +42,12 @@ pub trait GetExt { } /// Externally Defined FileType -pub trait ExternalFileType: GetExt + Display + Send + Sync {} +pub trait ExternalFileType: GetExt + Display + Send + Sync { + /// Returns the table source as [`Any`] so that it can be + /// downcast to a specific implementation. + fn as_any(&self) -> &dyn Any; +} + pub struct ExternalCSV {} @@ -57,7 +63,11 @@ impl Display for ExternalCSV { } } -impl ExternalFileType for ExternalCSV {} +impl ExternalFileType for ExternalCSV { + fn as_any(&self) -> &dyn Any{ + self + } +} /// Readable file type #[derive(Debug, Clone, PartialEq, Eq, Hash)] diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 9462cde43610..bc21a42368c7 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -32,7 +32,7 @@ pub mod parquet; pub mod write; use std::any::Any; -use std::fmt; +use std::fmt::{self, Display}; use std::sync::Arc; use crate::arrow::datatypes::SchemaRef; @@ -41,7 +41,8 @@ use crate::error::Result; use crate::execution::context::SessionState; use crate::physical_plan::{ExecutionPlan, Statistics}; -use datafusion_common::not_impl_err; +use datafusion_common::file_options::file_type::ExternalFileType; +use datafusion_common::{internal_err, not_impl_err, GetExt}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; use async_trait::async_trait; @@ -58,6 +59,10 @@ pub trait FileFormat: Send + Sync + fmt::Debug { /// downcast to a specific implementation. fn as_any(&self) -> &dyn Any; + fn get_ext(&self) -> String{ + panic!("Undefined get_ext") + } + /// Infer the common schema of the provided objects. The objects will usually /// be analysed up to a given number of records or files (as specified in the /// format config) then give the estimated common schema. 
This might fail if @@ -106,6 +111,53 @@ pub trait FileFormat: Send + Sync + fmt::Debug { } } +pub struct DefaultFileFormat{ + file_format: Arc +} + +impl DefaultFileFormat{ + pub fn new(file_format: Arc) -> Self{ + Self{ file_format } + } +} + +impl ExternalFileType for DefaultFileFormat{ + fn as_any(&self) -> &dyn Any { + self + } +} + +impl Display for DefaultFileFormat{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.file_format.fmt(f) + } +} + +impl GetExt for DefaultFileFormat{ + fn get_ext(&self) -> String { + self.file_format.get_ext() + } +} + +pub fn format_as_file_type( + file_format: Arc +) -> Arc{ + Arc::new(DefaultFileFormat{file_format}) +} + +pub fn file_type_to_format( + file_type: &Arc, +) -> datafusion_common::Result> { + match file_type + .as_ref() + .as_any() + .downcast_ref::() + { + Some(source) => Ok(source.file_format.clone()), + _ => internal_err!("FileType was not DefaultFileFormat"), + } +} + #[cfg(test)] pub(crate) mod test_util { use std::ops::Range; diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index a134b7dadaec..2be6938e3235 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -25,6 +25,7 @@ use crate::catalog::{ MemoryCatalogProviderList, }; use crate::datasource::cte_worktable::CteWorkTable; +use crate::datasource::file_format::{format_as_file_type, FileFormat}; use crate::datasource::function::{TableFunction, TableFunctionImpl}; use crate::datasource::provider::{DefaultTableFactory, TableProviderFactory}; use crate::datasource::provider_as_source; @@ -113,7 +114,7 @@ pub struct SessionState { /// Deserializer registry for extensions. serializer_registry: Arc, /// Holds registered external FileFormat implementations - file_types: HashMap>, + file_formats: HashMap>, /// Session configuration config: SessionConfig, /// Table options @@ -235,7 +236,7 @@ impl SessionState { aggregate_functions: HashMap::new(), window_functions: HashMap::new(), serializer_registry: Arc::new(EmptySerializerRegistry), - file_types: HashMap::new(), + file_formats: HashMap::new(), table_options: TableOptions::default_from_session_config(config.options()), config, execution_props: ExecutionProps::new(), @@ -836,15 +837,15 @@ impl SessionState { /// Adds or updates a [ExternalFileType] which can be used with COPY TO or CREATE EXTERNAL TABLE statements for reading /// and writing files of custom formats. - pub fn register_file_type( + pub fn register_file_format( &mut self, - file_type: Arc, + file_format: Arc, overwrite: bool, ) -> Result<(), DataFusionError> { - let ext = file_type.get_ext(); - match (self.file_types.entry(ext.clone()), overwrite){ - (Entry::Vacant(e), _) => {e.insert(file_type);}, - (Entry::Occupied(mut e), true) => {e.insert(file_type);}, + let ext = file_format.get_ext(); + match (self.file_formats.entry(ext.clone()), overwrite){ + (Entry::Vacant(e), _) => {e.insert(file_format);}, + (Entry::Occupied(mut e), true) => {e.insert(file_format);}, (Entry::Occupied(_), false) => return config_err!("File type already registered for extension {ext}. 
Set overwrite to true to replace this extension."), }; Ok(()) @@ -998,12 +999,12 @@ impl<'a> ContextProvider for SessionContextProvider<'a> { ext: &str, ) -> datafusion_common::Result> { self.state - .file_types + .file_formats .get(ext) .ok_or(plan_datafusion_err!( - "There is no registered file type with ext {ext}" + "There is no registered file format with ext {ext}" )) - .map(|file_type| file_type.clone()) + .map(|file_type| format_as_file_type(file_type.clone())) } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 031dc64acfa8..1de550a18d1e 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -22,13 +22,8 @@ use std::collections::HashMap; use std::fmt::Write; use std::sync::Arc; -use crate::datasource::file_format::arrow::ArrowFormat; -use crate::datasource::file_format::avro::AvroFormat; -use crate::datasource::file_format::csv::CsvFormat; -use crate::datasource::file_format::json::JsonFormat; #[cfg(feature = "parquet")] -use crate::datasource::file_format::parquet::ParquetFormat; -use crate::datasource::file_format::FileFormat; +use crate::datasource::file_format::{file_type_to_format}; use crate::datasource::listing::ListingTableUrl; use crate::datasource::physical_plan::FileSinkConfig; use crate::datasource::source_as_provider; @@ -74,11 +69,10 @@ use arrow::compute::SortOptions; use arrow::datatypes::{Schema, SchemaRef}; use arrow_array::builder::StringBuilder; use arrow_array::RecordBatch; -use datafusion_common::config::FormatOptions; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, - FileType, ScalarValue, + ScalarValue, }; use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ @@ -786,32 +780,34 @@ impl DefaultPhysicalPlanner { table_partition_cols, overwrite: false, }; - let mut table_options = session_state.default_table_options(); - let sink_format: Arc = match format_options { - FormatOptions::CSV(options) => { - table_options.csv = options.clone(); - table_options.set_file_format(FileType::CSV); - table_options.alter_with_string_hash_map(source_option_tuples)?; - Arc::new(CsvFormat::default().with_options(table_options.csv)) - } - FormatOptions::JSON(options) => { - table_options.json = options.clone(); - table_options.set_file_format(FileType::JSON); - table_options.alter_with_string_hash_map(source_option_tuples)?; - Arc::new(JsonFormat::default().with_options(table_options.json)) - } - #[cfg(feature = "parquet")] - FormatOptions::PARQUET(options) => { - table_options.parquet = options.clone(); - table_options.set_file_format(FileType::PARQUET); - table_options.alter_with_string_hash_map(source_option_tuples)?; - Arc::new( - ParquetFormat::default().with_options(table_options.parquet), - ) - } - FormatOptions::AVRO => Arc::new(AvroFormat {}), - FormatOptions::ARROW => Arc::new(ArrowFormat {}), - }; + + let sink_format = file_type_to_format(file_type)?; + + // let sink_format: Arc = match format_options { + // FormatOptions::CSV(options) => { + // table_options.csv = options.clone(); + // table_options.set_file_format(FileType::CSV); + // table_options.alter_with_string_hash_map(source_option_tuples)?; + // Arc::new(CsvFormat::default().with_options(table_options.csv)) + // } + // FormatOptions::JSON(options) => { + // table_options.json = options.clone(); + // table_options.set_file_format(FileType::JSON); + // 
table_options.alter_with_string_hash_map(source_option_tuples)?; + // Arc::new(JsonFormat::default().with_options(table_options.json)) + // } + // #[cfg(feature = "parquet")] + // FormatOptions::PARQUET(options) => { + // table_options.parquet = options.clone(); + // table_options.set_file_format(FileType::PARQUET); + // table_options.alter_with_string_hash_map(source_option_tuples)?; + // Arc::new( + // ParquetFormat::default().with_options(table_options.parquet), + // ) + // } + // FormatOptions::AVRO => Arc::new(AvroFormat {}), + // FormatOptions::ARROW => Arc::new(ArrowFormat {}), + // }; sink_format .create_writer_physical_plan(input_exec, session_state, config, None) From 54f5f8fb639f309bd27e89789e067e75c0af09fe Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Tue, 18 Jun 2024 18:51:29 -0400 Subject: [PATCH 05/20] removing filetype enum --- datafusion/common/src/config.rs | 6 +- .../common/src/file_options/file_type.rs | 205 +++++++----------- datafusion/common/src/lib.rs | 2 +- datafusion/core/src/dataframe/mod.rs | 25 ++- datafusion/core/src/dataframe/parquet.rs | 18 +- .../core/src/datasource/file_format/mod.rs | 37 +++- datafusion/core/src/physical_planner.rs | 31 +-- datafusion/proto/gen/src/main.rs | 1 + datafusion/proto/proto/datafusion.proto | 8 +- datafusion/proto/src/generated/pbjson.rs | 93 ++------ datafusion/proto/src/generated/prost.rs | 21 +- datafusion/proto/src/logical_plan/mod.rs | 32 ++- .../proto/src/physical_plan/from_proto.rs | 22 +- .../proto/src/physical_plan/to_proto.rs | 28 +-- .../tests/cases/roundtrip_logical_plan.rs | 28 ++- 15 files changed, 210 insertions(+), 347 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 0a7681ab95b8..c55abb0213f5 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -24,7 +24,7 @@ use std::str::FromStr; use crate::error::_config_err; use crate::parsers::CompressionTypeVariant; -use crate::{DataFusionError, FileType, Result}; +use crate::{DataFusionError, Result}; /// A macro that wraps a configuration struct and automatically derives /// [`Default`] and [`ConfigField`] for it, allowing it to be used @@ -1132,10 +1132,6 @@ pub struct TableOptions { /// Configuration options for JSON file handling. pub json: JsonOptions, - /// The current file format that the table operations should assume. This option allows - /// for dynamic switching between the supported file types (e.g., CSV, Parquet, JSON). - pub current_format: Option, - /// Optional extensions that can be used to extend or customize the behavior of the table /// options. Extensions can be registered using `Extensions::insert` and might include /// custom file handling logic, additional configuration parameters, or other enhancements. diff --git a/datafusion/common/src/file_options/file_type.rs b/datafusion/common/src/file_options/file_type.rs index 7bcaca11a5da..1affea44a1a9 100644 --- a/datafusion/common/src/file_options/file_type.rs +++ b/datafusion/common/src/file_options/file_type.rs @@ -18,11 +18,7 @@ //! 
File type abstraction use std::any::Any; -use std::fmt::{self, Display}; -use std::str::FromStr; - -use crate::config::FormatOptions; -use crate::error::{DataFusionError, Result}; +use std::fmt::Display; /// The default file extension of arrow files pub const DEFAULT_ARROW_EXTENSION: &str = ".arrow"; @@ -48,128 +44,77 @@ pub trait ExternalFileType: GetExt + Display + Send + Sync { fn as_any(&self) -> &dyn Any; } - -pub struct ExternalCSV {} - -impl GetExt for ExternalCSV { - fn get_ext(&self) -> String { - "csv".to_string() - } -} - -impl Display for ExternalCSV { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", "csv") - } -} - -impl ExternalFileType for ExternalCSV { - fn as_any(&self) -> &dyn Any{ - self - } -} - -/// Readable file type -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum FileType { - /// Apache Arrow file - ARROW, - /// Apache Avro file - AVRO, - /// Apache Parquet file - #[cfg(feature = "parquet")] - PARQUET, - /// CSV file - CSV, - /// JSON file - JSON, -} - -impl From<&FormatOptions> for FileType { - fn from(value: &FormatOptions) -> Self { - match value { - FormatOptions::CSV(_) => FileType::CSV, - FormatOptions::JSON(_) => FileType::JSON, - #[cfg(feature = "parquet")] - FormatOptions::PARQUET(_) => FileType::PARQUET, - FormatOptions::AVRO => FileType::AVRO, - FormatOptions::ARROW => FileType::ARROW, - } - } -} - -impl GetExt for FileType { - fn get_ext(&self) -> String { - match self { - FileType::ARROW => DEFAULT_ARROW_EXTENSION.to_owned(), - FileType::AVRO => DEFAULT_AVRO_EXTENSION.to_owned(), - #[cfg(feature = "parquet")] - FileType::PARQUET => DEFAULT_PARQUET_EXTENSION.to_owned(), - FileType::CSV => DEFAULT_CSV_EXTENSION.to_owned(), - FileType::JSON => DEFAULT_JSON_EXTENSION.to_owned(), - } - } -} - -impl Display for FileType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let out = match self { - FileType::CSV => "csv", - FileType::JSON => "json", - #[cfg(feature = "parquet")] - FileType::PARQUET => "parquet", - FileType::AVRO => "avro", - FileType::ARROW => "arrow", - }; - write!(f, "{}", out) - } -} - -impl FromStr for FileType { - type Err = DataFusionError; - - fn from_str(s: &str) -> Result { - let s = s.to_uppercase(); - match s.as_str() { - "ARROW" => Ok(FileType::ARROW), - "AVRO" => Ok(FileType::AVRO), - #[cfg(feature = "parquet")] - "PARQUET" => Ok(FileType::PARQUET), - "CSV" => Ok(FileType::CSV), - "JSON" | "NDJSON" => Ok(FileType::JSON), - _ => Err(DataFusionError::NotImplemented(format!( - "Unknown FileType: {s}" - ))), - } - } -} - -#[cfg(test)] -#[cfg(feature = "parquet")] -mod tests { - use std::str::FromStr; - - use crate::error::DataFusionError; - use crate::FileType; - - #[test] - fn from_str() { - for (ext, file_type) in [ - ("csv", FileType::CSV), - ("CSV", FileType::CSV), - ("json", FileType::JSON), - ("JSON", FileType::JSON), - ("avro", FileType::AVRO), - ("AVRO", FileType::AVRO), - ("parquet", FileType::PARQUET), - ("PARQUET", FileType::PARQUET), - ] { - assert_eq!(FileType::from_str(ext).unwrap(), file_type); - } - - assert!(matches!( - FileType::from_str("Unknown"), - Err(DataFusionError::NotImplemented(_)) - )); - } -} +// /// Readable file type +// #[derive(Debug, Clone, PartialEq, Eq, Hash)] +// pub enum FileType { +// /// Apache Arrow file +// ARROW, +// /// Apache Avro file +// AVRO, +// /// Apache Parquet file +// #[cfg(feature = "parquet")] +// PARQUET, +// /// CSV file +// CSV, +// /// JSON file +// JSON, +// } + +// impl From<&FormatOptions> for FileType { +// fn from(value: 
&FormatOptions) -> Self { +// match value { +// FormatOptions::CSV(_) => FileType::CSV, +// FormatOptions::JSON(_) => FileType::JSON, +// #[cfg(feature = "parquet")] +// FormatOptions::PARQUET(_) => FileType::PARQUET, +// FormatOptions::AVRO => FileType::AVRO, +// FormatOptions::ARROW => FileType::ARROW, +// } +// } +// } + +// impl GetExt for FileType { +// fn get_ext(&self) -> String { +// match self { +// FileType::ARROW => DEFAULT_ARROW_EXTENSION.to_owned(), +// FileType::AVRO => DEFAULT_AVRO_EXTENSION.to_owned(), +// #[cfg(feature = "parquet")] +// FileType::PARQUET => DEFAULT_PARQUET_EXTENSION.to_owned(), +// FileType::CSV => DEFAULT_CSV_EXTENSION.to_owned(), +// FileType::JSON => DEFAULT_JSON_EXTENSION.to_owned(), +// } +// } +// } + +// impl Display for FileType { +// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +// let out = match self { +// FileType::CSV => "csv", +// FileType::JSON => "json", +// #[cfg(feature = "parquet")] +// FileType::PARQUET => "parquet", +// FileType::AVRO => "avro", +// FileType::ARROW => "arrow", +// }; +// write!(f, "{}", out) +// } +// } + +// impl FromStr for FileType { +// type Err = DataFusionError; + +// fn from_str(s: &str) -> Result { +// let s = s.to_uppercase(); +// match s.as_str() { +// "ARROW" => Ok(FileType::ARROW), +// "AVRO" => Ok(FileType::AVRO), +// #[cfg(feature = "parquet")] +// "PARQUET" => Ok(FileType::PARQUET), +// "CSV" => Ok(FileType::CSV), +// "JSON" | "NDJSON" => Ok(FileType::JSON), +// _ => Err(DataFusionError::NotImplemented(format!( +// "Unknown FileType: {s}" +// ))), +// } +// } +// } diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index e64acd0bfefe..21c13eebfd11 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -54,7 +54,7 @@ pub use error::{ SharedResult, }; pub use file_options::file_type::{ - FileType, GetExt, DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, + GetExt, DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION, DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION, }; pub use functional_dependencies::{ diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 19e1067c1a09..45205a8dfe73 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -26,6 +26,9 @@ use std::sync::Arc; use crate::arrow::record_batch::RecordBatch; use crate::arrow::util::pretty; +use crate::datasource::file_format::csv::CsvFormat; +use crate::datasource::file_format::format_as_file_type; +use crate::datasource::file_format::json::JsonFormat; use crate::datasource::{provider_as_source, MemTable, TableProvider}; use crate::error::Result; use crate::execution::context::{SessionState, TaskContext}; @@ -45,7 +48,6 @@ use arrow::compute::{cast, concat}; use arrow::datatypes::{DataType, Field}; use arrow_schema::{Schema, SchemaRef}; use datafusion_common::config::{CsvOptions, JsonOptions}; -use datafusion_common::file_options::file_type::ExternalCSV; use datafusion_common::{ plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions, }; @@ -1238,10 +1240,18 @@ impl DataFrame { )); } + let format = if let Some(csv_opts) = writer_options { + CsvFormat::default().with_options(csv_opts) + } else { + CsvFormat::default() + }; + + let file_type = format_as_file_type(Arc::new(format)); + let plan = LogicalPlanBuilder::copy_to( self.plan, path.into(), - Arc::new(ExternalCSV {}), + file_type, HashMap::new(), options.partition_by, )? 
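The `format_as_file_type` call above is the generic bridge from a concrete
`FileFormat` into the `Arc<dyn ExternalFileType>` that `CopyTo` now carries; a
minimal sketch of the round trip, using only names defined in this patch
series:

    // Wrap a concrete FileFormat for use in the logical plan...
    let file_type = format_as_file_type(Arc::new(CsvFormat::default()));
    // ...and recover the FileFormat later (as the physical planner does)
    // via the DefaultFileFormat wrapper.
    let sink_format = file_type_to_format(&file_type)?;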
@@ -1290,11 +1300,18 @@ impl DataFrame { )); } + let format = if let Some(json_opts) = writer_options { + JsonFormat::default().with_options(json_opts) + } else { + JsonFormat::default() + }; + + let file_type = format_as_file_type(Arc::new(format)); + let plan = LogicalPlanBuilder::copy_to( self.plan, path.into(), - // TODO! update for json - Arc::new(ExternalCSV {}), + file_type, Default::default(), options.partition_by, )? diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index 64bdde51ff94..274269fa662b 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -17,13 +17,13 @@ use std::sync::Arc; +use crate::datasource::file_format::{format_as_file_type, parquet::ParquetFormat}; + use super::{ DataFrame, DataFrameWriteOptions, DataFusionError, LogicalPlanBuilder, RecordBatch, }; -use datafusion_common::{ - config::TableParquetOptions, file_options::file_type::ExternalCSV, -}; +use datafusion_common::config::TableParquetOptions; impl DataFrame { /// Execute the `DataFrame` and write the results to Parquet file(s). @@ -61,14 +61,18 @@ impl DataFrame { )); } - let props = writer_options - .unwrap_or_else(|| self.session_state.default_table_options().parquet); + let format = if let Some(parquet_opts) = writer_options { + ParquetFormat::default().with_options(parquet_opts) + } else { + ParquetFormat::default() + }; + + let file_type = format_as_file_type(Arc::new(format)); let plan = LogicalPlanBuilder::copy_to( self.plan, path.into(), - // TODO! update for parquet - Arc::new(ExternalCSV {}), + file_type, Default::default(), options.partition_by, )? diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index bc21a42368c7..48da1302bcc7 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -32,6 +32,7 @@ pub mod parquet; pub mod write; use std::any::Any; +use std::collections::HashMap; use std::fmt::{self, Display}; use std::sync::Arc; @@ -59,10 +60,22 @@ pub trait FileFormat: Send + Sync + fmt::Debug { /// downcast to a specific implementation. fn as_any(&self) -> &dyn Any; - fn get_ext(&self) -> String{ + /// Returns the extension for this FileFormat, e.g. "csv" + fn get_ext(&self) -> String { panic!("Undefined get_ext") } + /// Updates internal state based on options passed in via SQL strings, e.g. + /// COPY table TO custom.format OPTIONS (custom.option1 true) + fn update_options_from_string_hashmap( + &self, + _options: &HashMap, + ) -> Result<()> { + not_impl_err!( + "Updating options from SQL strings unsupported for this FileFormat." + ) + } + /// Infer the common schema of the provided objects. The objects will usually /// be analysed up to a given number of records or files (as specified in the /// format config) then give the estimated common schema. 
This might fail if @@ -111,38 +124,38 @@ pub trait FileFormat: Send + Sync + fmt::Debug { } } -pub struct DefaultFileFormat{ - file_format: Arc +pub struct DefaultFileFormat { + file_format: Arc, } -impl DefaultFileFormat{ - pub fn new(file_format: Arc) -> Self{ - Self{ file_format } +impl DefaultFileFormat { + pub fn new(file_format: Arc) -> Self { + Self { file_format } } } -impl ExternalFileType for DefaultFileFormat{ +impl ExternalFileType for DefaultFileFormat { fn as_any(&self) -> &dyn Any { self } } -impl Display for DefaultFileFormat{ +impl Display for DefaultFileFormat { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.file_format.fmt(f) } } -impl GetExt for DefaultFileFormat{ +impl GetExt for DefaultFileFormat { fn get_ext(&self) -> String { self.file_format.get_ext() } } pub fn format_as_file_type( - file_format: Arc -) -> Arc{ - Arc::new(DefaultFileFormat{file_format}) + file_format: Arc, +) -> Arc { + Arc::new(DefaultFileFormat { file_format }) } pub fn file_type_to_format( diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 1de550a18d1e..055c317af418 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -23,7 +23,7 @@ use std::fmt::Write; use std::sync::Arc; #[cfg(feature = "parquet")] -use crate::datasource::file_format::{file_type_to_format}; +use crate::datasource::file_format::file_type_to_format; use crate::datasource::listing::ListingTableUrl; use crate::datasource::physical_plan::FileSinkConfig; use crate::datasource::source_as_provider; @@ -780,34 +780,9 @@ impl DefaultPhysicalPlanner { table_partition_cols, overwrite: false, }; - - let sink_format = file_type_to_format(file_type)?; - // let sink_format: Arc = match format_options { - // FormatOptions::CSV(options) => { - // table_options.csv = options.clone(); - // table_options.set_file_format(FileType::CSV); - // table_options.alter_with_string_hash_map(source_option_tuples)?; - // Arc::new(CsvFormat::default().with_options(table_options.csv)) - // } - // FormatOptions::JSON(options) => { - // table_options.json = options.clone(); - // table_options.set_file_format(FileType::JSON); - // table_options.alter_with_string_hash_map(source_option_tuples)?; - // Arc::new(JsonFormat::default().with_options(table_options.json)) - // } - // #[cfg(feature = "parquet")] - // FormatOptions::PARQUET(options) => { - // table_options.parquet = options.clone(); - // table_options.set_file_format(FileType::PARQUET); - // table_options.alter_with_string_hash_map(source_option_tuples)?; - // Arc::new( - // ParquetFormat::default().with_options(table_options.parquet), - // ) - // } - // FormatOptions::AVRO => Arc::new(AvroFormat {}), - // FormatOptions::ARROW => Arc::new(ArrowFormat {}), - // }; + let sink_format = file_type_to_format(file_type)?; + sink_format.update_options_from_string_hashmap(source_option_tuples)?; sink_format .create_writer_physical_plan(input_exec, session_state, config, None) diff --git a/datafusion/proto/gen/src/main.rs b/datafusion/proto/gen/src/main.rs index 22c16eacb093..d38a41a01ac2 100644 --- a/datafusion/proto/gen/src/main.rs +++ b/datafusion/proto/gen/src/main.rs @@ -29,6 +29,7 @@ fn main() -> Result<(), String> { let descriptor_path = proto_dir.join("proto/proto_descriptor.bin"); prost_build::Config::new() + .protoc_arg("--experimental_allow_proto3_optional") .file_descriptor_set_path(&descriptor_path) .out_dir(out_dir) .compile_well_known_types() diff --git 
a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 83223a04d023..7f399b569a5c 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -251,13 +251,7 @@ message DistinctOnNode { message CopyToNode { LogicalPlanNode input = 1; string output_url = 2; - oneof format_options { - datafusion_common.CsvOptions csv = 8; - datafusion_common.JsonOptions json = 9; - datafusion_common.TableParquetOptions parquet = 10; - datafusion_common.AvroOptions avro = 11; - datafusion_common.ArrowOptions arrow = 12; - } + bytes file_type = 3; repeated string partition_by = 7; } diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index f298dd241abf..60654cc75e7f 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -2566,10 +2566,10 @@ impl serde::Serialize for CopyToNode { if !self.output_url.is_empty() { len += 1; } - if !self.partition_by.is_empty() { + if !self.file_type.is_empty() { len += 1; } - if self.format_options.is_some() { + if !self.partition_by.is_empty() { len += 1; } let mut struct_ser = serializer.serialize_struct("datafusion.CopyToNode", len)?; @@ -2579,28 +2579,13 @@ impl serde::Serialize for CopyToNode { if !self.output_url.is_empty() { struct_ser.serialize_field("outputUrl", &self.output_url)?; } + if !self.file_type.is_empty() { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("fileType", pbjson::private::base64::encode(&self.file_type).as_str())?; + } if !self.partition_by.is_empty() { struct_ser.serialize_field("partitionBy", &self.partition_by)?; } - if let Some(v) = self.format_options.as_ref() { - match v { - copy_to_node::FormatOptions::Csv(v) => { - struct_ser.serialize_field("csv", v)?; - } - copy_to_node::FormatOptions::Json(v) => { - struct_ser.serialize_field("json", v)?; - } - copy_to_node::FormatOptions::Parquet(v) => { - struct_ser.serialize_field("parquet", v)?; - } - copy_to_node::FormatOptions::Avro(v) => { - struct_ser.serialize_field("avro", v)?; - } - copy_to_node::FormatOptions::Arrow(v) => { - struct_ser.serialize_field("arrow", v)?; - } - } - } struct_ser.end() } } @@ -2614,25 +2599,18 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { "input", "output_url", "outputUrl", + "file_type", + "fileType", "partition_by", "partitionBy", - "csv", - "json", - "parquet", - "avro", - "arrow", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { Input, OutputUrl, + FileType, PartitionBy, - Csv, - Json, - Parquet, - Avro, - Arrow, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -2656,12 +2634,8 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { match value { "input" => Ok(GeneratedField::Input), "outputUrl" | "output_url" => Ok(GeneratedField::OutputUrl), + "fileType" | "file_type" => Ok(GeneratedField::FileType), "partitionBy" | "partition_by" => Ok(GeneratedField::PartitionBy), - "csv" => Ok(GeneratedField::Csv), - "json" => Ok(GeneratedField::Json), - "parquet" => Ok(GeneratedField::Parquet), - "avro" => Ok(GeneratedField::Avro), - "arrow" => Ok(GeneratedField::Arrow), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -2683,8 +2657,8 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { { let mut input__ = None; let mut output_url__ = None; + let mut file_type__ = None; let mut partition_by__ = None; - let mut format_options__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::Input => { @@ -2699,54 +2673,27 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { } output_url__ = Some(map_.next_value()?); } + GeneratedField::FileType => { + if file_type__.is_some() { + return Err(serde::de::Error::duplicate_field("fileType")); + } + file_type__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } GeneratedField::PartitionBy => { if partition_by__.is_some() { return Err(serde::de::Error::duplicate_field("partitionBy")); } partition_by__ = Some(map_.next_value()?); } - GeneratedField::Csv => { - if format_options__.is_some() { - return Err(serde::de::Error::duplicate_field("csv")); - } - format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Csv) -; - } - GeneratedField::Json => { - if format_options__.is_some() { - return Err(serde::de::Error::duplicate_field("json")); - } - format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Json) -; - } - GeneratedField::Parquet => { - if format_options__.is_some() { - return Err(serde::de::Error::duplicate_field("parquet")); - } - format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Parquet) -; - } - GeneratedField::Avro => { - if format_options__.is_some() { - return Err(serde::de::Error::duplicate_field("avro")); - } - format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Avro) -; - } - GeneratedField::Arrow => { - if format_options__.is_some() { - return Err(serde::de::Error::duplicate_field("arrow")); - } - format_options__ = map_.next_value::<::std::option::Option<_>>()?.map(copy_to_node::FormatOptions::Arrow) -; - } } } Ok(CopyToNode { input: input__, output_url: output_url__.unwrap_or_default(), + file_type: file_type__.unwrap_or_default(), partition_by: partition_by__.unwrap_or_default(), - format_options: format_options__, }) } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index fa0217e9ef4f..6b5891ef4e70 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -411,27 +411,10 @@ pub struct CopyToNode { pub input: ::core::option::Option<::prost::alloc::boxed::Box>, #[prost(string, tag = "2")] pub output_url: ::prost::alloc::string::String, + #[prost(bytes = "vec", tag = "3")] + pub file_type: ::prost::alloc::vec::Vec, #[prost(string, repeated, tag = "7")] pub partition_by: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, - #[prost(oneof = "copy_to_node::FormatOptions", tags = "8, 9, 10, 11, 12")] - pub format_options: ::core::option::Option, -} -/// Nested message and enum types in `CopyToNode`. 
-pub mod copy_to_node {
-    #[allow(clippy::derive_partial_eq_without_eq)]
-    #[derive(Clone, PartialEq, ::prost::Oneof)]
-    pub enum FormatOptions {
-        #[prost(message, tag = "8")]
-        Csv(super::super::datafusion_common::CsvOptions),
-        #[prost(message, tag = "9")]
-        Json(super::super::datafusion_common::JsonOptions),
-        #[prost(message, tag = "10")]
-        Parquet(super::super::datafusion_common::TableParquetOptions),
-        #[prost(message, tag = "11")]
-        Avro(super::super::datafusion_common::AvroOptions),
-        #[prost(message, tag = "12")]
-        Arrow(super::super::datafusion_common::ArrowOptions),
-    }
 }
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs
index ef37150a35db..8a12c42bb61d 100644
--- a/datafusion/proto/src/logical_plan/mod.rs
+++ b/datafusion/proto/src/logical_plan/mod.rs
@@ -33,6 +33,7 @@ use crate::protobuf::{proto_error, FromProtoError, ToProtoError};
 use arrow::datatypes::{DataType, Schema, SchemaRef};
 #[cfg(feature = "parquet")]
 use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::file_format::{file_type_to_format, format_as_file_type};
 use datafusion::{
     datasource::{
         file_format::{avro::AvroFormat, csv::CsvFormat, FileFormat},
@@ -43,6 +44,7 @@ use datafusion::{
     datasource::{provider_as_source, source_as_provider},
     prelude::SessionContext,
 };
+use datafusion_common::file_options::file_type::ExternalFileType;
 use datafusion_common::{
     context, internal_datafusion_err, internal_err, not_impl_err, DataFusionError,
     Result, TableReference,
@@ -114,6 +116,22 @@ pub trait LogicalExtensionCodec: Debug + Send + Sync {
         buf: &mut Vec<u8>,
     ) -> Result<()>;
 
+    fn try_decode_file_format(
+        &self,
+        _buf: &[u8],
+        _ctx: &SessionContext,
+    ) -> Result<Arc<dyn FileFormat>> {
+        not_impl_err!("LogicalExtensionCodec is not provided for file format")
+    }
+
+    fn try_encode_file_format(
+        &self,
+        _buf: &mut Vec<u8>,
+        _node: Arc<dyn FileFormat>,
+    ) -> Result<()> {
+        Ok(())
+    }
+
     fn try_decode_udf(&self, name: &str, _buf: &[u8]) -> Result<Arc<ScalarUDF>> {
         not_impl_err!("LogicalExtensionCodec is not provided for scalar function {name}")
     }
@@ -829,12 +847,16 @@ impl AsLogicalPlan for LogicalPlanNode {
                 let input: LogicalPlan =
                     into_logical_plan!(copy.input, ctx, extension_codec)?;
 
+                let file_type: Arc<dyn ExternalFileType> = format_as_file_type(
+                    extension_codec.try_decode_file_format(&copy.file_type, ctx)?,
+                );
+
                 Ok(datafusion_expr::LogicalPlan::Copy(
                     datafusion_expr::dml::CopyTo {
                         input: Arc::new(input),
                         output_url: copy.output_url.clone(),
                         partition_by: copy.partition_by.clone(),
-                        format_options: convert_required!(copy.format_options)?,
+                        file_type,
                         options: Default::default(),
                     },
                 ))
@@ -1609,7 +1631,7 @@ impl AsLogicalPlan for LogicalPlanNode {
             LogicalPlan::Copy(dml::CopyTo {
                 input,
                 output_url,
-                format_options,
+                file_type,
                 partition_by,
                 ..
            }) => {
@@ -1618,12 +1640,16 @@
                 let input = protobuf::LogicalPlanNode::try_from_logical_plan(
                     input,
                     extension_codec,
                 )?;
 
+                let mut buf = Vec::new();
+                extension_codec
+                    .try_encode_file_format(&mut buf, file_type_to_format(file_type)?)?;
+
                 Ok(protobuf::LogicalPlanNode {
                     logical_plan_type: Some(LogicalPlanType::CopyTo(Box::new(
                         protobuf::CopyToNode {
                             input: Some(Box::new(input)),
                             output_url: output_url.to_string(),
-                            format_options: Some(format_options.try_into()?),
+                            file_type: buf,
                             partition_by: partition_by.clone(),
                         },
                     ))),
diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs
index 0a91df568a1d..82bc8bdad1c2 100644
--- a/datafusion/proto/src/physical_plan/from_proto.rs
+++ b/datafusion/proto/src/physical_plan/from_proto.rs
@@ -41,14 +41,13 @@ use datafusion::physical_plan::expressions::{
 };
 use datafusion::physical_plan::windows::{create_window_expr, schema_add_window_field};
 use datafusion::physical_plan::{Partitioning, PhysicalExpr, WindowExpr};
-use datafusion_common::config::FormatOptions;
 use datafusion_common::{not_impl_err, DataFusionError, Result};
 use datafusion_proto_common::common::proto_error;
 
 use crate::convert_required;
 use crate::logical_plan::{self};
+use crate::protobuf;
 use crate::protobuf::physical_expr_node::ExprType;
-use crate::protobuf::{self, copy_to_node};
 
 use super::PhysicalExtensionCodec;
 
@@ -651,22 +650,3 @@ impl TryFrom<&protobuf::FileSinkConfig> for FileSinkConfig {
         })
     }
 }
-
-impl TryFrom<&copy_to_node::FormatOptions> for FormatOptions {
-    type Error = DataFusionError;
-    fn try_from(value: &copy_to_node::FormatOptions) -> Result<Self> {
-        Ok(match value {
-            copy_to_node::FormatOptions::Csv(options) => {
-                FormatOptions::CSV(options.try_into()?)
-            }
-            copy_to_node::FormatOptions::Json(options) => {
-                FormatOptions::JSON(options.try_into()?)
-            }
-            copy_to_node::FormatOptions::Parquet(options) => {
-                FormatOptions::PARQUET(options.try_into()?)
-            }
-            copy_to_node::FormatOptions::Avro(_) => FormatOptions::AVRO,
-            copy_to_node::FormatOptions::Arrow(_) => FormatOptions::ARROW,
-        })
-    }
-}
diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs
index ef462ac94b9a..8a6faacff1fa 100644
--- a/datafusion/proto/src/physical_plan/to_proto.rs
+++ b/datafusion/proto/src/physical_plan/to_proto.rs
@@ -41,12 +41,11 @@ use datafusion::{
     },
     physical_plan::expressions::LikeExpr,
 };
-use datafusion_common::config::FormatOptions;
 use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result};
 
 use crate::protobuf::{
-    self, copy_to_node, physical_aggregate_expr_node, physical_window_expr_node,
-    PhysicalSortExprNode, PhysicalSortExprNodeCollection,
+    self, physical_aggregate_expr_node, physical_window_expr_node, PhysicalSortExprNode,
+    PhysicalSortExprNodeCollection,
 };
 
 use super::PhysicalExtensionCodec;
 
@@ -749,26 +748,3 @@ impl TryFrom<&FileSinkConfig> for protobuf::FileSinkConfig {
         })
     }
 }
-
-impl TryFrom<&FormatOptions> for copy_to_node::FormatOptions {
-    type Error = DataFusionError;
-    fn try_from(value: &FormatOptions) -> std::result::Result<Self, Self::Error> {
-        Ok(match value {
-            FormatOptions::CSV(options) => {
-                copy_to_node::FormatOptions::Csv(options.try_into()?)
-            }
-            FormatOptions::JSON(options) => {
-                copy_to_node::FormatOptions::Json(options.try_into()?)
-            }
-            FormatOptions::PARQUET(options) => {
-                copy_to_node::FormatOptions::Parquet(options.try_into()?)
- } - FormatOptions::AVRO => { - copy_to_node::FormatOptions::Avro(protobuf::AvroOptions {}) - } - FormatOptions::ARROW => { - copy_to_node::FormatOptions::Arrow(protobuf::ArrowOptions {}) - } - }) - } -} diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index d0f1c4aade5e..a097741251f2 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -26,6 +26,7 @@ use arrow::datatypes::{ DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionFields, UnionMode, }; +use datafusion_common::file_options::file_type::{ExternalCSV, ExternalFileType}; use datafusion_functions_aggregate::count::count_udaf; use prost::Message; @@ -41,7 +42,7 @@ use datafusion::functions_aggregate::expr_fn::{ }; use datafusion::prelude::*; use datafusion::test_util::{TestTableFactory, TestTableProvider}; -use datafusion_common::config::{FormatOptions, TableOptions}; +use datafusion_common::config::TableOptions; use datafusion_common::scalar::ScalarStructBuilder; use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, DFSchemaRef, @@ -325,11 +326,13 @@ async fn roundtrip_logical_plan_copy_to_sql_options() -> Result<()> { table_options.set_file_format(FileType::CSV); table_options.set("format.delimiter", ";")?; + let file_type: Arc = Arc::new(ExternalCSV {}); + let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: "test.csv".to_string(), partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], - format_options: FormatOptions::CSV(table_options.csv.clone()), + file_type, options: Default::default(), }); @@ -359,10 +362,12 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> { parquet_format.global.dictionary_page_size_limit = 444; parquet_format.global.max_row_group_size = 555; + let file_type: Arc = Arc::new(ExternalCSV {}); + let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: "test.parquet".to_string(), - format_options: FormatOptions::PARQUET(parquet_format.clone()), + file_type, partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], options: Default::default(), }); @@ -375,10 +380,7 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> { LogicalPlan::Copy(copy_to) => { assert_eq!("test.parquet", copy_to.output_url); assert_eq!(vec!["a", "b", "c"], copy_to.partition_by); - assert_eq!( - copy_to.format_options, - FormatOptions::PARQUET(parquet_format) - ); + assert_eq!(copy_to.file_type.get_ext(), "parquet".to_string()); } _ => panic!(), } @@ -391,11 +393,13 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> { let input = create_csv_scan(&ctx).await?; + let file_type: Arc = Arc::new(ExternalCSV {}); + let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: "test.arrow".to_string(), partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], - format_options: FormatOptions::ARROW, + file_type, options: Default::default(), }); @@ -406,7 +410,7 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> { match logical_round_trip { LogicalPlan::Copy(copy_to) => { assert_eq!("test.arrow", copy_to.output_url); - assert_eq!(FormatOptions::ARROW, copy_to.format_options); + assert_eq!("arrow".to_string(), copy_to.file_type.get_ext()); assert_eq!(vec!["a", "b", "c"], copy_to.partition_by); } _ => panic!(), @@ 
-432,11 +436,13 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> { csv_format.time_format = Some("HH:mm:ss".to_string()); csv_format.null_value = Some("NIL".to_string()); + let file_type: Arc = Arc::new(ExternalCSV {}); + let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), output_url: "test.csv".to_string(), partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], - format_options: FormatOptions::CSV(csv_format.clone()), + file_type, options: Default::default(), }); @@ -447,7 +453,7 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> { match logical_round_trip { LogicalPlan::Copy(copy_to) => { assert_eq!("test.csv", copy_to.output_url); - assert_eq!(FormatOptions::CSV(csv_format), copy_to.format_options); + assert_eq!("csv".to_string(), copy_to.file_type.get_ext()); assert_eq!(vec!["a", "b", "c"], copy_to.partition_by); } _ => panic!(), From 0274eec11fc0d4169b51803574e2938a91a04b7b Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Wed, 19 Jun 2024 07:29:25 -0400 Subject: [PATCH 06/20] compiling --- datafusion/common/src/config.rs | 6 +- .../common/src/file_options/file_type.rs | 144 +++++++++--------- datafusion/common/src/lib.rs | 1 + .../tests/cases/roundtrip_logical_plan.rs | 11 +- 4 files changed, 86 insertions(+), 76 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index c55abb0213f5..0a7681ab95b8 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -24,7 +24,7 @@ use std::str::FromStr; use crate::error::_config_err; use crate::parsers::CompressionTypeVariant; -use crate::{DataFusionError, Result}; +use crate::{DataFusionError, FileType, Result}; /// A macro that wraps a configuration struct and automatically derives /// [`Default`] and [`ConfigField`] for it, allowing it to be used @@ -1132,6 +1132,10 @@ pub struct TableOptions { /// Configuration options for JSON file handling. pub json: JsonOptions, + /// The current file format that the table operations should assume. This option allows + /// for dynamic switching between the supported file types (e.g., CSV, Parquet, JSON). + pub current_format: Option, + /// Optional extensions that can be used to extend or customize the behavior of the table /// options. Extensions can be registered using `Extensions::insert` and might include /// custom file handling logic, additional configuration parameters, or other enhancements. diff --git a/datafusion/common/src/file_options/file_type.rs b/datafusion/common/src/file_options/file_type.rs index 1affea44a1a9..3eb2cb5ac129 100644 --- a/datafusion/common/src/file_options/file_type.rs +++ b/datafusion/common/src/file_options/file_type.rs @@ -18,7 +18,11 @@ //! 
File type abstraction use std::any::Any; -use std::fmt::Display; +use std::fmt::{self, Display}; +use std::str::FromStr; + +use crate::config::FormatOptions; +use crate::DataFusionError; /// The default file extension of arrow files pub const DEFAULT_ARROW_EXTENSION: &str = ".arrow"; @@ -44,77 +48,77 @@ pub trait ExternalFileType: GetExt + Display + Send + Sync { fn as_any(&self) -> &dyn Any; } -// /// Readable file type -// #[derive(Debug, Clone, PartialEq, Eq, Hash)] -// pub enum FileType { -// /// Apache Arrow file -// ARROW, -// /// Apache Avro file -// AVRO, -// /// Apache Parquet file -// #[cfg(feature = "parquet")] -// PARQUET, -// /// CSV file -// CSV, -// /// JSON file -// JSON, -// } +/// Readable file type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum FileType { + /// Apache Arrow file + ARROW, + /// Apache Avro file + AVRO, + /// Apache Parquet file + #[cfg(feature = "parquet")] + PARQUET, + /// CSV file + CSV, + /// JSON file + JSON, +} -// impl From<&FormatOptions> for FileType { -// fn from(value: &FormatOptions) -> Self { -// match value { -// FormatOptions::CSV(_) => FileType::CSV, -// FormatOptions::JSON(_) => FileType::JSON, -// #[cfg(feature = "parquet")] -// FormatOptions::PARQUET(_) => FileType::PARQUET, -// FormatOptions::AVRO => FileType::AVRO, -// FormatOptions::ARROW => FileType::ARROW, -// } -// } -// } +impl From<&FormatOptions> for FileType { + fn from(value: &FormatOptions) -> Self { + match value { + FormatOptions::CSV(_) => FileType::CSV, + FormatOptions::JSON(_) => FileType::JSON, + #[cfg(feature = "parquet")] + FormatOptions::PARQUET(_) => FileType::PARQUET, + FormatOptions::AVRO => FileType::AVRO, + FormatOptions::ARROW => FileType::ARROW, + } + } +} -// impl GetExt for FileType { -// fn get_ext(&self) -> String { -// match self { -// FileType::ARROW => DEFAULT_ARROW_EXTENSION.to_owned(), -// FileType::AVRO => DEFAULT_AVRO_EXTENSION.to_owned(), -// #[cfg(feature = "parquet")] -// FileType::PARQUET => DEFAULT_PARQUET_EXTENSION.to_owned(), -// FileType::CSV => DEFAULT_CSV_EXTENSION.to_owned(), -// FileType::JSON => DEFAULT_JSON_EXTENSION.to_owned(), -// } -// } -// } +impl GetExt for FileType { + fn get_ext(&self) -> String { + match self { + FileType::ARROW => DEFAULT_ARROW_EXTENSION.to_owned(), + FileType::AVRO => DEFAULT_AVRO_EXTENSION.to_owned(), + #[cfg(feature = "parquet")] + FileType::PARQUET => DEFAULT_PARQUET_EXTENSION.to_owned(), + FileType::CSV => DEFAULT_CSV_EXTENSION.to_owned(), + FileType::JSON => DEFAULT_JSON_EXTENSION.to_owned(), + } + } +} -// impl Display for FileType { -// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { -// let out = match self { -// FileType::CSV => "csv", -// FileType::JSON => "json", -// #[cfg(feature = "parquet")] -// FileType::PARQUET => "parquet", -// FileType::AVRO => "avro", -// FileType::ARROW => "arrow", -// }; -// write!(f, "{}", out) -// } -// } +impl Display for FileType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let out = match self { + FileType::CSV => "csv", + FileType::JSON => "json", + #[cfg(feature = "parquet")] + FileType::PARQUET => "parquet", + FileType::AVRO => "avro", + FileType::ARROW => "arrow", + }; + write!(f, "{}", out) + } +} -// impl FromStr for FileType { -// type Err = DataFusionError; +impl FromStr for FileType { + type Err = DataFusionError; -// fn from_str(s: &str) -> Result { -// let s = s.to_uppercase(); -// match s.as_str() { -// "ARROW" => Ok(FileType::ARROW), -// "AVRO" => Ok(FileType::AVRO), -// #[cfg(feature = "parquet")] -// "PARQUET" 
=> Ok(FileType::PARQUET),
-//             "CSV" => Ok(FileType::CSV),
-//             "JSON" | "NDJSON" => Ok(FileType::JSON),
-//             _ => Err(DataFusionError::NotImplemented(format!(
-//                 "Unknown FileType: {s}"
-//             ))),
-//         }
-//     }
-// }
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let s = s.to_uppercase();
+        match s.as_str() {
+            "ARROW" => Ok(FileType::ARROW),
+            "AVRO" => Ok(FileType::AVRO),
+            #[cfg(feature = "parquet")]
+            "PARQUET" => Ok(FileType::PARQUET),
+            "CSV" => Ok(FileType::CSV),
+            "JSON" | "NDJSON" => Ok(FileType::JSON),
+            _ => Err(DataFusionError::NotImplemented(format!(
+                "Unknown FileType: {s}"
+            ))),
+        }
+    }
+}
diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index 21c13eebfd11..e91f30c55d81 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -70,6 +70,7 @@ pub use stats::{ColumnStatistics, Statistics};
 pub use table_reference::{ResolvedTableReference, TableReference};
 pub use unnest::UnnestOptions;
 pub use utils::project_schema;
+pub use file_options::file_type::FileType;
 
 /// Downcast an Arrow Array to a concrete type, return an `DataFusionError::Internal` if the cast is
 /// not possible. In normal usage of DataFusion the downcast should always succeed.
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index a097741251f2..2d99958731d9 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -26,7 +26,8 @@ use arrow::datatypes::{
     DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType,
     IntervalUnit, Schema, SchemaRef, TimeUnit, UnionFields, UnionMode,
 };
-use datafusion_common::file_options::file_type::{ExternalCSV, ExternalFileType};
+use datafusion::datasource::file_format::csv::CsvFormat;
+use datafusion::datasource::file_format::format_as_file_type;
 use datafusion_functions_aggregate::count::count_udaf;
 use prost::Message;
@@ -326,7 +327,7 @@ async fn roundtrip_logical_plan_copy_to_sql_options() -> Result<()> {
     table_options.set_file_format(FileType::CSV);
     table_options.set("format.delimiter", ";")?;
 
-    let file_type: Arc<dyn ExternalFileType> = Arc::new(ExternalCSV {});
+    let file_type = format_as_file_type(Arc::new(CsvFormat::default()));
 
     let plan = LogicalPlan::Copy(CopyTo {
         input: Arc::new(input),
@@ -362,7 +363,7 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> {
     parquet_format.global.dictionary_page_size_limit = 444;
     parquet_format.global.max_row_group_size = 555;
 
-    let file_type: Arc<dyn ExternalFileType> = Arc::new(ExternalCSV {});
+    let file_type = format_as_file_type(Arc::new(CsvFormat::default()));
 
     let plan = LogicalPlan::Copy(CopyTo {
         input: Arc::new(input),
@@ -393,7 +394,7 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> {
 
     let input = create_csv_scan(&ctx).await?;
 
-    let file_type: Arc<dyn ExternalFileType> = Arc::new(ExternalCSV {});
+    let file_type = format_as_file_type(Arc::new(CsvFormat::default()));
 
     let plan = LogicalPlan::Copy(CopyTo {
         input: Arc::new(input),
@@ -436,7 +437,7 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> {
     csv_format.time_format = Some("HH:mm:ss".to_string());
     csv_format.null_value = Some("NIL".to_string());
 
-    let file_type: Arc<dyn ExternalFileType> = Arc::new(ExternalCSV {});
+    let file_type = format_as_file_type(Arc::new(CsvFormat::default()));
 
     let plan = LogicalPlan::Copy(CopyTo {
         input: Arc::new(input),
From 431988168d07024fa952acb246747a0b40bd9218 Mon Sep 17 00:00:00 2001
From: Devin D'Angelo
Date: Wed, 19 Jun 2024 22:04:12 -0400
Subject: [PATCH 07/20] working on tests
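
This commit routes format configuration through the per-format factories
(CsvFormatFactory, JsonFormatFactory, ParquetFormatFactory, ArrowFormatFactory,
AvroFormatFactory), which own the merge of session-level defaults with
per-statement options. A minimal sketch of the intended usage, assuming the
APIs as they stand in this commit:

    use std::collections::HashMap;
    use std::sync::Arc;

    use datafusion::datasource::file_format::csv::CsvFormatFactory;
    use datafusion::datasource::file_format::{
        format_as_file_type, FileFormat, FileFormatFactory,
    };
    use datafusion::prelude::SessionContext;
    use datafusion_common::{GetExt, Result};

    fn main() -> Result<()> {
        let ctx = SessionContext::new();
        let state = ctx.state();

        // Statement-level options as they would arrive from COPY ... OPTIONS (...).
        let mut options = HashMap::new();
        options.insert("format.delimiter".to_string(), ";".to_string());

        // The factory layers the statement options over the session defaults
        // and returns a configured CsvFormat.
        let factory = Arc::new(CsvFormatFactory::new());
        let format = factory.create(&state, &options)?;
        assert_eq!(format.get_ext(), "csv");

        // For logical planning, the same factory is wrapped as an opaque FileType.
        let file_type = format_as_file_type(factory);
        assert_eq!(file_type.get_ext(), "csv");
        Ok(())
    }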
--- .../external_dependency/dataframe-to-s3.rs | 6 +- datafusion/common/src/config.rs | 83 +++++++--------- .../common/src/file_options/file_type.rs | 86 +---------------- datafusion/common/src/file_options/mod.rs | 10 +- datafusion/common/src/lib.rs | 5 +- datafusion/core/src/dataframe/mod.rs | 16 ++-- datafusion/core/src/dataframe/parquet.rs | 8 +- .../core/src/datasource/file_format/arrow.rs | 47 ++++++++- .../core/src/datasource/file_format/avro.rs | 49 ++++++++++ .../core/src/datasource/file_format/csv.rs | 80 +++++++++++++--- .../file_format/file_compression_type.rs | 83 ++-------------- .../core/src/datasource/file_format/json.rs | 67 ++++++++++++- .../core/src/datasource/file_format/mod.rs | 69 ++++++++----- .../src/datasource/file_format/parquet.rs | 76 ++++++++++++++- .../core/src/datasource/listing/table.rs | 96 ++++++++----------- .../src/datasource/listing_table_factory.rs | 41 +++----- .../core/src/datasource/physical_plan/csv.rs | 24 +++-- .../core/src/datasource/physical_plan/json.rs | 14 ++- .../datasource/physical_plan/parquet/mod.rs | 4 +- .../core/src/execution/session_state.rs | 54 +++++++++-- datafusion/core/src/physical_planner.rs | 16 ++-- datafusion/core/src/test/mod.rs | 19 ++-- datafusion/expr/src/logical_plan/builder.rs | 4 +- datafusion/expr/src/logical_plan/dml.rs | 4 +- datafusion/proto/src/logical_plan/mod.rs | 10 +- .../tests/cases/roundtrip_logical_plan.rs | 16 ++-- datafusion/sql/src/planner.rs | 4 +- 27 files changed, 570 insertions(+), 421 deletions(-) diff --git a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs b/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs index 4d71ed758912..e75ba5dd5328 100644 --- a/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs +++ b/datafusion-examples/examples/external_dependency/dataframe-to-s3.rs @@ -20,10 +20,10 @@ use std::sync::Arc; use datafusion::dataframe::DataFrameWriteOptions; use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::listing::ListingOptions; use datafusion::error::Result; use datafusion::prelude::*; -use datafusion_common::{FileType, GetExt}; use object_store::aws::AmazonS3Builder; use url::Url; @@ -54,7 +54,7 @@ async fn main() -> Result<()> { let path = format!("s3://{bucket_name}/test_data/"); let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)) - .with_file_extension(FileType::PARQUET.get_ext()); + .with_file_extension(ParquetFormat::default().get_ext()); ctx.register_listing_table("test", &path, listing_options, None, None) .await?; @@ -79,7 +79,7 @@ async fn main() -> Result<()> { let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)) - .with_file_extension(FileType::PARQUET.get_ext()); + .with_file_extension(ParquetFormat::default().get_ext()); ctx.register_listing_table("test2", &out_path, listing_options, None, None) .await?; diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 0a7681ab95b8..02ab9045d6de 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -24,7 +24,7 @@ use std::str::FromStr; use crate::error::_config_err; use crate::parsers::CompressionTypeVariant; -use crate::{DataFusionError, FileType, Result}; +use crate::{DataFusionError, Result}; /// A macro that wraps a configuration struct and automatically 
derives /// [`Default`] and [`ConfigField`] for it, allowing it to be used @@ -1116,6 +1116,16 @@ macro_rules! extensions_options { } } +/// These file types have special built in behavior for configuration. +/// Use TableOptions::Extensions for configuring other file types. +#[derive(Debug, Clone)] +pub enum ConfigFileType{ + CSV, + #[cfg(feature = "parquet")] + PARQUET, + JSON +} + /// Represents the configuration options available for handling different table formats within a data processing application. /// This struct encompasses options for various file formats including CSV, Parquet, and JSON, allowing for flexible configuration /// of parsing and writing behaviors specific to each format. Additionally, it supports extending functionality through custom extensions. @@ -1134,7 +1144,7 @@ pub struct TableOptions { /// The current file format that the table operations should assume. This option allows /// for dynamic switching between the supported file types (e.g., CSV, Parquet, JSON). - pub current_format: Option, + pub current_format: Option, /// Optional extensions that can be used to extend or customize the behavior of the table /// options. Extensions can be registered using `Extensions::insert` and might include @@ -1149,19 +1159,9 @@ impl ConfigField for TableOptions { /// If a format is selected, it visits only the settings relevant to that format. Otherwise, /// it visits all available format settings. fn visit(&self, v: &mut V, _key_prefix: &str, _description: &'static str) { - if let Some(file_type) = &self.current_format { - match file_type { - #[cfg(feature = "parquet")] - FileType::PARQUET => self.parquet.visit(v, "format", ""), - FileType::CSV => self.csv.visit(v, "format", ""), - FileType::JSON => self.json.visit(v, "format", ""), - _ => {} - } - } else { - self.csv.visit(v, "csv", ""); - self.parquet.visit(v, "parquet", ""); - self.json.visit(v, "json", ""); - } + self.csv.visit(v, "csv", ""); + self.parquet.visit(v, "parquet", ""); + self.json.visit(v, "json", ""); } /// Sets a configuration value for a specific key within `TableOptions`. @@ -1188,12 +1188,9 @@ impl ConfigField for TableOptions { match key { "format" => match format { #[cfg(feature = "parquet")] - FileType::PARQUET => self.parquet.set(rem, value), - FileType::CSV => self.csv.set(rem, value), - FileType::JSON => self.json.set(rem, value), - _ => { - _config_err!("Config value \"{key}\" is not supported on {}", format) - } + ConfigFileType::PARQUET => self.parquet.set(rem, value), + ConfigFileType::CSV => self.csv.set(rem, value), + ConfigFileType::JSON => self.json.set(rem, value), }, _ => _config_err!("Config value \"{key}\" not found on TableOptions"), } @@ -1210,15 +1207,6 @@ impl TableOptions { Self::default() } - /// Sets the file format for the table. - /// - /// # Parameters - /// - /// * `format`: The file format to use (e.g., CSV, Parquet). - pub fn set_file_format(&mut self, format: FileType) { - self.current_format = Some(format); - } - /// Creates a new `TableOptions` instance initialized with settings from a given session config. /// /// # Parameters @@ -1249,6 +1237,15 @@ impl TableOptions { clone } + /// Sets the file format for the table. + /// + /// # Parameters + /// + /// * `format`: The file format to use (e.g., CSV, Parquet). + pub fn set_file_format(&mut self, format: ConfigFileType) { + self.current_format = Some(format); + } + /// Sets the extensions for this `TableOptions` instance. 
/// /// # Parameters @@ -1357,7 +1354,7 @@ impl TableOptions { } let mut v = Visitor(vec![]); - self.visit(&mut v, "format", ""); + //self.visit(&mut v, "format", ""); v.0.extend(self.extensions.0.values().flat_map(|e| e.0.entries())); v.0 @@ -1684,19 +1681,6 @@ impl Display for FormatOptions { } } -impl From for FormatOptions { - fn from(value: FileType) -> Self { - match value { - FileType::ARROW => FormatOptions::ARROW, - FileType::AVRO => FormatOptions::AVRO, - #[cfg(feature = "parquet")] - FileType::PARQUET => FormatOptions::PARQUET(TableParquetOptions::default()), - FileType::CSV => FormatOptions::CSV(CsvOptions::default()), - FileType::JSON => FormatOptions::JSON(JsonOptions::default()), - } - } -} - #[cfg(test)] mod tests { use std::any::Any; @@ -1705,7 +1689,6 @@ mod tests { use crate::config::{ ConfigEntry, ConfigExtension, ExtensionOptions, Extensions, TableOptions, }; - use crate::FileType; #[derive(Default, Debug, Clone)] pub struct TestExtensionConfig { @@ -1763,7 +1746,7 @@ mod tests { let mut extension = Extensions::new(); extension.insert(TestExtensionConfig::default()); let mut table_config = TableOptions::new().with_extensions(extension); - table_config.set_file_format(FileType::CSV); + //table_config.set_file_format(FileType::CSV); table_config.set("format.delimiter", ";").unwrap(); assert_eq!(table_config.csv.delimiter, b';'); table_config.set("test.bootstrap.servers", "asd").unwrap(); @@ -1780,7 +1763,7 @@ mod tests { #[test] fn csv_u8_table_options() { let mut table_config = TableOptions::new(); - table_config.set_file_format(FileType::CSV); + //table_config.set_file_format(FileType::CSV); table_config.set("format.delimiter", ";").unwrap(); assert_eq!(table_config.csv.delimiter as char, ';'); table_config.set("format.escape", "\"").unwrap(); @@ -1793,7 +1776,7 @@ mod tests { #[test] fn parquet_table_options() { let mut table_config = TableOptions::new(); - table_config.set_file_format(FileType::PARQUET); + //table_config.set_file_format(FileType::PARQUET); table_config .set("format.bloom_filter_enabled::col1", "true") .unwrap(); @@ -1807,7 +1790,7 @@ mod tests { #[test] fn parquet_table_options_config_entry() { let mut table_config = TableOptions::new(); - table_config.set_file_format(FileType::PARQUET); + //table_config.set_file_format(FileType::PARQUET); table_config .set("format.bloom_filter_enabled::col1", "true") .unwrap(); @@ -1821,7 +1804,7 @@ mod tests { #[test] fn parquet_table_options_config_metadata_entry() { let mut table_config = TableOptions::new(); - table_config.set_file_format(FileType::PARQUET); + //table_config.set_file_format(FileType::PARQUET); table_config.set("format.metadata::key1", "").unwrap(); table_config.set("format.metadata::key2", "value2").unwrap(); table_config diff --git a/datafusion/common/src/file_options/file_type.rs b/datafusion/common/src/file_options/file_type.rs index 3eb2cb5ac129..2648f7289798 100644 --- a/datafusion/common/src/file_options/file_type.rs +++ b/datafusion/common/src/file_options/file_type.rs @@ -18,11 +18,7 @@ //! 
File type abstraction use std::any::Any; -use std::fmt::{self, Display}; -use std::str::FromStr; - -use crate::config::FormatOptions; -use crate::DataFusionError; +use std::fmt::Display; /// The default file extension of arrow files pub const DEFAULT_ARROW_EXTENSION: &str = ".arrow"; @@ -41,84 +37,10 @@ pub trait GetExt { fn get_ext(&self) -> String; } -/// Externally Defined FileType -pub trait ExternalFileType: GetExt + Display + Send + Sync { +/// Defines the functionality needed for logical planning for +/// a type of file which will be read or written to storage. +pub trait FileType: GetExt + Display + Send + Sync { /// Returns the table source as [`Any`] so that it can be /// downcast to a specific implementation. fn as_any(&self) -> &dyn Any; } - -/// Readable file type -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum FileType { - /// Apache Arrow file - ARROW, - /// Apache Avro file - AVRO, - /// Apache Parquet file - #[cfg(feature = "parquet")] - PARQUET, - /// CSV file - CSV, - /// JSON file - JSON, -} - -impl From<&FormatOptions> for FileType { - fn from(value: &FormatOptions) -> Self { - match value { - FormatOptions::CSV(_) => FileType::CSV, - FormatOptions::JSON(_) => FileType::JSON, - #[cfg(feature = "parquet")] - FormatOptions::PARQUET(_) => FileType::PARQUET, - FormatOptions::AVRO => FileType::AVRO, - FormatOptions::ARROW => FileType::ARROW, - } - } -} - -impl GetExt for FileType { - fn get_ext(&self) -> String { - match self { - FileType::ARROW => DEFAULT_ARROW_EXTENSION.to_owned(), - FileType::AVRO => DEFAULT_AVRO_EXTENSION.to_owned(), - #[cfg(feature = "parquet")] - FileType::PARQUET => DEFAULT_PARQUET_EXTENSION.to_owned(), - FileType::CSV => DEFAULT_CSV_EXTENSION.to_owned(), - FileType::JSON => DEFAULT_JSON_EXTENSION.to_owned(), - } - } -} - -impl Display for FileType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let out = match self { - FileType::CSV => "csv", - FileType::JSON => "json", - #[cfg(feature = "parquet")] - FileType::PARQUET => "parquet", - FileType::AVRO => "avro", - FileType::ARROW => "arrow", - }; - write!(f, "{}", out) - } -} - -impl FromStr for FileType { - type Err = DataFusionError; - - fn from_str(s: &str) -> Result { - let s = s.to_uppercase(); - match s.as_str() { - "ARROW" => Ok(FileType::ARROW), - "AVRO" => Ok(FileType::AVRO), - #[cfg(feature = "parquet")] - "PARQUET" => Ok(FileType::PARQUET), - "CSV" => Ok(FileType::CSV), - "JSON" | "NDJSON" => Ok(FileType::JSON), - _ => Err(DataFusionError::NotImplemented(format!( - "Unknown FileType: {s}" - ))), - } - } -} diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs index 59040b4290b0..46166946ce39 100644 --- a/datafusion/common/src/file_options/mod.rs +++ b/datafusion/common/src/file_options/mod.rs @@ -35,7 +35,7 @@ mod tests { config::TableOptions, file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, parsers::CompressionTypeVariant, - FileType, Result, + Result, }; use parquet::{ @@ -76,7 +76,7 @@ mod tests { option_map.insert("format.bloom_filter_ndv".to_owned(), "123".to_owned()); let mut table_config = TableOptions::new(); - table_config.set_file_format(FileType::PARQUET); + //table_config.set_file_format(FileType::PARQUET); table_config.alter_with_string_hash_map(&option_map)?; let parquet_options = ParquetWriterOptions::try_from(&table_config.parquet)?; @@ -181,7 +181,7 @@ mod tests { ); let mut table_config = TableOptions::new(); - table_config.set_file_format(FileType::PARQUET); + 
//table_config.set_file_format(FileType::PARQUET); table_config.alter_with_string_hash_map(&option_map)?; let parquet_options = ParquetWriterOptions::try_from(&table_config.parquet)?; @@ -284,7 +284,7 @@ mod tests { option_map.insert("format.delimiter".to_owned(), ";".to_owned()); let mut table_config = TableOptions::new(); - table_config.set_file_format(FileType::CSV); + //table_config.set_file_format(FileType::CSV); table_config.alter_with_string_hash_map(&option_map)?; let csv_options = CsvWriterOptions::try_from(&table_config.csv)?; @@ -306,7 +306,7 @@ mod tests { option_map.insert("format.compression".to_owned(), "gzip".to_owned()); let mut table_config = TableOptions::new(); - table_config.set_file_format(FileType::JSON); + //table_config.set_file_format(FileType::JSON); table_config.alter_with_string_hash_map(&option_map)?; let json_options = JsonWriterOptions::try_from(&table_config.json)?; diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index e91f30c55d81..c275152642f0 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -54,8 +54,8 @@ pub use error::{ SharedResult, }; pub use file_options::file_type::{ - GetExt, DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, - DEFAULT_CSV_EXTENSION, DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION, + GetExt, DEFAULT_ARROW_EXTENSION, DEFAULT_AVRO_EXTENSION, DEFAULT_CSV_EXTENSION, + DEFAULT_JSON_EXTENSION, DEFAULT_PARQUET_EXTENSION, }; pub use functional_dependencies::{ aggregate_functional_dependencies, get_required_group_by_exprs_indices, @@ -70,7 +70,6 @@ pub use stats::{ColumnStatistics, Statistics}; pub use table_reference::{ResolvedTableReference, TableReference}; pub use unnest::UnnestOptions; pub use utils::project_schema; -pub use file_options::file_type::FileType; /// Downcast an Arrow Array to a concrete type, return an `DataFusionError::Internal` if the cast is /// not possible. In normal usage of DataFusion the downcast should always succeed. 
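The dataframe changes that follow swap direct FileFormat construction for the
new factories. As a rough illustration of what DataFrame::write_csv now builds
when explicit writer options are supplied (a hedged sketch against this patch,
not an exact excerpt of it):

    use std::sync::Arc;

    use datafusion::datasource::file_format::csv::CsvFormatFactory;
    use datafusion::datasource::file_format::format_as_file_type;
    use datafusion_common::config::CsvOptions;
    use datafusion_common::GetExt;

    fn main() {
        // Writer options seed the factory, mirroring the Some(csv_opts) branch
        // in the hunk below; CsvFormatFactory::new() falls back to session defaults.
        let mut csv_opts = CsvOptions::default();
        csv_opts.delimiter = b'|';
        let factory = Arc::new(CsvFormatFactory::new_with_options(csv_opts));

        // The wrapped factory is what LogicalPlanBuilder::copy_to receives as
        // the plan's FileType.
        let file_type = format_as_file_type(factory);
        assert_eq!(file_type.get_ext(), "csv");
    }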
diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 45205a8dfe73..751dc20a2f08 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -26,9 +26,9 @@ use std::sync::Arc; use crate::arrow::record_batch::RecordBatch; use crate::arrow::util::pretty; -use crate::datasource::file_format::csv::CsvFormat; +use crate::datasource::file_format::csv::CsvFormatFactory; use crate::datasource::file_format::format_as_file_type; -use crate::datasource::file_format::json::JsonFormat; +use crate::datasource::file_format::json::JsonFormatFactory; use crate::datasource::{provider_as_source, MemTable, TableProvider}; use crate::error::Result; use crate::execution::context::{SessionState, TaskContext}; @@ -1241,12 +1241,12 @@ impl DataFrame { } let format = if let Some(csv_opts) = writer_options { - CsvFormat::default().with_options(csv_opts) + Arc::new(CsvFormatFactory::new_with_options(csv_opts)) } else { - CsvFormat::default() + Arc::new(CsvFormatFactory::new()) }; - let file_type = format_as_file_type(Arc::new(format)); + let file_type = format_as_file_type(format); let plan = LogicalPlanBuilder::copy_to( self.plan, @@ -1301,12 +1301,12 @@ impl DataFrame { } let format = if let Some(json_opts) = writer_options { - JsonFormat::default().with_options(json_opts) + Arc::new(JsonFormatFactory::new_with_options(json_opts)) } else { - JsonFormat::default() + Arc::new(JsonFormatFactory::new()) }; - let file_type = format_as_file_type(Arc::new(format)); + let file_type = format_as_file_type(format); let plan = LogicalPlanBuilder::copy_to( self.plan, diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index 274269fa662b..f40d2ab516bb 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -17,7 +17,7 @@ use std::sync::Arc; -use crate::datasource::file_format::{format_as_file_type, parquet::ParquetFormat}; +use crate::datasource::file_format::{format_as_file_type, parquet::ParquetFormatFactory}; use super::{ DataFrame, DataFrameWriteOptions, DataFusionError, LogicalPlanBuilder, RecordBatch, @@ -62,12 +62,12 @@ impl DataFrame { } let format = if let Some(parquet_opts) = writer_options { - ParquetFormat::default().with_options(parquet_opts) + Arc::new(ParquetFormatFactory::new_with_options(parquet_opts)) } else { - ParquetFormat::default() + Arc::new(ParquetFormatFactory::new()) }; - let file_type = format_as_file_type(Arc::new(format)); + let file_type = format_as_file_type(format); let plan = LogicalPlanBuilder::copy_to( self.plan, diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index 8c6790541597..abe19cfe5ec2 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -21,12 +21,14 @@ use std::any::Any; use std::borrow::Cow; +use std::collections::HashMap; use std::fmt::{self, Debug}; use std::sync::Arc; use super::file_compression_type::FileCompressionType; use super::write::demux::start_demuxer_task; use super::write::{create_writer, SharedBuffer}; +use super::FileFormatFactory; use crate::datasource::file_format::FileFormat; use crate::datasource::physical_plan::{ ArrowExec, FileGroupDisplay, FileScanConfig, FileSinkConfig, @@ -40,7 +42,8 @@ use arrow::ipc::reader::FileReader; use arrow::ipc::writer::IpcWriteOptions; use arrow::ipc::{root_as_message, CompressionType}; use arrow_schema::{ArrowError, Schema, 
SchemaRef};
-use datafusion_common::{not_impl_err, DataFusionError, Statistics};
+use datafusion_common::parsers::CompressionTypeVariant;
+use datafusion_common::{not_impl_err, DataFusionError, GetExt, Statistics, DEFAULT_ARROW_EXTENSION};
 use datafusion_execution::{SendableRecordBatchStream, TaskContext};
 use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement};
 use datafusion_physical_plan::insert::{DataSink, DataSinkExec};
@@ -61,6 +64,34 @@ const INITIAL_BUFFER_BYTES: usize = 1048576;
 /// If the buffered Arrow data exceeds this size, it is flushed to object store
 const BUFFER_FLUSH_BYTES: usize = 1024000;
 
+/// Factory struct used to create [ArrowFormat]
+pub struct ArrowFormatFactory;
+
+impl ArrowFormatFactory {
+    /// Creates an instance of [ArrowFormatFactory]
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl FileFormatFactory for ArrowFormatFactory {
+    fn create(
+        &self,
+        _state: &SessionState,
+        _format_options: &HashMap<String, String>,
+    ) -> Result<Arc<dyn FileFormat>> {
+        Ok(Arc::new(ArrowFormat::default()))
+    }
+
+    fn default(&self) -> Arc<dyn FileFormat> {
+        Arc::new(ArrowFormat::default())
+    }
+}
+
+impl GetExt for ArrowFormatFactory {
+    fn get_ext(&self) -> String {
+        // Removes the dot, i.e. ".arrow" -> "arrow"
+        DEFAULT_ARROW_EXTENSION[1..].to_string()
+    }
+}
+
 /// Arrow `FileFormat` implementation.
 #[derive(Default, Debug)]
 pub struct ArrowFormat;
@@ -71,6 +102,20 @@ impl FileFormat for ArrowFormat {
         self
     }
 
+    fn get_ext(&self) -> String {
+        ArrowFormatFactory::new().get_ext()
+    }
+
+    fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result<String> {
+        let ext = self.get_ext();
+        match file_compression_type.get_variant() {
+            CompressionTypeVariant::UNCOMPRESSED => Ok(ext),
+            _ => Err(DataFusionError::Internal(
+                "Arrow FileFormat does not support compression.".into(),
+            )),
+        }
+    }
+
     async fn infer_schema(
         &self,
         _state: &SessionState,
diff --git a/datafusion/core/src/datasource/file_format/avro.rs b/datafusion/core/src/datasource/file_format/avro.rs
index 7b2c26a2c4f9..5088034e06b7 100644
--- a/datafusion/core/src/datasource/file_format/avro.rs
+++ b/datafusion/core/src/datasource/file_format/avro.rs
@@ -18,15 +18,22 @@
 //!
[`AvroFormat`] Apache Avro [`FileFormat`] abstractions
 
 use std::any::Any;
+use std::collections::HashMap;
 use std::sync::Arc;
 
 use arrow::datatypes::Schema;
 use arrow::datatypes::SchemaRef;
 use async_trait::async_trait;
+use datafusion_common::parsers::CompressionTypeVariant;
+use datafusion_common::DataFusionError;
+use datafusion_common::GetExt;
+use datafusion_common::DEFAULT_AVRO_EXTENSION;
 use datafusion_physical_expr::PhysicalExpr;
 use object_store::{GetResultPayload, ObjectMeta, ObjectStore};
 
+use super::file_compression_type::FileCompressionType;
 use super::FileFormat;
+use super::FileFormatFactory;
 use crate::datasource::avro_to_arrow::read_avro_schema_from_reader;
 use crate::datasource::physical_plan::{AvroExec, FileScanConfig};
 use crate::error::Result;
@@ -34,6 +41,34 @@ use crate::execution::context::SessionState;
 use crate::physical_plan::ExecutionPlan;
 use crate::physical_plan::Statistics;
 
+/// Factory struct used to create [AvroFormat]
+pub struct AvroFormatFactory;
+
+impl AvroFormatFactory {
+    /// Creates an instance of [AvroFormatFactory]
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl FileFormatFactory for AvroFormatFactory {
+    fn create(
+        &self,
+        _state: &SessionState,
+        _format_options: &HashMap<String, String>,
+    ) -> Result<Arc<dyn FileFormat>> {
+        Ok(Arc::new(AvroFormat::default()))
+    }
+
+    fn default(&self) -> Arc<dyn FileFormat> {
+        Arc::new(AvroFormat::default())
+    }
+}
+
+impl GetExt for AvroFormatFactory {
+    fn get_ext(&self) -> String {
+        // Removes the dot, i.e. ".avro" -> "avro"
+        DEFAULT_AVRO_EXTENSION[1..].to_string()
+    }
+}
+
 /// Avro `FileFormat` implementation.
 #[derive(Default, Debug)]
 pub struct AvroFormat;
@@ -44,6 +79,20 @@ impl FileFormat for AvroFormat {
         self
     }
 
+    fn get_ext(&self) -> String {
+        AvroFormatFactory::new().get_ext()
+    }
+
+    fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result<String> {
+        let ext = self.get_ext();
+        match file_compression_type.get_variant() {
+            CompressionTypeVariant::UNCOMPRESSED => Ok(ext),
+            _ => Err(DataFusionError::Internal(
+                "Avro FileFormat does not support compression.".into(),
+            )),
+        }
+    }
+
     async fn infer_schema(
         &self,
         _state: &SessionState,
diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs
index 2139b35621f2..c08a101acb60 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -18,12 +18,12 @@
 //!
[`CsvFormat`], Comma Separated Value (CSV) [`FileFormat`] abstractions
 
 use std::any::Any;
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::fmt::{self, Debug};
 use std::sync::Arc;
 
 use super::write::orchestration::stateless_multipart_put;
-use super::FileFormat;
+use super::{FileFormat, FileFormatFactory};
 use crate::datasource::file_format::file_compression_type::FileCompressionType;
 use crate::datasource::file_format::write::BatchSerializer;
 use crate::datasource::physical_plan::{
@@ -40,9 +40,9 @@ use arrow::array::RecordBatch;
 use arrow::csv::WriterBuilder;
 use arrow::datatypes::SchemaRef;
 use arrow::datatypes::{DataType, Field, Fields, Schema};
-use datafusion_common::config::CsvOptions;
+use datafusion_common::config::{ConfigField, ConfigFileType, CsvOptions};
 use datafusion_common::file_options::csv_writer::CsvWriterOptions;
-use datafusion_common::{exec_err, not_impl_err, DataFusionError};
+use datafusion_common::{exec_err, not_impl_err, DataFusionError, GetExt, DEFAULT_CSV_EXTENSION};
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement};
 use datafusion_physical_plan::metrics::MetricsSet;
@@ -53,6 +53,56 @@ use futures::stream::BoxStream;
 use futures::{pin_mut, Stream, StreamExt, TryStreamExt};
 use object_store::{delimited::newline_delimited_stream, ObjectMeta, ObjectStore};
 
+/// Factory struct used to create [CsvFormat]
+pub struct CsvFormatFactory {
+    options: Option<CsvOptions>,
+}
+
+impl CsvFormatFactory {
+    /// Creates an instance of [CsvFormatFactory]
+    pub fn new() -> Self {
+        Self { options: None }
+    }
+
+    /// Creates an instance of [CsvFormatFactory] with customized default options
+    pub fn new_with_options(options: CsvOptions) -> Self {
+        Self { options: Some(options) }
+    }
+}
+
+impl FileFormatFactory for CsvFormatFactory {
+    fn create(
+        &self,
+        state: &SessionState,
+        format_options: &HashMap<String, String>,
+    ) -> Result<Arc<dyn FileFormat>> {
+        let csv_options = match &self.options {
+            None => {
+                let mut table_options = state.default_table_options();
+                table_options.set_file_format(ConfigFileType::CSV);
+                table_options.alter_with_string_hash_map(&format_options)?;
+                table_options.csv
+            }
+            Some(csv_options) => {
+                let mut csv_options = csv_options.clone();
+                for (k, v) in format_options {
+                    csv_options.set(k, v)?;
+                }
+                csv_options
+            }
+        };
+
+        Ok(Arc::new(CsvFormat::default().with_options(csv_options)))
+    }
+
+    fn default(&self) -> Arc<dyn FileFormat> {
+        Arc::new(CsvFormat::default())
+    }
+}
+
+impl GetExt for CsvFormatFactory {
+    fn get_ext(&self) -> String {
+        // Removes the dot, i.e. ".csv" -> "csv"
+        DEFAULT_CSV_EXTENSION[1..].to_string()
+    }
+}
+
 /// Character Separated Value `FileFormat` implementation.
#[derive(Debug, Default)] pub struct CsvFormat { @@ -206,6 +256,15 @@ impl FileFormat for CsvFormat { self } + fn get_ext(&self) -> String{ + CsvFormatFactory::new().get_ext() + } + + fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result{ + let ext = self.get_ext(); + Ok(format!("{}{}", ext, file_compression_type.get_ext())) + } + async fn infer_schema( &self, state: &SessionState, @@ -558,7 +617,6 @@ mod tests { use datafusion_common::cast::as_string_array; use datafusion_common::internal_err; use datafusion_common::stats::Precision; - use datafusion_common::{FileType, GetExt}; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion_expr::{col, lit}; @@ -1060,9 +1118,9 @@ mod tests { .with_repartition_file_min_size(0) .with_target_partitions(n_partitions); let ctx = SessionContext::new_with_config(config); - let file_format = CsvFormat::default().with_has_header(false); - let listing_options = ListingOptions::new(Arc::new(file_format)) - .with_file_extension(FileType::CSV.get_ext()); + let file_format = Arc::new(CsvFormat::default().with_has_header(false)); + let listing_options = ListingOptions::new(file_format.clone()) + .with_file_extension(file_format.get_ext()); ctx.register_listing_table( "empty", "tests/data/empty_files/all_empty/", @@ -1113,9 +1171,9 @@ mod tests { .with_repartition_file_min_size(0) .with_target_partitions(n_partitions); let ctx = SessionContext::new_with_config(config); - let file_format = CsvFormat::default().with_has_header(false); - let listing_options = ListingOptions::new(Arc::new(file_format)) - .with_file_extension(FileType::CSV.get_ext()); + let file_format = Arc::new(CsvFormat::default().with_has_header(false)); + let listing_options = ListingOptions::new(file_format.clone()) + .with_file_extension(file_format.get_ext()); ctx.register_listing_table( "empty", "tests/data/empty_files/some_empty", diff --git a/datafusion/core/src/datasource/file_format/file_compression_type.rs b/datafusion/core/src/datasource/file_format/file_compression_type.rs index c1fbe352d37b..4f7e213f0ced 100644 --- a/datafusion/core/src/datasource/file_format/file_compression_type.rs +++ b/datafusion/core/src/datasource/file_format/file_compression_type.rs @@ -22,7 +22,7 @@ use std::str::FromStr; use crate::error::{DataFusionError, Result}; use datafusion_common::parsers::CompressionTypeVariant::{self, *}; -use datafusion_common::{FileType, GetExt}; +use datafusion_common::GetExt; #[cfg(feature = "compression")] use async_compression::tokio::bufread::{ @@ -112,6 +112,11 @@ impl FileCompressionType { variant: UNCOMPRESSED, }; + /// Read only access to self.variant + pub fn get_variant(&self) -> &CompressionTypeVariant{ + &self.variant + } + /// The file is compressed or not pub const fn is_compressed(&self) -> bool { self.variant.is_compressed() @@ -245,90 +250,16 @@ pub trait FileTypeExt { fn get_ext_with_compression(&self, c: FileCompressionType) -> Result; } -impl FileTypeExt for FileType { - fn get_ext_with_compression(&self, c: FileCompressionType) -> Result { - let ext = self.get_ext(); - - match self { - FileType::JSON | FileType::CSV => Ok(format!("{}{}", ext, c.get_ext())), - FileType::AVRO | FileType::ARROW => match c.variant { - UNCOMPRESSED => Ok(ext), - _ => Err(DataFusionError::Internal( - "FileCompressionType can be specified for CSV/JSON FileType.".into(), - )), - }, - #[cfg(feature = "parquet")] - FileType::PARQUET => match c.variant { - UNCOMPRESSED => Ok(ext), - _ => Err(DataFusionError::Internal( - 
"FileCompressionType can be specified for CSV/JSON FileType.".into(), - )), - }, - } - } -} - #[cfg(test)] mod tests { use std::str::FromStr; - use crate::datasource::file_format::file_compression_type::{ - FileCompressionType, FileTypeExt, - }; + use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::error::DataFusionError; - use datafusion_common::file_options::file_type::FileType; - use bytes::Bytes; use futures::StreamExt; - #[test] - fn get_ext_with_compression() { - for (file_type, compression, extension) in [ - (FileType::CSV, FileCompressionType::UNCOMPRESSED, ".csv"), - (FileType::CSV, FileCompressionType::GZIP, ".csv.gz"), - (FileType::CSV, FileCompressionType::XZ, ".csv.xz"), - (FileType::CSV, FileCompressionType::BZIP2, ".csv.bz2"), - (FileType::CSV, FileCompressionType::ZSTD, ".csv.zst"), - (FileType::JSON, FileCompressionType::UNCOMPRESSED, ".json"), - (FileType::JSON, FileCompressionType::GZIP, ".json.gz"), - (FileType::JSON, FileCompressionType::XZ, ".json.xz"), - (FileType::JSON, FileCompressionType::BZIP2, ".json.bz2"), - (FileType::JSON, FileCompressionType::ZSTD, ".json.zst"), - ] { - assert_eq!( - file_type.get_ext_with_compression(compression).unwrap(), - extension - ); - } - - let mut ty_ext_tuple = vec![]; - ty_ext_tuple.push((FileType::AVRO, ".avro")); - #[cfg(feature = "parquet")] - ty_ext_tuple.push((FileType::PARQUET, ".parquet")); - - // Cannot specify compression for these file types - for (file_type, extension) in ty_ext_tuple { - assert_eq!( - file_type - .get_ext_with_compression(FileCompressionType::UNCOMPRESSED) - .unwrap(), - extension - ); - for compression in [ - FileCompressionType::GZIP, - FileCompressionType::XZ, - FileCompressionType::BZIP2, - FileCompressionType::ZSTD, - ] { - assert!(matches!( - file_type.get_ext_with_compression(compression), - Err(DataFusionError::Internal(_)) - )); - } - } - } - #[test] fn from_str() { for (ext, compression_type) in [ diff --git a/datafusion/core/src/datasource/file_format/json.rs b/datafusion/core/src/datasource/file_format/json.rs index d5347c498c71..911f55a7a896 100644 --- a/datafusion/core/src/datasource/file_format/json.rs +++ b/datafusion/core/src/datasource/file_format/json.rs @@ -18,13 +18,14 @@ //! 
[`JsonFormat`]: Line delimited JSON [`FileFormat`] abstractions
 
 use std::any::Any;
+use std::collections::HashMap;
 use std::fmt;
 use std::fmt::Debug;
 use std::io::BufReader;
 use std::sync::Arc;
 
 use super::write::orchestration::stateless_multipart_put;
-use super::{FileFormat, FileScanConfig};
+use super::{FileFormat, FileFormatFactory, FileScanConfig};
 use crate::datasource::file_format::file_compression_type::FileCompressionType;
 use crate::datasource::file_format::write::BatchSerializer;
 use crate::datasource::physical_plan::FileGroupDisplay;
@@ -41,9 +42,9 @@ use arrow::datatypes::SchemaRef;
 use arrow::json;
 use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter};
 use arrow_array::RecordBatch;
-use datafusion_common::config::JsonOptions;
+use datafusion_common::config::{ConfigField, ConfigFileType, JsonOptions};
 use datafusion_common::file_options::json_writer::JsonWriterOptions;
-use datafusion_common::not_impl_err;
+use datafusion_common::{not_impl_err, GetExt, DEFAULT_JSON_EXTENSION};
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement};
 use datafusion_physical_plan::metrics::MetricsSet;
@@ -53,6 +54,57 @@ use async_trait::async_trait;
 use bytes::{Buf, Bytes};
 use object_store::{GetResultPayload, ObjectMeta, ObjectStore};
 
+/// Factory struct used to create [JsonFormat]
+pub struct JsonFormatFactory {
+    options: Option<JsonOptions>,
+}
+
+impl JsonFormatFactory {
+    /// Creates an instance of [JsonFormatFactory]
+    pub fn new() -> Self {
+        Self { options: None }
+    }
+
+    /// Creates an instance of [JsonFormatFactory] with customized default options
+    pub fn new_with_options(options: JsonOptions) -> Self {
+        Self { options: Some(options) }
+    }
+}
+
+impl FileFormatFactory for JsonFormatFactory {
+    fn create(
+        &self,
+        state: &SessionState,
+        format_options: &HashMap<String, String>,
+    ) -> Result<Arc<dyn FileFormat>> {
+        let json_options = match &self.options {
+            None => {
+                let mut table_options = state.default_table_options();
+                table_options.set_file_format(ConfigFileType::JSON);
+                table_options.alter_with_string_hash_map(&format_options)?;
+                table_options.json
+            }
+            Some(json_options) => {
+                let mut json_options = json_options.clone();
+                for (k, v) in format_options {
+                    json_options.set(k, v)?;
+                }
+                json_options
+            }
+        };
+
+        Ok(Arc::new(JsonFormat::default().with_options(json_options)))
+    }
+
+    fn default(&self) -> Arc<dyn FileFormat> {
+        Arc::new(JsonFormat::default())
+    }
+}
+
+impl GetExt for JsonFormatFactory {
+    fn get_ext(&self) -> String {
+        // Removes the dot, i.e. ".json" -> "json"
+        DEFAULT_JSON_EXTENSION[1..].to_string()
+    }
+}
+
 /// New line delimited JSON `FileFormat` implementation.
/// New line delimited JSON `FileFormat` implementation.
#[derive(Debug, Default)]
pub struct JsonFormat {
@@ -95,6 +147,15 @@ impl FileFormat for JsonFormat {
        self
    }

+    fn get_ext(&self) -> String{
+        JsonFormatFactory::new().get_ext()
+    }
+
+    fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result<String>{
+        let ext = self.get_ext();
+        Ok(format!("{}{}", ext, file_compression_type.get_ext()))
+    }
+
    async fn infer_schema(
        &self,
        _state: &SessionState,
diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs
index 48da1302bcc7..e8e64144e6bd 100644
--- a/datafusion/core/src/datasource/file_format/mod.rs
+++ b/datafusion/core/src/datasource/file_format/mod.rs
@@ -42,13 +42,27 @@ use crate::error::Result;
use crate::execution::context::SessionState;
use crate::physical_plan::{ExecutionPlan, Statistics};

-use datafusion_common::file_options::file_type::ExternalFileType;
+use datafusion_common::file_options::file_type::FileType;
use datafusion_common::{internal_err, not_impl_err, GetExt};
use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement};

use async_trait::async_trait;
+use file_compression_type::FileCompressionType;
use object_store::{ObjectMeta, ObjectStore};

+/// A factory struct which produces [FileFormat] structs based on session and command level options
+pub trait FileFormatFactory: Sync + Send + GetExt {
+    /// Initialize a [FileFormat] and configure based on session and command level options
+    fn create(
+        &self,
+        state: &SessionState,
+        format_options: &HashMap<String, String>,
+    ) -> Result<Arc<dyn FileFormat>>;
+
+    /// Initialize a [FileFormat] with all options set to default values
+    fn default(&self) -> Arc<dyn FileFormat>;
+}
+
/// This trait abstracts all the file format specific implementations
/// from the [`TableProvider`]. This helps code re-utilization across
/// providers that support the same file formats.
@@ -60,21 +74,14 @@ pub trait FileFormat: Send + Sync + fmt::Debug {
    /// downcast to a specific implementation.
    fn as_any(&self) -> &dyn Any;

-    /// Returns the extension for this FileFormat, e.g. "csv"
-    fn get_ext(&self) -> String {
-        panic!("Undefined get_ext")
-    }
+    /// Returns the extension for this FileFormat, e.g. "file.csv" -> csv
+    fn get_ext(&self) -> String;

-    /// Updates internal state based on options passed in via SQL strings, e.g.
-    /// COPY table TO custom.format OPTIONS (custom.option1 true)
-    fn update_options_from_string_hashmap(
+    /// Returns the extension for this FileFormat when compressed, e.g. "file.csv.gz" -> csv
+    fn get_ext_with_compression(
        &self,
-        _options: &HashMap<String, String>,
-    ) -> Result<()> {
-        not_impl_err!(
-            "Updating options from SQL strings unsupported for this FileFormat."
-        )
-    }
+        _file_compression_type: &FileCompressionType,
+    ) -> Result<String>;

    /// Infer the common schema of the provided objects. The objects will usually
    /// be analysed up to a given number of records or files (as specified in the
@@ -124,17 +131,23 @@ pub trait FileFormat: Send + Sync + fmt::Debug {
    }
}
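Note: to make the trait surface above concrete, a compressed sketch of a third-party factory for a hypothetical "xyz" format; `XyzFormatFactory` and its extension are invented for illustration, and the real construction logic is elided:

    #[derive(Debug)]
    struct XyzFormatFactory;

    impl FileFormatFactory for XyzFormatFactory {
        fn create(
            &self,
            _state: &SessionState,
            _format_options: &HashMap<String, String>,
        ) -> Result<Arc<dyn FileFormat>> {
            // Merge session defaults with format_options, then build the format.
            not_impl_err!("build an XyzFormat from session and command options here")
        }

        fn default(&self) -> Arc<dyn FileFormat> {
            unimplemented!("an XyzFormat with all options defaulted")
        }
    }

    impl GetExt for XyzFormatFactory {
        fn get_ext(&self) -> String {
            "xyz".to_string() // by convention, no leading dot
        }
    }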
+/// A container of [FileFormat] which also implements [FileType].
+/// This enables converting an Arc<dyn FileFormat> to an Arc<dyn FileType>.
+/// The former trait is a superset of the latter trait, which includes execution time
+/// relevant methods. [FileType] is only used in logical planning and only implements
+/// the subset of methods required during logical planning.
pub struct DefaultFileFormat {
-    file_format: Arc<dyn FileFormat>,
+    file_format_factory: Arc<dyn FileFormatFactory>,
}

impl DefaultFileFormat {
-    pub fn new(file_format: Arc<dyn FileFormat>) -> Self {
-        Self { file_format }
+    /// Constructs a [DefaultFileFormat] wrapper from a [FileFormatFactory]
+    pub fn new(file_format_factory: Arc<dyn FileFormatFactory>) -> Self {
+        Self { file_format_factory }
    }
}

-impl ExternalFileType for DefaultFileFormat {
+impl FileType for DefaultFileFormat {
    fn as_any(&self) -> &dyn Any {
        self
    }
@@ -142,31 +155,35 @@ impl Display for DefaultFileFormat {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        self.file_format.fmt(f)
+        self.file_format_factory.default().fmt(f)
    }
}

impl GetExt for DefaultFileFormat {
    fn get_ext(&self) -> String {
-        self.file_format.get_ext()
+        self.file_format_factory.get_ext()
    }
}

+/// Converts a [FileFormatFactory] to a [FileType]
pub fn format_as_file_type(
-    file_format: Arc<dyn FileFormat>,
-) -> Arc<dyn ExternalFileType> {
-    Arc::new(DefaultFileFormat { file_format })
+    file_format_factory: Arc<dyn FileFormatFactory>,
+) -> Arc<dyn FileType> {
+    Arc::new(DefaultFileFormat { file_format_factory })
}

+/// Converts a [FileType] to a [FileFormatFactory].
+/// Returns an error if the [FileType] cannot be
+/// downcasted to a [DefaultFileFormat].
pub fn file_type_to_format(
-    file_type: &Arc<dyn ExternalFileType>,
-) -> datafusion_common::Result<Arc<dyn FileFormat>> {
+    file_type: &Arc<dyn FileType>,
+) -> datafusion_common::Result<Arc<dyn FileFormatFactory>> {
    match file_type
        .as_ref()
        .as_any()
        .downcast_ref::<DefaultFileFormat>()
    {
-        Some(source) => Ok(source.file_format.clone()),
+        Some(source) => Ok(source.file_format_factory.clone()),
        _ => internal_err!("FileType was not DefaultFileFormat"),
    }
}
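Note: a quick round trip through the two helpers above, using only APIs defined in this hunk; it assumes a `JsonFormatFactory` is available:

    // Wrap a factory for logical planning, then recover it at execution time.
    let file_type: Arc<dyn FileType> = format_as_file_type(Arc::new(JsonFormatFactory::new()));
    let factory: Arc<dyn FileFormatFactory> = file_type_to_format(&file_type)?;
    assert_eq!(factory.get_ext(), "json");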
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 572904254fd7..a3932d116f9a 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -24,7 +24,7 @@ use std::sync::Arc;
use super::write::demux::start_demuxer_task;
use super::write::{create_writer, SharedBuffer};
-use super::{FileFormat, FileScanConfig};
+use super::{FileFormat, FileFormatFactory, FileScanConfig};
use crate::arrow::array::{
    BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch,
};
@@ -44,11 +44,12 @@ use crate::physical_plan::{
    Statistics,
};

-use datafusion_common::config::TableParquetOptions;
+use datafusion_common::config::{ConfigField, ConfigFileType, TableParquetOptions};
use datafusion_common::file_options::parquet_writer::ParquetWriterOptions;
+use datafusion_common::parsers::CompressionTypeVariant;
use datafusion_common::stats::Precision;
use datafusion_common::{
-    exec_err, internal_datafusion_err, not_impl_err, DataFusionError,
+    exec_err, internal_datafusion_err, not_impl_err, DataFusionError, GetExt, DEFAULT_PARQUET_EXTENSION
};
use datafusion_common_runtime::SpawnedTask;
use datafusion_execution::TaskContext;
@@ -57,6 +58,7 @@ use datafusion_physical_plan::metrics::MetricsSet;

use async_trait::async_trait;
use bytes::{BufMut, BytesMut};
+use hashbrown::HashMap;
use object_store::buffered::BufWriter;
use parquet::arrow::arrow_writer::{
    compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter,
@@ -77,7 +79,6 @@ use tokio::task::JoinSet;

use crate::datasource::physical_plan::parquet::ParquetExecBuilder;
use futures::{StreamExt, TryStreamExt};
-use hashbrown::HashMap;
use object_store::path::Path;
use object_store::{ObjectMeta, ObjectStore};

@@ -89,6 +90,59 @@ const INITIAL_BUFFER_BYTES: usize = 1048576;
/// this size, it is flushed to object store
const BUFFER_FLUSH_BYTES: usize = 1024000;

+/// Factory struct used to create [AvroFormat]
+pub struct ParquetFormatFactory{
+    options: Option<TableParquetOptions>
+}
+
+impl ParquetFormatFactory{
+    /// Creates an instance of [ParquetFormatFactory]
+    pub fn new() -> Self{
+        Self{ options: None }
+    }
+
+    /// Creates an instance of [ParquetFormatFactory] with customized default options
+    pub fn new_with_options(options: TableParquetOptions) -> Self{
+        Self{ options: Some(options) }
+    }
+}
+
+impl FileFormatFactory for ParquetFormatFactory{
+    fn create(&self, state: &SessionState, format_options: &std::collections::HashMap<String, String>) -> Result<Arc<dyn FileFormat>>{
+
+        let parquet_options = match &self.options{
+            None => {
+                let mut table_options = state.default_table_options();
+                table_options.set_file_format(ConfigFileType::PARQUET);
+                table_options.alter_with_string_hash_map(&format_options)?;
+                table_options.parquet
+            },
+            Some(parquet_options) => {
+                let mut parquet_options = parquet_options.clone();
+                for (k, v) in format_options{
+                    parquet_options.set(k, v)?;
+                }
+                parquet_options
+            }
+        };
+
+        Ok(Arc::new(ParquetFormat::default().with_options(parquet_options)))
+    }
+
+    fn default(&self) -> Arc<dyn FileFormat>{
+        Arc::new(ParquetFormat::default())
+    }
+}
+
+impl GetExt for ParquetFormatFactory{
+    fn get_ext(&self) -> String{
+        // Removes the dot, i.e. ".parquet" -> "parquet"
+        DEFAULT_PARQUET_EXTENSION[1..].to_string()
+    }
+}
+
/// The Apache Parquet `FileFormat` implementation
#[derive(Debug, Default)]
pub struct ParquetFormat {
@@ -190,6 +244,20 @@ impl FileFormat for ParquetFormat {
        self
    }

+    fn get_ext(&self) -> String{
+        ParquetFormatFactory::new().get_ext()
+    }
+
+    fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result<String>{
+        let ext = self.get_ext();
+        match file_compression_type.get_variant() {
+            CompressionTypeVariant::UNCOMPRESSED => Ok(ext),
+            _ => Err(DataFusionError::Internal(
+                "Parquet FileFormat does not support compression.".into(),
+            )),
+        }
+    }
+
    async fn infer_schema(
        &self,
        state: &SessionState,
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 7f5e80c4988a..35a0859d7bd3 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -25,19 +25,11 @@ use super::helpers::{expr_applicable_for_cols, pruned_partition_list, split_file
use super::PartitionedFile;

#[cfg(feature = "parquet")]
-use crate::datasource::file_format::parquet::ParquetFormat;
use crate::datasource::{
    create_ordering, get_statistics_with_limit, TableProvider, TableType,
};
use crate::datasource::{
-    file_format::{
-        arrow::ArrowFormat,
-        avro::AvroFormat,
-        csv::CsvFormat,
-        file_compression_type::{FileCompressionType, FileTypeExt},
-        json::JsonFormat,
-        FileFormat,
-    },
+    file_format::{file_compression_type::FileCompressionType, FileFormat},
    listing::ListingTableUrl,
    physical_plan::{FileScanConfig, FileSinkConfig},
};
@@ -51,7 +43,8 @@ use crate::{
use arrow::datatypes::{DataType, Field, SchemaBuilder, SchemaRef};
use arrow_schema::Schema;
use datafusion_common::{
-    internal_err, plan_err, project_schema, Constraints, FileType, SchemaExt, ToDFSchema,
+    config_datafusion_err, internal_err, plan_err,
+    project_schema, Constraints, SchemaExt, ToDFSchema,
};
use datafusion_execution::cache::cache_manager::FileStatisticsCache;
use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache;
@@ -119,9 +112,7 @@ impl ListingTableConfig {
        }
    }

-    fn infer_file_type(path: &str) -> Result<(FileType, String)> {
-        let 
err_msg = format!("Unable to infer file type from path: {path}"); - + fn infer_file_extension(path: &str) -> Result { let mut exts = path.rsplit('.'); let mut splitted = exts.next().unwrap_or(""); @@ -133,14 +124,7 @@ impl ListingTableConfig { splitted = exts.next().unwrap_or(""); } - let file_type = FileType::from_str(splitted) - .map_err(|_| DataFusionError::Internal(err_msg.to_owned()))?; - - let ext = file_type - .get_ext_with_compression(file_compression_type.to_owned()) - .map_err(|_| DataFusionError::Internal(err_msg))?; - - Ok((file_type, ext)) + Ok(splitted.to_string()) } /// Infer `ListingOptions` based on `table_path` suffix. @@ -161,25 +145,15 @@ impl ListingTableConfig { .await .ok_or_else(|| DataFusionError::Internal("No files for table".into()))??; - let (file_type, file_extension) = - ListingTableConfig::infer_file_type(file.location.as_ref())?; + let file_extension = + ListingTableConfig::infer_file_extension(file.location.as_ref())?; - let mut table_options = state.default_table_options(); - table_options.set_file_format(file_type.clone()); - let file_format: Arc = match file_type { - FileType::CSV => { - Arc::new(CsvFormat::default().with_options(table_options.csv)) - } - #[cfg(feature = "parquet")] - FileType::PARQUET => { - Arc::new(ParquetFormat::default().with_options(table_options.parquet)) - } - FileType::AVRO => Arc::new(AvroFormat), - FileType::JSON => { - Arc::new(JsonFormat::default().with_options(table_options.json)) - } - FileType::ARROW => Arc::new(ArrowFormat), - }; + let file_format = state + .get_file_format_factory(&file_extension) + .ok_or(config_datafusion_err!( + "No file_format found with extension {file_extension}" + ))? + .create(&state, &HashMap::new())?; let listing_options = ListingOptions::new(file_format) .with_file_extension(file_extension) @@ -1060,6 +1034,10 @@ impl ListingTable { #[cfg(test)] mod tests { use super::*; + use crate::datasource::file_format::avro::AvroFormat; + use crate::datasource::file_format::csv::CsvFormat; + use crate::datasource::file_format::json::JsonFormat; + use crate::datasource::file_format::parquet::ParquetFormat; #[cfg(feature = "parquet")] use crate::datasource::{provider_as_source, MemTable}; use crate::execution::options::ArrowReadOptions; @@ -1073,7 +1051,7 @@ mod tests { use arrow::record_batch::RecordBatch; use arrow_schema::SortOptions; use datafusion_common::stats::Precision; - use datafusion_common::{assert_contains, GetExt, ScalarValue}; + use datafusion_common::{assert_contains, ScalarValue}; use datafusion_expr::{BinaryExpr, LogicalPlanBuilder, Operator}; use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_plan::ExecutionPlanProperties; @@ -1104,6 +1082,8 @@ mod tests { #[cfg(feature = "parquet")] #[tokio::test] async fn load_table_stats_by_default() -> Result<()> { + use crate::datasource::file_format::parquet::ParquetFormat; + let testdata = crate::test_util::parquet_test_data(); let filename = format!("{}/{}", testdata, "alltypes_plain.parquet"); let table_path = ListingTableUrl::parse(filename).unwrap(); @@ -1128,6 +1108,8 @@ mod tests { #[cfg(feature = "parquet")] #[tokio::test] async fn load_table_stats_when_no_stats() -> Result<()> { + use crate::datasource::file_format::parquet::ParquetFormat; + let testdata = crate::test_util::parquet_test_data(); let filename = format!("{}/{}", testdata, "alltypes_plain.parquet"); let table_path = ListingTableUrl::parse(filename).unwrap(); @@ -1162,7 +1144,10 @@ mod tests { let options = 
ListingOptions::new(Arc::new(ParquetFormat::default())); let schema = options.infer_schema(&state, &table_path).await.unwrap(); - use crate::physical_plan::expressions::col as physical_col; + use crate::{ + datasource::file_format::parquet::ParquetFormat, + physical_plan::expressions::col as physical_col, + }; use std::ops::Add; // (file_sort_order, expected_result) @@ -1253,7 +1238,7 @@ mod tests { register_test_store(&ctx, &[(&path, 100)]); let opt = ListingOptions::new(Arc::new(AvroFormat {})) - .with_file_extension(FileType::AVRO.get_ext()) + .with_file_extension(AvroFormat::default().get_ext()) .with_table_partition_cols(vec![(String::from("p1"), DataType::Utf8)]) .with_target_partitions(4); @@ -1516,7 +1501,7 @@ mod tests { "10".into(), ); helper_test_append_new_files_to_table( - FileType::JSON, + JsonFormat::default().get_ext(), FileCompressionType::UNCOMPRESSED, Some(config_map), 2, @@ -1534,7 +1519,7 @@ mod tests { "10".into(), ); helper_test_append_new_files_to_table( - FileType::CSV, + CsvFormat::default().get_ext(), FileCompressionType::UNCOMPRESSED, Some(config_map), 2, @@ -1552,7 +1537,7 @@ mod tests { "10".into(), ); helper_test_append_new_files_to_table( - FileType::PARQUET, + ParquetFormat::default().get_ext(), FileCompressionType::UNCOMPRESSED, Some(config_map), 2, @@ -1570,7 +1555,7 @@ mod tests { "20".into(), ); helper_test_append_new_files_to_table( - FileType::PARQUET, + ParquetFormat::default().get_ext(), FileCompressionType::UNCOMPRESSED, Some(config_map), 1, @@ -1756,7 +1741,7 @@ mod tests { ); config_map.insert("datafusion.execution.batch_size".into(), "1".into()); helper_test_append_new_files_to_table( - FileType::PARQUET, + ParquetFormat::default().get_ext(), FileCompressionType::UNCOMPRESSED, Some(config_map), 2, @@ -1774,7 +1759,7 @@ mod tests { "zstd".into(), ); let e = helper_test_append_new_files_to_table( - FileType::PARQUET, + ParquetFormat::default().get_ext(), FileCompressionType::UNCOMPRESSED, Some(config_map), 2, @@ -1787,7 +1772,7 @@ mod tests { } async fn helper_test_append_new_files_to_table( - file_type: FileType, + file_type_ext: String, file_compression_type: FileCompressionType, session_config_map: Option>, expected_n_files_per_insert: usize, @@ -1824,8 +1809,8 @@ mod tests { // Register appropriate table depending on file_type we want to test let tmp_dir = TempDir::new()?; - match file_type { - FileType::CSV => { + match file_type_ext.as_str() { + "csv" => { session_ctx .register_csv( "t", @@ -1836,7 +1821,7 @@ mod tests { ) .await?; } - FileType::JSON => { + "json" => { session_ctx .register_json( "t", @@ -1847,7 +1832,7 @@ mod tests { ) .await?; } - FileType::PARQUET => { + "parquet" => { session_ctx .register_parquet( "t", @@ -1856,7 +1841,7 @@ mod tests { ) .await?; } - FileType::AVRO => { + "avro" => { session_ctx .register_avro( "t", @@ -1865,7 +1850,7 @@ mod tests { ) .await?; } - FileType::ARROW => { + "arrow" => { session_ctx .register_arrow( "t", @@ -1874,6 +1859,7 @@ mod tests { ) .await?; } + _ => panic!("Unrecognized file extension {file_type_ext}"), } // Create and register the source table with the provided schema and inserted data diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index 6e4749824395..1d4d08481895 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -18,14 +18,8 @@ //! 
Factory for creating ListingTables with default options use std::path::Path; -use std::str::FromStr; use std::sync::Arc; -#[cfg(feature = "parquet")] -use crate::datasource::file_format::parquet::ParquetFormat; -use crate::datasource::file_format::{ - arrow::ArrowFormat, avro::AvroFormat, csv::CsvFormat, json::JsonFormat, FileFormat, -}; use crate::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, }; @@ -34,8 +28,8 @@ use crate::datasource::TableProvider; use crate::execution::context::SessionState; use arrow::datatypes::{DataType, SchemaRef}; -use datafusion_common::Result; -use datafusion_common::{arrow_datafusion_err, DataFusionError, FileType}; +use datafusion_common::{arrow_datafusion_err, DataFusionError}; +use datafusion_common::{config_datafusion_err, Result}; use datafusion_expr::CreateExternalTable; use async_trait::async_trait; @@ -58,28 +52,15 @@ impl TableProviderFactory for ListingTableFactory { state: &SessionState, cmd: &CreateExternalTable, ) -> Result> { - let file_type = FileType::from_str(cmd.file_type.as_str()).map_err(|_| { - DataFusionError::Execution(format!("Unknown FileType {}", cmd.file_type)) - })?; - let mut table_options = state.default_table_options(); - table_options.set_file_format(file_type.clone()); - table_options.alter_with_string_hash_map(&cmd.options)?; + let file_format = state + .get_file_format_factory(cmd.file_type.as_str()) + .ok_or(config_datafusion_err!( + "Unable to create table with format {}! Could not find FileFormat.", + cmd.file_type + ))? + .create(state, &cmd.options)?; let file_extension = get_extension(cmd.location.as_str()); - let file_format: Arc = match file_type { - FileType::CSV => { - Arc::new(CsvFormat::default().with_options(table_options.csv)) - } - #[cfg(feature = "parquet")] - FileType::PARQUET => { - Arc::new(ParquetFormat::default().with_options(table_options.parquet)) - } - FileType::AVRO => Arc::new(AvroFormat), - FileType::JSON => { - Arc::new(JsonFormat::default().with_options(table_options.json)) - } - FileType::ARROW => Arc::new(ArrowFormat), - }; let (provided_schema, table_partition_cols) = if cmd.schema.fields().is_empty() { ( @@ -166,7 +147,9 @@ mod tests { use std::collections::HashMap; use super::*; - use crate::execution::context::SessionContext; + use crate::{ + datasource::file_format::csv::CsvFormat, execution::context::SessionContext, + }; use datafusion_common::{Constraints, DFSchema, TableReference}; diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index c06c630c45d1..327fbd976e87 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -534,13 +534,13 @@ mod tests { use super::*; use crate::dataframe::DataFrameWriteOptions; + use crate::datasource::file_format::csv::CsvFormat; use crate::prelude::*; use crate::test::{partitioned_csv_config, partitioned_file_groups}; use crate::{scalar::ScalarValue, test_util::aggr_test_schema}; use arrow::datatypes::*; use datafusion_common::test_util::arrow_test_data; - use datafusion_common::FileType; use object_store::chunked::ChunkedStore; use object_store::local::LocalFileSystem; @@ -561,6 +561,8 @@ mod tests { async fn csv_exec_with_projection( file_compression_type: FileCompressionType, ) -> Result<()> { + use crate::datasource::file_format::csv::CsvFormat; + let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema(); @@ -572,7 
+574,7 @@ mod tests { path.as_str(), filename, 1, - FileType::CSV, + Arc::new(CsvFormat::default()), file_compression_type.to_owned(), tmp_dir.path(), )?; @@ -627,6 +629,8 @@ mod tests { async fn csv_exec_with_mixed_order_projection( file_compression_type: FileCompressionType, ) -> Result<()> { + use crate::datasource::file_format::csv::CsvFormat; + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); let session_ctx = SessionContext::new_with_config(cfg); let task_ctx = session_ctx.task_ctx(); @@ -639,7 +643,7 @@ mod tests { path.as_str(), filename, 1, - FileType::CSV, + Arc::new(CsvFormat::default()), file_compression_type.to_owned(), tmp_dir.path(), )?; @@ -694,6 +698,8 @@ mod tests { async fn csv_exec_with_limit( file_compression_type: FileCompressionType, ) -> Result<()> { + use crate::datasource::file_format::csv::CsvFormat; + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); let session_ctx = SessionContext::new_with_config(cfg); let task_ctx = session_ctx.task_ctx(); @@ -706,7 +712,7 @@ mod tests { path.as_str(), filename, 1, - FileType::CSV, + Arc::new(CsvFormat::default()), file_compression_type.to_owned(), tmp_dir.path(), )?; @@ -759,6 +765,8 @@ mod tests { async fn csv_exec_with_missing_column( file_compression_type: FileCompressionType, ) -> Result<()> { + use crate::datasource::file_format::csv::CsvFormat; + let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema_with_missing_col(); @@ -770,7 +778,7 @@ mod tests { path.as_str(), filename, 1, - FileType::CSV, + Arc::new(CsvFormat::default()), file_compression_type.to_owned(), tmp_dir.path(), )?; @@ -813,6 +821,8 @@ mod tests { async fn csv_exec_with_partition( file_compression_type: FileCompressionType, ) -> Result<()> { + use crate::datasource::file_format::csv::CsvFormat; + let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema(); @@ -824,7 +834,7 @@ mod tests { path.as_str(), filename, 1, - FileType::CSV, + Arc::new(CsvFormat::default()), file_compression_type.to_owned(), tmp_dir.path(), )?; @@ -929,7 +939,7 @@ mod tests { path.as_str(), filename, 1, - FileType::CSV, + Arc::new(CsvFormat::default()), file_compression_type.to_owned(), tmp_dir.path(), ) diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index e97554a791bd..c051b5d9b57d 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -384,7 +384,6 @@ mod tests { use super::*; use crate::dataframe::DataFrameWriteOptions; - use crate::datasource::file_format::file_compression_type::FileTypeExt; use crate::datasource::file_format::{json::JsonFormat, FileFormat}; use crate::datasource::object_store::ObjectStoreUrl; use crate::execution::context::SessionState; @@ -397,7 +396,6 @@ mod tests { use arrow::array::Array; use arrow::datatypes::{Field, SchemaBuilder}; use datafusion_common::cast::{as_int32_array, as_int64_array, as_string_array}; - use datafusion_common::FileType; use object_store::chunked::ChunkedStore; use object_store::local::LocalFileSystem; use rstest::*; @@ -419,7 +417,7 @@ mod tests { TEST_DATA_BASE, filename, 1, - FileType::JSON, + Arc::new(JsonFormat::default()), file_compression_type.to_owned(), work_dir, ) @@ -453,7 +451,7 @@ mod tests { TEST_DATA_BASE, filename, 1, - FileType::JSON, + Arc::new(JsonFormat::default()), 
file_compression_type.to_owned(), tmp_dir.path(), ) @@ -472,8 +470,8 @@ mod tests { let path_buf = Path::new(url.path()).join(path); let path = path_buf.to_str().unwrap(); - let ext = FileType::JSON - .get_ext_with_compression(file_compression_type.to_owned()) + let ext = JsonFormat::default() + .get_ext_with_compression(&file_compression_type) .unwrap(); let read_options = NdJsonReadOptions::default() @@ -904,8 +902,8 @@ mod tests { let url: &Url = store_url.as_ref(); let path_buf = Path::new(url.path()).join(path); let path = path_buf.to_str().unwrap(); - let ext = FileType::JSON - .get_ext_with_compression(file_compression_type.to_owned()) + let ext = JsonFormat::default() + .get_ext_with_compression(&file_compression_type) .unwrap(); let read_option = NdJsonReadOptions::default() diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index ec21c5504c69..c05816f9dc5f 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -800,7 +800,7 @@ mod tests { use arrow::datatypes::{Field, Schema, SchemaBuilder}; use arrow::record_batch::RecordBatch; use arrow_schema::Fields; - use datafusion_common::{assert_contains, FileType, GetExt, ScalarValue, ToDFSchema}; + use datafusion_common::{assert_contains, ScalarValue, ToDFSchema}; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::{col, lit, when, Expr}; use datafusion_physical_expr::create_physical_expr; @@ -1996,7 +1996,7 @@ mod tests { // Configure listing options let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)) - .with_file_extension(FileType::PARQUET.get_ext()); + .with_file_extension(ParquetFormat::default().get_ext()); // execute a simple query and write the results to parquet let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out"; diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 2be6938e3235..0969bca9364c 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -25,7 +25,12 @@ use crate::catalog::{ MemoryCatalogProviderList, }; use crate::datasource::cte_worktable::CteWorkTable; -use crate::datasource::file_format::{format_as_file_type, FileFormat}; +use crate::datasource::file_format::arrow::ArrowFormatFactory; +use crate::datasource::file_format::avro::AvroFormatFactory; +use crate::datasource::file_format::csv::CsvFormatFactory; +use crate::datasource::file_format::json::JsonFormatFactory; +use crate::datasource::file_format::parquet::ParquetFormatFactory; +use crate::datasource::file_format::{format_as_file_type, FileFormatFactory}; use crate::datasource::function::{TableFunction, TableFunctionImpl}; use crate::datasource::provider::{DefaultTableFactory, TableProviderFactory}; use crate::datasource::provider_as_source; @@ -42,7 +47,7 @@ use chrono::{DateTime, Utc}; use datafusion_common::alias::AliasGenerator; use datafusion_common::config::{ConfigExtension, ConfigOptions, TableOptions}; use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; -use datafusion_common::file_options::file_type::ExternalFileType; +use datafusion_common::file_options::file_type::FileType; use datafusion_common::tree_node::TreeNode; use datafusion_common::{ config_err, not_impl_err, plan_datafusion_err, DFSchema, 
DataFusionError,
@@ -114,7 +119,7 @@ pub struct SessionState {
    /// Deserializer registry for extensions.
    serializer_registry: Arc<dyn SerializerRegistry>,
    /// Holds registered external FileFormat implementations
-    file_formats: HashMap<String, Arc<dyn ExternalFileType>>,
+    file_formats: HashMap<String, Arc<dyn FileFormatFactory>>,
    /// Session configuration
    config: SessionConfig,
    /// Table options
@@ -201,6 +206,8 @@ impl SessionState {
        table_factories.insert("AVRO".into(), Arc::new(DefaultTableFactory::new()));
        table_factories.insert("ARROW".into(), Arc::new(DefaultTableFactory::new()));

+
+
        if config.create_default_catalog_and_schema() {
            let default_catalog = MemoryCatalogProvider::new();
@@ -245,6 +252,32 @@ impl SessionState {
            function_factory: None,
        };

+        #[cfg(feature = "parquet")]
+        match new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false){
+            Ok(_) => (),
+            Err(e) => log::info!("Unable to register default ParquetFormat: {e}")
+        };
+
+        match new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false){
+            Ok(_) => (),
+            Err(e) => log::info!("Unable to register default JsonFormat: {e}")
+        };
+
+        match new_self.register_file_format(Arc::new(CsvFormatFactory::new()), false){
+            Ok(_) => (),
+            Err(e) => log::info!("Unable to register default CsvFormat: {e}")
+        };
+
+        match new_self.register_file_format(Arc::new(AvroFormatFactory::new()), false){
+            Ok(_) => (),
+            Err(e) => log::info!("Unable to register default AvroFormat: {e}")
+        };
+
+        match new_self.register_file_format(Arc::new(ArrowFormatFactory::new()), false){
+            Ok(_) => (),
+            Err(e) => log::info!("Unable to register default ArrowFormat: {e}")
+        };
+
        // register built in functions
        functions::register_all(&mut new_self)
            .expect("can not register built in functions");
@@ -835,11 +868,11 @@ impl SessionState {
        self.table_options.extensions.insert(extension)
    }
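Note: a minimal sketch of registering a custom factory against the API below; `TsvFormatFactory` is hypothetical, and a `SessionConfig` plus `RuntimeEnv` (`config`, `runtime`) are assumed to be in scope for the constructor available at this revision:

    let mut state = SessionState::new_with_config_rt(config, runtime);
    // overwrite=false: fail instead of silently replacing an existing "tsv" handler.
    state.register_file_format(Arc::new(TsvFormatFactory::new()), false)?;
    // CREATE EXTERNAL TABLE ... STORED AS TSV and COPY TO ... now resolve by extension.
    assert!(state.get_file_format_factory("tsv").is_some());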
-    /// Adds or updates a [ExternalFileType] which can be used with COPY TO or CREATE EXTERNAL TABLE statements for reading
+    /// Adds or updates a [FileType] which can be used with COPY TO or CREATE EXTERNAL TABLE statements for reading
    /// and writing files of custom formats.
    pub fn register_file_format(
        &mut self,
-        file_format: Arc<dyn ExternalFileType>,
+        file_format: Arc<dyn FileFormatFactory>,
        overwrite: bool,
    ) -> Result<(), DataFusionError> {
        let ext = file_format.get_ext();
@@ -851,6 +884,15 @@ impl SessionState {
        Ok(())
    }

+    /// Retrieves a [FileFormatFactory] based on file extension which has been registered
+    /// via SessionContext::register_file_format
+    pub fn get_file_format_factory(
+        &self,
+        ext: &str,
+    ) -> Option<Arc<dyn FileFormatFactory>> {
+        self.file_formats.get(ext).cloned()
+    }
+
    /// Get a new TaskContext to run in this session
    pub fn task_ctx(&self) -> Arc<TaskContext> {
        Arc::new(TaskContext::from(self))
@@ -997,7 +1039,7 @@ impl<'a> ContextProvider for SessionContextProvider<'a> {
    fn get_file_type(
        &self,
        ext: &str,
-    ) -> datafusion_common::Result<Arc<dyn ExternalFileType>> {
+    ) -> datafusion_common::Result<Arc<dyn FileType>> {
        self.state
            .file_formats
            .get(ext)
diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index 055c317af418..6b060fed5a4a 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -22,8 +22,6 @@ use std::collections::HashMap;
use std::fmt::Write;
use std::sync::Arc;

-#[cfg(feature = "parquet")]
-use crate::datasource::file_format::file_type_to_format;
use crate::datasource::listing::ListingTableUrl;
use crate::datasource::physical_plan::FileSinkConfig;
use crate::datasource::source_as_provider;
@@ -71,8 +69,7 @@ use arrow_array::builder::StringBuilder;
use arrow_array::RecordBatch;
use datafusion_common::display::ToStringifiedPlan;
use datafusion_common::{
-    exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema,
-    ScalarValue,
+    config_datafusion_err, exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue
};
use datafusion_expr::dml::CopyTo;
use datafusion_expr::expr::{
@@ -780,9 +777,14 @@ impl DefaultPhysicalPlanner {
                    table_partition_cols,
                    overwrite: false,
                };
-
-                let sink_format = file_type_to_format(file_type)?;
-                sink_format.update_options_from_string_hashmap(source_option_tuples)?;
+                // TODO, how not to lose options inside FileType (which is actually FileFormat)
+                let sink_format = session_state
+                    .get_file_format_factory(&file_type.get_ext())
+                    .ok_or(config_datafusion_err!(
+                        "Unable to copy to format {}! Could not find FileFormat.",
+                        file_type.get_ext()
+                    ))? 
+ .create(session_state, &source_option_tuples)?; sink_format .create_writer_physical_plan(input_exec, session_state, config, None) diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index e91f83f1199b..66469b6bcd37 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -24,9 +24,9 @@ use std::io::{BufReader, BufWriter}; use std::path::Path; use std::sync::Arc; -use crate::datasource::file_format::file_compression_type::{ - FileCompressionType, FileTypeExt, -}; +use crate::datasource::file_format::csv::CsvFormat; +use crate::datasource::file_format::file_compression_type::FileCompressionType; +use crate::datasource::file_format::FileFormat; use crate::datasource::listing::PartitionedFile; use crate::datasource::object_store::ObjectStoreUrl; use crate::datasource::physical_plan::{CsvExec, FileScanConfig}; @@ -40,7 +40,7 @@ use crate::test_util::{aggr_test_schema, arrow_test_data}; use arrow::array::{self, Array, ArrayRef, Decimal128Builder, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; -use datafusion_common::{DataFusionError, FileType, Statistics}; +use datafusion_common::{DataFusionError, Statistics}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalSortExpr}; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; @@ -87,7 +87,7 @@ pub fn scan_partitioned_csv(partitions: usize, work_dir: &Path) -> Result, file_compression_type: FileCompressionType, work_dir: &Path, ) -> Result>> { @@ -120,9 +120,8 @@ pub fn partitioned_file_groups( let filename = format!( "partition-{}{}", i, - file_type - .to_owned() - .get_ext_with_compression(file_compression_type.to_owned()) + file_format + .get_ext_with_compression(&file_compression_type) .unwrap() ); let filename = work_dir.join(filename); @@ -167,7 +166,7 @@ pub fn partitioned_file_groups( for (i, line) in f.lines().enumerate() { let line = line.unwrap(); - if i == 0 && file_type == FileType::CSV { + if i == 0 && file_format.get_ext() == CsvFormat::default().get_ext() { // write header to all partitions for w in writers.iter_mut() { w.write_all(line.as_bytes()).unwrap(); diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 9a588c4db39b..0ac842920e08 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -49,7 +49,7 @@ use crate::{ use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use datafusion_common::display::ToStringifiedPlan; -use datafusion_common::file_options::file_type::ExternalFileType; +use datafusion_common::file_options::file_type::FileType; use datafusion_common::{ get_target_functional_dependencies, internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, @@ -271,7 +271,7 @@ impl LogicalPlanBuilder { pub fn copy_to( input: LogicalPlan, output_url: String, - file_type: Arc, + file_type: Arc, options: HashMap, partition_by: Vec, ) -> Result { diff --git a/datafusion/expr/src/logical_plan/dml.rs b/datafusion/expr/src/logical_plan/dml.rs index fa7011763525..c9eef9bd34cc 100644 --- a/datafusion/expr/src/logical_plan/dml.rs +++ b/datafusion/expr/src/logical_plan/dml.rs @@ -21,7 +21,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema}; -use 
datafusion_common::file_options::file_type::ExternalFileType; +use datafusion_common::file_options::file_type::FileType; use datafusion_common::{DFSchemaRef, TableReference}; use crate::LogicalPlan; @@ -36,7 +36,7 @@ pub struct CopyTo { /// Determines which, if any, columns should be used for hive-style partitioned writes pub partition_by: Vec, /// File type trait - pub file_type: Arc, + pub file_type: Arc, /// SQL Options that can affect the formats pub options: HashMap, } diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 8a12c42bb61d..4732a6b5a596 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -33,7 +33,7 @@ use crate::protobuf::{proto_error, FromProtoError, ToProtoError}; use arrow::datatypes::{DataType, Schema, SchemaRef}; #[cfg(feature = "parquet")] use datafusion::datasource::file_format::parquet::ParquetFormat; -use datafusion::datasource::file_format::{file_type_to_format, format_as_file_type}; +use datafusion::datasource::file_format::{file_type_to_format, format_as_file_type, FileFormatFactory}; use datafusion::{ datasource::{ file_format::{avro::AvroFormat, csv::CsvFormat, FileFormat}, @@ -44,7 +44,7 @@ use datafusion::{ datasource::{provider_as_source, source_as_provider}, prelude::SessionContext, }; -use datafusion_common::file_options::file_type::ExternalFileType; +use datafusion_common::file_options::file_type::FileType; use datafusion_common::{ context, internal_datafusion_err, internal_err, not_impl_err, DataFusionError, Result, TableReference, @@ -120,14 +120,14 @@ pub trait LogicalExtensionCodec: Debug + Send + Sync { &self, _buf: &[u8], _ctx: &SessionContext, - ) -> Result> { + ) -> Result> { not_impl_err!("LogicalExtensionCoden is not provided for file format") } fn try_encode_file_format( &self, _buf: &[u8], - _node: Arc, + _node: Arc, ) -> Result<()> { Ok(()) } @@ -847,7 +847,7 @@ impl AsLogicalPlan for LogicalPlanNode { let input: LogicalPlan = into_logical_plan!(copy.input, ctx, extension_codec)?; - let file_type: Arc = format_as_file_type( + let file_type: Arc = format_as_file_type( extension_codec.try_decode_file_format(©.file_type, ctx)?, ); diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 2d99958731d9..798b6a6f9dbc 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -26,7 +26,7 @@ use arrow::datatypes::{ DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionFields, UnionMode, }; -use datafusion::datasource::file_format::csv::CsvFormat; +use datafusion::datasource::file_format::csv::CsvFormatFactory; use datafusion::datasource::file_format::format_as_file_type; use datafusion_functions_aggregate::count::count_udaf; use prost::Message; @@ -47,7 +47,7 @@ use datafusion_common::config::TableOptions; use datafusion_common::scalar::ScalarStructBuilder; use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, DFSchemaRef, - DataFusionError, FileType, Result, ScalarValue, + DataFusionError, Result, ScalarValue, }; use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ @@ -323,11 +323,7 @@ async fn roundtrip_logical_plan_copy_to_sql_options() -> Result<()> { let ctx = SessionContext::new(); let input = create_csv_scan(&ctx).await?; - let mut table_options = 
ctx.copied_table_options(); - table_options.set_file_format(FileType::CSV); - table_options.set("format.delimiter", ";")?; - - let file_type = format_as_file_type(Arc::new(CsvFormat::default()));; + let file_type = format_as_file_type(Arc::new(CsvFormatFactory::new())); let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), @@ -363,7 +359,7 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> { parquet_format.global.dictionary_page_size_limit = 444; parquet_format.global.max_row_group_size = 555; - let file_type = format_as_file_type(Arc::new(CsvFormat::default()));; + let file_type = format_as_file_type(Arc::new(CsvFormatFactory::new())); let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), @@ -394,7 +390,7 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> { let input = create_csv_scan(&ctx).await?; - let file_type = format_as_file_type(Arc::new(CsvFormat::default())); + let file_type = format_as_file_type(Arc::new(CsvFormatFactory::new())); let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), @@ -437,7 +433,7 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> { csv_format.time_format = Some("HH:mm:ss".to_string()); csv_format.null_value = Some("NIL".to_string()); - let file_type = format_as_file_type(Arc::new(CsvFormat::default())); + let file_type = format_as_file_type(Arc::new(CsvFormatFactory::new())); let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 401d8163db92..c661497b50fd 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use std::vec; use arrow_schema::*; -use datafusion_common::file_options::file_type::ExternalFileType; +use datafusion_common::file_options::file_type::FileType; use datafusion_common::{ field_not_found, internal_err, plan_datafusion_err, SchemaError, }; @@ -50,7 +50,7 @@ pub trait ContextProvider { /// Getter for a datasource fn get_table_source(&self, name: TableReference) -> Result>; - fn get_file_type(&self, _ext: &str) -> Result> { + fn get_file_type(&self, _ext: &str) -> Result> { not_impl_err!("Registered file types are not supported") } From 487da1387d36c7bf4593998b66a049b559e20fb6 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Wed, 19 Jun 2024 22:13:56 -0400 Subject: [PATCH 08/20] fix some tests --- .../core/src/datasource/file_format/parquet.rs | 4 +++- datafusion/core/src/physical_planner.rs | 12 ++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index a3932d116f9a..516e4875a64b 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -119,6 +119,7 @@ impl FileFormatFactory for ParquetFormatFactory{ }, Some(parquet_options) => { + println!("{:?}", parquet_options); let mut parquet_options = parquet_options.clone(); for (k, v) in format_options{ parquet_options.set(k, v)?; @@ -126,7 +127,8 @@ impl FileFormatFactory for ParquetFormatFactory{ parquet_options } }; - + + println!("{:?}", parquet_options); Ok(Arc::new(ParquetFormat::default().with_options(parquet_options))) } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 6b060fed5a4a..a0ec2311e717 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -22,6 +22,7 @@ use 
std::collections::HashMap; use std::fmt::Write; use std::sync::Arc; +use crate::datasource::file_format::file_type_to_format; use crate::datasource::listing::ListingTableUrl; use crate::datasource::physical_plan::FileSinkConfig; use crate::datasource::source_as_provider; @@ -69,7 +70,7 @@ use arrow_array::builder::StringBuilder; use arrow_array::RecordBatch; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ - config_datafusion_err, exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue + exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue }; use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ @@ -777,13 +778,8 @@ impl DefaultPhysicalPlanner { table_partition_cols, overwrite: false, }; - // TODO, how not to lose options inside FileType (which is actually FileFormat) - let sink_format = session_state - .get_file_format_factory(&file_type.get_ext()) - .ok_or(config_datafusion_err!( - "Unable to copy to format {}! Could not find FileFormat.", - file_type.get_ext() - ))? + + let sink_format = file_type_to_format(file_type)? .create(session_state, &source_option_tuples)?; sink_format From 9527f8288e8f4844502bfb1f042a0f7f6633c2f2 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Fri, 21 Jun 2024 21:40:04 -0400 Subject: [PATCH 09/20] test fixes --- datafusion/common/src/config.rs | 34 +- datafusion/common/src/file_options/mod.rs | 10 +- datafusion/core/src/dataframe/parquet.rs | 4 +- .../core/src/datasource/file_format/arrow.rs | 37 +- .../core/src/datasource/file_format/avro.rs | 33 +- .../core/src/datasource/file_format/csv.rs | 52 ++- .../file_format/file_compression_type.rs | 2 +- .../core/src/datasource/file_format/json.rs | 51 ++- .../core/src/datasource/file_format/mod.rs | 12 +- .../src/datasource/file_format/parquet.rs | 62 +-- .../core/src/datasource/listing/table.rs | 9 +- .../core/src/execution/session_state.rs | 41 +- datafusion/core/src/physical_planner.rs | 5 +- datafusion/expr/src/logical_plan/plan.rs | 2 +- .../proto/src/logical_plan/file_formats.rs | 399 ++++++++++++++++++ datafusion/proto/src/logical_plan/mod.rs | 7 +- .../tests/cases/roundtrip_logical_plan.rs | 33 +- datafusion/sql/tests/common/mod.rs | 32 +- 18 files changed, 664 insertions(+), 161 deletions(-) create mode 100644 datafusion/proto/src/logical_plan/file_formats.rs diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 02ab9045d6de..85d30108b165 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1119,11 +1119,11 @@ macro_rules! extensions_options { /// These file types have special built in behavior for configuration. /// Use TableOptions::Extensions for configuring other file types. #[derive(Debug, Clone)] -pub enum ConfigFileType{ +pub enum ConfigFileType { CSV, #[cfg(feature = "parquet")] PARQUET, - JSON + JSON, } /// Represents the configuration options available for handling different table formats within a data processing application. @@ -1159,9 +1159,18 @@ impl ConfigField for TableOptions { /// If a format is selected, it visits only the settings relevant to that format. Otherwise, /// it visits all available format settings. 
fn visit(&self, v: &mut V, _key_prefix: &str, _description: &'static str) { - self.csv.visit(v, "csv", ""); - self.parquet.visit(v, "parquet", ""); - self.json.visit(v, "json", ""); + if let Some(file_type) = &self.current_format { + match file_type { + #[cfg(feature = "parquet")] + ConfigFileType::PARQUET => self.parquet.visit(v, "format", ""), + ConfigFileType::CSV => self.csv.visit(v, "format", ""), + ConfigFileType::JSON => self.json.visit(v, "format", ""), + } + } else { + self.csv.visit(v, "csv", ""); + self.parquet.visit(v, "parquet", ""); + self.json.visit(v, "json", ""); + } } /// Sets a configuration value for a specific key within `TableOptions`. @@ -1354,7 +1363,7 @@ impl TableOptions { } let mut v = Visitor(vec![]); - //self.visit(&mut v, "format", ""); + self.visit(&mut v, "format", ""); v.0.extend(self.extensions.0.values().flat_map(|e| e.0.entries())); v.0 @@ -1687,7 +1696,8 @@ mod tests { use std::collections::HashMap; use crate::config::{ - ConfigEntry, ConfigExtension, ExtensionOptions, Extensions, TableOptions, + ConfigEntry, ConfigExtension, ConfigFileType, ExtensionOptions, Extensions, + TableOptions, }; #[derive(Default, Debug, Clone)] @@ -1746,7 +1756,7 @@ mod tests { let mut extension = Extensions::new(); extension.insert(TestExtensionConfig::default()); let mut table_config = TableOptions::new().with_extensions(extension); - //table_config.set_file_format(FileType::CSV); + table_config.set_file_format(ConfigFileType::CSV); table_config.set("format.delimiter", ";").unwrap(); assert_eq!(table_config.csv.delimiter, b';'); table_config.set("test.bootstrap.servers", "asd").unwrap(); @@ -1763,7 +1773,7 @@ mod tests { #[test] fn csv_u8_table_options() { let mut table_config = TableOptions::new(); - //table_config.set_file_format(FileType::CSV); + table_config.set_file_format(ConfigFileType::CSV); table_config.set("format.delimiter", ";").unwrap(); assert_eq!(table_config.csv.delimiter as char, ';'); table_config.set("format.escape", "\"").unwrap(); @@ -1776,7 +1786,7 @@ mod tests { #[test] fn parquet_table_options() { let mut table_config = TableOptions::new(); - //table_config.set_file_format(FileType::PARQUET); + table_config.set_file_format(ConfigFileType::PARQUET); table_config .set("format.bloom_filter_enabled::col1", "true") .unwrap(); @@ -1790,7 +1800,7 @@ mod tests { #[test] fn parquet_table_options_config_entry() { let mut table_config = TableOptions::new(); - //table_config.set_file_format(FileType::PARQUET); + table_config.set_file_format(ConfigFileType::PARQUET); table_config .set("format.bloom_filter_enabled::col1", "true") .unwrap(); @@ -1804,7 +1814,7 @@ mod tests { #[test] fn parquet_table_options_config_metadata_entry() { let mut table_config = TableOptions::new(); - //table_config.set_file_format(FileType::PARQUET); + table_config.set_file_format(ConfigFileType::PARQUET); table_config.set("format.metadata::key1", "").unwrap(); table_config.set("format.metadata::key2", "value2").unwrap(); table_config diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs index 46166946ce39..955d95616917 100644 --- a/datafusion/common/src/file_options/mod.rs +++ b/datafusion/common/src/file_options/mod.rs @@ -32,7 +32,7 @@ mod tests { use super::parquet_writer::ParquetWriterOptions; use crate::{ - config::TableOptions, + config::{ConfigFileType, TableOptions}, file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, parsers::CompressionTypeVariant, Result, @@ -76,7 +76,7 @@ mod tests { 
option_map.insert("format.bloom_filter_ndv".to_owned(), "123".to_owned()); let mut table_config = TableOptions::new(); - //table_config.set_file_format(FileType::PARQUET); + table_config.set_file_format(ConfigFileType::PARQUET); table_config.alter_with_string_hash_map(&option_map)?; let parquet_options = ParquetWriterOptions::try_from(&table_config.parquet)?; @@ -181,7 +181,7 @@ mod tests { ); let mut table_config = TableOptions::new(); - //table_config.set_file_format(FileType::PARQUET); + table_config.set_file_format(ConfigFileType::PARQUET); table_config.alter_with_string_hash_map(&option_map)?; let parquet_options = ParquetWriterOptions::try_from(&table_config.parquet)?; @@ -284,7 +284,7 @@ mod tests { option_map.insert("format.delimiter".to_owned(), ";".to_owned()); let mut table_config = TableOptions::new(); - //table_config.set_file_format(FileType::CSV); + table_config.set_file_format(ConfigFileType::CSV); table_config.alter_with_string_hash_map(&option_map)?; let csv_options = CsvWriterOptions::try_from(&table_config.csv)?; @@ -306,7 +306,7 @@ mod tests { option_map.insert("format.compression".to_owned(), "gzip".to_owned()); let mut table_config = TableOptions::new(); - //table_config.set_file_format(FileType::JSON); + table_config.set_file_format(ConfigFileType::JSON); table_config.alter_with_string_hash_map(&option_map)?; let json_options = JsonWriterOptions::try_from(&table_config.json)?; diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index f40d2ab516bb..1abb550f5c98 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -17,7 +17,9 @@ use std::sync::Arc; -use crate::datasource::file_format::{format_as_file_type, parquet::ParquetFormatFactory}; +use crate::datasource::file_format::{ + format_as_file_type, parquet::ParquetFormatFactory, +}; use super::{ DataFrame, DataFrameWriteOptions, DataFusionError, LogicalPlanBuilder, RecordBatch, diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index abe19cfe5ec2..478a11d7e76e 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -43,7 +43,9 @@ use arrow::ipc::writer::IpcWriteOptions; use arrow::ipc::{root_as_message, CompressionType}; use arrow_schema::{ArrowError, Schema, SchemaRef}; use datafusion_common::parsers::CompressionTypeVariant; -use datafusion_common::{not_impl_err, DataFusionError, GetExt, Statistics, DEFAULT_ARROW_EXTENSION}; +use datafusion_common::{ + not_impl_err, DataFusionError, GetExt, Statistics, DEFAULT_ARROW_EXTENSION, +}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; use datafusion_physical_plan::insert::{DataSink, DataSinkExec}; @@ -64,29 +66,33 @@ const INITIAL_BUFFER_BYTES: usize = 1048576; /// If the buffered Arrow data exceeds this size, it is flushed to object store const BUFFER_FLUSH_BYTES: usize = 1024000; +#[derive(Default)] /// Factory struct used to create [ArrowFormat] pub struct ArrowFormatFactory; -impl ArrowFormatFactory{ +impl ArrowFormatFactory { /// Creates an instance of [ArrowFormatFactory] - pub fn new() -> Self{ - Self{} + pub fn new() -> Self { + Self {} } } - -impl FileFormatFactory for ArrowFormatFactory{ - fn create(&self, _state: &SessionState, _format_options: &HashMap) -> Result>{ - Ok(Arc::new(ArrowFormat::default())) +impl FileFormatFactory for 
ArrowFormatFactory {
    fn create(
        &self,
        _state: &SessionState,
        _format_options: &HashMap<String, String>,
    ) -> Result<Arc<dyn FileFormat>> {
        Ok(Arc::new(ArrowFormat))
    }

    fn default(&self) -> Arc<dyn FileFormat> {
        Arc::new(ArrowFormat)
    }
}

impl GetExt for ArrowFormatFactory {
    fn get_ext(&self) -> String {
        // Removes the dot, i.e. ".arrow" -> "arrow"
        DEFAULT_ARROW_EXTENSION[1..].to_string()
    }
}
@@ -102,11 +108,14 @@ impl FileFormat for ArrowFormat {
        self
    }

-    fn get_ext(&self) -> String{
+    fn get_ext(&self) -> String {
        ArrowFormatFactory::new().get_ext()
    }

-    fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result<String>{
+    fn get_ext_with_compression(
+        &self,
+        file_compression_type: &FileCompressionType,
+    ) -> Result<String> {
        let ext = self.get_ext();
        match file_compression_type.get_variant() {
            CompressionTypeVariant::UNCOMPRESSED => Ok(ext),
diff --git a/datafusion/core/src/datasource/file_format/avro.rs b/datafusion/core/src/datasource/file_format/avro.rs
index 5088034e06b7..f4f9adcba7ed 100644
--- a/datafusion/core/src/datasource/file_format/avro.rs
+++ b/datafusion/core/src/datasource/file_format/avro.rs
@@ -41,29 +41,33 @@ use crate::execution::context::SessionState;
use crate::physical_plan::ExecutionPlan;
use crate::physical_plan::Statistics;

+#[derive(Default)]
/// Factory struct used to create [AvroFormat]
pub struct AvroFormatFactory;

-impl AvroFormatFactory{
+impl AvroFormatFactory {
    /// Creates an instance of [AvroFormatFactory]
-    pub fn new() -> Self{
-        Self{}
+    pub fn new() -> Self {
+        Self {}
    }
}

-impl FileFormatFactory for AvroFormatFactory{
-    fn create(&self, _state: &SessionState, _format_options: &HashMap<String, String>) -> Result<Arc<dyn FileFormat>>{
-        Ok(Arc::new(AvroFormat::default()))
+impl FileFormatFactory for AvroFormatFactory {
+    fn create(
+        &self,
+        _state: &SessionState,
+        _format_options: &HashMap<String, String>,
+    ) -> Result<Arc<dyn FileFormat>> {
+        Ok(Arc::new(AvroFormat))
    }

-    fn default(&self) -> Arc<dyn FileFormat>{
-        Arc::new(AvroFormat::default())
+    fn default(&self) -> Arc<dyn FileFormat> {
+        Arc::new(AvroFormat)
    }
}

-impl GetExt for AvroFormatFactory{
-    fn get_ext(&self) -> String{
+impl GetExt for AvroFormatFactory {
+    fn get_ext(&self) -> String {
        // Removes the dot, i.e. 
".parquet" -> "parquet" DEFAULT_AVRO_EXTENSION[1..].to_string() } @@ -79,11 +83,14 @@ impl FileFormat for AvroFormat { self } - fn get_ext(&self) -> String{ + fn get_ext(&self) -> String { AvroFormatFactory::new().get_ext() } - fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result{ + fn get_ext_with_compression( + &self, + file_compression_type: &FileCompressionType, + ) -> Result { let ext = self.get_ext(); match file_compression_type.get_variant() { CompressionTypeVariant::UNCOMPRESSED => Ok(ext), diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index c08a101acb60..3171a473b80f 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -42,7 +42,9 @@ use arrow::datatypes::SchemaRef; use arrow::datatypes::{DataType, Field, Fields, Schema}; use datafusion_common::config::{ConfigField, ConfigFileType, CsvOptions}; use datafusion_common::file_options::csv_writer::CsvWriterOptions; -use datafusion_common::{exec_err, not_impl_err, DataFusionError, GetExt, DEFAULT_CSV_EXTENSION}; +use datafusion_common::{ + exec_err, not_impl_err, DataFusionError, GetExt, DEFAULT_CSV_EXTENSION, +}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; use datafusion_physical_plan::metrics::MetricsSet; @@ -53,32 +55,39 @@ use futures::stream::BoxStream; use futures::{pin_mut, Stream, StreamExt, TryStreamExt}; use object_store::{delimited::newline_delimited_stream, ObjectMeta, ObjectStore}; -/// Factory struct used to create [AvroFormat] -pub struct CsvFormatFactory{ - options: Option +#[derive(Default)] +/// Factory struct used to create [CsvFormatFactory] +pub struct CsvFormatFactory { + options: Option, } -impl CsvFormatFactory{ +impl CsvFormatFactory { /// Creates an instance of [CsvFormatFactory] - pub fn new() -> Self{ - Self{ options: None } + pub fn new() -> Self { + Self { options: None } } /// Creates an instance of [CsvFormatFactory] with customized default options - pub fn new_with_options(options: CsvOptions) -> Self{ - Self{ options: Some(options) } + pub fn new_with_options(options: CsvOptions) -> Self { + Self { + options: Some(options), + } } } -impl FileFormatFactory for CsvFormatFactory{ - fn create(&self, state: &SessionState, format_options: &HashMap) -> Result>{ - let csv_options = match &self.options{ +impl FileFormatFactory for CsvFormatFactory { + fn create( + &self, + state: &SessionState, + format_options: &HashMap, + ) -> Result> { + let csv_options = match &self.options { None => { let mut table_options = state.default_table_options(); table_options.set_file_format(ConfigFileType::CSV); - table_options.alter_with_string_hash_map(&format_options)?; + table_options.alter_with_string_hash_map(format_options)?; table_options.csv - }, + } Some(csv_options) => { let mut csv_options = csv_options.clone(); for (k, v) in format_options { @@ -87,17 +96,17 @@ impl FileFormatFactory for CsvFormatFactory{ csv_options } }; - + Ok(Arc::new(CsvFormat::default().with_options(csv_options))) } - fn default(&self) -> Arc{ + fn default(&self) -> Arc { Arc::new(CsvFormat::default()) } } -impl GetExt for CsvFormatFactory{ - fn get_ext(&self) -> String{ +impl GetExt for CsvFormatFactory { + fn get_ext(&self) -> String { // Removes the dot, i.e. 
".parquet" -> "parquet" DEFAULT_CSV_EXTENSION[1..].to_string() } @@ -256,11 +265,14 @@ impl FileFormat for CsvFormat { self } - fn get_ext(&self) -> String{ + fn get_ext(&self) -> String { CsvFormatFactory::new().get_ext() } - fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result{ + fn get_ext_with_compression( + &self, + file_compression_type: &FileCompressionType, + ) -> Result { let ext = self.get_ext(); Ok(format!("{}{}", ext, file_compression_type.get_ext())) } diff --git a/datafusion/core/src/datasource/file_format/file_compression_type.rs b/datafusion/core/src/datasource/file_format/file_compression_type.rs index 4f7e213f0ced..a054094822d0 100644 --- a/datafusion/core/src/datasource/file_format/file_compression_type.rs +++ b/datafusion/core/src/datasource/file_format/file_compression_type.rs @@ -113,7 +113,7 @@ impl FileCompressionType { }; /// Read only access to self.variant - pub fn get_variant(&self) -> &CompressionTypeVariant{ + pub fn get_variant(&self) -> &CompressionTypeVariant { &self.variant } diff --git a/datafusion/core/src/datasource/file_format/json.rs b/datafusion/core/src/datasource/file_format/json.rs index 911f55a7a896..223ef05fa67a 100644 --- a/datafusion/core/src/datasource/file_format/json.rs +++ b/datafusion/core/src/datasource/file_format/json.rs @@ -54,52 +54,58 @@ use async_trait::async_trait; use bytes::{Buf, Bytes}; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; -/// Factory struct used to create [AvroFormat] -pub struct JsonFormatFactory{ - options: Option +#[derive(Default)] +/// Factory struct used to create [JsonFormat] +pub struct JsonFormatFactory { + options: Option, } -impl JsonFormatFactory{ +impl JsonFormatFactory { /// Creates an instance of [JsonFormatFactory] - pub fn new() -> Self{ - Self{ options: None } + pub fn new() -> Self { + Self { options: None } } /// Creates an instance of [JsonFormatFactory] with customized default options - pub fn new_with_options(options: JsonOptions) -> Self{ - Self{ options: Some(options) } + pub fn new_with_options(options: JsonOptions) -> Self { + Self { + options: Some(options), + } } } -impl FileFormatFactory for JsonFormatFactory{ - fn create(&self, state: &SessionState, format_options: &HashMap) -> Result>{ - - let json_options = match &self.options{ +impl FileFormatFactory for JsonFormatFactory { + fn create( + &self, + state: &SessionState, + format_options: &HashMap, + ) -> Result> { + let json_options = match &self.options { None => { let mut table_options = state.default_table_options(); table_options.set_file_format(ConfigFileType::JSON); - table_options.alter_with_string_hash_map(&format_options)?; + table_options.alter_with_string_hash_map(format_options)?; table_options.json - }, + } Some(json_options) => { let mut json_options = json_options.clone(); - for (k, v) in format_options{ + for (k, v) in format_options { json_options.set(k, v)?; } json_options } }; - + Ok(Arc::new(JsonFormat::default().with_options(json_options))) } - fn default(&self) -> Arc{ + fn default(&self) -> Arc { Arc::new(JsonFormat::default()) } } -impl GetExt for JsonFormatFactory{ - fn get_ext(&self) -> String{ +impl GetExt for JsonFormatFactory { + fn get_ext(&self) -> String { // Removes the dot, i.e. 
".parquet" -> "parquet" DEFAULT_JSON_EXTENSION[1..].to_string() } @@ -147,11 +153,14 @@ impl FileFormat for JsonFormat { self } - fn get_ext(&self) -> String{ + fn get_ext(&self) -> String { JsonFormatFactory::new().get_ext() } - fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result{ + fn get_ext_with_compression( + &self, + file_compression_type: &FileCompressionType, + ) -> Result { let ext = self.get_ext(); Ok(format!("{}{}", ext, file_compression_type.get_ext())) } diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index e8e64144e6bd..bfea42652a7a 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -131,8 +131,8 @@ pub trait FileFormat: Send + Sync + fmt::Debug { } } -/// A container of [FileFormat] which also implements [FileType]. -/// This enables converting an Arc to an Arc. +/// A container of [FileFormatFactory] which also implements [FileType]. +/// This enables converting a dyn FileFormat to a dyn FileType. /// The former trait is a superset of the latter trait, which includes execution time /// relevant methods. [FileType] is only used in logical planning and only implements /// the subset of methods required during logical planning. @@ -143,7 +143,9 @@ pub struct DefaultFileFormat { impl DefaultFileFormat { /// Constructs a [DefaultFileFormat] wrapper from a [FileFormatFactory] pub fn new(file_format_factory: Arc) -> Self { - Self { file_format_factory } + Self { + file_format_factory, + } } } @@ -169,7 +171,9 @@ impl GetExt for DefaultFileFormat { pub fn format_as_file_type( file_format_factory: Arc, ) -> Arc { - Arc::new(DefaultFileFormat { file_format_factory }) + Arc::new(DefaultFileFormat { + file_format_factory, + }) } /// Converts a [FileType] to a [FileFormatFactory]. 
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 516e4875a64b..3fe250699c96 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -49,7 +49,8 @@ use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::stats::Precision; use datafusion_common::{ - exec_err, internal_datafusion_err, not_impl_err, DataFusionError, GetExt, DEFAULT_PARQUET_EXTENSION + exec_err, internal_datafusion_err, not_impl_err, DataFusionError, GetExt, + DEFAULT_PARQUET_EXTENSION, }; use datafusion_common_runtime::SpawnedTask; use datafusion_execution::TaskContext; @@ -90,56 +91,60 @@ const INITIAL_BUFFER_BYTES: usize = 1048576; /// this size, it is flushed to object store const BUFFER_FLUSH_BYTES: usize = 1024000; -/// Factory struct used to create [AvroFormat] -pub struct ParquetFormatFactory{ - options: Option +#[derive(Default)] +/// Factory struct used to create [ParquetFormat] +pub struct ParquetFormatFactory { + options: Option, } -impl ParquetFormatFactory{ +impl ParquetFormatFactory { /// Creates an instance of [ParquetFormatFactory] - pub fn new() -> Self{ - Self{ options: None } + pub fn new() -> Self { + Self { options: None } } - + /// Creates an instance of [ParquetFormatFactory] with customized default options - pub fn new_with_options(options: TableParquetOptions) -> Self{ - Self{ options: Some(options) } + pub fn new_with_options(options: TableParquetOptions) -> Self { + Self { + options: Some(options), + } } } -impl FileFormatFactory for ParquetFormatFactory{ - fn create(&self, state: &SessionState, format_options: &std::collections::HashMap) -> Result>{ - - let parquet_options = match &self.options{ +impl FileFormatFactory for ParquetFormatFactory { + fn create( + &self, + state: &SessionState, + format_options: &std::collections::HashMap, + ) -> Result> { + let parquet_options = match &self.options { None => { let mut table_options = state.default_table_options(); table_options.set_file_format(ConfigFileType::PARQUET); - table_options.alter_with_string_hash_map(&format_options)?; + table_options.alter_with_string_hash_map(format_options)?; table_options.parquet - - }, + } Some(parquet_options) => { - println!("{:?}", parquet_options); let mut parquet_options = parquet_options.clone(); - for (k, v) in format_options{ + for (k, v) in format_options { parquet_options.set(k, v)?; } parquet_options } }; - println!("{:?}", parquet_options); - - Ok(Arc::new(ParquetFormat::default().with_options(parquet_options))) + Ok(Arc::new( + ParquetFormat::default().with_options(parquet_options), + )) } - fn default(&self) -> Arc{ + fn default(&self) -> Arc { Arc::new(ParquetFormat::default()) } } -impl GetExt for ParquetFormatFactory{ - fn get_ext(&self) -> String{ +impl GetExt for ParquetFormatFactory { + fn get_ext(&self) -> String { // Removes the dot, i.e. 
".parquet" -> "parquet" DEFAULT_PARQUET_EXTENSION[1..].to_string() } @@ -246,11 +251,14 @@ impl FileFormat for ParquetFormat { self } - fn get_ext(&self) -> String{ + fn get_ext(&self) -> String { ParquetFormatFactory::new().get_ext() } - fn get_ext_with_compression(&self, file_compression_type: &FileCompressionType) -> Result{ + fn get_ext_with_compression( + &self, + file_compression_type: &FileCompressionType, + ) -> Result { let ext = self.get_ext(); match file_compression_type.get_variant() { CompressionTypeVariant::UNCOMPRESSED => Ok(ext), diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 35a0859d7bd3..74aca82b3ee6 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -24,7 +24,6 @@ use std::{any::Any, sync::Arc}; use super::helpers::{expr_applicable_for_cols, pruned_partition_list, split_files}; use super::PartitionedFile; -#[cfg(feature = "parquet")] use crate::datasource::{ create_ordering, get_statistics_with_limit, TableProvider, TableType, }; @@ -43,8 +42,8 @@ use crate::{ use arrow::datatypes::{DataType, Field, SchemaBuilder, SchemaRef}; use arrow_schema::Schema; use datafusion_common::{ - config_datafusion_err, internal_err, plan_err, - project_schema, Constraints, SchemaExt, ToDFSchema, + config_datafusion_err, internal_err, plan_err, project_schema, Constraints, + SchemaExt, ToDFSchema, }; use datafusion_execution::cache::cache_manager::FileStatisticsCache; use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache; @@ -153,7 +152,7 @@ impl ListingTableConfig { .ok_or(config_datafusion_err!( "No file_format found with extension {file_extension}" ))? - .create(&state, &HashMap::new())?; + .create(state, &HashMap::new())?; let listing_options = ListingOptions::new(file_format) .with_file_extension(file_extension) @@ -1238,7 +1237,7 @@ mod tests { register_test_store(&ctx, &[(&path, 100)]); let opt = ListingOptions::new(Arc::new(AvroFormat {})) - .with_file_extension(AvroFormat::default().get_ext()) + .with_file_extension(AvroFormat.get_ext()) .with_table_partition_cols(vec![(String::from("p1"), DataType::Utf8)]) .with_target_partitions(4); diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 0969bca9364c..0a5e4c2c2398 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -29,6 +29,7 @@ use crate::datasource::file_format::arrow::ArrowFormatFactory; use crate::datasource::file_format::avro::AvroFormatFactory; use crate::datasource::file_format::csv::CsvFormatFactory; use crate::datasource::file_format::json::JsonFormatFactory; +#[cfg(feature = "parquet")] use crate::datasource::file_format::parquet::ParquetFormatFactory; use crate::datasource::file_format::{format_as_file_type, FileFormatFactory}; use crate::datasource::function::{TableFunction, TableFunctionImpl}; @@ -206,8 +207,6 @@ impl SessionState { table_factories.insert("AVRO".into(), Arc::new(DefaultTableFactory::new())); table_factories.insert("ARROW".into(), Arc::new(DefaultTableFactory::new())); - - if config.create_default_catalog_and_schema() { let default_catalog = MemoryCatalogProvider::new(); @@ -253,31 +252,32 @@ impl SessionState { }; #[cfg(feature = "parquet")] - match new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false){ + match new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false) + { Ok(_) => (), - 
Err(e) => println!("Unable to register default ParquetFormat: {e}") + Err(e) => log::info!("Unable to register default ParquetFormat: {e}"), }; - match new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false){ + match new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false) { Ok(_) => (), - Err(e) => log::info!("Unable to register default JsonFormat: {e}") + Err(e) => log::info!("Unable to register default JsonFormat: {e}"), }; - match new_self.register_file_format(Arc::new(CsvFormatFactory::new()), false){ + match new_self.register_file_format(Arc::new(CsvFormatFactory::new()), false) { Ok(_) => (), - Err(e) => log::info!("Unable to register default CsvFormat: {e}") + Err(e) => log::info!("Unable to register default CsvFormat: {e}"), }; - match new_self.register_file_format(Arc::new(AvroFormatFactory::new()), false){ + match new_self.register_file_format(Arc::new(AvroFormatFactory::new()), false) { Ok(_) => (), - Err(e) => log::info!("Unable to register default AvroFormat: {e}") + Err(e) => log::info!("Unable to register default AvroFormat: {e}"), }; - match new_self.register_file_format(Arc::new(ArrowFormatFactory::new()), false){ + match new_self.register_file_format(Arc::new(ArrowFormatFactory::new()), false) { Ok(_) => (), - Err(e) => log::info!("Unable to register default ArrowFormat: {e}") + Err(e) => log::info!("Unable to register default ArrowFormat: {e}"), }; - + // register built in functions functions::register_all(&mut new_self) .expect("can not register built in functions"); @@ -868,14 +868,14 @@ impl SessionState { self.table_options.extensions.insert(extension) } - /// Adds or updates a [FileType] which can be used with COPY TO or CREATE EXTERNAL TABLE statements for reading + /// Adds or updates a [FileFormatFactory] which can be used with COPY TO or CREATE EXTERNAL TABLE statements for reading /// and writing files of custom formats. pub fn register_file_format( &mut self, file_format: Arc, overwrite: bool, ) -> Result<(), DataFusionError> { - let ext = file_format.get_ext(); + let ext = file_format.get_ext().to_lowercase(); match (self.file_formats.entry(ext.clone()), overwrite){ (Entry::Vacant(e), _) => {e.insert(file_format);}, (Entry::Occupied(mut e), true) => {e.insert(file_format);}, @@ -885,12 +885,12 @@ impl SessionState { } /// Retrieves a [FileFormatFactory] based on file extension which has been registered - /// via SessionContext::register_file_format + /// via SessionContext::register_file_format. Extensions are not case sensitive. 
pub fn get_file_format_factory( &self, ext: &str, ) -> Option> { - self.file_formats.get(ext).cloned() + self.file_formats.get(&ext.to_lowercase()).cloned() } /// Get a new TaskContext to run in this session @@ -1036,13 +1036,10 @@ impl<'a> ContextProvider for SessionContextProvider<'a> { self.state.window_functions().keys().cloned().collect() } - fn get_file_type( - &self, - ext: &str, - ) -> datafusion_common::Result> { + fn get_file_type(&self, ext: &str) -> datafusion_common::Result> { self.state .file_formats - .get(ext) + .get(&ext.to_lowercase()) .ok_or(plan_datafusion_err!( "There is no registered file format with ext {ext}" )) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index a0ec2311e717..491d3b81ebbc 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -70,7 +70,8 @@ use arrow_array::builder::StringBuilder; use arrow_array::RecordBatch; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ - exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, ScalarValue + exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, + ScalarValue, }; use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ @@ -780,7 +781,7 @@ impl DefaultPhysicalPlanner { }; let sink_format = file_type_to_format(file_type)? - .create(session_state, &source_option_tuples)?; + .create(session_state, source_option_tuples)?; sink_format .create_writer_physical_plan(input_exec, session_state, config, None) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index bf8a65c39aec..4c3973a4552e 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1760,7 +1760,7 @@ impl LogicalPlan { .collect::>() .join(", "); - write!(f, "CopyTo: file_type={} output_url={output_url} options: ({op_str})", file_type.get_ext()) + write!(f, "CopyTo: format={} output_url={output_url} options: ({op_str})", file_type.get_ext()) } LogicalPlan::Ddl(ddl) => { write!(f, "{}", ddl.display()) diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs new file mode 100644 index 000000000000..31102b728ec9 --- /dev/null +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -0,0 +1,399 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
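+
+// A sketch of how these placeholder codecs are meant to be used (hypothetical
+// call site; `plan` and `ctx` are assumed to be in scope, and the helper names
+// come from the round-trip tests later in this series):
+//
+//     let codec = CsvLogicalExtensionCodec {};
+//     let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?;
+//     let logical_round_trip =
+//         logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?;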
+ +use std::sync::Arc; + +use datafusion::{ + datasource::file_format::{ + arrow::ArrowFormatFactory, csv::CsvFormatFactory, json::JsonFormatFactory, + parquet::ParquetFormatFactory, FileFormatFactory, + }, + prelude::SessionContext, +}; +use datafusion_common::not_impl_err; + +use super::LogicalExtensionCodec; + +#[derive(Debug)] +pub struct CsvLogicalExtensionCodec; + +// TODO! This is a placeholder for now and needs to be implemented for real. +impl LogicalExtensionCodec for CsvLogicalExtensionCodec { + fn try_decode( + &self, + _buf: &[u8], + _inputs: &[datafusion_expr::LogicalPlan], + _ctx: &datafusion::prelude::SessionContext, + ) -> datafusion_common::Result { + not_impl_err!("Method not implemented") + } + + fn try_encode( + &self, + _node: &datafusion_expr::Extension, + _buf: &mut Vec, + ) -> datafusion_common::Result<()> { + not_impl_err!("Method not implemented") + } + + fn try_decode_table_provider( + &self, + _buf: &[u8], + _schema: arrow::datatypes::SchemaRef, + _ctx: &datafusion::prelude::SessionContext, + ) -> datafusion_common::Result< + std::sync::Arc, + > { + not_impl_err!("Method not implemented") + } + + fn try_encode_table_provider( + &self, + _node: std::sync::Arc, + _buf: &mut Vec, + ) -> datafusion_common::Result<()> { + not_impl_err!("Method not implemented") + } + + fn try_decode_file_format( + &self, + __buf: &[u8], + __ctx: &SessionContext, + ) -> datafusion_common::Result> { + Ok(Arc::new(CsvFormatFactory::new())) + } + + fn try_encode_file_format( + &self, + __buf: &[u8], + __node: Arc, + ) -> datafusion_common::Result<()> { + Ok(()) + } + + fn try_decode_udf( + &self, + name: &str, + __buf: &[u8], + ) -> datafusion_common::Result> { + not_impl_err!("LogicalExtensionCodec is not provided for scalar function {name}") + } + + fn try_encode_udf( + &self, + __node: &datafusion_expr::ScalarUDF, + __buf: &mut Vec, + ) -> datafusion_common::Result<()> { + Ok(()) + } +} + +#[derive(Debug)] +pub struct JsonLogicalExtensionCodec; + +// TODO! This is a placeholder for now and needs to be implemented for real. 
+impl LogicalExtensionCodec for JsonLogicalExtensionCodec { + fn try_decode( + &self, + _buf: &[u8], + _inputs: &[datafusion_expr::LogicalPlan], + _ctx: &datafusion::prelude::SessionContext, + ) -> datafusion_common::Result { + not_impl_err!("Method not implemented") + } + + fn try_encode( + &self, + _node: &datafusion_expr::Extension, + _buf: &mut Vec, + ) -> datafusion_common::Result<()> { + not_impl_err!("Method not implemented") + } + + fn try_decode_table_provider( + &self, + _buf: &[u8], + _schema: arrow::datatypes::SchemaRef, + _ctx: &datafusion::prelude::SessionContext, + ) -> datafusion_common::Result< + std::sync::Arc, + > { + not_impl_err!("Method not implemented") + } + + fn try_encode_table_provider( + &self, + _node: std::sync::Arc, + _buf: &mut Vec, + ) -> datafusion_common::Result<()> { + not_impl_err!("Method not implemented") + } + + fn try_decode_file_format( + &self, + __buf: &[u8], + __ctx: &SessionContext, + ) -> datafusion_common::Result> { + Ok(Arc::new(JsonFormatFactory::new())) + } + + fn try_encode_file_format( + &self, + __buf: &[u8], + __node: Arc, + ) -> datafusion_common::Result<()> { + Ok(()) + } + + fn try_decode_udf( + &self, + name: &str, + __buf: &[u8], + ) -> datafusion_common::Result> { + not_impl_err!("LogicalExtensionCodec is not provided for scalar function {name}") + } + + fn try_encode_udf( + &self, + __node: &datafusion_expr::ScalarUDF, + __buf: &mut Vec, + ) -> datafusion_common::Result<()> { + Ok(()) + } +} + +#[derive(Debug)] +pub struct ParquetLogicalExtensionCodec; + +// TODO! This is a placeholder for now and needs to be implemented for real. +impl LogicalExtensionCodec for ParquetLogicalExtensionCodec { + fn try_decode( + &self, + _buf: &[u8], + _inputs: &[datafusion_expr::LogicalPlan], + _ctx: &datafusion::prelude::SessionContext, + ) -> datafusion_common::Result { + not_impl_err!("Method not implemented") + } + + fn try_encode( + &self, + _node: &datafusion_expr::Extension, + _buf: &mut Vec, + ) -> datafusion_common::Result<()> { + not_impl_err!("Method not implemented") + } + + fn try_decode_table_provider( + &self, + _buf: &[u8], + _schema: arrow::datatypes::SchemaRef, + _ctx: &datafusion::prelude::SessionContext, + ) -> datafusion_common::Result< + std::sync::Arc, + > { + not_impl_err!("Method not implemented") + } + + fn try_encode_table_provider( + &self, + _node: std::sync::Arc, + _buf: &mut Vec, + ) -> datafusion_common::Result<()> { + not_impl_err!("Method not implemented") + } + + fn try_decode_file_format( + &self, + __buf: &[u8], + __ctx: &SessionContext, + ) -> datafusion_common::Result> { + Ok(Arc::new(ParquetFormatFactory::new())) + } + + fn try_encode_file_format( + &self, + __buf: &[u8], + __node: Arc, + ) -> datafusion_common::Result<()> { + Ok(()) + } + + fn try_decode_udf( + &self, + name: &str, + __buf: &[u8], + ) -> datafusion_common::Result> { + not_impl_err!("LogicalExtensionCodec is not provided for scalar function {name}") + } + + fn try_encode_udf( + &self, + __node: &datafusion_expr::ScalarUDF, + __buf: &mut Vec, + ) -> datafusion_common::Result<()> { + Ok(()) + } +} + +#[derive(Debug)] +pub struct ArrowLogicalExtensionCodec; + +// TODO! This is a placeholder for now and needs to be implemented for real. 
+impl LogicalExtensionCodec for ArrowLogicalExtensionCodec {
+    fn try_decode(
+        &self,
+        _buf: &[u8],
+        _inputs: &[datafusion_expr::LogicalPlan],
+        _ctx: &datafusion::prelude::SessionContext,
+    ) -> datafusion_common::Result<datafusion_expr::Extension> {
+        not_impl_err!("Method not implemented")
+    }
+
+    fn try_encode(
+        &self,
+        _node: &datafusion_expr::Extension,
+        _buf: &mut Vec<u8>,
+    ) -> datafusion_common::Result<()> {
+        not_impl_err!("Method not implemented")
+    }
+
+    fn try_decode_table_provider(
+        &self,
+        _buf: &[u8],
+        _schema: arrow::datatypes::SchemaRef,
+        _ctx: &datafusion::prelude::SessionContext,
+    ) -> datafusion_common::Result<
+        std::sync::Arc<dyn datafusion::datasource::TableProvider>,
+    > {
+        not_impl_err!("Method not implemented")
+    }
+
+    fn try_encode_table_provider(
+        &self,
+        _node: std::sync::Arc<dyn datafusion::datasource::TableProvider>,
+        _buf: &mut Vec<u8>,
+    ) -> datafusion_common::Result<()> {
+        not_impl_err!("Method not implemented")
+    }
+
+    fn try_decode_file_format(
+        &self,
+        __buf: &[u8],
+        __ctx: &SessionContext,
+    ) -> datafusion_common::Result<Arc<dyn FileFormatFactory>> {
+        Ok(Arc::new(ArrowFormatFactory::new()))
+    }
+
+    fn try_encode_file_format(
+        &self,
+        __buf: &[u8],
+        __node: Arc<dyn FileFormatFactory>,
+    ) -> datafusion_common::Result<()> {
+        Ok(())
+    }
+
+    fn try_decode_udf(
+        &self,
+        name: &str,
+        __buf: &[u8],
+    ) -> datafusion_common::Result<Arc<datafusion_expr::ScalarUDF>> {
+        not_impl_err!("LogicalExtensionCodec is not provided for scalar function {name}")
+    }
+
+    fn try_encode_udf(
+        &self,
+        __node: &datafusion_expr::ScalarUDF,
+        __buf: &mut Vec<u8>,
+    ) -> datafusion_common::Result<()> {
+        Ok(())
+    }
+}
+
+#[derive(Debug)]
+pub struct AvroLogicalExtensionCodec;
+
+// TODO! This is a placeholder for now and needs to be implemented for real.
+impl LogicalExtensionCodec for AvroLogicalExtensionCodec {
+    fn try_decode(
+        &self,
+        _buf: &[u8],
+        _inputs: &[datafusion_expr::LogicalPlan],
+        _ctx: &datafusion::prelude::SessionContext,
+    ) -> datafusion_common::Result<datafusion_expr::Extension> {
+        not_impl_err!("Method not implemented")
+    }
+
+    fn try_encode(
+        &self,
+        _node: &datafusion_expr::Extension,
+        _buf: &mut Vec<u8>,
+    ) -> datafusion_common::Result<()> {
+        not_impl_err!("Method not implemented")
+    }
+
+    fn try_decode_table_provider(
+        &self,
+        _buf: &[u8],
+        _schema: arrow::datatypes::SchemaRef,
+        _ctx: &datafusion::prelude::SessionContext,
+    ) -> datafusion_common::Result<
+        std::sync::Arc<dyn datafusion::datasource::TableProvider>,
+    > {
+        not_impl_err!("Method not implemented")
+    }
+
+    fn try_encode_table_provider(
+        &self,
+        _node: std::sync::Arc<dyn datafusion::datasource::TableProvider>,
+        _buf: &mut Vec<u8>,
+    ) -> datafusion_common::Result<()> {
+        not_impl_err!("Method not implemented")
+    }
+
+    fn try_decode_file_format(
+        &self,
+        __buf: &[u8],
+        __ctx: &SessionContext,
+    ) -> datafusion_common::Result<Arc<dyn FileFormatFactory>> {
+        Ok(Arc::new(
+            datafusion::datasource::file_format::avro::AvroFormatFactory::new(),
+        ))
+    }
+
+    fn try_encode_file_format(
+        &self,
+        __buf: &[u8],
+        __node: Arc<dyn FileFormatFactory>,
+    ) -> datafusion_common::Result<()> {
+        Ok(())
+    }
+
+    fn try_decode_udf(
+        &self,
+        name: &str,
+        __buf: &[u8],
+    ) -> datafusion_common::Result<Arc<datafusion_expr::ScalarUDF>> {
+        not_impl_err!("LogicalExtensionCodec is not provided for scalar function {name}")
+    }
+
+    fn try_encode_udf(
+        &self,
+        __node: &datafusion_expr::ScalarUDF,
+        __buf: &mut Vec<u8>,
+    ) -> datafusion_common::Result<()> {
+        Ok(())
+    }
+}
diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs
index 4732a6b5a596..cdb9d5260a0f 100644
--- a/datafusion/proto/src/logical_plan/mod.rs
+++ b/datafusion/proto/src/logical_plan/mod.rs
@@ -33,7 +33,9 @@ use crate::protobuf::{proto_error, FromProtoError, ToProtoError};
 use arrow::datatypes::{DataType, Schema, SchemaRef};
 #[cfg(feature = "parquet")]
 use
datafusion::datasource::file_format::parquet::ParquetFormat; -use datafusion::datasource::file_format::{file_type_to_format, format_as_file_type, FileFormatFactory}; +use datafusion::datasource::file_format::{ + file_type_to_format, format_as_file_type, FileFormatFactory, +}; use datafusion::{ datasource::{ file_format::{avro::AvroFormat, csv::CsvFormat, FileFormat}, @@ -66,6 +68,7 @@ use prost::Message; use self::to_proto::serialize_expr; +pub mod file_formats; pub mod from_proto; pub mod to_proto; @@ -121,7 +124,7 @@ pub trait LogicalExtensionCodec: Debug + Send + Sync { _buf: &[u8], _ctx: &SessionContext, ) -> Result> { - not_impl_err!("LogicalExtensionCoden is not provided for file format") + not_impl_err!("LogicalExtensionCodec is not provided for file format") } fn try_encode_file_format( diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 798b6a6f9dbc..e7123c717a11 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -26,9 +26,14 @@ use arrow::datatypes::{ DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionFields, UnionMode, }; +use datafusion::datasource::file_format::arrow::ArrowFormatFactory; use datafusion::datasource::file_format::csv::CsvFormatFactory; use datafusion::datasource::file_format::format_as_file_type; +use datafusion::datasource::file_format::parquet::ParquetFormatFactory; use datafusion_functions_aggregate::count::count_udaf; +use datafusion_proto::logical_plan::file_formats::{ + ArrowLogicalExtensionCodec, CsvLogicalExtensionCodec, ParquetLogicalExtensionCodec, +}; use prost::Message; use datafusion::datasource::provider::TableProviderFactory; @@ -333,8 +338,10 @@ async fn roundtrip_logical_plan_copy_to_sql_options() -> Result<()> { options: Default::default(), }); - let bytes = logical_plan_to_bytes(&plan)?; - let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; + let codec = CsvLogicalExtensionCodec {}; + let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?; + let logical_round_trip = + logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?; assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}")); Ok(()) @@ -359,7 +366,7 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> { parquet_format.global.dictionary_page_size_limit = 444; parquet_format.global.max_row_group_size = 555; - let file_type = format_as_file_type(Arc::new(CsvFormatFactory::new())); + let file_type = format_as_file_type(Arc::new(ParquetFormatFactory::new())); let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), @@ -369,8 +376,10 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> { options: Default::default(), }); - let bytes = logical_plan_to_bytes(&plan)?; - let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; + let codec = ParquetLogicalExtensionCodec {}; + let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?; + let logical_round_trip = + logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?; assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}")); match logical_round_trip { @@ -390,7 +399,7 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> { let input = create_csv_scan(&ctx).await?; - let file_type = format_as_file_type(Arc::new(CsvFormatFactory::new())); + let file_type = 
format_as_file_type(Arc::new(ArrowFormatFactory::new())); let plan = LogicalPlan::Copy(CopyTo { input: Arc::new(input), @@ -400,8 +409,10 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> { options: Default::default(), }); - let bytes = logical_plan_to_bytes(&plan)?; - let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; + let codec = ArrowLogicalExtensionCodec {}; + let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?; + let logical_round_trip = + logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?; assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}")); match logical_round_trip { @@ -443,8 +454,10 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> { options: Default::default(), }); - let bytes = logical_plan_to_bytes(&plan)?; - let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; + let codec = CsvLogicalExtensionCodec {}; + let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec)?; + let logical_round_trip = + logical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?; assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}")); match logical_round_trip { diff --git a/datafusion/sql/tests/common/mod.rs b/datafusion/sql/tests/common/mod.rs index 893678d6b374..f5caaefb3ea0 100644 --- a/datafusion/sql/tests/common/mod.rs +++ b/datafusion/sql/tests/common/mod.rs @@ -15,16 +15,39 @@ // specific language governing permissions and limitations // under the License. +use std::any::Any; #[cfg(test)] use std::collections::HashMap; +use std::fmt::Display; use std::{sync::Arc, vec}; use arrow_schema::*; use datafusion_common::config::ConfigOptions; -use datafusion_common::{plan_err, Result, TableReference}; +use datafusion_common::file_options::file_type::FileType; +use datafusion_common::{plan_err, GetExt, Result, TableReference}; use datafusion_expr::{AggregateUDF, ScalarUDF, TableSource, WindowUDF}; use datafusion_sql::planner::ContextProvider; +struct MockCsvType {} + +impl GetExt for MockCsvType { + fn get_ext(&self) -> String { + "csv".to_string() + } +} + +impl FileType for MockCsvType { + fn as_any(&self) -> &dyn Any { + self + } +} + +impl Display for MockCsvType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.get_ext()) + } +} + #[derive(Default)] pub(crate) struct MockContextProvider { options: ConfigOptions, @@ -191,6 +214,13 @@ impl ContextProvider for MockContextProvider { &self.options } + fn get_file_type( + &self, + _ext: &str, + ) -> Result> { + Ok(Arc::new(MockCsvType {})) + } + fn create_cte_work_table( &self, _name: &str, From 891afc3158e8b5b7433c70cd31ffd53671939c6c Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Fri, 21 Jun 2024 22:14:57 -0400 Subject: [PATCH 10/20] cli fix --- datafusion-cli/src/exec.rs | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index 855d6a7cbbc9..f45d0cd724c4 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -21,7 +21,6 @@ use std::collections::HashMap; use std::fs::File; use std::io::prelude::*; use std::io::BufReader; -use std::str::FromStr; use crate::helper::split_from_semicolon; use crate::print_format::PrintFormat; @@ -34,6 +33,7 @@ use crate::{ use datafusion::common::instant::Instant; use datafusion::common::plan_datafusion_err; +use datafusion::config::ConfigFileType; use datafusion::datasource::listing::ListingTableUrl; use 
datafusion::error::{DataFusionError, Result};
 use datafusion::logical_expr::{DdlStatement, LogicalPlan};
@@ -42,7 +42,6 @@ use datafusion::prelude::SessionContext;
 use datafusion::sql::parser::{DFParser, Statement};
 use datafusion::sql::sqlparser::dialect::dialect_from_str;
 
-use datafusion::common::FileType;
 use datafusion::sql::sqlparser;
 use rustyline::error::ReadlineError;
 use rustyline::Editor;
 
@@ -291,6 +290,16 @@ impl AdjustedPrintOptions {
     }
 }
 
+fn config_file_type_from_str(ext: &str) -> Option<ConfigFileType>{
+    match ext.to_lowercase().as_str(){
+        "csv" => Some(ConfigFileType::CSV),
+        "json" => Some(ConfigFileType::JSON),
+        "parquet" => Some(ConfigFileType::PARQUET),
+        _ => None,
+
+    }
+}
+
 async fn create_plan(
     ctx: &mut SessionContext,
     statement: Statement,
@@ -302,7 +311,7 @@ async fn create_plan(
     // will raise Configuration errors.
     if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan {
         // To support custom formats, treat error as None
-        let format = FileType::from_str(&cmd.file_type).ok();
+        let format = config_file_type_from_str(&cmd.file_type);
         register_object_store_and_config_extensions(
             ctx,
             &cmd.location,
@@ -313,13 +322,13 @@ async fn create_plan(
     }
 
     if let LogicalPlan::Copy(copy_to) = &mut plan {
-        let format: FileType = (&copy_to.format_options).into();
+        let format = config_file_type_from_str(&copy_to.file_type.get_ext());
 
         register_object_store_and_config_extensions(
             ctx,
             &copy_to.output_url,
             &copy_to.options,
-            Some(format),
+            format,
         )
         .await?;
     }
@@ -357,7 +366,7 @@ pub(crate) async fn register_object_store_and_config_extensions(
     ctx: &SessionContext,
     location: &String,
     options: &HashMap<String, String>,
-    format: Option<FileType>,
+    format: Option<ConfigFileType>,
 ) -> Result<()> {
     // Parse the location URL to extract the scheme and other components
     let table_path = ListingTableUrl::parse(location)?;
@@ -391,7 +400,6 @@ pub(crate) async fn register_object_store_and_config_extensions(
 mod tests {
     use super::*;
 
-    use datafusion::common::config::FormatOptions;
     use datafusion::common::plan_err;
     use url::Url;
 
@@ -401,7 +409,7 @@
     let plan = ctx.state().create_logical_plan(sql).await?;
 
     if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan {
-        let format = FileType::from_str(&cmd.file_type).ok();
+        let format = config_file_type_from_str(&cmd.file_type);
         register_object_store_and_config_extensions(
             &ctx,
             &cmd.location,
@@ -427,12 +435,12 @@
     let plan = ctx.state().create_logical_plan(sql).await?;
 
     if let LogicalPlan::Copy(cmd) = &plan {
-        let format: FileType = (&cmd.format_options).into();
+        let format = config_file_type_from_str(&cmd.file_type.get_ext());
         register_object_store_and_config_extensions(
             &ctx,
             &cmd.output_url,
             &cmd.options,
-            Some(format),
+            format,
         )
         .await?;
     } else {
@@ -482,7 +490,7 @@
     let mut plan = create_plan(&mut ctx, statement).await?;
     if let LogicalPlan::Copy(copy_to) = &mut plan {
         assert_eq!(copy_to.output_url, location);
-        assert!(matches!(copy_to.format_options, FormatOptions::PARQUET(_)));
+        assert_eq!(copy_to.file_type.get_ext(), "parquet".to_string());
         ctx.runtime_env()
             .object_store_registry
             .get_store(&Url::parse(&copy_to.output_url).unwrap())?;

From 128bf2bdb0038543450fa01e8cd444383e45d24b Mon Sep 17 00:00:00 2001
From: Devin D'Angelo
Date: Fri, 21 Jun 2024 22:15:29 -0400
Subject: [PATCH 11/20] cli fmt

---
 datafusion-cli/src/exec.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs
index f45d0cd724c4..675d62b6d740 100644
--- a/datafusion-cli/src/exec.rs
+++
b/datafusion-cli/src/exec.rs
@@ -290,13 +290,12 @@ impl AdjustedPrintOptions {
     }
 }
 
-fn config_file_type_from_str(ext: &str) -> Option<ConfigFileType>{
-    match ext.to_lowercase().as_str(){
+fn config_file_type_from_str(ext: &str) -> Option<ConfigFileType> {
+    match ext.to_lowercase().as_str() {
         "csv" => Some(ConfigFileType::CSV),
         "json" => Some(ConfigFileType::JSON),
         "parquet" => Some(ConfigFileType::PARQUET),
         _ => None,
-
     }
 }

From b065cd7812bf71fd7a16d79bc071db40d1e5fca7 Mon Sep 17 00:00:00 2001
From: Devin D'Angelo
Date: Sun, 23 Jun 2024 21:49:37 -0400
Subject: [PATCH 12/20] Update datafusion/core/src/datasource/file_format/mod.rs

Co-authored-by: Andrew Lamb
---
 datafusion/core/src/datasource/file_format/mod.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs
index bfea42652a7a..908c53f97059 100644
--- a/datafusion/core/src/datasource/file_format/mod.rs
+++ b/datafusion/core/src/datasource/file_format/mod.rs
@@ -50,7 +50,9 @@ use async_trait::async_trait;
 use file_compression_type::FileCompressionType;
 use object_store::{ObjectMeta, ObjectStore};
 
-/// A factory struct which produces [FileFormat] structs based on session and command level options
+/// Factory for creating [`FileFormat`] instances based on session and command level options
+///
+/// Users can provide their own `FileFormatFactory` to support arbitrary file formats
 pub trait FileFormatFactory: Sync + Send + GetExt {
     /// Initialize a [FileFormat] and configure based on session and command level options
     fn create(

From 485f754494eefbda7910fea412a4646fe520a7e5 Mon Sep 17 00:00:00 2001
From: Devin D'Angelo
Date: Sun, 23 Jun 2024 22:28:00 -0400
Subject: [PATCH 13/20] Update datafusion/core/src/execution/session_state.rs

Co-authored-by: Andrew Lamb
---
 datafusion/core/src/execution/session_state.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs
index 038ef443ceac..64f8b9cc17f7 100644
--- a/datafusion/core/src/execution/session_state.rs
+++ b/datafusion/core/src/execution/session_state.rs
@@ -249,10 +249,9 @@ impl SessionState {
         };
 
         #[cfg(feature = "parquet")]
-        match new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false)
+        if let Err(e) = new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false)
         {
-            Ok(_) => (),
-            Err(e) => log::info!("Unable to register default ParquetFormat: {e}"),
+            log::info!("Unable to register default ParquetFormat: {e}"),
         };
 
         match new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false) {

From da8e63537c5800b221e1505fa909a82809ecb6ee Mon Sep 17 00:00:00 2001
From: Devin D'Angelo
Date: Sun, 23 Jun 2024 22:18:02 -0400
Subject: [PATCH 14/20] review comments

---
 datafusion/common/src/config.rs                    | 12 ++++++------
 datafusion/common/src/file_options/mod.rs          |  8 ++++----
 datafusion/core/src/datasource/file_format/csv.rs  |  2 +-
 datafusion/core/src/datasource/file_format/json.rs |  2 +-
 .../core/src/datasource/file_format/parquet.rs     |  2 +-
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 85d30108b165..abcf5eb4c8f9 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -1251,7 +1251,7 @@ impl TableOptions {
     /// # Parameters
     ///
     /// * `format`: The file format to use (e.g., CSV, Parquet).
- pub fn set_file_format(&mut self, format: ConfigFileType) { + pub fn set_config_format(&mut self, format: ConfigFileType) { self.current_format = Some(format); } @@ -1756,7 +1756,7 @@ mod tests { let mut extension = Extensions::new(); extension.insert(TestExtensionConfig::default()); let mut table_config = TableOptions::new().with_extensions(extension); - table_config.set_file_format(ConfigFileType::CSV); + table_config.set_config_format(ConfigFileType::CSV); table_config.set("format.delimiter", ";").unwrap(); assert_eq!(table_config.csv.delimiter, b';'); table_config.set("test.bootstrap.servers", "asd").unwrap(); @@ -1773,7 +1773,7 @@ mod tests { #[test] fn csv_u8_table_options() { let mut table_config = TableOptions::new(); - table_config.set_file_format(ConfigFileType::CSV); + table_config.set_config_format(ConfigFileType::CSV); table_config.set("format.delimiter", ";").unwrap(); assert_eq!(table_config.csv.delimiter as char, ';'); table_config.set("format.escape", "\"").unwrap(); @@ -1786,7 +1786,7 @@ mod tests { #[test] fn parquet_table_options() { let mut table_config = TableOptions::new(); - table_config.set_file_format(ConfigFileType::PARQUET); + table_config.set_config_format(ConfigFileType::PARQUET); table_config .set("format.bloom_filter_enabled::col1", "true") .unwrap(); @@ -1800,7 +1800,7 @@ mod tests { #[test] fn parquet_table_options_config_entry() { let mut table_config = TableOptions::new(); - table_config.set_file_format(ConfigFileType::PARQUET); + table_config.set_config_format(ConfigFileType::PARQUET); table_config .set("format.bloom_filter_enabled::col1", "true") .unwrap(); @@ -1814,7 +1814,7 @@ mod tests { #[test] fn parquet_table_options_config_metadata_entry() { let mut table_config = TableOptions::new(); - table_config.set_file_format(ConfigFileType::PARQUET); + table_config.set_config_format(ConfigFileType::PARQUET); table_config.set("format.metadata::key1", "").unwrap(); table_config.set("format.metadata::key2", "value2").unwrap(); table_config diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs index 955d95616917..77781457d0d2 100644 --- a/datafusion/common/src/file_options/mod.rs +++ b/datafusion/common/src/file_options/mod.rs @@ -76,7 +76,7 @@ mod tests { option_map.insert("format.bloom_filter_ndv".to_owned(), "123".to_owned()); let mut table_config = TableOptions::new(); - table_config.set_file_format(ConfigFileType::PARQUET); + table_config.set_config_format(ConfigFileType::PARQUET); table_config.alter_with_string_hash_map(&option_map)?; let parquet_options = ParquetWriterOptions::try_from(&table_config.parquet)?; @@ -181,7 +181,7 @@ mod tests { ); let mut table_config = TableOptions::new(); - table_config.set_file_format(ConfigFileType::PARQUET); + table_config.set_config_format(ConfigFileType::PARQUET); table_config.alter_with_string_hash_map(&option_map)?; let parquet_options = ParquetWriterOptions::try_from(&table_config.parquet)?; @@ -284,7 +284,7 @@ mod tests { option_map.insert("format.delimiter".to_owned(), ";".to_owned()); let mut table_config = TableOptions::new(); - table_config.set_file_format(ConfigFileType::CSV); + table_config.set_config_format(ConfigFileType::CSV); table_config.alter_with_string_hash_map(&option_map)?; let csv_options = CsvWriterOptions::try_from(&table_config.csv)?; @@ -306,7 +306,7 @@ mod tests { option_map.insert("format.compression".to_owned(), "gzip".to_owned()); let mut table_config = TableOptions::new(); - table_config.set_file_format(ConfigFileType::JSON); + 
table_config.set_config_format(ConfigFileType::JSON); table_config.alter_with_string_hash_map(&option_map)?; let json_options = JsonWriterOptions::try_from(&table_config.json)?; diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 3171a473b80f..92cb11e2b47a 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -84,7 +84,7 @@ impl FileFormatFactory for CsvFormatFactory { let csv_options = match &self.options { None => { let mut table_options = state.default_table_options(); - table_options.set_file_format(ConfigFileType::CSV); + table_options.set_config_format(ConfigFileType::CSV); table_options.alter_with_string_hash_map(format_options)?; table_options.csv } diff --git a/datafusion/core/src/datasource/file_format/json.rs b/datafusion/core/src/datasource/file_format/json.rs index 223ef05fa67a..007b084f504d 100644 --- a/datafusion/core/src/datasource/file_format/json.rs +++ b/datafusion/core/src/datasource/file_format/json.rs @@ -83,7 +83,7 @@ impl FileFormatFactory for JsonFormatFactory { let json_options = match &self.options { None => { let mut table_options = state.default_table_options(); - table_options.set_file_format(ConfigFileType::JSON); + table_options.set_config_format(ConfigFileType::JSON); table_options.alter_with_string_hash_map(format_options)?; table_options.json } diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 3fe250699c96..f22d3bb185b1 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -120,7 +120,7 @@ impl FileFormatFactory for ParquetFormatFactory { let parquet_options = match &self.options { None => { let mut table_options = state.default_table_options(); - table_options.set_file_format(ConfigFileType::PARQUET); + table_options.set_config_format(ConfigFileType::PARQUET); table_options.alter_with_string_hash_map(format_options)?; table_options.parquet } From 6dec6efe5497678e6ad51480c183358b8a29d518 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Sun, 23 Jun 2024 22:18:13 -0400 Subject: [PATCH 15/20] review comments --- datafusion-cli/src/exec.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index d1dc581427db..b78f32e0ac48 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -382,7 +382,7 @@ pub(crate) async fn register_object_store_and_config_extensions( // Clone and modify the default table options based on the provided options let mut table_options = ctx.session_state().default_table_options().clone(); if let Some(format) = format { - table_options.set_file_format(format); + table_options.set_config_format(format); } table_options.alter_with_string_hash_map(options)?; From 013ef9135977038afff1b896a71dab1be0371e4c Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Sun, 23 Jun 2024 22:27:52 -0400 Subject: [PATCH 16/20] review comments --- .../core/src/datasource/file_format/mod.rs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 908c53f97059..1aa93a106aff 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -138,12 +138,12 @@ pub trait FileFormat: Send + Sync + 
fmt::Debug { /// The former trait is a superset of the latter trait, which includes execution time /// relevant methods. [FileType] is only used in logical planning and only implements /// the subset of methods required during logical planning. -pub struct DefaultFileFormat { +pub struct DefaultFileType { file_format_factory: Arc, } -impl DefaultFileFormat { - /// Constructs a [DefaultFileFormat] wrapper from a [FileFormatFactory] +impl DefaultFileType { + /// Constructs a [DefaultFileType] wrapper from a [FileFormatFactory] pub fn new(file_format_factory: Arc) -> Self { Self { file_format_factory, @@ -151,19 +151,19 @@ impl DefaultFileFormat { } } -impl FileType for DefaultFileFormat { +impl FileType for DefaultFileType { fn as_any(&self) -> &dyn Any { self } } -impl Display for DefaultFileFormat { +impl Display for DefaultFileType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.file_format_factory.default().fmt(f) } } -impl GetExt for DefaultFileFormat { +impl GetExt for DefaultFileType { fn get_ext(&self) -> String { self.file_format_factory.get_ext() } @@ -173,24 +173,24 @@ impl GetExt for DefaultFileFormat { pub fn format_as_file_type( file_format_factory: Arc, ) -> Arc { - Arc::new(DefaultFileFormat { + Arc::new(DefaultFileType { file_format_factory, }) } /// Converts a [FileType] to a [FileFormatFactory]. /// Returns an error if the [FileType] cannot be -/// downcasted to a [DefaultFileFormat]. +/// downcasted to a [DefaultFileType]. pub fn file_type_to_format( file_type: &Arc, ) -> datafusion_common::Result> { match file_type .as_ref() .as_any() - .downcast_ref::() + .downcast_ref::() { Some(source) => Ok(source.file_format_factory.clone()), - _ => internal_err!("FileType was not DefaultFileFormat"), + _ => internal_err!("FileType was not DefaultFileType"), } } From 132af14d9c80cf0cd2738d16098cd561f11f1e14 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Sun, 23 Jun 2024 22:34:16 -0400 Subject: [PATCH 17/20] typo fix --- datafusion/core/src/execution/session_state.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 64f8b9cc17f7..974839ca99c1 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -251,7 +251,7 @@ impl SessionState { #[cfg(feature = "parquet")] if let Err(e) = new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false) { - log::info!("Unable to register default ParquetFormat: {e}"), + log::info!("Unable to register default ParquetFormat: {e}") }; match new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false) { From 493e82f2fbebab66f8f1c8d3b89e10f8d88ea849 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Sun, 23 Jun 2024 22:34:28 -0400 Subject: [PATCH 18/20] fmt --- datafusion/core/src/execution/session_state.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 974839ca99c1..23de3b5c3d6c 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -249,9 +249,10 @@ impl SessionState { }; #[cfg(feature = "parquet")] - if let Err(e) = new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false) + if let Err(e) = + new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false) { - log::info!("Unable to register default ParquetFormat: 
{e}") + log::info!("Unable to register default ParquetFormat: {e}") }; match new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false) { From bcc3119e19e2d642d907952d587f58cfe61aa0d3 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Mon, 24 Jun 2024 08:35:31 -0400 Subject: [PATCH 19/20] fix err log style --- .../core/src/execution/session_state.rs | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 23de3b5c3d6c..1e7c193321f1 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -255,24 +255,28 @@ impl SessionState { log::info!("Unable to register default ParquetFormat: {e}") }; - match new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false) { - Ok(_) => (), - Err(e) => log::info!("Unable to register default JsonFormat: {e}"), + if let Err(e) = + new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false) + { + log::info!("Unable to register default JsonFormat: {e}") }; - match new_self.register_file_format(Arc::new(CsvFormatFactory::new()), false) { - Ok(_) => (), - Err(e) => log::info!("Unable to register default CsvFormat: {e}"), + if let Err(e) = + new_self.register_file_format(Arc::new(CsvFormatFactory::new()), false) + { + log::info!("Unable to register default CsvFormat: {e}") }; - match new_self.register_file_format(Arc::new(AvroFormatFactory::new()), false) { - Ok(_) => (), - Err(e) => log::info!("Unable to register default AvroFormat: {e}"), + if let Err(e) = + new_self.register_file_format(Arc::new(ArrowFormatFactory::new()), false) + { + log::info!("Unable to register default ArrowFormat: {e}") }; - match new_self.register_file_format(Arc::new(ArrowFormatFactory::new()), false) { - Ok(_) => (), - Err(e) => log::info!("Unable to register default ArrowFormat: {e}"), + if let Err(e) = + new_self.register_file_format(Arc::new(AvroFormatFactory::new()), false) + { + log::info!("Unable to register default AvroFormat: {e}") }; // register built in functions From 11071ac2d21b06e90ca6b60085bf2fdfbea0fe55 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Tue, 25 Jun 2024 07:21:17 -0400 Subject: [PATCH 20/20] fmt --- datafusion/core/src/datasource/file_format/parquet.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index e461d51ee9e3..44c9cc4ec4a9 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -38,8 +38,8 @@ use crate::physical_plan::{ Statistics, }; -use datafusion_common::config::{ConfigField, ConfigFileType, TableParquetOptions}; use arrow::compute::sum; +use datafusion_common::config::{ConfigField, ConfigFileType, TableParquetOptions}; use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::stats::Precision;