apache · alamb · Sep 10, 2024 · Aug 29, 2024 · Aug 29, 2024 · Aug 29, 2024
diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs
@@ -121,7 +121,7 @@ impl RunOpt {
             .options_mut()
             .execution
             .parquet
-            .schema_force_string_view = self.common.string_view;
+            .schema_force_view_types = self.common.force_view_types;
 
         let ctx = SessionContext::new_with_config(config);
         self.register_hits(&ctx).await?;

diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs
@@ -124,7 +124,7 @@ impl RunOpt {
             .options_mut()
             .execution
             .parquet
-            .schema_force_string_view = self.common.string_view;
+            .schema_force_view_types = self.common.force_view_types;
         let ctx = SessionContext::new_with_config(config);
 
         // register tables
@@ -344,7 +344,7 @@ mod tests {
             partitions: Some(2),
             batch_size: 8192,
             debug: false,
-            string_view: false,
+            force_view_types: false,
         };
         let opt = RunOpt {
             query: Some(query),
@@ -378,7 +378,7 @@ mod tests {
             partitions: Some(2),
             batch_size: 8192,
             debug: false,
-            string_view: false,
+            force_view_types: false,
         };
         let opt = RunOpt {
             query: Some(query),

diff --git a/benchmarks/src/util/options.rs b/benchmarks/src/util/options.rs
@@ -41,7 +41,7 @@ pub struct CommonOpt {
     /// If true, will use StringView/BinaryViewArray instead of String/BinaryArray
     /// when reading ParquetFiles
     #[structopt(long)]
-    pub string_view: bool,
+    pub force_view_types: bool,
 }
 
 impl CommonOpt {

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -380,6 +380,10 @@ config_namespace! {
         /// the filters are applied in the same order as written in the query
         pub reorder_filters: bool, default = false
 
+        /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
+        /// and `Binary/BinaryLarge` with `BinaryView`.
+        pub schema_force_view_types: bool, default = false
+
         // The following options affect writing to parquet files
         // and map to parquet::file::properties::WriterProperties
 
@@ -483,10 +487,6 @@ config_namespace! {
         /// writing out already in-memory data, such as from a cached
         /// data frame.
         pub maximum_buffered_record_batches_per_stream: usize, default = 2
-
-        /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
-        /// and `Binary/BinaryLarge` with `BinaryView`.
-        pub schema_force_string_view: bool, default = false
     }
 }
 

diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs
@@ -175,7 +175,7 @@ impl ParquetOptions {
             maximum_parallel_row_group_writers: _,
             maximum_buffered_record_batches_per_stream: _,
             bloom_filter_on_read: _, // reads not used for writer props
-            schema_force_string_view: _,
+            schema_force_view_types: _,
         } = self;
 
         let mut builder = WriterProperties::builder()
@@ -441,7 +441,7 @@ mod tests {
             maximum_buffered_record_batches_per_stream: defaults
                 .maximum_buffered_record_batches_per_stream,
             bloom_filter_on_read: defaults.bloom_filter_on_read,
-            schema_force_string_view: defaults.schema_force_string_view,
+            schema_force_view_types: defaults.schema_force_view_types,
         }
     }
 
@@ -542,8 +542,7 @@ mod tests {
                 maximum_buffered_record_batches_per_stream: global_options_defaults
                     .maximum_buffered_record_batches_per_stream,
                 bloom_filter_on_read: global_options_defaults.bloom_filter_on_read,
-                schema_force_string_view: global_options_defaults
-                    .schema_force_string_view,
+                schema_force_view_types: global_options_defaults.schema_force_view_types,
             },
             column_specific_options,
             key_value_metadata,

diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs
@@ -229,6 +229,51 @@ pub fn transform_schema_to_view(schema: &Schema) -> Schema {
     Schema::new_with_metadata(transformed_fields, schema.metadata.clone())
 }
 
+/// Coerces the file schema if the table schema uses a view type.
+pub(crate) fn coerce_file_schema_to_view_type(
+    table_schema: &Schema,
+    file_schema: &Schema,
+) -> Option<Schema> {
+    let mut transform = false;
+    let table_fields: HashMap<_, _> = table_schema
+        .fields
+        .iter()
+        .map(|f| {
+            let dt = f.data_type();
+            if dt.equals_datatype(&DataType::Utf8View) {
+                transform = true;
+            }
+            (f.name(), dt)
+        })
+        .collect();
+    if !transform {
+        return None;
+    }
+
+    let transformed_fields: Vec<Arc<Field>> = file_schema
+        .fields
+        .iter()
+        .map(
+            |field| match (table_fields.get(field.name()), field.data_type()) {
+                (Some(DataType::Utf8View), DataType::Utf8)
+                | (Some(DataType::Utf8View), DataType::LargeUtf8) => Arc::new(
+                    Field::new(field.name(), DataType::Utf8View, field.is_nullable()),
+                ),
+                (Some(DataType::BinaryView), DataType::Binary)
+                | (Some(DataType::BinaryView), DataType::LargeBinary) => Arc::new(
+                    Field::new(field.name(), DataType::BinaryView, field.is_nullable()),
+                ),
+                _ => field.clone(),
+            },
+        )
+        .collect();
+
+    Some(Schema::new_with_metadata(
+        transformed_fields,
+        file_schema.metadata.clone(),
+    ))
+}
+
 #[cfg(test)]
 pub(crate) mod test_util {
     use std::ops::Range;