From 05383ea40c8f4f4468ea09ff44a89382ca883377 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 25 Apr 2021 14:02:03 +0000 Subject: [PATCH] WIP. --- Cargo.toml | 8 +- datafusion/Cargo.toml | 5 +- datafusion/README.md | 8 +- datafusion/benches/aggregate_query_sql.rs | 2 +- datafusion/benches/filter_query_sql.rs | 2 +- datafusion/benches/math_query_sql.rs | 2 +- datafusion/benches/sort_limit_query_sql.rs | 4 +- datafusion/src/bin/repl.rs | 2 +- datafusion/src/catalog/information_schema.rs | 32 ++-- datafusion/src/dataframe.rs | 2 +- datafusion/src/datasource/csv.rs | 5 +- datafusion/src/datasource/datasource.rs | 4 +- datafusion/src/datasource/empty.rs | 4 +- datafusion/src/datasource/memory.rs | 10 +- datafusion/src/datasource/parquet.rs | 13 +- datafusion/src/error.rs | 16 +- datafusion/src/execution/context.rs | 41 +++-- datafusion/src/execution/dataframe_impl.rs | 6 +- datafusion/src/lib.rs | 25 ++- datafusion/src/logical_plan/builder.rs | 10 +- datafusion/src/logical_plan/dfschema.rs | 6 +- datafusion/src/logical_plan/display.rs | 6 +- datafusion/src/logical_plan/expr.rs | 18 +- datafusion/src/logical_plan/plan.rs | 12 +- datafusion/src/optimizer/constant_folding.rs | 4 +- datafusion/src/optimizer/filter_push_down.rs | 11 +- .../src/optimizer/hash_build_probe_order.rs | 5 +- .../src/optimizer/projection_push_down.rs | 6 +- datafusion/src/optimizer/utils.rs | 4 +- .../src/physical_optimizer/repartition.rs | 2 +- datafusion/src/physical_plan/aggregates.rs | 2 +- .../src/physical_plan/array_expressions.rs | 6 +- .../src/physical_plan/coalesce_batches.rs | 18 +- datafusion/src/physical_plan/common.rs | 7 +- datafusion/src/physical_plan/cross_join.rs | 8 +- .../src/physical_plan/crypto_expressions.rs | 16 +- datafusion/src/physical_plan/csv.rs | 14 +- .../src/physical_plan/datetime_expressions.rs | 149 +++++++---------- .../src/physical_plan/distinct_expressions.rs | 13 +- datafusion/src/physical_plan/empty.rs | 10 +- datafusion/src/physical_plan/explain.rs | 19 ++- .../src/physical_plan/expressions/average.rs | 18 +- .../src/physical_plan/expressions/binary.rs | 37 +++-- .../src/physical_plan/expressions/case.rs | 12 +- .../src/physical_plan/expressions/cast.rs | 92 +++------- .../src/physical_plan/expressions/coercion.rs | 8 +- .../src/physical_plan/expressions/column.rs | 2 +- .../src/physical_plan/expressions/count.rs | 22 ++- .../src/physical_plan/expressions/in_list.rs | 23 ++- .../physical_plan/expressions/is_not_null.rs | 12 +- .../src/physical_plan/expressions/is_null.rs | 12 +- .../src/physical_plan/expressions/literal.rs | 6 +- .../src/physical_plan/expressions/min_max.rs | 79 +++++---- .../src/physical_plan/expressions/mod.rs | 9 +- .../src/physical_plan/expressions/negative.rs | 25 +-- .../src/physical_plan/expressions/not.rs | 10 +- .../src/physical_plan/expressions/nullif.rs | 14 +- .../src/physical_plan/expressions/sum.rs | 20 +-- .../src/physical_plan/expressions/try_cast.rs | 36 ++-- datafusion/src/physical_plan/filter.rs | 12 +- datafusion/src/physical_plan/functions.rs | 43 +++-- .../src/physical_plan/hash_aggregate.rs | 157 +++++++----------- datafusion/src/physical_plan/hash_join.rs | 148 +++++++---------- datafusion/src/physical_plan/hash_utils.rs | 2 +- datafusion/src/physical_plan/limit.rs | 17 +- .../src/physical_plan/math_expressions.rs | 55 ++---- datafusion/src/physical_plan/memory.rs | 7 +- datafusion/src/physical_plan/merge.rs | 11 +- datafusion/src/physical_plan/mod.rs | 19 ++- datafusion/src/physical_plan/parquet.rs | 16 +- 
datafusion/src/physical_plan/planner.rs | 20 ++- datafusion/src/physical_plan/projection.rs | 8 +- .../src/physical_plan/regex_expressions.rs | 22 +-- datafusion/src/physical_plan/repartition.rs | 13 +- datafusion/src/physical_plan/sort.rs | 33 ++-- .../src/physical_plan/string_expressions.rs | 71 ++++---- datafusion/src/physical_plan/type_coercion.rs | 4 +- datafusion/src/physical_plan/udaf.rs | 2 +- datafusion/src/physical_plan/udf.rs | 2 +- .../src/physical_plan/unicode_expressions.rs | 74 ++++----- datafusion/src/physical_plan/union.rs | 5 +- datafusion/src/scalar.rs | 108 ++++-------- datafusion/src/sql/planner.rs | 2 +- datafusion/src/test/exec.rs | 6 +- datafusion/src/test/mod.rs | 12 +- datafusion/tests/custom_sources.rs | 8 +- datafusion/tests/dataframe.rs | 4 +- datafusion/tests/provider_filter_pushdown.rs | 6 +- datafusion/tests/sql.rs | 18 +- datafusion/tests/user_defined_plan.rs | 2 +- 90 files changed, 833 insertions(+), 1048 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2f34babdb247b..ebb3051f3ea05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,10 +18,4 @@ [workspace] members = [ "datafusion", - "datafusion-examples", - "benchmarks", - "ballista/rust/client", - "ballista/rust/core", - "ballista/rust/executor", - "ballista/rust/scheduler", -] \ No newline at end of file +] diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index eaa7031794cf7..4ab17f06a0612 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -42,7 +42,7 @@ path = "src/bin/main.rs" [features] default = ["cli", "crypto_expressions", "regex_expressions", "unicode_expressions"] cli = ["rustyline"] -simd = ["arrow/simd"] +simd = [] crypto_expressions = ["md-5", "sha2"] regex_expressions = ["regex", "lazy_static"] unicode_expressions = ["unicode-segmentation"] @@ -50,8 +50,7 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd", features = ["arrow"] } +arrow2 = { git = "https://github.com/jorgecarleitao/arrow2", rev = "5567d2c6487a9cda7cacf43890b73486d8613989" } sqlparser = "0.9.0" clap = "2.33" rustyline = {version = "7.0", optional = true} diff --git a/datafusion/README.md b/datafusion/README.md index ff0b26d7bf031..43cb03589b75b 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -64,8 +64,8 @@ Run a SQL query against data stored in a CSV: ```rust use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; +use arrow2::util::pretty::print_batches; +use arrow2::record_batch::RecordBatch; #[tokio::main] async fn main() -> datafusion::error::Result<()> { @@ -87,8 +87,8 @@ Use the DataFrame API to process data stored in a CSV: ```rust use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; +use arrow2::util::pretty::print_batches; +use arrow2::record_batch::RecordBatch; #[tokio::main] async fn main() -> datafusion::error::Result<()> { diff --git a/datafusion/benches/aggregate_query_sql.rs b/datafusion/benches/aggregate_query_sql.rs index 8f1a97e198d3b..6f10b03ad4784 100644 --- a/datafusion/benches/aggregate_query_sql.rs +++ b/datafusion/benches/aggregate_query_sql.rs @@ -26,7 +26,7 @@ use tokio::runtime::Runtime; extern crate arrow; extern crate datafusion; -use arrow::{ +use arrow2::{ 
array::Float32Array, array::Float64Array, array::StringArray, diff --git a/datafusion/benches/filter_query_sql.rs b/datafusion/benches/filter_query_sql.rs index 8600bdc88c6af..c5637b1441fb2 100644 --- a/datafusion/benches/filter_query_sql.rs +++ b/datafusion/benches/filter_query_sql.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::{ +use arrow2::{ array::{Float32Array, Float64Array}, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, diff --git a/datafusion/benches/math_query_sql.rs b/datafusion/benches/math_query_sql.rs index 1aaa2d3403cfd..71fc864a5439d 100644 --- a/datafusion/benches/math_query_sql.rs +++ b/datafusion/benches/math_query_sql.rs @@ -26,7 +26,7 @@ use tokio::runtime::Runtime; extern crate arrow; extern crate datafusion; -use arrow::{ +use arrow2::{ array::{Float32Array, Float64Array}, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, diff --git a/datafusion/benches/sort_limit_query_sql.rs b/datafusion/benches/sort_limit_query_sql.rs index be065f32e0090..d5dd434599f29 100644 --- a/datafusion/benches/sort_limit_query_sql.rs +++ b/datafusion/benches/sort_limit_query_sql.rs @@ -24,7 +24,7 @@ use std::sync::{Arc, Mutex}; extern crate arrow; extern crate datafusion; -use arrow::datatypes::{DataType, Field, Schema}; +use arrow2::datatypes::{DataType, Field, Schema}; use datafusion::datasource::{CsvFile, CsvReadOptions, MemTable}; use datafusion::execution::context::ExecutionContext; @@ -57,7 +57,7 @@ fn create_context() -> Arc> { Field::new("c13", DataType::Utf8, false), ])); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); // create CSV data source let csv = CsvFile::try_new( diff --git a/datafusion/src/bin/repl.rs b/datafusion/src/bin/repl.rs index a6aec204c0d3b..9282a82501f5c 100644 --- a/datafusion/src/bin/repl.rs +++ b/datafusion/src/bin/repl.rs @@ -17,7 +17,7 @@ #![allow(bare_trait_objects)] -use arrow::util::pretty; +use arrow2::util::pretty; use clap::{crate_version, App, Arg}; use datafusion::error::Result; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; diff --git a/datafusion/src/catalog/information_schema.rs b/datafusion/src/catalog/information_schema.rs index 5a7b9d5b6448d..8908660929d4d 100644 --- a/datafusion/src/catalog/information_schema.rs +++ b/datafusion/src/catalog/information_schema.rs @@ -21,8 +21,8 @@ use std::{any, sync::Arc}; -use arrow::{ - array::{StringBuilder, UInt64Builder}, +use arrow2::{ + array::*, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; @@ -178,10 +178,10 @@ impl SchemaProvider for InformationSchemaProvider { /// /// Columns are based on https://www.postgresql.org/docs/current/infoschema-columns.html struct InformationSchemaTablesBuilder { - catalog_names: StringBuilder, - schema_names: StringBuilder, - table_names: StringBuilder, - table_types: StringBuilder, + catalog_names: Utf8Primitive, + schema_names: Utf8Primitive, + table_names: Utf8Primitive, + table_types: Utf8Primitive, } impl InformationSchemaTablesBuilder { @@ -191,10 +191,10 @@ impl InformationSchemaTablesBuilder { // critical code and the number of tables is unavailable here. 
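The information_schema hunks in this region swap arrow-rs string builders for arrow2's push-based Utf8 builder. A small self-contained sketch of that pattern, assuming the `Utf8Primitive` API exactly as this patch uses it (the `<i32>` offset parameter and the `Into<Utf8Array<i32>>` conversion are inferred from other hunks in this patch, not verified against the pinned arrow2 revision):

```rust
use std::sync::Arc;
use arrow2::array::{Array, Utf8Array, Utf8Primitive};

/// Collect a slice of names into an immutable Utf8Array.
fn build_names(names: &[&str]) -> Arc<dyn Array> {
    let mut builder = Utf8Primitive::<i32>::with_capacity(names.len());
    for &name in names {
        // `push` takes an Option: `Some(..)` appends a value, `None` appends a null.
        builder.push(Some(name));
    }
    // Freeze the mutable builder into an immutable array.
    let array: Utf8Array<i32> = builder.into();
    Arc::new(array)
}
```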
let default_capacity = 10; Self { - catalog_names: StringBuilder::new(default_capacity), - schema_names: StringBuilder::new(default_capacity), - table_names: StringBuilder::new(default_capacity), - table_types: StringBuilder::new(default_capacity), + catalog_names: Utf8Primitive::with_capacity(default_capacity), + schema_names: Utf8Primitive::with_capacity(default_capacity), + table_names: Utf8Primitive::with_capacity(default_capacity), + table_types: Utf8Primitive::with_capacity(default_capacity), } } @@ -205,14 +205,10 @@ impl InformationSchemaTablesBuilder { table_name: impl AsRef, ) { // Note: append_value is actually infallable. - self.catalog_names - .append_value(catalog_name.as_ref()) - .unwrap(); - self.schema_names - .append_value(schema_name.as_ref()) - .unwrap(); - self.table_names.append_value(table_name.as_ref()).unwrap(); - self.table_types.append_value("BASE TABLE").unwrap(); + self.catalog_names.push(Some(&catalog_name.as_ref())); + self.schema_names.push(Some(&schema_name.as_ref())); + self.table_names.push(Some(&table_name.as_ref())); + self.table_types.push(Some(&"BASE TABLE")); } fn add_system_table( diff --git a/datafusion/src/dataframe.rs b/datafusion/src/dataframe.rs index 9c7c2ef96d6be..c244b2d1d71ea 100644 --- a/datafusion/src/dataframe.rs +++ b/datafusion/src/dataframe.rs @@ -17,7 +17,7 @@ //! DataFrame API for building and executing query plans. -use crate::arrow::record_batch::RecordBatch; +use crate::arrow2::record_batch::RecordBatch; use crate::error::Result; use crate::logical_plan::{ DFSchema, Expr, FunctionRegistry, JoinType, LogicalPlan, Partitioning, diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 6f6c9abe07741..e8d4e8cc37401 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -25,7 +25,7 @@ //! use datafusion::datasource::TableProvider; //! use datafusion::datasource::csv::{CsvFile, CsvReadOptions}; //! -//! let testdata = arrow::util::test_util::arrow_test_data(); +//! let testdata = arrow2::util::test_util::arrow_test_data(); //! let csvdata = CsvFile::try_new( //! &format!("{}/csv/aggregate_test_100.csv", testdata), //! CsvReadOptions::new().delimiter(b'|'), @@ -33,7 +33,8 @@ //! let schema = csvdata.schema(); //! ``` -use arrow::datatypes::SchemaRef; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; use std::any::Any; use std::string::String; use std::sync::Arc; diff --git a/datafusion/src/datasource/datasource.rs b/datafusion/src/datasource/datasource.rs index e2b07336486cb..6f1fe39e93eeb 100644 --- a/datafusion/src/datasource/datasource.rs +++ b/datafusion/src/datasource/datasource.rs @@ -23,7 +23,9 @@ use std::sync::Arc; use crate::error::Result; use crate::logical_plan::Expr; use crate::physical_plan::ExecutionPlan; -use crate::{arrow::datatypes::SchemaRef, scalar::ScalarValue}; +use crate::{arrow2::datatypes::Schema, scalar::ScalarValue}; + +type SchemaRef = Arc; /// This table statistics are estimates. 
/// It can not be used directly in the precise compute diff --git a/datafusion/src/datasource/empty.rs b/datafusion/src/datasource/empty.rs index e6140cdb8de69..e0033f29df2e1 100644 --- a/datafusion/src/datasource/empty.rs +++ b/datafusion/src/datasource/empty.rs @@ -20,7 +20,9 @@ use std::any::Any; use std::sync::Arc; -use arrow::datatypes::*; +use arrow2::datatypes::*; + +type SchemaRef = Arc; use crate::datasource::datasource::Statistics; use crate::datasource::TableProvider; diff --git a/datafusion/src/datasource/memory.rs b/datafusion/src/datasource/memory.rs index af40480870287..0877e79c83bf9 100644 --- a/datafusion/src/datasource/memory.rs +++ b/datafusion/src/datasource/memory.rs @@ -24,8 +24,10 @@ use log::debug; use std::any::Any; use std::sync::Arc; -use arrow::datatypes::{Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::{Field, Schema}; +use arrow2::record_batch::RecordBatch; + +type SchemaRef = Arc; use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; @@ -221,8 +223,8 @@ impl TableProvider for MemTable { #[cfg(test)] mod tests { use super::*; - use arrow::array::Int32Array; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow2::array::Int32Array; + use arrow2::datatypes::{DataType, Field, Schema}; use futures::StreamExt; use std::collections::HashMap; diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index 30e47df5f6491..bd169eaf3d1f4 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::string::String; use std::sync::Arc; -use arrow::datatypes::*; +use arrow2::datatypes::*; use crate::datasource::datasource::Statistics; use crate::datasource::TableProvider; @@ -32,6 +32,8 @@ use crate::physical_plan::ExecutionPlan; use super::datasource::TableProviderFilterPushDown; +type SchemaRef = Arc; + /// Table-based representation of a `ParquetFile`. pub struct ParquetTable { path: String, @@ -106,11 +108,8 @@ impl TableProvider for ParquetTable { #[cfg(test)] mod tests { use super::*; - use arrow::array::{ - BinaryArray, BooleanArray, Float32Array, Float64Array, Int32Array, - TimestampNanosecondArray, - }; - use arrow::record_batch::RecordBatch; + use arrow2::array::*; + use arrow2::record_batch::RecordBatch; use futures::StreamExt; #[tokio::test] @@ -328,7 +327,7 @@ mod tests { } fn load_table(name: &str) -> Result> { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = arrow2::util::test_util::parquet_test_data(); let filename = format!("{}/{}", testdata, name); let table = ParquetTable::try_new(&filename, 2)?; Ok(Arc::new(table)) diff --git a/datafusion/src/error.rs b/datafusion/src/error.rs index 903faeabf6954..b7b656733fde0 100644 --- a/datafusion/src/error.rs +++ b/datafusion/src/error.rs @@ -22,8 +22,7 @@ use std::fmt::{Display, Formatter}; use std::io; use std::result; -use arrow::error::ArrowError; -use parquet::errors::ParquetError; +use arrow2::error::ArrowError; use sqlparser::parser::ParserError; /// Result type for operations that could result in an [DataFusionError] @@ -35,8 +34,6 @@ pub type Result = result::Result; pub enum DataFusionError { /// Error returned by arrow. ArrowError(ArrowError), - /// Wraps an error from the Parquet crate - ParquetError(ParquetError), /// Error associated to I/O operations and associated traits. IoError(io::Error), /// Error returned when SQL is syntactically incorrect. 
@@ -59,7 +56,7 @@ pub enum DataFusionError { } impl DataFusionError { - /// Wraps this [DataFusionError] as an [arrow::error::ArrowError]. + /// Wraps this [DataFusionError] as an [arrow2::error::ArrowError]. pub fn into_arrow_external_error(self) -> ArrowError { ArrowError::from_external_error(Box::new(self)) } @@ -77,12 +74,6 @@ impl From for DataFusionError { } } -impl From for DataFusionError { - fn from(e: ParquetError) -> Self { - DataFusionError::ParquetError(e) - } -} - impl From for DataFusionError { fn from(e: ParserError) -> Self { DataFusionError::SQL(e) @@ -93,9 +84,6 @@ impl Display for DataFusionError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match *self { DataFusionError::ArrowError(ref desc) => write!(f, "Arrow error: {}", desc), - DataFusionError::ParquetError(ref desc) => { - write!(f, "Parquet error: {}", desc) - } DataFusionError::IoError(ref desc) => write!(f, "IO error: {}", desc), DataFusionError::SQL(ref desc) => { write!(f, "SQL error: {:?}", desc) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index c394d3895622a..e7a177b6357cb 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -37,7 +37,7 @@ use std::{ use futures::{StreamExt, TryStreamExt}; use tokio::task::{self, JoinHandle}; -use arrow::csv; +use arrow2::io::csv; use crate::catalog::{ catalog::{CatalogProvider, MemoryCatalogProvider}, @@ -72,8 +72,7 @@ use crate::sql::{ }; use crate::variable::{VarProvider, VarType}; use crate::{dataframe::DataFrame, physical_plan::udaf::AggregateUDF}; -use parquet::arrow::ArrowWriter; -use parquet::file::properties::WriterProperties; +use arrow2::io::parquet; /// ExecutionContext is the main interface for executing queries with DataFusion. 
The context /// provides the following functionality: @@ -846,20 +845,18 @@ mod tests { datasource::MemTable, logical_plan::create_udaf, physical_plan::expressions::AvgAccumulator, }; - use arrow::array::{ - Array, ArrayRef, BinaryArray, DictionaryArray, Float64Array, Int32Array, - Int64Array, LargeBinaryArray, LargeStringArray, StringArray, - TimestampNanosecondArray, - }; - use arrow::compute::add; - use arrow::datatypes::*; - use arrow::record_batch::RecordBatch; + use arrow2::array::*; + use arrow2::datatypes::*; + use arrow2::record_batch::RecordBatch; use std::fs::File; use std::thread::{self, JoinHandle}; use std::{io::prelude::*, sync::Mutex}; use tempfile::TempDir; use test::*; + type ArrayRef = Arc; + type SchemaRef = Arc; + #[tokio::test] async fn parallel_projection() -> Result<()> { let partition_count = 4; @@ -1659,7 +1656,7 @@ mod tests { // C, 1 // A, 1 - let str_array: LargeStringArray = vec!["A", "B", "A", "A", "C", "A"] + let str_array: Utf8Array = vec!["A", "B", "A", "A", "C", "A"] .into_iter() .map(Some) .collect(); @@ -1699,7 +1696,7 @@ mod tests { #[tokio::test] async fn group_by_dictionary() { - async fn run_test_case() { + async fn run_test_case() { let mut ctx = ExecutionContext::new(); // input data looks like: @@ -1764,14 +1761,14 @@ mod tests { assert_batches_sorted_eq!(expected, &results); } - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; - run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; + run_test_case::().await; } async fn run_count_distinct_integers_aggregated_scenario( @@ -2335,7 +2332,7 @@ mod tests { .as_any() .downcast_ref::() .expect("cast failed"); - Ok(Arc::new(add(l, r)?) as ArrayRef) + Ok(Arc::new(add::add(l, r)?) as ArrayRef) }; let myfunc = make_scalar_function(myfunc); diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index 2a0c39aa48ebd..1f323ca2b3dcb 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -19,7 +19,7 @@ use std::sync::{Arc, Mutex}; -use crate::arrow::record_batch::RecordBatch; +use crate::arrow2::record_batch::RecordBatch; use crate::error::Result; use crate::execution::context::{ExecutionContext, ExecutionContextState}; use crate::logical_plan::{ @@ -182,7 +182,7 @@ mod tests { use crate::logical_plan::*; use crate::{datasource::csv::CsvReadOptions, physical_plan::ColumnarValue}; use crate::{physical_plan::functions::ScalarFunctionImplementation, test}; - use arrow::datatypes::DataType; + use arrow2::datatypes::DataType; #[test] fn select_columns() -> Result<()> { @@ -363,7 +363,7 @@ mod tests { fn register_aggregate_csv(ctx: &mut ExecutionContext) -> Result<()> { let schema = test::aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); ctx.register_csv( "aggregate_test_100", &format!("{}/csv/aggregate_test_100.csv", testdata), diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index 252d168114add..39fa0b0095000 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -39,7 +39,7 @@ //! ```rust //! # use datafusion::prelude::*; //! # use datafusion::error::Result; -//! # use arrow::record_batch::RecordBatch; +//! 
# use arrow2::record_batch::RecordBatch; //! //! # #[tokio::main] //! # async fn main() -> Result<()> { @@ -57,7 +57,7 @@ //! let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?; +//! let pretty_results = arrow2::util::pretty::pretty_format_batches(&results)?; //! //! let expected = vec![ //! "+---+--------+", @@ -77,7 +77,7 @@ //! ``` //! # use datafusion::prelude::*; //! # use datafusion::error::Result; -//! # use arrow::record_batch::RecordBatch; +//! # use arrow2::record_batch::RecordBatch; //! //! # #[tokio::main] //! # async fn main() -> Result<()> { @@ -92,7 +92,7 @@ //! let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?; +//! let pretty_results = arrow2::util::pretty::pretty_format_batches(&results)?; //! //! let expected = vec![ //! "+---+--------+", @@ -130,7 +130,7 @@ //! ### Logical plan //! //! Logical planning yields [`logical plans`](logical_plan::LogicalPlan) and [`logical expressions`](logical_plan::Expr). -//! These are [`Schema`](arrow::datatypes::Schema)-aware traits that represent statements whose result is independent of how it should physically be executed. +//! These are [`Schema`](arrow2::datatypes::Schema)-aware traits that represent statements whose result is independent of how it should physically be executed. //! //! A [`LogicalPlan`](logical_plan::LogicalPlan) is a Direct Asyclic graph of other [`LogicalPlan`s](logical_plan::LogicalPlan) and each node contains logical expressions ([`Expr`s](logical_plan::Expr)). //! All of these are located in [`logical_plan`](logical_plan). @@ -152,12 +152,12 @@ //! Broadly speaking, //! //! * an [`ExecutionPlan`](physical_plan::ExecutionPlan) receives a partition number and asyncronosly returns -//! an iterator over [`RecordBatch`](arrow::record_batch::RecordBatch) -//! (a node-specific struct that implements [`RecordBatchReader`](arrow::record_batch::RecordBatchReader)) -//! * a [`PhysicalExpr`](physical_plan::PhysicalExpr) receives a [`RecordBatch`](arrow::record_batch::RecordBatch) -//! and returns an [`Array`](arrow::array::Array) -//! * an [`AggregateExpr`](physical_plan::AggregateExpr) receives [`RecordBatch`es](arrow::record_batch::RecordBatch) -//! and returns a [`RecordBatch`](arrow::record_batch::RecordBatch) of a single row(*) +//! an iterator over [`RecordBatch`](arrow2::record_batch::RecordBatch) +//! (a node-specific struct that implements [`RecordBatchReader`](arrow2::record_batch::RecordBatchReader)) +//! * a [`PhysicalExpr`](physical_plan::PhysicalExpr) receives a [`RecordBatch`](arrow2::record_batch::RecordBatch) +//! and returns an [`Array`](arrow2::array::Array) +//! * an [`AggregateExpr`](physical_plan::AggregateExpr) receives [`RecordBatch`es](arrow2::record_batch::RecordBatch) +//! and returns a [`RecordBatch`](arrow2::record_batch::RecordBatch) of a single row(*) //! //! (*) Technically, it aggregates the results on each partition and then merges the results into a single partition. //! 
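The paragraph above describes the `ExecutionPlan` / `RecordBatch` contract. As a rough, non-authoritative illustration of how that contract is consumed (the helper itself is hypothetical; method names follow how they are used elsewhere in this crate):

```rust
use std::sync::Arc;
use futures::StreamExt;
use datafusion::error::Result;
use datafusion::physical_plan::ExecutionPlan;
use arrow2::record_batch::RecordBatch;

/// Drain every partition of a physical plan into a flat Vec of batches.
async fn collect_all(plan: Arc<dyn ExecutionPlan>) -> Result<Vec<RecordBatch>> {
    let mut batches = vec![];
    for partition in 0..plan.output_partitioning().partition_count() {
        // `execute` returns an async stream of RecordBatch for this partition.
        let mut stream = plan.execute(partition).await?;
        while let Some(batch) = stream.next().await {
            batches.push(batch?);
        }
    }
    Ok(batches)
}
```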
@@ -200,8 +200,7 @@ pub mod sql; pub mod variable; // re-export dependencies from arrow-rs to minimise version maintenance for crate users -pub use arrow; -pub use parquet; +pub use arrow2; #[cfg(test)] pub mod test; diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index b6017b743ed70..986799b4b00cd 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -19,11 +19,13 @@ use std::{collections::HashMap, sync::Arc}; -use arrow::{ - datatypes::{Schema, SchemaRef}, +use arrow2::{ + datatypes::Schema, record_batch::RecordBatch, }; +type SchemaRef = Arc; + use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; use crate::{ @@ -44,7 +46,7 @@ use std::collections::HashSet; /// # use datafusion::prelude::*; /// # use datafusion::logical_plan::LogicalPlanBuilder; /// # use datafusion::error::Result; -/// # use arrow::datatypes::{Schema, DataType, Field}; +/// # use arrow2::datatypes::{Schema, DataType, Field}; /// # /// # fn main() -> Result<()> { /// # @@ -416,7 +418,7 @@ fn validate_unique_names<'a>( #[cfg(test)] mod tests { - use arrow::datatypes::{DataType, Field}; + use arrow2::datatypes::{DataType, Field}; use super::super::{lit, sum}; use super::*; diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index 9adb22b43d075..651eecb9aa185 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ b/datafusion/src/logical_plan/dfschema.rs @@ -24,9 +24,11 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow2::datatypes::{DataType, Field, Schema}; use std::fmt::{Display, Formatter}; +type SchemaRef = Arc; + /// A reference-counted reference to a `DFSchema`. pub type DFSchemaRef = Arc; @@ -356,7 +358,7 @@ impl DFField { #[cfg(test)] mod tests { use super::*; - use arrow::datatypes::DataType; + use arrow2::datatypes::DataType; #[test] fn from_unqualified_field() { diff --git a/datafusion/src/logical_plan/display.rs b/datafusion/src/logical_plan/display.rs index 76749b547a8f8..5e081e5641533 100644 --- a/datafusion/src/logical_plan/display.rs +++ b/datafusion/src/logical_plan/display.rs @@ -17,7 +17,7 @@ //! This module provides logic for displaying LogicalPlans in various styles use super::{LogicalPlan, PlanVisitor}; -use arrow::datatypes::Schema; +use arrow2::datatypes::Schema; use std::fmt; /// Formats plans with a single line per node. For example: @@ -88,7 +88,7 @@ impl<'a, 'b> PlanVisitor for IndentVisitor<'a, 'b> { /// `foo:Utf8;N` if `foo` is nullable. 
/// /// ``` -/// use arrow::datatypes::{Field, Schema, DataType}; +/// use arrow2::datatypes::{Field, Schema, DataType}; /// # use datafusion::logical_plan::display_schema; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), @@ -245,7 +245,7 @@ impl<'a, 'b> PlanVisitor for GraphvizVisitor<'a, 'b> { #[cfg(test)] mod tests { - use arrow::datatypes::{DataType, Field}; + use arrow2::datatypes::{DataType, Field}; use super::*; diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index fa9b9e0a2490f..990880a3bf8eb 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -24,7 +24,7 @@ use std::fmt; use std::sync::Arc; use aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}; -use arrow::{compute::can_cast_types, datatypes::DataType}; +use arrow2::{compute::cast::can_cast_types, datatypes::DataType}; use crate::error::{DataFusionError, Result}; use crate::logical_plan::{DFField, DFSchema}; @@ -39,7 +39,7 @@ use std::collections::HashSet; /// represent logical expressions such as `A + 1`, or `CAST(c1 AS /// int)`. /// -/// An `Expr` can compute its [DataType](arrow::datatypes::DataType) +/// An `Expr` can compute its [DataType](arrow2::datatypes::DataType) /// and nullability, and has functions for building up complex /// expressions. /// @@ -211,11 +211,11 @@ pub enum Expr { } impl Expr { - /// Returns the [arrow::datatypes::DataType] of the expression based on [arrow::datatypes::Schema]. + /// Returns the [arrow2::datatypes::DataType] of the expression based on [arrow2::datatypes::Schema]. /// /// # Errors /// - /// This function errors when it is not possible to compute its [arrow::datatypes::DataType]. + /// This function errors when it is not possible to compute its [arrow2::datatypes::DataType]. /// This happens when e.g. the expression refers to a column that does not exist in the schema, or when /// the expression is incorrectly typed (e.g. `[utf8] + [bool]`). pub fn get_type(&self, schema: &DFSchema) -> Result { @@ -280,7 +280,7 @@ impl Expr { } } - /// Returns the nullability of the expression based on [arrow::datatypes::Schema]. + /// Returns the nullability of the expression based on [arrow2::datatypes::Schema]. /// /// # Errors /// @@ -336,14 +336,14 @@ impl Expr { } } - /// Returns the name of this expression based on [arrow::datatypes::Schema]. + /// Returns the name of this expression based on [arrow2::datatypes::Schema]. /// /// This represents how a column with this expression is named when no alias is chosen pub fn name(&self, input_schema: &DFSchema) -> Result { create_name(self, input_schema) } - /// Returns a [arrow::datatypes::Field] compatible with this expression. + /// Returns a [arrow2::datatypes::Field] compatible with this expression. pub fn to_field(&self, input_schema: &DFSchema) -> Result { Ok(DFField::new( None, //TODO qualifier @@ -353,12 +353,12 @@ impl Expr { )) } - /// Wraps this expression in a cast to a target [arrow::datatypes::DataType]. + /// Wraps this expression in a cast to a target [arrow2::datatypes::DataType]. /// /// # Errors /// /// This function errors when it is impossible to cast the - /// expression to the target [arrow::datatypes::DataType]. + /// expression to the target [arrow2::datatypes::DataType]. 
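`cast_to` below plans a cast only when arrow2 considers the conversion valid. A tiny sketch of that validity check, using the `can_cast_types` helper imported at the top of this file (the specific type pairs are illustrative; exact cast coverage depends on the pinned arrow2 revision):

```rust
use arrow2::compute::cast::can_cast_types;
use arrow2::datatypes::DataType;

fn examples() {
    // Numeric widening is accepted by the cast kernel.
    assert!(can_cast_types(&DataType::Int32, &DataType::Float64));
    // Formatting numbers as strings is likewise a supported cast.
    assert!(can_cast_types(&DataType::Int32, &DataType::Utf8));
}
```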
pub fn cast_to(self, cast_to_type: &DataType, schema: &DFSchema) -> Result { let this_type = self.get_type(schema)?; if this_type == *cast_to_type { diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 606ef1e222755..1cf8877f945a1 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -23,7 +23,7 @@ use std::{ sync::Arc, }; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow2::datatypes::{DataType, Field, Schema}; use crate::datasource::TableProvider; use crate::sql::parser::FileType; @@ -36,6 +36,8 @@ use super::{ }; use crate::logical_plan::dfschema::DFSchemaRef; +type SchemaRef = Arc; + /// Join type #[derive(Debug, Clone, Copy)] pub enum JoinType { @@ -464,7 +466,7 @@ impl LogicalPlan { /// ``` /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; + /// use arrow2::datatypes::{Field, Schema, DataType}; /// use datafusion::logical_plan::{lit, col, LogicalPlanBuilder}; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), @@ -505,7 +507,7 @@ impl LogicalPlan { /// ``` /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; + /// use arrow2::datatypes::{Field, Schema, DataType}; /// use datafusion::logical_plan::{lit, col, LogicalPlanBuilder}; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), @@ -545,7 +547,7 @@ impl LogicalPlan { /// structure, and one with additional details such as schema. /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; + /// use arrow2::datatypes::{Field, Schema, DataType}; /// use datafusion::logical_plan::{lit, col, LogicalPlanBuilder}; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), @@ -604,7 +606,7 @@ impl LogicalPlan { /// Projection: #id /// ``` /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; + /// use arrow2::datatypes::{Field, Schema, DataType}; /// use datafusion::logical_plan::{lit, col, LogicalPlanBuilder}; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index 71c84f6153b62..55d28b0fdb6c9 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -20,7 +20,7 @@ use std::sync::Arc; -use arrow::datatypes::DataType; +use arrow2::datatypes::DataType; use crate::error::Result; use crate::logical_plan::{DFSchemaRef, Expr, ExprRewriter, LogicalPlan, Operator}; @@ -216,7 +216,7 @@ mod tests { col, lit, max, min, DFField, DFSchema, LogicalPlanBuilder, }; - use arrow::datatypes::*; + use arrow2::datatypes::*; fn test_table_scan() -> Result { let schema = Schema::new(vec![ diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 4622e9fc62dc1..ec23b6e4e3208 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -439,7 +439,10 @@ mod tests { use crate::physical_plan::ExecutionPlan; use crate::test::*; use crate::{logical_plan::col, prelude::JoinType}; - use arrow::datatypes::SchemaRef; + use arrow2::datatypes::Schema; + use std::sync::Arc; + + type SchemaRef = Arc; fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { let rule = FilterPushDown::new(); @@ -929,10 +932,10 @@ mod tests { impl TableProvider for PushDownProvider { fn schema(&self) -> SchemaRef { - Arc::new(arrow::datatypes::Schema::new(vec![ - 
arrow::datatypes::Field::new( + Arc::new(arrow2::datatypes::Schema::new(vec![ + arrow2::datatypes::Field::new( "a", - arrow::datatypes::DataType::Int32, + arrow2::datatypes::DataType::Int32, true, ), ])) diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index 086e2f03196bd..38f9674fdfeb3 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -218,6 +218,9 @@ mod tests { logical_plan::{DFSchema, Expr}, test::*, }; + use arrow2::datatypes::Schema; + + type SchemaRef = Arc; struct TestTableProvider { num_rows: usize, @@ -227,7 +230,7 @@ mod tests { fn as_any(&self) -> &dyn std::any::Any { unimplemented!() } - fn schema(&self) -> arrow::datatypes::SchemaRef { + fn schema(&self) -> SchemaRef { unimplemented!() } diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 7243fa52d9b32..02b8e66a75b0a 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -22,8 +22,8 @@ use crate::error::Result; use crate::logical_plan::{DFField, DFSchema, DFSchemaRef, LogicalPlan, ToDFSchema}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; -use arrow::datatypes::Schema; -use arrow::error::Result as ArrowResult; +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; use std::{collections::HashSet, sync::Arc}; use utils::optimize_explain; @@ -297,7 +297,7 @@ mod tests { use crate::logical_plan::{col, lit}; use crate::logical_plan::{max, min, Expr, LogicalPlanBuilder}; use crate::test::*; - use arrow::datatypes::DataType; + use arrow2::datatypes::DataType; #[test] fn aggregate_no_group_by() -> Result<()> { diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 0ec3fa7c02a16..b05a0ee0c4603 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -19,7 +19,7 @@ use std::{collections::HashSet, sync::Arc}; -use arrow::datatypes::Schema; +use arrow2::datatypes::Schema; use super::optimizer::OptimizerRule; use crate::logical_plan::{ @@ -415,7 +415,7 @@ pub fn rewrite_expression(expr: &Expr, expressions: &[Expr]) -> Result { mod tests { use super::*; use crate::logical_plan::{col, LogicalPlanBuilder}; - use arrow::datatypes::DataType; + use arrow2::datatypes::DataType; use std::collections::HashSet; #[test] diff --git a/datafusion/src/physical_optimizer/repartition.rs b/datafusion/src/physical_optimizer/repartition.rs index 82f46f9cbbbb6..913660b8b6121 100644 --- a/datafusion/src/physical_optimizer/repartition.rs +++ b/datafusion/src/physical_optimizer/repartition.rs @@ -103,7 +103,7 @@ impl PhysicalOptimizerRule for Repartition { } #[cfg(test)] mod tests { - use arrow::datatypes::Schema; + use arrow2::datatypes::Schema; use super::*; use crate::datasource::datasource::Statistics; diff --git a/datafusion/src/physical_plan/aggregates.rs b/datafusion/src/physical_plan/aggregates.rs index 9417c7c8f05a5..94266fc1255d9 100644 --- a/datafusion/src/physical_plan/aggregates.rs +++ b/datafusion/src/physical_plan/aggregates.rs @@ -34,7 +34,7 @@ use super::{ use crate::error::{DataFusionError, Result}; use crate::physical_plan::distinct_expressions; use crate::physical_plan::expressions; -use arrow::datatypes::{DataType, Schema, TimeUnit}; +use arrow2::datatypes::{DataType, Schema, TimeUnit}; use expressions::{avg_return_type, sum_return_type}; use std::{fmt, 
str::FromStr, sync::Arc}; diff --git a/datafusion/src/physical_plan/array_expressions.rs b/datafusion/src/physical_plan/array_expressions.rs index a7e03b70e5d21..899bec92ec193 100644 --- a/datafusion/src/physical_plan/array_expressions.rs +++ b/datafusion/src/physical_plan/array_expressions.rs @@ -18,12 +18,14 @@ //! Array expressions use crate::error::{DataFusionError, Result}; -use arrow::array::*; -use arrow::datatypes::DataType; +use arrow2::array::*; +use arrow2::datatypes::DataType; use std::sync::Arc; use super::ColumnarValue; +type ArrayRef = Arc; + macro_rules! downcast_vec { ($ARGS:expr, $ARRAY_TYPE:ident) => {{ $ARGS diff --git a/datafusion/src/physical_plan/coalesce_batches.rs b/datafusion/src/physical_plan/coalesce_batches.rs index b91e0b672eb58..3d2eadb10d1c9 100644 --- a/datafusion/src/physical_plan/coalesce_batches.rs +++ b/datafusion/src/physical_plan/coalesce_batches.rs @@ -28,10 +28,11 @@ use crate::physical_plan::{ ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, }; -use arrow::compute::kernels::concat::concat; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::compute::concat::concatenate; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use async_trait::async_trait; use futures::stream::{Stream, StreamExt}; use log::debug; @@ -222,12 +223,13 @@ pub fn concat_batches( } let mut arrays = Vec::with_capacity(schema.fields().len()); for i in 0..schema.fields().len() { - let array = concat( + let array = concatenate( &batches .iter() .map(|batch| batch.column(i).as_ref()) .collect::>(), - )?; + )? + .into(); arrays.push(array); } debug!( @@ -242,8 +244,8 @@ pub fn concat_batches( mod tests { use super::*; use crate::physical_plan::{memory::MemoryExec, repartition::RepartitionExec}; - use arrow::array::UInt32Array; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow2::array::UInt32Array; + use arrow2::datatypes::{DataType, Field, Schema}; #[tokio::test(flavor = "multi_thread")] async fn test_concat_batches() -> Result<()> { diff --git a/datafusion/src/physical_plan/common.rs b/datafusion/src/physical_plan/common.rs index 9de7ee2a32dd8..137ce347b3109 100644 --- a/datafusion/src/physical_plan/common.rs +++ b/datafusion/src/physical_plan/common.rs @@ -25,9 +25,10 @@ use std::task::{Context, Poll}; use super::{RecordBatchStream, SendableRecordBatchStream}; use crate::error::{DataFusionError, Result}; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use futures::{Stream, TryStreamExt}; /// Stream of record batches diff --git a/datafusion/src/physical_plan/cross_join.rs b/datafusion/src/physical_plan/cross_join.rs index 4372352d6ecf9..e8eec1011e401 100644 --- a/datafusion/src/physical_plan/cross_join.rs +++ b/datafusion/src/physical_plan/cross_join.rs @@ -22,9 +22,9 @@ use futures::{lock::Mutex, StreamExt}; use std::{any::Any, sync::Arc, task::Poll}; use crate::physical_plan::memory::MemoryStream; -use arrow::datatypes::{Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use futures::{Stream, TryStreamExt}; @@ -40,6 
+40,8 @@ use super::{ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchS use crate::physical_plan::coalesce_batches::concat_batches; use log::debug; +type SchemaRef = Arc; + /// Data of the left side type JoinLeftData = RecordBatch; diff --git a/datafusion/src/physical_plan/crypto_expressions.rs b/datafusion/src/physical_plan/crypto_expressions.rs index 8ad876b24d0ce..07a68d30207ce 100644 --- a/datafusion/src/physical_plan/crypto_expressions.rs +++ b/datafusion/src/physical_plan/crypto_expressions.rs @@ -28,8 +28,8 @@ use crate::{ error::{DataFusionError, Result}, scalar::ScalarValue, }; -use arrow::{ - array::{Array, BinaryArray, GenericStringArray, StringOffsetSizeTrait}, +use arrow2::{ + array::{Array, BinaryArray, Offset, Utf8Array}, datatypes::DataType, }; @@ -60,15 +60,15 @@ fn sha_process(input: &str) -> SHA2DigestOutput { /// # Errors /// This function errors when: /// * the number of arguments is not 1 -/// * the first argument is not castable to a `GenericStringArray` +/// * the first argument is not castable to a `Utf8Array` fn unary_binary_function( args: &[&dyn Array], op: F, name: &str, -) -> Result +) -> Result> where R: AsRef<[u8]>, - T: StringOffsetSizeTrait, + T: Offset, F: Fn(&str) -> R, { if args.len() != 1 { @@ -81,7 +81,7 @@ where let array = args[0] .as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal("failed to downcast to string".to_string()) })?; @@ -137,9 +137,7 @@ where } } -fn md5_array( - args: &[&dyn Array], -) -> Result> { +fn md5_array(args: &[&dyn Array]) -> Result> { unary_string_function::(args, md5_process, "md5") } diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 7ee5ae3fd90b0..499b2bd7bf71c 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -26,12 +26,14 @@ use std::task::{Context, Poll}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::ExecutionPlan; use crate::physical_plan::{common, Partitioning}; -use arrow::csv; -use arrow::datatypes::{Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; +use arrow2::io::csv; +use arrow2::record_batch::RecordBatch; use futures::Stream; +type SchemaRef = Arc; + use super::{RecordBatchStream, SendableRecordBatchStream}; use async_trait::async_trait; @@ -346,7 +348,7 @@ mod tests { #[tokio::test] async fn csv_exec_with_projection() -> Result<()> { let schema = aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let csv = CsvExec::try_new( @@ -374,7 +376,7 @@ mod tests { #[tokio::test] async fn csv_exec_without_projection() -> Result<()> { let schema = aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let csv = CsvExec::try_new( diff --git a/datafusion/src/physical_plan/datetime_expressions.rs b/datafusion/src/physical_plan/datetime_expressions.rs index 7b5816186f27e..147f839e72de4 100644 --- a/datafusion/src/physical_plan/datetime_expressions.rs +++ b/datafusion/src/physical_plan/datetime_expressions.rs @@ -21,24 +21,22 @@ use std::sync::Arc; use super::ColumnarValue; 
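The datetime hunks that follow rework `to_timestamp` around a string-to-nanoseconds conversion. A minimal sketch of that conversion for the strict RFC3339 case (the crate's `string_to_timestamp_nanos` additionally accepts ISO-8601 variants without an explicit timezone, which this sketch does not):

```rust
use chrono::DateTime;
use datafusion::error::{DataFusionError, Result};

/// Parse a strict RFC3339 timestamp into nanoseconds since the Unix epoch.
fn rfc3339_to_nanos(s: &str) -> Result<i64> {
    let dt = DateTime::parse_from_rfc3339(s).map_err(|e| {
        DataFusionError::Execution(format!("'{}' is not a valid timestamp: {}", s, e))
    })?;
    Ok(dt.timestamp_nanos())
}

// The value exercised by the tests further down:
// rfc3339_to_nanos("2020-09-08T13:42:29.190855Z") == Ok(1_599_572_549_190_855_000)
```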
use crate::{ error::{DataFusionError, Result}, - scalar::{ScalarType, ScalarValue}, + scalar::ScalarValue, }; -use arrow::{ - array::{Array, ArrayRef, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait}, - datatypes::{ArrowPrimitiveType, DataType, TimestampNanosecondType}, +use arrow2::{ + array::*, + datatypes::{DataType, TimeUnit}, + types::NativeType, }; -use arrow::{ - array::{ - Date32Array, Date64Array, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, - }, - compute::kernels::temporal, - datatypes::TimeUnit, - temporal_conversions::timestamp_ns_to_datetime, -}; -use chrono::prelude::*; +use arrow2::{compute::temporal, temporal_conversions::timestamp_ns_to_datetime}; +use chrono::prelude::{DateTime, Local, NaiveDateTime, Utc}; +use chrono::Datelike; use chrono::Duration; use chrono::LocalResult; +use chrono::TimeZone; +use chrono::Timelike; + +type ArrayRef = Arc; #[inline] /// Accepts a string in RFC3339 / ISO8601 standard format and some @@ -185,17 +183,18 @@ fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result /// # Errors /// This function errors iff: /// * the number of arguments is not 1 or -/// * the first argument is not castable to a `GenericStringArray` or +/// * the first argument is not castable to a `Utf8Array` or /// * the function `op` errors pub(crate) fn unary_string_to_primitive_function<'a, T, O, F>( args: &[&'a dyn Array], op: F, name: &str, + data_type: DataType, ) -> Result> where - O: ArrowPrimitiveType, - T: StringOffsetSizeTrait, - F: Fn(&'a str) -> Result, + O: NativeType, + T: Offset, + F: Fn(&'a str) -> Result, { if args.len() != 1 { return Err(DataFusionError::Internal(format!( @@ -207,13 +206,17 @@ where let array = args[0] .as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal("failed to downcast to string".to_string()) })?; // first map is the iterator, second is for the `Option<_>` - array.iter().map(|x| x.map(|x| op(x)).transpose()).collect() + array + .iter() + .map(|x| x.map(|x| op(x)).transpose()) + .collect::>>() + .map(|x| x.to(data_type)) } // given an function that maps a `&str` to a arrow native type, @@ -223,19 +226,30 @@ fn handle<'a, O, F, S>( args: &'a [ColumnarValue], op: F, name: &str, + data_type: DataType, ) -> Result where - O: ArrowPrimitiveType, - S: ScalarType, - F: Fn(&'a str) -> Result, + O: NativeType, + S: NativeType, + F: Fn(&'a str) -> Result, { match &args[0] { ColumnarValue::Array(a) => match a.data_type() { DataType::Utf8 => Ok(ColumnarValue::Array(Arc::new( - unary_string_to_primitive_function::(&[a.as_ref()], op, name)?, + unary_string_to_primitive_function::( + &[a.as_ref()], + op, + name, + data_type, + )?, ))), DataType::LargeUtf8 => Ok(ColumnarValue::Array(Arc::new( - unary_string_to_primitive_function::(&[a.as_ref()], op, name)?, + unary_string_to_primitive_function::( + &[a.as_ref()], + op, + name, + data_type, + )?, ))), other => Err(DataFusionError::Internal(format!( "Unsupported data type {:?} for function {}", @@ -245,7 +259,7 @@ where ColumnarValue::Scalar(scalar) => match scalar { ScalarValue::Utf8(a) => { let result = a.as_ref().map(|x| (op)(x)).transpose()?; - Ok(ColumnarValue::Scalar(S::scalar(result))) + Ok(ColumnarValue::Scalar(result)) } ScalarValue::LargeUtf8(a) => { let result = a.as_ref().map(|x| (op)(x)).transpose()?; @@ -261,10 +275,11 @@ where /// to_timestamp SQL function pub fn to_timestamp(args: &[ColumnarValue]) -> Result { - handle::( + handle::( args, 
string_to_timestamp_nanos, "to_timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, None), ) } @@ -320,12 +335,12 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result { )); }; - let f = |x: Option| x.map(|x| date_trunc_single(granularity, x)).transpose(); + let f = |x: Option<&i64>| x.map(|x| date_trunc_single(granularity, *x)).transpose(); Ok(match array { ColumnarValue::Scalar(scalar) => { if let ScalarValue::TimestampNanosecond(v) = scalar { - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond((f)(*v)?)) + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond((f)(v.as_ref())?)) } else { return Err(DataFusionError::Execution( "array of `date_trunc` must be non-null scalar Utf8".to_string(), @@ -335,67 +350,19 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result { ColumnarValue::Array(array) => { let array = array .as_any() - .downcast_ref::() + .downcast_ref::>() .unwrap(); let array = array .iter() .map(f) - .collect::>()?; + .collect::>>()? + .to(DataType::Int64); ColumnarValue::Array(Arc::new(array)) } }) } -macro_rules! extract_date_part { - ($ARRAY: expr, $FN:expr) => { - match $ARRAY.data_type() { - DataType::Date32 => { - let array = $ARRAY.as_any().downcast_ref::().unwrap(); - Ok($FN(array)?) - } - DataType::Date64 => { - let array = $ARRAY.as_any().downcast_ref::().unwrap(); - Ok($FN(array)?) - } - DataType::Timestamp(time_unit, None) => match time_unit { - TimeUnit::Second => { - let array = $ARRAY - .as_any() - .downcast_ref::() - .unwrap(); - Ok($FN(array)?) - } - TimeUnit::Millisecond => { - let array = $ARRAY - .as_any() - .downcast_ref::() - .unwrap(); - Ok($FN(array)?) - } - TimeUnit::Microsecond => { - let array = $ARRAY - .as_any() - .downcast_ref::() - .unwrap(); - Ok($FN(array)?) - } - TimeUnit::Nanosecond => { - let array = $ARRAY - .as_any() - .downcast_ref::() - .unwrap(); - Ok($FN(array)?) 
- } - }, - datatype => Err(DataFusionError::Internal(format!( - "Extract does not support datatype {:?}", - datatype - ))), - } - }; -} - /// DATE_PART SQL function pub fn date_part(args: &[ColumnarValue]) -> Result { if args.len() != 2 { @@ -421,8 +388,8 @@ pub fn date_part(args: &[ColumnarValue]) -> Result { }; let arr = match date_part.to_lowercase().as_str() { - "hour" => extract_date_part!(array, temporal::hour), - "year" => extract_date_part!(array, temporal::year), + "hour" => temporal::hour(&array), + "year" => temporal::year(&array), _ => Err(DataFusionError::Execution(format!( "Date part '{}' not supported", date_part @@ -443,7 +410,8 @@ pub fn date_part(args: &[ColumnarValue]) -> Result { mod tests { use std::sync::Arc; - use arrow::array::{ArrayRef, Int64Array, StringBuilder}; + use arrow2::array::*; + use arrow2::datatypes::*; use super::*; @@ -451,18 +419,15 @@ mod tests { fn to_timestamp_arrays_and_nulls() -> Result<()> { // ensure that arrow array implementation is wired up and handles nulls correctly - let mut string_builder = StringBuilder::new(2); - let mut ts_builder = TimestampNanosecondArray::builder(2); + let string_array = + Utf8Array::::from(&vec![Some("2020-09-08T13:42:29.190855Z"), None]); - string_builder.append_value("2020-09-08T13:42:29.190855Z")?; - ts_builder.append_value(1599572549190855000)?; + let ts_array = Primitive::::from(&[Some(1599572549190855000), None]) + .to(DataType::Timestampm(TimeUnit::Nanosecond, None)); - string_builder.append_null()?; - ts_builder.append_null()?; - let expected_timestamps = &ts_builder.finish() as &dyn Array; + let expected_timestamps = &ts_array as &dyn Array; - let string_array = - ColumnarValue::Array(Arc::new(string_builder.finish()) as ArrayRef); + let string_array = ColumnarValue::Array(Arc::new(string_array) as ArrayRef); let parsed_timestamps = to_timestamp(&[string_array]) .expect("that to_timestamp parsed values without error"); if let ColumnarValue::Array(parsed_array) = parsed_timestamps { diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index 8534e9c8805cf..aa6a81d0f87e7 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -23,7 +23,10 @@ use std::fmt::Debug; use std::hash::Hash; use std::sync::Arc; -use arrow::datatypes::{DataType, Field}; +use arrow2::array::Array; +use arrow2::datatypes::{DataType, Field}; + +type ArrayRef = Arc; use ahash::RandomState; use std::collections::HashSet; @@ -195,13 +198,13 @@ impl Accumulator for DistinctCountAccumulator { mod tests { use super::*; - use arrow::array::ArrayRef; - use arrow::array::{ + use arrow2::array::Array; + use arrow2::array::{ Int16Array, Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; - use arrow::array::{Int32Builder, ListBuilder, UInt64Builder}; - use arrow::datatypes::DataType; + use arrow2::array::{Int32Builder, ListBuilder, UInt64Builder}; + use arrow2::datatypes::DataType; macro_rules! 
build_list { ($LISTS:expr, $BUILDER_TYPE:ident) => {{ diff --git a/datafusion/src/physical_plan/empty.rs b/datafusion/src/physical_plan/empty.rs index 3011b289507ff..e4e18d5d75b4e 100644 --- a/datafusion/src/physical_plan/empty.rs +++ b/datafusion/src/physical_plan/empty.rs @@ -23,14 +23,16 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result}; use crate::physical_plan::memory::MemoryStream; use crate::physical_plan::{Distribution, ExecutionPlan, Partitioning}; -use arrow::array::NullArray; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; +use arrow2::array::NullArray; +use arrow2::datatypes::{DataType, Field, Schema}; +use arrow2::record_batch::RecordBatch; use super::SendableRecordBatchStream; use async_trait::async_trait; +type SchemaRef = Arc; + /// Execution plan for empty relation (produces no rows) #[derive(Debug)] pub struct EmptyExec { @@ -108,7 +110,7 @@ impl ExecutionPlan for EmptyExec { DataType::Null, true, )])), - vec![Arc::new(NullArray::new(1))], + vec![Arc::new(NullArray::from_data(1))], )?] } else { vec![] diff --git a/datafusion/src/physical_plan/explain.rs b/datafusion/src/physical_plan/explain.rs index 26d2c94dc80a4..2ddd0cad93e57 100644 --- a/datafusion/src/physical_plan/explain.rs +++ b/datafusion/src/physical_plan/explain.rs @@ -25,7 +25,9 @@ use crate::{ logical_plan::StringifiedPlan, physical_plan::{common::SizedRecordBatchStream, ExecutionPlan}, }; -use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch}; +use arrow2::{array::*, datatypes::Schema, record_batch::RecordBatch}; + +type SchemaRef = Arc; use crate::physical_plan::Partitioning; @@ -101,20 +103,19 @@ impl ExecutionPlan for ExplainExec { ))); } - let mut type_builder = StringBuilder::new(self.stringified_plans.len()); - let mut plan_builder = StringBuilder::new(self.stringified_plans.len()); + let mut type_builder = + Utf8Primitive::::with_capacity(self.stringified_plans.len()); + let mut plan_builder = + Utf8Primitive::::with_capacity(self.stringified_plans.len()); for p in &self.stringified_plans { - type_builder.append_value(&String::from(&p.plan_type))?; - plan_builder.append_value(&*p.plan)?; + type_builder.push(Some(&p.plan_type)); + plan_builder.push(Some(&p.plan)); } let record_batch = RecordBatch::try_new( self.schema.clone(), - vec![ - Arc::new(type_builder.finish()), - Arc::new(plan_builder.finish()), - ], + vec![Arc::new(type_builder.into()), Arc::new(plan_builder.into())], )?; Ok(Box::pin(SizedRecordBatchStream::new( diff --git a/datafusion/src/physical_plan/expressions/average.rs b/datafusion/src/physical_plan/expressions/average.rs index 38644129dcd09..ccb80fa9eb456 100644 --- a/datafusion/src/physical_plan/expressions/average.rs +++ b/datafusion/src/physical_plan/expressions/average.rs @@ -24,13 +24,15 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::datatypes::DataType; -use arrow::{ - array::{ArrayRef, UInt64Array}, +use arrow2::compute; +use arrow2::datatypes::DataType; +use arrow2::{ + array::{Array, UInt64Array}, datatypes::Field, }; +type ArrayRef = Arc; + use super::{format_state_name, sum}; /// AVG aggregate expression @@ -146,7 +148,7 @@ impl Accumulator for AvgAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let values = &values[0]; - self.count += (values.len() - values.data().null_count()) as u64; + 
self.count += (values.len() - values.null_count()) as u64; self.sum = sum::sum(&self.sum, &sum::sum_batch(values)?)?; Ok(()) } @@ -168,7 +170,7 @@ impl Accumulator for AvgAccumulator { fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { let counts = states[0].as_any().downcast_ref::().unwrap(); // counts are summed - self.count += compute::sum(counts).unwrap_or(0); + self.count += compute::aggregate::sum(counts).unwrap_or(0); // sums are summed self.sum = sum::sum(&self.sum, &sum::sum_batch(&states[1])?)?; @@ -192,8 +194,8 @@ mod tests { use super::*; use crate::physical_plan::expressions::col; use crate::{error::Result, generic_test_op}; - use arrow::record_batch::RecordBatch; - use arrow::{array::*, datatypes::*}; + use arrow2::record_batch::RecordBatch; + use arrow2::{array::*, datatypes::*}; #[test] fn avg_i32() -> Result<()> { diff --git a/datafusion/src/physical_plan/expressions/binary.rs b/datafusion/src/physical_plan/expressions/binary.rs index 5c2d9ce02f51f..fa40563832032 100644 --- a/datafusion/src/physical_plan/expressions/binary.rs +++ b/datafusion/src/physical_plan/expressions/binary.rs @@ -17,25 +17,26 @@ use std::{any::Any, sync::Arc}; -use arrow::array::*; -use arrow::compute::kernels::arithmetic::{ - add, divide, divide_scalar, multiply, subtract, -}; -use arrow::compute::kernels::boolean::{and_kleene, or_kleene}; -use arrow::compute::kernels::comparison::{eq, gt, gt_eq, lt, lt_eq, neq}; -use arrow::compute::kernels::comparison::{ +use arrow2::array::*; +use arrow2::compute::arithmetics::{add, divide, divide_scalar, multiply, subtract}; +use arrow2::compute::boolean_kleene::{and as and_kleene, or as or_kleene}; +use arrow2::compute::comparison::{eq, gt, gt_eq, lt, lt_eq, neq}; +use arrow2::compute::comparison::{ eq_scalar, gt_eq_scalar, gt_scalar, lt_eq_scalar, lt_scalar, neq_scalar, }; -use arrow::compute::kernels::comparison::{ +use arrow2::compute::comparison::{ eq_utf8, gt_eq_utf8, gt_utf8, like_utf8, like_utf8_scalar, lt_eq_utf8, lt_utf8, neq_utf8, nlike_utf8, nlike_utf8_scalar, }; -use arrow::compute::kernels::comparison::{ +use arrow2::compute::comparison::{ eq_utf8_scalar, gt_eq_utf8_scalar, gt_utf8_scalar, lt_eq_utf8_scalar, lt_utf8_scalar, neq_utf8_scalar, }; -use arrow::datatypes::{DataType, Schema, TimeUnit}; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::{DataType, Schema, TimeUnit}; +use arrow2::record_batch::RecordBatch; + +use super::ArrayRef; +type StringArray = Utf8Array; use crate::error::{DataFusionError, Result}; use crate::logical_plan::Operator; @@ -567,8 +568,8 @@ pub fn binary( #[cfg(test)] mod tests { - use arrow::datatypes::{ArrowNumericType, Field, Int32Type, SchemaRef}; - use arrow::util::display::array_value_to_string; + use arrow2::datatypes::*; + use arrow2::util::display::array_value_to_string; use super::*; use crate::error::Result; @@ -756,7 +757,7 @@ mod tests { StringArray, DataType::Utf8, vec!["1994-12-13", "1995-01-26"], - Date32Array, + Int32Array, DataType::Date32, vec![9112, 9156], Operator::Eq, @@ -780,7 +781,7 @@ mod tests { StringArray, DataType::Utf8, vec!["1994-12-13T12:34:56", "1995-01-26T01:23:45"], - Date64Array, + Int64Array, DataType::Date64, vec![787322096000, 791083425000], Operator::Eq, @@ -792,7 +793,7 @@ mod tests { StringArray, DataType::Utf8, vec!["1994-12-13T12:34:56", "1995-01-26T01:23:45"], - Date64Array, + Int64Array, DataType::Date64, vec![787322096001, 791083424999], Operator::Lt, @@ -817,7 +818,7 @@ mod tests { // build dictionary let keys_builder = PrimitiveBuilder::::new(10); - 
let values_builder = arrow::array::StringBuilder::new(10); + let values_builder = arrow2::array::StringBuilder::new(10); let mut dict_builder = StringDictionaryBuilder::new(keys_builder, values_builder); dict_builder.append("one")?; @@ -875,7 +876,7 @@ mod tests { fn array_to_string(array: &ArrayRef) -> Result { let s = (0..array.len()) .map(|i| array_value_to_string(array, i)) - .collect::, arrow::error::ArrowError>>()? + .collect::, arrow2::error::ArrowError>>()? .join("\n"); Ok(s) } diff --git a/datafusion/src/physical_plan/expressions/case.rs b/datafusion/src/physical_plan/expressions/case.rs index e8c500e5ed62b..14d204b4dca23 100644 --- a/datafusion/src/physical_plan/expressions/case.rs +++ b/datafusion/src/physical_plan/expressions/case.rs @@ -17,13 +17,15 @@ use std::{any::Any, sync::Arc}; -use arrow::array::{self, *}; -use arrow::datatypes::{DataType, Schema}; -use arrow::record_batch::RecordBatch; +use arrow2::array::*; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::record_batch::RecordBatch; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ColumnarValue, PhysicalExpr}; +use super::ArrayRef; + /// The CASE expression is similar to a series of nested if/else and there are two forms that /// can be used. The first form consists of a series of boolean "when" expressions with /// corresponding "then" expressions, and an optional "else" expression. @@ -461,8 +463,8 @@ mod tests { physical_plan::expressions::{binary, col, lit}, scalar::ScalarValue, }; - use arrow::array::StringArray; - use arrow::datatypes::*; + use arrow2::array::StringArray; + use arrow2::datatypes::*; #[test] fn case_with_expr() -> Result<()> { diff --git a/datafusion/src/physical_plan/expressions/cast.rs b/datafusion/src/physical_plan/expressions/cast.rs index ba395f54d917c..ffe4bff091353 100644 --- a/datafusion/src/physical_plan/expressions/cast.rs +++ b/datafusion/src/physical_plan/expressions/cast.rs @@ -23,15 +23,9 @@ use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use crate::physical_plan::PhysicalExpr; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::compute::kernels; -use arrow::compute::CastOptions; -use arrow::datatypes::{DataType, Schema}; -use arrow::record_batch::RecordBatch; -use compute::can_cast_types; - -/// provide Datafusion default cast options -pub const DEFAULT_DATAFUSION_CAST_OPTIONS: CastOptions = CastOptions { safe: false }; +use arrow2::compute::cast; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::record_batch::RecordBatch; /// CAST expression casts an expression to a specific data type and returns a runtime error on invalid cast #[derive(Debug)] @@ -40,22 +34,12 @@ pub struct CastExpr { expr: Arc, /// The data type to cast to cast_type: DataType, - /// Cast options - cast_options: CastOptions, } impl CastExpr { /// Create a new CastExpr - pub fn new( - expr: Arc, - cast_type: DataType, - cast_options: CastOptions, - ) -> Self { - Self { - expr, - cast_type, - cast_options, - } + pub fn new(expr: Arc, cast_type: DataType) -> Self { + Self { expr, cast_type } } /// The expression to cast @@ -92,20 +76,13 @@ impl PhysicalExpr for CastExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { let value = self.expr.evaluate(batch)?; match value { - ColumnarValue::Array(array) => { - Ok(ColumnarValue::Array(kernels::cast::cast_with_options( - &array, - &self.cast_type, - &self.cast_options, - )?)) - } + ColumnarValue::Array(array) => Ok(ColumnarValue::Array( + cast::cast(array.as_ref(), &self.cast_type)?.into(), + 
)), ColumnarValue::Scalar(scalar) => { let scalar_array = scalar.to_array(); - let cast_array = kernels::cast::cast_with_options( - &scalar_array, - &self.cast_type, - &self.cast_options, - )?; + let cast_array = + cast::cast(scalar_array.as_ref(), &self.cast_type)?.into(); let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; Ok(ColumnarValue::Scalar(cast_scalar)) } @@ -121,13 +98,12 @@ pub fn cast_with_options( expr: Arc, input_schema: &Schema, cast_type: DataType, - cast_options: CastOptions, ) -> Result> { let expr_type = expr.data_type(input_schema)?; if expr_type == cast_type { Ok(expr.clone()) - } else if can_cast_types(&expr_type, &cast_type) { - Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options))) + } else if cast::can_cast_types(&expr_type, &cast_type) { + Ok(Arc::new(CastExpr::new(expr, cast_type))) } else { Err(DataFusionError::Internal(format!( "Unsupported CAST from {:?} to {:?}", @@ -145,12 +121,7 @@ pub fn cast( input_schema: &Schema, cast_type: DataType, ) -> Result> { - cast_with_options( - expr, - input_schema, - cast_type, - DEFAULT_DATAFUSION_CAST_OPTIONS, - ) + cast_with_options(expr, input_schema, cast_type) } #[cfg(test)] @@ -158,11 +129,9 @@ mod tests { use super::*; use crate::error::Result; use crate::physical_plan::expressions::col; - use arrow::array::{StringArray, Time64NanosecondArray}; - use arrow::{ - array::{Array, Int32Array, Int64Array, TimestampNanosecondArray, UInt32Array}, - datatypes::*, - }; + use arrow2::{array::*, datatypes::*}; + + type StringArray = Utf8Array; // runs an end-to-end test of physical type cast // 1. construct a record batch with a column "a" of type A @@ -171,14 +140,14 @@ mod tests { // 4. verify that the resulting expression is of type B // 5. verify that the resulting values are downcastable and correct macro_rules! 
generic_test_cast { - ($A_ARRAY:ident, $A_TYPE:expr, $A_VEC:expr, $TYPEARRAY:ident, $TYPE:expr, $VEC:expr, $CAST_OPTIONS:expr) => {{ + ($A_ARRAY:ident, $A_TYPE:expr, $A_VEC:expr, $TYPEARRAY:ident, $TYPE:expr, $VEC:expr) => {{ let schema = Schema::new(vec![Field::new("a", $A_TYPE, false)]); let a = $A_ARRAY::from($A_VEC); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; // verify that we can construct the expression - let expression = cast_with_options(col("a"), &schema, $TYPE, $CAST_OPTIONS)?; + let expression = cast_with_options(col("a"), &schema, $TYPE)?; // verify that its display is correct assert_eq!(format!("CAST(a AS {:?})", $TYPE), format!("{}", expression)); @@ -225,8 +194,7 @@ mod tests { Some(3_u32), Some(4_u32), Some(5_u32) - ], - DEFAULT_DATAFUSION_CAST_OPTIONS + ] ); Ok(()) } @@ -239,8 +207,7 @@ mod tests { vec![1, 2, 3, 4, 5], StringArray, DataType::Utf8, - vec![Some("1"), Some("2"), Some("3"), Some("4"), Some("5")], - DEFAULT_DATAFUSION_CAST_OPTIONS + vec![Some("1"), Some("2"), Some("3"), Some("4"), Some("5")] ); Ok(()) } @@ -249,18 +216,14 @@ mod tests { #[test] fn test_cast_i64_t64() -> Result<()> { let original = vec![1, 2, 3, 4, 5]; - let expected: Vec> = original - .iter() - .map(|i| Some(Time64NanosecondArray::from(vec![*i]).value(0))) - .collect(); + let expected: Vec> = original.iter().map(|i| Some(*i)).collect(); generic_test_cast!( Int64Array, DataType::Int64, original.clone(), - TimestampNanosecondArray, + Int64Array, DataType::Timestamp(TimeUnit::Nanosecond, None), - expected, - DEFAULT_DATAFUSION_CAST_OPTIONS + expected ); Ok(()) } @@ -280,19 +243,14 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]); let a = StringArray::from(vec!["9.1"]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - let expression = cast_with_options( - col("a"), - &schema, - DataType::Int32, - DEFAULT_DATAFUSION_CAST_OPTIONS, - )?; + let expression = cast_with_options(col("a"), &schema, DataType::Int32)?; let result = expression.evaluate(&batch); match result { Ok(_) => panic!("expected error"), Err(e) => { assert!(e.to_string().contains( - "Cast error: Cannot cast string '9.1' to value of arrow::datatypes::types::Int32Type type" + "Cast error: Cannot cast string '9.1' to value of arrow2::datatypes::types::Int32Type type" )) } } diff --git a/datafusion/src/physical_plan/expressions/coercion.rs b/datafusion/src/physical_plan/expressions/coercion.rs index e9949f5199e88..73470d5428492 100644 --- a/datafusion/src/physical_plan/expressions/coercion.rs +++ b/datafusion/src/physical_plan/expressions/coercion.rs @@ -17,7 +17,7 @@ //! 
Coercion rules used to coerce types to match existing expressions' implementations -use arrow::datatypes::DataType; +use arrow2::datatypes::DataType; /// Determine if a DataType is signed numeric or not pub fn is_signed_numeric(dt: &DataType) -> bool { @@ -79,7 +79,7 @@ pub fn dictionary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - use arrow::datatypes::DataType::*; + use arrow2::datatypes::DataType::*; match (lhs_type, rhs_type) { (Utf8, Utf8) => Some(Utf8), (LargeUtf8, Utf8) => Some(LargeUtf8), @@ -92,7 +92,7 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - use arrow::datatypes::DataType::*; + use arrow2::datatypes::DataType::*; match (lhs_type, rhs_type) { (Utf8, Date32) => Some(Date32), (Date32, Utf8) => Some(Date32), @@ -106,7 +106,7 @@ pub fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - use arrow::datatypes::DataType::*; + use arrow2::datatypes::DataType::*; // error on any non-numeric type if !is_numeric(lhs_type) || !is_numeric(rhs_type) { diff --git a/datafusion/src/physical_plan/expressions/column.rs b/datafusion/src/physical_plan/expressions/column.rs index 7e0304e51fe73..de64f8fed98a9 100644 --- a/datafusion/src/physical_plan/expressions/column.rs +++ b/datafusion/src/physical_plan/expressions/column.rs @@ -19,7 +19,7 @@ use std::sync::Arc; -use arrow::{ +use arrow2::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; diff --git a/datafusion/src/physical_plan/expressions/count.rs b/datafusion/src/physical_plan/expressions/count.rs index 22459813b7e5b..0d059bf50868f 100644 --- a/datafusion/src/physical_plan/expressions/count.rs +++ b/datafusion/src/physical_plan/expressions/count.rs @@ -20,15 +20,13 @@ use std::any::Any; use std::sync::Arc; +use super::ArrayRef; use crate::error::Result; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::datatypes::DataType; -use arrow::{ - array::{ArrayRef, UInt64Array}, - datatypes::Field, -}; +use arrow2::compute; +use arrow2::datatypes::DataType; +use arrow2::{array::UInt64Array, datatypes::Field}; use super::format_state_name; @@ -100,7 +98,7 @@ impl CountAccumulator { impl Accumulator for CountAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let array = &values[0]; - self.count += (array.len() - array.data().null_count()) as u64; + self.count += (array.len() - array.null_count()) as u64; Ok(()) } @@ -124,7 +122,7 @@ impl Accumulator for CountAccumulator { fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { let counts = states[0].as_any().downcast_ref::().unwrap(); - let delta = &compute::sum(counts); + let delta = &compute::aggregate::sum(counts); if let Some(d) = delta { self.count += *d; } @@ -146,8 +144,8 @@ mod tests { use crate::physical_plan::expressions::col; use crate::physical_plan::expressions::tests::aggregate; use crate::{error::Result, generic_test_op}; - use arrow::record_batch::RecordBatch; - use arrow::{array::*, datatypes::*}; + use arrow2::record_batch::RecordBatch; + use arrow2::{array::*, datatypes::*}; #[test] fn count_elements() -> Result<()> { @@ -210,7 +208,7 @@ mod tests { #[test] fn count_utf8() -> Result<()> { let a: ArrayRef = - Arc::new(StringArray::from(vec!["a", "bb", "ccc", "dddd", "ad"])); + Arc::new(Utf8Array::::from(vec!["a", "bb", "ccc", "dddd", "ad"])); generic_test_op!( a, DataType::Utf8, @@ -223,7 +221,7 @@ mod tests { #[test] fn count_large_utf8() -> Result<()> { let 
a: ArrayRef = - Arc::new(LargeStringArray::from(vec!["a", "bb", "ccc", "dddd", "ad"])); + Arc::new(Utf8Array::::from(vec!["a", "bb", "ccc", "dddd", "ad"])); generic_test_op!( a, DataType::LargeUtf8, diff --git a/datafusion/src/physical_plan/expressions/in_list.rs b/datafusion/src/physical_plan/expressions/in_list.rs index 41f111006ea2a..cab06344793e9 100644 --- a/datafusion/src/physical_plan/expressions/in_list.rs +++ b/datafusion/src/physical_plan/expressions/in_list.rs @@ -20,17 +20,15 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::GenericStringArray; -use arrow::array::{ - ArrayRef, BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, - Int64Array, Int8Array, StringOffsetSizeTrait, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, -}; -use arrow::{ +use arrow2::array::Utf8Array; +use arrow2::array::*; +use arrow2::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; +use super::ArrayRef; + use crate::error::Result; use crate::physical_plan::{ColumnarValue, PhysicalExpr}; use crate::scalar::ScalarValue; @@ -130,16 +128,13 @@ impl InListExpr { /// Compare for specific utf8 types #[allow(clippy::unnecessary_wraps)] - fn compare_utf8( + fn compare_utf8( &self, array: ArrayRef, list_values: Vec, negated: bool, ) -> Result { - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); + let array = array.as_any().downcast_ref::>().unwrap(); let mut contains_null = false; let values = list_values @@ -288,7 +283,9 @@ pub fn in_list( #[cfg(test)] mod tests { - use arrow::{array::StringArray, datatypes::Field}; + use arrow2::{array::Utf8Array, datatypes::Field}; + + type StringArray = Utf8Array; use super::*; use crate::error::Result; diff --git a/datafusion/src/physical_plan/expressions/is_not_null.rs b/datafusion/src/physical_plan/expressions/is_not_null.rs index 7ac2110b50221..3b505fe22299f 100644 --- a/datafusion/src/physical_plan/expressions/is_not_null.rs +++ b/datafusion/src/physical_plan/expressions/is_not_null.rs @@ -19,8 +19,8 @@ use std::{any::Any, sync::Arc}; -use arrow::compute; -use arrow::{ +use arrow2::compute; +use arrow2::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; @@ -71,7 +71,7 @@ impl PhysicalExpr for IsNotNullExpr { let arg = self.arg.evaluate(batch)?; match arg { ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new( - compute::is_not_null(array.as_ref())?, + compute::boolean::is_not_null(array.as_ref()), ))), ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar( ScalarValue::Boolean(Some(!scalar.is_null())), @@ -89,13 +89,15 @@ pub fn is_not_null(arg: Arc) -> Result> mod tests { use super::*; use crate::physical_plan::expressions::col; - use arrow::{ - array::{BooleanArray, StringArray}, + use arrow2::{ + array::{BooleanArray, Utf8Array}, datatypes::*, record_batch::RecordBatch, }; use std::sync::Arc; + type StringArray = Utf8Array; + #[test] fn is_not_null_op() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); diff --git a/datafusion/src/physical_plan/expressions/is_null.rs b/datafusion/src/physical_plan/expressions/is_null.rs index dfa53f3f7d264..ab5f18b33ca51 100644 --- a/datafusion/src/physical_plan/expressions/is_null.rs +++ b/datafusion/src/physical_plan/expressions/is_null.rs @@ -19,8 +19,8 @@ use std::{any::Any, sync::Arc}; -use arrow::compute; -use arrow::{ +use arrow2::compute; +use arrow2::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; @@ -71,7 +71,7 @@ impl PhysicalExpr for IsNullExpr { let arg = self.arg.evaluate(batch)?; match 
arg {
            ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new(
-                compute::is_null(array.as_ref())?,
+                compute::boolean::is_null(array.as_ref()),
            ))),
            ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
                ScalarValue::Boolean(Some(scalar.is_null())),
@@ -89,13 +89,15 @@ pub fn is_null(arg: Arc) -> Result> {
 mod tests {
     use super::*;
     use crate::physical_plan::expressions::col;
-    use arrow::{
-        array::{BooleanArray, StringArray},
+    use arrow2::{
+        array::{BooleanArray, Utf8Array},
         datatypes::*,
         record_batch::RecordBatch,
     };
     use std::sync::Arc;
 
+    type StringArray = Utf8Array;
+
     #[test]
     fn is_null_op() -> Result<()> {
         let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
diff --git a/datafusion/src/physical_plan/expressions/literal.rs b/datafusion/src/physical_plan/expressions/literal.rs
index 3110d39c87e0b..77e60cc6f476e 100644
--- a/datafusion/src/physical_plan/expressions/literal.rs
+++ b/datafusion/src/physical_plan/expressions/literal.rs
@@ -20,7 +20,7 @@ use std::any::Any;
 use std::sync::Arc;
 
-use arrow::{
+use arrow2::{
     datatypes::{DataType, Schema},
     record_batch::RecordBatch,
 };
@@ -80,8 +80,8 @@ pub fn lit(value: ScalarValue) -> Arc {
 mod tests {
     use super::*;
     use crate::error::Result;
-    use arrow::array::Int32Array;
-    use arrow::datatypes::*;
+    use arrow2::array::Int32Array;
+    use arrow2::datatypes::*;
 
     #[test]
     fn literal_i32() -> Result<()> {
diff --git a/datafusion/src/physical_plan/expressions/min_max.rs b/datafusion/src/physical_plan/expressions/min_max.rs
index 5ed14610ada38..7530bc51dd644 100644
--- a/datafusion/src/physical_plan/expressions/min_max.rs
+++ b/datafusion/src/physical_plan/expressions/min_max.rs
@@ -21,20 +21,17 @@ use std::any::Any;
 use std::convert::TryFrom;
 use std::sync::Arc;
 
+use arrow2::array::*;
+use arrow2::compute::aggregate::*;
+use arrow2::datatypes::*;
+
 use crate::error::{DataFusionError, Result};
 use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr};
 use crate::scalar::ScalarValue;
-use arrow::compute;
-use arrow::datatypes::{DataType, TimeUnit};
-use arrow::{
-    array::{
-        ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
-        Int8Array, LargeStringArray, StringArray, TimestampMicrosecondArray,
-        TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray,
-        UInt16Array, UInt32Array, UInt64Array, UInt8Array,
-    },
-    datatypes::Field,
-};
+
+type StringArray = Utf8Array;
+type LargeStringArray = Utf8Array;
+type ArrayRef = Arc;
 
 use super::format_state_name;
 
@@ -48,7 +45,7 @@ pub struct Max {
 }
 
 impl Max {
-    /// Create a new MAX aggregate function
+    /// Create a new MAX aggregate function
     pub fn new(expr: Arc, name: String, data_type: DataType) -> Self {
         Self {
             name,
@@ -94,7 +91,7 @@ impl AggregateExpr for Max {
 macro_rules! typed_min_max_batch_string {
     ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident) => {{
         let array = $VALUES.as_any().downcast_ref::<$ARRAYTYPE>().unwrap();
-        let value = compute::$OP(array);
+        let value = $OP(array);
         let value = value.and_then(|e| Some(e.to_string()));
         ScalarValue::$SCALAR(value)
     }};
@@ -104,7 +101,7 @@ macro_rules! typed_min_max_batch_string {
 macro_rules! typed_min_max_batch {
     ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident) => {{
         let array = $VALUES.as_any().downcast_ref::<$ARRAYTYPE>().unwrap();
-        let value = compute::$OP(array);
+        let value = $OP(array);
         ScalarValue::$SCALAR(value)
     }};
 }
@@ -115,13 +112,9 @@ macro_rules!
min_max_batch { ($VALUES:expr, $OP:ident) => {{ match $VALUES.data_type() { // all types that have a natural order - DataType::Float64 => { - typed_min_max_batch!($VALUES, Float64Array, Float64, $OP) - } - DataType::Float32 => { - typed_min_max_batch!($VALUES, Float32Array, Float32, $OP) + DataType::Int64 | DataType::Timestamp(TimeUnit::Second, _) => { + typed_min_max_batch!($VALUES, Int64Array, Int64, $OP) } - DataType::Int64 => typed_min_max_batch!($VALUES, Int64Array, Int64, $OP), DataType::Int32 => typed_min_max_batch!($VALUES, Int32Array, Int32, $OP), DataType::Int16 => typed_min_max_batch!($VALUES, Int16Array, Int16, $OP), DataType::Int8 => typed_min_max_batch!($VALUES, Int8Array, Int8, $OP), @@ -130,26 +123,17 @@ macro_rules! min_max_batch { DataType::UInt16 => typed_min_max_batch!($VALUES, UInt16Array, UInt16, $OP), DataType::UInt8 => typed_min_max_batch!($VALUES, UInt8Array, UInt8, $OP), DataType::Timestamp(TimeUnit::Second, _) => { - typed_min_max_batch!($VALUES, TimestampSecondArray, TimestampSecond, $OP) + typed_min_max_batch!($VALUES, Int64Array, TimestampSecond, $OP) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + typed_min_max_batch!($VALUES, Int64Array, TimestampMillisecond, $OP) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + typed_min_max_batch!($VALUES, Int64Array, TimestampMicrosecond, $OP) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + typed_min_max_batch!($VALUES, Int64Array, TimestampNanosecond, $OP) } - DataType::Timestamp(TimeUnit::Millisecond, _) => typed_min_max_batch!( - $VALUES, - TimestampMillisecondArray, - TimestampMillisecond, - $OP - ), - DataType::Timestamp(TimeUnit::Microsecond, _) => typed_min_max_batch!( - $VALUES, - TimestampMicrosecondArray, - TimestampMicrosecond, - $OP - ), - DataType::Timestamp(TimeUnit::Nanosecond, _) => typed_min_max_batch!( - $VALUES, - TimestampNanosecondArray, - TimestampNanosecond, - $OP - ), other => { // This should have been handled before return Err(DataFusionError::Internal(format!( @@ -170,7 +154,13 @@ fn min_batch(values: &ArrayRef) -> Result { DataType::LargeUtf8 => { typed_min_max_batch_string!(values, LargeStringArray, LargeUtf8, min_string) } - _ => min_max_batch!(values, min), + DataType::Float64 => { + typed_min_max_batch!(values, Float64Array, Float64, min_f64) + } + DataType::Float32 => { + typed_min_max_batch!(values, Float32Array, Float32, min_f32) + } + _ => min_max_batch!(values, min_primitive), }) } @@ -183,6 +173,12 @@ fn max_batch(values: &ArrayRef) -> Result { DataType::LargeUtf8 => { typed_min_max_batch_string!(values, LargeStringArray, LargeUtf8, max_string) } + DataType::Float64 => { + typed_min_max_batch!(values, Float64Array, Float64, max_f64) + } + DataType::Float32 => { + typed_min_max_batch!(values, Float32Array, Float32, max_f32) + } _ => min_max_batch!(values, max), }) } @@ -440,8 +436,7 @@ mod tests { use crate::physical_plan::expressions::col; use crate::physical_plan::expressions::tests::aggregate; use crate::{error::Result, generic_test_op}; - use arrow::datatypes::*; - use arrow::record_batch::RecordBatch; + use arrow2::record_batch::RecordBatch; #[test] fn max_i32() -> Result<()> { diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index 6e252205955dc..7d4e266537629 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -22,8 +22,11 @@ use std::sync::Arc; use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use 
crate::physical_plan::PhysicalExpr; -use arrow::compute::kernels::sort::{SortColumn, SortOptions}; -use arrow::record_batch::RecordBatch; +use arrow2::array::Array; +use arrow2::compute::sort::{SortColumn, SortOptions}; +use arrow2::record_batch::RecordBatch; + +type ArrayRef = Arc; mod average; #[macro_use] @@ -40,7 +43,7 @@ mod literal; mod min_max; mod negative; mod not; -mod nullif; +//mod nullif; mod sum; mod try_cast; diff --git a/datafusion/src/physical_plan/expressions/negative.rs b/datafusion/src/physical_plan/expressions/negative.rs index 65010c6acd1ec..6d092726f658a 100644 --- a/datafusion/src/physical_plan/expressions/negative.rs +++ b/datafusion/src/physical_plan/expressions/negative.rs @@ -20,14 +20,15 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::ArrayRef; -use arrow::compute::kernels::arithmetic::negate; -use arrow::{ - array::{Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array}, +use arrow2::{ + compute::arithmetics::negate, + array::*, datatypes::{DataType, Schema}, record_batch::RecordBatch, }; +type ArrayRef = Arc; + use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ColumnarValue, PhysicalExpr}; @@ -36,12 +37,12 @@ use super::coercion; /// Invoke a compute kernel on array(s) macro_rules! compute_op { // invoke unary operator - ($OPERAND:expr, $OP:ident, $DT:ident) => {{ + ($OPERAND:expr, $DT:ident) => {{ let operand = $OPERAND .as_any() .downcast_ref::<$DT>() .expect("compute_op failed to downcast array"); - Ok(Arc::new($OP(&operand)?)) + Ok(Arc::new(negate(operand)?)) }}; } @@ -89,12 +90,12 @@ impl PhysicalExpr for NegativeExpr { match arg { ColumnarValue::Array(array) => { let result: Result = match array.data_type() { - DataType::Int8 => compute_op!(array, negate, Int8Array), - DataType::Int16 => compute_op!(array, negate, Int16Array), - DataType::Int32 => compute_op!(array, negate, Int32Array), - DataType::Int64 => compute_op!(array, negate, Int64Array), - DataType::Float32 => compute_op!(array, negate, Float32Array), - DataType::Float64 => compute_op!(array, negate, Float64Array), + DataType::Int8 => compute_op!(array, Int8Array), + DataType::Int16 => compute_op!(array, Int16Array), + DataType::Int32 => compute_op!(array, Int32Array), + DataType::Int64 => compute_op!(array, Int64Array), + DataType::Float32 => compute_op!(array, Float32Array), + DataType::Float64 => compute_op!(array, Float64Array), _ => Err(DataFusionError::Internal(format!( "(- '{:?}') can't be evaluated because the expression's type is {:?}, not signed numeric", self, diff --git a/datafusion/src/physical_plan/expressions/not.rs b/datafusion/src/physical_plan/expressions/not.rs index 23a1a46651dee..5cacc52a42cbb 100644 --- a/datafusion/src/physical_plan/expressions/not.rs +++ b/datafusion/src/physical_plan/expressions/not.rs @@ -25,9 +25,9 @@ use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use crate::physical_plan::PhysicalExpr; use crate::scalar::ScalarValue; -use arrow::array::BooleanArray; -use arrow::datatypes::{DataType, Schema}; -use arrow::record_batch::RecordBatch; +use arrow2::array::BooleanArray; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::record_batch::RecordBatch; /// Not expression #[derive(Debug)] @@ -82,7 +82,7 @@ impl PhysicalExpr for NotExpr { ) })?; Ok(ColumnarValue::Array(Arc::new( - arrow::compute::kernels::boolean::not(array)?, + arrow2::compute::boolean::not(array)?.into(), ))) } ColumnarValue::Scalar(scalar) => { @@ -121,7 +121,7 @@ mod tests { use super::*; use 
crate::error::Result; use crate::physical_plan::expressions::col; - use arrow::datatypes::*; + use arrow2::datatypes::*; #[test] fn neg_op() -> Result<()> { diff --git a/datafusion/src/physical_plan/expressions/nullif.rs b/datafusion/src/physical_plan/expressions/nullif.rs index 7cc58ed2318f4..a54b18467a7c1 100644 --- a/datafusion/src/physical_plan/expressions/nullif.rs +++ b/datafusion/src/physical_plan/expressions/nullif.rs @@ -20,15 +20,11 @@ use std::sync::Arc; use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use crate::scalar::ScalarValue; -use arrow::array::Array; -use arrow::array::{ - ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, - Int16Array, Int32Array, Int64Array, Int8Array, StringArray, TimestampNanosecondArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, -}; -use arrow::compute::kernels::boolean::nullif; -use arrow::compute::kernels::comparison::{eq, eq_scalar, eq_utf8, eq_utf8_scalar}; -use arrow::datatypes::{DataType, TimeUnit}; +use arrow2::array::Array; +use arrow2::array::*; +use arrow2::compute::boolean::nullif; +use arrow2::compute::comparison::*; +use arrow2::datatypes::{DataType, TimeUnit}; /// Invoke a compute kernel on a primitive array and a Boolean Array macro_rules! compute_bool_array_op { diff --git a/datafusion/src/physical_plan/expressions/sum.rs b/datafusion/src/physical_plan/expressions/sum.rs index 6f50894003da6..b87237cb4ed10 100644 --- a/datafusion/src/physical_plan/expressions/sum.rs +++ b/datafusion/src/physical_plan/expressions/sum.rs @@ -24,15 +24,11 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::datatypes::DataType; -use arrow::{ - array::{ - ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array, - }, - datatypes::Field, -}; +use arrow2::compute; +use arrow2::datatypes::DataType; +use arrow2::{array::*, datatypes::Field}; + +type ArrayRef = Arc; use super::format_state_name; @@ -124,7 +120,7 @@ impl SumAccumulator { macro_rules! 
typed_sum_delta_batch { ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident) => {{ let array = $VALUES.as_any().downcast_ref::<$ARRAYTYPE>().unwrap(); - let delta = compute::sum(array); + let delta = compute::aggregate::sum(array); ScalarValue::$SCALAR(delta) }}; } @@ -272,8 +268,8 @@ mod tests { use super::*; use crate::physical_plan::expressions::col; use crate::{error::Result, generic_test_op}; - use arrow::datatypes::*; - use arrow::record_batch::RecordBatch; + use arrow2::datatypes::*; + use arrow2::record_batch::RecordBatch; #[test] fn sum_i32() -> Result<()> { diff --git a/datafusion/src/physical_plan/expressions/try_cast.rs b/datafusion/src/physical_plan/expressions/try_cast.rs index 5e402fdea28ad..e63704963516a 100644 --- a/datafusion/src/physical_plan/expressions/try_cast.rs +++ b/datafusion/src/physical_plan/expressions/try_cast.rs @@ -23,11 +23,10 @@ use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use crate::physical_plan::PhysicalExpr; use crate::scalar::ScalarValue; -use arrow::compute; -use arrow::compute::kernels; -use arrow::datatypes::{DataType, Schema}; -use arrow::record_batch::RecordBatch; -use compute::can_cast_types; +use arrow2::compute; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::record_batch::RecordBatch; +use compute::cast; /// TRY_CAST expression casts an expression to a specific data type and retuns NULL on invalid cast #[derive(Debug)] @@ -78,13 +77,13 @@ impl PhysicalExpr for TryCastExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { let value = self.expr.evaluate(batch)?; match value { - ColumnarValue::Array(array) => Ok(ColumnarValue::Array(kernels::cast::cast( - &array, - &self.cast_type, - )?)), + ColumnarValue::Array(array) => Ok(ColumnarValue::Array( + cast::cast(array.as_ref(), &self.cast_type)?.into(), + )), ColumnarValue::Scalar(scalar) => { let scalar_array = scalar.to_array(); - let cast_array = kernels::cast::cast(&scalar_array, &self.cast_type)?; + let cast_array = + cast::cast(scalar_array.as_ref(), &self.cast_type)?.into(); let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; Ok(ColumnarValue::Scalar(cast_scalar)) } @@ -104,7 +103,7 @@ pub fn try_cast( let expr_type = expr.data_type(input_schema)?; if expr_type == cast_type { Ok(expr.clone()) - } else if can_cast_types(&expr_type, &cast_type) { + } else if cast::can_cast_types(&expr_type, &cast_type) { Ok(Arc::new(TryCastExpr::new(expr, cast_type))) } else { Err(DataFusionError::Internal(format!( @@ -119,11 +118,9 @@ mod tests { use super::*; use crate::error::Result; use crate::physical_plan::expressions::col; - use arrow::array::{StringArray, Time64NanosecondArray}; - use arrow::{ - array::{Array, Int32Array, Int64Array, TimestampNanosecondArray, UInt32Array}, - datatypes::*, - }; + use arrow2::{array::*, datatypes::*}; + + type StringArray = Utf8Array; // runs an end-to-end test of physical type cast // 1. 
construct a record batch with a column "a" of type A @@ -221,15 +218,12 @@ mod tests { #[test] fn test_cast_i64_t64() -> Result<()> { let original = vec![1, 2, 3, 4, 5]; - let expected: Vec> = original - .iter() - .map(|i| Some(Time64NanosecondArray::from(vec![*i]).value(0))) - .collect(); + let expected: Vec> = original.iter().map(|i| Some(*i)).collect(); generic_test_cast!( Int64Array, DataType::Int64, original.clone(), - TimestampNanosecondArray, + Int64Array, DataType::Timestamp(TimeUnit::Nanosecond, None), expected ); diff --git a/datafusion/src/physical_plan/filter.rs b/datafusion/src/physical_plan/filter.rs index 61af78db8ed2a..8264be4b78d7b 100644 --- a/datafusion/src/physical_plan/filter.rs +++ b/datafusion/src/physical_plan/filter.rs @@ -26,11 +26,13 @@ use std::task::{Context, Poll}; use super::{RecordBatchStream, SendableRecordBatchStream}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ExecutionPlan, Partitioning, PhysicalExpr}; -use arrow::array::BooleanArray; -use arrow::compute::filter_record_batch; -use arrow::datatypes::{DataType, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::array::BooleanArray; +use arrow2::compute::filter::filter_record_batch; +use arrow2::datatypes::{DataType, Schema}; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; + +type SchemaRef = Arc; use async_trait::async_trait; diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 56365fec1dc87..9952311a5d796 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -42,16 +42,18 @@ use crate::{ error::{DataFusionError, Result}, scalar::ScalarValue, }; -use arrow::{ - array::ArrayRef, - compute::kernels::length::{bit_length, length}, +use arrow2::{ + array::Array, + compute::length::length, datatypes::TimeUnit, - datatypes::{DataType, Field, Int32Type, Int64Type, Schema}, + datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; use fmt::{Debug, Formatter}; use std::{any::Any, fmt, str::FromStr, sync::Arc}; +type ArrayRef = Arc; + /// A function's signature, which defines the function's supported argument types. 
#[derive(Debug, Clone, PartialEq)] pub enum Signature { @@ -773,7 +775,7 @@ pub fn create_physical_expr( DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!( character_length, - Int32Type, + i32, "character_length" ); make_scalar_function(func)(args) @@ -781,7 +783,7 @@ pub fn create_physical_expr( DataType::LargeUtf8 => { let func = invoke_if_unicode_expressions_feature_flag!( character_length, - Int64Type, + i64, "character_length" ); make_scalar_function(func)(args) @@ -858,7 +860,9 @@ pub fn create_physical_expr( } BuiltinScalarFunction::NullIf => nullif_func, BuiltinScalarFunction::OctetLength => |args| match &args[0] { - ColumnarValue::Array(v) => Ok(ColumnarValue::Array(length(v.as_ref())?)), + ColumnarValue::Array(v) => { + Ok(ColumnarValue::Array(length(v.as_ref())?.into())) + } ColumnarValue::Scalar(v) => match v { ScalarValue::Utf8(v) => Ok(ColumnarValue::Scalar(ScalarValue::Int32( v.as_ref().map(|x| x.len() as i32), @@ -1033,15 +1037,13 @@ pub fn create_physical_expr( }, BuiltinScalarFunction::Strpos => |args| match args[0].data_type() { DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - strpos, Int32Type, "strpos" - ); + let func = + invoke_if_unicode_expressions_feature_flag!(strpos, i32, "strpos"); make_scalar_function(func)(args) } DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - strpos, Int64Type, "strpos" - ); + let func = + invoke_if_unicode_expressions_feature_flag!(strpos, i64, "strpos"); make_scalar_function(func)(args) } other => Err(DataFusionError::Internal(format!( @@ -1067,10 +1069,10 @@ pub fn create_physical_expr( }, BuiltinScalarFunction::ToHex => |args| match args[0].data_type() { DataType::Int32 => { - make_scalar_function(string_expressions::to_hex::)(args) + make_scalar_function(string_expressions::to_hex::)(args) } DataType::Int64 => { - make_scalar_function(string_expressions::to_hex::)(args) + make_scalar_function(string_expressions::to_hex::)(args) } other => Err(DataFusionError::Internal(format!( "Unsupported data type {:?} for function to_hex", @@ -1427,14 +1429,9 @@ mod tests { physical_plan::expressions::{col, lit}, scalar::ScalarValue, }; - use arrow::{ - array::{ - Array, ArrayRef, BinaryArray, BooleanArray, FixedSizeListArray, Float64Array, - Int32Array, ListArray, StringArray, UInt32Array, UInt64Array, - }, - datatypes::Field, - record_batch::RecordBatch, - }; + use arrow2::{array::*, datatypes::Field, record_batch::RecordBatch}; + + type StringArray = Utf8Array; /// $FUNC function to test /// $ARGS arguments (vec) to pass to function diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index fad4fa585034b..380cb96395df9 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -31,37 +31,20 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{Accumulator, AggregateExpr, SQLMetric}; use crate::physical_plan::{Distribution, ExecutionPlan, Partitioning, PhysicalExpr}; -use arrow::{ - array::{Array, UInt32Builder}, - error::{ArrowError, Result as ArrowResult}, -}; -use arrow::{ - array::{ - ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, - }, - compute, -}; -use arrow::{ - array::{BooleanArray, Date32Array, DictionaryArray}, - compute::cast, - datatypes::{ - ArrowDictionaryKeyType, ArrowNativeType, Int16Type, Int32Type, 
Int64Type, - Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, - }, -}; -use arrow::{ - datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}, +use arrow2::error::{ArrowError, Result as ArrowResult}; +use arrow2::{array::*, compute}; +use arrow2::{buffer::MutableBuffer, datatypes::*}; +use arrow2::{ + datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; use hashbrown::HashMap; use ordered_float::OrderedFloat; use pin_project_lite::pin_project; -use arrow::array::{ - LargeStringArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, -}; +type SchemaRef = Arc; +type ArrayRef = Arc; + use async_trait::async_trait; use super::{ @@ -345,7 +328,7 @@ fn group_aggregate_batch( if v.is_empty() { batch_keys.push(key.clone()) }; - v.push(row as u32) + v.push(row as i32) }) // 1.2 .or_insert_with(|| { @@ -355,18 +338,18 @@ fn group_aggregate_batch( let _ = create_group_by_values(&group_values, row, &mut group_by_values); ( key.clone(), - (group_by_values.clone(), accumulator_set, vec![row as u32]), + (group_by_values.clone(), accumulator_set, vec![row as i32]), ) }); } // Collect all indices + offsets based on keys in this vec - let mut batch_indices: UInt32Builder = UInt32Builder::new(0); + let mut batch_indices = MutableBuffer::::with_capacity(0); let mut offsets = vec![0]; let mut offset_so_far = 0; for key in batch_keys.iter() { let (_, _, indices) = accumulators.get_mut(key).unwrap(); - batch_indices.append_slice(&indices)?; + batch_indices.extend_from_slice(&indices)?; offset_so_far += indices.len(); offsets.push(offset_so_far); } @@ -378,14 +361,7 @@ fn group_aggregate_batch( .map(|array| { array .iter() - .map(|array| { - compute::take( - array.as_ref(), - &batch_indices, - None, // None: no index check - ) - .unwrap() - }) + .map(|array| compute::take::take(array.as_ref(), &batch_indices).unwrap()) .collect() // 2.3 }) @@ -412,7 +388,7 @@ fn group_aggregate_batch( .iter() .map(|array| { // 2.3 - array.slice(offsets[0], offsets[1] - offsets[0]) + array.slice(offsets[0], offsets[1] - offsets[0]).into() }) .collect::>(), ) @@ -445,7 +421,7 @@ fn group_aggregate_batch( /// but it also has to to handle the case where the dictionary itself /// is not the same across all record batches (and thus indexes in one /// record batch may not correspond to the same index in another) -fn dictionary_create_key_for_col( +fn dictionary_create_key_for_col( col: &ArrayRef, row: usize, vec: &mut Vec, @@ -512,29 +488,15 @@ fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec) -> Result<( let array = col.as_any().downcast_ref::().unwrap(); vec.extend_from_slice(&array.value(row).to_le_bytes()); } - DataType::Timestamp(TimeUnit::Millisecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); - vec.extend_from_slice(&array.value(row).to_le_bytes()); - } - DataType::Timestamp(TimeUnit::Microsecond, None) => { + DataType::Timestamp(_, None) => { let array = col .as_any() - .downcast_ref::() - .unwrap(); - vec.extend_from_slice(&array.value(row).to_le_bytes()); - } - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - let array = col - .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); vec.extend_from_slice(&array.value(row).to_le_bytes()); } DataType::Utf8 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::>().unwrap(); let value = array.value(row); // store the size vec.extend_from_slice(&value.len().to_le_bytes()); @@ -542,7 +504,7 @@ fn create_key_for_col(col: 
&ArrayRef, row: usize, vec: &mut Vec) -> Result<( vec.extend_from_slice(value.as_bytes()); } DataType::LargeUtf8 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::>().unwrap(); let value = array.value(row); // store the size vec.extend_from_slice(&value.len().to_le_bytes()); @@ -550,33 +512,33 @@ fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec) -> Result<( vec.extend_from_slice(value.as_bytes()); } DataType::Date32 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::().unwrap(); vec.extend_from_slice(&array.value(row).to_le_bytes()); } DataType::Dictionary(index_type, _) => match **index_type { DataType::Int8 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::Int16 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::Int32 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::Int64 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::UInt8 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::UInt16 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::UInt32 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } DataType::UInt64 => { - dictionary_create_key_for_col::(col, row, vec)?; + dictionary_create_key_for_col::(col, row, vec)?; } _ => return Err(DataFusionError::Internal(format!( "Unsupported GROUP BY type (dictionary index type not supported creating key) {}", @@ -678,7 +640,7 @@ impl GroupedHashAggregateStream { type AccumulatorItem = Box; type Accumulators = - HashMap, (Box<[GroupByScalar]>, Vec, Vec), RandomState>; + HashMap, (Box<[GroupByScalar]>, Vec, Vec), RandomState>; impl Stream for GroupedHashAggregateStream { type Item = ArrowResult; @@ -892,7 +854,7 @@ impl Stream for HashAggregateStream { // check for error in receiving channel and unwrap actual result let result = match result { - Err(e) => Err(ArrowError::ExternalError(Box::new(e))), // error receiving + Err(e) => Err(ArrowError::External("".to_string(), Box::new(e))), // error receiving Ok(result) => result, }; @@ -918,7 +880,7 @@ fn concatenate(arrays: Vec>) -> ArrowResult> { .iter() .map(|a| a[column].as_ref()) .collect::>(); - compute::concat(&array_list) + Ok(compute::concat::concatenate(&array_list)?.into()) }) .collect::>>() } @@ -960,22 +922,22 @@ fn create_batch_from_map( GroupByScalar::UInt32(n) => Arc::new(UInt32Array::from(vec![*n])), GroupByScalar::UInt64(n) => Arc::new(UInt64Array::from(vec![*n])), GroupByScalar::Utf8(str) => { - Arc::new(StringArray::from(vec![&***str])) + Arc::new(Utf8Array::::from(vec![&***str])) } GroupByScalar::LargeUtf8(str) => { - Arc::new(LargeStringArray::from(vec![&***str])) + Arc::new(Utf8Array::::from(vec![&***str])) } GroupByScalar::Boolean(b) => Arc::new(BooleanArray::from(vec![*b])), GroupByScalar::TimeMillisecond(n) => { - Arc::new(TimestampMillisecondArray::from(vec![*n])) + Arc::new(UInt64Array::from(vec![*n])) } GroupByScalar::TimeMicrosecond(n) => { - Arc::new(TimestampMicrosecondArray::from(vec![*n])) + Arc::new(UInt64Array::from(vec![*n])) } GroupByScalar::TimeNanosecond(n) => { - 
Arc::new(TimestampNanosecondArray::from_vec(vec![*n], None)) + Arc::new(UInt64Array::from_vec(vec![*n], None)) } - GroupByScalar::Date32(n) => Arc::new(Date32Array::from(vec![*n])), + GroupByScalar::Date32(n) => Arc::new(Int32Array::from(vec![*n])), }) .collect::>(); @@ -1000,7 +962,9 @@ fn create_batch_from_map( let columns = columns .iter() .zip(output_schema.fields().iter()) - .map(|(col, desired_field)| cast(col, desired_field.data_type())) + .map(|(col, desired_field)| { + compute::cast::cast(col, desired_field.data_type()) + }) .collect::>>()?; RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns)? @@ -1050,7 +1014,7 @@ fn finalize_aggregation( } /// Extract the value in `col[row]` from a dictionary a GroupByScalar -fn dictionary_create_group_by_value( +fn dictionary_create_group_by_value( col: &ArrayRef, row: usize, ) -> Result { @@ -1112,11 +1076,11 @@ fn create_group_by_value(col: &ArrayRef, row: usize) -> Result { Ok(GroupByScalar::Int64(array.value(row))) } DataType::Utf8 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::>().unwrap(); Ok(GroupByScalar::Utf8(Box::new(array.value(row).into()))) } DataType::LargeUtf8 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::>().unwrap(); Ok(GroupByScalar::Utf8(Box::new(array.value(row).into()))) } DataType::Boolean => { @@ -1124,39 +1088,30 @@ fn create_group_by_value(col: &ArrayRef, row: usize) -> Result { Ok(GroupByScalar::Boolean(array.value(row))) } DataType::Timestamp(TimeUnit::Millisecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); + let array = col.as_any().downcast_ref::().unwrap(); Ok(GroupByScalar::TimeMillisecond(array.value(row))) } DataType::Timestamp(TimeUnit::Microsecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); + let array = col.as_any().downcast_ref::().unwrap(); Ok(GroupByScalar::TimeMicrosecond(array.value(row))) } DataType::Timestamp(TimeUnit::Nanosecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); + let array = col.as_any().downcast_ref::().unwrap(); Ok(GroupByScalar::TimeNanosecond(array.value(row))) } DataType::Date32 => { - let array = col.as_any().downcast_ref::().unwrap(); + let array = col.as_any().downcast_ref::().unwrap(); Ok(GroupByScalar::Date32(array.value(row))) } DataType::Dictionary(index_type, _) => match **index_type { - DataType::Int8 => dictionary_create_group_by_value::(col, row), - DataType::Int16 => dictionary_create_group_by_value::(col, row), - DataType::Int32 => dictionary_create_group_by_value::(col, row), - DataType::Int64 => dictionary_create_group_by_value::(col, row), - DataType::UInt8 => dictionary_create_group_by_value::(col, row), - DataType::UInt16 => dictionary_create_group_by_value::(col, row), - DataType::UInt32 => dictionary_create_group_by_value::(col, row), - DataType::UInt64 => dictionary_create_group_by_value::(col, row), + DataType::Int8 => dictionary_create_group_by_value::(col, row), + DataType::Int16 => dictionary_create_group_by_value::(col, row), + DataType::Int32 => dictionary_create_group_by_value::(col, row), + DataType::Int64 => dictionary_create_group_by_value::(col, row), + DataType::UInt8 => dictionary_create_group_by_value::(col, row), + DataType::UInt16 => dictionary_create_group_by_value::(col, row), + DataType::UInt32 => dictionary_create_group_by_value::(col, row), + DataType::UInt64 => dictionary_create_group_by_value::(col, row), _ => 
Err(DataFusionError::NotImplemented(format!( "Unsupported GROUP BY type (dictionary index type not supported) {}", col.data_type(), @@ -1185,7 +1140,7 @@ pub(crate) fn create_group_by_values( #[cfg(test)] mod tests { - use arrow::array::Float64Array; + use arrow2::array::Float64Array; use super::*; use crate::physical_plan::expressions::{col, Avg}; diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index eb2ec33d08699..10ee2e8fd8e6b 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -21,15 +21,7 @@ use ahash::CallHasher; use ahash::RandomState; -use arrow::{ - array::{ - ArrayData, ArrayRef, BooleanArray, LargeStringArray, PrimitiveArray, - TimestampMicrosecondArray, TimestampNanosecondArray, UInt32BufferBuilder, - UInt32Builder, UInt64BufferBuilder, UInt64Builder, - }, - compute, - datatypes::{TimeUnit, UInt32Type, UInt64Type}, -}; +use arrow2::{array::*, compute}; use smallvec::{smallvec, SmallVec}; use std::time::Instant; use std::{any::Any, collections::HashSet}; @@ -40,16 +32,15 @@ use futures::{Stream, StreamExt, TryStreamExt}; use hashbrown::HashMap; use tokio::sync::Mutex; -use arrow::array::Array; -use arrow::datatypes::DataType; -use arrow::datatypes::{Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::array::Array; +use arrow2::buffer::MutableBuffer; +use arrow2::datatypes::DataType; +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; -use arrow::array::{ - Int16Array, Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, -}; +type ArrayRef = Arc; +type SchemaRef = Arc; use super::expressions::col; use super::{ @@ -68,6 +59,9 @@ use log::debug; type JoinHashMap = HashMap, IdHashBuilder>; type JoinLeftData = Arc<(JoinHashMap, RecordBatch)>; +type StringArray = Utf8Array; +type LargeStringArray = Utf8Array; + /// join execution plan executes partitions in parallel and combines them into a set of /// partitions. #[derive(Debug)] @@ -457,8 +451,8 @@ fn build_batch_from_indices( schema: &Schema, left: &RecordBatch, right: &RecordBatch, - left_indices: UInt64Array, - right_indices: UInt32Array, + left_indices: Int64Array, + right_indices: Int32Array, column_indices: &[ColumnIndex], ) -> ArrowResult { // build the columns of the new [RecordBatch]: @@ -469,10 +463,10 @@ fn build_batch_from_indices( for column_index in column_indices { let array = if column_index.is_left { let array = left.column(column_index.index); - compute::take(array.as_ref(), &left_indices, None)? + compute::take::take(array.as_ref(), &left_indices)?.into() } else { let array = right.column(column_index.index); - compute::take(array.as_ref(), &right_indices, None)? 
+ compute::take::take(array.as_ref(), &right_indices)?.into() }; columns.push(array); } @@ -544,7 +538,7 @@ fn build_join_indexes( left_on: &[String], right_on: &[String], random_state: &RandomState, -) -> Result<(UInt64Array, UInt32Array)> { +) -> Result<(Int64Array, Int32Array)> { let keys_values = right_on .iter() .map(|name| Ok(col(name).evaluate(right)?.into_array(right.num_rows()))) @@ -563,9 +557,8 @@ fn build_join_indexes( match join_type { JoinType::Inner => { - // Using a buffer builder to avoid slower normal builder - let mut left_indices = UInt64BufferBuilder::new(0); - let mut right_indices = UInt32BufferBuilder::new(0); + let mut left_indices = MutableBuffer::::new(); + let mut right_indices = MutableBuffer::::new(); // Visit all of the right rows for (row, hash_value) in hash_values.iter().enumerate() { @@ -578,29 +571,27 @@ fn build_join_indexes( for &i in indices { // Check hash collisions if equal_rows(i as usize, row, &left_join_values, &keys_values)? { - left_indices.append(i); - right_indices.append(row as u32); + left_indices.push(i as i64); + right_indices.push(row as i32); } } } } - let left = ArrayData::builder(DataType::UInt64) - .len(left_indices.len()) - .add_buffer(left_indices.finish()) - .build(); - let right = ArrayData::builder(DataType::UInt32) - .len(right_indices.len()) - .add_buffer(right_indices.finish()) - .build(); - - Ok(( - PrimitiveArray::::from(left), - PrimitiveArray::::from(right), - )) + let left = PrimitiveArray::::from_data( + DataType::Int64, + left_indices.into(), + None, + ); + let right = PrimitiveArray::::from_data( + DataType::Int32, + right_indices.into(), + None, + ); + Ok((left, right)) } JoinType::Left => { - let mut left_indices = UInt64Builder::new(0); - let mut right_indices = UInt32Builder::new(0); + let mut left_indices = MutableBuffer::::new(); + let mut right_indices = Primitive::::with_capacity(0); // Keep track of which item is visited in the build input // TODO: this can be stored more efficiently with a marker @@ -615,8 +606,8 @@ fn build_join_indexes( for &i in indices { // Collision check if equal_rows(i as usize, row, &left_join_values, &keys_values)? { - left_indices.append_value(i)?; - right_indices.append_value(row as u32)?; + left_indices.push(i as i64); + right_indices.push(Some(&(row as i32))); is_visited.insert(i); } } @@ -624,18 +615,25 @@ fn build_join_indexes( } // Add the remaining left rows to the result set with None on the right side for (_, indices) in left { + let casted = indices.into_iter().map(|x| *x as i64).collect::>(); for i in indices.iter() { if !is_visited.contains(i) { - left_indices.append_slice(&indices)?; - right_indices.append_null()?; + left_indices.extend_from_slice(&casted); + right_indices.push(None); } } } - Ok((left_indices.finish(), right_indices.finish())) + let left = PrimitiveArray::::from_data( + DataType::Int64, + left_indices.into(), + None, + ); + let right = right_indices.to(DataType::Int32); + Ok((left, right)) } JoinType::Right => { - let mut left_indices = UInt64Builder::new(0); - let mut right_indices = UInt32Builder::new(0); + let mut left_indices = Primitive::::new(); + let mut right_indices = MutableBuffer::::new(); for (row, hash_value) in hash_values.iter().enumerate() { match left.get(hash_value) { @@ -647,19 +645,25 @@ fn build_join_indexes( &left_join_values, &keys_values, )? 
{ - left_indices.append_value(i)?; - right_indices.append_value(row as u32)?; + left_indices.push(Some(&(i as i64))); + right_indices.push(row as i32); } } } None => { // when no match, add the row with None for the left side - left_indices.append_null()?; - right_indices.append_value(row as u32)?; + left_indices.push(None); + right_indices.push(row as i32); } } } - Ok((left_indices.finish(), right_indices.finish())) + let left = left_indices.to(DataType::Int64); + let right = PrimitiveArray::::from_data( + DataType::Int32, + right_indices.into(), + None, + ); + Ok((left, right)) } } } @@ -804,27 +808,9 @@ pub fn create_hashes<'a>( DataType::Int32 => { hash_array!(Int32Array, col, i32, hashes_buffer, random_state); } - DataType::Int64 => { + DataType::Int64 | DataType::Timestamp(_, _) => { hash_array!(Int64Array, col, i64, hashes_buffer, random_state); } - DataType::Timestamp(TimeUnit::Microsecond, None) => { - hash_array!( - TimestampMicrosecondArray, - col, - i64, - hashes_buffer, - random_state - ); - } - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - hash_array!( - TimestampNanosecondArray, - col, - i64, - hashes_buffer, - random_state - ); - } DataType::Boolean => { hash_array!(BooleanArray, col, u8, hashes_buffer, random_state); } @@ -1250,18 +1236,12 @@ mod tests { &random_state, )?; - let mut left_ids = UInt64Builder::new(0); - left_ids.append_value(0)?; - left_ids.append_value(1)?; - - let mut right_ids = UInt32Builder::new(0); - - right_ids.append_value(0)?; - right_ids.append_value(1)?; + let left = Primitive::::from([0, 1]).to(DataType::Int64); + let right = Primitive::::from([0, 1]).to(DataType::Int32); - assert_eq!(left_ids.finish(), l); + assert_eq!(left, l); - assert_eq!(right_ids.finish(), r); + assert_eq!(right, r); Ok(()) } diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index a38cc092123d4..b8e08c4237858 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -18,7 +18,7 @@ //! Functionality used both on logical and physical plans use crate::error::{DataFusionError, Result}; -use arrow::datatypes::{Field, Schema}; +use arrow2::datatypes::{Field, Schema}; use std::collections::HashSet; /// All valid types of joins. 
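Editor's note: the inner-join branch above swaps arrow's `UInt64BufferBuilder`/`UInt32BufferBuilder` for arrow2's `MutableBuffer` plus `PrimitiveArray::from_data`. A minimal sketch of that pattern, assuming the pinned arrow2 revision exposes `MutableBuffer::push`, the `MutableBuffer -> Buffer` conversion via `into()`, and `PrimitiveArray::from_data` exactly as the hunks use them (the helper name and test values below are illustrative only):

```rust
use arrow2::array::PrimitiveArray;
use arrow2::buffer::MutableBuffer;
use arrow2::datatypes::DataType;

/// Collect matching (left, right) row positions into two index arrays,
/// mirroring the inner-join branch of `build_join_indexes` above.
fn collect_indices(matches: &[(usize, usize)]) -> (PrimitiveArray<i64>, PrimitiveArray<i32>) {
    let mut left_indices = MutableBuffer::<i64>::new();
    let mut right_indices = MutableBuffer::<i32>::new();
    for &(l, r) in matches {
        // push raw native values; validity is added (or omitted) when freezing
        left_indices.push(l as i64);
        right_indices.push(r as i32);
    }
    // freeze the mutable buffers into immutable, non-null primitive arrays
    let left = PrimitiveArray::<i64>::from_data(DataType::Int64, left_indices.into(), None);
    let right = PrimitiveArray::<i32>::from_data(DataType::Int32, right_indices.into(), None);
    (left, right)
}

fn main() {
    let (left, right) = collect_indices(&[(0, 0), (1, 1)]);
    assert_eq!(left.len(), 2);
    assert_eq!(right.len(), 2);
}
```

The resulting `Int64`/`Int32` index arrays are the types the patch feeds to `compute::take::take` when materializing the joined batch, which is why the signed types replace the old `UInt64Array`/`UInt32Array` pair here.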
diff --git a/datafusion/src/physical_plan/limit.rs b/datafusion/src/physical_plan/limit.rs index c091196483f40..90051bc1690ef 100644 --- a/datafusion/src/physical_plan/limit.rs +++ b/datafusion/src/physical_plan/limit.rs @@ -27,11 +27,14 @@ use futures::stream::StreamExt; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{Distribution, ExecutionPlan, Partitioning}; -use arrow::array::ArrayRef; -use arrow::compute::limit; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::array::Array; +use arrow2::compute::limit::limit; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; + +type ArrayRef = Arc; use super::{RecordBatchStream, SendableRecordBatchStream}; @@ -192,10 +195,10 @@ impl ExecutionPlan for LocalLimitExec { /// Truncate a RecordBatch to maximum of n rows pub fn truncate_batch(batch: &RecordBatch, n: usize) -> RecordBatch { let limited_columns: Vec = (0..batch.num_columns()) - .map(|i| limit(batch.column(i), n)) + .map(|i| limit(batch.column(i).as_ref(), n).into()) .collect(); - RecordBatch::try_new(batch.schema(), limited_columns).unwrap() + RecordBatch::try_new(batch.schema().clone(), limited_columns).unwrap() } /// A Limit stream limits the stream to up to `limit` rows. diff --git a/datafusion/src/physical_plan/math_expressions.rs b/datafusion/src/physical_plan/math_expressions.rs index 382a15f8ccf6e..a21723b2acd3c 100644 --- a/datafusion/src/physical_plan/math_expressions.rs +++ b/datafusion/src/physical_plan/math_expressions.rs @@ -17,56 +17,33 @@ //! Math expressions -use arrow::array::{make_array, Array, ArrayData, Float32Array, Float64Array}; -use arrow::buffer::Buffer; -use arrow::datatypes::{DataType, ToByteSlice}; +use std::sync::Arc; + +use arrow2::array::Array; +use arrow2::compute::arity::unary; +use arrow2::datatypes::DataType; use super::{ColumnarValue, ScalarValue}; use crate::error::{DataFusionError, Result}; -macro_rules! compute_op { - ($ARRAY:expr, $FUNC:ident, $TYPE:ident) => {{ - let len = $ARRAY.len(); - let result = (0..len) - .map(|i| $ARRAY.value(i).$FUNC() as f64) - .collect::>(); - let data = ArrayData::new( - DataType::Float64, - len, - Some($ARRAY.null_count()), - $ARRAY.data().null_buffer().cloned(), - 0, - vec![Buffer::from(result.to_byte_slice())], - vec![], - ); - Ok(make_array(data)) - }}; -} - -macro_rules! downcast_compute_op { - ($ARRAY:expr, $NAME:expr, $FUNC:ident, $TYPE:ident) => {{ - let n = $ARRAY.as_any().downcast_ref::<$TYPE>(); - match n { - Some(array) => compute_op!(array, $FUNC, $TYPE), - _ => Err(DataFusionError::Internal(format!( - "Invalid data type for {}", - $NAME - ))), - } - }}; -} - macro_rules! 
unary_primitive_array_op { ($VALUE:expr, $NAME:expr, $FUNC:ident) => {{ match ($VALUE) { ColumnarValue::Array(array) => match array.data_type() { DataType::Float32 => { - let result = downcast_compute_op!(array, $NAME, $FUNC, Float32Array); - Ok(ColumnarValue::Array(result?)) + let array = array.as_any().downcast_ref().unwrap(); + let array = unary::( + array, + |x| x.$FUNC() as f64, + &DataType::Float32, + ); + Ok(ColumnarValue::Array(Arc::new(array))) } DataType::Float64 => { - let result = downcast_compute_op!(array, $NAME, $FUNC, Float64Array); - Ok(ColumnarValue::Array(result?)) + let array = array.as_any().downcast_ref().unwrap(); + let array = + unary::(array, |x| x.$FUNC(), &DataType::Float64); + Ok(ColumnarValue::Array(Arc::new(array))) } other => Err(DataFusionError::Internal(format!( "Unsupported data type {:?} for function {}", diff --git a/datafusion/src/physical_plan/memory.rs b/datafusion/src/physical_plan/memory.rs index 9022077559acf..664f309a7614b 100644 --- a/datafusion/src/physical_plan/memory.rs +++ b/datafusion/src/physical_plan/memory.rs @@ -24,9 +24,10 @@ use std::task::{Context, Poll}; use super::{ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream}; use crate::error::{DataFusionError, Result}; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use async_trait::async_trait; use futures::Stream; diff --git a/datafusion/src/physical_plan/merge.rs b/datafusion/src/physical_plan/merge.rs index c66532b73ccff..2fd3811b6680d 100644 --- a/datafusion/src/physical_plan/merge.rs +++ b/datafusion/src/physical_plan/merge.rs @@ -28,9 +28,9 @@ use futures::Stream; use async_trait::async_trait; -use arrow::record_batch::RecordBatch; -use arrow::{ - datatypes::SchemaRef, +use arrow2::record_batch::RecordBatch; +use arrow2::{ + datatypes::Schema, error::{ArrowError, Result as ArrowResult}, }; @@ -42,6 +42,8 @@ use crate::physical_plan::Partitioning; use super::SendableRecordBatchStream; use pin_project_lite::pin_project; +type SchemaRef = Arc; + /// Merge execution plan executes partitions in parallel and combines them into a single /// partition. No guarantees are made about the order of the resulting partition. 
#[derive(Debug)] @@ -129,7 +131,8 @@ impl ExecutionPlan for MergeExec { Err(e) => { // If send fails, plan being torn // down, no place to send the error - let arrow_error = ArrowError::ExternalError(Box::new(e)); + let arrow_error = + ArrowError::External("".to_string(), Box::new(e)); sender.send(Err(arrow_error)).await.ok(); return; } diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 11f0946c91ff6..bdde0457d4cfd 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -22,13 +22,16 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use std::{any::Any, pin::Pin}; -use crate::execution::context::ExecutionContextState; -use crate::logical_plan::LogicalPlan; +//use crate::execution::context::ExecutionContextState; +//use crate::logical_plan::LogicalPlan; use crate::{error::Result, scalar::ScalarValue}; -use arrow::datatypes::{DataType, Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; -use arrow::{array::ArrayRef, datatypes::Field}; +use arrow2::array::Array; +use arrow2::datatypes::{DataType, Field, Schema}; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; + +type ArrayRef = Arc; +type SchemaRef = Arc; use async_trait::async_trait; use futures::stream::Stream; @@ -36,7 +39,7 @@ use futures::stream::Stream; use self::merge::MergeExec; use hashbrown::HashMap; -/// Trait for types that stream [arrow::record_batch::RecordBatch] +/// Trait for types that stream [arrow2::record_batch::RecordBatch] pub trait RecordBatchStream: Stream> { /// Returns the schema of this `RecordBatchStream`. /// @@ -375,6 +378,8 @@ pub mod string_expressions; pub mod type_coercion; pub mod udaf; pub mod udf; +/* #[cfg(feature = "unicode_expressions")] pub mod unicode_expressions; pub mod union; +*/ diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index d41d6968fee0d..5ee3fbbfd125c 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -41,12 +41,12 @@ use crate::{ optimizer::utils, prelude::ExecutionConfig, }; -use arrow::record_batch::RecordBatch; -use arrow::{ +use arrow2::record_batch::RecordBatch; +use arrow2::{ array::new_null_array, error::{ArrowError, Result as ArrowResult}, }; -use arrow::{ +use arrow2::{ array::{make_array, ArrayData, ArrayRef, BooleanArray, BooleanBufferBuilder}, buffer::MutableBuffer, datatypes::{DataType, Field, Schema, SchemaRef}, @@ -58,7 +58,7 @@ use parquet::file::{ }; use fmt::Debug; -use parquet::arrow::{ArrowReader, ParquetFileArrowReader}; +use parquet::arrow2::{ArrowReader, ParquetFileArrowReader}; use tokio::{ sync::mpsc::{channel, Receiver, Sender}, task, @@ -769,7 +769,7 @@ fn build_statistics_array( .map(|x| x.map(|b| b.len()).unwrap_or(0)) .sum(); let mut builder = - arrow::array::StringBuilder::with_capacity(statistics_count, data_size); + arrow2::array::StringBuilder::with_capacity(statistics_count, data_size); let string_statistics = statistics.map(|x| x.and_then(|bytes| std::str::from_utf8(bytes).ok())); for maybe_string in string_statistics { @@ -807,7 +807,7 @@ fn build_statistics_array( return statistics_array; } // cast statistics array to required data type - arrow::compute::cast(&statistics_array, data_type) + arrow2::compute::cast::cast(&statistics_array, data_type) .unwrap_or_else(|_| new_null_array(data_type, statistics_count)) } @@ -980,7 +980,7 @@ impl RecordBatchStream for ParquetStream 
{ #[cfg(test)] mod tests { use super::*; - use arrow::array::{Int32Array, StringArray}; + use arrow2::array::{Int32Array, StringArray}; use futures::StreamExt; use parquet::basic::Type as PhysicalType; use parquet::schema::types::SchemaDescPtr; @@ -1023,7 +1023,7 @@ mod tests { #[tokio::test] async fn test() -> Result<()> { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = arrow2::util::test_util::parquet_test_data(); let filename = format!("{}/alltypes_plain.parquet", testdata); let parquet_exec = ParquetExec::try_from_path( &filename, diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index ae6ad5075d877..ac57d5f4a2b8e 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -45,13 +45,15 @@ use crate::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr, PhysicalP use crate::prelude::JoinType; use crate::scalar::ScalarValue; use crate::variable::VarType; -use arrow::compute::can_cast_types; +use arrow2::compute::cast::can_cast_types; -use arrow::compute::SortOptions; -use arrow::datatypes::{Schema, SchemaRef}; +use arrow2::compute::SortOptions; +use arrow2::datatypes::Schema; use expressions::col; use log::debug; +type SchemaRef = Arc; + /// This trait exposes the ability to plan an [`ExecutionPlan`] out of a [`LogicalPlan`]. pub trait ExtensionPlanner { /// Create a physical plan for a [`UserDefinedLogicalNode`]. @@ -746,7 +748,7 @@ mod tests { logical_plan::{col, lit, sum, LogicalPlanBuilder}, physical_plan::SendableRecordBatchStream, }; - use arrow::datatypes::{DataType, Field, SchemaRef}; + use arrow2::datatypes::{DataType, Field}; use async_trait::async_trait; use fmt::Debug; use std::{any::Any, collections::HashMap, fmt}; @@ -769,7 +771,7 @@ mod tests { #[test] fn test_all_operators() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -809,7 +811,7 @@ mod tests { #[test] fn test_with_csv_plan() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -828,7 +830,7 @@ mod tests { #[test] fn errors() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -930,7 +932,7 @@ mod tests { #[test] fn in_list_types() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -978,7 +980,7 @@ mod tests { #[test] fn hash_agg_input_schema() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); diff --git a/datafusion/src/physical_plan/projection.rs b/datafusion/src/physical_plan/projection.rs index 
a881beb453a0c..3d4e4d27f9824 100644 --- a/datafusion/src/physical_plan/projection.rs +++ b/datafusion/src/physical_plan/projection.rs @@ -27,9 +27,11 @@ use std::task::{Context, Poll}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ExecutionPlan, Partitioning, PhysicalExpr}; -use arrow::datatypes::{Field, Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::datatypes::{Field, Schema}; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; + +type SchemaRef = Arc; use super::{RecordBatchStream, SendableRecordBatchStream}; use async_trait::async_trait; diff --git a/datafusion/src/physical_plan/regex_expressions.rs b/datafusion/src/physical_plan/regex_expressions.rs index b526e7259ef61..e18ba62265e53 100644 --- a/datafusion/src/physical_plan/regex_expressions.rs +++ b/datafusion/src/physical_plan/regex_expressions.rs @@ -25,31 +25,33 @@ use std::any::type_name; use std::sync::Arc; use crate::error::{DataFusionError, Result}; -use arrow::array::{ArrayRef, GenericStringArray, StringOffsetSizeTrait}; -use arrow::compute; +use arrow2::array::{Array, Offset, Utf8Array}; +use arrow2::compute; use hashbrown::HashMap; use regex::Regex; +type ArrayRef = Arc; + macro_rules! downcast_string_arg { ($ARG:expr, $NAME:expr, $T:ident) => {{ $ARG.as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal(format!( "could not cast {} to {}", $NAME, - type_name::>() + type_name::>() )) })? }}; } /// extract a specific group from a string column, using a regular expression -pub fn regexp_match(args: &[ArrayRef]) -> Result { +pub fn regexp_match(args: &[ArrayRef]) -> Result { match args.len() { - 2 => compute::regexp_match(downcast_string_arg!(args[0], "string", T), downcast_string_arg!(args[1], "pattern", T), None) + 2 => compute::regex_match::regex_match(downcast_string_arg!(args[0], "string", T), downcast_string_arg!(args[1], "pattern", T), None) .map_err(DataFusionError::ArrowError), - 3 => compute::regexp_match(downcast_string_arg!(args[0], "string", T), downcast_string_arg!(args[1], "pattern", T), Some(downcast_string_arg!(args[1], "flags", T))) + 3 => compute::regex_match::regex_match(downcast_string_arg!(args[0], "string", T), downcast_string_arg!(args[1], "pattern", T), Some(downcast_string_arg!(args[1], "flags", T))) .map_err(DataFusionError::ArrowError), other => Err(DataFusionError::Internal(format!( "regexp_match was called with {} arguments. It requires at least 2 and at most 3.", @@ -72,7 +74,7 @@ fn regex_replace_posix_groups(replacement: &str) -> String { /// Replaces substring(s) matching a POSIX regular expression. 
/// /// example: `regexp_replace('Thomas', '.[mN]a.', 'M') = 'ThM'` -pub fn regexp_replace(args: &[ArrayRef]) -> Result { +pub fn regexp_replace(args: &[ArrayRef]) -> Result { // creating Regex is expensive so create hashmap for memoization let mut patterns: HashMap = HashMap::new(); @@ -108,7 +110,7 @@ pub fn regexp_replace(args: &[ArrayRef]) -> Result Ok(None) }) - .collect::>>()?; + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } @@ -160,7 +162,7 @@ pub fn regexp_replace(args: &[ArrayRef]) -> Result Ok(None) }) - .collect::>>()?; + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 7243550127bde..b2e897dee99d4 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -25,9 +25,9 @@ use std::{any::Any, collections::HashMap, vec}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ExecutionPlan, Partitioning}; -use arrow::record_batch::RecordBatch; -use arrow::{array::Array, error::Result as ArrowResult}; -use arrow::{compute::take, datatypes::SchemaRef}; +use arrow2::record_batch::RecordBatch; +use arrow2::{array::Array, error::Result as ArrowResult}; +use arrow2::{compute::take, datatypes::Schema}; use tokio_stream::wrappers::UnboundedReceiverStream; use super::{hash_join::create_hashes, RecordBatchStream, SendableRecordBatchStream}; @@ -41,6 +41,7 @@ use tokio::sync::{ }; use tokio::task::JoinHandle; +type SchemaRef = Arc; type MaybeBatch = Option>; /// The repartition operator maps N input partitions to M output partitions based on a @@ -298,9 +299,9 @@ impl RecordBatchStream for RepartitionStream { mod tests { use super::*; use crate::physical_plan::memory::MemoryExec; - use arrow::array::UInt32Array; - use arrow::datatypes::{DataType, Field, Schema}; - use arrow::record_batch::RecordBatch; + use arrow2::array::UInt32Array; + use arrow2::datatypes::{DataType, Field, Schema}; + use arrow2::record_batch::RecordBatch; #[tokio::test] async fn one_to_many_round_robin() -> Result<()> { diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 010e4068638ba..298fcede42ada 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -30,12 +30,12 @@ use hashbrown::HashMap; use pin_project_lite::pin_project; -pub use arrow::compute::SortOptions; -use arrow::compute::{concat, lexsort_to_indices, take, SortColumn, TakeOptions}; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; -use arrow::{array::ArrayRef, error::ArrowError}; +pub use arrow2::compute::sort::SortOptions; +use arrow2::compute::{concat, sort::lexsort_to_indices, sort::SortColumn, take}; +use arrow2::datatypes::Schema; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; +use arrow2::{array::Array, error::ArrowError}; use super::{RecordBatchStream, SendableRecordBatchStream}; use crate::error::{DataFusionError, Result}; @@ -44,6 +44,9 @@ use crate::physical_plan::{ common, Distribution, ExecutionPlan, Partitioning, SQLMetric, }; +type SchemaRef = Arc; +type ArrayRef = Arc; + /// Sort execution plan #[derive(Debug)] pub struct SortExec { @@ -169,7 +172,7 @@ fn sort_batches( .iter() .enumerate() .map(|(i, _)| { - concat( + concat::concatenate( &batches .iter() .map(|batch| batch.column(i).as_ref()) @@ -196,17 +199,7 @@ fn sort_batches( combined_batch .columns() .iter() - .map(|column| { 
- take( - column.as_ref(), - &indices, - // disable bound check overhead since indices are already generated from - // the same record batch - Some(TakeOptions { - check_bounds: false, - }), - ) - }) + .map(|column| take::take(column.as_ref(), &indices)) .collect::>>()?, ); sorted_batch.map(Some) @@ -308,8 +301,8 @@ mod tests { csv::{CsvExec, CsvReadOptions}, }; use crate::test; - use arrow::array::*; - use arrow::datatypes::*; + use arrow2::array::*; + use arrow2::datatypes::*; #[tokio::test] async fn test_sort() -> Result<()> { diff --git a/datafusion/src/physical_plan/string_expressions.rs b/datafusion/src/physical_plan/string_expressions.rs index 882fe30502fdf..f2be35e5d0f14 100644 --- a/datafusion/src/physical_plan/string_expressions.rs +++ b/datafusion/src/physical_plan/string_expressions.rs @@ -28,25 +28,27 @@ use crate::{ error::{DataFusionError, Result}, scalar::ScalarValue, }; -use arrow::{ +use arrow2::{ array::{ - Array, ArrayRef, BooleanArray, GenericStringArray, Int32Array, Int64Array, - PrimitiveArray, StringArray, StringOffsetSizeTrait, + Array, BooleanArray, Int32Array, Int64Array, Offset, PrimitiveArray, Utf8Array, }, - datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType}, + datatypes::DataType, }; use super::ColumnarValue; +type StringArray = Utf8Array; +type ArrayRef = Arc; + macro_rules! downcast_string_arg { ($ARG:expr, $NAME:expr, $T:ident) => {{ $ARG.as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal(format!( "could not cast {} to {}", $NAME, - type_name::>() + type_name::>() )) })? }}; @@ -90,20 +92,20 @@ macro_rules! downcast_vec { } /// applies a unary expression to `args[0]` that is expected to be downcastable to -/// a `GenericStringArray` and returns a `GenericStringArray` (which may have a different offset) +/// a `Utf8Array` and returns a `Utf8Array` (which may have a different offset) /// # Errors /// This function errors when: /// * the number of arguments is not 1 -/// * the first argument is not castable to a `GenericStringArray` +/// * the first argument is not castable to a `Utf8Array` pub(crate) fn unary_string_function<'a, T, O, F, R>( args: &[&'a dyn Array], op: F, name: &str, -) -> Result> +) -> Result> where R: AsRef, - O: StringOffsetSizeTrait, - T: StringOffsetSizeTrait, + O: Offset, + T: Offset, F: Fn(&'a str) -> R, { if args.len() != 1 { @@ -174,7 +176,7 @@ where /// Returns the numeric code of the first character of the argument. /// ascii('x') = 120 -pub fn ascii(args: &[ArrayRef]) -> Result { +pub fn ascii(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let result = string_array @@ -192,7 +194,7 @@ pub fn ascii(args: &[ArrayRef]) -> Result { /// Removes the longest string containing only characters in characters (a space by default) from the start and end of string. 
/// btrim('xyxtrimyyx', 'xyz') = 'trim' -pub fn btrim(args: &[ArrayRef]) -> Result { +pub fn btrim(args: &[ArrayRef]) -> Result { match args.len() { 1 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -204,7 +206,7 @@ pub fn btrim(args: &[ArrayRef]) -> Result { string.trim_start_matches(' ').trim_end_matches(' ') }) }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -227,7 +229,7 @@ pub fn btrim(args: &[ArrayRef]) -> Result { ) } }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -307,7 +309,7 @@ pub fn concat(args: &[ColumnarValue]) -> Result { } Some(owned_string) }) - .collect::(); + .collect::>(); Ok(ColumnarValue::Array(Arc::new(result))) } else { @@ -370,7 +372,7 @@ pub fn concat_ws(args: &[ArrayRef]) -> Result { /// Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. /// initcap('hi THOMAS') = 'Hi Thomas' -pub fn initcap(args: &[ArrayRef]) -> Result { +pub fn initcap(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); // first map is the iterator, second is for the `Option<_>` @@ -393,7 +395,7 @@ pub fn initcap(args: &[ArrayRef]) -> Result char_vector.iter().collect::() }) }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -406,7 +408,7 @@ pub fn lower(args: &[ColumnarValue]) -> Result { /// Removes the longest string containing only characters in characters (a space by default) from the start of string. /// ltrim('zzzytest', 'xyz') = 'test' -pub fn ltrim(args: &[ArrayRef]) -> Result { +pub fn ltrim(args: &[ArrayRef]) -> Result { match args.len() { 1 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -414,7 +416,7 @@ pub fn ltrim(args: &[ArrayRef]) -> Result { let result = string_array .iter() .map(|string| string.map(|string: &str| string.trim_start_matches(' '))) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -432,7 +434,7 @@ pub fn ltrim(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -445,7 +447,7 @@ pub fn ltrim(args: &[ArrayRef]) -> Result { /// Repeats string the specified number of times. /// repeat('Pg', 4) = 'PgPgPgPg' -pub fn repeat(args: &[ArrayRef]) -> Result { +pub fn repeat(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let number_array = downcast_arg!(args[1], "number", Int64Array); @@ -456,14 +458,14 @@ pub fn repeat(args: &[ArrayRef]) -> Result { (Some(string), Some(number)) => Some(string.repeat(number as usize)), _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Replaces all occurrences in string of substring from with substring to. /// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef' -pub fn replace(args: &[ArrayRef]) -> Result { +pub fn replace(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let from_array = downcast_string_arg!(args[1], "from", T); let to_array = downcast_string_arg!(args[2], "to", T); @@ -476,14 +478,14 @@ pub fn replace(args: &[ArrayRef]) -> Result (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)), _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Removes the longest string containing only characters in characters (a space by default) from the end of string. 
/// rtrim('testxxzx', 'xyz') = 'test' -pub fn rtrim(args: &[ArrayRef]) -> Result { +pub fn rtrim(args: &[ArrayRef]) -> Result { match args.len() { 1 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -491,7 +493,7 @@ pub fn rtrim(args: &[ArrayRef]) -> Result { let result = string_array .iter() .map(|string| string.map(|string: &str| string.trim_end_matches(' '))) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -509,7 +511,7 @@ pub fn rtrim(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -522,7 +524,7 @@ pub fn rtrim(args: &[ArrayRef]) -> Result { /// Splits string at occurrences of delimiter and returns the n'th field (counting from one). /// split_part('abc~@~def~@~ghi', '~@~', 2) = 'def' -pub fn split_part(args: &[ArrayRef]) -> Result { +pub fn split_part(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let delimiter_array = downcast_string_arg!(args[1], "delimiter", T); let n_array = downcast_arg!(args[2], "n", Int64Array); @@ -547,14 +549,14 @@ pub fn split_part(args: &[ArrayRef]) -> Result Ok(None), }) - .collect::>>()?; + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } /// Returns true if string starts with prefix. /// starts_with('alphabet', 'alph') = 't' -pub fn starts_with(args: &[ArrayRef]) -> Result { +pub fn starts_with(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let prefix_array = downcast_string_arg!(args[1], "prefix", T); @@ -572,10 +574,7 @@ pub fn starts_with(args: &[ArrayRef]) -> Result(args: &[ArrayRef]) -> Result -where - T::Native: StringOffsetSizeTrait, -{ +pub fn to_hex(args: &[ArrayRef]) -> Result { let integer_array = downcast_primitive_array_arg!(args[0], "integer", T); let result = integer_array @@ -583,7 +582,7 @@ where .map(|integer| { integer.map(|integer| format!("{:x}", integer.to_usize().unwrap())) }) - .collect::>(); + .collect::(); Ok(Arc::new(result) as ArrayRef) } diff --git a/datafusion/src/physical_plan/type_coercion.rs b/datafusion/src/physical_plan/type_coercion.rs index d9f84e7cb8622..063ed67ca3f58 100644 --- a/datafusion/src/physical_plan/type_coercion.rs +++ b/datafusion/src/physical_plan/type_coercion.rs @@ -31,7 +31,7 @@ use std::{sync::Arc, vec}; -use arrow::datatypes::{DataType, Schema, TimeUnit}; +use arrow2::datatypes::{DataType, Schema, TimeUnit}; use super::{functions::Signature, PhysicalExpr}; use crate::error::{DataFusionError, Result}; @@ -205,7 +205,7 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { mod tests { use super::*; use crate::physical_plan::expressions::col; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow2::datatypes::{DataType, Field, Schema}; #[test] fn test_maybe_data_types() { diff --git a/datafusion/src/physical_plan/udaf.rs b/datafusion/src/physical_plan/udaf.rs index 3dc6aa402f527..45833e82553dc 100644 --- a/datafusion/src/physical_plan/udaf.rs +++ b/datafusion/src/physical_plan/udaf.rs @@ -21,7 +21,7 @@ use fmt::{Debug, Formatter}; use std::any::Any; use std::fmt; -use arrow::{ +use arrow2::{ datatypes::Field, datatypes::{DataType, Schema}, }; diff --git a/datafusion/src/physical_plan/udf.rs b/datafusion/src/physical_plan/udf.rs index 9189da47bd6f8..8255414641de5 100644 --- a/datafusion/src/physical_plan/udf.rs +++ b/datafusion/src/physical_plan/udf.rs @@ -20,7 +20,7 @@ use fmt::{Debug, Formatter}; use std::fmt; -use arrow::datatypes::Schema; +use 
arrow2::datatypes::Schema; use crate::error::Result; use crate::{logical_plan::Expr, physical_plan::PhysicalExpr}; diff --git a/datafusion/src/physical_plan/unicode_expressions.rs b/datafusion/src/physical_plan/unicode_expressions.rs index 787ea7ea26730..1b5306dbc0ca6 100644 --- a/datafusion/src/physical_plan/unicode_expressions.rs +++ b/datafusion/src/physical_plan/unicode_expressions.rs @@ -26,24 +26,24 @@ use std::cmp::Ordering; use std::sync::Arc; use crate::error::{DataFusionError, Result}; -use arrow::{ - array::{ - ArrayRef, GenericStringArray, Int64Array, PrimitiveArray, StringOffsetSizeTrait, - }, - datatypes::{ArrowNativeType, ArrowPrimitiveType}, +use arrow2::{ + array::{Array, Int64Array, Offset, PrimitiveArray, Utf8Array}, + types::NativeType, }; use hashbrown::HashMap; use unicode_segmentation::UnicodeSegmentation; +type ArrayRef = Arc; + macro_rules! downcast_string_arg { ($ARG:expr, $NAME:expr, $T:ident) => {{ $ARG.as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal(format!( "could not cast {} to {}", $NAME, - type_name::>() + type_name::>() )) })? }}; @@ -63,13 +63,10 @@ macro_rules! downcast_arg { /// Returns number of characters in the string. /// character_length('josé') = 4 -pub fn character_length(args: &[ArrayRef]) -> Result -where - T::Native: StringOffsetSizeTrait, -{ - let string_array: &GenericStringArray = args[0] +pub fn character_length(args: &[ArrayRef]) -> Result { + let string_array: &Utf8Array = args[0] .as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal("could not cast string to StringArray".to_string()) })?; @@ -78,7 +75,7 @@ where .iter() .map(|string| { string.map(|string: &str| { - T::Native::from_usize(string.graphemes(true).count()).expect( + T::from_usize(string.graphemes(true).count()).expect( "should not fail as graphemes.count will always return integer", ) }) @@ -90,7 +87,7 @@ where /// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. /// left('abcde', 2) = 'ab' -pub fn left(args: &[ArrayRef]) -> Result { +pub fn left(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let n_array = downcast_arg!(args[1], "n", Int64Array); @@ -117,14 +114,14 @@ pub fn left(args: &[ArrayRef]) -> Result { }, _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Extends the string to length 'length' by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). /// lpad('hi', 5, 'xy') = 'xyxhi' -pub fn lpad(args: &[ArrayRef]) -> Result { +pub fn lpad(args: &[ArrayRef]) -> Result { match args.len() { 2 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -154,7 +151,7 @@ pub fn lpad(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -200,7 +197,7 @@ pub fn lpad(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -213,7 +210,7 @@ pub fn lpad(args: &[ArrayRef]) -> Result { /// Reverses the order of the characters in the string. 
/// reverse('abcde') = 'edcba' -pub fn reverse(args: &[ArrayRef]) -> Result { +pub fn reverse(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let result = string_array @@ -221,14 +218,14 @@ pub fn reverse(args: &[ArrayRef]) -> Result .map(|string| { string.map(|string: &str| string.graphemes(true).rev().collect::()) }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Returns last n characters in the string, or when n is negative, returns all but first |n| characters. /// right('abcde', 2) = 'de' -pub fn right(args: &[ArrayRef]) -> Result { +pub fn right(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let n_array = downcast_arg!(args[1], "n", Int64Array); @@ -269,14 +266,14 @@ pub fn right(args: &[ArrayRef]) -> Result { }, _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } /// Extends the string to length 'length' by appending the characters fill (a space by default). If the string is already longer than length then it is truncated. /// rpad('hi', 5, 'xy') = 'hixyx' -pub fn rpad(args: &[ArrayRef]) -> Result { +pub fn rpad(args: &[ArrayRef]) -> Result { match args.len() { 2 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -303,7 +300,7 @@ pub fn rpad(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -340,7 +337,7 @@ pub fn rpad(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -353,20 +350,17 @@ pub fn rpad(args: &[ArrayRef]) -> Result { /// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.) /// strpos('high', 'ig') = 2 -pub fn strpos(args: &[ArrayRef]) -> Result -where - T::Native: StringOffsetSizeTrait, -{ - let string_array: &GenericStringArray = args[0] +pub fn strpos(args: &[ArrayRef]) -> Result { + let string_array: &Utf8Array = args[0] .as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal("could not cast string to StringArray".to_string()) })?; - let substring_array: &GenericStringArray = args[1] + let substring_array: &Utf8Array = args[1] .as_any() - .downcast_ref::>() + .downcast_ref::>() .ok_or_else(|| { DataFusionError::Internal( "could not cast substring to StringArray".to_string(), @@ -382,7 +376,7 @@ where // this method first finds the matching byte using rfind // then maps that to the character index by matching on the grapheme_index of the byte_index Some( - T::Native::from_usize(string.to_string().rfind(substring).map_or( + T::from_usize(string.to_string().rfind(substring).map_or( 0, |byte_offset| { string @@ -412,7 +406,7 @@ where /// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) 
/// substr('alphabet', 3) = 'phabet' /// substr('alphabet', 3, 2) = 'ph' -pub fn substr(args: &[ArrayRef]) -> Result { +pub fn substr(args: &[ArrayRef]) -> Result { match args.len() { 2 => { let string_array = downcast_string_arg!(args[0], "string", T); @@ -437,7 +431,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } @@ -476,7 +470,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { } _ => Ok(None), }) - .collect::>>()?; + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } @@ -489,7 +483,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { /// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. /// translate('12345', '143', 'ax') = 'a2x5' -pub fn translate(args: &[ArrayRef]) -> Result { +pub fn translate(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let from_array = downcast_string_arg!(args[1], "from", T); let to_array = downcast_string_arg!(args[2], "to", T); @@ -526,7 +520,7 @@ pub fn translate(args: &[ArrayRef]) -> Result None, }) - .collect::>(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } diff --git a/datafusion/src/physical_plan/union.rs b/datafusion/src/physical_plan/union.rs index cbab728a8428b..836045354821e 100644 --- a/datafusion/src/physical_plan/union.rs +++ b/datafusion/src/physical_plan/union.rs @@ -23,7 +23,8 @@ use std::{any::Any, sync::Arc}; -use arrow::datatypes::SchemaRef; +use arrow2::datatypes::Schema; +type SchemaRef = Arc; use super::{ExecutionPlan, Partitioning, SendableRecordBatchStream}; use crate::error::Result; @@ -104,7 +105,7 @@ mod tests { csv::{CsvExec, CsvReadOptions}, }; use crate::test; - use arrow::record_batch::RecordBatch; + use arrow2::record_batch::RecordBatch; #[tokio::test] async fn test_union_partitions() -> Result<()> { diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 833f707e971ea..a1ae30fd02da6 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -19,21 +19,10 @@ use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; -use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; -use arrow::{ - array::*, - datatypes::{ArrowNativeType, Float32Type, TimestampNanosecondType}, -}; -use arrow::{ - array::{ - ArrayRef, Int16Builder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, - }, - datatypes::{ - TimestampMicrosecondType, TimestampMillisecondType, TimestampSecondType, - }, -}; +use arrow2::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; +use arrow2::{array::*, types::NativeType}; + +type ArrayRef = Arc; use crate::error::{DataFusionError, Result}; @@ -114,6 +103,7 @@ macro_rules! 
build_list { ))), $SIZE, ) + .into() } Some(values) => { let mut builder = ListBuilder::new($VALUE_BUILDER_TY::new(values.len())); @@ -238,87 +228,85 @@ impl ScalarValue { } ScalarValue::Float64(e) => match e { Some(value) => Arc::new(Float64Array::from_value(*value, size)), - None => new_null_array(&DataType::Float64, size), + None => new_null_array(&DataType::Float64, size).into(), }, ScalarValue::Float32(e) => match e { Some(value) => Arc::new(Float32Array::from_value(*value, size)), - None => new_null_array(&DataType::Float32, size), + None => new_null_array(&DataType::Float32, size).into(), }, ScalarValue::Int8(e) => match e { Some(value) => Arc::new(Int8Array::from_value(*value, size)), - None => new_null_array(&DataType::Int8, size), + None => new_null_array(&DataType::Int8, size).into(), }, ScalarValue::Int16(e) => match e { Some(value) => Arc::new(Int16Array::from_value(*value, size)), - None => new_null_array(&DataType::Int16, size), + None => new_null_array(&DataType::Int16, size).into(), }, ScalarValue::Int32(e) => match e { Some(value) => Arc::new(Int32Array::from_value(*value, size)), - None => new_null_array(&DataType::Int32, size), + None => new_null_array(&DataType::Int32, size).into(), }, ScalarValue::Int64(e) => match e { Some(value) => Arc::new(Int64Array::from_value(*value, size)), - None => new_null_array(&DataType::Int64, size), + None => new_null_array(&DataType::Int64, size).into(), }, ScalarValue::UInt8(e) => match e { Some(value) => Arc::new(UInt8Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt8, size), + None => new_null_array(&DataType::UInt8, size).into(), }, ScalarValue::UInt16(e) => match e { Some(value) => Arc::new(UInt16Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt16, size), + None => new_null_array(&DataType::UInt16, size).into(), }, ScalarValue::UInt32(e) => match e { Some(value) => Arc::new(UInt32Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt32, size), + None => new_null_array(&DataType::UInt32, size).into(), }, ScalarValue::UInt64(e) => match e { Some(value) => Arc::new(UInt64Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt64, size), + None => new_null_array(&DataType::UInt64, size).into(), }, ScalarValue::TimestampSecond(e) => match e { - Some(value) => Arc::new(TimestampSecondArray::from_iter_values( - repeat(*value).take(size), - )), + Some(value) => { + Arc::new(UInt64Array::from_iter_values(repeat(*value).take(size))) + } None => { new_null_array(&DataType::Timestamp(TimeUnit::Second, None), size) + .into() } }, ScalarValue::TimestampMillisecond(e) => match e { - Some(value) => Arc::new(TimestampMillisecondArray::from_iter_values( - repeat(*value).take(size), - )), + Some(value) => { + Arc::new(UInt64Array::from_iter_values(repeat(*value).take(size))) + } None => new_null_array( &DataType::Timestamp(TimeUnit::Millisecond, None), size, - ), + ) + .into(), }, ScalarValue::TimestampMicrosecond(e) => match e { - Some(value) => { - Arc::new(TimestampMicrosecondArray::from_value(*value, size)) - } + Some(value) => Arc::new(UInt64Array::from_value(*value, size)), None => new_null_array( &DataType::Timestamp(TimeUnit::Microsecond, None), size, ), }, ScalarValue::TimestampNanosecond(e) => match e { - Some(value) => { - Arc::new(TimestampNanosecondArray::from_value(*value, size)) - } + Some(value) => Arc::new(UInt64Array::from_value(*value, size)), None => { new_null_array(&DataType::Timestamp(TimeUnit::Nanosecond, None), size) } }, 
ScalarValue::Utf8(e) => match e { Some(value) => { - Arc::new(StringArray::from_iter_values(repeat(value).take(size))) + Arc::new(Utf8Array::::from_iter_values(repeat(value).take(size))) } None => new_null_array(&DataType::Utf8, size), }, ScalarValue::LargeUtf8(e) => match e { Some(value) => { - Arc::new(LargeStringArray::from_iter_values(repeat(value).take(size))) + Arc::new(Utf8Array::::from_iter_values(repeat(value).take(size))) } None => new_null_array(&DataType::LargeUtf8, size), }, @@ -336,12 +324,12 @@ impl ScalarValue { Some(value) => Arc::new( repeat(Some(value.as_slice())) .take(size) - .collect::(), + .collect::>(), ), None => Arc::new( repeat(None::<&str>) .take(size) - .collect::(), + .collect::>(), ), }, ScalarValue::List(values, data_type) => Arc::new(match data_type { @@ -742,42 +730,6 @@ impl fmt::Debug for ScalarValue { } } -/// Trait used to map a NativeTime to a ScalarType. -pub trait ScalarType { - /// returns a scalar from an optional T - fn scalar(r: Option) -> ScalarValue; -} - -impl ScalarType for Float32Type { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::Float32(r) - } -} - -impl ScalarType for TimestampSecondType { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::TimestampSecond(r) - } -} - -impl ScalarType for TimestampMillisecondType { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::TimestampMillisecond(r) - } -} - -impl ScalarType for TimestampMicrosecondType { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::TimestampMicrosecond(r) - } -} - -impl ScalarType for TimestampNanosecondType { - fn scalar(r: Option) -> ScalarValue { - ScalarValue::TimestampNanosecond(r) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index a40d0becdcb4b..3caffa5568b47 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -39,7 +39,7 @@ use crate::{ sql::parser::{CreateExternalTable, FileType, Statement as DFStatement}, }; -use arrow::datatypes::*; +use arrow2::datatypes::*; use hashbrown::HashMap; use crate::prelude::JoinType; diff --git a/datafusion/src/test/exec.rs b/datafusion/src/test/exec.rs index 04cd29530c016..489d684ed88dd 100644 --- a/datafusion/src/test/exec.rs +++ b/datafusion/src/test/exec.rs @@ -19,11 +19,11 @@ use std::task::{Context, Poll}; -use arrow::{ - datatypes::SchemaRef, error::Result as ArrowResult, record_batch::RecordBatch, -}; +use arrow2::{datatypes::Schema, error::Result as ArrowResult, record_batch::RecordBatch}; use futures::Stream; +type SchemaRef = Arc; + use crate::physical_plan::RecordBatchStream; /// Index into the data that has been returned so far diff --git a/datafusion/src/test/mod.rs b/datafusion/src/test/mod.rs index 926a692261691..8247e28f647f9 100644 --- a/datafusion/src/test/mod.rs +++ b/datafusion/src/test/mod.rs @@ -24,9 +24,9 @@ use array::{ Array, ArrayRef, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, }; -use arrow::array::{self, Int32Array}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; +use arrow2::array::{self, Int32Array}; +use arrow2::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow2::record_batch::RecordBatch; use std::fs::File; use std::io::prelude::*; use std::io::{BufReader, BufWriter}; @@ -52,7 +52,7 @@ pub fn create_table_dual() -> Arc { /// Generated partitioned copy of a CSV file pub fn create_partitioned_csv(filename: &str, partitions: usize) -> Result { - let 
testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); let path = format!("{}/csv/{}", testdata, filename); let tmp_dir = TempDir::new()?; @@ -292,7 +292,7 @@ macro_rules! assert_batches_eq { let expected_lines: Vec = $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); - let formatted = arrow::util::pretty::pretty_format_batches($CHUNKS).unwrap(); + let formatted = arrow2::util::pretty::pretty_format_batches($CHUNKS).unwrap(); let actual_lines: Vec<&str> = formatted.trim().lines().collect(); @@ -326,7 +326,7 @@ macro_rules! assert_batches_sorted_eq { expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() } - let formatted = arrow::util::pretty::pretty_format_batches($CHUNKS).unwrap(); + let formatted = arrow2::util::pretty::pretty_format_batches($CHUNKS).unwrap(); // fix for windows: \r\n --> let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); diff --git a/datafusion/tests/custom_sources.rs b/datafusion/tests/custom_sources.rs index a00dd6ac28216..460466de92dbf 100644 --- a/datafusion/tests/custom_sources.rs +++ b/datafusion/tests/custom_sources.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::Int32Array; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow2::array::Int32Array; +use arrow2::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow2::error::Result as ArrowResult; +use arrow2::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result}; use datafusion::{ diff --git a/datafusion/tests/dataframe.rs b/datafusion/tests/dataframe.rs index b93e21f4ababb..b6465bcb41c09 100644 --- a/datafusion/tests/dataframe.rs +++ b/datafusion/tests/dataframe.rs @@ -17,8 +17,8 @@ use std::sync::Arc; -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::{ +use arrow2::datatypes::{DataType, Field, Schema}; +use arrow2::{ array::{Int32Array, StringArray}, record_batch::RecordBatch, }; diff --git a/datafusion/tests/provider_filter_pushdown.rs b/datafusion/tests/provider_filter_pushdown.rs index 0bf67bea8b9d4..1696dcb15dc7a 100644 --- a/datafusion/tests/provider_filter_pushdown.rs +++ b/datafusion/tests/provider_filter_pushdown.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::array::{as_primitive_array, Int32Builder, UInt64Array}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; +use arrow2::array::{as_primitive_array, Int32Builder, UInt64Array}; +use arrow2::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow2::record_batch::RecordBatch; use async_trait::async_trait; use datafusion::datasource::datasource::{ Statistics, TableProvider, TableProviderFilterPushDown, diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 70baffc700ba2..d53ccf93add5b 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -24,9 +24,9 @@ use chrono::Duration; extern crate arrow; extern crate datafusion; -use arrow::{array::*, datatypes::TimeUnit}; -use arrow::{datatypes::Int32Type, datatypes::Int64Type, record_batch::RecordBatch}; -use arrow::{ +use arrow2::{array::*, datatypes::TimeUnit}; +use arrow2::{datatypes::Int32Type, datatypes::Int64Type, record_batch::RecordBatch}; +use arrow2::{ datatypes::{DataType, Field, Schema, SchemaRef}, util::display::array_value_to_string, }; @@ -123,7 +123,7 @@ async fn parquet_query() { #[tokio::test] async fn parquet_single_nan_schema() { let mut ctx = ExecutionContext::new(); - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = arrow2::util::test_util::parquet_test_data(); ctx.register_parquet("single_nan", &format!("{}/single_nan.parquet", testdata)) .unwrap(); let sql = "SELECT mycol FROM single_nan"; @@ -141,7 +141,7 @@ async fn parquet_single_nan_schema() { #[ignore = "Test ignored, will be enabled as part of the nested Parquet reader"] async fn parquet_list_columns() { let mut ctx = ExecutionContext::new(); - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = arrow2::util::test_util::parquet_test_data(); ctx.register_parquet( "list_columns", &format!("{}/list_columns.parquet", testdata), @@ -1484,7 +1484,7 @@ fn aggr_test_schema() -> SchemaRef { } async fn register_aggregate_csv_by_sql(ctx: &mut ExecutionContext) { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); // TODO: The following c9 should be migrated to UInt32 and c10 should be UInt64 once // unsigned is supported. 
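Editor's note: the test hunks above redirect `arrow::util::test_util::arrow_test_data()` to `arrow2::util::test_util::arrow_test_data()`. If the pinned arrow2 revision turns out not to ship an equivalent `util::test_util` module, a small local shim reading the same `ARROW_TEST_DATA` environment variable (the convention the arrow-rs helper relies on) could stand in; the helper name and fallback path below are assumptions for illustration, not part of either crate's API:

```rust
use std::env;

/// Hypothetical stand-in for `arrow_test_data()`: resolve the Arrow testing
/// data directory from the ARROW_TEST_DATA environment variable, falling back
/// to a relative checkout of apache/arrow-testing (assumed path).
fn arrow_test_data() -> String {
    env::var("ARROW_TEST_DATA").unwrap_or_else(|_| "../testing/data".to_string())
}

fn main() {
    let path = format!("{}/csv/aggregate_test_100.csv", arrow_test_data());
    println!("reading test csv from {}", path);
}
```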
@@ -1524,7 +1524,7 @@ async fn register_aggregate_csv_by_sql(ctx: &mut ExecutionContext) { } fn register_aggregate_csv(ctx: &mut ExecutionContext) -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = arrow2::util::test_util::arrow_test_data(); let schema = aggr_test_schema(); ctx.register_csv( "aggregate_test_100", @@ -1551,7 +1551,7 @@ fn register_aggregate_simple_csv(ctx: &mut ExecutionContext) -> Result<()> { } fn register_alltypes_parquet(ctx: &mut ExecutionContext) { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = arrow2::util::test_util::parquet_test_data(); ctx.register_parquet( "alltypes_plain", &format!("{}/alltypes_plain.parquet", testdata), @@ -2740,7 +2740,7 @@ async fn test_cast_expressions_error() -> Result<()> { Ok(_) => panic!("expected error"), Err(e) => { assert!(e.to_string().contains( - "Cast error: Cannot cast string 'c' to value of arrow::datatypes::types::Int32Type type" + "Cast error: Cannot cast string 'c' to value of arrow2::datatypes::types::Int32Type type" )) } } diff --git a/datafusion/tests/user_defined_plan.rs b/datafusion/tests/user_defined_plan.rs index f9f24430104c8..194c39ce846e7 100644 --- a/datafusion/tests/user_defined_plan.rs +++ b/datafusion/tests/user_defined_plan.rs @@ -60,7 +60,7 @@ use futures::{Stream, StreamExt}; -use arrow::{ +use arrow2::{ array::{Int64Array, StringArray}, datatypes::SchemaRef, error::ArrowError,