From 3b8d1a19dd66c3062aa0dd12cfba425e33195f02 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Tue, 6 Aug 2024 03:46:25 -0400 Subject: [PATCH 01/11] avoid copy --- .../src/datasource/physical_plan/arrow_file.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index b4edc221c1f8..745392de87f2 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -297,10 +297,7 @@ impl FileOpener for ArrowOpener { for (dict_block, dict_result) in footer.dictionaries().iter().flatten().zip(dict_results) { - decoder.read_dictionary( - dict_block, - &Buffer::from_bytes(dict_result.into()), - )?; + decoder.read_dictionary(dict_block, &Buffer::from_bytes(dict_result.into()))?; } // filter recordbatches according to range @@ -335,12 +332,11 @@ impl FileOpener for ArrowOpener { .into_iter() .zip(recordbatch_results) .filter_map(move |(block, data)| { - decoder - .read_record_batch( - &block, - &Buffer::from_bytes(data.into()), - ) - .transpose() + match decoder.read_record_batch(&block, &Buffer::from_bytes(data.into())) { + Ok(Some(record_batch)) => Some(Ok(record_batch)), + Ok(None) => None, + Err(err) => Some(Err(err)), + } }), ) .boxed()) From 05bf29ca8dd5c4a0c09462c16cf08cba146f6cc8 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Tue, 6 Aug 2024 03:48:07 -0400 Subject: [PATCH 02/11] fmt --- .../core/src/datasource/physical_plan/arrow_file.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 745392de87f2..152d0bf09c5b 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -297,7 +297,10 @@ impl FileOpener for ArrowOpener { for (dict_block, dict_result) in footer.dictionaries().iter().flatten().zip(dict_results) { - decoder.read_dictionary(dict_block, &Buffer::from_bytes(dict_result.into()))?; + decoder.read_dictionary( + dict_block, + &Buffer::from_bytes(dict_result.into()), + )?; } // filter recordbatches according to range @@ -332,7 +335,10 @@ impl FileOpener for ArrowOpener { .into_iter() .zip(recordbatch_results) .filter_map(move |(block, data)| { - match decoder.read_record_batch(&block, &Buffer::from_bytes(data.into())) { + match decoder.read_record_batch( + &block, + &Buffer::from_bytes(data.into()), + ) { Ok(Some(record_batch)) => Some(Ok(record_batch)), Ok(None) => None, Err(err) => Some(Err(err)), From 26d28260d720f327c1e767e055144bb0c2d04b4f Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Wed, 7 Aug 2024 00:32:05 -0400 Subject: [PATCH 03/11] update sqllogictest --- .../sqllogictest/test_files/aggregate.slt | 6 +- .../sqllogictest/test_files/arrow_typeof.slt | 8 +- datafusion/sqllogictest/test_files/ddl.slt | 2 +- .../sqllogictest/test_files/describe.slt | 4 +- .../sqllogictest/test_files/explain.slt | 12 +-- datafusion/sqllogictest/test_files/expr.slt | 80 +++++++------- .../test_files/information_schema.slt | 4 +- .../sqllogictest/test_files/interval.slt | 102 +++++++++--------- datafusion/sqllogictest/test_files/map.slt | 2 +- datafusion/sqllogictest/test_files/math.slt | 24 ++--- .../test_files/repartition_scan.slt | 8 +- .../sqllogictest/test_files/timestamps.slt | 16 +-- 12 files changed, 134 insertions(+), 134 
deletions(-) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index c68a6c345caa..3d9c5534e237 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -1788,7 +1788,7 @@ from values (interval '2 month 15 days'), (interval '-2 month') ---- -Interval(MonthDayNano) 0 years -2 mons 0 days 0 hours 0 mins 0.000000000 secs 0 years 2 mons 15 days 0 hours 0 mins 0.000000000 secs +Interval(MonthDayNano) -2 mons 2 mons 15 days # aggregate Interval(DayTime) min/max query T?? @@ -1799,7 +1799,7 @@ from values (arrow_cast('-3 minutes', 'Interval(DayTime)')), (arrow_cast('30 minutes', 'Interval(DayTime)')); ---- -Interval(DayTime) 0 years 0 mons 0 days 0 hours -3 mins 0.000 secs 0 years 0 mons 0 days 1 hours 0 mins 0.000 secs +Interval(DayTime) -3 mins 1 hours # aggregate Interval(YearMonth) min/max query T?? @@ -1810,7 +1810,7 @@ from values (arrow_cast('13 months', 'Interval(YearMonth)')), (arrow_cast('1 year', 'Interval(YearMonth)')); ---- -Interval(YearMonth) -1 years 0 mons 0 days 0 hours 0 mins 0.00 secs 1 years 1 mons 0 days 0 hours 0 mins 0.00 secs +Interval(YearMonth) -1 years 0 mons 1 years 1 mons # aggregate query II diff --git a/datafusion/sqllogictest/test_files/arrow_typeof.slt b/datafusion/sqllogictest/test_files/arrow_typeof.slt index 448706744305..73183b60675a 100644 --- a/datafusion/sqllogictest/test_files/arrow_typeof.slt +++ b/datafusion/sqllogictest/test_files/arrow_typeof.slt @@ -289,22 +289,22 @@ query ? --- select arrow_cast(interval '30 minutes', 'Interval(MonthDayNano)'); ---- -0 years 0 mons 0 days 0 hours 30 mins 0.000000000 secs +30 mins query ? select arrow_cast('30 minutes', 'Interval(DayTime)'); ---- -0 years 0 mons 0 days 0 hours 30 mins 0.000 secs +30 mins query ? select arrow_cast('1 year 5 months', 'Interval(YearMonth)'); ---- -1 years 5 mons 0 days 0 hours 0 mins 0.00 secs +1 years 5 mons query ? 
select arrow_cast('30 minutes', 'Interval(MonthDayNano)'); ---- -0 years 0 mons 0 days 0 hours 30 mins 0.000000000 secs +30 mins ## Duration diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index a35e688479e7..c6b718d91831 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -707,7 +707,7 @@ create table t (i interval, x int) as values (interval '5 days 3 nanoseconds', C query ?I select * from t; ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000003 secs 1 +5 days 0.000000003 secs 1 statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt index a15c3a109cab..57cb8a29fcc7 100644 --- a/datafusion/sqllogictest/test_files/describe.slt +++ b/datafusion/sqllogictest/test_files/describe.slt @@ -81,8 +81,8 @@ int_col Int32 YES bigint_col Int64 YES float_col Float32 YES double_col Float64 YES -date_string_col Utf8 YES -string_col Utf8 YES +date_string_col Utf8View YES +string_col Utf8View YES timestamp_col Timestamp(Nanosecond, None) YES year Int32 YES month Int32 YES diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index eae4f428b4b4..173a0f8c13fc 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -309,8 +309,8 @@ initial_physical_plan 01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] 02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] initial_physical_plan_with_schema -01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N] -02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N] +01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] +02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, 
int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] physical_plan after OutputRequirements 01)OutputRequirementExec, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] 02)--GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] @@ -332,7 +332,7 @@ physical_plan after ProjectionPushdown SAME TEXT AS ABOVE physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] physical_plan after SanityCheckPlan SAME TEXT AS ABOVE physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] -physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N] +physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] statement ok @@ -349,8 +349,8 @@ initial_physical_plan_with_stats 01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] 02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] initial_physical_plan_with_schema -01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, 
double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N] -02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N] +01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] +02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] physical_plan after OutputRequirements 01)OutputRequirementExec 02)--GlobalLimitExec: skip=0, fetch=10 @@ -373,7 +373,7 @@ physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE physical_plan after SanityCheckPlan SAME TEXT AS ABOVE physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10 physical_plan_with_stats ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] -physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N] +physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] statement ok diff --git a/datafusion/sqllogictest/test_files/expr.slt 
b/datafusion/sqllogictest/test_files/expr.slt index 1d5f9ba23d58..886a6c95c035 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -122,203 +122,203 @@ SELECT query ? SELECT interval '1' ---- -0 years 0 mons 0 days 0 hours 0 mins 1.000000000 secs +1.000000000 secs query ? SELECT interval '1 second' ---- -0 years 0 mons 0 days 0 hours 0 mins 1.000000000 secs +1.000000000 secs query ? SELECT interval '500 milliseconds' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.500000000 secs +0.500000000 secs query ? SELECT interval '5 second' ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? SELECT interval '0.5 minute' ---- -0 years 0 mons 0 days 0 hours 0 mins 30.000000000 secs +30.000000000 secs query ? SELECT interval '.5 minute' ---- -0 years 0 mons 0 days 0 hours 0 mins 30.000000000 secs +30.000000000 secs query ? SELECT interval '5 minute' ---- -0 years 0 mons 0 days 0 hours 5 mins 0.000000000 secs +5 mins query ? SELECT interval '5 minute 1 second' ---- -0 years 0 mons 0 days 0 hours 5 mins 1.000000000 secs +5 mins 1.000000000 secs query ? SELECT interval '1 hour' ---- -0 years 0 mons 0 days 1 hours 0 mins 0.000000000 secs +1 hours query ? SELECT interval '5 hour' ---- -0 years 0 mons 0 days 5 hours 0 mins 0.000000000 secs +5 hours query ? SELECT interval '1 day' ---- -0 years 0 mons 1 days 0 hours 0 mins 0.000000000 secs +1 days query ? SELECT interval '1 week' ---- -0 years 0 mons 7 days 0 hours 0 mins 0.000000000 secs +7 days query ? SELECT interval '2 weeks' ---- -0 years 0 mons 14 days 0 hours 0 mins 0.000000000 secs +14 days query ? SELECT interval '1 day 1' ---- -0 years 0 mons 1 days 0 hours 0 mins 1.000000000 secs +1 days 1.000000000 secs query ? SELECT interval '0.5' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.500000000 secs +0.500000000 secs query ? SELECT interval '0.5 day 1' ---- -0 years 0 mons 0 days 12 hours 0 mins 1.000000000 secs +12 hours 1.000000000 secs query ? SELECT interval '0.49 day' ---- -0 years 0 mons 0 days 11 hours 45 mins 36.000000000 secs +11 hours 45 mins 36.000000000 secs query ? SELECT interval '0.499 day' ---- -0 years 0 mons 0 days 11 hours 58 mins 33.600000000 secs +11 hours 58 mins 33.600000000 secs query ? SELECT interval '0.4999 day' ---- -0 years 0 mons 0 days 11 hours 59 mins 51.360000000 secs +11 hours 59 mins 51.360000000 secs query ? SELECT interval '0.49999 day' ---- -0 years 0 mons 0 days 11 hours 59 mins 59.136000000 secs +11 hours 59 mins 59.136000000 secs query ? SELECT interval '0.49999999999 day' ---- -0 years 0 mons 0 days 11 hours 59 mins 59.999999136 secs +11 hours 59 mins 59.999999136 secs query ? SELECT interval '5 day' ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000000 secs +5 days # Hour is ignored, this matches PostgreSQL query ? SELECT interval '5 day' hour ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000000 secs +5 days query ? SELECT interval '5 day 4 hours 3 minutes 2 seconds 100 milliseconds' ---- -0 years 0 mons 5 days 4 hours 3 mins 2.100000000 secs +5 days 4 hours 3 mins 2.100000000 secs query ? SELECT interval '0.5 month' ---- -0 years 0 mons 15 days 0 hours 0 mins 0.000000000 secs +15 days query ? SELECT interval '0.5' month ---- -0 years 0 mons 15 days 0 hours 0 mins 0.000000000 secs +15 days query ? SELECT interval '1 month' ---- -0 years 1 mons 0 days 0 hours 0 mins 0.000000000 secs +1 mons query ? SELECT interval '1' MONTH ---- -0 years 1 mons 0 days 0 hours 0 mins 0.000000000 secs +1 mons query ? 
SELECT interval '5 month' ---- -0 years 5 mons 0 days 0 hours 0 mins 0.000000000 secs +5 mons query ? SELECT interval '13 month' ---- -0 years 13 mons 0 days 0 hours 0 mins 0.000000000 secs +13 mons query ? SELECT interval '0.5 year' ---- -0 years 6 mons 0 days 0 hours 0 mins 0.000000000 secs +6 mons query ? SELECT interval '1 year' ---- -0 years 12 mons 0 days 0 hours 0 mins 0.000000000 secs +12 mons query ? SELECT interval '1 decade' ---- -0 years 120 mons 0 days 0 hours 0 mins 0.000000000 secs +120 mons query ? SELECT interval '2 decades' ---- -0 years 240 mons 0 days 0 hours 0 mins 0.000000000 secs +240 mons query ? SELECT interval '1 century' ---- -0 years 1200 mons 0 days 0 hours 0 mins 0.000000000 secs +1200 mons query ? SELECT interval '2 year' ---- -0 years 24 mons 0 days 0 hours 0 mins 0.000000000 secs +24 mons query ? SELECT interval '1 year 1 day' ---- -0 years 12 mons 1 days 0 hours 0 mins 0.000000000 secs +12 mons 1 days query ? SELECT interval '1 year 1 day 1 hour' ---- -0 years 12 mons 1 days 1 hours 0 mins 0.000000000 secs +12 mons 1 days 1 hours query ? SELECT interval '1 year 1 day 1 hour 1 minute' ---- -0 years 12 mons 1 days 1 hours 1 mins 0.000000000 secs +12 mons 1 days 1 hours 1 mins query ? SELECT interval '1 year 1 day 1 hour 1 minute 1 second' ---- -0 years 12 mons 1 days 1 hours 1 mins 1.000000000 secs +12 mons 1 days 1 hours 1 mins 1.000000000 secs query I SELECT ascii('') diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 0cbbbf3c608c..75ee29b02b95 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -201,7 +201,7 @@ datafusion.execution.parquet.metadata_size_hint NULL datafusion.execution.parquet.pruning true datafusion.execution.parquet.pushdown_filters false datafusion.execution.parquet.reorder_filters false -datafusion.execution.parquet.schema_force_string_view false +datafusion.execution.parquet.schema_force_string_view true datafusion.execution.parquet.skip_metadata true datafusion.execution.parquet.statistics_enabled page datafusion.execution.parquet.write_batch_size 1024 @@ -290,7 +290,7 @@ datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query -datafusion.execution.parquet.schema_force_string_view false (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. +datafusion.execution.parquet.schema_force_string_view true (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. 
datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes diff --git a/datafusion/sqllogictest/test_files/interval.slt b/datafusion/sqllogictest/test_files/interval.slt index afb262cf95a5..077f38d5d5bb 100644 --- a/datafusion/sqllogictest/test_files/interval.slt +++ b/datafusion/sqllogictest/test_files/interval.slt @@ -45,250 +45,250 @@ Interval(MonthDayNano) Interval(MonthDayNano) query ? select interval '5' years ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs # check all different kinds of intervals query ? select interval '5' year ---- -0 years 60 mons 0 days 0 hours 0 mins 0.000000000 secs +60 mons query ? select interval '5' month ---- -0 years 5 mons 0 days 0 hours 0 mins 0.000000000 secs +5 mons query ? select interval '5' months ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? select interval '5' week ---- -0 years 0 mons 35 days 0 hours 0 mins 0.000000000 secs +35 days query ? select interval '5' day ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000000 secs +5 days query ? select interval '5' hour ---- -0 years 0 mons 0 days 5 hours 0 mins 0.000000000 secs +5 hours ## This seems wrong (5 mons) query ? select interval '5' hours ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? select interval '5' minute ---- -0 years 0 mons 0 days 0 hours 5 mins 0.000000000 secs +5 mins query ? select interval '5' second ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? select interval '5' millisecond ---- -0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs +0.005000000 secs query ? select interval '5' milliseconds ---- -0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs +0.005000000 secs query ? select interval '5' microsecond ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs +0.000005000 secs query ? select interval '5' microseconds ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs +0.000005000 secs query ? select interval '5' nanosecond ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000000005 secs +0.000000005 secs query ? select interval '5' nanoseconds ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000000005 secs +0.000000005 secs query ? select interval '5 YEAR' ---- -0 years 60 mons 0 days 0 hours 0 mins 0.000000000 secs +60 mons query ? select interval '5 MONTH' ---- -0 years 5 mons 0 days 0 hours 0 mins 0.000000000 secs +5 mons query ? select interval '5 WEEK' ---- -0 years 0 mons 35 days 0 hours 0 mins 0.000000000 secs +35 days query ? select interval '5 DAY' ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000000 secs +5 days query ? select interval '5 HOUR' ---- -0 years 0 mons 0 days 5 hours 0 mins 0.000000000 secs +5 hours query ? select interval '5 HOURS' ---- -0 years 0 mons 0 days 5 hours 0 mins 0.000000000 secs +5 hours query ? select interval '5 MINUTE' ---- -0 years 0 mons 0 days 0 hours 5 mins 0.000000000 secs +5 mins query ? 
select interval '5 SECOND' ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? select interval '5 SECONDS' ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? select interval '5 MILLISECOND' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs +0.005000000 secs query ? select interval '5 MILLISECONDS' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs +0.005000000 secs query ? select interval '5 MICROSECOND' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs +0.000005000 secs query ? select interval '5 MICROSECONDS' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs +0.000005000 secs query ? select interval '5 NANOSECOND' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000000005 secs +0.000000005 secs query ? select interval '5 NANOSECONDS' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000000005 secs +0.000000005 secs query ? select interval '5 YEAR 5 MONTH 5 DAY 5 HOUR 5 MINUTE 5 SECOND 5 MILLISECOND 5 MICROSECOND 5 NANOSECOND' ---- -0 years 65 mons 5 days 5 hours 5 mins 5.005005005 secs +65 mons 5 days 5 hours 5 mins 5.005005005 secs # Interval with string literal addition query ? select interval '1 month' + '1 month' ---- -0 years 2 mons 0 days 0 hours 0 mins 0.000000000 secs +2 mons # Interval with string literal addition and leading field query ? select interval '1' + '1' month ---- -0 years 2 mons 0 days 0 hours 0 mins 0.000000000 secs +2 mons # Interval with nested string literal addition query ? select interval '1 month' + '1 month' + '1 month' ---- -0 years 3 mons 0 days 0 hours 0 mins 0.000000000 secs +3 mons # Interval with nested string literal addition and leading field query ? select interval '1' + '1' + '1' month ---- -0 years 3 mons 0 days 0 hours 0 mins 0.000000000 secs +3 mons # Interval mega nested string literal addition query ? select interval '1 year' + '1 month' + '1 day' + '1 hour' + '1 minute' + '1 second' + '1 millisecond' + '1 microsecond' + '1 nanosecond' ---- -0 years 13 mons 1 days 1 hours 1 mins 1.001001001 secs +13 mons 1 days 1 hours 1 mins 1.001001001 secs # Interval with string literal subtraction query ? select interval '1 month' - '1 day'; ---- -0 years 1 mons -1 days 0 hours 0 mins 0.000000000 secs +1 mons -1 days # Interval with string literal subtraction and leading field query ? select interval '5' - '1' - '2' year; ---- -0 years 24 mons 0 days 0 hours 0 mins 0.000000000 secs +24 mons # Interval with nested string literal subtraction query ? select interval '1 month' - '1 day' - '1 hour'; ---- -0 years 1 mons -1 days -1 hours 0 mins 0.000000000 secs +1 mons -1 days -1 hours # Interval with nested string literal subtraction and leading field query ? select interval '10' - '1' - '1' month; ---- -0 years 8 mons 0 days 0 hours 0 mins 0.000000000 secs +8 mons # Interval mega nested string literal subtraction query ? select interval '1 year' - '1 month' - '1 day' - '1 hour' - '1 minute' - '1 second' - '1 millisecond' - '1 microsecond' - '1 nanosecond' ---- -0 years 11 mons -1 days -1 hours -1 mins -1.001001001 secs +11 mons -1 days -1 hours -1 mins -1.001001001 secs # Interval with string literal negation and leading field query ? select -interval '5' - '1' - '2' year; ---- -0 years -96 mons 0 days 0 hours 0 mins 0.000000000 secs +-96 mons # Interval with nested string literal negation query ? 
select -interval '1 month' + '1 day' + '1 hour'; ---- -0 years -1 mons 1 days 1 hours 0 mins 0.000000000 secs +-1 mons 1 days 1 hours # Interval with nested string literal negation and leading field query ? select -interval '10' - '1' - '1' month; ---- -0 years -12 mons 0 days 0 hours 0 mins 0.000000000 secs +-12 mons # Interval mega nested string literal negation query ? select -interval '1 year' - '1 month' - '1 day' - '1 hour' - '1 minute' - '1 second' - '1 millisecond' - '1 microsecond' - '1 nanosecond' ---- -0 years -13 mons -1 days -1 hours -1 mins -1.001001001 secs +-13 mons -1 days -1 hours -1 mins -1.001001001 secs # Interval string literal + date query D @@ -343,7 +343,7 @@ select arrow_typeof(i) from t; ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000003 secs Interval(MonthDayNano) +5 days 0.000000003 secs Interval(MonthDayNano) statement ok @@ -359,8 +359,8 @@ insert into t values ('6 days 7 nanoseconds'::interval) query ? rowsort select -i from t order by 1; ---- -0 years 0 mons -5 days 0 hours 0 mins -0.000000003 secs -0 years 0 mons -6 days 0 hours 0 mins -0.000000007 secs +-5 days -0.000000003 secs +-6 days -0.000000007 secs query ?T rowsort select @@ -368,8 +368,8 @@ select arrow_typeof(i) from t; ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000003 secs Interval(MonthDayNano) -0 years 0 mons 6 days 0 hours 0 mins 0.000000007 secs Interval(MonthDayNano) +5 days 0.000000003 secs Interval(MonthDayNano) +6 days 0.000000007 secs Interval(MonthDayNano) statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt index eb350c22bb5d..ab23f9291662 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -26,7 +26,7 @@ describe data; ---- ints Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO strings Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO -timestamp Utf8 NO +timestamp Utf8View NO query ??T SELECT * FROM data ORDER by ints['bytes'] DESC LIMIT 10; diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt index 6884d762612d..279491584ea4 100644 --- a/datafusion/sqllogictest/test_files/math.slt +++ b/datafusion/sqllogictest/test_files/math.slt @@ -252,19 +252,19 @@ select abs(c1), abs(c2), abs(c3), abs(c4) from test_nullable_integer where datas NULL NULL NULL NULL # abs: Int8 overlow -statement error DataFusion error: Arrow error: Compute error: Int8Array overflow on abs\(-128\) +statement error DataFusion error: Arrow error: Arithmetic overflow: Int8Array overflow on abs\(-128\) select abs(c1) from test_nullable_integer where dataset = 'mins' # abs: Int16 overlow -statement error DataFusion error: Arrow error: Compute error: Int16Array overflow on abs\(-32768\) +statement error DataFusion error: Arrow error: Arithmetic overflow: Int16Array overflow on abs\(-32768\) select abs(c2) from test_nullable_integer where dataset = 'mins' 
# abs: Int32 overlow -statement error DataFusion error: Arrow error: Compute error: Int32Array overflow on abs\(-2147483648\) +statement error DataFusion error: Arrow error: Arithmetic overflow: Int32Array overflow on abs\(-2147483648\) select abs(c3) from test_nullable_integer where dataset = 'mins' # abs: Int64 overlow -statement error DataFusion error: Arrow error: Compute error: Int64Array overflow on abs\(-9223372036854775808\) +statement error DataFusion error: Arrow error: Arithmetic overflow: Int64Array overflow on abs\(-9223372036854775808\) select abs(c4) from test_nullable_integer where dataset = 'mins' statement ok @@ -620,15 +620,15 @@ select gcd(a, b), gcd(c*d + 1, abs(e)) + f from signed_integers; NULL NULL # gcd(i64::MIN, i64::MIN) -query error DataFusion error: Arrow error: Compute error: Signed integer overflow in GCD\(\-9223372036854775808, \-9223372036854775808\) +query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in GCD\(\-9223372036854775808, \-9223372036854775808\) select gcd(-9223372036854775808, -9223372036854775808); # gcd(i64::MIN, 0) -query error DataFusion error: Arrow error: Compute error: Signed integer overflow in GCD\(\-9223372036854775808, 0\) +query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in GCD\(\-9223372036854775808, 0\) select gcd(-9223372036854775808, 0); # gcd(0, i64::MIN) -query error DataFusion error: Arrow error: Compute error: Signed integer overflow in GCD\(0, \-9223372036854775808\) +query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in GCD\(0, \-9223372036854775808\) select gcd(0, -9223372036854775808); @@ -662,22 +662,22 @@ select lcm(a, b), lcm(c, d), lcm(e, f) from signed_integers; NULL NULL NULL # Result cannot fit in i64 -query error DataFusion error: Arrow error: Compute error: Signed integer overflow in LCM\(\-9223372036854775808, \-9223372036854775808\) +query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in LCM\(\-9223372036854775808, \-9223372036854775808\) select lcm(-9223372036854775808, -9223372036854775808); -query error DataFusion error: Arrow error: Compute error: Signed integer overflow in LCM\(1, \-9223372036854775808\) +query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in LCM\(1, \-9223372036854775808\) select lcm(1, -9223372036854775808); # Overflow on multiplication -query error DataFusion error: Arrow error: Compute error: Signed integer overflow in LCM\(2, 9223372036854775803\) +query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in LCM\(2, 9223372036854775803\) select lcm(2, 9223372036854775803); -query error DataFusion error: Arrow error: Compute error: Overflow happened on: 2107754225 \^ 1221660777 +query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on: 2107754225 \^ 1221660777 select power(2107754225, 1221660777); # factorial overflow -query error DataFusion error: Arrow error: Compute error: Overflow happened on FACTORIAL\(350943270\) +query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on FACTORIAL\(350943270\) select FACTORIAL(350943270); statement ok diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 6b9cb521f5f8..4c86312f9e51 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt 
@@ -61,7 +61,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..104], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:104..208], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:208..312], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:312..414]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] # disable round robin repartitioning statement ok @@ -77,7 +77,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..104], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:104..208], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:208..312], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:312..414]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] # enable round robin repartitioning again statement ok @@ -102,7 +102,7 @@ physical_plan 02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=8192 04)------FilterExec: column1@0 != 42 -05)--------ParquetExec: file_groups={4 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..205], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:205..405, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..5], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:5..210], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:210..414]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +05)--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..172], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:172..338, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..178], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:178..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] ## Read the files as though they are ordered @@ -138,7 +138,7 @@ physical_plan 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: column1@0 != 42 -04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..202], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..207], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:207..414], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:202..405]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..169], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..173], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:173..347], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:169..338]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] # Cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index fb0fd8397f2d..4b11e338da70 100644 --- 
a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -1509,19 +1509,19 @@ SELECT val, ts1 - ts2 FROM foo ORDER BY ts2 - ts1; query ? SELECT i1 - i2 FROM bar; ---- -0 years 0 mons -1 days 0 hours 0 mins 0.000000000 secs -0 years 2 mons -13 days 0 hours 0 mins 0.000000000 secs -0 years 0 mons 1 days 2 hours 56 mins 0.000000000 secs -0 years 0 mons 1 days 0 hours 0 mins -3.999999993 secs +-1 days +2 mons -13 days +1 days 2 hours 56 mins +1 days -3.999999993 secs # Interval + Interval query ? SELECT i1 + i2 FROM bar; ---- -0 years 0 mons 3 days 0 hours 0 mins 0.000000000 secs -0 years 2 mons 13 days 0 hours 0 mins 0.000000000 secs -0 years 0 mons 1 days 3 hours 4 mins 0.000000000 secs -0 years 0 mons 1 days 0 hours 0 mins 4.000000007 secs +3 days +2 mons 13 days +1 days 3 hours 4 mins +1 days 4.000000007 secs # Timestamp - Interval query P From af8204ae62173c51076a9b302296878e9b7304a8 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Wed, 7 Aug 2024 02:14:43 -0400 Subject: [PATCH 04/11] support string view in stats --- .../src/datasource/file_format/parquet.rs | 112 ++++++++++++++---- .../physical_plan/parquet/row_group_filter.rs | 2 + datafusion/core/tests/parquet/page_pruning.rs | 13 +- 3 files changed, 103 insertions(+), 24 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index f233f3842c8c..55295795e24f 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -332,7 +332,7 @@ impl FileFormat for ParquetFormat { async fn infer_stats( &self, - _state: &SessionState, + state: &SessionState, store: &Arc, table_schema: SchemaRef, object: &ObjectMeta, @@ -342,6 +342,11 @@ impl FileFormat for ParquetFormat { table_schema, object, self.metadata_size_hint(), + state + .config_options() + .execution + .parquet + .schema_force_string_view, ) .await?; Ok(stats) @@ -481,9 +486,10 @@ async fn fetch_statistics( table_schema: SchemaRef, file: &ObjectMeta, metadata_size_hint: Option, + force_string_view: bool, ) -> Result { let metadata = fetch_parquet_metadata(store, file, metadata_size_hint).await?; - statistics_from_parquet_meta_calc(&metadata, table_schema) + statistics_from_parquet_meta_calc(&metadata, table_schema, force_string_view) } /// Convert statistics in [`ParquetMetaData`] into [`Statistics`] using ['StatisticsConverter`] @@ -493,6 +499,7 @@ async fn fetch_statistics( pub fn statistics_from_parquet_meta_calc( metadata: &ParquetMetaData, table_schema: SchemaRef, + force_string_view: bool, ) -> Result { let row_groups_metadata = metadata.row_groups(); @@ -514,10 +521,13 @@ pub fn statistics_from_parquet_meta_calc( statistics.total_byte_size = Precision::Exact(total_byte_size); let file_metadata = metadata.file_metadata(); - let file_schema = parquet_to_arrow_schema( + let mut file_schema = parquet_to_arrow_schema( file_metadata.schema_descr(), file_metadata.key_value_metadata(), )?; + if force_string_view { + file_schema = transform_schema_to_view(&file_schema); + } statistics.column_statistics = if has_statistics { let (mut max_accs, mut min_accs) = create_max_min_accs(&table_schema); @@ -578,7 +588,7 @@ pub async fn statistics_from_parquet_meta( metadata: &ParquetMetaData, table_schema: SchemaRef, ) -> Result { - statistics_from_parquet_meta_calc(metadata, table_schema) + statistics_from_parquet_meta_calc(metadata, table_schema, false) } fn summarize_min_max_null_counts( @@ 
-1278,8 +1288,20 @@ mod tests { let format = ParquetFormat::default(); let schema = format.infer_schema(&ctx, &store, &meta).await.unwrap(); - let stats = - fetch_statistics(store.as_ref(), schema.clone(), &meta[0], None).await?; + let use_string_view = ctx + .config_options() + .execution + .parquet + .schema_force_string_view; + + let stats = fetch_statistics( + store.as_ref(), + schema.clone(), + &meta[0], + None, + use_string_view, + ) + .await?; assert_eq!(stats.num_rows, Precision::Exact(3)); let c1_stats = &stats.column_statistics[0]; @@ -1287,7 +1309,9 @@ mod tests { assert_eq!(c1_stats.null_count, Precision::Exact(1)); assert_eq!(c2_stats.null_count, Precision::Exact(3)); - let stats = fetch_statistics(store.as_ref(), schema, &meta[1], None).await?; + let stats = + fetch_statistics(store.as_ref(), schema, &meta[1], None, use_string_view) + .await?; assert_eq!(stats.num_rows, Precision::Exact(3)); let c1_stats = &stats.column_statistics[0]; let c2_stats = &stats.column_statistics[1]; @@ -1460,15 +1484,25 @@ mod tests { let session = SessionContext::new(); let ctx = session.state(); + let use_string_view = ctx + .config_options() + .execution + .parquet + .schema_force_string_view; let format = ParquetFormat::default().with_metadata_size_hint(Some(9)); let schema = format .infer_schema(&ctx, &store.upcast(), &meta) .await .unwrap(); - let stats = - fetch_statistics(store.upcast().as_ref(), schema.clone(), &meta[0], Some(9)) - .await?; + let stats = fetch_statistics( + store.upcast().as_ref(), + schema.clone(), + &meta[0], + Some(9), + use_string_view, + ) + .await?; assert_eq!(stats.num_rows, Precision::Exact(3)); let c1_stats = &stats.column_statistics[0]; @@ -1500,6 +1534,7 @@ mod tests { schema.clone(), &meta[0], Some(size_hint), + use_string_view, ) .await?; @@ -1548,7 +1583,15 @@ mod tests { // Fetch statistics for first file let pq_meta = fetch_parquet_metadata(store.as_ref(), &files[0], None).await?; - let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?; + let stats = statistics_from_parquet_meta_calc( + &pq_meta, + schema.clone(), + state + .config_options() + .execution + .parquet + .schema_force_string_view, + )?; assert_eq!(stats.num_rows, Precision::Exact(4)); // column c_dic @@ -1590,25 +1633,49 @@ mod tests { let format = ParquetFormat::default(); let schema = format.infer_schema(&state, &store, &files).await.unwrap(); + let use_string_view = state + .config_options() + .execution + .parquet + .schema_force_string_view; + let null_i64 = ScalarValue::Int64(None); - let null_utf8 = ScalarValue::Utf8(None); + let null_utf8 = if use_string_view { + ScalarValue::Utf8View(None) + } else { + ScalarValue::Utf8(None) + }; // Fetch statistics for first file + let pq_meta = fetch_parquet_metadata(store.as_ref(), &files[0], None).await?; - let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?; + let stats = + statistics_from_parquet_meta_calc(&pq_meta, schema.clone(), use_string_view)?; // assert_eq!(stats.num_rows, Precision::Exact(3)); // column c1 let c1_stats = &stats.column_statistics[0]; assert_eq!(c1_stats.null_count, Precision::Exact(1)); - assert_eq!( - c1_stats.max_value, - Precision::Exact(ScalarValue::Utf8(Some("bar".to_string()))) - ); - assert_eq!( - c1_stats.min_value, - Precision::Exact(ScalarValue::Utf8(Some("Foo".to_string()))) - ); + if use_string_view { + assert_eq!( + c1_stats.max_value, + Precision::Exact(ScalarValue::Utf8View(Some("bar".to_string()))) + ); + assert_eq!( + c1_stats.min_value, + 
Precision::Exact(ScalarValue::Utf8View(Some("Foo".to_string()))) + ); + } else { + assert_eq!( + c1_stats.max_value, + Precision::Exact(ScalarValue::Utf8(Some("bar".to_string()))) + ); + assert_eq!( + c1_stats.min_value, + Precision::Exact(ScalarValue::Utf8(Some("Foo".to_string()))) + ); + } + // column c2: missing from the file so the table treats all 3 rows as null let c2_stats = &stats.column_statistics[1]; assert_eq!(c2_stats.null_count, Precision::Exact(3)); @@ -1617,7 +1684,8 @@ mod tests { // Fetch statistics for second file let pq_meta = fetch_parquet_metadata(store.as_ref(), &files[1], None).await?; - let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?; + let stats = + statistics_from_parquet_meta_calc(&pq_meta, schema.clone(), use_string_view)?; assert_eq!(stats.num_rows, Precision::Exact(3)); // column c1: missing from the file so the table treats all 3 rows as null let c1_stats = &stats.column_statistics[0]; diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs index 6a6910748fc8..249a1c52b312 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs @@ -265,7 +265,9 @@ impl PruningStatistics for BloomFilterStatistics { .map(|value| { match value { ScalarValue::Utf8(Some(v)) => sbbf.check(&v.as_str()), + ScalarValue::Utf8View(Some(v)) => sbbf.check(&v.as_str()), ScalarValue::Binary(Some(v)) => sbbf.check(v), + ScalarValue::BinaryView(Some(v)) => sbbf.check(v), ScalarValue::FixedSizeBinary(_size, Some(v)) => sbbf.check(v), ScalarValue::Boolean(Some(v)) => sbbf.check(v), ScalarValue::Float64(Some(v)) => sbbf.check(v), diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index 15efd4bcd9dd..d7813168889b 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -29,7 +29,7 @@ use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; use datafusion_common::{ScalarValue, ToDFSchema}; use datafusion_expr::execution_props::ExecutionProps; -use datafusion_expr::{col, lit, Expr}; +use datafusion_expr::{cast, col, lit, Expr}; use datafusion_physical_expr::create_physical_expr; use futures::StreamExt; @@ -150,7 +150,16 @@ async fn page_index_filter_one_col() { let task_ctx = session_ctx.task_ctx(); // 5.create filter date_string_col == 1; - let filter = col("date_string_col").eq(lit("01/01/09")); + let force_string_view = state + .config_options() + .execution + .parquet + .schema_force_string_view; + let filter = if force_string_view { + col("date_string_col").eq(cast(lit("01/01/09"), arrow_schema::DataType::Utf8View)) + } else { + col("date_string_col").eq(lit("01/01/09")) + }; let parquet_exec = get_parquet_exec(&state, filter).await; let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap(); let batch = results.next().await.unwrap().unwrap(); From c83fee9351990feb03094ded03d14a9da706d06f Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Wed, 7 Aug 2024 04:11:04 -0400 Subject: [PATCH 05/11] update tests --- Cargo.toml | 17 ++++++++++++-- benchmarks/Cargo.toml | 14 ++++++++++++ datafusion/common/Cargo.toml | 2 +- datafusion/common/src/config.rs | 2 +- datafusion/common/src/scalar/mod.rs | 6 ++--- .../src/datasource/file_format/parquet.rs | 10 ++++----- .../physical_plan/parquet/page_filter.rs | 2 +- 
.../functions/src/regex/regexpreplace.rs | 2 +- .../src/binary_view_map.rs | 2 +- .../physical-plan/src/coalesce_batches.rs | 5 +++-- datafusion/sql/src/unparser/expr.rs | 18 +++++++-------- .../engines/datafusion_engine/normalize.rs | 4 +++- .../sqllogictest/test_files/arrow_typeof.slt | 2 +- datafusion/sqllogictest/test_files/math.slt | 22 +++++++++---------- 14 files changed, 69 insertions(+), 39 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3431c4673e0c..5b9906f326cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ # under the License. [workspace] -exclude = ["datafusion-cli", "dev/depcheck"] +exclude = ["datafusion-cli", "dev/depcheck", "datafusion-examples"] members = [ "datafusion/common", "datafusion/common-runtime", @@ -40,7 +40,6 @@ members = [ "datafusion/sqllogictest", "datafusion/substrait", "datafusion/wasmtest", - "datafusion-examples", "docs", "test-utils", "benchmarks", @@ -158,3 +157,17 @@ large_futures = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] } unused_imports = "deny" + +[patch.crates-io] +arrow = { git = "https://github.com/apache/arrow-rs.git" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git" } +parquet = { git = "https://github.com/apache/arrow-rs.git" } diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 7f29f7471b6f..09e074bfae24 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -53,3 +53,17 @@ tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } [dev-dependencies] datafusion-proto = { workspace = true } + +[patch.crates-io] +arrow = { git = "https://github.com/apache/arrow-rs.git" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git" } +parquet = { git = "https://github.com/apache/arrow-rs.git" } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 85dfb2e8f73a..8de6dbdece5f 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -60,7 +60,7 @@ libc = "0.2.140" num_cpus = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } -pyo3 = { version = "0.21.0", optional = true } +pyo3 = { version = "0.22.0", optional = true } sqlparser = { workspace = true } [target.'cfg(target_family = "wasm")'.dependencies] diff 
--git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index b5204b343f05..3f4ea2b26441 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -481,7 +481,7 @@ config_namespace! { /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, /// and `Binary/BinaryLarge` with `BinaryView`. - pub schema_force_string_view: bool, default = false + pub schema_force_string_view: bool, default = true } } diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 286df339adcf..606c4e8bd395 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -4329,7 +4329,7 @@ mod tests { .strip_backtrace(); assert_eq!( err, - "Arrow error: Compute error: Overflow happened on: 2147483647 - -2147483648" + "Arrow error: Arithmetic overflow: Overflow happened on: 2147483647 - -2147483648" ) } @@ -4350,7 +4350,7 @@ mod tests { .sub_checked(&int_value_2) .unwrap_err() .strip_backtrace(); - assert_eq!(err, "Arrow error: Compute error: Overflow happened on: 9223372036854775807 - -9223372036854775808") + assert_eq!(err, "Arrow error: Arithmetic overflow: Overflow happened on: 9223372036854775807 - -9223372036854775808") } #[test] @@ -5866,7 +5866,7 @@ mod tests { let root_err = err.find_root(); match root_err{ DataFusionError::ArrowError( - ArrowError::ComputeError(_), + ArrowError::ArithmeticOverflow(_), _, ) => {} _ => return Err(err), diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 55295795e24f..d6c334469945 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -1248,7 +1248,7 @@ mod tests { use arrow_schema::{DataType, Field}; use async_trait::async_trait; use datafusion_common::cast::{ - as_binary_array, as_boolean_array, as_float32_array, as_float64_array, + as_binary_view_array, as_boolean_array, as_float32_array, as_float64_array, as_int32_array, as_timestamp_nanosecond_array, }; use datafusion_common::config::ParquetOptions; @@ -1799,8 +1799,8 @@ mod tests { bigint_col: Int64\n\ float_col: Float32\n\ double_col: Float64\n\ - date_string_col: Binary\n\ - string_col: Binary\n\ + date_string_col: BinaryView\n\ + string_col: BinaryView\n\ timestamp_col: Timestamp(Nanosecond, None)", y ); @@ -1956,7 +1956,7 @@ mod tests { assert_eq!(1, batches[0].num_columns()); assert_eq!(8, batches[0].num_rows()); - let array = as_binary_array(batches[0].column(0))?; + let array = as_binary_view_array(batches[0].column(0))?; let mut values: Vec<&str> = vec![]; for i in 0..batches[0].num_rows() { values.push(std::str::from_utf8(array.value(i)).unwrap()); @@ -2070,7 +2070,7 @@ mod tests { let int_col_offset = offset_index.get(4).unwrap(); // 325 pages in int_col - assert_eq!(int_col_offset.len(), 325); + assert_eq!(int_col_offset.page_locations().len(), 325); match int_col_index { Index::INT32(index) => { assert_eq!(index.indexes.len(), 325); diff --git a/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs index e4d26a460ecd..db3ee8c2a51d 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs @@ -406,7 +406,7 @@ impl<'a> PagesPruningStatistics<'a> { converter, column_index, offset_index, - page_offsets, + page_offsets: 
&page_offsets.page_locations, }) } diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index d28c6cd36d65..28068d06b518 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -402,7 +402,7 @@ fn _regexp_replace_static_pattern_replace( let string_view_array = as_string_view_array(&args[0])?; let mut builder = StringViewBuilder::with_capacity(string_view_array.len()) - .with_block_size(1024 * 1024 * 2); + .with_fixed_block_size(1024 * 1024 * 2); for val in string_view_array.iter() { if let Some(val) = val { diff --git a/datafusion/physical-expr-common/src/binary_view_map.rs b/datafusion/physical-expr-common/src/binary_view_map.rs index 18bc6801aa60..e2fb025afbba 100644 --- a/datafusion/physical-expr-common/src/binary_view_map.rs +++ b/datafusion/physical-expr-common/src/binary_view_map.rs @@ -149,7 +149,7 @@ where output_type, map: hashbrown::raw::RawTable::with_capacity(INITIAL_MAP_CAPACITY), map_size: 0, - builder: GenericByteViewBuilder::new().with_block_size(2 * 1024 * 1024), + builder: GenericByteViewBuilder::new().with_fixed_block_size(2 * 1024 * 1024), random_state: RandomState::new(), hashes_buffer: vec![], null: None, diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index de42a55ad350..4013d25b5f71 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -494,7 +494,7 @@ fn gc_string_view_batch(batch: &RecordBatch) -> RecordBatch { // See https://github.com/apache/arrow-rs/issues/6094 for more details. let mut builder = StringViewBuilder::with_capacity(s.len()); if ideal_buffer_size > 0 { - builder = builder.with_block_size(ideal_buffer_size as u32); + builder = builder.with_fixed_block_size(ideal_buffer_size as u32); } for v in s.iter() { @@ -804,7 +804,8 @@ mod tests { impl StringViewTest { /// Create a `StringViewArray` with the parameters specified in this struct fn build(self) -> StringViewArray { - let mut builder = StringViewBuilder::with_capacity(100).with_block_size(8192); + let mut builder = + StringViewBuilder::with_capacity(100).with_fixed_block_size(8192); loop { for &v in self.strings.iter() { builder.append_option(v); diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index de130754ab1a..f1ff30239027 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -2077,49 +2077,49 @@ mod tests { "1 YEAR 1 MONTH 1 DAY 3 HOUR 10 MINUTE 20 SECOND", ), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS 13 MONS 1 DAYS 3 HOURS 10 MINS 20.000000000 SECS'"#, + r#"INTERVAL '13 MONS 1 DAYS 3 HOURS 10 MINS 20.000000000 SECS'"#, ), ( interval_month_day_nano_lit("1.5 MONTH"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS 1 MONS 15 DAYS 0 HOURS 0 MINS 0.000000000 SECS'"#, + r#"INTERVAL '1 MONS 15 DAYS'"#, ), ( interval_month_day_nano_lit("-3 MONTH"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS -3 MONS 0 DAYS 0 HOURS 0 MINS 0.000000000 SECS'"#, + r#"INTERVAL '-3 MONS'"#, ), ( interval_month_day_nano_lit("1 MONTH") .add(interval_month_day_nano_lit("1 DAY")), IntervalStyle::PostgresVerbose, - r#"(INTERVAL '0 YEARS 1 MONS 0 DAYS 0 HOURS 0 MINS 0.000000000 SECS' + INTERVAL '0 YEARS 0 MONS 1 DAYS 0 HOURS 0 MINS 0.000000000 SECS')"#, + r#"(INTERVAL '1 MONS' + INTERVAL '1 DAYS')"#, ), ( interval_month_day_nano_lit("1 MONTH") 
.sub(interval_month_day_nano_lit("1 DAY")), IntervalStyle::PostgresVerbose, - r#"(INTERVAL '0 YEARS 1 MONS 0 DAYS 0 HOURS 0 MINS 0.000000000 SECS' - INTERVAL '0 YEARS 0 MONS 1 DAYS 0 HOURS 0 MINS 0.000000000 SECS')"#, + r#"(INTERVAL '1 MONS' - INTERVAL '1 DAYS')"#, ), ( interval_datetime_lit("10 DAY 1 HOUR 10 MINUTE 20 SECOND"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS 0 MONS 10 DAYS 1 HOURS 10 MINS 20.000 SECS'"#, + r#"INTERVAL '10 DAYS 1 HOURS 10 MINS 20.000 SECS'"#, ), ( interval_datetime_lit("10 DAY 1.5 HOUR 10 MINUTE 20 SECOND"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS 0 MONS 10 DAYS 1 HOURS 40 MINS 20.000 SECS'"#, + r#"INTERVAL '10 DAYS 1 HOURS 40 MINS 20.000 SECS'"#, ), ( interval_year_month_lit("1 YEAR 1 MONTH"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '1 YEARS 1 MONS 0 DAYS 0 HOURS 0 MINS 0.00 SECS'"#, + r#"INTERVAL '1 YEARS 1 MONS'"#, ), ( interval_year_month_lit("1.5 YEAR 1 MONTH"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '1 YEARS 7 MONS 0 DAYS 0 HOURS 0 MINS 0.00 SECS'"#, + r#"INTERVAL '1 YEARS 7 MONS'"#, ), ( interval_year_month_lit("1 YEAR 1 MONTH"), diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs index 66ffeadf8cec..f0f117d7658b 100644 --- a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs +++ b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs @@ -267,7 +267,9 @@ pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec { | DataType::Float64 | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => DFColumnType::Float, - DataType::Utf8 | DataType::LargeUtf8 => DFColumnType::Text, + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { + DFColumnType::Text + } DataType::Date32 | DataType::Date64 | DataType::Time32(_) diff --git a/datafusion/sqllogictest/test_files/arrow_typeof.slt b/datafusion/sqllogictest/test_files/arrow_typeof.slt index 73183b60675a..4e3e84246872 100644 --- a/datafusion/sqllogictest/test_files/arrow_typeof.slt +++ b/datafusion/sqllogictest/test_files/arrow_typeof.slt @@ -424,7 +424,7 @@ select arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)'); [1, 2, 3] # Tests for Utf8View -query ?T +query TT select arrow_cast('MyAwesomeString', 'Utf8View'), arrow_typeof(arrow_cast('MyAwesomeString', 'Utf8View')) ---- MyAwesomeString Utf8View diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt index 279491584ea4..eece56942317 100644 --- a/datafusion/sqllogictest/test_files/math.slt +++ b/datafusion/sqllogictest/test_files/math.slt @@ -252,19 +252,19 @@ select abs(c1), abs(c2), abs(c3), abs(c4) from test_nullable_integer where datas NULL NULL NULL NULL # abs: Int8 overlow -statement error DataFusion error: Arrow error: Arithmetic overflow: Int8Array overflow on abs\(-128\) +statement error DataFusion error: Arrow error: Compute error: Int8Array overflow on abs\(-128\) select abs(c1) from test_nullable_integer where dataset = 'mins' # abs: Int16 overlow -statement error DataFusion error: Arrow error: Arithmetic overflow: Int16Array overflow on abs\(-32768\) +statement error DataFusion error: Arrow error: Compute error: Int16Array overflow on abs\(-32768\) select abs(c2) from test_nullable_integer where dataset = 'mins' # abs: Int32 overlow -statement error DataFusion error: Arrow error: Arithmetic overflow: Int32Array overflow on abs\(-2147483648\) +statement error DataFusion error: Arrow error: Compute error: Int32Array overflow on 
abs\(-2147483648\) select abs(c3) from test_nullable_integer where dataset = 'mins' # abs: Int64 overlow -statement error DataFusion error: Arrow error: Arithmetic overflow: Int64Array overflow on abs\(-9223372036854775808\) +statement error DataFusion error: Arrow error: Compute error: Int64Array overflow on abs\(-9223372036854775808\) select abs(c4) from test_nullable_integer where dataset = 'mins' statement ok @@ -620,15 +620,15 @@ select gcd(a, b), gcd(c*d + 1, abs(e)) + f from signed_integers; NULL NULL # gcd(i64::MIN, i64::MIN) -query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in GCD\(\-9223372036854775808, \-9223372036854775808\) +query error DataFusion error: Arrow error: Compute error: Signed integer overflow in GCD\(\-9223372036854775808, \-9223372036854775808\) select gcd(-9223372036854775808, -9223372036854775808); # gcd(i64::MIN, 0) -query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in GCD\(\-9223372036854775808, 0\) +query error DataFusion error: Arrow error: Compute error: Signed integer overflow in GCD\(\-9223372036854775808, 0\) select gcd(-9223372036854775808, 0); # gcd(0, i64::MIN) -query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in GCD\(0, \-9223372036854775808\) +query error DataFusion error: Arrow error: Compute error: Signed integer overflow in GCD\(0, \-9223372036854775808\) select gcd(0, -9223372036854775808); @@ -662,14 +662,14 @@ select lcm(a, b), lcm(c, d), lcm(e, f) from signed_integers; NULL NULL NULL # Result cannot fit in i64 -query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in LCM\(\-9223372036854775808, \-9223372036854775808\) +query error DataFusion error: Arrow error: Compute error: Signed integer overflow in LCM\(\-9223372036854775808, \-9223372036854775808\) select lcm(-9223372036854775808, -9223372036854775808); -query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in LCM\(1, \-9223372036854775808\) +query error DataFusion error: Arrow error: Compute error: Signed integer overflow in LCM\(1, \-9223372036854775808\) select lcm(1, -9223372036854775808); # Overflow on multiplication -query error DataFusion error: Arrow error: Arithmetic overflow:Signed integer overflow in LCM\(2, 9223372036854775803\) +query error DataFusion error: Arrow error: Compute error: Signed integer overflow in LCM\(2, 9223372036854775803\) select lcm(2, 9223372036854775803); @@ -677,7 +677,7 @@ query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happene select power(2107754225, 1221660777); # factorial overflow -query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on FACTORIAL\(350943270\) +query error DataFusion error: Arrow error: Compute error: Overflow happened on FACTORIAL\(350943270\) select FACTORIAL(350943270); statement ok From 57feefd6b05b87cce00c59edf814bb2917868de4 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 9 Aug 2024 04:45:21 -0400 Subject: [PATCH 06/11] allow Utf8View in subtrait --- datafusion/substrait/src/physical_plan/producer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/substrait/src/physical_plan/producer.rs b/datafusion/substrait/src/physical_plan/producer.rs index 57fe68c4a780..1879646bc3db 100644 --- a/datafusion/substrait/src/physical_plan/producer.rs +++ b/datafusion/substrait/src/physical_plan/producer.rs @@ -153,7 +153,7 @@ fn to_substrait_type(data_type: &DataType, 
nullable: bool) -> Result { nullability, })), }), - DataType::Utf8 => Ok(Type { + DataType::Utf8 | DataType::Utf8View => Ok(Type { kind: Some(Kind::String(SubstraitString { type_variation_reference: 0, nullability, From dc075c362975fa8c4dab9393a67205d09abba0b7 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 9 Aug 2024 04:47:11 -0400 Subject: [PATCH 07/11] fmt --- datafusion/common/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 3f4ea2b26441..9fa131df38e5 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -481,7 +481,7 @@ config_namespace! { /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, /// and `Binary/BinaryLarge` with `BinaryView`. - pub schema_force_string_view: bool, default = true + pub schema_force_string_view: bool, default = true } } From 225b3aa99c1ceff8453e32198d56bf59cff3e878 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 9 Aug 2024 05:06:47 -0400 Subject: [PATCH 08/11] update tests --- datafusion-cli/Cargo.lock | 59 ++++++++++------------- datafusion-cli/Cargo.toml | 14 ++++++ datafusion/sql/tests/cases/plan_to_sql.rs | 4 +- 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 2eb93da7c020..244d36e4ff4a 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -131,8 +131,7 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05048a8932648b63f21c37d88b552ccc8a65afb6dfe9fc9f30ce79174c2e7a85" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "arrow-arith", "arrow-array", @@ -152,8 +151,7 @@ dependencies = [ [[package]] name = "arrow-arith" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8a57966e43bfe9a3277984a14c24ec617ad874e4c0e1d2a1b083a39cfbf22c" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "arrow-array", "arrow-buffer", @@ -167,8 +165,7 @@ dependencies = [ [[package]] name = "arrow-array" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "ahash", "arrow-buffer", @@ -184,8 +181,7 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c975484888fc95ec4a632cdc98be39c085b1bb518531b0c80c5d462063e5daa1" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "bytes", "half", @@ -195,8 +191,7 @@ dependencies = [ [[package]] name = "arrow-cast" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "arrow-array", "arrow-buffer", @@ -216,8 +211,7 @@ dependencies = [ [[package]] name = "arrow-csv" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "c13c36dc5ddf8c128df19bab27898eea64bf9da2b555ec1cd17a8ff57fba9ec2" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "arrow-array", "arrow-buffer", @@ -235,8 +229,7 @@ dependencies = [ [[package]] name = "arrow-data" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "arrow-buffer", "arrow-schema", @@ -247,8 +240,7 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "arrow-array", "arrow-buffer", @@ -262,8 +254,7 @@ dependencies = [ [[package]] name = "arrow-json" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb22284c5a2a01d73cebfd88a33511a3234ab45d66086b2ca2d1228c3498e445" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "arrow-array", "arrow-buffer", @@ -282,8 +273,7 @@ dependencies = [ [[package]] name = "arrow-ord" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42745f86b1ab99ef96d1c0bcf49180848a64fe2c7a7a0d945bc64fa2b21ba9bc" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "arrow-array", "arrow-buffer", @@ -297,8 +287,7 @@ dependencies = [ [[package]] name = "arrow-row" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd09a518c602a55bd406bcc291a967b284cfa7a63edfbf8b897ea4748aad23c" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "ahash", "arrow-array", @@ -311,14 +300,12 @@ dependencies = [ [[package]] name = "arrow-schema" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" [[package]] name = "arrow-select" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "ahash", "arrow-array", @@ -331,8 +318,7 @@ dependencies = [ [[package]] name = "arrow-string" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dc1985b67cb45f6606a248ac2b4a288849f196bab8c657ea5589f47cdd55e6" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "arrow-array", "arrow-buffer", @@ -347,13 +333,14 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "2.0.15" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc65048dd435533bb1baf2ed9956b9a278fbfdcf90301b39ee117f06c0199d37" +checksum = "dc1835b7f27878de8525dc71410b5a31cdcc5f230aed5ba5df968e09c201b23d" dependencies = [ "anstyle", "bstr", "doc-comment", + "libc", "predicates", 
"predicates-core", "predicates-tree", @@ -2621,8 +2608,7 @@ dependencies = [ [[package]] name = "parquet" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e977b9066b4d3b03555c22bdc442f3fadebd96a39111249113087d0edb2691cd" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" dependencies = [ "ahash", "arrow-array", @@ -4386,10 +4372,15 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.12+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", ] + +[[patch.unused]] +name = "arrow-flight" +version = "52.2.0" +source = "git+https://github.com/apache/arrow-rs.git#3e02689e3464bc8cf929a0d116888fb6f59999fa" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index cbd9ffd0feba..2259e17eedff 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -62,3 +62,17 @@ assert_cmd = "2.0" ctor = "0.2.0" predicates = "3.0" rstest = "0.17" + +[patch.crates-io] +arrow = { git = "https://github.com/apache/arrow-rs.git" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git" } +arrow-cast = { git = "https://github.com/apache/arrow-rs.git" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git" } +parquet = { git = "https://github.com/apache/arrow-rs.git" } diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index 179fc108e6d2..b29b6ef8d7fa 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -601,7 +601,7 @@ fn sql_round_trip(query: &str, expect: &str) { fn test_interval_lhs_eq() { sql_round_trip( "select interval '2 seconds' = interval '2 seconds'", - "SELECT (INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS' = INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS')", + "SELECT (INTERVAL '2.000000000 SECS' = INTERVAL '2.000000000 SECS')", ); } @@ -609,6 +609,6 @@ fn test_interval_lhs_eq() { fn test_interval_lhs_lt() { sql_round_trip( "select interval '2 seconds' < interval '2 seconds'", - "SELECT (INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS' < INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS')", + "SELECT (INTERVAL '2.000000000 SECS' < INTERVAL '2.000000000 SECS')", ); } From c65fff6cf1318fa7fa493b3b7b47b203f9947693 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 9 Aug 2024 05:47:37 -0400 Subject: [PATCH 09/11] make examples work --- Cargo.toml | 3 ++- benchmarks/Cargo.toml | 14 -------------- datafusion-examples/Cargo.toml | 4 ++-- datafusion/proto-common/Cargo.toml | 2 +- datafusion/proto/Cargo.toml | 2 +- datafusion/substrait/Cargo.toml | 2 +- docs/source/user-guide/configs.md | 2 +- 7 files changed, 8 insertions(+), 21 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 
5b9906f326cf..0b7c7f1cab3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ # under the License. [workspace] -exclude = ["datafusion-cli", "dev/depcheck", "datafusion-examples"] +exclude = ["datafusion-cli", "dev/depcheck", ] members = [ "datafusion/common", "datafusion/common-runtime", @@ -24,6 +24,7 @@ members = [ "datafusion/core", "datafusion/expr", "datafusion/execution", + "datafusion-examples", "datafusion/functions-aggregate", "datafusion/functions", "datafusion/functions-nested", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 09e074bfae24..7f29f7471b6f 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -53,17 +53,3 @@ tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } [dev-dependencies] datafusion-proto = { workspace = true } - -[patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git" } -arrow-array = { git = "https://github.com/apache/arrow-rs.git" } -arrow-buffer = { git = "https://github.com/apache/arrow-rs.git" } -arrow-cast = { git = "https://github.com/apache/arrow-rs.git" } -arrow-data = { git = "https://github.com/apache/arrow-rs.git" } -arrow-ipc = { git = "https://github.com/apache/arrow-rs.git" } -arrow-schema = { git = "https://github.com/apache/arrow-rs.git" } -arrow-select = { git = "https://github.com/apache/arrow-rs.git" } -arrow-string = { git = "https://github.com/apache/arrow-rs.git" } -arrow-ord = { git = "https://github.com/apache/arrow-rs.git" } -arrow-flight = { git = "https://github.com/apache/arrow-rs.git" } -parquet = { git = "https://github.com/apache/arrow-rs.git" } diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 626c365af21c..20a39ae3bd97 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -72,14 +72,14 @@ log = { workspace = true } mimalloc = { version = "0.1", default-features = false } num_cpus = { workspace = true } object_store = { workspace = true, features = ["aws", "http"] } -prost = { version = "0.12", default-features = false } +prost = { version = "0.13.1", default-features = false } prost-derive = { version = "0.13", default-features = false } serde = { version = "1.0.136", features = ["derive"] } serde_json = { workspace = true } tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.11" +tonic = "0.12" url = { workspace = true } uuid = "1.7" diff --git a/datafusion/proto-common/Cargo.toml b/datafusion/proto-common/Cargo.toml index e5d65827cdec..2e54f6517e6a 100644 --- a/datafusion/proto-common/Cargo.toml +++ b/datafusion/proto-common/Cargo.toml @@ -45,7 +45,7 @@ chrono = { workspace = true } datafusion-common = { workspace = true } object_store = { workspace = true } pbjson = { version = "0.6.0", optional = true } -prost = "0.12.0" +prost = "0.13.1" serde = { version = "1.0", optional = true } serde_json = { workspace = true, optional = true } diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 95d9e6700a50..e896b8e201a9 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -53,7 +53,7 @@ datafusion-expr = { workspace = true } datafusion-proto-common = { workspace = true } object_store = { workspace = true } pbjson = { version = "0.6.0", optional = true } -prost = "0.12.0" +prost = "0.13.1" serde = { version = "1.0", optional = true } serde_json = { workspace = true, optional = true } diff --git a/datafusion/substrait/Cargo.toml 
b/datafusion/substrait/Cargo.toml index 9e7ef9632ad3..d4b4a84ca00a 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -39,7 +39,7 @@ datafusion = { workspace = true, default-features = true } itertools = { workspace = true } object_store = { workspace = true } pbjson-types = "0.6" -prost = "0.12" +prost = "0.13.1" substrait = { version = "0.36.0", features = ["serde"] } url = { workspace = true } diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index e0c8391a259a..e2155726a055 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -76,7 +76,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.allow_single_file_parallelism | true | (writing) Controls whether DataFusion will attempt to speed up writing parquet files by serializing them in parallel. Each column in each row group in each output file are serialized in parallel leveraging a maximum possible core count of n_files*n_row_groups*n_columns. | | datafusion.execution.parquet.maximum_parallel_row_group_writers | 1 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | | datafusion.execution.parquet.maximum_buffered_record_batches_per_stream | 2 | (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. | -| datafusion.execution.parquet.schema_force_string_view | false | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. | +| datafusion.execution.parquet.schema_force_string_view | true | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. | | datafusion.execution.aggregate.scalar_update_factor | 10 | Specifies the threshold for using `ScalarValue`s to update accumulators during high-cardinality aggregations for each input batch. The aggregation is considered high-cardinality if the number of affected groups is greater than or equal to `batch_size / scalar_update_factor`. In such cases, `ScalarValue`s are utilized for updating accumulators, rather than the default batch-slice approach. This can lead to performance improvements. By adjusting the `scalar_update_factor`, you can balance the trade-off between more efficient accumulator updates and the number of groups affected. | | datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. 
Defaults to the number of CPU cores on the system | | datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | From 8311bd68ff8c127ec7975d9ff55101bc9ac16fa8 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 9 Aug 2024 06:00:59 -0400 Subject: [PATCH 10/11] prost version --- datafusion/substrait/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index d4b4a84ca00a..b8bbc0b9f95e 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -39,7 +39,7 @@ datafusion = { workspace = true, default-features = true } itertools = { workspace = true } object_store = { workspace = true } pbjson-types = "0.6" -prost = "0.13.1" +prost = "0.12.0" substrait = { version = "0.36.0", features = ["serde"] } url = { workspace = true } From ec82289b374e62297161346cc8ce10ce806381b3 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 9 Aug 2024 06:31:48 -0400 Subject: [PATCH 11/11] update --- Cargo.toml | 2 +- datafusion/core/example.parquet | Bin 976 -> 793 bytes 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 0b7c7f1cab3d..532df42c9619 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ # under the License. [workspace] -exclude = ["datafusion-cli", "dev/depcheck", ] +exclude = ["datafusion-cli", "dev/depcheck"] members = [ "datafusion/common", "datafusion/common-runtime", diff --git a/datafusion/core/example.parquet b/datafusion/core/example.parquet index 17f7473cd221426b545a5f437c42efdc6b1702b3..68078928fac7927cc4a26a2ad6e2f94865fe560b 100644 GIT binary patch delta 196 zcmcb>K9g-i{KTb^lU*3YCqHDAW6@M&V4D1&Q69pOW0IE#i7-gYh`EWDh%qpLSU(uy z%vH>jBbclh875C*ax?I;1Sw?@`@kwHA~ugj)Qv%nK}M8C(v*QsKvIG+Nt8#7Nvw}i zY!jo}4@R|JjFUwfbwpsATG&Lz#GXJjO?F@wXI;gt_J(P446}|POvx;el2xptZj&c7 Pn{re#GB5-<1{neXj!hM$})jExqTG-Y5Dkd$Cd66FzN66<3W+r+2_QsB%4 zRvs|-Pppcc8m$+N}&1) yoT6f44ICi#+t`rQBfItxt6B@|WT1Pw8N`->)b9hTpL~bOlp~9gfg!*#$PfU-Ry0un
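The `datafusion.execution.parquet.schema_force_string_view` option whose default flips to `true` in this series remains an ordinary runtime setting, so a session can still opt back into plain `Utf8`/`Binary` parquet reads. Below is a minimal sketch, not part of these patches, assuming a tokio runtime and the stock `SessionConfig::set_bool` / `SessionContext::new_with_config` builder APIs, and using the repo's `datafusion/core/example.parquet` only as a convenient input:

use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // Override the new default for this session: read Utf8/Binary parquet
    // columns as-is instead of forcing them to Utf8View/BinaryView.
    let config = SessionConfig::new()
        .set_bool("datafusion.execution.parquet.schema_force_string_view", false);
    let ctx = SessionContext::new_with_config(config);

    // Any parquet source works; the in-repo example file is used for illustration.
    let df = ctx
        .read_parquet("datafusion/core/example.parquet", ParquetReadOptions::default())
        .await?;
    df.show().await?;
    Ok(())
}

Leaving the flag at its new default instead exercises the `Utf8View`/`BinaryView` read path that the rest of the series (the fixed-block-size string-view builders, the substrait `Utf8View` mapping, and the sqllogictest normalization) is updated to handle.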