Skip to content

Commit

Permalink
Implement support for LargeString and LargeBinary for StringView and BinaryView (#11034)
Browse files Browse the repository at this point in the history

* implement large binary

* add tests for large string

* better comments for string coercion
  • Loading branch information
XiangpengHao authored Jun 21, 2024
1 parent 959856b commit 19ed182
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 12 deletions.
36 changes: 24 additions & 12 deletions datafusion/expr/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -922,17 +922,21 @@ fn string_concat_internal_coercion(
}
}

/// Coercion rules for string types (Utf8/LargeUtf8/Utf8View).
///
/// Returns the common string type that both sides should be coerced to, or
/// `None` if either side is not one of the string types listed above.
///
/// Precedence:
/// * If at least one argument is a string view (`Utf8View`), we coerce to
///   `Utf8View`, based on the observation that StringArray to
///   StringViewArray is cheap but not vice versa.
/// * Otherwise, between `Utf8` and `LargeUtf8`, we coerce to `LargeUtf8`.
/// * Two `Utf8` arguments stay `Utf8`.
fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
    use arrow::datatypes::DataType::*;
    match (lhs_type, rhs_type) {
        // If Utf8View is in any side, we coerce to Utf8View.
        (Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => {
            Some(Utf8View)
        }
        // Then, if LargeUtf8 is in any side, we coerce to LargeUtf8.
        (LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8),
        // Only plain Utf8 on both sides remains.
        (Utf8, Utf8) => Some(Utf8),
        // Anything else is not a string/string pair.
        _ => None,
    }
}
Expand Down Expand Up @@ -982,18 +986,26 @@ fn binary_to_string_coercion(
}
}

/// Coercion rules for binary types (Binary/LargeBinary): If at least one argument is
/// Coercion rules for binary types (Binary/LargeBinary/BinaryView): If at least one argument is
/// a binary type and both arguments can be coerced into a binary type, coerce
/// to binary type.
fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
(Binary | Utf8, Binary) | (Binary, Utf8) => Some(Binary),
(LargeBinary | Binary | Utf8 | LargeUtf8, LargeBinary)
| (LargeBinary, Binary | Utf8 | LargeUtf8) => Some(LargeBinary),
(BinaryView, BinaryView) | (BinaryView, Binary) | (Binary, BinaryView) => {
// If BinaryView is in any side, we coerce to BinaryView.
(BinaryView, BinaryView | Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View)
| (LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, BinaryView) => {
Some(BinaryView)
}
// Prefer LargeBinary over Binary
(LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, LargeBinary)
| (LargeBinary, Binary | Utf8 | LargeUtf8 | Utf8View) => Some(LargeBinary),

// If Utf8View/LargeUtf8 presents need to be large Binary
(Utf8View | LargeUtf8, Binary) | (Binary, Utf8View | LargeUtf8) => {
Some(LargeBinary)
}
(Binary, Utf8) | (Utf8, Binary) => Some(Binary),
_ => None,
}
}
Expand Down
48 changes: 48 additions & 0 deletions datafusion/sqllogictest/test_files/binary_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ SELECT
arrow_cast(column2, 'Utf8') as column2_utf8,
arrow_cast(column1, 'Binary') AS column1_binary,
arrow_cast(column2, 'Binary') AS column2_binary,
arrow_cast(column1, 'LargeBinary') AS column1_large_binary,
arrow_cast(column2, 'LargeBinary') AS column2_large_binary,
arrow_cast(arrow_cast(column1, 'Binary'), 'BinaryView') AS column1_binaryview,
arrow_cast(arrow_cast(column2, 'Binary'), 'BinaryView') AS column2_binaryview,
arrow_cast(column1, 'Dictionary(Int32, Binary)') AS column1_dict,
Expand Down Expand Up @@ -120,6 +122,21 @@ Xiangpeng Xiangpeng true true false false
Raphael R false false true true
NULL R NULL NULL NULL NULL

# test BinaryViewArray with LargeBinary columns
query TTBBBB
select
column1_utf8, column2_utf8,
column1_binaryview = column2_large_binary,
column2_large_binary = column1_binaryview,
column1_binaryview <> column2_large_binary,
column2_large_binary <> column1_binaryview
from test;
----
Andrew X false false true true
Xiangpeng Xiangpeng true true false false
Raphael R false false true true
NULL R NULL NULL NULL NULL

# BinaryView column to Binary scalar
query TTBBBB
select
Expand All @@ -135,6 +152,21 @@ Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

# BinaryView column to LargeBinary scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_binaryview = arrow_cast('Andrew', 'LargeBinary'),
arrow_cast('Andrew', 'LargeBinary') = column1_binaryview,
column1_binaryview <> arrow_cast('Andrew', 'LargeBinary'),
arrow_cast('Andrew', 'LargeBinary') <> column1_binaryview
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

# Binary column to BinaryView scalar
query TTBBBB
select
Expand All @@ -150,5 +182,21 @@ Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL


# LargeBinary column to BinaryView scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_large_binary = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_large_binary,
column1_large_binary <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_large_binary
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

statement ok
drop table test;
47 changes: 47 additions & 0 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ create table test as
SELECT
arrow_cast(column1, 'Utf8') as column1_utf8,
arrow_cast(column2, 'Utf8') as column2_utf8,
arrow_cast(column1, 'LargeUtf8') as column1_large_utf8,
arrow_cast(column2, 'LargeUtf8') as column2_large_utf8,
arrow_cast(column1, 'Utf8View') as column1_utf8view,
arrow_cast(column2, 'Utf8View') as column2_utf8view,
arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1_dict,
Expand Down Expand Up @@ -118,6 +120,22 @@ Xiangpeng Xiangpeng true true false false
Raphael R false false true true
NULL R NULL NULL NULL NULL

# test StringViewArray with LargeUtf8 columns
query TTBBBB
select
column1_utf8, column2_utf8,
column1_utf8view = column2_large_utf8,
column2_large_utf8 = column1_utf8view,
column1_utf8view <> column2_large_utf8,
column2_large_utf8 <> column1_utf8view
from test;
----
Andrew X false false true true
Xiangpeng Xiangpeng true true false false
Raphael R false false true true
NULL R NULL NULL NULL NULL


# StringView column to String scalar
query TTBBBB
select
Expand All @@ -133,6 +151,21 @@ Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

# StringView column to LargeString scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_utf8view = arrow_cast('Andrew', 'LargeUtf8'),
arrow_cast('Andrew', 'LargeUtf8') = column1_utf8view,
column1_utf8view <> arrow_cast('Andrew', 'LargeUtf8'),
arrow_cast('Andrew', 'LargeUtf8') <> column1_utf8view
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

# String column to StringView scalar
query TTBBBB
select
Expand All @@ -148,6 +181,20 @@ Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

# LargeString column to StringView scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_large_utf8 = arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Andrew', 'Utf8View') = column1_large_utf8,
column1_large_utf8 <> arrow_cast('Andrew', 'Utf8View'),
arrow_cast('Andrew', 'Utf8View') <> column1_large_utf8
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

########
## StringView to Dictionary
Expand Down

0 comments on commit 19ed182

Please sign in to comment.