-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix comparison of dictionary arrays #1606
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,8 +21,8 @@ use crate::arrow::datatypes::DataType; | |
use crate::error::{DataFusionError, Result}; | ||
use crate::logical_plan::Operator; | ||
use crate::physical_plan::expressions::coercion::{ | ||
dictionary_coercion, eq_coercion, is_numeric, like_coercion, string_coercion, | ||
temporal_coercion, | ||
dictionary_coercion, eq_coercion, is_dictionary, is_numeric, like_coercion, | ||
string_coercion, temporal_coercion, | ||
}; | ||
use crate::scalar::{MAX_PRECISION_FOR_DECIMAL128, MAX_SCALE_FOR_DECIMAL128}; | ||
|
||
|
@@ -77,7 +77,9 @@ pub(crate) fn coerce_types( | |
} | ||
|
||
fn comparison_eq_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { | ||
if lhs_type == rhs_type { | ||
// can't compare dictionaries directly due to | ||
// https://github.com/apache/arrow-rs/issues/1201 | ||
if lhs_type == rhs_type && !is_dictionary(lhs_type) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will do so -- it is a good point There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in e27110f |
||
// same type => equality is possible | ||
return Some(lhs_type.clone()); | ||
} | ||
|
@@ -90,7 +92,9 @@ fn comparison_order_coercion( | |
lhs_type: &DataType, | ||
rhs_type: &DataType, | ||
) -> Option<DataType> { | ||
if lhs_type == rhs_type { | ||
// can't compare dictionaries directly due to | ||
// https://github.com/apache/arrow-rs/issues/1201 | ||
if lhs_type == rhs_type && !is_dictionary(lhs_type) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's same with above. |
||
// same type => all good | ||
return Some(lhs_type.clone()); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -147,6 +147,10 @@ pub fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<Dat | |
} | ||
} | ||
|
||
pub(crate) fn is_dictionary(t: &DataType) -> bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The duplication of code between this module and There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. #1607 <-- doesn't remove duplication but it at least consolidates the logic into a single module There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. another point I want to point out is that the arrow-rs has some codes to check data type, and the logic is also in datafusion, for example, https://docs.rs/arrow/7.0.0/arrow/datatypes/enum.DataType.html#method.is_numeric, Maybe we should refactor them and make the codebase clear. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed as #1613 |
||
matches!(t, DataType::Dictionary(_, _)) | ||
} | ||
|
||
/// Coercion rule for numerical types: The type that both lhs and rhs | ||
/// can be casted to for numerical calculation, while maintaining | ||
/// maximum precision | ||
|
@@ -158,8 +162,10 @@ pub fn numerical_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<Da | |
return None; | ||
}; | ||
|
||
// same type => all good | ||
if lhs_type == rhs_type { | ||
// can't compare dictionaries directly due to | ||
// https://github.com/apache/arrow-rs/issues/1201 | ||
if lhs_type == rhs_type && !is_dictionary(lhs_type) { | ||
// same type => all good | ||
return Some(lhs_type.clone()); | ||
} | ||
|
||
|
@@ -182,7 +188,9 @@ pub fn numerical_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<Da | |
|
||
// coercion rules for equality operations. This is a superset of all numerical coercion rules. | ||
pub fn eq_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { | ||
if lhs_type == rhs_type { | ||
// can't compare dictionaries directly due to | ||
// https://github.com/apache/arrow-rs/issues/1201 | ||
if lhs_type == rhs_type && !is_dictionary(lhs_type) { | ||
// same type => equality is possible | ||
return Some(lhs_type.clone()); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -655,19 +655,28 @@ async fn query_nested_get_indexed_field_on_struct() -> Result<()> { | |
async fn query_on_string_dictionary() -> Result<()> { | ||
// Test to ensure DataFusion can operate on dictionary types | ||
// Use StringDictionary (32 bit indexes = keys) | ||
let array = vec![Some("one"), None, Some("three")] | ||
let d1: DictionaryArray<Int32Type> = | ||
vec![Some("one"), None, Some("three")].into_iter().collect(); | ||
|
||
let d2: DictionaryArray<Int32Type> = vec![Some("blarg"), None, Some("three")] | ||
.into_iter() | ||
.collect::<DictionaryArray<Int32Type>>(); | ||
.collect(); | ||
|
||
let d3: StringArray = vec![Some("XYZ"), None, Some("three")].into_iter().collect(); | ||
|
||
let batch = | ||
RecordBatch::try_from_iter(vec![("d1", Arc::new(array) as ArrayRef)]).unwrap(); | ||
let batch = RecordBatch::try_from_iter(vec![ | ||
("d1", Arc::new(d1) as ArrayRef), | ||
("d2", Arc::new(d2) as ArrayRef), | ||
("d3", Arc::new(d3) as ArrayRef), | ||
]) | ||
.unwrap(); | ||
|
||
let table = MemTable::try_new(batch.schema(), vec![vec![batch]])?; | ||
let mut ctx = ExecutionContext::new(); | ||
ctx.register_table("test", Arc::new(table))?; | ||
|
||
// Basic SELECT | ||
let sql = "SELECT * FROM test"; | ||
let sql = "SELECT d1 FROM test"; | ||
let actual = execute_to_batches(&mut ctx, sql).await; | ||
let expected = vec![ | ||
"+-------+", | ||
|
@@ -681,7 +690,7 @@ async fn query_on_string_dictionary() -> Result<()> { | |
assert_batches_eq!(expected, &actual); | ||
|
||
// basic filtering | ||
let sql = "SELECT * FROM test WHERE d1 IS NOT NULL"; | ||
let sql = "SELECT d1 FROM test WHERE d1 IS NOT NULL"; | ||
let actual = execute_to_batches(&mut ctx, sql).await; | ||
let expected = vec![ | ||
"+-------+", | ||
|
@@ -693,8 +702,56 @@ async fn query_on_string_dictionary() -> Result<()> { | |
]; | ||
assert_batches_eq!(expected, &actual); | ||
|
||
// comparison with constant | ||
let sql = "SELECT d1 FROM test WHERE d1 = 'three'"; | ||
let actual = execute_to_batches(&mut ctx, sql).await; | ||
let expected = vec![ | ||
"+-------+", | ||
"| d1 |", | ||
"+-------+", | ||
"| three |", | ||
"+-------+", | ||
]; | ||
assert_batches_eq!(expected, &actual); | ||
|
||
// comparison with another dictionary column | ||
let sql = "SELECT d1 FROM test WHERE d1 = d2"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this test fails without the change |
||
let actual = execute_to_batches(&mut ctx, sql).await; | ||
let expected = vec![ | ||
"+-------+", | ||
"| d1 |", | ||
"+-------+", | ||
"| three |", | ||
"+-------+", | ||
]; | ||
assert_batches_eq!(expected, &actual); | ||
|
||
// order comparison with another dictionary column | ||
let sql = "SELECT d1 FROM test WHERE d1 <= d2"; | ||
let actual = execute_to_batches(&mut ctx, sql).await; | ||
let expected = vec![ | ||
"+-------+", | ||
"| d1 |", | ||
"+-------+", | ||
"| three |", | ||
"+-------+", | ||
]; | ||
assert_batches_eq!(expected, &actual); | ||
|
||
// comparison with a non dictionary column | ||
let sql = "SELECT d1 FROM test WHERE d1 = d3"; | ||
let actual = execute_to_batches(&mut ctx, sql).await; | ||
let expected = vec![ | ||
"+-------+", | ||
"| d1 |", | ||
"+-------+", | ||
"| three |", | ||
"+-------+", | ||
]; | ||
assert_batches_eq!(expected, &actual); | ||
|
||
// filtering with constant | ||
let sql = "SELECT * FROM test WHERE d1 = 'three'"; | ||
let sql = "SELECT d1 FROM test WHERE d1 = 'three'"; | ||
let actual = execute_to_batches(&mut ctx, sql).await; | ||
let expected = vec![ | ||
"+-------+", | ||
|
@@ -719,6 +776,20 @@ async fn query_on_string_dictionary() -> Result<()> { | |
]; | ||
assert_batches_eq!(expected, &actual); | ||
|
||
// Expression evaluation with two dictionaries | ||
let sql = "SELECT concat(d1, d2) FROM test"; | ||
let actual = execute_to_batches(&mut ctx, sql).await; | ||
let expected = vec![ | ||
"+-------------------------+", | ||
"| concat(test.d1,test.d2) |", | ||
"+-------------------------+", | ||
"| oneblarg |", | ||
"| |", | ||
"| threethree |", | ||
"+-------------------------+", | ||
]; | ||
assert_batches_eq!(expected, &actual); | ||
|
||
// aggregation | ||
let sql = "SELECT COUNT(d1) FROM test"; | ||
let actual = execute_to_batches(&mut ctx, sql).await; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the arrow
eq
kernels don't supportDictionaryArray
<==>DictionaryArray
comparisons yet, so we need to handle this case specially. I will also file a ticket to add native DictionaryArray comparison support.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
apache/arrow-rs#1201