-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Do not rename struct fields when coercing types in CASE
#14384
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -961,23 +961,31 @@ fn struct_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> | |
return None; | ||
} | ||
|
||
let types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()) | ||
let coerced_types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()) | ||
.map(|(lhs, rhs)| comparison_coercion(lhs.data_type(), rhs.data_type())) | ||
.collect::<Option<Vec<DataType>>>()?; | ||
|
||
let fields = types | ||
// preserve the field name and nullability | ||
let orig_fields = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()); | ||
|
||
let fields: Vec<FieldRef> = coerced_types | ||
.into_iter() | ||
.enumerate() | ||
.map(|(i, datatype)| { | ||
Arc::new(Field::new(format!("c{i}"), datatype, true)) | ||
}) | ||
.collect::<Vec<FieldRef>>(); | ||
.zip(orig_fields) | ||
.map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs, rhs)) | ||
.collect(); | ||
Some(Struct(fields.into())) | ||
} | ||
_ => None, | ||
} | ||
} | ||
|
||
/// returns the result of coercing two fields to a common type | ||
fn coerce_fields(common_type: DataType, lhs: &FieldRef, rhs: &FieldRef) -> FieldRef { | ||
let is_nullable = lhs.is_nullable() || rhs.is_nullable(); | ||
let name = lhs.name(); // pick the name from the left field | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if the name is not the same, return error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that would be better -- can I do it in a follow on PR? In my opinion this PR makes things better (doesn't lose field names) even though it is not perfect and I would like to fix the regression since 43 What I think we should do is switch There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think type union resolution is the correct on for CASE There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I filed to track the issue and plan to merge this one in when ready |
||
Arc::new(Field::new(name, common_type, is_nullable)) | ||
} | ||
|
||
/// Returns the output type of applying mathematics operations such as | ||
/// `+` to arguments of `lhs_type` and `rhs_type`. | ||
fn mathematics_numerical_coercion( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -308,3 +308,113 @@ NULL NULL false | |
|
||
statement ok | ||
drop table foo | ||
|
||
|
||
# Test coercion of inner struct field names | ||
# Reproducer for https://github.com/apache/datafusion/issues/14383 | ||
statement ok | ||
create table t as values | ||
( | ||
100, -- column1 int (so the case isn't constant folded) | ||
{ 'foo': 'bar' }, -- column2 has List of Struct w/ Utf8 | ||
{ 'foo': arrow_cast('baz', 'Utf8View') }, -- column3 has List of Struct w/ Utf8View | ||
{ 'foo': arrow_cast('blarg', 'Utf8View') } -- column4 has List of Struct w/ Utf8View | ||
); | ||
|
||
|
||
# Note field name is foo | ||
query ??? | ||
SELECT column2, column3, column4 FROM t; | ||
---- | ||
{foo: bar} {foo: baz} {foo: blarg} | ||
|
||
# Coerce fields, expect the field name to be the name of the first arg to case | ||
# the field should not be named 'c0' | ||
query ? | ||
SELECT | ||
case | ||
when column1 > 0 then column2 | ||
when column1 < 0 then column3 | ||
else column4 | ||
end | ||
FROM t; | ||
---- | ||
{foo: bar} | ||
|
||
query ? | ||
SELECT | ||
case | ||
when column1 > 0 then column3 -- different arg order shouldn't affect name | ||
when column1 < 0 then column4 | ||
else column2 | ||
end | ||
FROM t; | ||
---- | ||
{foo: baz} | ||
|
||
query ? | ||
SELECT | ||
case | ||
when column1 > 0 then column4 -- different arg order shouldn't affect name | ||
when column1 < 0 then column2 | ||
else column3 | ||
end | ||
FROM t; | ||
---- | ||
{foo: blarg} | ||
|
||
statement ok | ||
drop table t | ||
|
||
|
||
# Test coercion of inner struct field names with different orders / missing fields | ||
statement ok | ||
create table t as values | ||
( | ||
100, -- column1 int (so the case isn't constant folded) | ||
{ 'foo': 'a', 'xxx': 'b' }, -- column2: Struct with fields foo, xxx | ||
{ 'xxx': 'c', 'foo': 'd' }, -- column3: Struct with fields xxx, foo | ||
{ 'xxx': 'e' } -- column4: Struct with field xxx (no second field) | ||
); | ||
|
||
# Note field names are in different orders | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test documents some strange behavior that I don't really know if is a bug or not DataFusion treats the field orders as important, and when coercing two structs, it does it in field order (not by name) So in this case This PR doesn't change this behavior, but I felt that was an important behavior to document in tests, so I did so There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Specifically the basic case comparison goes through "comparison_coercion" (fixed in this PR) which uses the list field order But when coercing list elements, that goes through @jayzhan211 is the eventual plan to unify thee code paths? Or maybe the case coercion should use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We follow how DuckDB works, I think both field order and name should be the same There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To be clear this PR doesn't change the existing behavior of comparison_coercion other than not renaming field names |
||
query ??? | ||
SELECT column2, column3, column4 FROM t; | ||
---- | ||
{foo: a, xxx: b} {xxx: c, foo: d} {xxx: e} | ||
|
||
# coerce structs with different field orders, | ||
# (note the *value*s are from column2 but the field name is 'xxx', as the coerced | ||
# type takes the field name from the last argument (column3) | ||
query ? | ||
SELECT | ||
case | ||
when column1 > 0 then column2 -- always true | ||
else column3 | ||
end | ||
FROM t; | ||
---- | ||
{xxx: a, foo: b} | ||
|
||
# coerce structs with different field orders | ||
query ? | ||
SELECT | ||
case | ||
when column1 < 0 then column2 -- always false | ||
else column3 | ||
end | ||
FROM t; | ||
---- | ||
{xxx: c, foo: d} | ||
|
||
# coerce structs with subset of fields | ||
query error Failed to coerce then | ||
SELECT | ||
case | ||
when column1 > 0 then column3 | ||
else column4 | ||
end | ||
FROM t; | ||
|
||
statement ok | ||
drop table t |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
here is the fix. I suspect this doesn't matter in most cases as the result of a comparison is
boolean
so the names of the intermediate fields don't matter.However, in a
CASE
(and union for that matter) the names do matter, so this PR preserves them