From c33cb853330c7d38955e1ea7c72ac60768a6824f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 1 Feb 2025 07:00:50 -0500 Subject: [PATCH] Do not rename struct fields when coercing types in `CASE` (#14384) * Fix field name during struct equality coercion * fix bug * Add more tests * Update tests per #14396 --- .../expr-common/src/type_coercion/binary.rs | 22 ++-- datafusion/sqllogictest/test_files/case.slt | 110 ++++++++++++++++++ 2 files changed, 125 insertions(+), 7 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 83f47883add9..571c17119427 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -961,23 +961,31 @@ fn struct_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option return None; } - let types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()) + let coerced_types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()) .map(|(lhs, rhs)| comparison_coercion(lhs.data_type(), rhs.data_type())) .collect::>>()?; - let fields = types + // preserve the field name and nullability + let orig_fields = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()); + + let fields: Vec = coerced_types .into_iter() - .enumerate() - .map(|(i, datatype)| { - Arc::new(Field::new(format!("c{i}"), datatype, true)) - }) - .collect::>(); + .zip(orig_fields) + .map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs, rhs)) + .collect(); Some(Struct(fields.into())) } _ => None, } } +/// returns the result of coercing two fields to a common type +fn coerce_fields(common_type: DataType, lhs: &FieldRef, rhs: &FieldRef) -> FieldRef { + let is_nullable = lhs.is_nullable() || rhs.is_nullable(); + let name = lhs.name(); // pick the name from the left field + Arc::new(Field::new(name, common_type, is_nullable)) +} + /// Returns the output type of applying mathematics operations such as /// `+` to arguments of `lhs_type` and `rhs_type`. fn mathematics_numerical_coercion( diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index a339c2aa037e..46e9c86c7591 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -308,3 +308,113 @@ NULL NULL false statement ok drop table foo + + +# Test coercion of inner struct field names +# Reproducer for https://github.com/apache/datafusion/issues/14383 +statement ok +create table t as values +( + 100, -- column1 int (so the case isn't constant folded) + { 'foo': 'bar' }, -- column2 has List of Struct w/ Utf8 + { 'foo': arrow_cast('baz', 'Utf8View') }, -- column3 has List of Struct w/ Utf8View + { 'foo': arrow_cast('blarg', 'Utf8View') } -- column4 has List of Struct w/ Utf8View +); + + +# Note field name is foo +query ??? +SELECT column2, column3, column4 FROM t; +---- +{foo: bar} {foo: baz} {foo: blarg} + +# Coerce fields, expect the field name to be the name of the first arg to case +# the field should not be named 'c0' +query ? +SELECT + case + when column1 > 0 then column2 + when column1 < 0 then column3 + else column4 + end +FROM t; +---- +{foo: bar} + +query ? +SELECT + case + when column1 > 0 then column3 -- different arg order shouldn't affect name + when column1 < 0 then column4 + else column2 + end +FROM t; +---- +{foo: baz} + +query ? +SELECT + case + when column1 > 0 then column4 -- different arg order shouldn't affect name + when column1 < 0 then column2 + else column3 + end +FROM t; +---- +{foo: blarg} + +statement ok +drop table t + + +# Test coercion of inner struct field names with different orders / missing fields +statement ok +create table t as values +( + 100, -- column1 int (so the case isn't constant folded) + { 'foo': 'a', 'xxx': 'b' }, -- column2: Struct with fields foo, xxx + { 'xxx': 'c', 'foo': 'd' }, -- column3: Struct with fields xxx, foo + { 'xxx': 'e' } -- column4: Struct with field xxx (no second field) +); + +# Note field names are in different orders +query ??? +SELECT column2, column3, column4 FROM t; +---- +{foo: a, xxx: b} {xxx: c, foo: d} {xxx: e} + +# coerce structs with different field orders, +# (note the *value*s are from column2 but the field name is 'xxx', as the coerced +# type takes the field name from the last argument (column3) +query ? +SELECT + case + when column1 > 0 then column2 -- always true + else column3 + end +FROM t; +---- +{xxx: a, foo: b} + +# coerce structs with different field orders +query ? +SELECT + case + when column1 < 0 then column2 -- always false + else column3 + end +FROM t; +---- +{xxx: c, foo: d} + +# coerce structs with subset of fields +query error Failed to coerce then +SELECT + case + when column1 > 0 then column3 + else column4 + end +FROM t; + +statement ok +drop table t