[SPARK-47900] Fix check for implicit (UTF8_BINARY) collation

### What changes were proposed in this pull request? Fix method name and logic for cases where we want to check if the string has the UTF8 binary (implicit) collation. ### Why are the changes needed? apache#45592 introduced session level collation which meant that the concept of default collation is not the same as the implicit collation. Method in `SchemaUtils` had its name and logic changed to mean that the collation is binary orderable but that was not it's original intent. It should check if the collation is not implicit/UTF8 binary (`id != 0`). ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing unit tests. ### Was this patch authored or co-authored using generative AI tooling? No Closes apache#46116 from stefankandic/fixBinaryCheckLogic. Authored-by: Stefan Kandic <stefan.kandic@databricks.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
7mming7 · Apr 22, 2024 · b20356e · b20356e
1 parent 2fb31de
commit b20356e
Show file tree

Hide file tree

Showing 5 changed files with 14 additions and 10 deletions.
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
@@ -36,6 +36,10 @@ class StringType private(val collationId: Int) extends AtomicType with Serializa
    */
   def supportsBinaryEquality: Boolean =
     CollationFactory.fetchCollation(collationId).supportsBinaryEquality
+
+  def isUTF8BinaryCollation: Boolean =
+    collationId == CollationFactory.UTF8_BINARY_COLLATION_ID
+
   def isUTF8BinaryLcaseCollation: Boolean =
     collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID
 
@@ -54,7 +58,7 @@ class StringType private(val collationId: Int) extends AtomicType with Serializa
    * If this is an UTF8_BINARY collation output is `string` due to backwards compatibility.
    */
   override def typeName: String =
-    if (collationId == 0) "string"
+    if (isUTF8BinaryCollation) "string"
     else s"string collate ${CollationFactory.fetchCollation(collationId).collationName}"
 
   override def equals(obj: Any): Boolean =

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GeneratedColumn.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GeneratedColumn.scala
@@ -163,9 +163,9 @@ object GeneratedColumn {
         s"generation expression data type ${analyzed.dataType.simpleString} " +
         s"is incompatible with column data type ${dataType.simpleString}")
     }
-    if (analyzed.exists(e => SchemaUtils.hasNonBinarySortableCollatedString(e.dataType))) {
+    if (analyzed.exists(e => SchemaUtils.hasNonUTF8BinaryCollation(e.dataType))) {
       throw unsupportedExpressionError(
-        "generation expression cannot contain non-binary orderable collated string type")
+        "generation expression cannot contain non utf8 binary collated string type")
     }
   }
 

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala
@@ -295,11 +295,11 @@ private[spark] object SchemaUtils {
   def escapeMetaCharacters(str: String): String = SparkSchemaUtils.escapeMetaCharacters(str)
 
   /**
-   * Checks if a given data type has a non-default collation string type.
+   * Checks if a given data type has a non utf8 binary (implicit) collation type.
    */
-  def hasNonBinarySortableCollatedString(dt: DataType): Boolean = {
+  def hasNonUTF8BinaryCollation(dt: DataType): Boolean = {
     dt.existsRecursively {
-      case st: StringType => !st.supportsBinaryOrdering
+      case st: StringType => !st.isUTF8BinaryCollation
       case _ => false
     }
   }

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala
@@ -294,7 +294,7 @@ object DataSourceUtils extends PredicateHelper {
       case childExpression @ (_: Attribute | _: GetStructField) =>
         // don't push down filters for types with non-binary sortable collation
         // as it could lead to incorrect results
-        SchemaUtils.hasNonBinarySortableCollatedString(childExpression.dataType)
+        SchemaUtils.hasNonUTF8BinaryCollation(childExpression.dataType)
 
       case _ => false
     }

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
@@ -747,7 +747,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
         "fieldName" -> "c2",
         "expressionStr" -> "SUBSTRING(c1, 0, 1)",
         "reason" ->
-          "generation expression cannot contain non-binary orderable collated string type"))
+          "generation expression cannot contain non utf8 binary collated string type"))
 
     checkError(
       exception = intercept[AnalysisException] {
@@ -765,7 +765,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
         "fieldName" -> "c2",
         "expressionStr" -> "LOWER(c1)",
         "reason" ->
-          "generation expression cannot contain non-binary orderable collated string type"))
+          "generation expression cannot contain non utf8 binary collated string type"))
 
     checkError(
       exception = intercept[AnalysisException] {
@@ -783,7 +783,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
         "fieldName" -> "c2",
         "expressionStr" -> "UCASE(struct1.a)",
         "reason" ->
-          "generation expression cannot contain non-binary orderable collated string type"))
+          "generation expression cannot contain non utf8 binary collated string type"))
   }
 
   test("SPARK-47431: Default collation set to UNICODE, literal test") {