build: Drop Spark 3.2 support #581

Merged
12 commits merged on Jun 18, 2024
6 changes: 1 addition & 5 deletions .github/workflows/pr_build.yml
@@ -254,15 +254,11 @@ jobs:
matrix:
java_version: [8, 17]
test-target: [java]
spark-version: ['3.2', '3.3']
spark-version: ['3.3']
scala-version: ['2.12', '2.13']
exclude:
- java_version: 17
spark-version: '3.2'
- java_version: 8
spark-version: '3.3'
- spark-version: '3.2'
scala-version: '2.13'
fail-fast: false
name: macos-14(Silicon)/java ${{ matrix.java_version }}-spark-${{matrix.spark-version}}-scala-${{matrix.scala-version}}/${{ matrix.test-target }}
runs-on: macos-14
@@ -20,10 +20,9 @@
package org.apache.comet.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.comet.shims.ShimCometParquetUtils
import org.apache.spark.sql.internal.SQLConf

object CometParquetUtils extends ShimCometParquetUtils {
object CometParquetUtils {
private val PARQUET_FIELD_ID_WRITE_ENABLED = "spark.sql.parquet.fieldId.write.enabled"
private val PARQUET_FIELD_ID_READ_ENABLED = "spark.sql.parquet.fieldId.read.enabled"
private val IGNORE_MISSING_PARQUET_FIELD_ID = "spark.sql.parquet.fieldId.read.ignoreMissing"
@@ -27,10 +27,9 @@ import org.apache.parquet.schema._
import org.apache.parquet.schema.LogicalTypeAnnotation.ListLogicalTypeAnnotation
import org.apache.parquet.schema.Type.Repetition
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils
import org.apache.spark.sql.types._

import org.apache.comet.parquet.CometParquetUtils

/**
* This class is copied & slightly modified from [[ParquetReadSupport]] in Spark. Changes:
* - This doesn't extend from Parquet's `ReadSupport` class since that is used for row-based
@@ -53,7 +52,7 @@ object CometParquetReadSupport {
ignoreMissingIds: Boolean): MessageType = {
if (!ignoreMissingIds &&
!containsFieldIds(parquetSchema) &&
CometParquetUtils.hasFieldIds(catalystSchema)) {
ParquetUtils.hasFieldIds(catalystSchema)) {
throw new RuntimeException(
"Spark read schema expects field Ids, " +
"but Parquet file schema doesn't contain any field Ids.\n" +
@@ -334,14 +333,14 @@
}

def matchIdField(f: StructField): Type = {
val fieldId = CometParquetUtils.getFieldId(f)
val fieldId = ParquetUtils.getFieldId(f)
idToParquetFieldMap
.get(fieldId)
.map { parquetTypes =>
if (parquetTypes.size > 1) {
// Need to fail if there is ambiguity, i.e. more than one field is matched
val parquetTypesString = parquetTypes.map(_.getName).mkString("[", ", ", "]")
throw CometParquetUtils.foundDuplicateFieldInFieldIdLookupModeError(
throw QueryExecutionErrors.foundDuplicateFieldInFieldIdLookupModeError(
fieldId,
parquetTypesString)
} else {
@@ -355,9 +354,9 @@
}
}

val shouldMatchById = useFieldId && CometParquetUtils.hasFieldIds(structType)
val shouldMatchById = useFieldId && ParquetUtils.hasFieldIds(structType)
structType.map { f =>
if (shouldMatchById && CometParquetUtils.hasFieldId(f)) {
if (shouldMatchById && ParquetUtils.hasFieldId(f)) {
matchIdField(f)
} else if (caseSensitive) {
matchCaseSensitiveField(f)
@@ -26,6 +26,7 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._
import org.apache.parquet.schema.Type.Repetition._
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter
import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._

@@ -66,8 +67,8 @@ class CometSparkToParquetSchemaConverter(
*/
def convertField(field: StructField): Type = {
val converted = convertField(field, if (field.nullable) OPTIONAL else REQUIRED)
if (useFieldId && CometParquetUtils.hasFieldId(field)) {
converted.withId(CometParquetUtils.getFieldId(field))
if (useFieldId && ParquetUtils.hasFieldId(field)) {
converted.withId(ParquetUtils.getFieldId(field))
} else {
converted
}
@@ -24,7 +24,7 @@ import org.apache.spark.sql.execution.datasources.PartitionedFile

object ShimBatchReader {

// TODO: remove after dropping Spark 3.2 & 3.3 support and directly call PartitionedFile
// TODO: remove after dropping Spark 3.3 support and directly call PartitionedFile
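// PartitionedFile's constructor parameters differ across the Spark versions Comet supports, so
// this shim looks up a compatible constructor reflectively instead of calling it directly.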
def newPartitionedFile(partitionValues: InternalRow, file: String): PartitionedFile =
classOf[PartitionedFile].getDeclaredConstructors
.map(c =>
@@ -21,15 +21,12 @@ package org.apache.comet.shims

object ShimFileFormat {

// TODO: remove after dropping Spark 3.2 & 3.3 support and directly use FileFormat.ROW_INDEX
// TODO: remove after dropping Spark 3.3 support and directly use FileFormat.ROW_INDEX
val ROW_INDEX = "row_index"

// A name for a temporary column that holds row indexes computed by the file format reader
// until they can be placed in the _metadata struct.
// TODO: remove after dropping Spark 3.2 & 3.3 support and directly use
// TODO: remove after dropping Spark 3.3 support and directly use
// FileFormat.ROW_INDEX_TEMPORARY_COLUMN_NAME
val ROW_INDEX_TEMPORARY_COLUMN_NAME: String = s"_tmp_metadata_$ROW_INDEX"

// TODO: remove after dropping Spark 3.2 support and use FileFormat.OPTION_RETURNING_BATCH
val OPTION_RETURNING_BATCH = "returning_batch"
}
@@ -24,7 +24,7 @@ import scala.util.Try
import org.apache.spark.sql.types.{StructField, StructType}

object ShimResolveDefaultColumns {
// TODO: remove after dropping Spark 3.2 & 3.3 support and directly use ResolveDefaultColumns
// TODO: remove after dropping Spark 3.3 support and directly use ResolveDefaultColumns
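// ResolveDefaultColumns cannot be referenced directly on every supported Spark version, so it is
// looked up by name at runtime and a failed lookup is tolerated by the surrounding Try.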
def getExistenceDefaultValue(field: StructField): Any =
Try {
// scalastyle:off classforname

This file was deleted.

This file was deleted.

2 changes: 1 addition & 1 deletion core/src/errors.rs
@@ -61,7 +61,7 @@ pub enum CometError {
Internal(String),

// Note that this message format is based on Spark 3.4 and is more detailed than the message
// returned by Spark 3.2 or 3.3
// returned by Spark 3.3
#[error("[CAST_INVALID_INPUT] The value '{value}' of the type \"{from_type}\" cannot be cast to \"{to_type}\" \
because it is malformed. Correct the value as per the syntax, or change its target type. \
Use `try_cast` to tolerate malformed input and return NULL instead. If necessary \
9 changes: 3 additions & 6 deletions docs/source/contributor-guide/adding_a_new_expression.md
@@ -46,7 +46,7 @@ The `QueryPlanSerde` object has a method `exprToProto`, which is responsible for
For example, the `unhex` function looks like this:

```scala
case e: Unhex if !isSpark32 =>
case e: Unhex =>
val unHex = unhexSerde(e)

val childExpr = exprToProtoInternal(unHex._1, inputs)
```

@@ -59,7 +59,6 @@ case e: Unhex if !isSpark32 =>

A few things to note here:

* The `isSpark32` check is used to fall back to Spark's implementation of `unhex` in Spark 3.2. This is somewhat context specific: due to a bug in `unhex` in Spark 3.2, we want to use the Spark implementation rather than a Comet implementation that, even when correct, would behave differently.
* The function is recursively called on child expressions, so you'll need to make sure that the child expressions are also converted to protobuf.
* `scalarExprToProtoWithReturnType` is for scalar functions that need return type information. Your expression may use a different method depending on the type of expression.
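
To make those notes concrete, here is a rough, self-contained sketch of the pattern. The helper signatures below are simplified assumptions for illustration only; the real `exprToProtoInternal` and `scalarExprToProtoWithReturnType` live in `QueryPlanSerde` and differ in detail.

```scala
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.types.DataType

// Sketch only: stand-ins for Comet's serde helpers, with assumed, simplified signatures.
object SerdeSketch {
  type ProtoExpr = AnyRef // placeholder for the generated protobuf expression type

  // Recursively converts a child expression; None means "unsupported".
  def exprToProtoInternal(e: Expression, inputs: Seq[Attribute]): Option[ProtoExpr] = ???

  // Builds a scalar-function message that carries an explicit return type.
  def scalarExprToProtoWithReturnType(
      name: String,
      returnType: DataType,
      children: Option[ProtoExpr]*): Option[ProtoExpr] = ???

  // The pattern from the snippet above: convert the child first, then wrap it in a
  // scalar-function message; an unsupported child propagates as None, which makes
  // Comet fall back to Spark for the whole expression.
  def serializeUnhexLike(e: Expression, inputs: Seq[Attribute]): Option[ProtoExpr] = {
    val childExpr = exprToProtoInternal(e.children.head, inputs)
    scalarExprToProtoWithReturnType("unhex", e.dataType, childExpr)
  }
}
```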

Expand All @@ -71,8 +70,6 @@ For example, this is the test case for the `unhex` expression:

```scala
test("unhex") {
assume(!isSpark32, "unhex function has incorrect behavior in 3.2") // used to skip the test in Spark 3.2

val table = "unhex_table"
withTable(table) {
sql(s"create table $table(col string) using parquet")
```

@@ -172,11 +169,11 @@ pub(super) fn spark_unhex(args: &[ColumnarValue]) -> Result<ColumnarValue, DataF
If the expression you're adding has different behavior across different Spark versions, you'll need to account for that in your implementation. There are two tools at your disposal to help with this:

1. Shims that exist in `spark/src/main/spark-$SPARK_VERSION/org/apache/comet/shims/CometExprShim.scala` for each Spark version. These shims are used to provide compatibility between different Spark versions.
2. Variables that correspond to the Spark version, such as `isSpark32`, which can be used to conditionally execute code based on the Spark version.
2. Variables that correspond to the Spark version, such as `isSpark33Plus`, which can be used to conditionally execute code based on the Spark version.
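
As a minimal illustration of option 2 (not Comet code), such helpers are just comparisons against `org.apache.spark.SPARK_VERSION`, and callers branch on them; the helper name below mirrors the existing `isSpark33Plus`/`isSpark34Plus` style.

```scala
import org.apache.spark.SPARK_VERSION

// Minimal sketch of version gating, written here only for illustration.
object SparkVersionGate {
  // Same style as the isSpark33Plus check elsewhere in this PR: a plain string comparison.
  def isSpark34Plus: Boolean = SPARK_VERSION >= "3.4"

  // Example: pick behavior that only exists on newer Spark versions.
  def unhexFailOnErrorSupported: Boolean = isSpark34Plus
}
```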

## Shimming to Support Different Spark Versions

By adding shims for each Spark version, you can provide a consistent interface for the expression across different Spark versions. For example, `unhex` added a new optional parameter in Spark 3.4 to control whether it should `failOnError` or not. So for versions 3.2 and 3.3, the shim is:
By adding shims for each Spark version, you can provide a consistent interface for the expression across different Spark versions. For example, `unhex` added a new optional parameter in Spark 3.4 to control whether it should `failOnError` or not. So for version 3.3, the shim is:

```scala
trait CometExprShim {
  // ... (remainder of the trait is truncated in this diff view)
}
```
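
Since the trait body is truncated above, here is a hedged sketch (an assumption, not the actual Comet source) of how two version-specific shims could expose the same `unhexSerde` entry point used in the serde snippet earlier: the Spark 3.3 variant hard-codes `failOnError = false` because the parameter does not exist yet, while the Spark 3.4 variant forwards `unhex.failOnError`. Each trait would live in its own `spark-$SPARK_VERSION` source directory and compile only against that Spark version.

```scala
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, Unhex}

// Sketch of a spark-3.3 shim: Unhex has no failOnError field before 3.4, so pass `false`.
trait CometExprShimFor33 {
  protected def unhexSerde(unhex: Unhex): (Expression, Expression) =
    (unhex.child, Literal(false))
}

// Sketch of a spark-3.4 shim: forward the new failOnError flag.
// (Compiles only against Spark 3.4+, where `Unhex.failOnError` exists.)
trait CometExprShimFor34 {
  protected def unhexSerde(unhex: Unhex): (Expression, Expression) =
    (unhex.child, Literal(unhex.failOnError))
}
```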
2 changes: 1 addition & 1 deletion docs/source/user-guide/installation.md
@@ -28,7 +28,7 @@ Make sure the following requirements are met and software installed on your mach

## Requirements

- Apache Spark 3.2, 3.3, or 3.4
- Apache Spark 3.3 or 3.4
- JDK 8 and up
- GLIBC 2.17 (Centos 7) and up

2 changes: 1 addition & 1 deletion docs/source/user-guide/overview.md
@@ -40,7 +40,7 @@ The following diagram illustrates the architecture of Comet:

## Current Status

The project is currently integrated into Apache Spark 3.2, 3.3, and 3.4.
The project is currently integrated into Apache Spark 3.3 and 3.4.

## Feature Parity with Apache Spark

14 changes: 0 additions & 14 deletions pom.xml
@@ -517,20 +517,6 @@ under the License.
</properties>
</profile>

<profile>
<id>spark-3.2</id>
<properties>
<scala.version>2.12.15</scala.version>
<spark.version>3.2.2</spark.version>
<spark.version.short>3.2</spark.version.short>
<parquet.version>1.12.0</parquet.version>
<!-- we don't add special test suits for spark-3.2, so a not existed dir is specified-->
<additional.3_3.test.source>not-needed-yet</additional.3_3.test.source>
<additional.3_4.test.source>not-needed-yet</additional.3_4.test.source>
<shims.minorVerSrc>spark-3.2</shims.minorVerSrc>
</properties>
</profile>

<profile>
<id>spark-3.3</id>
<properties>
@@ -96,7 +96,7 @@ class CometSparkSessionExtensions
isSchemaSupported(scanExec.scan.asInstanceOf[ParquetScan].readDataSchema) &&
isSchemaSupported(scanExec.scan.asInstanceOf[ParquetScan].readPartitionSchema) &&
// Comet does not support pushedAggregate
getPushedAggregate(scanExec.scan.asInstanceOf[ParquetScan]).isEmpty =>
scanExec.scan.asInstanceOf[ParquetScan].pushedAggregate.isEmpty =>
val cometScan = CometParquetScan(scanExec.scan.asInstanceOf[ParquetScan])
logInfo("Comet extension enabled for Scan")
CometBatchScanExec(
Expand All @@ -116,7 +116,7 @@ class CometSparkSessionExtensions
s"Partition schema $readPartitionSchema is not supported")
// Comet does not support pushedAggregate
val info3 = createMessage(
getPushedAggregate(scanExec.scan.asInstanceOf[ParquetScan]).isDefined,
scanExec.scan.asInstanceOf[ParquetScan].pushedAggregate.isDefined,
"Comet does not support pushed aggregate")
withInfos(scanExec, Seq(info1, info2, info3).flatten.toSet)
scanExec
Expand Down Expand Up @@ -992,8 +992,7 @@ object CometSparkSessionExtensions extends Logging {
case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType |
BinaryType | StringType | _: DecimalType | DateType | TimestampType =>
true
// `TimestampNTZType` is private in Spark 3.2.
case t: DataType if t.typeName == "timestamp_ntz" && !isSpark32 => true
case t: DataType if t.typeName == "timestamp_ntz" => true
case dt =>
logInfo(s"Comet extension is disabled because data type $dt is not supported")
false
Expand All @@ -1015,11 +1014,6 @@ object CometSparkSessionExtensions extends Logging {
}
}

/** Used for operations that weren't available in Spark 3.2 */
def isSpark32: Boolean = {
org.apache.spark.SPARK_VERSION.matches("3\\.2\\..*")
}

def isSpark33Plus: Boolean = {
org.apache.spark.SPARK_VERSION >= "3.3"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ import org.apache.comet.CometSparkSessionExtensions.isSpark34Plus
import org.apache.comet.shims.ShimSQLConf

/**
* Copied from Spark 3.2 & 3.4, in order to fix Parquet shading issue. TODO: find a way to remove
* this duplication
* Copied from Spark 3.4, in order to fix Parquet shading issue. TODO: find a way to remove this
* duplication
*
* Some utility function to convert Spark data source filters to Parquet filters.
*/
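
For readers unfamiliar with what this copied file does, the hedged sketch below (not the copied Spark code itself) shows the general idea of the conversion it performs: mapping a Spark data source filter onto a Parquet `FilterPredicate` via Parquet's `FilterApi`, and declining to push down anything it cannot express.

```scala
import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate}
import org.apache.spark.sql.sources

// Illustrative sketch: translate a couple of simple Spark source filters into Parquet predicates.
// Column types are assumed to be INT32 here just to keep the example short.
object ParquetFilterSketch {
  def toParquetPredicate(filter: sources.Filter): Option[FilterPredicate] = filter match {
    case sources.EqualTo(name, value: Integer) =>
      Some(FilterApi.eq(FilterApi.intColumn(name), value))
    case sources.IsNull(name) =>
      // Parquet expresses IS NULL as equality with a null value.
      Some(FilterApi.eq(FilterApi.intColumn(name), null.asInstanceOf[Integer]))
    case _ =>
      None // unsupported filters are simply not pushed down
  }
}
```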