diff --git a/.github/workflows/util/install_spark_resources.sh b/.github/workflows/util/install_spark_resources.sh
index 1b00fe3ff293..ad454f601a1c 100755
--- a/.github/workflows/util/install_spark_resources.sh
+++ b/.github/workflows/util/install_spark_resources.sh
@@ -50,13 +50,13 @@ case "$1" in
 3.4)
     # Spark-3.4
     cd ${INSTALL_DIR} && \
-    wget -nv https://archive.apache.org/dist/spark/spark-3.4.3/spark-3.4.3-bin-hadoop3.tgz && \
-    tar --strip-components=1 -xf spark-3.4.3-bin-hadoop3.tgz spark-3.4.3-bin-hadoop3/jars/ && \
-    rm -rf spark-3.4.3-bin-hadoop3.tgz && \
+    wget -nv https://archive.apache.org/dist/spark/spark-3.4.4/spark-3.4.4-bin-hadoop3.tgz && \
+    tar --strip-components=1 -xf spark-3.4.4-bin-hadoop3.tgz spark-3.4.4-bin-hadoop3/jars/ && \
+    rm -rf spark-3.4.4-bin-hadoop3.tgz && \
     mkdir -p ${INSTALL_DIR}/shims/spark34/spark_home/assembly/target/scala-2.12 && \
     mv jars ${INSTALL_DIR}/shims/spark34/spark_home/assembly/target/scala-2.12 && \
-    wget -nv https://github.com/apache/spark/archive/refs/tags/v3.4.3.tar.gz && \
-    tar --strip-components=1 -xf v3.4.3.tar.gz spark-3.4.3/sql/core/src/test/resources/ && \
+    wget -nv https://github.com/apache/spark/archive/refs/tags/v3.4.4.tar.gz && \
+    tar --strip-components=1 -xf v3.4.4.tar.gz spark-3.4.4/sql/core/src/test/resources/ && \
     mkdir -p shims/spark34/spark_home/ && \
     mv sql shims/spark34/spark_home/
     ;;
diff --git a/.github/workflows/velox_backend.yml b/.github/workflows/velox_backend.yml
index ec79bc8b1bea..3c198cf83f6a 100644
--- a/.github/workflows/velox_backend.yml
+++ b/.github/workflows/velox_backend.yml
@@ -747,20 +747,27 @@ jobs:
         with:
           name: arrow-jars-centos-7-${{github.sha}}
           path: /root/.m2/repository/org/apache/arrow/
-      - name: Prepare
+      - name: Prepare spark.test.home for Spark 3.4.4 (other tests)
         run: |
+          rm -rf /opt/shims/spark34
+          bash .github/workflows/util/install_spark_resources.sh 3.4
           dnf module -y install python39 && \
           alternatives --set python3 /usr/bin/python3.9 && \
           pip3 install setuptools && \
-          pip3 install pyspark==3.4.3 cython && \
+          pip3 install pyspark==3.4.4 cython && \
           pip3 install pandas pyarrow
-      - name: Build and Run unit test for Spark 3.4.3 (other tests)
+          yum remove -y java-1.8.0-openjdk-devel && yum -y install java-11-openjdk-devel
+
+      - name: Build and Run unit test for Spark 3.4.4 (other tests)
         run: |
           cd $GITHUB_WORKSPACE/
           export SPARK_SCALA_VERSION=2.12
-          $MVN_CMD clean test -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Phudi -Pspark-ut \
+          export JAVA_HOME=/usr/lib/jvm/java-11-openjdk
+          ls -l /opt/shims/spark34/spark_home/
+          $MVN_CMD clean test -Pspark-3.4 -Pjava-11 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Phudi -Pspark-ut \
             -DargLine="-Dspark.test.home=/opt/shims/spark34/spark_home/" \
-            -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags
+            -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags \
+            -DargLine="-Dio.netty.tryReflectionSetAccessible=true"
       - name: Upload test report
         if: always()
         uses: actions/upload-artifact@v4
@@ -791,12 +798,20 @@ jobs:
         with:
           name: arrow-jars-centos-7-${{github.sha}}
           path: /root/.m2/repository/org/apache/arrow/
-      - name: Build and Run unit test for Spark 3.4.3 (slow tests)
+      - name: Prepare
+        run: |
+          yum remove -y java-1.8.0-openjdk-devel && yum -y install java-11-openjdk-devel
+          rm -rf /opt/shims/spark34
+          bash .github/workflows/util/install_spark_resources.sh 3.4
+      - name: Build and Run unit test for Spark 3.4.4 (slow tests)
         run: |
           cd $GITHUB_WORKSPACE/
-          $MVN_CMD clean test -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -Phudi \
+          export JAVA_HOME=/usr/lib/jvm/java-11-openjdk
+          ls -l /opt/shims/spark34/spark_home/
+          $MVN_CMD clean test -Pspark-3.4 -Pjava-11 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -Phudi \
             -DargLine="-Dspark.test.home=/opt/shims/spark34/spark_home/" \
-            -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest
+            -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest \
+            -DargLine="-Dio.netty.tryReflectionSetAccessible=true"
       - name: Upload test report
         if: always()
         uses: actions/upload-artifact@v4
diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/python/ArrowEvalPythonExecSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/python/ArrowEvalPythonExecSuite.scala
deleted file mode 100644
index c2a191a20d0b..000000000000
--- a/backends-velox/src/test/scala/org/apache/gluten/execution/python/ArrowEvalPythonExecSuite.scala
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.gluten.execution.python
-
-import org.apache.gluten.execution.WholeStageTransformerSuite
-
-import org.apache.spark.SparkConf
-import org.apache.spark.api.python.ColumnarArrowEvalPythonExec
-import org.apache.spark.sql.IntegratedUDFTestUtils
-
-class ArrowEvalPythonExecSuite extends WholeStageTransformerSuite {
-
-  import IntegratedUDFTestUtils._
-  import testImplicits.localSeqToDatasetHolder
-  import testImplicits.newProductEncoder
-
-  override protected val resourcePath: String = "/tpch-data-parquet"
-  override protected val fileFormat: String = "parquet"
-  val pyarrowTestUDF = TestScalarPandasUDF(name = "pyarrowUDF")
-
-  override def sparkConf: SparkConf = {
-    super.sparkConf
-      .set("spark.sql.shuffle.partitions", "1")
-      .set("spark.default.parallelism", "1")
-      .set("spark.executor.cores", "1")
-  }
-
-  test("arrow_udf test: without projection") {
-    lazy val base =
-      Seq(("1", 1), ("1", 2), ("2", 1), ("2", 2), ("3", 1), ("3", 2), ("0", 1), ("3", 0))
-        .toDF("a", "b")
-    lazy val expected = Seq(
-      ("1", "1"),
-      ("1", "1"),
-      ("2", "2"),
-      ("2", "2"),
-      ("3", "3"),
-      ("3", "3"),
-      ("0", "0"),
-      ("3", "3")
-    ).toDF("a", "p_a")
-
-    val df2 = base.select("a").withColumn("p_a", pyarrowTestUDF(base("a")))
-    checkSparkOperatorMatch[ColumnarArrowEvalPythonExec](df2)
-    checkAnswer(df2, expected)
-  }
-
-  test("arrow_udf test: with unrelated projection") {
-    lazy val base =
-      Seq(("1", 1), ("1", 2), ("2", 1), ("2", 2), ("3", 1), ("3", 2), ("0", 1), ("3", 0))
-        .toDF("a", "b")
-    lazy val expected = Seq(
-      ("1", 1, "1", 2),
-      ("1", 2, "1", 4),
-      ("2", 1, "2", 2),
-      ("2", 2, "2", 4),
-      ("3", 1, "3", 2),
-      ("3", 2, "3", 4),
-      ("0", 1, "0", 2),
-      ("3", 0, "3", 0)
-    ).toDF("a", "b", "p_a", "d_b")
-
-    val df = base.withColumn("p_a", pyarrowTestUDF(base("a"))).withColumn("d_b", base("b") * 2)
-    checkSparkOperatorMatch[ColumnarArrowEvalPythonExec](df)
-    checkAnswer(df, expected)
-  }
-
-  test("arrow_udf test: with preprojection") {
-    lazy val base =
-      Seq(("1", 1), ("1", 2), ("2", 1), ("2", 2), ("3", 1), ("3", 2), ("0", 1), ("3", 0))
-        .toDF("a", "b")
-    lazy val expected = Seq(
-      ("1", 1, 2, "1", 2),
-      ("1", 2, 4, "1", 4),
-      ("2", 1, 2, "2", 2),
-      ("2", 2, 4, "2", 4),
-      ("3", 1, 2, "3", 2),
-      ("3", 2, 4, "3", 4),
-      ("0", 1, 2, "0", 2),
-      ("3", 0, 0, "3", 0)
-    ).toDF("a", "b", "d_b", "p_a", "p_b")
-    val df = base
-      .withColumn("d_b", base("b") * 2)
-      .withColumn("p_a", pyarrowTestUDF(base("a")))
-      .withColumn("p_b", pyarrowTestUDF(base("b") * 2))
-    checkAnswer(df, expected)
-  }
-}
diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md
index 81bb88c75aec..dfe7dbf53c8d 100644
--- a/docs/get-started/Velox.md
+++ b/docs/get-started/Velox.md
@@ -9,7 +9,7 @@ parent: Getting-Started
 
 | Type  | Version                      |
 |-------|------------------------------|
-| Spark | 3.2.2, 3.3.1, 3.4.3, 3.5.1   |
+| Spark | 3.2.2, 3.3.1, 3.4.4, 3.5.1   |
 | OS    | Ubuntu20.04/22.04, Centos7/8 |
 | jdk   | openjdk8/jdk17               |
 | scala | 2.12                         |
@@ -18,7 +18,7 @@ parent: Getting-Started
 
 Currently, with static build Gluten+Velox backend supports all the Linux OSes, but is only tested on **Ubuntu20.04/Ubuntu22.04/Centos7/Centos8**. With dynamic build, Gluten+Velox backend support **Ubuntu20.04/Ubuntu22.04/Centos7/Centos8** and their variants.
 
-Currently, the officially supported Spark versions are 3.2.2, 3.3.1, 3.4.3 and 3.5.1.
+Currently, the officially supported Spark versions are 3.2.2, 3.3.1, 3.4.4 and 3.5.1.
 
 We need to set up the `JAVA_HOME` env. Currently, Gluten supports **java 8** and **java 17**.
diff --git a/docs/get-started/build-guide.md b/docs/get-started/build-guide.md
index d9c3beaab9dc..32b9ce732b7a 100644
--- a/docs/get-started/build-guide.md
+++ b/docs/get-started/build-guide.md
@@ -73,5 +73,5 @@ It's name pattern is `gluten--bundle-spark_<
 |---------------|----------------------|----------------------|
 | 3.2.2         | 3.2                  | 2.12                 |
 | 3.3.1         | 3.3                  | 2.12                 |
-| 3.4.3         | 3.4                  | 2.12                 |
+| 3.4.4         | 3.4                  | 2.12                 |
 | 3.5.1         | 3.5                  | 2.12                 |
diff --git a/pom.xml b/pom.xml
index 018cd597ac24..393c152265f9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -59,7 +59,7 @@
     <scala.version>2.12.15</scala.version>
     <spark.major.version>3</spark.major.version>
    <sparkbundle.version>3.4</sparkbundle.version>
-    <spark.version>3.4.3</spark.version>
+    <spark.version>3.4.4</spark.version>
     <sparkshim.artifactId>spark-sql-columnar-shims-spark34</sparkshim.artifactId>
     <iceberg.version>1.5.0</iceberg.version>
     <delta.package.name>delta-core</delta.package.name>
@@ -322,8 +322,8 @@
       <properties>
         <sparkbundle.version>3.4</sparkbundle.version>
         <sparkshim.artifactId>spark-sql-columnar-shims-spark34</sparkshim.artifactId>
-        <spark.version>3.4.3</spark.version>
-        <iceberg.version>1.5.0</iceberg.version>
+        <spark.version>3.4.4</spark.version>
+        <iceberg.version>1.7.1</iceberg.version>
         <delta.package.name>delta-core</delta.package.name>
         <delta.version>2.4.0</delta.version>
         <delta.binary.version>24</delta.binary.version>
diff --git a/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/SparkShimProvider.scala b/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/SparkShimProvider.scala
index 2e642366e011..c79626eb21bf 100644
--- a/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/SparkShimProvider.scala
+++ b/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/SparkShimProvider.scala
@@ -20,7 +20,7 @@ import org.apache.gluten.sql.shims.{SparkShimDescriptor, SparkShims}
 import org.apache.gluten.sql.shims.spark34.SparkShimProvider.DESCRIPTOR
 
 object SparkShimProvider {
-  val DESCRIPTOR = SparkShimDescriptor(3, 4, 3)
+  val DESCRIPTOR = SparkShimDescriptor(3, 4, 4)
 }
 
 class SparkShimProvider extends org.apache.gluten.sql.shims.SparkShimProvider {
diff --git a/shims/spark34/src/main/scala/org/apache/spark/shuffle/SparkSortShuffleWriterUtil.scala b/shims/spark34/src/main/scala/org/apache/spark/shuffle/SparkSortShuffleWriterUtil.scala
index 9e684c2afdd4..95b15f04e7cb 100644
--- a/shims/spark34/src/main/scala/org/apache/spark/shuffle/SparkSortShuffleWriterUtil.scala
+++ b/shims/spark34/src/main/scala/org/apache/spark/shuffle/SparkSortShuffleWriterUtil.scala
@@ -27,6 +27,6 @@ object SparkSortShuffleWriterUtil {
       context: TaskContext,
       writeMetrics: ShuffleWriteMetricsReporter,
       shuffleExecutorComponents: ShuffleExecutorComponents): ShuffleWriter[K, V] = {
-    new SortShuffleWriter(handle, mapId, context, shuffleExecutorComponents)
+    new SortShuffleWriter(handle, mapId, context, writeMetrics, shuffleExecutorComponents)
   }
 }
diff --git a/tools/gluten-it/pom.xml b/tools/gluten-it/pom.xml
index 9b1cf10df891..570e45af66d4 100644
--- a/tools/gluten-it/pom.xml
+++ b/tools/gluten-it/pom.xml
@@ -18,7 +18,7 @@
     <maven.compiler.source>${java.version}</maven.compiler.source>
     <maven.compiler.target>${java.version}</maven.compiler.target>
     <scala.library.version>2.12.17</scala.library.version>
-    <spark.version>3.4.3</spark.version>
+    <spark.version>3.4.4</spark.version>
     <scala.binary.version>2.12</scala.binary.version>
     <spark.major.version>3</spark.major.version>
     <celeborn.version>0.3.2-incubating</celeborn.version>
@@ -163,7 +163,7 @@
     <profile>
       <id>spark-3.4</id>
       <properties>
-        <spark.version>3.4.3</spark.version>
+        <spark.version>3.4.4</spark.version>
         <scala.library.version>2.12.17</scala.library.version>
       </properties>
     </profile>