apache · sunchao · Feb 29, 2024 · Feb 28, 2024 · viirya · Feb 29, 2024
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -0,0 +1,158 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Run TPC-DS Benchmark
+
+concurrency:  # runs are grouped per repository + ref (PR head branch if set, else commit SHA) + workflow
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+  cancel-in-progress: true  # a newer run in the same group cancels the one still in progress
+
+on:
+  push:
+    paths-ignore:  # pushes that only touch docs or Markdown files do not trigger this workflow
+      - "doc/**"
+      - "**.md"
+  pull_request:
+    paths-ignore:  # same filter as for push events
+      - "doc/**"
+      - "**.md"
+  # manual trigger
+  # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
+  workflow_dispatch:
+
+env:
+  RUST_VERSION: nightly  # workflow-wide; handed to setup-builder as `rust-version` in each job
+
+jobs:
+  prepare:  # builds the Comet native library once and generates (or reuses cached) TPC-DS SF=1 data
+    name: Build native lib and prepare TPC-DS data
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+    env:
+      JAVA_VERSION: 11  # NOTE(review): `jdk-version` below is hard-coded to 11 rather than read from this
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder  # action defined in this repository; its steps are not shown here
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: 11
+      - name: Cache Maven dependencies  # both paths listed; presumably ~ is /root in the container - TODO confirm
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.m2/repository
+            /root/.m2/repository
+          key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}  # changes when any pom.xml changes
+          restore-keys: |
+            ${{ runner.os }}-java-maven-
+
+      - name: Cache TPC-DS generated data
+        id: cache-tpcds-sf-1  # referenced by the `if:` conditions of the data-generation steps below
+        uses: actions/cache@v4
+        with:
+          path: ./tpcds-sf-1
+          key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}  # editing this workflow file changes the key
+      - name: Checkout tpcds-kit repository
+        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'  # skipped when the data was restored from cache
+        uses: actions/checkout@v4
+        with:
+          repository: databricks/tpcds-kit
+          path: ./tpcds-kit
+      - name: Build Comet  # unconditional: the native library is uploaded below on every run
+        run: make release
+      - name: Upload Comet native lib  # NOTE(review): a .dylib presumably never exists on this Linux image - confirm
+        uses: actions/upload-artifact@v4
+        with:
+          name: libcomet-${{ github.run_id }}  # run-scoped name; the benchmark job downloads the same artifact
+          path: |
+            core/target/release/libcomet.so
+            core/target/release/libcomet.dylib
+          retention-days: 1 # remove the artifact after 1 day, only valid for this workflow
+          overwrite: true
+      - name: Build tpcds-kit  # NOTE(review): no `apt-get update` here; presumably setup-builder ran one - confirm
+        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+        run: |
+          apt-get install -y yacc bison flex
+          cd tpcds-kit/tools && make OS=LINUX
+      - name: Generate TPC-DS (SF=1) table data  # runs GenTPCDSData via Maven exec:java, writing to ./tpcds-sf-1
+        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+        run: |
+          cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location `pwd`/../tpcds-sf-1 --scaleFactor 1  --numPartitions 1"
+          cd ..
+
+  benchmark:  # runs CometTPCDSQuerySuite once per join strategy against the data prepared by `prepare`
+    name: Run TPC-DS benchmark
+    runs-on: ubuntu-latest
+    needs: [prepare]  # requires the native-lib artifact and the TPC-DS data cache written by that job
+    container:
+      image: amd64/rust
+    strategy:
+      matrix:
+        join: [sort_merge, broadcast, hash]  # one job per value; each enables exactly one "Run TPC-DS queries" step
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder  # action defined in this repository; its steps are not shown here
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: 11
+      - name: Cache Maven dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.m2/repository
+            /root/.m2/repository
+          key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}
+          restore-keys: |
+            ${{ runner.os }}-java-maven-
+      - name: Restore TPC-DS generated data
+        id: cache-tpcds-sf-1
+        uses: actions/cache/restore@v4  # restore-only: this job consumes the data and never needs to save it
+        with:
+          path: ./tpcds-sf-1
+          key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}  # must match the key used by `prepare`
+          fail-on-cache-miss: true # the prepare job generates and caches this data when missing, so a miss is an error
+      - name: Download Comet native lib
+        uses: actions/download-artifact@v4
+        with:
+          name: libcomet-${{ github.run_id }}  # same run-scoped artifact name that `prepare` uploads
+          path: core/target/release
+      - name: Run TPC-DS queries (Sort merge join)
+        if: matrix.join == 'sort_merge'
+        run: |
+          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
+        env:  # presumably the test suite reads this variable as extra Spark conf - TODO confirm
+          SPARK_TPCDS_JOIN_CONF: |
+            spark.sql.autoBroadcastJoinThreshold=-1
+            spark.sql.join.preferSortMergeJoin=true
+      - name: Run TPC-DS queries (Broadcast hash join)
+        if: matrix.join == 'broadcast'
+        run: |
+          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
+        env:
+          SPARK_TPCDS_JOIN_CONF: |
+            spark.sql.autoBroadcastJoinThreshold=10485760
+      - name: Run TPC-DS queries (Shuffled hash join)
+        if: matrix.join == 'hash'
+        run: |
+          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
+        env:
+          SPARK_TPCDS_JOIN_CONF: |
+            spark.sql.autoBroadcastJoinThreshold=-1
+            spark.sql.join.forceApplyShuffledHashJoin=true
diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml
@@ -132,64 +132,3 @@ jobs:
       - if: matrix.test-target == 'java'
         name: Java test steps
         uses: ./.github/actions/java-test
-
-  tpcds-1g:
-    name: Run TPC-DS queries with SF=1
-    runs-on: ubuntu-latest
-    container:
-      image: amd64/rust
-    env:
-      JAVA_VERSION: 11
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: 11
-
-      - name: Cache TPC-DS generated data
-        id: cache-tpcds-sf-1
-        uses: actions/cache@v4
-        with:
-          path: ./tpcds-sf-1
-          key: tpcds-${{ hashFiles('.github/workflows/pr_build.yml') }}
-      - name: Checkout tpcds-kit repository
-        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
-        uses: actions/checkout@v4
-        with:
-          repository: databricks/tpcds-kit
-          path: ./tpcds-kit
-      - name: Build Comet
-        run: make release
-      - name: Build tpcds-kit
-        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
-        run: |
-          apt-get install -y yacc bison flex
-          cd tpcds-kit/tools && make OS=LINUX
-      - name: Generate TPC-DS (SF=1) table data
-        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
-        run: |
-          cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location `pwd`/../tpcds-sf-1 --scaleFactor 1  --numPartitions 1"
-          cd ..
-      - name: Run TPC-DS queries (Sort merge join)
-        run: |
-          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
-        env:
-          SPARK_TPCDS_JOIN_CONF: |
-            spark.sql.autoBroadcastJoinThreshold=-1
-            spark.sql.join.preferSortMergeJoin=true
-      - name: Run TPC-DS queries (Broadcast hash join)
-        run: |
-          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
-        env:
-          SPARK_TPCDS_JOIN_CONF: |
-            spark.sql.autoBroadcastJoinThreshold=10485760
-      - name: Run TPC-DS queries (Shuffled hash join)
-        run: |
-          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
-        env:
-          SPARK_TPCDS_JOIN_CONF: |
-            spark.sql.autoBroadcastJoinThreshold=-1
-            spark.sql.join.forceApplyShuffledHashJoin=true
-