From c830d1301b2ae2a2323c3c72c94afacd67bcf5f7 Mon Sep 17 00:00:00 2001 From: Xianjin YE Date: Wed, 28 Feb 2024 23:42:26 +0800 Subject: [PATCH] build: Separate and speedup tpc-ds benchmark --- .github/workflows/benchmark.yml | 158 ++++++++++++++++++++++++++++++++ .github/workflows/pr_build.yml | 61 ------------ 2 files changed, 158 insertions(+), 61 deletions(-) create mode 100644 .github/workflows/benchmark.yml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 000000000..adfa1ae26 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,158 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Run TPC-DS Benchmark + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +on: + push: + paths-ignore: + - "doc/**" + - "**.md" + pull_request: + paths-ignore: + - "doc/**" + - "**.md" + # manual trigger + # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow + workflow_dispatch: + +env: + RUST_VERSION: nightly + +jobs: + prepare: + name: Build native lib and prepare TPC-DS data + runs-on: ubuntu-latest + container: + image: amd64/rust + env: + JAVA_VERSION: 11 + steps: + - uses: actions/checkout@v4 + - name: Setup Rust & Java toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{env.RUST_VERSION}} + jdk-version: 11 + - name: Cache Maven dependencies + uses: actions/cache@v4 + with: + path: | + ~/.m2/repository + /root/.m2/repository + key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-java-maven- + + - name: Cache TPC-DS generated data + id: cache-tpcds-sf-1 + uses: actions/cache@v4 + with: + path: ./tpcds-sf-1 + key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }} + - name: Checkout tpcds-kit repository + if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' + uses: actions/checkout@v4 + with: + repository: databricks/tpcds-kit + path: ./tpcds-kit + - name: Build Comet + run: make release + - name: Upload Comet native lib + uses: actions/upload-artifact@v4 + with: + name: libcomet-${{ github.run_id }} + path: | + core/target/release/libcomet.so + core/target/release/libcomet.dylib + retention-days: 1 # remove the artifact after 1 day, only valid for this workflow + overwrite: true + - name: Build tpcds-kit + if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' + run: | + apt-get install -y yacc bison flex + cd tpcds-kit/tools && make OS=LINUX + - name: Generate TPC-DS (SF=1) table data + if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' + run: | + 
cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location `pwd`/../tpcds-sf-1 --scaleFactor 1 --numPartitions 1" + cd .. + + benchmark: + name: Run TPC-DS benchmark + runs-on: ubuntu-latest + needs: [prepare] + container: + image: amd64/rust + strategy: + matrix: + join: [sort_merge, broadcast, hash] + steps: + - uses: actions/checkout@v4 + - name: Setup Rust & Java toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: ${{env.RUST_VERSION}} + jdk-version: 11 + - name: Cache Maven dependencies + uses: actions/cache@v4 + with: + path: | + ~/.m2/repository + /root/.m2/repository + key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-java-maven- + - name: Restore TPC-DS generated data + id: cache-tpcds-sf-1 + uses: actions/cache@v4 + with: + path: ./tpcds-sf-1 + key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }} + fail-on-cache-miss: true # should always hit: the prepare job generates the data if it is not already cached + - name: Download Comet native lib + uses: actions/download-artifact@v4 + with: + name: libcomet-${{ github.run_id }} + path: core/target/release + - name: Run TPC-DS queries (Sort merge join) + if: matrix.join == 'sort_merge' + run: | + SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test + env: + SPARK_TPCDS_JOIN_CONF: | + spark.sql.autoBroadcastJoinThreshold=-1 + spark.sql.join.preferSortMergeJoin=true + - name: Run TPC-DS queries (Broadcast hash join) + if: matrix.join == 'broadcast' + run: | + SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test + env: + SPARK_TPCDS_JOIN_CONF: | + spark.sql.autoBroadcastJoinThreshold=10485760 + - name: Run TPC-DS queries 
(Shuffled hash join) + if: matrix.join == 'hash' + run: | + SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test + env: + SPARK_TPCDS_JOIN_CONF: | + spark.sql.autoBroadcastJoinThreshold=-1 + spark.sql.join.forceApplyShuffledHashJoin=true diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml index fe4dd04a6..316532cee 100644 --- a/.github/workflows/pr_build.yml +++ b/.github/workflows/pr_build.yml @@ -132,64 +132,3 @@ jobs: - if: matrix.test-target == 'java' name: Java test steps uses: ./.github/actions/java-test - - tpcds-1g: - name: Run TPC-DS queries with SF=1 - runs-on: ubuntu-latest - container: - image: amd64/rust - env: - JAVA_VERSION: 11 - steps: - - uses: actions/checkout@v4 - - name: Setup Rust & Java toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: ${{env.RUST_VERSION}} - jdk-version: 11 - - - name: Cache TPC-DS generated data - id: cache-tpcds-sf-1 - uses: actions/cache@v4 - with: - path: ./tpcds-sf-1 - key: tpcds-${{ hashFiles('.github/workflows/pr_build.yml') }} - - name: Checkout tpcds-kit repository - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - uses: actions/checkout@v4 - with: - repository: databricks/tpcds-kit - path: ./tpcds-kit - - name: Build Comet - run: make release - - name: Build tpcds-kit - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: | - apt-get install -y yacc bison flex - cd tpcds-kit/tools && make OS=LINUX - - name: Generate TPC-DS (SF=1) table data - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: | - cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" -Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" -Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location `pwd`/../tpcds-sf-1 --scaleFactor 1 --numPartitions 1" - cd .. 
- - name: Run TPC-DS queries (Sort merge join) - run: | - SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test - env: - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=-1 - spark.sql.join.preferSortMergeJoin=true - - name: Run TPC-DS queries (Broadcast hash join) - run: | - SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test - env: - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=10485760 - - name: Run TPC-DS queries (Shuffled hash join) - run: | - SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test - env: - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=-1 - spark.sql.join.forceApplyShuffledHashJoin=true -