#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: Run benchmarks

# Started by hand only (workflow_dispatch). Each entry under `inputs` is one
# field of the dispatch form; the jobs read the values through the `inputs`
# context.
on:
  workflow_dispatch:
    inputs:
      # Benchmark selector. The jobs hand it to the benchmark runner and also
      # test it with contains() to decide whether TPC-DS data is required;
      # the default '*' satisfies that check.
      class:
        description: 'Benchmark class'
        required: true
        default: '*'
      # JDK used for the run. Values stay quoted so they are strings.
      jdk:
        description: 'JDK version: 17, 21, or 25'
        required: true
        type: choice
        default: '17'
        options:
          - '17'
          - '21'
          - '25'
      # Scala profile suffix. Quoted on purpose: a bare 2.13 would be read as
      # a float rather than a version string.
      scala:
        description: 'Scala version: 2.13'
        required: true
        type: choice
        default: '2.13'
        options:
          - '2.13'
      # Exported to the benchmark job as SPARK_BENCHMARK_FAILFAST.
      failfast:
        description: 'Failfast'
        required: true
        type: boolean
        default: true
      # Number of matrix entries the benchmark job is fanned out into.
      # No `type` is declared for this input, so it arrives as text.
      num-splits:
        description: 'Number of job splits'
        required: true
        default: '1'
      # When true, the benchmark job commits its results and pushes them to
      # the branch the workflow was started on.
      create-commit:
        description: 'Commit the benchmark results to the current branch'
        required: true
        type: boolean
        default: false

jobs:
  # Turns the `num-splits` input into a JSON array ([1,2,...,N]) that the
  # `benchmark` job consumes as its matrix.
  matrix-gen:
    name: Generate matrix for job splits
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    env:
      # Handed over as an environment variable so the user-supplied value is
      # never spliced into the text of the shell script.
      SPARK_BENCHMARK_NUM_SPLITS: ${{ inputs.num-splits }}
    steps:
      - name: Generate matrix
        id: set-matrix
        # `seq -s, 1 N` prints "1,2,...,N". The variable is quoted so that a
        # malformed value reaches `seq` as one argument and is rejected
        # instead of being word-split.
        run: |
          echo "matrix=[$(seq -s, 1 "$SPARK_BENCHMARK_NUM_SPLITS")]" >> "$GITHUB_OUTPUT"

  # Any TPC-DS related updates on this job need to be applied to tpcds-1g job of build_and_test.yml as well
  tpcds-1g-gen:
    name: "Generate an TPC-DS dataset with SF=1"
    # Only needed when the requested class is one of the TPC-DS benchmarks or
    # the catch-all '*'. The same condition guards the cache restore step in
    # the `benchmark` job; keep the two in sync.
    if: >-
      contains(inputs.class, 'TPCDSQueryBenchmark') ||
      contains(inputs.class, 'LZ4TPCDSDataBenchmark') ||
      contains(inputs.class, 'ZStandardTPCDSDataBenchmark') ||
      contains(inputs.class, '*')
    runs-on: ubuntu-latest
    env:
      SPARK_LOCAL_IP: localhost
    steps:
      - name: Checkout Spark repository
        uses: actions/checkout@v6
        # In order to get diff files
        with:
          fetch-depth: 0
      - name: Cache SBT and Maven
        uses: actions/cache@v5
        with:
          path: |
            build/apache-maven-*
            build/*.jar
            ~/.sbt
          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
          restore-keys: |
            build-
      - name: Cache Coursier local repository
        uses: actions/cache@v5
        with:
          path: ~/.cache/coursier
          key: benchmark-coursier-${{ inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
          restore-keys: |
            benchmark-coursier-${{ inputs.jdk }}
      # The generated dataset is stored under this key; the `benchmark` job
      # restores it with the identical key. Every step below is skipped when
      # the cache already holds the data.
      - name: Cache TPC-DS generated data
        id: cache-tpcds-sf-1
        uses: actions/cache@v5
        with:
          path: |
            ./tpcds-sf-1
            ./tpcds-sf-1-text
          key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
      - name: Checkout tpcds-kit repository
        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
        uses: actions/checkout@v6
        with:
          repository: databricks/tpcds-kit
          # Pinned to an exact commit; quoted so it is always read as a string.
          ref: '1b7fb7529edae091684201fab142d956d6afd881'
          path: ./tpcds-kit
      - name: Build tpcds-kit
        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
        run: cd tpcds-kit/tools && make OS=LINUX
      - name: Install Java ${{ inputs.jdk }}
        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
        uses: actions/setup-java@v5
        with:
          distribution: zulu
          java-version: ${{ inputs.jdk }}
      # Writes two outputs: ./tpcds-sf-1 via GenTPCDSData, and ./tpcds-sf-1-text
      # via a direct dsdgen run.
      - name: Generate TPC-DS (SF=1) table data
        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
        run: |
          build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"
          mkdir -p `pwd`/tpcds-sf-1-text && rm -f `pwd`/tpcds-sf-1-text/* && `pwd`/tpcds-kit/tools/dsdgen -DISTRIBUTIONS `pwd`/tpcds-kit/tools/tpcds.idx -SCALE 1 -DIR `pwd`/tpcds-sf-1-text

  # Builds Spark, runs the selected benchmarks for one split of the matrix,
  # optionally commits the results, and uploads them as an artifact.
  benchmark:
    name: "Run benchmarks: ${{ inputs.class }} (JDK ${{ inputs.jdk }}, Scala ${{ inputs.scala }}, ${{ matrix.split }} out of ${{ inputs.num-splits }} splits)"
    # `tpcds-1g-gen` is conditional (see its `if`), so this job must not
    # depend on it having succeeded; always() lets it run regardless of the
    # outcome of the jobs listed in `needs`.
    if: always()
    needs: [matrix-gen, tpcds-1g-gen]
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      max-parallel: 20
      matrix:
        split: ${{fromJSON(needs.matrix-gen.outputs.matrix)}}
    env:
      SPARK_BENCHMARK_FAILFAST: ${{ inputs.failfast }}
      SPARK_BENCHMARK_NUM_SPLITS: ${{ inputs.num-splits }}
      SPARK_BENCHMARK_CUR_SPLIT: ${{ matrix.split }}
      SPARK_GENERATE_BENCHMARK_FILES: 1
      SPARK_LOCAL_IP: localhost
      # To prevent spark.test.home not being set. See more detail in SPARK-36007.
      SPARK_HOME: ${{ github.workspace }}
      SPARK_TPCDS_DATA: ${{ github.workspace }}/tpcds-sf-1
      SPARK_TPCDS_DATA_TEXT: ${{ github.workspace }}/tpcds-sf-1-text
    steps:
      - name: Checkout Spark repository
        uses: actions/checkout@v6
        # In order to get diff files
        with:
          fetch-depth: 0
      - name: Cache SBT and Maven
        uses: actions/cache@v5
        with:
          path: |
            build/apache-maven-*
            build/*.jar
            ~/.sbt
          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
          restore-keys: |
            build-
      - name: Cache Coursier local repository
        uses: actions/cache@v5
        with:
          path: ~/.cache/coursier
          key: benchmark-coursier-${{ inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
          restore-keys: |
            benchmark-coursier-${{ inputs.jdk }}
      - name: Install Java ${{ inputs.jdk }}
        uses: actions/setup-java@v5
        with:
          distribution: zulu
          java-version: ${{ inputs.jdk }}
      # Restores the dataset saved by `tpcds-1g-gen`; the key must stay
      # identical to the one used there.
      - name: Cache TPC-DS generated data
        if: >-
          contains(inputs.class, 'TPCDSQueryBenchmark') ||
          contains(inputs.class, 'LZ4TPCDSDataBenchmark') ||
          contains(inputs.class, 'ZStandardTPCDSDataBenchmark') ||
          contains(inputs.class, '*')
        id: cache-tpcds-sf-1
        uses: actions/cache@v5
        with:
          path: |
            ./tpcds-sf-1
            ./tpcds-sf-1-text
          key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
      - name: Run benchmarks
        # Dispatch inputs are passed through the environment and expanded by
        # the shell as quoted variables, so their contents can never be
        # interpreted as part of the script.
        env:
          BENCHMARK_CLASS: ${{ inputs.class }}
          BENCHMARK_JDK: ${{ inputs.jdk }}
          BENCHMARK_SCALA: ${{ inputs.scala }}
        run: |
          ./build/sbt "-Pscala-${BENCHMARK_SCALA}" -Pyarn -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl Test/package
          # Make less noisy
          cp conf/log4j2.properties.template conf/log4j2.properties
          sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
          # In benchmark, we use local as master so set driver memory only. Note that GitHub Actions has 7 GB memory limit.
          bin/spark-submit \
            --driver-memory 6g --class org.apache.spark.benchmark.Benchmarks \
            --jars "`find . -name '*-SNAPSHOT-tests.jar' -o -name '*avro*-SNAPSHOT.jar' | paste -sd ',' -`,`find ~/.cache/coursier -name 'curator-test-*.jar'`" \
            "`find . -name 'spark-core*-SNAPSHOT-tests.jar'`" \
            "$BENCHMARK_CLASS"
          # To keep the directory structure and file permissions, tar them
          # See also https://github.com/actions/upload-artifact#maintaining-file-permissions-and-case-sensitive-files
          echo "Preparing the benchmark results:"
          # The two command substitutions are left unquoted on purpose: each
          # listed file name has to become its own argument to tar.
          tar -cvf "target/benchmark-results-${BENCHMARK_JDK}-${BENCHMARK_SCALA}.tar" $(git diff --name-only) $(git ls-files --others --exclude=tpcds-sf-1 --exclude=tpcds-sf-1-text --exclude-standard)
      # Commits the results and pushes them straight to the branch the
      # workflow was started on (no pull request is opened). The push is
      # retried a few times, re-fetching and rebasing before each attempt.
      # NOTE(review): pushing needs a token with write access to repository
      # contents; no `permissions:` block is visible in this workflow, so
      # confirm the default token permissions allow it.
      - name: Commit and push the benchmark results
        if: ${{ inputs.create-commit && success() }}
        # Actor, e-mail, branch name and the input-derived commit message are
        # passed through the environment rather than spliced into the script.
        env:
          ACTOR_NAME: ${{ github.actor }}
          ACTOR_EMAIL: ${{ github.event.pusher.email || format('{0}@users.noreply.github.com', github.actor) }}
          COMMIT_MESSAGE: "Benchmark results for ${{ inputs.class }} (JDK ${{ inputs.jdk }}, Scala ${{ inputs.scala }}, split ${{ matrix.split }} of ${{ inputs.num-splits }})"
          TARGET_BRANCH: ${{ github.ref_name }}
        run: |
          git config --local user.name "$ACTOR_NAME"
          git config --local user.email "$ACTOR_EMAIL"
          git add -A
          git commit -m "$COMMIT_MESSAGE"
          for i in {1..5}; do
            echo "Attempt $i to push..."
            git fetch origin "$TARGET_BRANCH"
            git rebase "origin/$TARGET_BRANCH"
            if git push origin "$TARGET_BRANCH:$TARGET_BRANCH"; then
              echo "Push successful."
              exit 0
            else
              echo "Push failed, retrying in 3 seconds..."
              sleep 3
            fi
          done
          echo "Error: Failed to push after 5 attempts."
          exit 1
      # One artifact per split; `path` must match the tar file name produced
      # by the "Run benchmarks" step.
      - name: Upload benchmark results
        uses: actions/upload-artifact@v6
        with:
          name: benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}-${{ matrix.split }}
          path: target/benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}.tar