| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| |
| name: Build and test |
| |
| on: |
| workflow_call: |
| inputs: |
| java: |
| required: false |
| type: string |
| default: 17 |
| branch: |
| description: Branch to run the build against |
| required: false |
| type: string |
| # Change 'master' to 'branch-4.0' in branch-4.0 branch after cutting it. |
| default: master |
| hadoop: |
        description: Hadoop version to run with. The HADOOP_PROFILE environment variable should accept it.
| required: false |
| type: string |
| default: hadoop3 |
| envs: |
| description: Additional environment variables to set when running the tests. Should be in JSON format. |
| required: false |
| type: string |
| default: '{}' |
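        # For example: '{"SPARK_ANSI_SQL_MODE": "true", "PYSPARK_CODECOV": "true"}'
        # (both variables are consumed by jobs below).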
| jobs: |
        description: >-
          Jobs to run, in JSON format. The keys should match the job keys defined
          in this file, e.g., build. See the precondition job below.
| required: false |
| type: string |
| default: '' |
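        # For example: '{"build": "true", "docs": "true"}'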
| jobs: |
| precondition: |
| name: Check changes |
| runs-on: ubuntu-latest |
| env: |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| outputs: |
| required: ${{ steps.set-outputs.outputs.required }} |
| image_url: ${{ steps.infra-image-outputs.outputs.image_url }} |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
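        # Record the upstream apache/spark HEAD as APACHE_SPARK_REF, then squash-merge the
        # fork's branch onto it so the checkout has the fork's changes on top of upstream.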
| echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| - name: Check all modules |
| id: set-outputs |
| run: | |
| if [ -z "${{ inputs.jobs }}" ]; then |
| pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` |
| pyspark=`./dev/is-changed.py -m $pyspark_modules` |
| if [[ "${{ github.repository }}" != 'apache/spark' ]]; then |
| pandas=$pyspark |
| yarn=`./dev/is-changed.py -m yarn` |
| kubernetes=`./dev/is-changed.py -m kubernetes` |
| sparkr=`./dev/is-changed.py -m sparkr` |
| tpcds=`./dev/is-changed.py -m sql` |
| docker=`./dev/is-changed.py -m docker-integration-tests` |
| buf=true |
| ui=true |
| docs=true |
| else |
| pandas=false |
| yarn=false |
| kubernetes=false |
| sparkr=false |
| tpcds=false |
| docker=false |
| buf=false |
| ui=false |
| docs=false |
| fi |
| build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,protobuf,yarn,connect,sql,hive"` |
| precondition=" |
| { |
| \"build\": \"$build\", |
| \"pyspark\": \"$pyspark\", |
| \"pyspark-pandas\": \"$pandas\", |
| \"sparkr\": \"$sparkr\", |
| \"tpcds-1g\": \"$tpcds\", |
| \"docker-integration-tests\": \"$docker\", |
| \"lint\" : \"true\", |
| \"docs\" : \"$docs\", |
| \"yarn\" : \"$yarn\", |
| \"k8s-integration-tests\" : \"$kubernetes\", |
| \"buf\" : \"$buf\", |
| \"ui\" : \"$ui\", |
| }" |
| echo $precondition # For debugging |
| # Remove `\n` to avoid "Invalid format" error |
| precondition="${precondition//$'\n'/}}" |
| echo "required=$precondition" >> $GITHUB_OUTPUT |
| else |
| # This is usually set by scheduled jobs. |
| precondition='${{ inputs.jobs }}' |
| echo $precondition # For debugging |
| precondition="${precondition//$'\n'/}" |
| echo "required=$precondition" >> $GITHUB_OUTPUT |
| fi |
| - name: Generate infra image URL |
| id: infra-image-outputs |
| run: | |
| # Convert to lowercase to meet Docker repo name requirement |
| REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') |
| IMG_NAME="apache-spark-ci-image:${{ inputs.branch }}-${{ github.run_id }}" |
| IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" |
| echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT |
| |
| # Build: build Spark and run the tests for specified modules. |
| build: |
| name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}" |
| needs: precondition |
| if: fromJson(needs.precondition.outputs.required).build == 'true' |
| runs-on: ubuntu-latest |
| timeout-minutes: 180 |
| strategy: |
| fail-fast: false |
| matrix: |
| java: |
| - ${{ inputs.java }} |
| hadoop: |
| - ${{ inputs.hadoop }} |
| hive: |
| - hive2.3 |
| # Note that the modules below are from sparktestsupport/modules.py. |
| modules: |
| - >- |
| core, unsafe, kvstore, avro, utils, |
| network-common, network-shuffle, repl, launcher, |
| examples, sketch, variant |
| - >- |
| api, catalyst, hive-thriftserver |
| - >- |
| mllib-local, mllib, graphx |
| - >- |
| streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl, |
| kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf, connect |
| - yarn |
        # Here, we split the Hive and SQL tests into the slow ones and the rest.
| included-tags: [""] |
| excluded-tags: [""] |
| comment: [""] |
| include: |
| # Hive tests |
| - modules: hive |
| java: ${{ inputs.java }} |
| hadoop: ${{ inputs.hadoop }} |
| hive: hive2.3 |
| included-tags: org.apache.spark.tags.SlowHiveTest |
| comment: "- slow tests" |
| - modules: hive |
| java: ${{ inputs.java }} |
| hadoop: ${{ inputs.hadoop }} |
| hive: hive2.3 |
| excluded-tags: org.apache.spark.tags.SlowHiveTest |
| comment: "- other tests" |
| # SQL tests |
| - modules: sql |
| java: ${{ inputs.java }} |
| hadoop: ${{ inputs.hadoop }} |
| hive: hive2.3 |
| included-tags: org.apache.spark.tags.ExtendedSQLTest |
| comment: "- extended tests" |
| - modules: sql |
| java: ${{ inputs.java }} |
| hadoop: ${{ inputs.hadoop }} |
| hive: hive2.3 |
| included-tags: org.apache.spark.tags.SlowSQLTest |
| comment: "- slow tests" |
| - modules: sql |
| java: ${{ inputs.java }} |
| hadoop: ${{ inputs.hadoop }} |
| hive: hive2.3 |
| excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest |
| comment: "- other tests" |
| exclude: |
          # Run the yarn module only when yarn == 'true'. In practice, these tests run in
          # individual PRs, but not against individual commits in the Apache Spark repository.
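          # The expression below yields the string 'yarn' when the yarn job is not required,
          # excluding that matrix entry; otherwise it yields false, which matches no entry.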
| - modules: ${{ fromJson(needs.precondition.outputs.required).yarn != 'true' && 'yarn' }} |
| env: |
| MODULES_TO_TEST: ${{ matrix.modules }} |
| EXCLUDED_TAGS: ${{ matrix.excluded-tags }} |
| INCLUDED_TAGS: ${{ matrix.included-tags }} |
| HADOOP_PROFILE: ${{ matrix.hadoop }} |
| HIVE_PROFILE: ${{ matrix.hive }} |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| SPARK_LOCAL_IP: localhost |
| NOLINT_ON_COMPILE: true |
| SKIP_UNIDOC: true |
| SKIP_MIMA: true |
| SKIP_PACKAGING: true |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| # In order to fetch changed files |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| # Cache local repositories. Note that GitHub Actions cache has a 10G limit. |
| - name: Cache SBT and Maven |
| uses: actions/cache@v4 |
| with: |
| path: | |
| build/apache-maven-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.cache/coursier |
| key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| ${{ matrix.java }}-${{ matrix.hadoop }}-coursier- |
| - name: Free up disk space |
| run: | |
| if [ -f ./dev/free_disk_space ]; then |
| ./dev/free_disk_space |
| fi |
| - name: Install Java ${{ matrix.java }} |
| uses: actions/setup-java@v4 |
| with: |
| distribution: zulu |
| java-version: ${{ matrix.java }} |
| - name: Install Python 3.9 |
| uses: actions/setup-python@v5 |
      # We should install one Python 3 interpreter for SQL and Yarn because:
      # - the SQL modules also have Python-related tests, for example, IntegratedUDFTestUtils.
      # - Yarn has a Python-specific test too, for example, YarnClusterSuite.
| if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') |
| with: |
| python-version: '3.9' |
| architecture: x64 |
| - name: Install Python packages (Python 3.9) |
| if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') |
| run: | |
| python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1' |
| python3.9 -m pip list |
| # Run the tests. |
| - name: Run tests |
| env: ${{ fromJSON(inputs.envs) }} |
| shell: 'script -q -e -c "bash {0}"' |
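      # The 'script' wrapper runs bash under a pseudo-terminal (preserving the child's exit
      # code via -e), for tests that expect a TTY.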
| run: | |
| # Fix for TTY related issues when launching the Ammonite REPL in tests. |
| export TERM=vt100 |
| # Hive "other tests" test needs larger metaspace size based on experiment. |
| if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi |
| # SPARK-46283: should delete the following env replacement after SPARK 3.x EOL |
| if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then |
| MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /} |
| fi |
| export SERIAL_SBT_TESTS=1 |
| ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" |
| - name: Upload test results to report |
| if: always() |
| uses: actions/upload-artifact@v4 |
| with: |
| name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} |
| path: "**/target/test-reports/*.xml" |
| - name: Upload unit tests log files |
| if: ${{ !success() }} |
| uses: actions/upload-artifact@v4 |
| with: |
| name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} |
| path: "**/target/unit-tests.log" |
| |
| infra-image: |
| name: "Base image build" |
| needs: precondition |
| if: >- |
| fromJson(needs.precondition.outputs.required).pyspark == 'true' || |
| fromJson(needs.precondition.outputs.required).lint == 'true' || |
| fromJson(needs.precondition.outputs.required).sparkr == 'true' |
| runs-on: ubuntu-latest |
| permissions: |
| packages: write |
| steps: |
| - name: Login to GitHub Container Registry |
| uses: docker/login-action@v3 |
| with: |
| registry: ghcr.io |
| username: ${{ github.actor }} |
| password: ${{ secrets.GITHUB_TOKEN }} |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| # In order to fetch changed files |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| - name: Set up QEMU |
| uses: docker/setup-qemu-action@v3 |
| - name: Set up Docker Buildx |
| uses: docker/setup-buildx-action@v3 |
| - name: Build and push |
| id: docker_build |
| uses: docker/build-push-action@v5 |
| with: |
| context: ./dev/infra/ |
| push: true |
| tags: | |
| ${{ needs.precondition.outputs.image_url }} |
        # Use the infra image cache to speed up the build.
| cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }} |
| |
| pyspark: |
| needs: [precondition, infra-image] |
    # Always run if pyspark == 'true', even when infra-image is skipped (such as in
    # non-master jobs); (!cancelled()) lets this job run despite the skipped dependency.
| if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true' |
| name: "Build modules: ${{ matrix.modules }}" |
| runs-on: ubuntu-latest |
| timeout-minutes: 180 |
| container: |
| image: ${{ needs.precondition.outputs.image_url }} |
| strategy: |
| fail-fast: false |
| matrix: |
| java: |
| - ${{ inputs.java }} |
| modules: |
| - >- |
| pyspark-sql, pyspark-resource, pyspark-testing |
| - >- |
| pyspark-core, pyspark-errors, pyspark-streaming |
| - >- |
| pyspark-mllib, pyspark-ml, pyspark-ml-connect |
| - >- |
| pyspark-connect |
| - >- |
| pyspark-pandas |
| - >- |
| pyspark-pandas-slow |
| - >- |
| pyspark-pandas-connect-part0 |
| - >- |
| pyspark-pandas-connect-part1 |
| - >- |
| pyspark-pandas-connect-part2 |
| - >- |
| pyspark-pandas-connect-part3 |
| exclude: |
          # Run the pyspark-pandas modules only when pyspark-pandas == 'true'. In practice,
          # these tests run in individual PRs, but not against individual commits in the
          # Apache Spark repository.
| - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }} |
| - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }} |
| - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }} |
| - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }} |
| - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }} |
| - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }} |
| env: |
| MODULES_TO_TEST: ${{ matrix.modules }} |
| PYTHON_TO_TEST: 'python3.11' |
| HADOOP_PROFILE: ${{ inputs.hadoop }} |
| HIVE_PROFILE: hive2.3 |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| SPARK_LOCAL_IP: localhost |
| SKIP_UNIDOC: true |
| SKIP_MIMA: true |
| SKIP_PACKAGING: true |
| METASPACE_SIZE: 1g |
| BRANCH: ${{ inputs.branch }} |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| # In order to fetch changed files |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Add GITHUB_WORKSPACE to git trust safe.directory |
| run: | |
| git config --global --add safe.directory ${GITHUB_WORKSPACE} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| # Cache local repositories. Note that GitHub Actions cache has a 10G limit. |
| - name: Cache SBT and Maven |
| uses: actions/cache@v4 |
| with: |
| path: | |
| build/apache-maven-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.cache/coursier |
| key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| pyspark-coursier- |
| - name: Free up disk space |
| shell: 'script -q -e -c "bash {0}"' |
| run: | |
| if [ -f ./dev/free_disk_space_container ]; then |
| ./dev/free_disk_space_container |
| fi |
| - name: Install Java ${{ matrix.java }} |
| uses: actions/setup-java@v4 |
| with: |
| distribution: zulu |
| java-version: ${{ matrix.java }} |
| - name: List Python packages (${{ env.PYTHON_TO_TEST }}) |
| env: ${{ fromJSON(inputs.envs) }} |
| shell: 'script -q -e -c "bash {0}"' |
| run: | |
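        # PYTHON_TO_TEST may be a comma-separated list of interpreters; list packages for each.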
| for py in $(echo $PYTHON_TO_TEST | tr "," "\n") |
| do |
| echo $py |
| $py -m pip list |
| done |
| - name: Install Conda for pip packaging test |
| if: contains(matrix.modules, 'pyspark-errors') |
| run: | |
| curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh |
| bash miniconda.sh -b -p $HOME/miniconda |
| rm miniconda.sh |
| # Run the tests. |
| - name: Run tests |
| env: ${{ fromJSON(inputs.envs) }} |
| shell: 'script -q -e -c "bash {0}"' |
| run: | |
| if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then |
| export PATH=$PATH:$HOME/miniconda/bin |
| export SKIP_PACKAGING=false |
| echo "Python Packaging Tests Enabled!" |
| fi |
| if [ ! -z "$PYTHON_TO_TEST" ]; then |
| ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST" |
| else |
            # For branch-3.5 and below, fall back to the default Python versions.
| ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" |
| fi |
| - name: Upload coverage to Codecov |
| if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true' |
| uses: codecov/codecov-action@v4 |
| with: |
| files: ./python/coverage.xml |
| flags: unittests |
| name: PySpark |
| - name: Upload test results to report |
| env: ${{ fromJSON(inputs.envs) }} |
| if: always() |
| uses: actions/upload-artifact@v4 |
| with: |
| name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} |
| path: "**/target/test-reports/*.xml" |
| - name: Upload unit tests log files |
| env: ${{ fromJSON(inputs.envs) }} |
| if: ${{ !success() }} |
| uses: actions/upload-artifact@v4 |
| with: |
| name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} |
| path: "**/target/unit-tests.log" |
| |
| sparkr: |
| needs: [precondition, infra-image] |
    # Always run if sparkr == 'true', even when infra-image is skipped (such as in
    # non-master jobs).
| if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true' |
| name: "Build modules: sparkr" |
| runs-on: ubuntu-latest |
| timeout-minutes: 180 |
| container: |
| image: ${{ needs.precondition.outputs.image_url }} |
| env: |
| HADOOP_PROFILE: ${{ inputs.hadoop }} |
| HIVE_PROFILE: hive2.3 |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| SPARK_LOCAL_IP: localhost |
| SKIP_UNIDOC: true |
| SKIP_MIMA: true |
| SKIP_PACKAGING: true |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| # In order to fetch changed files |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Add GITHUB_WORKSPACE to git trust safe.directory |
| run: | |
| git config --global --add safe.directory ${GITHUB_WORKSPACE} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| # Cache local repositories. Note that GitHub Actions cache has a 10G limit. |
| - name: Cache SBT and Maven |
| uses: actions/cache@v4 |
| with: |
| path: | |
| build/apache-maven-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.cache/coursier |
| key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| sparkr-coursier- |
| - name: Free up disk space |
| run: | |
| if [ -f ./dev/free_disk_space_container ]; then |
| ./dev/free_disk_space_container |
| fi |
| - name: Install Java ${{ inputs.java }} |
| uses: actions/setup-java@v4 |
| with: |
| distribution: zulu |
| java-version: ${{ inputs.java }} |
| - name: Run tests |
| env: ${{ fromJSON(inputs.envs) }} |
| run: | |
        # The following settings are also used by `r-lib/actions/setup-r` to avoid
        # R issues in the Docker environment.
| export TZ=UTC |
| export _R_CHECK_SYSTEM_CLOCK_=FALSE |
| ./dev/run-tests --parallelism 1 --modules sparkr |
| - name: Upload test results to report |
| if: always() |
| uses: actions/upload-artifact@v4 |
| with: |
| name: test-results-sparkr--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 |
| path: "**/target/test-reports/*.xml" |
| |
| buf: |
| needs: [precondition] |
| if: (!cancelled()) && fromJson(needs.precondition.outputs.required).buf == 'true' |
| name: Protobuf breaking change detection and Python CodeGen check |
| runs-on: ubuntu-latest |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| - name: Install Buf |
| uses: bufbuild/buf-setup-action@v1 |
| with: |
| github_token: ${{ secrets.GITHUB_TOKEN }} |
| - name: Protocol Buffers Linter |
| uses: bufbuild/buf-lint-action@v1 |
| with: |
| input: core/src/main/protobuf |
| # Change 'branch-3.5' to 'branch-4.0' in master branch after cutting branch-4.0 branch. |
| - name: Breaking change detection against branch-3.5 |
| uses: bufbuild/buf-breaking-action@v1 |
| with: |
| input: connector/connect/common/src/main |
| against: 'https://github.com/apache/spark.git#branch=branch-3.5,subdir=connector/connect/common/src/main' |
| - name: Install Python 3.9 |
| uses: actions/setup-python@v5 |
| with: |
| python-version: '3.9' |
| - name: Install dependencies for Python CodeGen check |
| run: | |
| python3.9 -m pip install 'black==23.9.1' 'protobuf==4.25.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0' |
| python3.9 -m pip list |
| - name: Python CodeGen check |
| run: ./dev/connect-check-protos.py |
| |
| # Static analysis |
| lint: |
| needs: [precondition, infra-image] |
    # Always run if lint == 'true', even when infra-image is skipped (such as in
    # non-master jobs).
| if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true' |
| name: Linters, licenses, and dependencies |
| runs-on: ubuntu-latest |
| timeout-minutes: 180 |
| env: |
| LC_ALL: C.UTF-8 |
| LANG: C.UTF-8 |
| NOLINT_ON_COMPILE: false |
| PYSPARK_DRIVER_PYTHON: python3.9 |
| PYSPARK_PYTHON: python3.9 |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| container: |
| image: ${{ needs.precondition.outputs.image_url }} |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Add GITHUB_WORKSPACE to git trust safe.directory |
| run: | |
| git config --global --add safe.directory ${GITHUB_WORKSPACE} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| # Cache local repositories. Note that GitHub Actions cache has a 10G limit. |
| - name: Cache SBT and Maven |
| uses: actions/cache@v4 |
| with: |
| path: | |
| build/apache-maven-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.cache/coursier |
| key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| docs-coursier- |
| - name: Cache Maven local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.m2/repository |
| key: docs-maven-${{ hashFiles('**/pom.xml') }} |
| restore-keys: | |
| docs-maven- |
| - name: Free up disk space |
| run: | |
| if [ -f ./dev/free_disk_space_container ]; then |
| ./dev/free_disk_space_container |
| fi |
| - name: Install Java ${{ inputs.java }} |
| uses: actions/setup-java@v4 |
| with: |
| distribution: zulu |
| java-version: ${{ inputs.java }} |
| - name: License test |
| run: ./dev/check-license |
| - name: Dependencies test |
| run: ./dev/test-dependencies.sh |
| - name: MIMA test |
| run: ./dev/mima |
| - name: Scala linter |
| run: ./dev/lint-scala |
| - name: Java linter |
| run: ./dev/lint-java |
| - name: Spark connect jvm client mima check |
| run: ./dev/connect-jvm-client-mima-check |
| - name: Install Python linter dependencies for branch-3.4 |
| if: inputs.branch == 'branch-3.4' |
| run: | |
| # SPARK-44554: Copy from https://github.com/apache/spark/blob/a05c27e85829fe742c1828507a1fd180cdc84b54/.github/workflows/build_and_test.yml#L571-L578 |
| # Should delete this section after SPARK 3.4 EOL. |
| python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' |
| python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.48.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' |
| - name: Install Python linter dependencies for branch-3.5 |
| if: inputs.branch == 'branch-3.5' |
| run: | |
| # SPARK-45212: Copy from https://github.com/apache/spark/blob/555c8def51e5951c7bf5165a332795e9e330ec9d/.github/workflows/build_and_test.yml#L631-L638 |
| # Should delete this section after SPARK 3.5 EOL. |
| python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' |
| python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' |
| - name: Install Python dependencies for python linter and documentation generation |
| if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' |
| run: | |
| # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 |
| # See 'ipython_genutils' in SPARK-38517 |
| # See 'docutils<0.18.0' in SPARK-39421 |
| python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ |
| ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ |
| 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ |
| 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ |
| 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' |
| python3.9 -m pip list |
| - name: Python linter |
| run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python |
| # Should delete this section after SPARK 3.5 EOL. |
| - name: Install dependencies for Python code generation check for branch-3.5 |
| if: inputs.branch == 'branch-3.5' |
| run: | |
| # See more in "Installation" https://docs.buf.build/installation#tarball |
| curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz |
| mkdir -p $HOME/buf |
| tar -xvzf buf-Linux-x86_64.tar.gz -C $HOME/buf --strip-components 1 |
| rm buf-Linux-x86_64.tar.gz |
| python3.9 -m pip install 'protobuf==4.25.1' 'mypy-protobuf==3.3.0' |
| # Should delete this section after SPARK 3.5 EOL. |
| - name: Python code generation check for branch-3.5 |
| if: inputs.branch == 'branch-3.5' |
| run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi |
| # Should delete this section after SPARK 3.5 EOL. |
| - name: Install JavaScript linter dependencies for branch-3.4, branch-3.5 |
| if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' |
| run: | |
| apt update |
| apt-get install -y nodejs npm |
| - name: JS linter |
| run: ./dev/lint-js |
| # Should delete this section after SPARK 3.5 EOL. |
| - name: Install R linter dependencies for branch-3.4, branch-3.5 |
| if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' |
| run: | |
| apt update |
| apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \ |
| libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \ |
| libtiff5-dev libjpeg-dev |
| Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" |
| Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" |
| - name: Install R linter dependencies and SparkR |
| run: ./R/install-dev.sh |
| - name: R linter |
| run: ./dev/lint-r |
| |
| # Documentation build |
| docs: |
| needs: [precondition, infra-image] |
    # Always run if docs == 'true', even when infra-image is skipped (such as in
    # non-master jobs).
| if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docs == 'true' |
| name: Documentation generation |
| runs-on: ubuntu-latest |
| timeout-minutes: 180 |
| env: |
| LC_ALL: C.UTF-8 |
| LANG: C.UTF-8 |
| NOLINT_ON_COMPILE: false |
| PYSPARK_DRIVER_PYTHON: python3.9 |
| PYSPARK_PYTHON: python3.9 |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| container: |
| image: ${{ needs.precondition.outputs.image_url }} |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Add GITHUB_WORKSPACE to git trust safe.directory |
| run: | |
| git config --global --add safe.directory ${GITHUB_WORKSPACE} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| # Cache local repositories. Note that GitHub Actions cache has a 10G limit. |
| - name: Cache SBT and Maven |
| uses: actions/cache@v4 |
| with: |
| path: | |
| build/apache-maven-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.cache/coursier |
| key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| docs-coursier- |
| - name: Cache Maven local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.m2/repository |
| key: docs-maven-${{ hashFiles('**/pom.xml') }} |
| restore-keys: | |
| docs-maven- |
| - name: Free up disk space |
| run: | |
| if [ -f ./dev/free_disk_space_container ]; then |
| ./dev/free_disk_space_container |
| fi |
| - name: Install Java ${{ inputs.java }} |
| uses: actions/setup-java@v4 |
| with: |
| distribution: zulu |
| java-version: ${{ inputs.java }} |
| - name: Install Python dependencies for python linter and documentation generation |
| if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' |
| run: | |
| # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 |
| # See 'ipython_genutils' in SPARK-38517 |
| # See 'docutils<0.18.0' in SPARK-39421 |
| python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ |
| ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ |
| 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ |
| 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ |
| 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' |
| python3.9 -m pip list |
| - name: Install dependencies for documentation generation for branch-3.4, branch-3.5 |
| if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' |
| run: | |
        # pandoc is required by nbsphinx to generate the PySpark API docs as well.
| apt-get update -y |
| apt-get install -y libcurl4-openssl-dev pandoc |
| apt-get install -y ruby ruby-dev |
| Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')" |
| Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" |
| Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" |
| # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 |
| python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' |
| python3.9 -m pip install ipython_genutils # See SPARK-38517 |
| python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' |
| python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 |
| - name: Install dependencies for documentation generation |
| run: | |
| # Keep the version of Bundler here in sync with the following locations: |
| # - dev/create-release/spark-rm/Dockerfile |
| # - docs/README.md |
| gem install bundler -v 2.4.22 |
| cd docs |
| bundle install |
| - name: Run documentation build |
| run: | |
| # We need this link because the jekyll build calls `python`. |
| ln -s "$(which python3.9)" "/usr/local/bin/python" |
| # Build docs first with SKIP_API to ensure they are buildable without requiring any |
| # language docs to be built beforehand. |
| cd docs; SKIP_API=1 bundle exec jekyll build; cd .. |
| if [ -f "./dev/is-changed.py" ]; then |
| # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs |
| pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` |
| if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi |
| if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi |
| fi |
| cd docs |
| bundle exec jekyll build |
| - name: Tar documentation |
| if: github.repository != 'apache/spark' |
| run: tar cjf site.tar.bz2 docs/_site |
| - name: Upload documentation |
| if: github.repository != 'apache/spark' |
| uses: actions/upload-artifact@v4 |
| with: |
| name: site |
| path: site.tar.bz2 |
| retention-days: 1 |
| |
  # Any TPC-DS related updates to this job need to be applied to the tpcds-1g-gen job of benchmark.yml as well.
| tpcds-1g: |
| needs: precondition |
| if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true' |
| name: Run TPC-DS queries with SF=1 |
    # Pin to 'Ubuntu 20.04' due to 'databricks/tpcds-kit' compilation issues on newer images.
| runs-on: ubuntu-20.04 |
| timeout-minutes: 180 |
| env: |
| SPARK_LOCAL_IP: localhost |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| - name: Cache SBT and Maven |
| uses: actions/cache@v4 |
| with: |
| path: | |
| build/apache-maven-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.cache/coursier |
| key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| tpcds-coursier- |
| - name: Install Java ${{ inputs.java }} |
| uses: actions/setup-java@v4 |
| with: |
| distribution: zulu |
| java-version: ${{ inputs.java }} |
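    # The cache key hashes this workflow file and TPCDSSchema.scala, so the generated
    # data is rebuilt whenever either changes.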
| - name: Cache TPC-DS generated data |
| id: cache-tpcds-sf-1 |
| uses: actions/cache@v4 |
| with: |
| path: ./tpcds-sf-1 |
| key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }} |
| - name: Checkout tpcds-kit repository |
| if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' |
| uses: actions/checkout@v4 |
| with: |
| repository: databricks/tpcds-kit |
| ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069 |
| path: ./tpcds-kit |
| - name: Build tpcds-kit |
| if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' |
| run: cd tpcds-kit/tools && make OS=LINUX |
| - name: Generate TPC-DS (SF=1) table data |
| if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' |
| run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite" |
| - name: Run TPC-DS queries (Sort merge join) |
| run: | |
| SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" |
| env: |
| SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} |
| SPARK_TPCDS_JOIN_CONF: | |
| spark.sql.autoBroadcastJoinThreshold=-1 |
| spark.sql.join.preferSortMergeJoin=true |
| - name: Run TPC-DS queries (Broadcast hash join) |
| run: | |
| SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" |
| env: |
| SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} |
| SPARK_TPCDS_JOIN_CONF: | |
| spark.sql.autoBroadcastJoinThreshold=10485760 |
| - name: Run TPC-DS queries (Shuffled hash join) |
| run: | |
| SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" |
| env: |
| SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} |
| SPARK_TPCDS_JOIN_CONF: | |
| spark.sql.autoBroadcastJoinThreshold=-1 |
| spark.sql.join.forceApplyShuffledHashJoin=true |
| - name: Run TPC-DS queries on collated data |
| run: | |
| SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSCollationQueryTestSuite" |
| - name: Upload test results to report |
| if: always() |
| uses: actions/upload-artifact@v4 |
| with: |
| name: test-results-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 |
| path: "**/target/test-reports/*.xml" |
| - name: Upload unit tests log files |
| if: ${{ !success() }} |
| uses: actions/upload-artifact@v4 |
| with: |
| name: unit-tests-log-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 |
| path: "**/target/unit-tests.log" |
| |
| docker-integration-tests: |
| needs: precondition |
| if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true' |
| name: Run Docker integration tests |
| runs-on: ubuntu-latest |
| timeout-minutes: 180 |
| env: |
| HADOOP_PROFILE: ${{ inputs.hadoop }} |
| HIVE_PROFILE: hive2.3 |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| SPARK_LOCAL_IP: localhost |
| SKIP_UNIDOC: true |
| SKIP_MIMA: true |
| SKIP_PACKAGING: true |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| - name: Cache SBT and Maven |
| uses: actions/cache@v4 |
| with: |
| path: | |
| build/apache-maven-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.cache/coursier |
| key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| docker-integration-coursier- |
| - name: Install Java ${{ inputs.java }} |
| uses: actions/setup-java@v4 |
| with: |
| distribution: zulu |
| java-version: ${{ inputs.java }} |
| - name: Run tests |
| env: ${{ fromJSON(inputs.envs) }} |
| run: | |
| ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest |
| - name: Upload test results to report |
| if: always() |
| uses: actions/upload-artifact@v4 |
| with: |
| name: test-results-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 |
| path: "**/target/test-reports/*.xml" |
| - name: Upload unit tests log files |
| if: ${{ !success() }} |
| uses: actions/upload-artifact@v4 |
| with: |
| name: unit-tests-log-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 |
| path: "**/target/unit-tests.log" |
| |
| k8s-integration-tests: |
| needs: precondition |
| if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true' |
| name: Run Spark on Kubernetes Integration test |
| runs-on: ubuntu-latest |
| timeout-minutes: 180 |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v4 |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: ${{ inputs.branch }} |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| run: | |
| echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty |
| - name: Cache SBT and Maven |
| uses: actions/cache@v4 |
| with: |
| path: | |
| build/apache-maven-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v4 |
| with: |
| path: ~/.cache/coursier |
| key: k8s-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| k8s-integration-coursier- |
| - name: Install Java ${{ inputs.java }} |
| uses: actions/setup-java@v4 |
| with: |
| distribution: zulu |
| java-version: ${{ inputs.java }} |
    - name: Start minikube
| run: | |
| # See more in "Installation" https://minikube.sigs.k8s.io/docs/start/ |
| curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 |
| sudo install minikube-linux-amd64 /usr/local/bin/minikube |
| rm minikube-linux-amd64 |
        # GitHub Actions runners are limited to 2 CPUs and 6947MB of memory; cap minikube at
        # 2 CPUs / 6GB for more predictable resource accounting.
| minikube start --cpus 2 --memory 6144 |
| - name: Print K8S pods and nodes info |
| run: | |
| kubectl get pods -A |
| kubectl describe node |
| - name: Run Spark on K8S integration test |
| run: | |
| # Prepare PV test |
| PVC_TMP_DIR=$(mktemp -d) |
| export PVC_TESTS_HOST_PATH=$PVC_TMP_DIR |
| export PVC_TESTS_VM_PATH=$PVC_TMP_DIR |
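        # uid 185 matches the non-root 'spark' user used by Spark's Docker images.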
| minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 & |
| kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true |
| if [[ "${{ inputs.branch }}" == 'branch-3.5' || "${{ inputs.branch }}" == 'branch-3.4' ]]; then |
| kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true |
| else |
| kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true |
| fi |
| eval $(minikube docker-env) |
| build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test" |
| - name: Upload Spark on K8S integration tests log files |
| if: ${{ !success() }} |
| uses: actions/upload-artifact@v4 |
| with: |
| name: spark-on-kubernetes-it-log |
| path: "**/target/integration-tests.log" |
| |
| ui: |
| needs: [precondition] |
| if: fromJson(needs.precondition.outputs.required).ui == 'true' |
| name: Run Spark UI tests |
| runs-on: ubuntu-latest |
| timeout-minutes: 180 |
| steps: |
| - uses: actions/checkout@v4 |
| - name: Use Node.js |
| uses: actions/setup-node@v4 |
| with: |
| node-version: 20 |
| cache: 'npm' |
| cache-dependency-path: ui-test/package-lock.json |
| - run: | |
| cd ui-test |
| npm install --save-dev |
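        # --experimental-vm-modules is required for Jest's ECMAScript Modules (ESM) support.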
| node --experimental-vm-modules node_modules/.bin/jest |