blob: 881fb8cb067458f0a27ba19726bb9f21399ef291 [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Reusable workflow: callers invoke it through `workflow_call` and may override
# any of the inputs below. Every input is optional and string-typed.
name: Build and test

on:
  workflow_call:
    inputs:
      java:
        description: Java version to install (via actions/setup-java) and run the jobs with.
        required: false
        type: string
        # Quoted so that the default is a string, matching the declared input type.
        default: '17'
      branch:
        description: Branch to run the build against
        required: false
        type: string
        # Change 'master' to 'branch-4.0' in branch-4.0 branch after cutting it.
        default: master
      hadoop:
        description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
        required: false
        type: string
        default: hadoop3
      envs:
        description: Additional environment variables to set when running the tests. Should be in JSON format.
        required: false
        type: string
        # Parsed with fromJSON() by the test steps, so it must stay a valid JSON object.
        default: '{}'
      jobs:
        description: >-
          Jobs to run, and should be in JSON format. The values should be matched with the job's key defined
          in this file, e.g., build. See precondition job below.
        required: false
        type: string
        # Empty means "let the precondition job decide which jobs are required".
        default: ''
jobs:
# Decides which downstream jobs have to run (output `required`, a JSON object of
# job key -> 'true'/'false') and computes the CI image URL they share (output `image_url`).
precondition:
  name: Check changes
  runs-on: ubuntu-latest
  env:
    GITHUB_PREV_SHA: ${{ github.event.before }}
  outputs:
    required: ${{ steps.set-outputs.outputs.required }}
    image_url: ${{ steps.infra-image-outputs.outputs.image_url }}
  steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: ${{ inputs.branch }}
    # In forks, squash-merge the pushed branch on top of the upstream checkout.
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
    - name: Check all modules
      id: set-outputs
      env:
        # Handed over through the environment instead of being pasted into the script text,
        # so quotes inside the JSON can neither break nor inject into the shell code below.
        REQUESTED_JOBS: ${{ inputs.jobs }}
      run: |
        if [ -z "$REQUESTED_JOBS" ]; then
          pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
          pyspark=`./dev/is-changed.py -m $pyspark_modules`
          # Outside apache/spark the optional jobs follow the detected changes;
          # inside apache/spark they are all switched off.
          if [[ "${{ github.repository }}" != 'apache/spark' ]]; then
            pandas=$pyspark
            yarn=`./dev/is-changed.py -m yarn`
            kubernetes=`./dev/is-changed.py -m kubernetes`
            sparkr=`./dev/is-changed.py -m sparkr`
            tpcds=`./dev/is-changed.py -m sql`
            docker=`./dev/is-changed.py -m docker-integration-tests`
            buf=true
            ui=true
            docs=true
          else
            pandas=false
            yarn=false
            kubernetes=false
            sparkr=false
            tpcds=false
            docker=false
            buf=false
            ui=false
            docs=false
          fi
          build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,protobuf,yarn,connect,sql,hive"`
          # Keep this strict JSON (no trailing comma): it is parsed with fromJson() by the other jobs.
          precondition="
            {
              \"build\": \"$build\",
              \"pyspark\": \"$pyspark\",
              \"pyspark-pandas\": \"$pandas\",
              \"sparkr\": \"$sparkr\",
              \"tpcds-1g\": \"$tpcds\",
              \"docker-integration-tests\": \"$docker\",
              \"lint\" : \"true\",
              \"docs\" : \"$docs\",
              \"yarn\" : \"$yarn\",
              \"k8s-integration-tests\" : \"$kubernetes\",
              \"buf\" : \"$buf\",
              \"ui\" : \"$ui\"
            }"
          echo $precondition # For debugging
          # Remove `\n` to avoid "Invalid format" error
          precondition="${precondition//$'\n'/}"
          echo "required=$precondition" >> $GITHUB_OUTPUT
        else
          # This is usually set by scheduled jobs.
          precondition="$REQUESTED_JOBS"
          echo $precondition # For debugging
          precondition="${precondition//$'\n'/}"
          echo "required=$precondition" >> $GITHUB_OUTPUT
        fi
    - name: Generate infra image URL
      id: infra-image-outputs
      run: |
        # Convert to lowercase to meet Docker repo name requirement
        REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
        IMG_NAME="apache-spark-ci-image:${{ inputs.branch }}-${{ github.run_id }}"
        IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
        echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT
# Build: build Spark and run the tests for specified modules.
# One matrix job per module group; each one runs ./dev/run-tests and uploads its reports.
build:
name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}"
needs: precondition
# Skipped unless the precondition job reported build == 'true'.
if: fromJson(needs.precondition.outputs.required).build == 'true'
runs-on: ubuntu-latest
timeout-minutes: 180
strategy:
fail-fast: false
matrix:
java:
- ${{ inputs.java }}
hadoop:
- ${{ inputs.hadoop }}
hive:
- hive2.3
# Note that the modules below are from sparktestsupport/modules.py.
modules:
- >-
core, unsafe, kvstore, avro, utils,
network-common, network-shuffle, repl, launcher,
examples, sketch, variant
- >-
api, catalyst, hive-thriftserver
- >-
mllib-local, mllib, graphx
- >-
streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl,
kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf, connect
- yarn
# The Hive and SQL tests are split (see `include` below) into the tagged slow/extended
# suites and the rest of them.
included-tags: [""]
excluded-tags: [""]
comment: [""]
include:
# Hive tests
- modules: hive
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
included-tags: org.apache.spark.tags.SlowHiveTest
comment: "- slow tests"
- modules: hive
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
excluded-tags: org.apache.spark.tags.SlowHiveTest
comment: "- other tests"
# SQL tests
- modules: sql
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
included-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- extended tests"
- modules: sql
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
included-tags: org.apache.spark.tags.SlowSQLTest
comment: "- slow tests"
- modules: sql
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest
comment: "- other tests"
exclude:
# Drop the 'yarn' entry unless the precondition job reported yarn == 'true': the expression
# yields 'yarn' in that case and false (which matches no module) otherwise.
# In practice, the build will run in individual PR, but not against the individual commit
# in Apache Spark repository.
- modules: ${{ fromJson(needs.precondition.outputs.required).yarn != 'true' && 'yarn' }}
env:
# MODULES_TO_TEST, EXCLUDED_TAGS and INCLUDED_TAGS are read by the "Run tests" script below.
MODULES_TO_TEST: ${{ matrix.modules }}
EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
INCLUDED_TAGS: ${{ matrix.included-tags }}
HADOOP_PROFILE: ${{ matrix.hadoop }}
HIVE_PROFILE: ${{ matrix.hive }}
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
NOLINT_ON_COMPILE: true
SKIP_UNIDOC: true
SKIP_MIMA: true
SKIP_PACKAGING: true
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
# In forks, squash-merge the pushed branch on top of the upstream checkout.
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v4
with:
path: ~/.cache/coursier
key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
# The helper script is optional: the step does nothing when ./dev/free_disk_space is absent.
- name: Free up disk space
run: |
if [ -f ./dev/free_disk_space ]; then
./dev/free_disk_space
fi
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ matrix.java }}
- name: Install Python 3.9
uses: actions/setup-python@v5
# A Python 3 interpreter is needed for the SQL and Yarn modules because:
# - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
# - Yarn has a Python specific test too, for example, YarnClusterSuite.
# The condition below also selects the module group that contains 'connect'.
if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect')
with:
python-version: '3.9'
architecture: x64
- name: Install Python packages (Python 3.9)
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect')
run: |
python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1'
python3.9 -m pip list
# Run the tests.
- name: Run tests
env: ${{ fromJSON(inputs.envs) }}
# NOTE(review): bash is wrapped in `script`, presumably to give the tests a TTY
# (see the TERM note inside the script) - confirm.
shell: 'script -q -e -c "bash {0}"'
run: |
# Fix for TTY related issues when launching the Ammonite REPL in tests.
export TERM=vt100
# Hive "other tests" test needs larger metaspace size based on experiment.
if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi
# SPARK-46283: should delete the following env replacement after SPARK 3.x EOL
if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then
MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /}
fi
export SERIAL_SBT_TESTS=1
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
# Test reports are uploaded regardless of the outcome of the previous steps.
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/test-reports/*.xml"
# Unit test logs are uploaded only when an earlier step did not succeed.
- name: Upload unit tests log files
if: ${{ !success() }}
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"
# Builds the CI container image from ./dev/infra/ and pushes it to the image URL
# computed by the precondition job.
infra-image:
name: "Base image build"
needs: precondition
# Needed only when at least one of the pyspark, lint or sparkr jobs is required.
if: >-
fromJson(needs.precondition.outputs.required).pyspark == 'true' ||
fromJson(needs.precondition.outputs.required).lint == 'true' ||
fromJson(needs.precondition.outputs.required).sparkr == 'true'
runs-on: ubuntu-latest
permissions:
packages: write
steps:
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout Spark repository
uses: actions/checkout@v4
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
# In forks, squash-merge the pushed branch on top of the upstream checkout.
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build and push
id: docker_build
uses: docker/build-push-action@v5
with:
context: ./dev/infra/
push: true
tags: |
${{ needs.precondition.outputs.image_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }}
# PySpark tests: runs ./dev/run-tests for each PySpark module group inside the CI image.
pyspark:
needs: [precondition, infra-image]
# Always run if pyspark == 'true' (unless the run was cancelled), even when infra-image
# is skipped (such as a non-master job).
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true'
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-latest
timeout-minutes: 180
container:
image: ${{ needs.precondition.outputs.image_url }}
strategy:
fail-fast: false
matrix:
java:
- ${{ inputs.java }}
modules:
- >-
pyspark-sql, pyspark-resource, pyspark-testing
- >-
pyspark-core, pyspark-errors, pyspark-streaming
- >-
pyspark-mllib, pyspark-ml, pyspark-ml-connect
- >-
pyspark-connect
- >-
pyspark-pandas
- >-
pyspark-pandas-slow
- >-
pyspark-pandas-connect-part0
- >-
pyspark-pandas-connect-part1
- >-
pyspark-pandas-connect-part2
- >-
pyspark-pandas-connect-part3
exclude:
# Drop the pandas-related entries unless the precondition job reported
# pyspark-pandas == 'true': each expression yields the module name in that case and
# false (which matches no module) otherwise.
# In practice, the build will run in individual PR, but not against the individual commit
# in Apache Spark repository.
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }}
env:
MODULES_TO_TEST: ${{ matrix.modules }}
# The "List Python packages" step splits this value on ',' and runs each entry.
PYTHON_TO_TEST: 'python3.11'
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
SKIP_UNIDOC: true
SKIP_MIMA: true
SKIP_PACKAGING: true
METASPACE_SIZE: 1g
BRANCH: ${{ inputs.branch }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Add GITHUB_WORKSPACE to git trust safe.directory
run: |
git config --global --add safe.directory ${GITHUB_WORKSPACE}
# In forks, squash-merge the pushed branch on top of the upstream checkout.
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v4
with:
path: ~/.cache/coursier
key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
pyspark-coursier-
# The helper script is optional: the step does nothing when it is absent.
- name: Free up disk space
shell: 'script -q -e -c "bash {0}"'
run: |
if [ -f ./dev/free_disk_space_container ]; then
./dev/free_disk_space_container
fi
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ matrix.java }}
- name: List Python packages (${{ env.PYTHON_TO_TEST }})
env: ${{ fromJSON(inputs.envs) }}
shell: 'script -q -e -c "bash {0}"'
run: |
for py in $(echo $PYTHON_TO_TEST | tr "," "\n")
do
echo $py
$py -m pip list
done
# Only the module group containing pyspark-errors enables the packaging tests
# (see "Run tests" below), so Conda is installed just for that group.
- name: Install Conda for pip packaging test
if: contains(matrix.modules, 'pyspark-errors')
run: |
curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
rm miniconda.sh
# Run the tests.
- name: Run tests
env: ${{ fromJSON(inputs.envs) }}
shell: 'script -q -e -c "bash {0}"'
run: |
if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
export PATH=$PATH:$HOME/miniconda/bin
export SKIP_PACKAGING=false
echo "Python Packaging Tests Enabled!"
fi
if [ ! -z "$PYTHON_TO_TEST" ]; then
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
else
# For branch-3.5 and below, it uses the default Python versions.
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
fi
# Runs only when the caller passed PYSPARK_CODECOV == 'true' through the `envs` input.
- name: Upload coverage to Codecov
if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true'
uses: codecov/codecov-action@v4
with:
files: ./python/coverage.xml
flags: unittests
name: PySpark
# Test reports are uploaded regardless of the outcome of the previous steps.
- name: Upload test results to report
env: ${{ fromJSON(inputs.envs) }}
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
path: "**/target/test-reports/*.xml"
# Unit test logs are uploaded only when an earlier step did not succeed.
- name: Upload unit tests log files
env: ${{ fromJSON(inputs.envs) }}
if: ${{ !success() }}
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
path: "**/target/unit-tests.log"
# SparkR tests: runs ./dev/run-tests for the sparkr module inside the CI image.
sparkr:
needs: [precondition, infra-image]
# Always run if sparkr == 'true' (unless the run was cancelled), even when infra-image
# is skipped (such as a non-master job).
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true'
name: "Build modules: sparkr"
runs-on: ubuntu-latest
timeout-minutes: 180
container:
image: ${{ needs.precondition.outputs.image_url }}
env:
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
SKIP_UNIDOC: true
SKIP_MIMA: true
SKIP_PACKAGING: true
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Add GITHUB_WORKSPACE to git trust safe.directory
run: |
git config --global --add safe.directory ${GITHUB_WORKSPACE}
# In forks, squash-merge the pushed branch on top of the upstream checkout.
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v4
with:
path: ~/.cache/coursier
key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
sparkr-coursier-
# The helper script is optional: the step does nothing when it is absent.
- name: Free up disk space
run: |
if [ -f ./dev/free_disk_space_container ]; then
./dev/free_disk_space_container
fi
- name: Install Java ${{ inputs.java }}
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ inputs.java }}
- name: Run tests
env: ${{ fromJSON(inputs.envs) }}
run: |
# The followings are also used by `r-lib/actions/setup-r` to avoid
# R issues at docker environment
export TZ=UTC
export _R_CHECK_SYSTEM_CLOCK_=FALSE
./dev/run-tests --parallelism 1 --modules sparkr
# Test reports are uploaded regardless of the outcome of the previous steps.
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-sparkr--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
# Lints the protobuf definitions with Buf, checks them for breaking changes against
# branch-3.5, and runs ./dev/connect-check-protos.py. Runs directly on the runner
# (it does not depend on the infra-image job).
buf:
needs: [precondition]
# Runs when buf == 'true', unless the run was cancelled.
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).buf == 'true'
name: Protobuf breaking change detection and Python CodeGen check
runs-on: ubuntu-latest
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
# In forks, squash-merge the pushed branch on top of the upstream checkout.
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
- name: Install Buf
uses: bufbuild/buf-setup-action@v1
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
- name: Protocol Buffers Linter
uses: bufbuild/buf-lint-action@v1
with:
input: core/src/main/protobuf
# Change 'branch-3.5' to 'branch-4.0' in master branch after cutting branch-4.0 branch.
- name: Breaking change detection against branch-3.5
uses: bufbuild/buf-breaking-action@v1
with:
input: connector/connect/common/src/main
against: 'https://github.com/apache/spark.git#branch=branch-3.5,subdir=connector/connect/common/src/main'
- name: Install Python 3.9
uses: actions/setup-python@v5
with:
python-version: '3.9'
- name: Install dependencies for Python CodeGen check
run: |
python3.9 -m pip install 'black==23.9.1' 'protobuf==4.25.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
python3.9 -m pip list
- name: Python CodeGen check
run: ./dev/connect-check-protos.py
# Static analysis
# Runs the license, dependency, MiMa and Scala/Java/Python/JS/R lint scripts from ./dev
# inside the CI image. Steps whose name mentions branch-3.4 / branch-3.5 run only for
# those branches.
lint:
needs: [precondition, infra-image]
# Always run if lint == 'true' (unless the run was cancelled), even when infra-image
# is skipped (such as a non-master job).
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true'
name: Linters, licenses, and dependencies
runs-on: ubuntu-latest
timeout-minutes: 180
env:
LC_ALL: C.UTF-8
LANG: C.UTF-8
NOLINT_ON_COMPILE: false
PYSPARK_DRIVER_PYTHON: python3.9
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
container:
image: ${{ needs.precondition.outputs.image_url }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Add GITHUB_WORKSPACE to git trust safe.directory
run: |
git config --global --add safe.directory ${GITHUB_WORKSPACE}
# In forks, squash-merge the pushed branch on top of the upstream checkout.
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v4
with:
path: ~/.cache/coursier
key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
docs-coursier-
- name: Cache Maven local repository
uses: actions/cache@v4
with:
path: ~/.m2/repository
key: docs-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
docs-maven-
# The helper script is optional: the step does nothing when it is absent.
- name: Free up disk space
run: |
if [ -f ./dev/free_disk_space_container ]; then
./dev/free_disk_space_container
fi
- name: Install Java ${{ inputs.java }}
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ inputs.java }}
- name: License test
run: ./dev/check-license
- name: Dependencies test
run: ./dev/test-dependencies.sh
- name: MIMA test
run: ./dev/mima
- name: Scala linter
run: ./dev/lint-scala
- name: Java linter
run: ./dev/lint-java
- name: Spark connect jvm client mima check
run: ./dev/connect-jvm-client-mima-check
# Exactly one of the next three install steps runs, selected by inputs.branch.
- name: Install Python linter dependencies for branch-3.4
if: inputs.branch == 'branch-3.4'
run: |
# SPARK-44554: Copy from https://github.com/apache/spark/blob/a05c27e85829fe742c1828507a1fd180cdc84b54/.github/workflows/build_and_test.yml#L571-L578
# Should delete this section after SPARK 3.4 EOL.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.48.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
- name: Install Python linter dependencies for branch-3.5
if: inputs.branch == 'branch-3.5'
run: |
# SPARK-45212: Copy from https://github.com/apache/spark/blob/555c8def51e5951c7bf5165a332795e9e330ec9d/.github/workflows/build_and_test.yml#L631-L638
# Should delete this section after SPARK 3.5 EOL.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
- name: Install Python dependencies for python linter and documentation generation
if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5'
run: |
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
# See 'ipython_genutils' in SPARK-38517
# See 'docutils<0.18.0' in SPARK-39421
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
python3.9 -m pip list
- name: Python linter
run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
# Should delete this section after SPARK 3.5 EOL.
- name: Install dependencies for Python code generation check for branch-3.5
if: inputs.branch == 'branch-3.5'
run: |
# See more in "Installation" https://docs.buf.build/installation#tarball
curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz
mkdir -p $HOME/buf
tar -xvzf buf-Linux-x86_64.tar.gz -C $HOME/buf --strip-components 1
rm buf-Linux-x86_64.tar.gz
python3.9 -m pip install 'protobuf==4.25.1' 'mypy-protobuf==3.3.0'
# Should delete this section after SPARK 3.5 EOL.
# The check is skipped when ./dev/connect-check-protos.py does not exist.
- name: Python code generation check for branch-3.5
if: inputs.branch == 'branch-3.5'
run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi
# Should delete this section after SPARK 3.5 EOL.
- name: Install JavaScript linter dependencies for branch-3.4, branch-3.5
if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
run: |
apt update
apt-get install -y nodejs npm
- name: JS linter
run: ./dev/lint-js
# Should delete this section after SPARK 3.5 EOL.
- name: Install R linter dependencies for branch-3.4, branch-3.5
if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
run: |
apt update
apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \
libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \
libtiff5-dev libjpeg-dev
Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')"
- name: Install R linter dependencies and SparkR
run: ./R/install-dev.sh
- name: R linter
run: ./dev/lint-r
# Documentation build
docs:
needs: [precondition, infra-image]
# Always run if docs == 'true', even when infra-image is skipped (such as a non-master job)
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docs == 'true'
name: Documentation generation
runs-on: ubuntu-latest
timeout-minutes: 180
env:
LC_ALL: C.UTF-8
LANG: C.UTF-8
NOLINT_ON_COMPILE: false
PYSPARK_DRIVER_PYTHON: python3.9
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
container:
image: ${{ needs.precondition.outputs.image_url }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Add GITHUB_WORKSPACE to git trust safe.directory
run: |
git config --global --add safe.directory ${GITHUB_WORKSPACE}
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v4
with:
path: ~/.cache/coursier
key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
docs-coursier-
- name: Cache Maven local repository
uses: actions/cache@v4
with:
path: ~/.m2/repository
key: docs-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
docs-maven-
- name: Free up disk space
run: |
if [ -f ./dev/free_disk_space_container ]; then
./dev/free_disk_space_container
fi
- name: Install Java ${{ inputs.java }}
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ inputs.java }}
- name: Install Python dependencies for python linter and documentation generation
if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5'
run: |
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
# See 'ipython_genutils' in SPARK-38517
# See 'docutils<0.18.0' in SPARK-39421
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
python3.9 -m pip list
# Runs only for branch-3.4 and branch-3.5. Installs system packages (pandoc,
# libcurl headers, Ruby), the R packages used for the R docs, and the pinned
# Python documentation toolchain.
- name: Install dependencies for documentation generation for branch-3.4, branch-3.5
  if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
  run: |
    # pandoc is required to generate PySpark APIs as well in nbsphinx.
    apt-get update -y
    apt-get install -y libcurl4-openssl-dev pandoc
    apt-get install -y ruby ruby-dev
    Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')"
    Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
    Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
    # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
    python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
    python3.9 -m pip install ipython_genutils # See SPARK-38517
    python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
    python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
# Install the pinned Bundler release, then the gems declared for the docs
# directory.
- name: Install dependencies for documentation generation
  run: |
    # Keep the version of Bundler here in sync with the following locations:
    #   - dev/create-release/spark-rm/Dockerfile
    #   - docs/README.md
    gem install bundler -v 2.4.22
    cd docs
    bundle install
# Two-pass Jekyll build: first with SKIP_API=1, then the full build. When
# dev/is-changed.py is present, the Python and R API docs are skipped for the
# second pass if the corresponding modules are reported as unchanged.
- name: Run documentation build
  run: |
    # We need this link because the jekyll build calls `python`.
    ln -s "$(which python3.9)" "/usr/local/bin/python"
    # Build docs first with SKIP_API to ensure they are buildable without requiring any
    # language docs to be built beforehand.
    cd docs; SKIP_API=1 bundle exec jekyll build; cd ..
    if [ -f "./dev/is-changed.py" ]; then
      # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
      pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
      if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi
      if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi
    fi
    cd docs
    bundle exec jekyll build
# Skipped when the workflow runs in apache/spark itself. Packs the generated
# site into a bzip2-compressed tarball.
- name: Tar documentation
  if: github.repository != 'apache/spark'
  run: |
    tar cjf site.tar.bz2 docs/_site
# Skipped when the workflow runs in apache/spark itself. Publishes the site
# tarball as a workflow artifact that is kept for one day.
- name: Upload documentation
  if: github.repository != 'apache/spark'
  uses: actions/upload-artifact@v4
  with:
    name: site
    retention-days: 1
    path: site.tar.bz2
# Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well
#
# Runs TPCDSQueryTestSuite against scale-factor-1 data under three join
# configurations, followed by the collation query suite. The generated data is
# cached; tpcds-kit is only checked out and built, and the data only
# regenerated, on a cache miss.
tpcds-1g:
  name: Run TPC-DS queries with SF=1
  needs: precondition
  if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true'
  # Pin to 'Ubuntu 20.04' due to 'databricks/tpcds-kit' compilation
  runs-on: ubuntu-20.04
  timeout-minutes: 180
  env:
    SPARK_LOCAL_IP: localhost
  steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v4
      with:
        repository: apache/spark
        ref: ${{ inputs.branch }}
        fetch-depth: 0
    # Outside apache/spark: fetch the ref that triggered the run from the
    # current repository and squash-merge it onto the checked-out branch.
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
    - name: Cache SBT and Maven
      uses: actions/cache@v4
      with:
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
        path: |
          build/apache-maven-*
          build/*.jar
          ~/.sbt
    - name: Cache Coursier local repository
      uses: actions/cache@v4
      with:
        key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          tpcds-coursier-
        path: ~/.cache/coursier
    - name: Install Java ${{ inputs.java }}
      uses: actions/setup-java@v4
      with:
        java-version: ${{ inputs.java }}
        distribution: zulu
    # The key hashes this workflow file and TPCDSSchema.scala, so a change to
    # either one invalidates the cached data set.
    - name: Cache TPC-DS generated data
      id: cache-tpcds-sf-1
      uses: actions/cache@v4
      with:
        key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
        path: ./tpcds-sf-1
    # The next three steps run only when the data cache above missed.
    - name: Checkout tpcds-kit repository
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      uses: actions/checkout@v4
      with:
        repository: databricks/tpcds-kit
        ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
        path: ./tpcds-kit
    - name: Build tpcds-kit
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      run: cd tpcds-kit/tools && make OS=LINUX
    - name: Generate TPC-DS (SF=1) table data
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"
    # The three query runs below use the same command and differ only in
    # SPARK_TPCDS_JOIN_CONF.
    - name: Run TPC-DS queries (Sort merge join)
      env:
        SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }}
        SPARK_TPCDS_JOIN_CONF: |
          spark.sql.autoBroadcastJoinThreshold=-1
          spark.sql.join.preferSortMergeJoin=true
      run: |
        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
    - name: Run TPC-DS queries (Broadcast hash join)
      env:
        SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }}
        SPARK_TPCDS_JOIN_CONF: |
          spark.sql.autoBroadcastJoinThreshold=10485760
      run: |
        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
    - name: Run TPC-DS queries (Shuffled hash join)
      env:
        SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }}
        SPARK_TPCDS_JOIN_CONF: |
          spark.sql.autoBroadcastJoinThreshold=-1
          spark.sql.join.forceApplyShuffledHashJoin=true
      run: |
        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
    - name: Run TPC-DS queries on collated data
      run: |
        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSCollationQueryTestSuite"
    # Test reports are uploaded regardless of the outcome of earlier steps.
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: test-results-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
        path: '**/target/test-reports/*.xml'
    # Log files are uploaded only when the job did not succeed.
    - name: Upload unit tests log files
      if: ${{ !success() }}
      uses: actions/upload-artifact@v4
      with:
        name: unit-tests-log-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
        path: '**/target/unit-tests.log'
# Runs the docker-integration-tests module through dev/run-tests, restricted
# to tests tagged org.apache.spark.tags.DockerTest.
docker-integration-tests:
  name: Run Docker integration tests
  needs: precondition
  if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 180
  env:
    HADOOP_PROFILE: ${{ inputs.hadoop }}
    HIVE_PROFILE: hive2.3
    GITHUB_PREV_SHA: ${{ github.event.before }}
    SPARK_LOCAL_IP: localhost
    SKIP_UNIDOC: true
    SKIP_MIMA: true
    SKIP_PACKAGING: true
  steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v4
      with:
        repository: apache/spark
        ref: ${{ inputs.branch }}
        fetch-depth: 0
    # Outside apache/spark: record the upstream HEAD in APACHE_SPARK_REF, then
    # fetch the ref that triggered the run from the current repository and
    # squash-merge it onto the checked-out branch.
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
    - name: Cache SBT and Maven
      uses: actions/cache@v4
      with:
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
        path: |
          build/apache-maven-*
          build/*.jar
          ~/.sbt
    - name: Cache Coursier local repository
      uses: actions/cache@v4
      with:
        key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          docker-integration-coursier-
        path: ~/.cache/coursier
    - name: Install Java ${{ inputs.java }}
      uses: actions/setup-java@v4
      with:
        java-version: ${{ inputs.java }}
        distribution: zulu
    # The step environment is the JSON object passed in through the `envs`
    # workflow input.
    - name: Run tests
      env: ${{ fromJSON(inputs.envs) }}
      run: |
        ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
    # Test reports are uploaded regardless of the outcome of earlier steps.
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: test-results-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
        path: '**/target/test-reports/*.xml'
    # Log files are uploaded only when the job did not succeed.
    - name: Upload unit tests log files
      if: ${{ !success() }}
      uses: actions/upload-artifact@v4
      with:
        name: unit-tests-log-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
        path: '**/target/unit-tests.log'
# Starts a local minikube cluster, installs Volcano, and runs the
# kubernetes-integration-tests sbt project against it.
k8s-integration-tests:
  needs: precondition
  if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true'
  name: Run Spark on Kubernetes Integration test
  runs-on: ubuntu-latest
  timeout-minutes: 180
  steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: ${{ inputs.branch }}
    # Outside apache/spark: record the upstream HEAD in APACHE_SPARK_REF, then
    # fetch the ref that triggered the run from the current repository and
    # squash-merge it onto the checked-out branch.
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
    - name: Cache SBT and Maven
      uses: actions/cache@v4
      with:
        path: |
          build/apache-maven-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v4
      with:
        path: ~/.cache/coursier
        key: k8s-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          k8s-integration-coursier-
    - name: Install Java ${{ inputs.java }}
      uses: actions/setup-java@v4
      with:
        distribution: zulu
        java-version: ${{ inputs.java }}
    - name: start minikube
      run: |
        # See more in "Installation" https://minikube.sigs.k8s.io/docs/start/
        curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
        sudo install minikube-linux-amd64 /usr/local/bin/minikube
        rm minikube-linux-amd64
        # Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic
        minikube start --cpus 2 --memory 6144
    - name: Print K8S pods and nodes info
      run: |
        kubectl get pods -A
        kubectl describe node
    - name: Run Spark on K8S integration test
      run: |
        # Prepare PV test
        PVC_TMP_DIR=$(mktemp -d)
        export PVC_TESTS_HOST_PATH=$PVC_TMP_DIR
        export PVC_TESTS_VM_PATH=$PVC_TMP_DIR
        minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 &
        kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true
        # Install exactly one Volcano release, chosen by branch: v1.7.0 for
        # branch-3.4/branch-3.5 and v1.8.2 otherwise. Applying v1.8.2
        # unconditionally beforehand would layer two releases on the older
        # branches, so the manifest is applied only inside this conditional.
        if [[ "${{ inputs.branch }}" == 'branch-3.5' || "${{ inputs.branch }}" == 'branch-3.4' ]]; then
          kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true
        else
          kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true
        fi
        # Point the docker CLI at the daemon reported by `minikube docker-env`.
        eval $(minikube docker-env)
        build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test"
    # Log files are uploaded only when the job did not succeed.
    - name: Upload Spark on K8S integration tests log files
      if: ${{ !success() }}
      uses: actions/upload-artifact@v4
      with:
        name: spark-on-kubernetes-it-log
        path: "**/target/integration-tests.log"
# Runs the Jest suite located in ui-test/ on Node.js 20.
ui:
  name: Run Spark UI tests
  needs: precondition
  if: fromJson(needs.precondition.outputs.required).ui == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 180
  steps:
    # NOTE(review): unlike the other jobs in this file, this checkout passes
    # no repository/ref, so it relies on the checkout action's defaults;
    # confirm that is intended.
    - uses: actions/checkout@v4
    - name: Use Node.js
      uses: actions/setup-node@v4
      with:
        node-version: 20
        cache: npm
        cache-dependency-path: ui-test/package-lock.json
    - run: |
        cd ui-test
        npm install --save-dev
        node --experimental-vm-modules node_modules/.bin/jest