blob: a2466ac163ab7e41e26113f8532ec77ea3d731b1 [file]
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
name: Build and test PySpark on macOS
on:
workflow_call:
inputs:
java:
required: false
type: string
default: 17
python:
required: false
type: string
default: 3.12
branch:
description: Branch to run the build against
required: false
type: string
default: master
hadoop:
description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
required: false
type: string
default: hadoop3
os:
description: OS to run this build.
required: false
type: string
default: macos-15
arch:
description: The target architecture (x86, x64, arm64) of the Python interpreter.
required: false
type: string
default: arm64
envs:
description: Additional environment variables to set when running the tests. Should be in JSON format.
required: false
type: string
default: '{}'
jobs:
build:
name: "Build modules: ${{ matrix.modules }}"
runs-on: ${{ inputs.os }}
# TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
# timeout-minutes: 150
strategy:
fail-fast: false
max-parallel: 20
matrix:
java:
- ${{ inputs.java }}
python:
- ${{inputs.python}}
modules:
- >-
pyspark-sql, pyspark-resource, pyspark-testing
- >-
pyspark-core, pyspark-errors, pyspark-streaming
- >-
pyspark-mllib, pyspark-ml, pyspark-ml-connect
- >-
pyspark-structured-streaming, pyspark-structured-streaming-connect
- >-
pyspark-connect
- >-
pyspark-pandas
- >-
pyspark-pandas-slow
- >-
pyspark-pandas-connect
- >-
pyspark-pandas-slow-connect
env:
MODULES_TO_TEST: ${{ matrix.modules }}
PYTHON_TO_TEST: python${{inputs.python}}
HADOOP_PROFILE: ${{ inputs.hadoop }}
HIVE_PROFILE: hive2.3
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
SKIP_UNIDOC: true
SKIP_MIMA: true
SKIP_PACKAGING: true
METASPACE_SIZE: 1g
BRANCH: ${{ inputs.branch }}
PYSPARK_TEST_TIMEOUT: 450
steps:
- name: Checkout Spark repository
uses: actions/checkout@v6
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
- name: Cache SBT and Maven
# TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
if: ${{ runner.os != 'macOS' }}
uses: actions/cache@v5
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
# TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
if: ${{ runner.os != 'macOS' }}
uses: actions/cache@v5
with:
path: ~/.cache/coursier
key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
pyspark-coursier-
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v5
with:
distribution: zulu
java-version: ${{ matrix.java }}
- name: Install Python ${{matrix.python}}
uses: actions/setup-python@v6
with:
python-version: ${{matrix.python}}
architecture: ${{ inputs.arch }}
- name: Install Python packages (Python ${{matrix.python}})
run: |
python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2'
python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0'
python${{matrix.python}} -m pip install numpy 'pyarrow>=23.0.0' 'six==1.16.0' 'pandas==2.3.3' scipy 'plotly<6.0.0' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \
python${{matrix.python}} -m pip install 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'googleapis-common-protos==1.71.0' 'zstandard==0.25.0' 'graphviz==0.20.3' && \
python${{matrix.python}} -m pip cache purge
- name: List Python packages
run: python${{matrix.python}} -m pip list
# Run the tests.
- name: Run tests
env: ${{ fromJSON(inputs.envs) }}
run: |
if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
export SKIP_PACKAGING=false
echo "Python Packaging Tests Enabled!"
fi
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
- name: Upload test results to report
env: ${{ fromJSON(inputs.envs) }}
if: always()
uses: actions/upload-artifact@v6
with:
name: test-results-${{ inputs.os }}-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
path: |
**/target/test-reports/*.xml
**/target/surefire-reports/*.xml
- name: Upload unit tests log files
env: ${{ fromJSON(inputs.envs) }}
if: ${{ !success() }}
uses: actions/upload-artifact@v6
with:
name: unit-tests-log-${{ inputs.os }}-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
path: "**/target/unit-tests.log"