# .github/workflows/python_hosted_runner_test.yml - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #

 name: Build and test PySpark on macOS

 # Reusable workflow: the only trigger declared is `workflow_call`, so it runs
 # when another workflow calls it. Every input is optional.
 on:
   workflow_call:
     inputs:
       java:
         description: Java version installed by actions/setup-java.
         required: false
         type: string
         # Quoted so the default is a YAML string, matching `type: string`.
         default: '17'
       python:
         description: Python version installed by actions/setup-python.
         required: false
         type: string
         # Quoted: an unquoted version is read as a float, so a value such as
         # 3.10 would silently turn into 3.1.
         default: '3.12'
       branch:
         description: Branch to run the build against
         required: false
         type: string
         default: master
       hadoop:
         description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
         required: false
         type: string
         default: hadoop3
       os:
         description: OS to run this build.
         required: false
         type: string
         default: macos-15
       arch:
         description: The target architecture (x86, x64, arm64) of the Python interpreter.
         required: false
         type: string
         default: arm64
       envs:
         description: Additional environment variables to set when running the tests. Should be in JSON format.
         required: false
         type: string
         # An empty JSON object, i.e. no extra variables.
         default: '{}'
 jobs:
   build:
     # One job instance is created per entry of the `modules` matrix below.
     name: 'Build modules: ${{ matrix.modules }}'
     runs-on: ${{ inputs.os }}
     # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
     # timeout-minutes: 150
     strategy:
       fail-fast: false
       max-parallel: 20
       matrix:
         java:
           - ${{ inputs.java }}
         python:
           - ${{ inputs.python }}
         # Each entry is a comma-separated module list passed to dev/run-tests.
         modules:
           - 'pyspark-sql, pyspark-resource, pyspark-testing'
           - 'pyspark-core, pyspark-errors, pyspark-streaming'
           - 'pyspark-mllib, pyspark-ml, pyspark-ml-connect'
           - 'pyspark-structured-streaming, pyspark-structured-streaming-connect'
           - 'pyspark-connect'
           - 'pyspark-pandas'
           - 'pyspark-pandas-slow'
           - 'pyspark-pandas-connect'
           - 'pyspark-pandas-slow-connect'
     # Job-wide environment; the values reach the steps as strings.
     env:
       MODULES_TO_TEST: ${{ matrix.modules }}
       PYTHON_TO_TEST: python${{ inputs.python }}
       HADOOP_PROFILE: ${{ inputs.hadoop }}
       HIVE_PROFILE: hive2.3
       # GitHub Actions' default miniconda to use in pip packaging test.
       CONDA_PREFIX: /usr/share/miniconda
       GITHUB_PREV_SHA: ${{ github.event.before }}
       SPARK_LOCAL_IP: localhost
       SKIP_UNIDOC: 'true'
       SKIP_MIMA: 'true'
       # Switched to false by the "Run tests" step for the pyspark-errors group.
       SKIP_PACKAGING: 'true'
       METASPACE_SIZE: 1g
       BRANCH: ${{ inputs.branch }}
       PYSPARK_TEST_TIMEOUT: '450'
     steps:
       - name: Checkout Spark repository
         uses: actions/checkout@v6
         with:
           # Full history, in order to fetch changed files.
           fetch-depth: 0
           repository: apache/spark
           ref: ${{ inputs.branch }}
       # Forks only: squash-merge the triggering ref onto the apache/spark
       # checkout, recording the upstream commit in APACHE_SPARK_REF first.
       - name: Sync the current branch with the latest in Apache Spark
         if: ${{ github.repository != 'apache/spark' }}
         run: |
           echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
           git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
       # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
       - name: Cache SBT and Maven
         # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
         if: runner.os != 'macOS'
         uses: actions/cache@v5
         with:
           path: |
             build/apache-maven-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       - name: Cache Coursier local repository
         # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
         if: runner.os != 'macOS'
         uses: actions/cache@v5
         with:
           path: ~/.cache/coursier
           key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
             pyspark-coursier-
       - name: Install Java ${{ matrix.java }}
         uses: actions/setup-java@v5
         with:
           distribution: zulu
           java-version: ${{ matrix.java }}
       - name: Install Python ${{ matrix.python }}
         uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python }}
           architecture: ${{ inputs.arch }}
       # Installs the test dependencies, then purges the pip cache.
       - name: Install Python packages (Python ${{ matrix.python }})
         run: |
           python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2'
           python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0'
           python${{matrix.python}} -m pip install numpy 'pyarrow>=23.0.0' 'six==1.16.0' 'pandas==2.3.3' scipy 'plotly<6.0.0' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \
           python${{matrix.python}} -m pip install 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'googleapis-common-protos==1.71.0' 'zstandard==0.25.0' 'graphviz==0.20.3' && \
           python${{matrix.python}} -m pip cache purge
       - name: List Python packages
         run: python${{matrix.python}} -m pip list
       # Run the tests. Packaging tests are enabled only for the matrix entry
       # whose module list contains pyspark-errors.
       - name: Run tests
         env: ${{ fromJSON(inputs.envs) }}
         run: |
           if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
             export SKIP_PACKAGING=false
             echo "Python Packaging Tests Enabled!"
           fi
           ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
       # Test reports are uploaded whether or not the earlier steps passed.
       - name: Upload test results to report
         if: ${{ always() }}
         env: ${{ fromJSON(inputs.envs) }}
         uses: actions/upload-artifact@v6
         with:
           name: test-results-${{ inputs.os }}-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
           path: |
             **/target/test-reports/*.xml
             **/target/surefire-reports/*.xml
       # Log files are uploaded only when the job did not succeed.
       - name: Upload unit tests log files
         if: ${{ !success() }}
         env: ${{ fromJSON(inputs.envs) }}
         uses: actions/upload-artifact@v6
         with:
           name: unit-tests-log-${{ inputs.os }}-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
           path: '**/target/unit-tests.log'
 # NOTE(review): everything below repeats the workflow that is defined earlier
 # in this file. It is kept as a separate YAML document so that the top-level
 # keys are not duplicated inside one mapping; consider removing one copy.
---
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #

 name: Build and test PySpark on macOS

 # Reusable workflow: the only trigger declared is `workflow_call`.
 on:
   workflow_call:
     inputs:
       java:
         required: false
         type: string
         # Quoted so the default is a YAML string, matching `type: string`.
         default: '17'
       python:
         required: false
         type: string
         # Quoted: an unquoted version is read as a float (3.10 would become 3.1).
         default: '3.12'
       branch:
         description: Branch to run the build against
         required: false
         type: string
         default: master
       hadoop:
         description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
         required: false
         type: string
         default: hadoop3
       os:
         description: OS to run this build.
         required: false
         type: string
         default: macos-15
       arch:
         description: The target architecture (x86, x64, arm64) of the Python interpreter.
         required: false
         type: string
         default: arm64
       envs:
         description: Additional environment variables to set when running the tests. Should be in JSON format.
         required: false
         type: string
         default: '{}'
 jobs:
   build:
     name: "Build modules: ${{ matrix.modules }}"
     runs-on: ${{ inputs.os }}
     # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
     # timeout-minutes: 150
     strategy:
       fail-fast: false
       max-parallel: 20
       matrix:
         java:
           - ${{ inputs.java }}
         python:
           - ${{inputs.python}}
         # Each entry is a comma-separated module list passed to dev/run-tests.
         modules:
           - >-
             pyspark-sql, pyspark-resource, pyspark-testing
           - >-
             pyspark-core, pyspark-errors, pyspark-streaming
           - >-
             pyspark-mllib, pyspark-ml, pyspark-ml-connect
           - >-
             pyspark-structured-streaming, pyspark-structured-streaming-connect
           - >-
             pyspark-connect
           - >-
             pyspark-pandas
           - >-
             pyspark-pandas-slow
           - >-
             pyspark-pandas-connect
           - >-
             pyspark-pandas-slow-connect
     env:
       MODULES_TO_TEST: ${{ matrix.modules }}
       PYTHON_TO_TEST: python${{inputs.python}}
       HADOOP_PROFILE: ${{ inputs.hadoop }}
       HIVE_PROFILE: hive2.3
       # GitHub Actions' default miniconda to use in pip packaging test.
       CONDA_PREFIX: /usr/share/miniconda
       GITHUB_PREV_SHA: ${{ github.event.before }}
       SPARK_LOCAL_IP: localhost
       SKIP_UNIDOC: true
       SKIP_MIMA: true
       SKIP_PACKAGING: true
       METASPACE_SIZE: 1g
       BRANCH: ${{ inputs.branch }}
       PYSPARK_TEST_TIMEOUT: 450
     steps:
       - name: Checkout Spark repository
         uses: actions/checkout@v6
         # In order to fetch changed files
         with:
           fetch-depth: 0
           repository: apache/spark
           ref: ${{ inputs.branch }}
       - name: Sync the current branch with the latest in Apache Spark
         if: github.repository != 'apache/spark'
         run: |
           echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
           git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
       # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
       - name: Cache SBT and Maven
         # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
         if: ${{ runner.os != 'macOS' }}
         uses: actions/cache@v5
         with:
           path: |
             build/apache-maven-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       - name: Cache Coursier local repository
         # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
         if: ${{ runner.os != 'macOS' }}
         uses: actions/cache@v5
         with:
           path: ~/.cache/coursier
           # Recursive globs, as in the "Cache SBT and Maven" key above.
           key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
             pyspark-coursier-
       - name: Install Java ${{ matrix.java }}
         uses: actions/setup-java@v5
         with:
           distribution: zulu
           java-version: ${{ matrix.java }}
       - name: Install Python ${{matrix.python}}
         uses: actions/setup-python@v6
         with:
           python-version: ${{matrix.python}}
           architecture: ${{ inputs.arch }}
       - name: Install Python packages (Python ${{matrix.python}})
         run: |
           python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2'
           python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0'
           python${{matrix.python}} -m pip install numpy 'pyarrow>=23.0.0' 'six==1.16.0' 'pandas==2.3.3' scipy 'plotly<6.0.0' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \
           python${{matrix.python}} -m pip install 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'googleapis-common-protos==1.71.0' 'zstandard==0.25.0' 'graphviz==0.20.3' && \
           python${{matrix.python}} -m pip cache purge
       - name: List Python packages
         run: python${{matrix.python}} -m pip list
       # Run the tests.
       - name: Run tests
         env: ${{ fromJSON(inputs.envs) }}
         run: |
           # Substring match: the module list holds several comma-separated names.
           if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
             export SKIP_PACKAGING=false
             echo "Python Packaging Tests Enabled!"
           fi
           ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
       - name: Upload test results to report
         env: ${{ fromJSON(inputs.envs) }}
         if: always()
         uses: actions/upload-artifact@v6
         with:
           name: test-results-${{ inputs.os }}-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
           path: |
             **/target/test-reports/*.xml
             **/target/surefire-reports/*.xml
       - name: Upload unit tests log files
         env: ${{ fromJSON(inputs.envs) }}
         if: ${{ !success() }}
         uses: actions/upload-artifact@v6
         with:
           name: unit-tests-log-${{ inputs.os }}-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
           path: "**/target/unit-tests.log"