#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
name: Build and test using Maven
on:
workflow_call:
inputs:
java:
required: false
type: string
default: 17
branch:
description: Branch to run the build against
required: false
type: string
default: master
hadoop:
        description: Hadoop version to build and test against. Must be a value that the HADOOP_PROFILE environment variable accepts.
required: false
type: string
default: hadoop3
os:
        description: OS to run this build on.
required: false
type: string
default: ubuntu-22.04
envs:
description: Additional environment variables to set when running the tests. Should be in JSON format.
required: false
type: string
default: '{}'
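# A minimal sketch of how another workflow might call this reusable workflow.
# The caller job name and the values below are illustrative, not part of this
# repository:
#
#   jobs:
#     maven-build:
#       uses: ./.github/workflows/maven_test.yml
#       with:
#         java: 21
#         os: ubuntu-22.04
#         envs: '{"SKIP_MIMA": "1"}'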
jobs:
  # Build: build Spark and run the tests for the specified modules using Maven.
build:
name: "Build modules using Maven: ${{ matrix.modules }} ${{ matrix.comment }}"
runs-on: ${{ inputs.os }}
strategy:
fail-fast: false
matrix:
java:
- ${{ inputs.java }}
hadoop:
- ${{ inputs.hadoop }}
hive:
- hive2.3
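        # In the module lists below, '#' stands in for '/' (these values are
        # also embedded in the uploaded artifact names, which cannot contain
        # '/'); it is translated back to '/' before Maven runs.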
modules:
- >-
core,launcher,common#unsafe,common#kvstore,common#network-common,common#network-shuffle,common#sketch,common#utils,common#variant
- >-
graphx,streaming,hadoop-cloud
- >-
mllib-local,mllib
- >-
repl,sql#hive-thriftserver
- >-
connector#kafka-0-10,connector#kafka-0-10-sql,connector#kafka-0-10-token-provider,connector#spark-ganglia-lgpl,connector#protobuf,connector#avro,connector#kinesis-asl
- >-
sql#api,sql#catalyst,resource-managers#yarn,resource-managers#kubernetes#core
        # Here, the Hive and SQL tests are split into the slow ones and the rest.
included-tags: [ "" ]
excluded-tags: [ "" ]
comment: [ "" ]
include:
# Connect tests
- modules: connect
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
            # TODO(SPARK-47110): Re-enable AmmoniteTest tests in Maven builds
excluded-tags: org.apache.spark.tags.AmmoniteTest
comment: ""
# Hive tests
- modules: sql#hive
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
included-tags: org.apache.spark.tags.SlowHiveTest
comment: "- slow tests"
- modules: sql#hive
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
excluded-tags: org.apache.spark.tags.SlowHiveTest
comment: "- other tests"
# SQL tests
- modules: sql#core
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
included-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- extended tests"
- modules: sql#core
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
included-tags: org.apache.spark.tags.SlowSQLTest
comment: "- slow tests"
- modules: sql#core
java: ${{ inputs.java }}
hadoop: ${{ inputs.hadoop }}
hive: hive2.3
excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest
comment: "- other tests"
env:
MODULES_TO_TEST: ${{ matrix.modules }}
EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
INCLUDED_TAGS: ${{ matrix.included-tags }}
HADOOP_PROFILE: ${{ matrix.hadoop }}
HIVE_PROFILE: ${{ matrix.hive }}
SPARK_LOCAL_IP: localhost
GITHUB_PREV_SHA: ${{ github.event.before }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
      # Fetch the full commit history so changed files can be detected
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ inputs.branch }}
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
    # Cache local repositories. Note that the GitHub Actions cache has a 10 GB limit per repository.
- name: Cache SBT and Maven
uses: actions/cache@v4
with:
path: |
build/apache-maven-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Maven local repository
uses: actions/cache@v4
with:
path: ~/.m2/repository
key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
java${{ matrix.java }}-maven-
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: ${{ matrix.java }}
- name: Install Python 3.11
uses: actions/setup-python@v5
      # A recent Python 3 should be installed for the SQL and Yarn modules because:
      # - the SQL component also has Python-related tests, for example, IntegratedUDFTestUtils.
      # - Yarn has a Python-specific test too, for example, YarnClusterSuite.
      # The macOS (14) runners already have Python installed, see also SPARK-47096 and
      # https://github.com/actions/runner-images/blob/main/images/macos/macos-14-Readme.md
if: contains(inputs.os, 'ubuntu') && (contains(matrix.modules, 'resource-managers#yarn') || (contains(matrix.modules, 'sql#core')) || contains(matrix.modules, 'connect'))
with:
python-version: '3.11'
architecture: x64
- name: Install Python packages (Python 3.11)
if: (contains(matrix.modules, 'sql#core')) || contains(matrix.modules, 'connect')
run: |
python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1'
python3.11 -m pip list
# Run the tests.
- name: Run tests
env: ${{ fromJSON(inputs.envs) }}
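      # The 'envs' input is a JSON object string; fromJSON expands it into
      # step-level environment variables. For example, envs: '{"SPARK_TEST_FLAG": "1"}'
      # (a hypothetical variable) would export SPARK_TEST_FLAG=1 for this step only.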
run: |
export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
export JAVA_VERSION=${{ matrix.java }}
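        # Kinesis integration tests would need real AWS resources, so keep them disabled here.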
export ENABLE_KINESIS_TESTS=0
        # Convert '#' back to '/' to get the real module path, for example, connector#kafka-0-10 -> connector/kafka-0-10
export TEST_MODULES=`echo "$MODULES_TO_TEST" | sed -e "s%#%/%g"`
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} clean install
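        # Pick the test invocation based on the tag filters and target modules.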
if [[ "$INCLUDED_TAGS" != "" ]]; then
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.include.tags="$INCLUDED_TAGS" test -fae
elif [[ "$MODULES_TO_TEST" == "connect" ]]; then
./build/mvn $MAVEN_CLI_OPTS -Dtest.exclude.tags="$EXCLUDED_TAGS" -Djava.version=${JAVA_VERSION/-ea} -pl connector/connect/client/jvm,connector/connect/common,connector/connect/server test -fae
elif [[ "$EXCLUDED_TAGS" != "" ]]; then
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.exclude.tags="$EXCLUDED_TAGS" test -fae
elif [[ "$MODULES_TO_TEST" == *"sql#hive-thriftserver"* ]]; then
          # For the `sql/hive-thriftserver` module, run `clean install` instead to avoid a compilation loop
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} clean install -fae
else
./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Pspark-ganglia-lgpl -Phadoop-cloud -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} test -fae
fi
- name: Clean up local Maven repository
run: |
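        # Remove the Spark artifacts installed by this run so they are not
        # persisted into the Maven repository cache saved above.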
rm -rf ~/.m2/repository/org/apache/spark
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v4
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"