# .github/workflows/build_and_test.yml - spark - Git at Google

 # Workflow display name.
 name: 'Build and test'

 # Trigger: pushes only.
 on:
   push:
     branches:
       # Every branch ...
       - '**'
       # ... except those whose name matches `branch-*.*`.
       - '!branch-*.*'

 jobs:
   # Build: build Spark and run the tests for specified modules.
   # Every matrix entry becomes one job; its values are exposed to the steps
   # through the `env` block and passed to ./dev/run-tests.
   build:
     name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
     # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
     runs-on: ubuntu-20.04
     strategy:
       fail-fast: false
       matrix:
         java:
           - 8
         hadoop:
           - hadoop3.2
         hive:
           - hive2.3
         # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
         # Kinesis tests depend on the external Amazon Kinesis service.
         # Note that the modules below are from sparktestsupport/modules.py.
         modules:
           - >-
             core, unsafe, kvstore, avro,
             network-common, network-shuffle, repl, launcher,
             examples, sketch, graphx
           - >-
             catalyst, hive-thriftserver
           - >-
             streaming, sql-kafka-0-10, streaming-kafka-0-10,
             mllib-local, mllib,
             yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
         # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
         # The empty defaults apply to the module groups above; the `include`
         # entries below set the tags explicitly.
         included-tags: [""]
         excluded-tags: [""]
         comment: [""]
         include:
           # Hive tests
           - modules: hive
             java: 8
             hadoop: hadoop3.2
             hive: hive2.3
             included-tags: org.apache.spark.tags.SlowHiveTest
             comment: "- slow tests"
           - modules: hive
             java: 8
             hadoop: hadoop3.2
             hive: hive2.3
             excluded-tags: org.apache.spark.tags.SlowHiveTest
             comment: "- other tests"
           # SQL tests
           - modules: sql
             java: 8
             hadoop: hadoop3.2
             hive: hive2.3
             included-tags: org.apache.spark.tags.ExtendedSQLTest
             comment: "- slow tests"
           - modules: sql
             java: 8
             hadoop: hadoop3.2
             hive: hive2.3
             excluded-tags: org.apache.spark.tags.ExtendedSQLTest
             comment: "- other tests"
     env:
       MODULES_TO_TEST: ${{ matrix.modules }}
       EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
       INCLUDED_TAGS: ${{ matrix.included-tags }}
       HADOOP_PROFILE: ${{ matrix.hadoop }}
       HIVE_PROFILE: ${{ matrix.hive }}
       # GitHub Actions' default miniconda to use in pip packaging test.
       CONDA_PREFIX: /usr/share/miniconda
       GITHUB_PREV_SHA: ${{ github.event.before }}
       SPARK_LOCAL_IP: localhost
     steps:
       - name: Checkout Spark repository
         uses: actions/checkout@v2
         # In order to fetch changed files
         with:
           fetch-depth: 0
           repository: apache/spark
           ref: master
       # On forks: squash-merge the pushed branch on top of apache/spark master
       # and expose the master commit as the APACHE_SPARK_REF step output.
       - name: Sync the current branch with the latest in Apache Spark
         if: github.repository != 'apache/spark'
         id: sync-branch
         run: |
           apache_spark_ref=`git rev-parse HEAD`
           # Strip only the leading "refs/heads/" so that branch names containing
           # slashes (e.g. "feature/foo") are fetched in full.
           git fetch https://github.com/$GITHUB_REPOSITORY.git "${GITHUB_REF#refs/heads/}"
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
           # --allow-empty: the squash merge stages nothing when the branch adds no
           # changes on top of master; the commit must not fail in that case.
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
           echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
       # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
       - name: Cache Scala, SBT and Maven
         uses: actions/cache@v2
         with:
           path: |
             build/apache-maven-*
             build/scala-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       - name: Cache Coursier local repository
         uses: actions/cache@v2
         with:
           path: ~/.cache/coursier
           key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
             ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
       - name: Install Java ${{ matrix.java }}
         uses: actions/setup-java@v1
         with:
           java-version: ${{ matrix.java }}
       - name: Install Python 3.8
         uses: actions/setup-python@v2
         # We should install one Python that is higher than 3+ for SQL and Yarn because:
         # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
         # - Yarn has a Python specific test too, for example, YarnClusterSuite.
         # The `!contains(..., 'sql-')` part keeps the group containing
         # 'sql-kafka-0-10' from matching on 'sql' alone.
         if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
         with:
           # Quoted so the version is always passed as a string, never as a float.
           python-version: '3.8'
           architecture: x64
       - name: Install Python packages (Python 3.8)
         if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
         run: |
           python3.8 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
           python3.8 -m pip list
       # Run the tests.
       - name: Run tests
         run: |
           export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
           # Hive and SQL tests become flaky when running in parallel as it's too intensive.
           if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi
           ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
       # Test reports are uploaded whether or not the tests passed.
       - name: Upload test results to report
         if: always()
         uses: actions/upload-artifact@v2
         with:
           name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
           path: "**/target/test-reports/*.xml"
       # Unit test logs are uploaded only when an earlier step failed.
       - name: Upload unit tests log files
         if: failure()
         uses: actions/upload-artifact@v2
         with:
           name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
           path: "**/target/unit-tests.log"

   # PySpark: run the Python test modules, one job per module group, inside
   # the container image named below.
   pyspark:
     name: "Build modules: ${{ matrix.modules }}"
     runs-on: ubuntu-20.04
     container:
       image: dongjoon/apache-spark-github-action-image:20201025
     strategy:
       fail-fast: false
       matrix:
         modules:
           - >-
             pyspark-sql, pyspark-mllib, pyspark-resource
           - >-
             pyspark-core, pyspark-streaming, pyspark-ml
           - >-
             pyspark-pandas
     env:
       MODULES_TO_TEST: ${{ matrix.modules }}
       HADOOP_PROFILE: hadoop3.2
       HIVE_PROFILE: hive2.3
       # GitHub Actions' default miniconda to use in pip packaging test.
       CONDA_PREFIX: /usr/share/miniconda
       GITHUB_PREV_SHA: ${{ github.event.before }}
       SPARK_LOCAL_IP: localhost
     steps:
       - name: Checkout Spark repository
         uses: actions/checkout@v2
         # In order to fetch changed files
         with:
           fetch-depth: 0
           repository: apache/spark
           ref: master
       # On forks: squash-merge the pushed branch on top of apache/spark master
       # and expose the master commit as the APACHE_SPARK_REF step output.
       - name: Sync the current branch with the latest in Apache Spark
         if: github.repository != 'apache/spark'
         id: sync-branch
         run: |
           apache_spark_ref=`git rev-parse HEAD`
           # Strip only the leading "refs/heads/" so that branch names containing
           # slashes (e.g. "feature/foo") are fetched in full.
           git fetch https://github.com/$GITHUB_REPOSITORY.git "${GITHUB_REF#refs/heads/}"
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
           # --allow-empty: the squash merge stages nothing when the branch adds no
           # changes on top of master; the commit must not fail in that case.
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
           echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
       # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
       - name: Cache Scala, SBT and Maven
         uses: actions/cache@v2
         with:
           path: |
             build/apache-maven-*
             build/scala-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       - name: Cache Coursier local repository
         uses: actions/cache@v2
         with:
           path: ~/.cache/coursier
           key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
             pyspark-coursier-
       # This step takes much less time (~30s) than other Python versions so it is not included
       # in the Docker image being used. There is also a technical issue to install Python 3.6 on
       # Ubuntu 20.04. See also SPARK-33162.
       - name: Install Python 3.6
         uses: actions/setup-python@v2
         with:
           # Quoted so the version is always passed as a string, never as a float.
           python-version: '3.6'
           architecture: x64
       - name: Install Python packages (Python 3.6)
         run: |
           python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
           python3.6 -m pip list
       # Run the tests.
       - name: Run tests
         run: |
           export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
           ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
       # Test reports are uploaded whether or not the tests passed.
       - name: Upload test results to report
         if: always()
         uses: actions/upload-artifact@v2
         with:
           name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3
           path: "**/target/test-reports/*.xml"
       # Unit test logs are uploaded only when an earlier step failed.
       - name: Upload unit tests log files
         if: failure()
         uses: actions/upload-artifact@v2
         with:
           name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3
           path: "**/target/unit-tests.log"

   # SparkR: run the `sparkr` test module inside the container image named below.
   sparkr:
     name: "Build modules: sparkr"
     runs-on: ubuntu-20.04
     container:
       image: dongjoon/apache-spark-github-action-image:20201025
     env:
       HADOOP_PROFILE: hadoop3.2
       HIVE_PROFILE: hive2.3
       GITHUB_PREV_SHA: ${{ github.event.before }}
       SPARK_LOCAL_IP: localhost
     steps:
       - name: Checkout Spark repository
         uses: actions/checkout@v2
         # In order to fetch changed files
         with:
           fetch-depth: 0
           repository: apache/spark
           ref: master
       # On forks: squash-merge the pushed branch on top of apache/spark master
       # and expose the master commit as the APACHE_SPARK_REF step output.
       - name: Sync the current branch with the latest in Apache Spark
         if: github.repository != 'apache/spark'
         id: sync-branch
         run: |
           apache_spark_ref=`git rev-parse HEAD`
           # Strip only the leading "refs/heads/" so that branch names containing
           # slashes (e.g. "feature/foo") are fetched in full.
           git fetch https://github.com/$GITHUB_REPOSITORY.git "${GITHUB_REF#refs/heads/}"
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
           # --allow-empty: the squash merge stages nothing when the branch adds no
           # changes on top of master; the commit must not fail in that case.
           git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
           echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
       # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
       - name: Cache Scala, SBT and Maven
         uses: actions/cache@v2
         with:
           path: |
             build/apache-maven-*
             build/scala-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       - name: Cache Coursier local repository
         uses: actions/cache@v2
         with:
           path: ~/.cache/coursier
           key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
             sparkr-coursier-
       - name: Run tests
         run: |
           # The followings are also used by `r-lib/actions/setup-r` to avoid
           # R issues at docker environment
           export TZ=UTC
           export _R_CHECK_SYSTEM_CLOCK_=FALSE
           export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
           ./dev/run-tests --parallelism 2 --modules sparkr
       # Test reports are uploaded whether or not the tests passed.
       - name: Upload test results to report
         if: always()
         uses: actions/upload-artifact@v2
         with:
           name: test-results-sparkr--8-hadoop3.2-hive2.3
           path: "**/target/test-reports/*.xml"
       # Unit test logs are uploaded only when an earlier step failed, using the
       # same artifact naming scheme as the test report above.
       - name: Upload unit tests log files
         if: failure()
         uses: actions/upload-artifact@v2
         with:
           name: unit-tests-log-sparkr--8-hadoop3.2-hive2.3
           path: "**/target/unit-tests.log"

   # Static analysis, and documentation build
   # Runs the Scala/Java/Python/R linters, the license and dependency checks,
   # and finally the Jekyll documentation build, all in one job.
   lint:
     name: Linters, licenses, dependencies and documentation generation
     runs-on: ubuntu-20.04
     env:
       LC_ALL: C.UTF-8
       LANG: C.UTF-8
     container:
       image: dongjoon/apache-spark-github-action-image:20201025
     steps:
       - name: Checkout Spark repository
         uses: actions/checkout@v2
       # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
       - name: Cache Scala, SBT and Maven
         uses: actions/cache@v2
         with:
           path: |
             build/apache-maven-*
             build/scala-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       - name: Cache Coursier local repository
         uses: actions/cache@v2
         with:
           path: ~/.cache/coursier
           key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
             docs-coursier-
       - name: Cache Maven local repository
         uses: actions/cache@v2
         with:
           path: ~/.m2/repository
           key: docs-maven-${{ hashFiles('**/pom.xml') }}
           restore-keys: |
             docs-maven-
       - name: Install Python 3.6
         uses: actions/setup-python@v2
         with:
           # Quoted so the version is always passed as a string, never as a float.
           python-version: '3.6'
           architecture: x64
       - name: Install Python linter dependencies
         run: |
           # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
           #   See also https://github.com/sphinx-doc/sphinx/issues/7551.
           python3.6 -m pip install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc
       - name: Install R linter dependencies and SparkR
         run: |
           # Refresh the package index before the first install so that apt does
           # not resolve packages against a stale index baked into the image.
           apt-get update -y
           apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
           Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
           Rscript -e "devtools::install_github('jimhester/lintr@v2.0.1')"
           ./R/install-dev.sh
       - name: Install dependencies for documentation generation
         run: |
           # Refresh the package index before any install in this step.
           apt-get update -y
           # pandoc is required to generate PySpark APIs as well in nbsphinx.
           apt-get install -y libcurl4-openssl-dev pandoc
           # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
           #   See also https://github.com/sphinx-doc/sphinx/issues/7551.
           python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc
           apt-get install -y ruby ruby-dev
           Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
           gem install bundler
           cd docs
           bundle install
       - name: Scala linter
         run: ./dev/lint-scala
       - name: Java linter
         run: ./dev/lint-java
       - name: Python linter
         run: ./dev/lint-python
       - name: R linter
         run: ./dev/lint-r
       - name: License test
         run: ./dev/check-license
       - name: Dependencies test
         run: ./dev/test-dependencies.sh
       - name: Run documentation build
         run: |
           cd docs
           bundle exec jekyll build

   # Java 11: compile-only Maven build (tests are skipped via -DskipTests).
   java-11:
     name: 'Java 11 build with Maven'
     runs-on: ubuntu-20.04
     steps:
       - name: 'Checkout Spark repository'
         uses: actions/checkout@v2
       # Build tooling downloaded under build/ plus the SBT home directory.
       - name: 'Cache Scala, SBT and Maven'
         uses: actions/cache@v2
         with:
           path: |
             build/apache-maven-*
             build/scala-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       # Maven local repository, keyed on the contents of every pom.xml.
       - name: 'Cache Maven local repository'
         uses: actions/cache@v2
         with:
           path: ~/.m2/repository
           key: java11-maven-${{ hashFiles('**/pom.xml') }}
           restore-keys: |
             java11-maven-
       - name: 'Install Java 11'
         uses: actions/setup-java@v1
         with:
           java-version: 11
       - name: 'Build with Maven'
         run: |
           export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
           export MAVEN_CLI_OPTS="--no-transfer-progress"
           # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
           ./build/mvn $MAVEN_CLI_OPTS \
             -DskipTests \
             -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud \
             -Djava.version=11 \
             install
           # Remove the org/apache/spark artifacts from the local repository
           # (presumably to keep them out of the cached ~/.m2/repository - confirm).
           rm -rf ~/.m2/repository/org/apache/spark

   # Scala 2.13: switch the build to Scala 2.13, then compile main and test
   # sources with SBT (no tests are run).
   scala-213:
     name: 'Scala 2.13 build with SBT'
     runs-on: ubuntu-20.04
     steps:
       - name: 'Checkout Spark repository'
         uses: actions/checkout@v2
       # Build tooling downloaded under build/ plus the SBT home directory.
       - name: 'Cache Scala, SBT and Maven'
         uses: actions/cache@v2
         with:
           path: |
             build/apache-maven-*
             build/scala-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       # Coursier cache, kept under a key prefix specific to this job.
       - name: 'Cache Coursier local repository'
         uses: actions/cache@v2
         with:
           path: ~/.cache/coursier
           key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
             scala-213-coursier-
       - name: 'Install Java 8'
         uses: actions/setup-java@v1
         with:
           java-version: 8
       - name: 'Build with SBT'
         run: |
           ./dev/change-scala-version.sh 2.13
           ./build/sbt \
             -Pyarn -Pmesos -Pkubernetes \
             -Phive -Phive-thriftserver \
             -Phadoop-cloud -Pkinesis-asl \
             -Pdocker-integration-tests -Pkubernetes-integration-tests \
             -Pspark-ganglia-lgpl -Pscala-2.13 \
             compile test:compile

   # Hadoop 2: compile main and test sources with the hadoop-2.7 profile
   # (no tests are run).
   hadoop-2:
     name: 'Hadoop 2 build with SBT'
     runs-on: ubuntu-20.04
     steps:
       - name: 'Checkout Spark repository'
         uses: actions/checkout@v2
       # Build tooling downloaded under build/ plus the SBT home directory.
       - name: 'Cache Scala, SBT and Maven'
         uses: actions/cache@v2
         with:
           path: |
             build/apache-maven-*
             build/scala-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       # Coursier cache, kept under a key prefix specific to this job.
       - name: 'Cache Coursier local repository'
         uses: actions/cache@v2
         with:
           path: ~/.cache/coursier
           key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
             hadoop-2-coursier-
       - name: 'Install Java 8'
         uses: actions/setup-java@v1
         with:
           java-version: 8
       - name: 'Build with SBT'
         run: |
           ./build/sbt \
             -Pyarn -Pmesos -Pkubernetes \
             -Phive -Phive-thriftserver \
             -Phadoop-cloud -Pkinesis-asl \
             -Phadoop-2.7 \
             compile test:compile

   # TPC-DS: run TPCDSQueryTestSuite against the SF=1 data set checked out
   # (or restored from cache) into ./tpcds-sf-1.
   tpcds-1g:
     name: 'Run TPC-DS queries with SF=1'
     runs-on: ubuntu-20.04
     env:
       SPARK_LOCAL_IP: localhost
     steps:
       - name: 'Checkout Spark repository'
         uses: actions/checkout@v2
       # The cache key embeds the same commit SHA that the checkout step below pins.
       - name: 'Cache TPC-DS generated data'
         id: cache-tpcds-sf-1
         uses: actions/cache@v2
         with:
           path: ./tpcds-sf-1
           key: tpcds-556111e35d400f56cb0625dc16e9063d54628320
       # Only fetch the data repository when the cache step did not restore it.
       - name: 'Checkout TPC-DS (SF=1) generated data repository'
         if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
         uses: actions/checkout@v2
         with:
           repository: maropu/spark-tpcds-sf-1
           ref: '556111e35d400f56cb0625dc16e9063d54628320'
           path: ./tpcds-sf-1
       # Build tooling downloaded under build/ plus the SBT home directory.
       - name: 'Cache Scala, SBT and Maven'
         uses: actions/cache@v2
         with:
           path: |
             build/apache-maven-*
             build/scala-*
             build/*.jar
             ~/.sbt
           key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
           restore-keys: |
             build-
       # Coursier cache, kept under a key prefix specific to this job.
       - name: 'Cache Coursier local repository'
         uses: actions/cache@v2
         with:
           path: ~/.cache/coursier
           key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
             tpcds-coursier-
       - name: 'Install Java 8'
         uses: actions/setup-java@v1
         with:
           java-version: 8
       - name: 'Run TPC-DS queries'
         run: |
           SPARK_TPCDS_DATA="$(pwd)/tpcds-sf-1" build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
       # Test reports are uploaded whether or not the run succeeded.
       - name: 'Upload test results to report'
         if: always()
         uses: actions/upload-artifact@v2
         with:
           name: test-results-tpcds--8-hadoop3.2-hive2.3
           path: '**/target/test-reports/*.xml'
       # Unit test logs are uploaded only when an earlier step failed.
       - name: 'Upload unit tests log files'
         if: failure()
         uses: actions/upload-artifact@v2
         with:
           name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
           path: '**/target/unit-tests.log'
	name: Build and test

	on:
	push:
	branches:
	- '**'
	- '!branch-*.*'

	jobs:
	# Build: build Spark and run the tests for specified modules.
	build:
	name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
	# Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
	runs-on: ubuntu-20.04
	strategy:
	fail-fast: false
	matrix:
	java:
	- 8
	hadoop:
	- hadoop3.2
	hive:
	- hive2.3
	# TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
	# Kinesis tests depends on external Amazon kinesis service.
	# Note that the modules below are from sparktestsupport/modules.py.
	modules:
	- >-
	core, unsafe, kvstore, avro,
	network-common, network-shuffle, repl, launcher,
	examples, sketch, graphx
	- >-
	catalyst, hive-thriftserver
	- >-
	streaming, sql-kafka-0-10, streaming-kafka-0-10,
	mllib-local, mllib,
	yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
	# Here, we split Hive and SQL tests into some of slow ones and the rest of them.
	included-tags: [""]
	excluded-tags: [""]
	comment: [""]
	include:
	# Hive tests
	- modules: hive
	java: 8
	hadoop: hadoop3.2
	hive: hive2.3
	included-tags: org.apache.spark.tags.SlowHiveTest
	comment: "- slow tests"
	- modules: hive
	java: 8
	hadoop: hadoop3.2
	hive: hive2.3
	excluded-tags: org.apache.spark.tags.SlowHiveTest
	comment: "- other tests"
	# SQL tests
	- modules: sql
	java: 8
	hadoop: hadoop3.2
	hive: hive2.3
	included-tags: org.apache.spark.tags.ExtendedSQLTest
	comment: "- slow tests"
	- modules: sql
	java: 8
	hadoop: hadoop3.2
	hive: hive2.3
	excluded-tags: org.apache.spark.tags.ExtendedSQLTest
	comment: "- other tests"
	env:
	MODULES_TO_TEST: ${{ matrix.modules }}
	EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
	INCLUDED_TAGS: ${{ matrix.included-tags }}
	HADOOP_PROFILE: ${{ matrix.hadoop }}
	HIVE_PROFILE: ${{ matrix.hive }}
	# GitHub Actions' default miniconda to use in pip packaging test.
	CONDA_PREFIX: /usr/share/miniconda
	GITHUB_PREV_SHA: ${{ github.event.before }}
	SPARK_LOCAL_IP: localhost
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v2
	# In order to fetch changed files
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: master
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	id: sync-branch
	run: |
	apache_spark_ref=`git rev-parse HEAD`
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/}
	git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
	echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
	# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v2
	with:
	path: |
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: |
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v2
	with:
	path: ~/.cache/coursier
	key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
	restore-keys: |
	${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
	- name: Install Java ${{ matrix.java }}
	uses: actions/setup-java@v1
	with:
	java-version: ${{ matrix.java }}
	- name: Install Python 3.8
	uses: actions/setup-python@v2
	# We should install one Python that is higher then 3+ for SQL and Yarn because:
	# - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
	# - Yarn has a Python specific test too, for example, YarnClusterSuite.
	if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
	with:
	python-version: 3.8
	architecture: x64
	- name: Install Python packages (Python 3.8)
	if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
	run: |
	python3.8 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
	python3.8 -m pip list
	# Run the tests.
	- name: Run tests
	run: |
	export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
	# Hive and SQL tests become flaky when running in parallel as it's too intensive.
	if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi
	./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
	- name: Upload test results to report
	if: always()
	uses: actions/upload-artifact@v2
	with:
	name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
	path: "**/target/test-reports/*.xml"
	- name: Upload unit tests log files
	if: failure()
	uses: actions/upload-artifact@v2
	with:
	name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
	path: "**/target/unit-tests.log"

	pyspark:
	name: "Build modules: ${{ matrix.modules }}"
	runs-on: ubuntu-20.04
	container:
	image: dongjoon/apache-spark-github-action-image:20201025
	strategy:
	fail-fast: false
	matrix:
	modules:
	- >-
	pyspark-sql, pyspark-mllib, pyspark-resource
	- >-
	pyspark-core, pyspark-streaming, pyspark-ml
	- >-
	pyspark-pandas
	env:
	MODULES_TO_TEST: ${{ matrix.modules }}
	HADOOP_PROFILE: hadoop3.2
	HIVE_PROFILE: hive2.3
	# GitHub Actions' default miniconda to use in pip packaging test.
	CONDA_PREFIX: /usr/share/miniconda
	GITHUB_PREV_SHA: ${{ github.event.before }}
	SPARK_LOCAL_IP: localhost
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v2
	# In order to fetch changed files
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: master
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	id: sync-branch
	run: |
	apache_spark_ref=`git rev-parse HEAD`
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/}
	git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
	echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
	# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v2
	with:
	path: |
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: |
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v2
	with:
	path: ~/.cache/coursier
	key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
	restore-keys: |
	pyspark-coursier-
	- name: Install Python 3.6
	uses: actions/setup-python@v2
	with:
	python-version: 3.6
	architecture: x64
	# This step takes much less time (~30s) than other Python versions so it is not included
	# in the Docker image being used. There is also a technical issue to install Python 3.6 on
	# Ubuntu 20.04. See also SPARK-33162.
	- name: Install Python packages (Python 3.6)
	run: |
	python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
	python3.6 -m pip list
	# Run the tests.
	- name: Run tests
	run: |
	export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
	./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
	- name: Upload test results to report
	if: always()
	uses: actions/upload-artifact@v2
	with:
	name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3
	path: "**/target/test-reports/*.xml"
	- name: Upload unit tests log files
	if: failure()
	uses: actions/upload-artifact@v2
	with:
	name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3
	path: "**/target/unit-tests.log"

	sparkr:
	name: "Build modules: sparkr"
	runs-on: ubuntu-20.04
	container:
	image: dongjoon/apache-spark-github-action-image:20201025
	env:
	HADOOP_PROFILE: hadoop3.2
	HIVE_PROFILE: hive2.3
	GITHUB_PREV_SHA: ${{ github.event.before }}
	SPARK_LOCAL_IP: localhost
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v2
	# In order to fetch changed files
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: master
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	id: sync-branch
	run: |
	apache_spark_ref=`git rev-parse HEAD`
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/}
	git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
	echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
	# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v2
	with:
	path: |
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: |
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v2
	with:
	path: ~/.cache/coursier
	key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
	restore-keys: |
	sparkr-coursier-
	- name: Run tests
	run: |
	# The followings are also used by `r-lib/actions/setup-r` to avoid
	# R issues at docker environment
	export TZ=UTC
	export _R_CHECK_SYSTEM_CLOCK_=FALSE
	export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
	./dev/run-tests --parallelism 2 --modules sparkr
	- name: Upload test results to report
	if: always()
	uses: actions/upload-artifact@v2
	with:
	name: test-results-sparkr--8-hadoop3.2-hive2.3
	path: "**/target/test-reports/*.xml"

	# Static analysis, and documentation build
	lint:
	name: Linters, licenses, dependencies and documentation generation
	runs-on: ubuntu-20.04
	env:
	LC_ALL: C.UTF-8
	LANG: C.UTF-8
	container:
	image: dongjoon/apache-spark-github-action-image:20201025
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v2
	# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v2
	with:
	path: \|
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v2
	with:
	path: ~/.cache/coursier
	key: docs-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	docs-coursier-
	- name: Cache Maven local repository
	uses: actions/cache@v2
	with:
	path: ~/.m2/repository
	key: docs-maven-${{ hashFiles('**/pom.xml') }}
	restore-keys: \|
	docs-maven-
	- name: Install Python 3.6
	uses: actions/setup-python@v2
	with:
	python-version: 3.6
	architecture: x64
	- name: Install Python linter dependencies
	run: \|
	# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
	# See also https://github.com/sphinx-doc/sphinx/issues/7551.
	python3.6 -m pip install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc
	- name: Install R linter dependencies and SparkR
	run: \|
	apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
	Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
	Rscript -e "devtools::install_github('jimhester/lintr@v2.0.1')"
	./R/install-dev.sh
	- name: Install dependencies for documentation generation
	run: \|
	# pandoc is required to generate PySpark APIs as well in nbsphinx.
	apt-get install -y libcurl4-openssl-dev pandoc
	# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
	# See also https://github.com/sphinx-doc/sphinx/issues/7551.
	python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc
	apt-get update -y
	apt-get install -y ruby ruby-dev
	Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
	gem install bundler
	cd docs
	bundle install
	- name: Scala linter
	run: ./dev/lint-scala
	- name: Java linter
	run: ./dev/lint-java
	- name: Python linter
	run: ./dev/lint-python
	- name: R linter
	run: ./dev/lint-r
	- name: License test
	run: ./dev/check-license
	- name: Dependencies test
	run: ./dev/test-dependencies.sh
	- name: Run documentation build
	run: \|
	cd docs
	bundle exec jekyll build

	java-11:
	name: Java 11 build with Maven
	runs-on: ubuntu-20.04
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v2
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v2
	with:
	path: \|
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Maven local repository
	uses: actions/cache@v2
	with:
	path: ~/.m2/repository
	key: java11-maven-${{ hashFiles('**/pom.xml') }}
	restore-keys: \|
	java11-maven-
	- name: Install Java 11
	uses: actions/setup-java@v1
	with:
	java-version: 11
	- name: Build with Maven
	run: \|
	export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
	export MAVEN_CLI_OPTS="--no-transfer-progress"
	# It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
	./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
	rm -rf ~/.m2/repository/org/apache/spark

	scala-213:
	name: Scala 2.13 build with SBT
	runs-on: ubuntu-20.04
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v2
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v2
	with:
	path: \|
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v2
	with:
	path: ~/.cache/coursier
	key: scala-213-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	scala-213-coursier-
	- name: Install Java 8
	uses: actions/setup-java@v1
	with:
	java-version: 8
	- name: Build with SBT
	run: \|
	./dev/change-scala-version.sh 2.13
	./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile

	hadoop-2:
	name: Hadoop 2 build with SBT
	runs-on: ubuntu-20.04
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v2
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v2
	with:
	path: \|
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v2
	with:
	path: ~/.cache/coursier
	key: hadoop-2-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	hadoop-2-coursier-
	- name: Install Java 8
	uses: actions/setup-java@v1
	with:
	java-version: 8
	- name: Build with SBT
	run: \|
	./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile

	tpcds-1g:
	name: Run TPC-DS queries with SF=1
	runs-on: ubuntu-20.04
	env:
	SPARK_LOCAL_IP: localhost
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v2
	- name: Cache TPC-DS generated data
	id: cache-tpcds-sf-1
	uses: actions/cache@v2
	with:
	path: ./tpcds-sf-1
	key: tpcds-556111e35d400f56cb0625dc16e9063d54628320
	- name: Checkout TPC-DS (SF=1) generated data repository
	if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
	uses: actions/checkout@v2
	with:
	repository: maropu/spark-tpcds-sf-1
	ref: 556111e35d400f56cb0625dc16e9063d54628320
	path: ./tpcds-sf-1
	- name: Cache Scala, SBT and Maven
	uses: actions/cache@v2
	with:
	path: \|
	build/apache-maven-*
	build/scala-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v2
	with:
	path: ~/.cache/coursier
	key: tpcds-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	tpcds-coursier-
	- name: Install Java 8
	uses: actions/setup-java@v1
	with:
	java-version: 8
	- name: Run TPC-DS queries
	run: \|
	SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
	- name: Upload test results to report
	if: always()
	uses: actions/upload-artifact@v2
	with:
	name: test-results-tpcds--8-hadoop3.2-hive2.3
	path: "*/target/test-reports/.xml"
	- name: Upload unit tests log files
	if: failure()
	uses: actions/upload-artifact@v2
	with:
	name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
	path: "**/target/unit-tests.log"