| name: Build and test |
| |
| on: |
| push: |
| branches: |
| - '**' |
| - '!branch-*.*' |
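| # Above, '**' matches every branch, while '!branch-*.*' excludes maintenance release |
| # branches such as branch-3.1 (presumably covered by their own copies of this workflow). |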
| |
| jobs: |
| # Build: build Spark and run the tests for specified modules. |
| build: |
| name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})" |
| # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04. |
| runs-on: ubuntu-20.04 |
| strategy: |
| fail-fast: false |
| matrix: |
| java: |
| - 8 |
| hadoop: |
| - hadoop3.2 |
| hive: |
| - hive2.3 |
| # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now. |
| # Kinesis tests depend on the external Amazon Kinesis service. |
| # Note that the modules below are from sparktestsupport/modules.py. |
| modules: |
| - >- |
| core, unsafe, kvstore, avro, |
| network-common, network-shuffle, repl, launcher, |
| examples, sketch, graphx |
| - >- |
| catalyst, hive-thriftserver |
| - >- |
| streaming, sql-kafka-0-10, streaming-kafka-0-10, |
| mllib-local, mllib, |
| yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl |
| # Here, we split the Hive and SQL tests into the slow ones and the rest. |
| included-tags: [""] |
| excluded-tags: [""] |
| comment: [""] |
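| # Each `include` entry below uses a modules value that is not in the list above, so |
| # GitHub Actions adds it as an extra matrix job with its own tag filters. |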
| include: |
| # Hive tests |
| - modules: hive |
| java: 8 |
| hadoop: hadoop3.2 |
| hive: hive2.3 |
| included-tags: org.apache.spark.tags.SlowHiveTest |
| comment: "- slow tests" |
| - modules: hive |
| java: 8 |
| hadoop: hadoop3.2 |
| hive: hive2.3 |
| excluded-tags: org.apache.spark.tags.SlowHiveTest |
| comment: "- other tests" |
| # SQL tests |
| - modules: sql |
| java: 8 |
| hadoop: hadoop3.2 |
| hive: hive2.3 |
| included-tags: org.apache.spark.tags.ExtendedSQLTest |
| comment: "- slow tests" |
| - modules: sql |
| java: 8 |
| hadoop: hadoop3.2 |
| hive: hive2.3 |
| excluded-tags: org.apache.spark.tags.ExtendedSQLTest |
| comment: "- other tests" |
| env: |
| MODULES_TO_TEST: ${{ matrix.modules }} |
| EXCLUDED_TAGS: ${{ matrix.excluded-tags }} |
| INCLUDED_TAGS: ${{ matrix.included-tags }} |
| HADOOP_PROFILE: ${{ matrix.hadoop }} |
| HIVE_PROFILE: ${{ matrix.hive }} |
| # GitHub Actions' default Miniconda, used in the pip packaging test. |
| CONDA_PREFIX: /usr/share/miniconda |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| SPARK_LOCAL_IP: localhost |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v2 |
| # Fetch the full history in order to detect changed files |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: master |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| id: sync-branch |
| run: | |
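| # Record the upstream apache/spark master SHA checked out above, fetch the fork's |
| # branch, and squash-merge it on top so the tests run against the merged state. |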
| apache_spark_ref=`git rev-parse HEAD` |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" |
| echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref" |
| # Cache local repositories. Note that GitHub Actions cache has a 2G limit. |
| - name: Cache Scala, SBT and Maven |
| uses: actions/cache@v2 |
| with: |
| path: | |
| build/apache-maven-* |
| build/scala-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v2 |
| with: |
| path: ~/.cache/coursier |
| key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| ${{ matrix.java }}-${{ matrix.hadoop }}-coursier- |
| - name: Install Java ${{ matrix.java }} |
| uses: actions/setup-java@v1 |
| with: |
| java-version: ${{ matrix.java }} |
| - name: Install Python 3.8 |
| uses: actions/setup-python@v2 |
| # We should install one Python 3 version for SQL and Yarn because: |
| # - The SQL component also has Python-related tests, for example, IntegratedUDFTestUtils. |
| # - Yarn has a Python-specific test too, for example, YarnClusterSuite. |
| if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) |
| with: |
| python-version: 3.8 |
| architecture: x64 |
| - name: Install Python packages (Python 3.8) |
| if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) |
| run: | |
| python3.8 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner |
| python3.8 -m pip list |
| # Run the tests. |
| - name: Run tests |
| run: | |
| export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }} |
| # Hive and SQL tests become flaky when run in parallel because they are too resource-intensive. |
| if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi |
| ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" |
| - name: Upload test results to report |
| if: always() |
| uses: actions/upload-artifact@v2 |
| with: |
| name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} |
| path: "**/target/test-reports/*.xml" |
| - name: Upload unit tests log files |
| if: failure() |
| uses: actions/upload-artifact@v2 |
| with: |
| name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} |
| path: "**/target/unit-tests.log" |
| |
| pyspark: |
| name: "Build modules: ${{ matrix.modules }}" |
| runs-on: ubuntu-20.04 |
| container: |
| image: dongjoon/apache-spark-github-action-image:20201025 |
| strategy: |
| fail-fast: false |
| matrix: |
| modules: |
| - >- |
| pyspark-sql, pyspark-mllib, pyspark-resource |
| - >- |
| pyspark-core, pyspark-streaming, pyspark-ml |
| - >- |
| pyspark-pandas |
| env: |
| MODULES_TO_TEST: ${{ matrix.modules }} |
| HADOOP_PROFILE: hadoop3.2 |
| HIVE_PROFILE: hive2.3 |
| # GitHub Actions' default Miniconda, used in the pip packaging test. |
| CONDA_PREFIX: /usr/share/miniconda |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| SPARK_LOCAL_IP: localhost |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v2 |
| # Fetch the full history in order to detect changed files |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: master |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| id: sync-branch |
| run: | |
| apache_spark_ref=`git rev-parse HEAD` |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" |
| echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref" |
| # Cache local repositories. Note that GitHub Actions cache has a 2G limit. |
| - name: Cache Scala, SBT and Maven |
| uses: actions/cache@v2 |
| with: |
| path: | |
| build/apache-maven-* |
| build/scala-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v2 |
| with: |
| path: ~/.cache/coursier |
| key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| pyspark-coursier- |
| - name: Install Python 3.6 |
| uses: actions/setup-python@v2 |
| with: |
| python-version: 3.6 |
| architecture: x64 |
| # This step takes much less time (~30s) than installing other Python versions, so Python 3.6 |
| # is not included in the Docker image being used. There is also a technical issue with |
| # installing Python 3.6 on Ubuntu 20.04. See also SPARK-33162. |
| - name: Install Python packages (Python 3.6) |
| run: | |
| python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner |
| python3.6 -m pip list |
| # Run the tests. |
| - name: Run tests |
| run: | |
| export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }} |
| ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" |
| - name: Upload test results to report |
| if: always() |
| uses: actions/upload-artifact@v2 |
| with: |
| name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3 |
| path: "**/target/test-reports/*.xml" |
| - name: Upload unit tests log files |
| if: failure() |
| uses: actions/upload-artifact@v2 |
| with: |
| name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3 |
| path: "**/target/unit-tests.log" |
| |
| sparkr: |
| name: "Build modules: sparkr" |
| runs-on: ubuntu-20.04 |
| container: |
| image: dongjoon/apache-spark-github-action-image:20201025 |
| env: |
| HADOOP_PROFILE: hadoop3.2 |
| HIVE_PROFILE: hive2.3 |
| GITHUB_PREV_SHA: ${{ github.event.before }} |
| SPARK_LOCAL_IP: localhost |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v2 |
| # Fetch the full history in order to detect changed files |
| with: |
| fetch-depth: 0 |
| repository: apache/spark |
| ref: master |
| - name: Sync the current branch with the latest in Apache Spark |
| if: github.repository != 'apache/spark' |
| id: sync-branch |
| run: | |
| apache_spark_ref=`git rev-parse HEAD` |
| git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/} |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD |
| git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" |
| echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref" |
| # Cache local repositories. Note that GitHub Actions cache has a 2G limit. |
| - name: Cache Scala, SBT and Maven |
| uses: actions/cache@v2 |
| with: |
| path: | |
| build/apache-maven-* |
| build/scala-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v2 |
| with: |
| path: ~/.cache/coursier |
| key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| sparkr-coursier- |
| - name: Run tests |
| run: | |
| # The following environment variables are also used by `r-lib/actions/setup-r` |
| # to avoid R issues in the Docker environment. |
| export TZ=UTC |
| export _R_CHECK_SYSTEM_CLOCK_=FALSE |
| export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }} |
| ./dev/run-tests --parallelism 2 --modules sparkr |
| - name: Upload test results to report |
| if: always() |
| uses: actions/upload-artifact@v2 |
| with: |
| name: test-results-sparkr--8-hadoop3.2-hive2.3 |
| path: "**/target/test-reports/*.xml" |
| |
| # Static analysis and documentation build |
| lint: |
| name: Linters, licenses, dependencies and documentation generation |
| runs-on: ubuntu-20.04 |
| env: |
| LC_ALL: C.UTF-8 |
| LANG: C.UTF-8 |
| container: |
| image: dongjoon/apache-spark-github-action-image:20201025 |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v2 |
| # Cache local repositories. Note that GitHub Actions cache has a 2G limit. |
| - name: Cache Scala, SBT and Maven |
| uses: actions/cache@v2 |
| with: |
| path: | |
| build/apache-maven-* |
| build/scala-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v2 |
| with: |
| path: ~/.cache/coursier |
| key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| docs-coursier- |
| - name: Cache Maven local repository |
| uses: actions/cache@v2 |
| with: |
| path: ~/.m2/repository |
| key: docs-maven-${{ hashFiles('**/pom.xml') }} |
| restore-keys: | |
| docs-maven- |
| - name: Install Python 3.6 |
| uses: actions/setup-python@v2 |
| with: |
| python-version: 3.6 |
| architecture: x64 |
| - name: Install Python linter dependencies |
| run: | |
| # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. |
| # See also https://github.com/sphinx-doc/sphinx/issues/7551. |
| python3.6 -m pip install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx mypy numpydoc |
| - name: Install R linter dependencies and SparkR |
| run: | |
| apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev |
| Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" |
| Rscript -e "devtools::install_github('jimhester/lintr@v2.0.1')" |
| ./R/install-dev.sh |
| - name: Install dependencies for documentation generation |
| run: | |
| # pandoc is also required by nbsphinx to generate the PySpark API docs. |
| apt-get install -y libcurl4-openssl-dev pandoc |
| # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. |
| # See also https://github.com/sphinx-doc/sphinx/issues/7551. |
| python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc |
| apt-get update -y |
| apt-get install -y ruby ruby-dev |
| Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')" |
| gem install bundler |
| cd docs |
| bundle install |
| - name: Scala linter |
| run: ./dev/lint-scala |
| - name: Java linter |
| run: ./dev/lint-java |
| - name: Python linter |
| run: ./dev/lint-python |
| - name: R linter |
| run: ./dev/lint-r |
| - name: License test |
| run: ./dev/check-license |
| - name: Dependencies test |
| run: ./dev/test-dependencies.sh |
| - name: Run documentation build |
| run: | |
| cd docs |
| bundle exec jekyll build |
| |
| java-11: |
| name: Java 11 build with Maven |
| runs-on: ubuntu-20.04 |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v2 |
| - name: Cache Scala, SBT and Maven |
| uses: actions/cache@v2 |
| with: |
| path: | |
| build/apache-maven-* |
| build/scala-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Maven local repository |
| uses: actions/cache@v2 |
| with: |
| path: ~/.m2/repository |
| key: java11-maven-${{ hashFiles('**/pom.xml') }} |
| restore-keys: | |
| java11-maven- |
| - name: Install Java 11 |
| uses: actions/setup-java@v1 |
| with: |
| java-version: 11 |
| - name: Build with Maven |
| run: | |
| export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" |
| export MAVEN_CLI_OPTS="--no-transfer-progress" |
| # It uses Maven's 'install' intentionally; see https://github.com/apache/spark/pull/26414. |
| ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install |
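| # Presumably this keeps the freshly installed Spark artifacts out of the cached ~/.m2 repository. |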
| rm -rf ~/.m2/repository/org/apache/spark |
| |
| scala-213: |
| name: Scala 2.13 build with SBT |
| runs-on: ubuntu-20.04 |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v2 |
| - name: Cache Scala, SBT and Maven |
| uses: actions/cache@v2 |
| with: |
| path: | |
| build/apache-maven-* |
| build/scala-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v2 |
| with: |
| path: ~/.cache/coursier |
| key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| scala-213-coursier- |
| - name: Install Java 8 |
| uses: actions/setup-java@v1 |
| with: |
| java-version: 8 |
| - name: Build with SBT |
| run: | |
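| # change-scala-version.sh switches the build to Scala 2.13 (it rewrites the Scala version |
| # in the POMs); only compilation of main and test sources is verified here. |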
| ./dev/change-scala-version.sh 2.13 |
| ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile |
| |
| hadoop-2: |
| name: Hadoop 2 build with SBT |
| runs-on: ubuntu-20.04 |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v2 |
| - name: Cache Scala, SBT and Maven |
| uses: actions/cache@v2 |
| with: |
| path: | |
| build/apache-maven-* |
| build/scala-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v2 |
| with: |
| path: ~/.cache/coursier |
| key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| hadoop-2-coursier- |
| - name: Install Java 8 |
| uses: actions/setup-java@v1 |
| with: |
| java-version: 8 |
| - name: Build with SBT |
| run: | |
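| # Compile main and test sources against the hadoop-2.7 profile; no tests are run here. |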
| ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile |
| |
| tpcds-1g: |
| name: Run TPC-DS queries with SF=1 |
| runs-on: ubuntu-20.04 |
| env: |
| SPARK_LOCAL_IP: localhost |
| steps: |
| - name: Checkout Spark repository |
| uses: actions/checkout@v2 |
| - name: Cache TPC-DS generated data |
| id: cache-tpcds-sf-1 |
| uses: actions/cache@v2 |
| with: |
| path: ./tpcds-sf-1 |
| key: tpcds-556111e35d400f56cb0625dc16e9063d54628320 |
| - name: Checkout TPC-DS (SF=1) generated data repository |
| if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' |
| uses: actions/checkout@v2 |
| with: |
| repository: maropu/spark-tpcds-sf-1 |
| ref: 556111e35d400f56cb0625dc16e9063d54628320 |
| path: ./tpcds-sf-1 |
| - name: Cache Scala, SBT and Maven |
| uses: actions/cache@v2 |
| with: |
| path: | |
| build/apache-maven-* |
| build/scala-* |
| build/*.jar |
| ~/.sbt |
| key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| restore-keys: | |
| build- |
| - name: Cache Coursier local repository |
| uses: actions/cache@v2 |
| with: |
| path: ~/.cache/coursier |
| key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} |
| restore-keys: | |
| tpcds-coursier- |
| - name: Install Java 8 |
| uses: actions/setup-java@v1 |
| with: |
| java-version: 8 |
| - name: Run TPC-DS queries |
| run: | |
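| # Point the suite at the SF=1 data checked out (or restored from cache) above and run it via sbt. |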
| SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" |
| - name: Upload test results to report |
| if: always() |
| uses: actions/upload-artifact@v2 |
| with: |
| name: test-results-tpcds--8-hadoop3.2-hive2.3 |
| path: "**/target/test-reports/*.xml" |
| - name: Upload unit tests log files |
| if: failure() |
| uses: actions/upload-artifact@v2 |
| with: |
| name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3 |
| path: "**/target/unit-tests.log" |