| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| |
| # Workflow for the Python-only Spark Connect build on master with Python 3.11. |
| name: 'Build / Python-only, Connect-only (master, Python 3.11)' |
| |
| on: |
| schedule: |
| # Once a day at 19:00; GitHub evaluates cron schedules in UTC. |
| - cron: '0 19 * * *' |
| # Also allow the workflow to be started manually. |
| workflow_dispatch: |
| |
| jobs: |
| # Single job: builds Spark with SBT, installs the pyspark-client package and |
| # runs the Spark Connect Python tests against a server started on the runner. |
| build: |
| name: 'Build modules: pyspark-client' |
| # Run only in the apache/spark repository itself. |
| if: ${{ github.repository == 'apache/spark' }} |
| runs-on: ubuntu-latest |
| timeout-minutes: 120 |
| steps: |
| - name: 'Checkout Spark repository' |
| # Fetch the sources that the following steps build and test. |
| uses: actions/checkout@v6 |
| - name: Cache SBT and Maven |
| # Reuse the downloaded Maven distribution, build jars and ~/.sbt between runs. |
| uses: actions/cache@v5 |
| with: |
| # The key changes whenever a pom.xml or one of the listed build scripts changes. |
| key: build-spark-connect-python-only-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} |
| path: | |
| build/apache-maven-* |
| build/*.jar |
| ~/.sbt |
| restore-keys: | |
| build-spark-connect-python-only- |
| - name: Cache Coursier local repository |
| # Keep the Coursier cache directory across runs; the key follows the pom.xml files. |
| uses: actions/cache@v5 |
| with: |
| key: coursier-build-spark-connect-python-only-${{ hashFiles('**/pom.xml') }} |
| path: ~/.cache/coursier |
| restore-keys: | |
| coursier-build-spark-connect-python-only- |
| - name: Install Java 17 |
| # Provision a Zulu JDK 17 on the runner. |
| uses: actions/setup-java@v5 |
| with: |
| java-version: '17' |
| distribution: 'zulu' |
| - name: Install Python 3.11 |
| # The version is quoted so YAML keeps it a string instead of parsing it as a float. |
| uses: actions/setup-python@v6 |
| with: |
| architecture: 'x64' |
| python-version: '3.11' |
| - name: Build Spark |
| # Run SBT's Test/package task with the hive profile (-Phive) enabled. |
| run: ./build/sbt -Phive Test/package |
| - name: Install pure Python package (pyspark-client) |
| # Builds the client-only source distribution and installs it together with |
| # the third-party packages used by the test steps. |
| env: |
| SPARK_TESTING: 1 |
| run: | |
| # Create the sdist from the client packaging script, then install the resulting tarball. |
| cd python |
| python packaging/client/setup.py sdist |
| cd dist |
| pip install pyspark*client-*.tar.gz |
| # Additional test dependencies; every requirement is listed exactly once. |
| pip install 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'googleapis-common-protos==1.71.0' 'graphviz==0.20.3' 'six==1.16.0' 'pandas==2.3.3' scipy 'plotly<6.0.0' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' 'torch<2.6.0' torchvision torcheval deepspeed unittest-xml-reporting 'zstandard==0.25.0' |
| - name: List Python packages |
| # Print the installed package versions into the job log. |
| run: | |
| python -m pip list |
| - name: Run tests (local) |
| # Runs the Connect test modules against a server started on this runner by |
| # sbin/start-connect-server.sh, with python/lib and python/pyspark moved |
| # aside while the tests execute. |
| env: |
| SPARK_TESTING: 1 |
| SPARK_CONNECT_TESTING_REMOTE: sc://localhost |
| run: | |
| # Make less noisy. The local-cluster step later points its server at this same file. |
| cp conf/log4j2.properties.template conf/log4j2.properties |
| sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties |
| |
| # Start a Spark Connect server for local. |
| # The -name patterns are quoted so the shell passes them to find unexpanded. |
| PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ |
| --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \ |
| --jars "$(find connector/protobuf/target -name 'spark-protobuf-*SNAPSHOT.jar'),$(find connector/avro/target -name 'spark-avro*SNAPSHOT.jar')" |
| |
| # Remove Py4J and PySpark zipped library to make sure there is no JVM connection |
| mv python/lib lib.back |
| mv python/pyspark pyspark.back |
| |
| # Several catalog-related tests have to run sequentially, e.g., writing a table in a listener. |
| ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect |
| # Pandas API on Spark tests do not depend on each other, but they are still run with --parallelism=1 here. |
| ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect |
| |
| # Stop Spark Connect server and put the moved directories back. |
| ./sbin/stop-connect-server.sh |
| mv lib.back python/lib |
| mv pyspark.back python/pyspark |
| |
| - name: Run tests (local-cluster) |
| # Runs the listed test modules against a server started with a |
| # local-cluster master, with python/lib and python/pyspark moved aside. |
| env: |
| SPARK_TESTING: 1 |
| SPARK_CONNECT_TESTING_REMOTE: sc://localhost |
| run: | |
| # Start a Spark Connect server for local-cluster. |
| # conf/log4j2.properties is the file created by the "Run tests (local)" step. |
| # The -name patterns are quoted so the shell passes them to find unexpanded. |
| PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \ |
| --master "local-cluster[2, 4, 1024]" \ |
| --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \ |
| --jars "$(find connector/protobuf/target -name 'spark-protobuf-*SNAPSHOT.jar'),$(find connector/avro/target -name 'spark-avro*SNAPSHOT.jar')" |
| |
| # Remove Py4J and PySpark zipped library to make sure there is no JVM connection |
| mv python/lib lib.back |
| mv python/pyspark pyspark.back |
| |
| # Only the tests named here are run in this step, one at a time. |
| ./python/run-tests --parallelism=1 --python-executables=python3 --testnames "pyspark.resource.tests.test_connect_resources,pyspark.sql.tests.connect.client.test_artifact,pyspark.sql.tests.connect.client.test_artifact_localcluster,pyspark.sql.tests.connect.test_resources" |
| |
| # Stop Spark Connect server and put the moved directories back. |
| ./sbin/stop-connect-server.sh |
| mv lib.back python/lib |
| mv pyspark.back python/pyspark |
| - name: Upload test results to report |
| # always() keeps this step running even when an earlier step failed. |
| if: ${{ always() }} |
| uses: actions/upload-artifact@v6 |
| with: |
| name: 'test-results-spark-connect-python-only' |
| path: | |
| **/target/test-reports/*.xml |
| **/target/surefire-reports/*.xml |
| - name: Upload Spark Connect server log file |
| # Skipped when every earlier step succeeded; the logs are uploaded otherwise. |
| if: ${{ !success() }} |
| uses: actions/upload-artifact@v6 |
| with: |
| path: 'logs/*.out' |
| name: 'unit-tests-log-spark-connect-python-only' |