#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

name: Build / Python-only, Connect-only (master-server, branch-4.0-client, Python 3.11)

on:
  # Scheduled run once a day. GitHub evaluates cron expressions in UTC,
  # so this fires at 20:00 UTC.
  schedule:
    - cron: '0 20 * * *'
  # Also allow the workflow to be started manually.
  workflow_dispatch:

jobs:
  # Build: build Spark and run the tests for specified modules using SBT.
  # The Spark Connect server is built and started from this checkout, and the
  # PySpark Connect tests cloned from branch-4.0 are then run against it.
  build:
    name: "Build modules: pyspark-connect"
    runs-on: ubuntu-latest
    timeout-minutes: 100
    # Only run in the upstream repository, not in forks.
    if: github.repository == 'apache/spark'
    steps:
      - name: Checkout Spark repository
        uses: actions/checkout@v6
        with:
          # 0 fetches the full history instead of a shallow clone.
          fetch-depth: 0
      - name: Cache SBT and Maven
        uses: actions/cache@v5
        with:
          path: |
            build/apache-maven-*
            build/*.jar
            ~/.sbt
          key: build-spark-connect-python-only-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
          # Fall back to the most recent cache with this prefix when the exact key misses.
          restore-keys: |
            build-spark-connect-python-only-
      - name: Cache Coursier local repository
        uses: actions/cache@v5
        with:
          path: ~/.cache/coursier
          key: coursier-build-spark-connect-python-only-${{ hashFiles('**/pom.xml') }}
          restore-keys: |
            coursier-build-spark-connect-python-only-
      - name: Install Java 17
        uses: actions/setup-java@v5
        with:
          distribution: zulu
          java-version: '17'
      - name: Install Python 3.11
        uses: actions/setup-python@v6
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: '3.11'
          architecture: x64
      - name: Build Spark
        run: |
          ./build/sbt -Phive Test/package
      - name: Install Python dependencies
        run: |
          pip install 'numpy' 'pyarrow>=18.0.0' 'pandas==2.2.3' scipy unittest-xml-reporting 'plotly<6.0.0' 'mlflow>=2.8.1' coverage 'matplotlib' openpyxl 'memory-profiler==0.61.0' 'scikit-learn>=1.3.2'

          # Add Python deps for Spark Connect.
          pip install 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.5' 'googleapis-common-protos==1.71.0' 'graphviz==0.20.3' 'zstandard==0.25.0'

          # Add torch as a testing dependency for TorchDistributor
          pip install 'torch==2.0.1' 'torchvision==0.15.2' torcheval
      - name: List Python packages
        run: python -m pip list
      - name: Run tests
        env:
          # Quoted: environment variables are always strings.
          SPARK_TESTING: '1'
          SPARK_SKIP_CONNECT_COMPAT_TESTS: '1'
          SPARK_CONNECT_TESTING_REMOTE: sc://localhost
        run: |
          # Make less noisy
          cp conf/log4j2.properties.template conf/log4j2.properties
          sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties

          # Start a local Spark Connect server from this checkout.
          # The -name patterns are quoted so that find, not the shell, expands them.
          PYTHONPATH="python/lib/pyspark.zip:python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH" ./sbin/start-connect-server.sh \
            --driver-java-options "-Dlog4j.configurationFile=file:$GITHUB_WORKSPACE/conf/log4j2.properties" \
            --jars "$(find connector/protobuf/target -name 'spark-protobuf-*SNAPSHOT.jar'),$(find connector/avro/target -name 'spark-avro*SNAPSHOT.jar')" \
            --conf spark.sql.execution.arrow.pyspark.validateSchema.enabled=false \
            --conf spark.sql.execution.pandas.convertToArrowArraySafely=false

          # Clone branch-4.0 next to the workspace to use the tests from branch-4.0.
          cd ..
          git clone --single-branch --branch branch-4.0 "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY" spark-4.0
          cd spark-4.0

          # Several catalog-related tests must run sequentially (e.g., writing a table in a listener),
          # so the branch-4.0 tests are run with --parallelism=1.
          ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect
          # The Pandas API on Spark tests do not depend on each other, but they are also run with
          # --parallelism=1 here.
          # NOTE(review): confirm whether the parallelism can be raised for these modules.
          ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-pandas-connect,pyspark-pandas-slow-connect
      - name: Upload test results to report
        # Upload the reports even when an earlier step failed.
        if: always()
        uses: actions/upload-artifact@v6
        with:
          name: test-results-spark-connect-python-only
          # NOTE(review): the tests above run from ../spark-4.0, outside the workspace;
          # confirm these workspace-relative globs actually pick up their reports.
          path: |
            **/target/test-reports/*.xml
            **/target/surefire-reports/*.xml
      - name: Upload Spark Connect server log file
        # Only upload the server log when the job did not succeed.
        if: ${{ !success() }}
        uses: actions/upload-artifact@v6
        with:
          name: unit-tests-log-spark-connect-python-only
          path: logs/*.out