| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| name: PR Build (Linux) |
| |
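| # Cancel superseded runs: PRs group on head_ref, so a new push cancels the old |
| # run; pushes to main fall back to the commit SHA and never cancel each other. |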
| concurrency: |
| group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} |
| cancel-in-progress: true |
| |
| on: |
| push: |
| branches: |
| - main |
| paths-ignore: |
| - "doc/**" |
| - "docs/**" |
| - "**.md" |
| - "native/core/benches/**" |
| - "native/spark-expr/benches/**" |
| - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" |
| pull_request: |
| paths-ignore: |
| - "doc/**" |
| - "docs/**" |
| - "**.md" |
| - "native/core/benches/**" |
| - "native/spark-expr/benches/**" |
| - "spark/src/test/scala/org/apache/spark/sql/benchmark/**" |
| # manual trigger |
| # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow |
| workflow_dispatch: |
| |
| env: |
| RUST_VERSION: stable |
| |
| jobs: |
| |
| # Fast lint check - gates all other jobs |
| lint: |
| name: Lint |
| runs-on: ubuntu-latest |
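| # amd64/rust is the official Rust image (Debian-based), pinned to x86-64 |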
| container: |
| image: amd64/rust |
| steps: |
| - uses: actions/checkout@v6 |
| |
| - name: Check Rust formatting |
| run: | |
| rustup component add rustfmt |
| cd native && cargo fmt --all -- --check |
| |
| # Build native library once and share with all test jobs |
| build-native: |
| needs: lint |
| name: Build Native Library |
| runs-on: ubuntu-latest |
| container: |
| image: amd64/rust |
| steps: |
| - uses: actions/checkout@v6 |
| |
| - name: Setup Rust toolchain |
| uses: ./.github/actions/setup-builder |
| with: |
| rust-version: ${{ env.RUST_VERSION }} |
| jdk-version: 17 # JDK only needed for common module proto generation |
| |
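| # Two-tier key: an exact hit when no Rust source changed, with restore-keys |
| # falling back to the newest cache for the same Cargo manifests so |
| # incremental compilation still gets a warm start. |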
| - name: Restore Cargo cache |
| uses: actions/cache/restore@v5 |
| with: |
| path: | |
| ~/.cargo/registry |
| ~/.cargo/git |
| native/target |
| key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }} |
| restore-keys: | |
| ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}- |
| |
| - name: Build native library (CI profile) |
| run: | |
| cd native |
| # CI profile: same overflow behavior as release, but faster compilation |
| # (no LTO, parallel codegen) |
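| # The ci profile is assumed to be defined in native/Cargo.toml along these |
| # lines (a sketch, not the authoritative definition): |
| #   [profile.ci] |
| #   inherits = "release" |
| #   lto = false |
| #   codegen-units = 16 |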
| cargo build --profile ci |
| env: |
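| # x86-64-v3 assumes an AVX2/FMA-capable CPU; GitHub-hosted x64 runners meet this baseline |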
| RUSTFLAGS: "-Ctarget-cpu=x86-64-v3" |
| |
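| # Shared with all downstream test jobs in this run, so 1-day retention suffices |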
| - name: Upload native library |
| uses: actions/upload-artifact@v6 |
| with: |
| name: native-lib-linux |
| path: native/target/ci/libcomet.so |
| retention-days: 1 |
| |
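| # Save only on main: caches written from PR branches are scoped to that PR and |
| # rarely reused, so restricting saves preserves the repository cache quota. |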
| - name: Save Cargo cache |
| uses: actions/cache/save@v5 |
| if: github.ref == 'refs/heads/main' |
| with: |
| path: | |
| ~/.cargo/registry |
| ~/.cargo/git |
| native/target |
| key: ${{ runner.os }}-cargo-ci-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }} |
| |
| # Rust tests - run in parallel with build-native and use debug builds |
| linux-test-rust: |
| needs: lint |
| name: ubuntu-latest/rust-test |
| runs-on: ubuntu-latest |
| container: |
| image: amd64/rust |
| steps: |
| - uses: actions/checkout@v6 |
| |
| - name: Setup Rust & Java toolchain |
| uses: ./.github/actions/setup-builder |
| with: |
| rust-version: ${{ env.RUST_VERSION }} |
| jdk-version: 17 |
| |
| - name: Restore Cargo cache |
| uses: actions/cache/restore@v5 |
| with: |
| path: | |
| ~/.cargo/registry |
| ~/.cargo/git |
| native/target |
| # Note: the Java version is intentionally excluded from this key - the Rust build is JDK-independent |
| key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }} |
| restore-keys: | |
| ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}- |
| |
| - name: Rust test steps |
| uses: ./.github/actions/rust-test |
| |
| - name: Save Cargo cache |
| uses: actions/cache/save@v5 |
| if: github.ref == 'refs/heads/main' |
| with: |
| path: | |
| ~/.cargo/registry |
| ~/.cargo/git |
| native/target |
| key: ${{ runner.os }}-cargo-debug-${{ hashFiles('native/**/Cargo.lock', 'native/**/Cargo.toml') }}-${{ hashFiles('native/**/*.rs') }} |
| |
| linux-test: |
| needs: build-native |
| strategy: |
| matrix: |
| os: [ubuntu-latest] |
| # The goal of these profiles is to cover all supported Java, Scala, and Spark |
| # versions without testing every possible combination, which would be overkill |
| profile: |
| - name: "Spark 3.4, JDK 11, Scala 2.12" |
| java_version: "11" |
| maven_opts: "-Pspark-3.4 -Pscala-2.12" |
| scan_impl: "auto" |
| |
| - name: "Spark 3.5.5, JDK 17, Scala 2.13" |
| java_version: "17" |
| maven_opts: "-Pspark-3.5 -Dspark.version=3.5.5 -Pscala-2.13" |
| scan_impl: "auto" |
| |
| - name: "Spark 3.5.6, JDK 17, Scala 2.13" |
| java_version: "17" |
| maven_opts: "-Pspark-3.5 -Dspark.version=3.5.6 -Pscala-2.13" |
| scan_impl: "auto" |
| |
| - name: "Spark 3.5, JDK 17, Scala 2.12" |
| java_version: "17" |
| maven_opts: "-Pspark-3.5 -Pscala-2.12" |
| scan_impl: "native_datafusion" |
| |
| - name: "Spark 3.5, JDK 17, Scala 2.12" |
| java_version: "17" |
| maven_opts: "-Pspark-3.5 -Pscala-2.12" |
| scan_impl: "native_iceberg_compat" |
| |
| - name: "Spark 4.0, JDK 17" |
| java_version: "17" |
| maven_opts: "-Pspark-4.0" |
| scan_impl: "auto" |
| suite: |
| - name: "fuzz" |
| value: | |
| org.apache.comet.CometFuzzTestSuite |
| org.apache.comet.CometFuzzAggregateSuite |
| org.apache.comet.CometFuzzIcebergSuite |
| org.apache.comet.CometFuzzMathSuite |
| org.apache.comet.DataGeneratorSuite |
| - name: "shuffle" |
| value: | |
| org.apache.comet.exec.CometShuffleSuite |
| org.apache.comet.exec.CometShuffle4_0Suite |
| org.apache.comet.exec.CometNativeColumnarToRowSuite |
| org.apache.comet.exec.CometNativeShuffleSuite |
| org.apache.comet.exec.CometShuffleEncryptionSuite |
| org.apache.comet.exec.CometShuffleManagerSuite |
| org.apache.comet.exec.CometAsyncShuffleSuite |
| org.apache.comet.exec.DisableAQECometShuffleSuite |
| org.apache.comet.exec.DisableAQECometAsyncShuffleSuite |
| org.apache.spark.shuffle.sort.SpillSorterSuite |
| - name: "parquet" |
| value: | |
| org.apache.comet.parquet.CometParquetWriterSuite |
| org.apache.comet.parquet.ParquetReadV1Suite |
| org.apache.comet.parquet.ParquetReadV2Suite |
| org.apache.comet.parquet.ParquetReadFromFakeHadoopFsSuite |
| org.apache.spark.sql.comet.ParquetDatetimeRebaseV1Suite |
| org.apache.spark.sql.comet.ParquetDatetimeRebaseV2Suite |
| org.apache.spark.sql.comet.ParquetEncryptionITCase |
| org.apache.comet.exec.CometNativeReaderSuite |
| org.apache.comet.CometIcebergNativeSuite |
| - name: "csv" |
| value: | |
| org.apache.comet.csv.CometCsvNativeReadSuite |
| - name: "exec" |
| value: | |
| org.apache.comet.exec.CometAggregateSuite |
| org.apache.comet.exec.CometExec3_4PlusSuite |
| org.apache.comet.exec.CometExecSuite |
| org.apache.comet.exec.CometGenerateExecSuite |
| org.apache.comet.exec.CometWindowExecSuite |
| org.apache.comet.exec.CometJoinSuite |
| org.apache.comet.CometNativeSuite |
| org.apache.comet.CometSparkSessionExtensionsSuite |
| org.apache.spark.CometPluginsSuite |
| org.apache.spark.CometPluginsDefaultSuite |
| org.apache.spark.CometPluginsNonOverrideSuite |
| org.apache.spark.CometPluginsUnifiedModeOverrideSuite |
| org.apache.comet.rules.CometScanRuleSuite |
| org.apache.comet.rules.CometExecRuleSuite |
| org.apache.spark.sql.CometTPCDSQuerySuite |
| org.apache.spark.sql.CometTPCDSQueryTestSuite |
| org.apache.spark.sql.CometTPCHQuerySuite |
| org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite |
| org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite |
| org.apache.spark.sql.comet.CometTaskMetricsSuite |
| org.apache.comet.objectstore.NativeConfigSuite |
| - name: "expressions" |
| value: | |
| org.apache.comet.CometExpressionSuite |
| org.apache.comet.CometSqlFileTestSuite |
| org.apache.comet.CometExpressionCoverageSuite |
| org.apache.comet.CometHashExpressionSuite |
| org.apache.comet.CometTemporalExpressionSuite |
| org.apache.comet.CometArrayExpressionSuite |
| org.apache.comet.CometCastSuite |
| org.apache.comet.CometMathExpressionSuite |
| org.apache.comet.CometStringExpressionSuite |
| org.apache.comet.CometBitwiseExpressionSuite |
| org.apache.comet.CometMapExpressionSuite |
| org.apache.comet.CometCsvExpressionSuite |
| org.apache.comet.CometJsonExpressionSuite |
| org.apache.comet.expressions.conditional.CometIfSuite |
| org.apache.comet.expressions.conditional.CometCoalesceSuite |
| org.apache.comet.expressions.conditional.CometCaseWhenSuite |
| - name: "sql" |
| value: | |
| org.apache.spark.sql.CometToPrettyStringSuite |
| fail-fast: false |
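| # fail-fast is disabled so one failing combination does not cancel the rest of the matrix |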
| name: ${{ matrix.os }}/${{ matrix.profile.name }}/${{ matrix.profile.scan_impl }} [${{ matrix.suite.name }}] |
| runs-on: ${{ matrix.os }} |
| container: |
| image: amd64/rust |
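| # JDK 17 strongly encapsulates JDK internals (JEP 403); Spark touches |
| # sun.nio.ch and sun.util.calendar, so those modules must be opened/exported. |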
| env: |
| JAVA_TOOL_OPTIONS: ${{ matrix.profile.java_version == '17' && '--add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-exports=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED' || '' }} |
| |
| steps: |
| - uses: actions/checkout@v6 |
| |
| - name: Setup Rust & Java toolchain |
| uses: ./.github/actions/setup-builder |
| with: |
| rust-version: ${{ env.RUST_VERSION }} |
| jdk-version: ${{ matrix.profile.java_version }} |
| |
| - name: Download native library |
| uses: actions/download-artifact@v7 |
| with: |
| name: native-lib-linux |
| # Download to release/ since Maven's -Prelease expects libcomet.so there |
| path: native/target/release/ |
| |
| # Restore cargo registry cache (for any cargo commands that might run) |
| - name: Cache Cargo registry |
| uses: actions/cache@v5 |
| with: |
| path: | |
| ~/.cargo/registry |
| ~/.cargo/git |
| key: ${{ runner.os }}-cargo-registry-${{ hashFiles('native/**/Cargo.lock') }} |
| restore-keys: | |
| ${{ runner.os }}-cargo-registry- |
| |
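| # Note: for the 'sql' suite on the Spark 3.4 profile, the suite list below is |
| # deliberately emptied; see .github/actions/java-test for how that is handled. |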
| - name: Java test steps |
| uses: ./.github/actions/java-test |
| with: |
| artifact_name: ${{ matrix.os }}-${{ matrix.profile.name }}-${{ matrix.suite.name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }} |
| suites: ${{ matrix.suite.name == 'sql' && matrix.profile.name == 'Spark 3.4, JDK 11, Scala 2.12' && '' || matrix.suite.value }} |
| maven_opts: ${{ matrix.profile.maven_opts }} |
| scan_impl: ${{ matrix.profile.scan_impl }} |
| upload-test-reports: true |
| skip-native-build: true |
| |
| # TPC-H correctness test - verifies benchmark queries produce correct results |
| verify-benchmark-results-tpch: |
| needs: build-native |
| name: Verify TPC-H Results |
| runs-on: ubuntu-latest |
| container: |
| image: amd64/rust |
| steps: |
| - uses: actions/checkout@v6 |
| |
| - name: Setup Rust & Java toolchain |
| uses: ./.github/actions/setup-builder |
| with: |
| rust-version: ${{ env.RUST_VERSION }} |
| jdk-version: 11 |
| |
| - name: Download native library |
| uses: actions/download-artifact@v7 |
| with: |
| name: native-lib-linux |
| path: native/target/release/ |
| |
| - name: Cache Maven dependencies |
| uses: actions/cache@v5 |
| with: |
| path: | |
| ~/.m2/repository |
| /root/.m2/repository |
| key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} |
| restore-keys: | |
| ${{ runner.os }}-java-maven- |
| |
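| # Keyed on this workflow file so the dataset is regenerated whenever the |
| # generation parameters defined here change |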
| - name: Cache TPC-H data |
| id: cache-tpch |
| uses: actions/cache@v5 |
| with: |
| path: ./tpch |
| key: tpch-${{ hashFiles('.github/workflows/pr_build_linux.yml') }} |
| |
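| # -Prelease points Maven at native/target/release, where the prebuilt |
| # libcomet.so was downloaded above |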
| - name: Build project |
| run: | |
| ./mvnw -B -Prelease install -DskipTests |
| |
| - name: Generate TPC-H data (SF=1) |
| if: steps.cache-tpch.outputs.cache-hit != 'true' |
| run: | |
| cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw -B -Prelease exec:java \ |
| -Dexec.mainClass="org.apache.spark.sql.GenTPCHData" \ |
| -Dexec.classpathScope="test" \ |
| -Dexec.cleanupDaemonThreads="false" \ |
| -Dexec.args="--location `pwd`/.. --scaleFactor 1 --numPartitions 1 --overwrite" |
| |
| - name: Run TPC-H queries |
| run: | |
| SPARK_HOME=`pwd` SPARK_TPCH_DATA=`pwd`/tpch/sf1_parquet ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCHQuerySuite test |
| |
| # TPC-DS correctness tests - verify that benchmark queries produce correct results |
| verify-benchmark-results-tpcds: |
| needs: build-native |
| name: Verify TPC-DS Results (${{ matrix.join }}) |
| runs-on: ubuntu-latest |
| container: |
| image: amd64/rust |
| strategy: |
| matrix: |
| join: [sort_merge, broadcast, hash] |
| fail-fast: false |
| steps: |
| - uses: actions/checkout@v6 |
| |
| - name: Setup Rust & Java toolchain |
| uses: ./.github/actions/setup-builder |
| with: |
| rust-version: ${{ env.RUST_VERSION }} |
| jdk-version: 11 |
| |
| - name: Download native library |
| uses: actions/download-artifact@v7 |
| with: |
| name: native-lib-linux |
| path: native/target/release/ |
| |
| - name: Cache Maven dependencies |
| uses: actions/cache@v5 |
| with: |
| path: | |
| ~/.m2/repository |
| /root/.m2/repository |
| key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }} |
| restore-keys: | |
| ${{ runner.os }}-java-maven- |
| |
| - name: Cache TPC-DS data |
| id: cache-tpcds |
| uses: actions/cache@v5 |
| with: |
| path: ./tpcds-sf-1 |
| key: tpcds-${{ hashFiles('.github/workflows/pr_build_linux.yml') }} |
| |
| - name: Build project |
| run: | |
| ./mvnw -B -Prelease install -DskipTests |
| |
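| # dsdgen, the TPC-DS data generator, is built from databricks/tpcds-kit and |
| # handed to GenTPCDSData via --dsdgenDir |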
| - name: Checkout tpcds-kit |
| if: steps.cache-tpcds.outputs.cache-hit != 'true' |
| uses: actions/checkout@v6 |
| with: |
| repository: databricks/tpcds-kit |
| path: ./tpcds-kit |
| |
| - name: Build tpcds-kit |
| if: steps.cache-tpcds.outputs.cache-hit != 'true' |
| run: | |
| apt-get update && apt-get install -y yacc bison flex gcc-12 g++-12 |
| update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 120 --slave /usr/bin/g++ g++ /usr/bin/g++-12 |
| cd tpcds-kit/tools && make OS=LINUX |
| |
| - name: Generate TPC-DS data (SF=1) |
| if: steps.cache-tpcds.outputs.cache-hit != 'true' |
| run: | |
| cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw -B -Prelease exec:java \ |
| -Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" \ |
| -Dexec.classpathScope="test" \ |
| -Dexec.cleanupDaemonThreads="false" \ |
| -Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location `pwd`/../tpcds-sf-1 --scaleFactor 1 --numPartitions 1" |
| |
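| # Each matrix entry pins one join strategy via SPARK_TPCDS_JOIN_CONF: |
| # -1 disables broadcast joins, 10485760 (10 MiB) is the default broadcast |
| # threshold, and the prefer/force flags select sort-merge vs. shuffled hash. |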
| - name: Run TPC-DS queries (Sort merge join) |
| if: matrix.join == 'sort_merge' |
| run: | |
| SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test |
| env: |
| SPARK_TPCDS_JOIN_CONF: | |
| spark.sql.autoBroadcastJoinThreshold=-1 |
| spark.sql.join.preferSortMergeJoin=true |
| |
| - name: Run TPC-DS queries (Broadcast hash join) |
| if: matrix.join == 'broadcast' |
| run: | |
| SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test |
| env: |
| SPARK_TPCDS_JOIN_CONF: | |
| spark.sql.autoBroadcastJoinThreshold=10485760 |
| |
| - name: Run TPC-DS queries (Shuffled hash join) |
| if: matrix.join == 'hash' |
| run: | |
| SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B -Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test |
| env: |
| SPARK_TPCDS_JOIN_CONF: | |
| spark.sql.autoBroadcastJoinThreshold=-1 |
| spark.sql.join.forceApplyShuffledHashJoin=true |