blob: 3a82c2eff8435781ac70fa041032c85ab0fc61b4 [file]
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# verify-ansi-expressions.sh — Verify ANSI expressions by expression-matrix category
#
# Usage:
# cd /root/SourceCode/gluten
# bash dev/verify-ansi-expressions.sh <category> [spark41|spark40|all] [--clean]
#
# category:
# cast — Cast + try_cast
# arithmetic — Arithmetic + Abs/UnaryMinus + try arithmetic
# collection — Collection + try_element_at
# datetime — DateTime/Interval + try_to_timestamp etc.
# math — Math (Round/BRound/conv)
# decimal — Decimal (CheckOverflow)
# string — String + try_parse_url
# aggregate — Aggregate + indirect (Sum/Avg/VAR/STDDEV, needs manual review)
# errors — QueryExecutionAnsiErrorsSuite
# all — All of the above (assembled into a single JVM execution)
#
# spark version (default spark41):
# spark41 — Spark 4.1
# spark40 — Spark 4.0
# all — spark41 first, then spark40
#
set -uo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
export SPARK_ANSI_SQL_MODE=true
export SPARK_TESTING=true
CATEGORY="${1:?Usage: $0 <category> [spark41|spark40|all] [--clean]}"
SPARK_VER="${2:-spark41}"
CLEAN_FLAG=""
if [[ "${3:-}" == "--clean" ]] || [[ "${2:-}" == "--clean" ]]; then
CLEAN_FLAG="--clean"
# if --clean was $2, default spark version
if [[ "${2:-}" == "--clean" ]]; then
SPARK_VER="spark41"
fi
fi
case "${SPARK_VER}" in
spark41) PROFILES="-Pjava-17,spark-4.1,scala-2.13,backends-velox,hadoop-3.3"; UT_MODULE="gluten-ut/spark41" ;;
spark40) PROFILES="-Pjava-17,spark-4.0,scala-2.13,backends-velox,hadoop-3.3"; UT_MODULE="gluten-ut/spark40" ;;
all) ;; # handled in main entry
*) echo "Unknown spark version: ${SPARK_VER}"; echo "Usage: $0 <category> [spark41|spark40|all] [--clean]"; exit 1 ;;
esac
ANSI_ARG="--jvm-arg -Dspark.gluten.sql.ansiFallback.enabled=false"
LOG_TS="$(date '+%Y%m%d_%H%M%S')"
LOG_DIR="/tmp/ansi-matrix/${LOG_TS}"
mkdir -p "${LOG_DIR}"
# Symlink latest run for easy access
ln -sfn "${LOG_DIR}" "/tmp/ansi-matrix/latest"
# ── Suite definitions ──────────────────────────────────────────────
# Suite mapping by expression-matrix category
# Cast + try_cast
CAST_UT=(
-s org.apache.spark.sql.catalyst.expressions.GlutenCastWithAnsiOnSuite
-s org.apache.spark.sql.catalyst.expressions.GlutenCastWithAnsiOffSuite
-s org.apache.spark.sql.catalyst.expressions.GlutenTryCastSuite
)
CAST_BACKENDS=(
-s org.apache.spark.sql.catalyst.expressions.VeloxCastSuite
)
# Arithmetic + Abs/UnaryMinus + try arithmetic
ARITHMETIC_UT=(
-s org.apache.spark.sql.catalyst.expressions.GlutenArithmeticExpressionSuite
-s org.apache.spark.sql.catalyst.expressions.GlutenTryEvalSuite
)
ARITHMETIC_BACKENDS=(
-s org.apache.gluten.functions.ArithmeticAnsiValidateSuite
-s org.apache.gluten.functions.MathFunctionsValidateSuiteAnsiOn
)
# Collection + try_element_at
COLLECTION_UT=(
-s org.apache.spark.sql.catalyst.expressions.GlutenCollectionExpressionsSuite
)
# DateTime/Interval + try_to_timestamp etc.
DATETIME_UT=(
-s org.apache.spark.sql.catalyst.expressions.GlutenDateExpressionsSuite
-s org.apache.spark.sql.catalyst.expressions.GlutenIntervalExpressionsSuite
-s org.apache.spark.sql.GlutenDateFunctionsSuite
)
# Math
MATH_UT=(
-s org.apache.spark.sql.catalyst.expressions.GlutenMathExpressionsSuite
)
# §3.2.4 Decimal
DECIMAL_UT=(
-s org.apache.spark.sql.catalyst.expressions.GlutenDecimalExpressionSuite
)
# String + try_parse_url
STRING_UT=(
-s org.apache.spark.sql.catalyst.expressions.GlutenStringExpressionsSuite
-s org.apache.spark.sql.GlutenUrlFunctionsSuite
)
# Aggregate + indirect (VAR/STDDEV) — needs manual review
AGGREGATE_UT=(
-s org.apache.spark.sql.GlutenDataFrameAggregateSuite
)
# ANSI error semantics
ERRORS_UT=(
-s org.apache.spark.sql.errors.GlutenQueryExecutionAnsiErrorsSuite
)
# ── Run function ──────────────────────────────────────────────
run_single() {
local label="$1"
local module="$2"
local profiles="$3"
shift 3
local log="${LOG_DIR}/${label}-${SPARK_VER}.log"
echo ""
echo "=== ${label}: ${module} (${SPARK_VER}) ==="
./dev/run-scala-test.sh --mvnd \
${CLEAN_FLAG} \
${ANSI_ARG} \
${profiles} \
-pl "${module}" \
"$@" \
2>&1 | tee "${log}"
# Only clean on first run
CLEAN_FLAG=""
}
# ── Collect suites for a category ──────────────────────────
get_ut_suites() {
local cat="$1"
case "${cat}" in
cast) echo "${CAST_UT[*]}" ;;
arithmetic) echo "${ARITHMETIC_UT[*]}" ;;
collection) echo "${COLLECTION_UT[*]}" ;;
datetime) echo "${DATETIME_UT[*]}" ;;
math) echo "${MATH_UT[*]}" ;;
decimal) echo "${DECIMAL_UT[*]}" ;;
string) echo "${STRING_UT[*]}" ;;
aggregate) echo "${AGGREGATE_UT[*]}" ;;
errors) echo "${ERRORS_UT[*]}" ;;
esac
}
get_backends_suites() {
local cat="$1"
case "${cat}" in
cast) echo "${CAST_BACKENDS[*]}" ;;
arithmetic) echo "${ARITHMETIC_BACKENDS[*]}" ;;
*) echo "" ;;
esac
}
ALL_CATEGORIES=(cast arithmetic collection datetime math decimal string aggregate errors)
# ── Category execution ──────────────────────────────────────────────
run_category_single() {
local cat="$1"
local ut_suites
read -ra ut_suites <<< "$(get_ut_suites "${cat}")"
if [[ ${#ut_suites[@]} -gt 0 ]]; then
run_single "${cat}-ut" "${UT_MODULE}" "${PROFILES},spark-ut" "${ut_suites[@]}"
fi
local backends_suites
read -ra backends_suites <<< "$(get_backends_suites "${cat}")"
if [[ ${#backends_suites[@]} -gt 0 ]]; then
run_single "${cat}-backends" "backends-velox" "${PROFILES}" "${backends_suites[@]}"
fi
}
run_all() {
# Assemble all UT suites into one invocation
local all_ut_suites=()
for cat in "${ALL_CATEGORIES[@]}"; do
local suites
read -ra suites <<< "$(get_ut_suites "${cat}")"
all_ut_suites+=("${suites[@]}")
done
echo ""
echo "=== ALL UT suites (single JVM, ${#all_ut_suites[@]} -s args) ==="
run_single "all-ut" "${UT_MODULE}" "${PROFILES},spark-ut" "${all_ut_suites[@]}"
# Assemble all backends suites into one invocation
local all_backends_suites=()
for cat in "${ALL_CATEGORIES[@]}"; do
local suites
read -ra suites <<< "$(get_backends_suites "${cat}")"
if [[ ${#suites[@]} -gt 0 && -n "${suites[0]}" ]]; then
all_backends_suites+=("${suites[@]}")
fi
done
if [[ ${#all_backends_suites[@]} -gt 0 ]]; then
echo ""
echo "=== ALL backends suites (single JVM, ${#all_backends_suites[@]} -s args) ==="
run_single "all-backends" "backends-velox" "${PROFILES}" "${all_backends_suites[@]}"
fi
}
# ── Main entry ──────────────────────────────────────────────
run_for_spark_ver() {
case "${CATEGORY}" in
all) run_all ;;
*) run_category_single "${CATEGORY}" ;;
esac
}
echo "========================================"
echo "ANSI Expression Matrix Verification"
echo "Date: $(date '+%Y-%m-%d %H:%M:%S')"
echo "Category: ${CATEGORY}"
echo "Spark: ${SPARK_VER}"
echo "SPARK_ANSI_SQL_MODE=${SPARK_ANSI_SQL_MODE}"
echo "SPARK_TESTING=${SPARK_TESTING}"
echo "ansiFallback=false"
echo "Logs: ${LOG_DIR}/"
echo "========================================"
if [[ "${SPARK_VER}" == "all" ]]; then
# Run spark41 first, then spark40
SPARK_VER="spark41"
PROFILES="-Pjava-17,spark-4.1,scala-2.13,backends-velox,hadoop-3.3"
UT_MODULE="gluten-ut/spark41"
run_for_spark_ver
SPARK_VER="spark40"
PROFILES="-Pjava-17,spark-4.0,scala-2.13,backends-velox,hadoop-3.3"
UT_MODULE="gluten-ut/spark40"
CLEAN_FLAG="--clean"
run_for_spark_ver
else
run_for_spark_ver
fi
echo ""
echo "========================================"
echo "Verification Complete — ${CATEGORY}"
echo "Logs: ${LOG_DIR}/"
echo "========================================"