#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
###################################
# Idempotent setup for Spark regression tests. Run manually or let run.sh auto-run.
#
# Warning - the first-time setup may download a large amount of data
# Warning - may clobber conf/spark-defaults.conf
# Warning - it sets the SPARK_HOME environment variable to the Spark installation it prepares
#
# The script can be called independently as follows:
# ./setup.sh --sparkVersion ${SPARK_VERSION} --scalaVersion ${SCALA_VERSION} --jar ${JAR_PATH}
# Parameters:
# --sparkVersion : the Spark version to set up (required)
# --scalaVersion : the Scala version of the Spark build to set up (required)
# --jar : path to a local Polaris Spark client jar
# --polarisVersion : Polaris release to resolve via spark.jars.packages when --jar is not provided
# One of --jar or --polarisVersion must be provided.
#
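# Example (version numbers match the defaults below; the jar path is illustrative):
# ./setup.sh --sparkVersion 3.5.6 --scalaVersion 2.12 --jar /path/to/polaris-spark-client.jar
#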
set -x
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SPARK_VERSION=3.5.6
SCALA_VERSION=2.12
POLARIS_CLIENT_JAR=""
POLARIS_VERSION=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --sparkVersion)
      SPARK_VERSION="$2"
      shift # past argument
      shift # past value
      ;;
    --scalaVersion)
      SCALA_VERSION="$2"
      shift # past argument
      shift # past value
      ;;
    --polarisVersion)
      POLARIS_VERSION="$2"
      shift # past argument
      shift # past value
      ;;
    --jar)
      POLARIS_CLIENT_JAR="$2"
      shift # past argument
      shift # past value
      ;;
    --)
      shift
      break
      ;;
    *)
      echo "Unknown option: $1"
      exit 1
      ;;
  esac
done
echo "SET UP FOR SPARK_VERSION=${SPARK_VERSION} SCALA_VERSION=${SCALA_VERSION} POLARIS_VERSION=${POLARIS_VERSION} POLARIS_CLIENT_JAR=${POLARIS_CLIENT_JAR}"
if [ "$SCALA_VERSION" == "2.12" ]; then
SPARK_DISTRIBUTION=spark-${SPARK_VERSION}-bin-hadoop3
else
SPARK_DISTRIBUTION=spark-${SPARK_VERSION}-bin-hadoop3-scala${SCALA_VERSION}
fi
echo "Getting spark distribution ${SPARK_DISTRIBUTION}"
if [ -z "${SPARK_HOME}" ]; then
SPARK_HOME=$(realpath ~/${SPARK_DISTRIBUTION})
fi
SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf"
DERBY_HOME="/tmp/derby"
# Ensure binaries are downloaded locally
echo 'Verifying Spark binaries...'
if ! [ -f ${SPARK_HOME}/bin/spark-sql ]; then
  echo 'Setting up Spark...'
  if [ -z "${SPARK_VERSION}" ] || [ -z "${SPARK_DISTRIBUTION}" ]; then
    echo 'SPARK_VERSION or SPARK_DISTRIBUTION not set. Please set SPARK_VERSION and SPARK_DISTRIBUTION to the desired version.'
    exit 1
  fi
  if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then
    echo 'Downloading spark distro...'
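    # closer.lua with action=download redirects to a nearby Apache mirror for the requested file.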
    wget -O ~/${SPARK_DISTRIBUTION}.tgz https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz?action=download
    if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then
      if [[ "${OSTYPE}" == "darwin"* ]]; then
        echo "Detected OS: mac. Running 'brew install wget' to try again."
        brew install wget
        wget -O ~/${SPARK_DISTRIBUTION}.tgz https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz?action=download
      fi
    fi
  else
    echo 'Found existing Spark tarball'
  fi
  # check if the download was successful
  if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then
    echo 'Failed to download Spark distribution. Please check the logs.'
    exit 1
  fi
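  # TEST_ROOT_DIR is not set by this script; when the calling environment leaves it empty, the
  # distribution is extracted directly into the home directory.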
  tar xzvf ~/${SPARK_DISTRIBUTION}.tgz -C ~/${TEST_ROOT_DIR}
  if [ $? -ne 0 ]; then
    echo 'Failed to extract Spark distribution. Please check the logs.'
    exit 1
  else
    echo 'Extracted Spark distribution.'
    rm ~/${SPARK_DISTRIBUTION}.tgz
  fi
  SPARK_HOME=$(realpath ~/${SPARK_DISTRIBUTION})
  SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf"
else
  echo 'Verified Spark distro already installed.'
fi
echo "SPARK_HOME=${SPARK_HOME}"
echo "SPARK_CONF=${SPARK_CONF}"
# Ensure Spark boilerplate conf is set
echo 'Verifying Spark conf...'
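# The generated conf below carries a POLARIS_TESTCONF_V5 marker line, so a re-run finds it and skips regeneration.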
if grep 'POLARIS_TESTCONF_V5' ${SPARK_CONF} 2>/dev/null; then
  echo 'Verified spark conf'
else
  echo 'Setting spark conf...'
  # Instead of clobbering existing spark conf, just comment it all out in case it was customized carefully.
  # A fresh Spark distribution ships only spark-defaults.conf.template, so the conf file may not exist yet.
  touch ${SPARK_CONF}
  sed -i.bak 's/^/# /' ${SPARK_CONF} && rm -f ${SPARK_CONF}.bak
  # If POLARIS_CLIENT_JAR is provided, set the spark conf to use the jars configuration.
  # Otherwise use the packages setting.
  if [[ -z "$POLARIS_CLIENT_JAR" ]]; then
    cat << EOF >> ${SPARK_CONF}
# POLARIS Spark client test conf
spark.jars.packages org.apache.polaris:polaris-spark-3.5_$SCALA_VERSION:$POLARIS_VERSION,io.delta:delta-spark_${SCALA_VERSION}:3.2.1
EOF
  else
    cat << EOF >> ${SPARK_CONF}
# POLARIS Spark client test conf
spark.jars $POLARIS_CLIENT_JAR
spark.jars.packages io.delta:delta-spark_${SCALA_VERSION}:3.2.1
EOF
  fi
  cat << EOF >> ${SPARK_CONF}
# POLARIS_TESTCONF_V5
spark.sql.variable.substitute true
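# keep Derby (used by Spark's embedded Hive metastore) in a disposable location so each run starts with a clean metastore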
spark.driver.extraJavaOptions -Dderby.system.home=${DERBY_HOME}
spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension
# this configuration is needed for Delta tables
spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
spark.sql.catalog.polaris=org.apache.polaris.spark.SparkCatalog
spark.sql.catalog.polaris.uri=http://${POLARIS_HOST:-localhost}:8181/api/catalog
# this configuration is currently only used for Iceberg tables; generic tables do not
# support credential vending yet
spark.sql.catalog.polaris.header.X-Iceberg-Access-Delegation=vended-credentials
spark.sql.catalog.polaris.client.region=us-west-2
# configuration required to ensure DataSourceV2 load works correctly for
# different table formats
spark.sql.sources.useV1SourceList=''
EOF
  echo 'Success!'
fi
# clean up the Derby home directory if it exists
if [ -d "${DERBY_HOME}" ]; then
  echo "Directory ${DERBY_HOME} exists. Deleting it..."
  rm -rf "${DERBY_HOME}"
fi
echo "Launch spark-sql at ${SPARK_HOME}/bin/spark-sql"
# bootstrap dependencies so that future queries don't need to wait for the downloads.
# this is mostly useful for building the Docker image with all needed dependencies
${SPARK_HOME}/bin/spark-sql -e "SELECT 1"
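# Example of targeting a specific Polaris catalog when launching spark-sql manually
# (the catalog name is illustrative; authentication settings are omitted):
#   ${SPARK_HOME}/bin/spark-sql --conf spark.sql.catalog.polaris.warehouse=my_catalog -e "SHOW NAMESPACES IN polaris"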
# ensure SPARK_HOME is set up for later tests
export SPARK_HOME=$SPARK_HOME