python/run-tests - spark - Git at Google

 #!/usr/bin/env bash

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #


 # Figure out where the Spark framework is installed
 FWDIR="$(cd "`dirname "$0"`"; cd ../; pwd)"

 . "$FWDIR"/bin/load-spark-env.sh

 # CD into the python directory to find things on the right path
 cd "$FWDIR/python"

 FAILED=0
 LOG_FILE=unit-tests.log
 START=$(date +"%s")

 rm -f $LOG_FILE

 # Remove the metastore and warehouse directory created by the HiveContext tests in Spark SQL
 rm -rf metastore warehouse

 function run_test() {
     echo -en "Running test: $1 ... " | tee -a $LOG_FILE
     start=$(date +"%s")
     SPARK_TESTING=1 time "$FWDIR"/bin/pyspark $1 > $LOG_FILE 2>&1

     FAILED=$((PIPESTATUS[0]||$FAILED))

     # Fail and exit on the first test failure.
     if [[ $FAILED != 0 ]]; then
         cat $LOG_FILE | grep -v "^[0-9][0-9]*" # filter all lines starting with a number.
         echo -en "\033[31m"  # Red
         echo "Had test failures; see logs."
         echo -en "\033[0m"  # No color
         exit -1
     else
         now=$(date +"%s")
         echo "ok ($(($now - $start))s)"
     fi
 }

 function run_core_tests() {
     echo "Run core tests ..."
     run_test "pyspark.rdd"
     run_test "pyspark.context"
     run_test "pyspark.conf"
     run_test "pyspark.broadcast"
     run_test "pyspark.accumulators"
     run_test "pyspark.serializers"
     run_test "pyspark.profiler"
     run_test "pyspark.shuffle"
     run_test "pyspark.tests"
 }

 function run_sql_tests() {
     echo "Run sql tests ..."
     run_test "pyspark.sql.types"
     run_test "pyspark.sql.context"
     run_test "pyspark.sql.column"
     run_test "pyspark.sql.dataframe"
     run_test "pyspark.sql.group"
     run_test "pyspark.sql.functions"
     run_test "pyspark.sql.readwriter"
     run_test "pyspark.sql.window"
     run_test "pyspark.sql.tests"
 }

 function run_mllib_tests() {
     echo "Run mllib tests ..."
     run_test "pyspark.mllib.classification"
     run_test "pyspark.mllib.clustering"
     run_test "pyspark.mllib.evaluation"
     run_test "pyspark.mllib.feature"
     run_test "pyspark.mllib.fpm"
     run_test "pyspark.mllib.linalg"
     run_test "pyspark.mllib.random"
     run_test "pyspark.mllib.recommendation"
     run_test "pyspark.mllib.regression"
     run_test "pyspark.mllib.stat._statistics"
     run_test "pyspark.mllib.tree"
     run_test "pyspark.mllib.util"
     run_test "pyspark.mllib.tests"
 }

 function run_ml_tests() {
     echo "Run ml tests ..."
     run_test "pyspark.ml.feature"
     run_test "pyspark.ml.classification"
     run_test "pyspark.ml.recommendation"
     run_test "pyspark.ml.regression"
     run_test "pyspark.ml.tuning"
     run_test "pyspark.ml.tests"
     run_test "pyspark.ml.evaluation"
 }

 function run_streaming_tests() {
     echo "Run streaming tests ..."

     KAFKA_ASSEMBLY_DIR="$FWDIR"/external/kafka-assembly
     JAR_PATH="${KAFKA_ASSEMBLY_DIR}/target/scala-${SPARK_SCALA_VERSION}"
     for f in "${JAR_PATH}"/spark-streaming-kafka-assembly-*.jar; do
       if [[ ! -e "$f" ]]; then
         echo "Failed to find Spark Streaming Kafka assembly jar in $KAFKA_ASSEMBLY_DIR" 1>&2
         echo "You need to build Spark with " \
              "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or" \
              "'build/mvn package' before running this program" 1>&2
         exit 1
       fi
       KAFKA_ASSEMBLY_JAR="$f"
     done

     export PYSPARK_SUBMIT_ARGS="--jars ${KAFKA_ASSEMBLY_JAR} pyspark-shell"
     run_test "pyspark.streaming.util"
     run_test "pyspark.streaming.tests"
 }

 echo "Running PySpark tests. Output is in python/$LOG_FILE."

 export PYSPARK_PYTHON="python"

 # Try to test with Python 2.6, since that's the minimum version that we support:
 if [ $(which python2.6) ]; then
     export PYSPARK_PYTHON="python2.6"
 fi

 echo "Testing with Python version:"
 $PYSPARK_PYTHON --version

 run_core_tests
 run_sql_tests
 run_mllib_tests
 run_ml_tests
 run_streaming_tests

 # Try to test with Python 3
 if [ $(which python3.4) ]; then
     export PYSPARK_PYTHON="python3.4"
     echo "Testing with Python3.4 version:"
     $PYSPARK_PYTHON --version

     run_core_tests
     run_sql_tests
     run_mllib_tests
     run_ml_tests
     run_streaming_tests
 fi

 # Try to test with PyPy
 if [ $(which pypy) ]; then
     export PYSPARK_PYTHON="pypy"
     echo "Testing with PyPy version:"
     $PYSPARK_PYTHON --version

     run_core_tests
     run_sql_tests
     run_streaming_tests
 fi

 if [[ $FAILED == 0 ]]; then
     now=$(date +"%s")
     echo -e "\033[32mTests passed \033[0min $(($now - $START)) seconds"
 fi

 # TODO: in the long-run, it would be nice to use a test runner like `nose`.
 # The doctest fixtures are the current barrier to doing this.
	#!/usr/bin/env bash

	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#


	# Figure out where the Spark framework is installed
	FWDIR="$(cd "`dirname "$0"`"; cd ../; pwd)"

	. "$FWDIR"/bin/load-spark-env.sh

	# CD into the python directory to find things on the right path
	cd "$FWDIR/python"

	FAILED=0
	LOG_FILE=unit-tests.log
	START=$(date +"%s")

	rm -f $LOG_FILE

	# Remove the metastore and warehouse directory created by the HiveContext tests in Spark SQL
	rm -rf metastore warehouse

	function run_test() {
	echo -en "Running test: $1 ... " \| tee -a $LOG_FILE
	start=$(date +"%s")
	SPARK_TESTING=1 time "$FWDIR"/bin/pyspark $1 > $LOG_FILE 2>&1

	FAILED=$((PIPESTATUS[0]\|\|$FAILED))

	# Fail and exit on the first test failure.
	if [[ $FAILED != 0 ]]; then
	cat $LOG_FILE \| grep -v "^[0-9][0-9]*" # filter all lines starting with a number.
	echo -en "\033[31m" # Red
	echo "Had test failures; see logs."
	echo -en "\033[0m" # No color
	exit -1
	else
	now=$(date +"%s")
	echo "ok ($(($now - $start))s)"
	fi
	}

	function run_core_tests() {
	echo "Run core tests ..."
	run_test "pyspark.rdd"
	run_test "pyspark.context"
	run_test "pyspark.conf"
	run_test "pyspark.broadcast"
	run_test "pyspark.accumulators"
	run_test "pyspark.serializers"
	run_test "pyspark.profiler"
	run_test "pyspark.shuffle"
	run_test "pyspark.tests"
	}

	function run_sql_tests() {
	echo "Run sql tests ..."
	run_test "pyspark.sql.types"
	run_test "pyspark.sql.context"
	run_test "pyspark.sql.column"
	run_test "pyspark.sql.dataframe"
	run_test "pyspark.sql.group"
	run_test "pyspark.sql.functions"
	run_test "pyspark.sql.readwriter"
	run_test "pyspark.sql.window"
	run_test "pyspark.sql.tests"
	}

	function run_mllib_tests() {
	echo "Run mllib tests ..."
	run_test "pyspark.mllib.classification"
	run_test "pyspark.mllib.clustering"
	run_test "pyspark.mllib.evaluation"
	run_test "pyspark.mllib.feature"
	run_test "pyspark.mllib.fpm"
	run_test "pyspark.mllib.linalg"
	run_test "pyspark.mllib.random"
	run_test "pyspark.mllib.recommendation"
	run_test "pyspark.mllib.regression"
	run_test "pyspark.mllib.stat._statistics"
	run_test "pyspark.mllib.tree"
	run_test "pyspark.mllib.util"
	run_test "pyspark.mllib.tests"
	}

	function run_ml_tests() {
	echo "Run ml tests ..."
	run_test "pyspark.ml.feature"
	run_test "pyspark.ml.classification"
	run_test "pyspark.ml.recommendation"
	run_test "pyspark.ml.regression"
	run_test "pyspark.ml.tuning"
	run_test "pyspark.ml.tests"
	run_test "pyspark.ml.evaluation"
	}

	function run_streaming_tests() {
	echo "Run streaming tests ..."

	KAFKA_ASSEMBLY_DIR="$FWDIR"/external/kafka-assembly
	JAR_PATH="${KAFKA_ASSEMBLY_DIR}/target/scala-${SPARK_SCALA_VERSION}"
	for f in "${JAR_PATH}"/spark-streaming-kafka-assembly-*.jar; do
	if [[ ! -e "$f" ]]; then
	echo "Failed to find Spark Streaming Kafka assembly jar in $KAFKA_ASSEMBLY_DIR" 1>&2
	echo "You need to build Spark with " \
	"'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or" \
	"'build/mvn package' before running this program" 1>&2
	exit 1
	fi
	KAFKA_ASSEMBLY_JAR="$f"
	done

	export PYSPARK_SUBMIT_ARGS="--jars ${KAFKA_ASSEMBLY_JAR} pyspark-shell"
	run_test "pyspark.streaming.util"
	run_test "pyspark.streaming.tests"
	}

	echo "Running PySpark tests. Output is in python/$LOG_FILE."

	export PYSPARK_PYTHON="python"

	# Try to test with Python 2.6, since that's the minimum version that we support:
	if [ $(which python2.6) ]; then
	export PYSPARK_PYTHON="python2.6"
	fi

	echo "Testing with Python version:"
	$PYSPARK_PYTHON --version

	run_core_tests
	run_sql_tests
	run_mllib_tests
	run_ml_tests
	run_streaming_tests

	# Try to test with Python 3
	if [ $(which python3.4) ]; then
	export PYSPARK_PYTHON="python3.4"
	echo "Testing with Python3.4 version:"
	$PYSPARK_PYTHON --version

	run_core_tests
	run_sql_tests
	run_mllib_tests
	run_ml_tests
	run_streaming_tests
	fi

	# Try to test with PyPy
	if [ $(which pypy) ]; then
	export PYSPARK_PYTHON="pypy"
	echo "Testing with PyPy version:"
	$PYSPARK_PYTHON --version

	run_core_tests
	run_sql_tests
	run_streaming_tests
	fi

	if [[ $FAILED == 0 ]]; then
	now=$(date +"%s")
	echo -e "\033[32mTests passed \033[0min $(($now - $START)) seconds"
	fi

	# TODO: in the long-run, it would be nice to use a test runner like `nose`.
	# The doctest fixtures are the current barrier to doing this.