blob: f689425ee40b6b54e2971de249e20b8e1114b0fc [file] [log] [blame]
#!/usr/bin/env python2
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
import itertools
from optparse import OptionParser
import os
import re
import sys
import subprocess
from collections import namedtuple
from sparktestsupport import SPARK_HOME, USER_HOME
from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
import sparktestsupport.modules as modules
# -------------------------------------------------------------------------------------------------
# Functions for traversing module dependency graph
# -------------------------------------------------------------------------------------------------
def determine_modules_for_files(filenames):
"""
Given a list of filenames, return the set of modules that contain those files.
If a file is not associated with a more specific submodule, then this method will consider that
file to belong to the 'root' module.
>>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"]))
['pyspark-core', 'sql']
>>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
['root']
"""
changed_modules = set()
for filename in filenames:
matched_at_least_one_module = False
for module in modules.all_modules:
if module.contains_file(filename):
changed_modules.add(module)
matched_at_least_one_module = True
if not matched_at_least_one_module:
changed_modules.add(modules.root)
return changed_modules
def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None):
"""
Given a git commit and target ref, use the set of files changed in the diff in order to
determine which modules' tests should be run.
>>> [x.name for x in determine_modules_for_files( \
identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))]
['graphx']
>>> 'root' in [x.name for x in determine_modules_for_files( \
identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
True
"""
if target_branch is None and target_ref is None:
raise AttributeError("must specify either target_branch or target_ref")
elif target_branch is not None and target_ref is not None:
raise AttributeError("must specify either target_branch or target_ref, not both")
if target_branch is not None:
diff_target = target_branch
run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
else:
diff_target = target_ref
raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
universal_newlines=True)
# Remove any empty strings
return [f for f in raw_output.split('\n') if f]
def setup_test_environ(environ):
print("[info] Setup the following environment variables for tests: ")
for (k, v) in environ.items():
print("%s=%s" % (k, v))
os.environ[k] = v
def determine_modules_to_test(changed_modules):
"""
Given a set of modules that have changed, compute the transitive closure of those modules'
dependent modules in order to determine the set of modules that should be tested.
>>> sorted(x.name for x in determine_modules_to_test([modules.root]))
['root']
>>> sorted(x.name for x in determine_modules_to_test([modules.graphx]))
['examples', 'graphx']
>>> x = sorted(x.name for x in determine_modules_to_test([modules.sql]))
>>> x # doctest: +NORMALIZE_WHITESPACE
['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \
'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql']
"""
# If we're going to have to run all of the tests, then we can just short-circuit
# and return 'root'. No module depends on root, so if it appears then it will be
# in changed_modules.
if modules.root in changed_modules:
return [modules.root]
modules_to_test = set()
for module in changed_modules:
modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
return modules_to_test.union(set(changed_modules))
# -------------------------------------------------------------------------------------------------
# Functions for working with subprocesses and shell tools
# -------------------------------------------------------------------------------------------------
def get_error_codes(err_code_file):
"""Function to retrieve all block numbers from the `run-tests-codes.sh`
file to maintain backwards compatibility with the `run-tests-jenkins`
script"""
with open(err_code_file, 'r') as f:
err_codes = [e.split()[1].strip().split('=')
for e in f if e.startswith("readonly")]
return dict(err_codes)
ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh"))
def determine_java_executable():
"""Will return the path of the java executable that will be used by Spark's
tests or `None`"""
# Any changes in the way that Spark's build detects java must be reflected
# here. Currently the build looks for $JAVA_HOME/bin/java then falls back to
# the `java` executable on the path
java_home = os.environ.get("JAVA_HOME")
# check if there is an executable at $JAVA_HOME/bin/java
java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None
# if the java_exe wasn't set, check for a `java` version on the $PATH
return java_exe if java_exe else which("java")
JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update'])
def determine_java_version(java_exe):
"""Given a valid java executable will return its version in named tuple format
with accessors '.major', '.minor', '.patch', '.update'"""
raw_output = subprocess.check_output([java_exe, "-version"],
stderr=subprocess.STDOUT,
universal_newlines=True)
raw_output_lines = raw_output.split('\n')
# find raw version string, eg 'java version "1.8.0_25"'
raw_version_str = next(x for x in raw_output_lines if " version " in x)
version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25'
version, update = version_str.split('_') # eg ['1.8.0', '25']
# map over the values and convert them to integers
version_info = [int(x) for x in version.split('.') + [update]]
return JavaVersion(major=version_info[0],
minor=version_info[1],
patch=version_info[2],
update=version_info[3])
# -------------------------------------------------------------------------------------------------
# Functions for running the other build and test scripts
# -------------------------------------------------------------------------------------------------
def set_title_and_block(title, err_block):
os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block]
line_str = '=' * 72
print('')
print(line_str)
print(title)
print(line_str)
def run_apache_rat_checks():
set_title_and_block("Running Apache RAT checks", "BLOCK_RAT")
run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")])
def run_scala_style_checks():
set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala")])
def run_python_style_checks():
set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE")
run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")])
def build_spark_documentation():
set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION")
os.environ["PRODUCTION"] = "1 jekyll build"
os.chdir(os.path.join(SPARK_HOME, "docs"))
jekyll_bin = which("jekyll")
if not jekyll_bin:
print("[error] Cannot find a version of `jekyll` on the system; please",
" install one and retry to build documentation.")
sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
else:
run_cmd([jekyll_bin, "build"])
os.chdir(SPARK_HOME)
def exec_maven(mvn_args=()):
"""Will call Maven in the current directory with the list of mvn_args passed
in and returns the subprocess for any further processing"""
run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args)
def exec_sbt(sbt_args=()):
"""Will call SBT in the current directory with the list of mvn_args passed
in and returns the subprocess for any further processing"""
sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args
sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" +
"^.*[warn].*Merging" + "|" +
"^.*[info].*Including")
# NOTE: echo "q" is needed because sbt on encountering a build file
# with failure (either resolution or compilation) prompts the user for
# input either q, r, etc to quit or retry. This echo is there to make it
# not block.
echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE)
sbt_proc = subprocess.Popen(sbt_cmd,
stdin=echo_proc.stdout,
stdout=subprocess.PIPE)
echo_proc.wait()
for line in iter(sbt_proc.stdout.readline, ''):
if not sbt_output_filter.match(line):
print(line, end='')
retcode = sbt_proc.wait()
if retcode > 0:
exit_from_command_with_retcode(sbt_cmd, retcode)
def get_hadoop_profiles(hadoop_version):
"""
For the given Hadoop version tag, return a list of SBT profile flags for
building and testing against that Hadoop version.
"""
sbt_maven_hadoop_profiles = {
"hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.2.1"],
"hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"],
"hadoop2.2": ["-Pyarn", "-Phadoop-2.2"],
"hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"],
"hadoop2.6": ["-Pyarn", "-Phadoop-2.6"],
}
if hadoop_version in sbt_maven_hadoop_profiles:
return sbt_maven_hadoop_profiles[hadoop_version]
else:
print("[error] Could not find", hadoop_version, "in the list. Valid options",
" are", sbt_maven_hadoop_profiles.keys())
sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
def build_spark_maven(hadoop_version):
# Enable all of the profiles for the build:
build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
mvn_goals = ["clean", "package", "-DskipTests"]
profiles_and_goals = build_profiles + mvn_goals
print("[info] Building Spark (w/Hive 1.2.1) using Maven with these arguments: ",
" ".join(profiles_and_goals))
exec_maven(profiles_and_goals)
def build_spark_sbt(hadoop_version):
# Enable all of the profiles for the build:
build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
sbt_goals = ["package",
"assembly/assembly",
"streaming-kafka-assembly/assembly",
"streaming-flume-assembly/assembly",
"streaming-mqtt-assembly/assembly",
"streaming-mqtt/test:assembly",
"streaming-kinesis-asl-assembly/assembly"]
profiles_and_goals = build_profiles + sbt_goals
print("[info] Building Spark (w/Hive 1.2.1) using SBT with these arguments: ",
" ".join(profiles_and_goals))
exec_sbt(profiles_and_goals)
def build_apache_spark(build_tool, hadoop_version):
"""Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or
`maven`). Defaults to using `sbt`."""
set_title_and_block("Building Spark", "BLOCK_BUILD")
rm_r("lib_managed")
if build_tool == "maven":
build_spark_maven(hadoop_version)
else:
build_spark_sbt(hadoop_version)
def detect_binary_inop_with_mima():
set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
run_cmd([os.path.join(SPARK_HOME, "dev", "mima")])
def run_scala_tests_maven(test_profiles):
mvn_test_goals = ["test", "--fail-at-end"]
profiles_and_goals = test_profiles + mvn_test_goals
print("[info] Running Spark tests using Maven with these arguments: ",
" ".join(profiles_and_goals))
exec_maven(profiles_and_goals)
def run_scala_tests_sbt(test_modules, test_profiles):
sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
if not sbt_test_goals:
return
profiles_and_goals = test_profiles + list(sbt_test_goals)
print("[info] Running Spark tests using SBT with these arguments: ",
" ".join(profiles_and_goals))
exec_sbt(profiles_and_goals)
def run_scala_tests(build_tool, hadoop_version, test_modules):
"""Function to properly execute all tests passed in as a set from the
`determine_test_suites` function"""
set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS")
test_modules = set(test_modules)
test_profiles = get_hadoop_profiles(hadoop_version) + \
list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules)))
if build_tool == "maven":
run_scala_tests_maven(test_profiles)
else:
run_scala_tests_sbt(test_modules, test_profiles)
def run_python_tests(test_modules, parallelism):
set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
command = [os.path.join(SPARK_HOME, "python", "run-tests")]
if test_modules != [modules.root]:
command.append("--modules=%s" % ','.join(m.name for m in test_modules))
command.append("--parallelism=%i" % parallelism)
run_cmd(command)
def run_sparkr_tests():
set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS")
if which("R"):
run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")])
else:
print("Ignoring SparkR tests as R was not found in PATH")
def parse_opts():
parser = OptionParser(
prog="run-tests"
)
parser.add_option(
"-p", "--parallelism", type="int", default=4,
help="The number of suites to test in parallel (default %default)"
)
(opts, args) = parser.parse_args()
if args:
parser.error("Unsupported arguments: %s" % ' '.join(args))
if opts.parallelism < 1:
parser.error("Parallelism cannot be less than 1")
return opts
def main():
opts = parse_opts()
# Ensure the user home directory (HOME) is valid and is an absolute directory
if not USER_HOME or not os.path.isabs(USER_HOME):
print("[error] Cannot determine your home directory as an absolute path;",
" ensure the $HOME environment variable is set properly.")
sys.exit(1)
os.chdir(SPARK_HOME)
rm_r(os.path.join(SPARK_HOME, "work"))
rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))
os.environ["CURRENT_BLOCK"] = ERROR_CODES["BLOCK_GENERAL"]
java_exe = determine_java_executable()
if not java_exe:
print("[error] Cannot find a version of `java` on the system; please",
" install one and retry.")
sys.exit(2)
java_version = determine_java_version(java_exe)
if java_version.minor < 8:
print("[warn] Java 8 tests will not run because JDK version is < 1.8.")
if os.environ.get("AMPLAB_JENKINS"):
# if we're on the Amplab Jenkins build servers setup variables
# to reflect the environment settings
build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3")
test_env = "amplab_jenkins"
# add path for Python3 in Jenkins if we're calling from a Jenkins machine
os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
else:
# else we're running locally and can use local settings
build_tool = "sbt"
hadoop_version = "hadoop2.3"
test_env = "local"
print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
"under environment", test_env)
changed_modules = None
changed_files = None
if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
target_branch = os.environ["ghprbTargetBranch"]
changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
changed_modules = determine_modules_for_files(changed_files)
if not changed_modules:
changed_modules = [modules.root]
print("[info] Found the following changed modules:",
", ".join(x.name for x in changed_modules))
# setup environment variables
# note - the 'root' module doesn't collect environment variables for all modules. Because the
# environment variables should not be set if a module is not changed, even if running the 'root'
# module. So here we should use changed_modules rather than test_modules.
test_environ = {}
for m in changed_modules:
test_environ.update(m.environ)
setup_test_environ(test_environ)
test_modules = determine_modules_to_test(changed_modules)
# license checks
run_apache_rat_checks()
# style checks
if not changed_files or any(f.endswith(".scala") for f in changed_files):
run_scala_style_checks()
if not changed_files or any(f.endswith(".py") for f in changed_files):
run_python_style_checks()
# determine if docs were changed and if we're inside the amplab environment
# note - the below commented out until *all* Jenkins workers can get `jekyll` installed
# if "DOCS" in changed_modules and test_env == "amplab_jenkins":
# build_spark_documentation()
# spark build
build_apache_spark(build_tool, hadoop_version)
# backwards compatibility checks
detect_binary_inop_with_mima()
# run the test suites
run_scala_tests(build_tool, hadoop_version, test_modules)
modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
if modules_with_python_tests:
run_python_tests(modules_with_python_tests, opts.parallelism)
if any(m.should_run_r_tests for m in test_modules):
run_sparkr_tests()
def _test():
import doctest
failure_count = doctest.testmod()[0]
if failure_count:
exit(-1)
if __name__ == "__main__":
_test()
main()