| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| import itertools |
| import re |
| |
| all_modules = [] |
| |
| |
| class Module(object): |
| """ |
| A module is the basic abstraction in our test runner script. Each module consists of a set of |
| source files, a set of test commands, and a set of dependencies on other modules. We use modules |
| to define a dependency graph that lets determine which tests to run based on which files have |
| changed. |
| """ |
| |
| def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={}, |
| sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(), |
| test_tags=(), should_run_r_tests=False, should_run_build_tests=False): |
| """ |
| Define a new module. |
| |
| :param name: A short module name, for display in logging and error messages. |
| :param dependencies: A set of dependencies for this module. This should only include direct |
| dependencies; transitive dependencies are resolved automatically. |
| :param source_file_regexes: a set of regexes that match source files belonging to this |
| module. These regexes are applied by attempting to match at the beginning of the |
| filename strings. |
| :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in |
| order to build and test this module (e.g. '-PprofileName'). |
| :param environ: A dict of environment variables that should be set when files in this |
| module are changed. |
| :param sbt_test_goals: A set of SBT test goals for testing this module. |
| :param python_test_goals: A set of Python test goals for testing this module. |
| :param blacklisted_python_implementations: A set of Python implementations that are not |
| supported by this module's Python components. The values in this set should match |
| strings returned by Python's `platform.python_implementation()`. |
| :param test_tags A set of tags that will be excluded when running unit tests if the module |
| is not explicitly changed. |
| :param should_run_r_tests: If true, changes in this module will trigger all R tests. |
| :param should_run_build_tests: If true, changes in this module will trigger build tests. |
| """ |
| self.name = name |
| self.dependencies = dependencies |
| self.source_file_prefixes = source_file_regexes |
| self.sbt_test_goals = sbt_test_goals |
| self.build_profile_flags = build_profile_flags |
| self.environ = environ |
| self.python_test_goals = python_test_goals |
| self.blacklisted_python_implementations = blacklisted_python_implementations |
| self.test_tags = test_tags |
| self.should_run_r_tests = should_run_r_tests |
| self.should_run_build_tests = should_run_build_tests |
| |
| self.dependent_modules = set() |
| for dep in dependencies: |
| dep.dependent_modules.add(self) |
| all_modules.append(self) |
| |
| def contains_file(self, filename): |
| return any(re.match(p, filename) for p in self.source_file_prefixes) |
| |
| |
| sql = Module( |
| name="sql", |
| dependencies=[], |
| source_file_regexes=[ |
| "sql/(?!hive-thriftserver)", |
| "bin/spark-sql", |
| ], |
| build_profile_flags=[ |
| "-Phive", |
| ], |
| sbt_test_goals=[ |
| "catalyst/test", |
| "sql/test", |
| "hive/test", |
| ], |
| test_tags=[ |
| "org.apache.spark.tags.ExtendedHiveTest" |
| ] |
| ) |
| |
| |
| hive_thriftserver = Module( |
| name="hive-thriftserver", |
| dependencies=[sql], |
| source_file_regexes=[ |
| "sql/hive-thriftserver", |
| "sbin/start-thriftserver.sh", |
| ], |
| build_profile_flags=[ |
| "-Phive-thriftserver", |
| ], |
| sbt_test_goals=[ |
| "hive-thriftserver/test", |
| ] |
| ) |
| |
| |
| graphx = Module( |
| name="graphx", |
| dependencies=[], |
| source_file_regexes=[ |
| "graphx/", |
| ], |
| sbt_test_goals=[ |
| "graphx/test" |
| ] |
| ) |
| |
| |
| streaming = Module( |
| name="streaming", |
| dependencies=[], |
| source_file_regexes=[ |
| "streaming", |
| ], |
| sbt_test_goals=[ |
| "streaming/test", |
| ] |
| ) |
| |
| |
| # Don't set the dependencies because changes in other modules should not trigger Kinesis tests. |
| # Kinesis tests depends on external Amazon kinesis service. We should run these tests only when |
| # files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't |
| # fail other PRs. |
| streaming_kinesis_asl = Module( |
| name="streaming-kinesis-asl", |
| dependencies=[], |
| source_file_regexes=[ |
| "extras/kinesis-asl/", |
| "extras/kinesis-asl-assembly/", |
| ], |
| build_profile_flags=[ |
| "-Pkinesis-asl", |
| ], |
| environ={ |
| "ENABLE_KINESIS_TESTS": "1" |
| }, |
| sbt_test_goals=[ |
| "streaming-kinesis-asl/test", |
| ] |
| ) |
| |
| |
| streaming_zeromq = Module( |
| name="streaming-zeromq", |
| dependencies=[streaming], |
| source_file_regexes=[ |
| "external/zeromq", |
| ], |
| sbt_test_goals=[ |
| "streaming-zeromq/test", |
| ] |
| ) |
| |
| |
| streaming_twitter = Module( |
| name="streaming-twitter", |
| dependencies=[streaming], |
| source_file_regexes=[ |
| "external/twitter", |
| ], |
| sbt_test_goals=[ |
| "streaming-twitter/test", |
| ] |
| ) |
| |
| |
| streaming_mqtt = Module( |
| name="streaming-mqtt", |
| dependencies=[streaming], |
| source_file_regexes=[ |
| "external/mqtt", |
| "external/mqtt-assembly", |
| ], |
| sbt_test_goals=[ |
| "streaming-mqtt/test", |
| ] |
| ) |
| |
| |
| streaming_kafka = Module( |
| name="streaming-kafka", |
| dependencies=[streaming], |
| source_file_regexes=[ |
| "external/kafka", |
| "external/kafka-assembly", |
| ], |
| sbt_test_goals=[ |
| "streaming-kafka/test", |
| ] |
| ) |
| |
| |
| streaming_flume_sink = Module( |
| name="streaming-flume-sink", |
| dependencies=[streaming], |
| source_file_regexes=[ |
| "external/flume-sink", |
| ], |
| sbt_test_goals=[ |
| "streaming-flume-sink/test", |
| ] |
| ) |
| |
| |
| streaming_flume = Module( |
| name="streaming-flume", |
| dependencies=[streaming], |
| source_file_regexes=[ |
| "external/flume", |
| ], |
| sbt_test_goals=[ |
| "streaming-flume/test", |
| ] |
| ) |
| |
| |
| streaming_flume_assembly = Module( |
| name="streaming-flume-assembly", |
| dependencies=[streaming_flume, streaming_flume_sink], |
| source_file_regexes=[ |
| "external/flume-assembly", |
| ] |
| ) |
| |
| |
| mllib = Module( |
| name="mllib", |
| dependencies=[streaming, sql], |
| source_file_regexes=[ |
| "data/mllib/", |
| "mllib/", |
| ], |
| sbt_test_goals=[ |
| "mllib/test", |
| ] |
| ) |
| |
| |
| examples = Module( |
| name="examples", |
| dependencies=[graphx, mllib, streaming, sql], |
| source_file_regexes=[ |
| "examples/", |
| ], |
| sbt_test_goals=[ |
| "examples/test", |
| ] |
| ) |
| |
| |
| pyspark_core = Module( |
| name="pyspark-core", |
| dependencies=[], |
| source_file_regexes=[ |
| "python/(?!pyspark/(ml|mllib|sql|streaming))" |
| ], |
| python_test_goals=[ |
| "pyspark.rdd", |
| "pyspark.context", |
| "pyspark.conf", |
| "pyspark.broadcast", |
| "pyspark.accumulators", |
| "pyspark.serializers", |
| "pyspark.profiler", |
| "pyspark.shuffle", |
| "pyspark.tests", |
| ] |
| ) |
| |
| |
| pyspark_sql = Module( |
| name="pyspark-sql", |
| dependencies=[pyspark_core, sql], |
| source_file_regexes=[ |
| "python/pyspark/sql" |
| ], |
| python_test_goals=[ |
| "pyspark.sql.types", |
| "pyspark.sql.context", |
| "pyspark.sql.column", |
| "pyspark.sql.dataframe", |
| "pyspark.sql.group", |
| "pyspark.sql.functions", |
| "pyspark.sql.readwriter", |
| "pyspark.sql.window", |
| "pyspark.sql.tests", |
| ] |
| ) |
| |
| |
| pyspark_streaming = Module( |
| name="pyspark-streaming", |
| dependencies=[ |
| pyspark_core, |
| streaming, |
| streaming_kafka, |
| streaming_flume_assembly, |
| streaming_mqtt, |
| streaming_kinesis_asl |
| ], |
| source_file_regexes=[ |
| "python/pyspark/streaming" |
| ], |
| python_test_goals=[ |
| "pyspark.streaming.util", |
| "pyspark.streaming.tests", |
| ] |
| ) |
| |
| |
| pyspark_mllib = Module( |
| name="pyspark-mllib", |
| dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], |
| source_file_regexes=[ |
| "python/pyspark/mllib" |
| ], |
| python_test_goals=[ |
| "pyspark.mllib.classification", |
| "pyspark.mllib.clustering", |
| "pyspark.mllib.evaluation", |
| "pyspark.mllib.feature", |
| "pyspark.mllib.fpm", |
| "pyspark.mllib.linalg.__init__", |
| "pyspark.mllib.linalg.distributed", |
| "pyspark.mllib.random", |
| "pyspark.mllib.recommendation", |
| "pyspark.mllib.regression", |
| "pyspark.mllib.stat._statistics", |
| "pyspark.mllib.stat.KernelDensity", |
| "pyspark.mllib.tree", |
| "pyspark.mllib.util", |
| "pyspark.mllib.tests", |
| ], |
| blacklisted_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there |
| ] |
| ) |
| |
| |
| pyspark_ml = Module( |
| name="pyspark-ml", |
| dependencies=[pyspark_core, pyspark_mllib], |
| source_file_regexes=[ |
| "python/pyspark/ml/" |
| ], |
| python_test_goals=[ |
| "pyspark.ml.feature", |
| "pyspark.ml.classification", |
| "pyspark.ml.clustering", |
| "pyspark.ml.recommendation", |
| "pyspark.ml.regression", |
| "pyspark.ml.tuning", |
| "pyspark.ml.tests", |
| "pyspark.ml.evaluation", |
| ], |
| blacklisted_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there |
| ] |
| ) |
| |
| sparkr = Module( |
| name="sparkr", |
| dependencies=[sql, mllib], |
| source_file_regexes=[ |
| "R/", |
| ], |
| should_run_r_tests=True |
| ) |
| |
| |
| docs = Module( |
| name="docs", |
| dependencies=[], |
| source_file_regexes=[ |
| "docs/", |
| ] |
| ) |
| |
| build = Module( |
| name="build", |
| dependencies=[], |
| source_file_regexes=[ |
| ".*pom.xml", |
| "dev/test-dependencies.sh", |
| ], |
| should_run_build_tests=True |
| ) |
| |
| ec2 = Module( |
| name="ec2", |
| dependencies=[], |
| source_file_regexes=[ |
| "ec2/", |
| ] |
| ) |
| |
| |
| yarn = Module( |
| name="yarn", |
| dependencies=[], |
| source_file_regexes=[ |
| "yarn/", |
| "network/yarn/", |
| ], |
| sbt_test_goals=[ |
| "yarn/test", |
| "network-yarn/test", |
| ], |
| test_tags=[ |
| "org.apache.spark.tags.ExtendedYarnTest" |
| ] |
| ) |
| |
| # The root module is a dummy module which is used to run all of the tests. |
| # No other modules should directly depend on this module. |
| root = Module( |
| name="root", |
| dependencies=[build], # Changes to build should trigger all tests. |
| source_file_regexes=[], |
| # In order to run all of the tests, enable every test profile: |
| build_profile_flags=list(set( |
| itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), |
| sbt_test_goals=[ |
| "test", |
| ], |
| python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)), |
| should_run_r_tests=True, |
| should_run_build_tests=True |
| ) |