| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| from functools import total_ordering |
| import itertools |
| import re |
| |
# Registry of every Module instance; Module.__init__ appends to it in definition order.
all_modules = []


@total_ordering
class Module(object):
    """
    A module is the basic abstraction in our test runner script. Each module consists of a set
    of source files, a set of test commands, and a set of dependencies on other modules. We use
    modules to define a dependency graph that let us determine which tests to run based on which
    files have changed.

    Modules are identified by name: equality, ordering and hashing all use ``name`` only.
    """

    def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
                 environ=None, sbt_test_goals=(), python_test_goals=(),
                 excluded_python_implementations=(), test_tags=(), should_run_r_tests=False,
                 should_run_build_tests=False):
        """
        Define a new module.

        Creating a module has two side effects: the module is added to the
        ``dependent_modules`` set of each of its dependencies, and it is appended to the
        module-level ``all_modules`` list.

        :param name: A short module name, for display in logging and error messages.
        :param dependencies: A set of dependencies for this module. This should only include direct
            dependencies; transitive dependencies are resolved automatically.
        :param source_file_regexes: a set of regexes that match source files belonging to this
            module. These regexes are applied by attempting to match at the beginning of the
            filename strings. Stored as ``self.source_file_prefixes``.
        :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
            order to build and test this module (e.g. '-PprofileName').
        :param environ: A dict of environment variables that should be set when files in this
            module are changed. ``None`` (or any falsy value) is stored as an empty dict.
        :param sbt_test_goals: A set of SBT test goals for testing this module.
        :param python_test_goals: A set of Python test goals for testing this module.
        :param excluded_python_implementations: A set of Python implementations that are not
            supported by this module's Python components. The values in this set should match
            strings returned by Python's `platform.python_implementation()`.
        :param test_tags: A set of tags that will be excluded when running unit tests if the
            module is not explicitly changed.
        :param should_run_r_tests: If true, changes in this module will trigger all R tests.
        :param should_run_build_tests: If true, changes in this module will trigger build tests.
        """
        self.name = name
        self.dependencies = dependencies
        self.source_file_prefixes = source_file_regexes
        self.sbt_test_goals = sbt_test_goals
        self.build_profile_flags = build_profile_flags
        self.environ = environ or {}
        self.python_test_goals = python_test_goals
        self.excluded_python_implementations = excluded_python_implementations
        self.test_tags = test_tags
        self.should_run_r_tests = should_run_r_tests
        self.should_run_build_tests = should_run_build_tests

        # Maintain the reverse edges of the dependency graph as modules are declared.
        self.dependent_modules = set()
        for dep in dependencies:
            dep.dependent_modules.add(self)
        all_modules.append(self)

    def contains_file(self, filename):
        """
        Return True if ``filename`` matches at least one of this module's source file regexes.

        ``re.match`` is used, so every regex is anchored at the beginning of ``filename``.
        """
        return any(re.match(p, filename) for p in self.source_file_prefixes)

    def __repr__(self):
        return "Module<%s>" % self.name

    # The rich comparisons return NotImplemented for non-Module operands instead of failing
    # with AttributeError on `other.name`, so e.g. `module == None` simply evaluates to False.

    def __lt__(self, other):
        if not isinstance(other, Module):
            return NotImplemented
        return self.name < other.name

    def __eq__(self, other):
        if not isinstance(other, Module):
            return NotImplemented
        return self.name == other.name

    def __ne__(self, other):
        if not isinstance(other, Module):
            return NotImplemented
        return not (self.name == other.name)

    def __hash__(self):
        # Must stay consistent with __eq__, which compares names only.
        return hash(self.name)
| |
# Owns common/tags/; has no dependencies and declares no test goals.
tags = Module(
    name="tags",
    dependencies=[],
    source_file_regexes=["common/tags/"],
)
| |
# Owns common/kvstore/; tested through the "kvstore/test" SBT goal.
kvstore = Module(
    name="kvstore",
    dependencies=[tags],
    source_file_regexes=["common/kvstore/"],
    sbt_test_goals=["kvstore/test"],
)
| |
# Owns common/network-common/; tested through the "network-common/test" SBT goal.
network_common = Module(
    name="network-common",
    dependencies=[tags],
    source_file_regexes=["common/network-common/"],
    sbt_test_goals=["network-common/test"],
)
| |
# Owns common/network-shuffle/; tested through the "network-shuffle/test" SBT goal.
network_shuffle = Module(
    name="network-shuffle",
    dependencies=[tags],
    source_file_regexes=["common/network-shuffle/"],
    sbt_test_goals=["network-shuffle/test"],
)
| |
# Owns paths starting with common/unsafe; tested through the "unsafe/test" SBT goal.
unsafe = Module(
    name="unsafe",
    dependencies=[tags],
    source_file_regexes=["common/unsafe"],
    sbt_test_goals=["unsafe/test"],
)
| |
# Owns launcher/; tested through the "launcher/test" SBT goal.
launcher = Module(
    name="launcher",
    dependencies=[tags],
    source_file_regexes=["launcher/"],
    sbt_test_goals=["launcher/test"],
)
| |
# Owns core/ and depends directly on the common/* modules and the launcher.
core = Module(
    name="core",
    dependencies=[
        kvstore,
        network_common,
        network_shuffle,
        unsafe,
        launcher,
    ],
    source_file_regexes=["core/"],
    sbt_test_goals=["core/test"],
)
| |
# Owns sql/catalyst/; tested through the "catalyst/test" SBT goal.
catalyst = Module(
    name="catalyst",
    dependencies=[tags, core],
    source_file_regexes=["sql/catalyst/"],
    sbt_test_goals=["catalyst/test"],
)
| |
# Owns sql/core/; tested through the "sql/test" SBT goal.
sql = Module(
    name="sql",
    dependencies=[catalyst],
    source_file_regexes=["sql/core/"],
    sbt_test_goals=["sql/test"],
)
| |
# Owns sql/hive/ and the bin/spark-sql launcher script; built with the -Phive profile.
hive = Module(
    name="hive",
    dependencies=[sql],
    source_file_regexes=["sql/hive/", "bin/spark-sql"],
    build_profile_flags=["-Phive"],
    sbt_test_goals=["hive/test"],
    test_tags=["org.apache.spark.tags.ExtendedHiveTest"],
)
| |
# Owns repl/; tested through the "repl/test" SBT goal.
repl = Module(
    name="repl",
    dependencies=[hive],
    source_file_regexes=["repl/"],
    sbt_test_goals=["repl/test"],
)
| |
# Owns paths starting with sql/hive-thriftserver plus the thrift server start script;
# built with the -Phive-thriftserver profile.
hive_thriftserver = Module(
    name="hive-thriftserver",
    dependencies=[hive],
    source_file_regexes=["sql/hive-thriftserver", "sbin/start-thriftserver.sh"],
    build_profile_flags=["-Phive-thriftserver"],
    sbt_test_goals=["hive-thriftserver/test"],
)
| |
# Owns paths starting with external/avro; tested through the "avro/test" SBT goal.
avro = Module(
    name="avro",
    dependencies=[sql],
    source_file_regexes=["external/avro"],
    sbt_test_goals=["avro/test"],
)
| |
# Owns paths starting with external/kafka-0-10-sql; tested through "sql-kafka-0-10/test".
sql_kafka = Module(
    name="sql-kafka-0-10",
    dependencies=[sql],
    source_file_regexes=["external/kafka-0-10-sql"],
    sbt_test_goals=["sql-kafka-0-10/test"],
)
| |
# Owns common/sketch/; tested through the "sketch/test" SBT goal.
sketch = Module(
    name="sketch",
    dependencies=[tags],
    source_file_regexes=["common/sketch/"],
    sbt_test_goals=["sketch/test"],
)
| |
# Owns graphx/; tested through the "graphx/test" SBT goal.
graphx = Module(
    name="graphx",
    dependencies=[tags, core],
    source_file_regexes=["graphx/"],
    sbt_test_goals=["graphx/test"],
)
| |
# Owns paths starting with "streaming"; tested through the "streaming/test" SBT goal.
streaming = Module(
    name="streaming",
    dependencies=[tags, core],
    source_file_regexes=["streaming"],
    sbt_test_goals=["streaming/test"],
)
| |
| |
# Kinesis tests depend on the external Amazon Kinesis service. They should run only when files
# in streaming_kinesis_asl itself are changed, so that a Kinesis outage does not fail unrelated
# PRs. For that reason, changes in other modules should not trigger Kinesis tests.
# NOTE(review): the original note said not to set the dependencies at all, yet [tags, core] is
# declared below -- confirm whether changes in core are meant to trigger these tests.
streaming_kinesis_asl = Module(
    name="streaming-kinesis-asl",
    dependencies=[tags, core],
    source_file_regexes=[
        "external/kinesis-asl/",
        "external/kinesis-asl-assembly/",
    ],
    build_profile_flags=["-Pkinesis-asl"],
    environ={"ENABLE_KINESIS_TESTS": "1"},
    sbt_test_goals=["streaming-kinesis-asl/test"],
)
| |
| |
# Kafka 0.10 streaming integration, including its assembly and token-provider directories.
streaming_kafka_0_10 = Module(
    name="streaming-kafka-0-10",
    dependencies=[streaming, core],
    source_file_regexes=[
        # The trailing "/" is required: without it this regex would also match the
        # "external/kafka-0-10-sql" sources that belong to the sql-kafka module.
        "external/kafka-0-10/",
        "external/kafka-0-10-assembly",
        "external/kafka-0-10-token-provider",
    ],
    sbt_test_goals=[
        "streaming-kafka-0-10/test",
        "token-provider-kafka-0-10/test",
    ],
)
| |
| |
# Owns paths starting with mllib-local; tested through the "mllib-local/test" SBT goal.
mllib_local = Module(
    name="mllib-local",
    dependencies=[tags, core],
    source_file_regexes=["mllib-local"],
    sbt_test_goals=["mllib-local/test"],
)
| |
| |
# Owns mllib/ and the sample data under data/mllib/; tested through "mllib/test".
mllib = Module(
    name="mllib",
    dependencies=[mllib_local, streaming, sql],
    source_file_regexes=["data/mllib/", "mllib/"],
    sbt_test_goals=["mllib/test"],
)
| |
| |
# Owns examples/; tested through the "examples/test" SBT goal.
examples = Module(
    name="examples",
    dependencies=[graphx, mllib, streaming, hive],
    source_file_regexes=["examples/"],
    sbt_test_goals=["examples/test"],
)
| |
# Core PySpark. The negative lookahead makes this module match everything under python/
# except paths continuing with pyspark/ml, pyspark/mllib, pyspark/sql or pyspark/streaming.
pyspark_core = Module(
    name="pyspark-core",
    dependencies=[core],
    source_file_regexes=["python/(?!pyspark/(ml|mllib|sql|streaming))"],
    python_test_goals=[
        # doctests
        "pyspark.rdd",
        "pyspark.context",
        "pyspark.conf",
        "pyspark.broadcast",
        "pyspark.accumulators",
        "pyspark.serializers",
        "pyspark.profiler",
        "pyspark.shuffle",
        "pyspark.util",
        # unittests
        "pyspark.tests.test_appsubmit",
        "pyspark.tests.test_broadcast",
        "pyspark.tests.test_conf",
        "pyspark.tests.test_context",
        "pyspark.tests.test_daemon",
        "pyspark.tests.test_install_spark",
        "pyspark.tests.test_join",
        "pyspark.tests.test_profiler",
        "pyspark.tests.test_rdd",
        "pyspark.tests.test_rddbarrier",
        "pyspark.tests.test_readwrite",
        "pyspark.tests.test_serializers",
        "pyspark.tests.test_shuffle",
        "pyspark.tests.test_taskcontext",
        "pyspark.tests.test_util",
        "pyspark.tests.test_worker",
    ],
)
| |
# PySpark SQL: owns paths starting with python/pyspark/sql.
pyspark_sql = Module(
    name="pyspark-sql",
    dependencies=[pyspark_core, hive, avro],
    source_file_regexes=["python/pyspark/sql"],
    python_test_goals=[
        # doctests
        "pyspark.sql.types",
        "pyspark.sql.context",
        "pyspark.sql.session",
        "pyspark.sql.conf",
        "pyspark.sql.catalog",
        "pyspark.sql.column",
        "pyspark.sql.dataframe",
        "pyspark.sql.group",
        "pyspark.sql.functions",
        "pyspark.sql.readwriter",
        "pyspark.sql.streaming",
        "pyspark.sql.udf",
        "pyspark.sql.window",
        "pyspark.sql.avro.functions",
        "pyspark.sql.pandas.conversion",
        "pyspark.sql.pandas.map_ops",
        "pyspark.sql.pandas.group_ops",
        "pyspark.sql.pandas.types",
        "pyspark.sql.pandas.serializers",
        "pyspark.sql.pandas.typehints",
        "pyspark.sql.pandas.utils",
        # unittests
        "pyspark.sql.tests.test_arrow",
        "pyspark.sql.tests.test_catalog",
        "pyspark.sql.tests.test_column",
        "pyspark.sql.tests.test_conf",
        "pyspark.sql.tests.test_context",
        "pyspark.sql.tests.test_dataframe",
        "pyspark.sql.tests.test_datasources",
        "pyspark.sql.tests.test_functions",
        "pyspark.sql.tests.test_group",
        "pyspark.sql.tests.test_pandas_cogrouped_map",
        "pyspark.sql.tests.test_pandas_grouped_map",
        "pyspark.sql.tests.test_pandas_map",
        "pyspark.sql.tests.test_pandas_udf",
        "pyspark.sql.tests.test_pandas_udf_grouped_agg",
        "pyspark.sql.tests.test_pandas_udf_scalar",
        "pyspark.sql.tests.test_pandas_udf_typehints",
        "pyspark.sql.tests.test_pandas_udf_window",
        "pyspark.sql.tests.test_readwriter",
        "pyspark.sql.tests.test_serde",
        "pyspark.sql.tests.test_session",
        "pyspark.sql.tests.test_streaming",
        "pyspark.sql.tests.test_types",
        "pyspark.sql.tests.test_udf",
        "pyspark.sql.tests.test_utils",
    ],
)
| |
| |
# PySpark resource API: owns paths starting with python/pyspark/resource.
pyspark_resource = Module(
    name="pyspark-resource",
    dependencies=[pyspark_core],
    source_file_regexes=["python/pyspark/resource"],
    python_test_goals=[
        # unittests
        "pyspark.resource.tests.test_resources",
    ],
)
| |
| |
# PySpark Streaming: owns paths starting with python/pyspark/streaming.
pyspark_streaming = Module(
    name="pyspark-streaming",
    dependencies=[pyspark_core, streaming, streaming_kinesis_asl],
    source_file_regexes=["python/pyspark/streaming"],
    python_test_goals=[
        # doctests
        "pyspark.streaming.util",
        # unittests
        "pyspark.streaming.tests.test_context",
        "pyspark.streaming.tests.test_dstream",
        "pyspark.streaming.tests.test_kinesis",
        "pyspark.streaming.tests.test_listener",
    ],
)
| |
| |
# PySpark MLlib (RDD-based API): owns paths starting with python/pyspark/mllib.
pyspark_mllib = Module(
    name="pyspark-mllib",
    dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib],
    source_file_regexes=["python/pyspark/mllib"],
    python_test_goals=[
        # doctests
        "pyspark.mllib.classification",
        "pyspark.mllib.clustering",
        "pyspark.mllib.evaluation",
        "pyspark.mllib.feature",
        "pyspark.mllib.fpm",
        "pyspark.mllib.linalg.__init__",
        "pyspark.mllib.linalg.distributed",
        "pyspark.mllib.random",
        "pyspark.mllib.recommendation",
        "pyspark.mllib.regression",
        "pyspark.mllib.stat._statistics",
        "pyspark.mllib.stat.KernelDensity",
        "pyspark.mllib.tree",
        "pyspark.mllib.util",
        # unittests
        "pyspark.mllib.tests.test_algorithms",
        "pyspark.mllib.tests.test_feature",
        "pyspark.mllib.tests.test_linalg",
        "pyspark.mllib.tests.test_stat",
        "pyspark.mllib.tests.test_streaming_algorithms",
        "pyspark.mllib.tests.test_util",
    ],
    excluded_python_implementations=[
        # These tests need numpy, which isn't available under PyPy, so skip them there.
        "PyPy",
    ],
)
| |
| |
# PySpark ML (DataFrame-based API): owns python/pyspark/ml/.
pyspark_ml = Module(
    name="pyspark-ml",
    dependencies=[pyspark_core, pyspark_mllib],
    source_file_regexes=["python/pyspark/ml/"],
    python_test_goals=[
        # doctests
        "pyspark.ml.classification",
        "pyspark.ml.clustering",
        "pyspark.ml.evaluation",
        "pyspark.ml.feature",
        "pyspark.ml.fpm",
        "pyspark.ml.functions",
        "pyspark.ml.image",
        "pyspark.ml.linalg.__init__",
        "pyspark.ml.recommendation",
        "pyspark.ml.regression",
        "pyspark.ml.stat",
        "pyspark.ml.tuning",
        # unittests
        "pyspark.ml.tests.test_algorithms",
        "pyspark.ml.tests.test_base",
        "pyspark.ml.tests.test_evaluation",
        "pyspark.ml.tests.test_feature",
        "pyspark.ml.tests.test_image",
        "pyspark.ml.tests.test_linalg",
        "pyspark.ml.tests.test_param",
        "pyspark.ml.tests.test_persistence",
        "pyspark.ml.tests.test_pipeline",
        "pyspark.ml.tests.test_stat",
        "pyspark.ml.tests.test_training_summary",
        "pyspark.ml.tests.test_tuning",
        "pyspark.ml.tests.test_util",
        "pyspark.ml.tests.test_wrapper",
    ],
    excluded_python_implementations=[
        # These tests need numpy, which isn't available under PyPy, so skip them there.
        "PyPy",
    ],
)
| |
# pandas API on Spark: owns python/pyspark/pandas/.
pyspark_pandas = Module(
    name="pyspark-pandas",
    dependencies=[pyspark_core, pyspark_sql],
    source_file_regexes=["python/pyspark/pandas/"],
    python_test_goals=[
        # doctests
        "pyspark.pandas.accessors",
        "pyspark.pandas.base",
        "pyspark.pandas.categorical",
        "pyspark.pandas.config",
        "pyspark.pandas.datetimes",
        "pyspark.pandas.exceptions",
        "pyspark.pandas.extensions",
        "pyspark.pandas.frame",
        "pyspark.pandas.generic",
        "pyspark.pandas.groupby",
        "pyspark.pandas.indexing",
        "pyspark.pandas.internal",
        "pyspark.pandas.ml",
        "pyspark.pandas.mlflow",
        "pyspark.pandas.namespace",
        "pyspark.pandas.numpy_compat",
        "pyspark.pandas.series",
        "pyspark.pandas.sql_processor",
        "pyspark.pandas.strings",
        "pyspark.pandas.utils",
        "pyspark.pandas.window",
        "pyspark.pandas.indexes.base",
        "pyspark.pandas.indexes.category",
        "pyspark.pandas.indexes.datetimes",
        "pyspark.pandas.indexes.multi",
        "pyspark.pandas.indexes.numeric",
        "pyspark.pandas.spark.accessors",
        "pyspark.pandas.spark.utils",
        "pyspark.pandas.typedef.typehints",
        # unittests
        "pyspark.pandas.tests.indexes.test_base",
        "pyspark.pandas.tests.indexes.test_category",
        "pyspark.pandas.tests.indexes.test_datetime",
        "pyspark.pandas.tests.plot.test_frame_plot",
        "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
        "pyspark.pandas.tests.plot.test_frame_plot_plotly",
        "pyspark.pandas.tests.plot.test_series_plot",
        "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
        "pyspark.pandas.tests.plot.test_series_plot_plotly",
        "pyspark.pandas.tests.test_categorical",
        "pyspark.pandas.tests.test_config",
        "pyspark.pandas.tests.test_csv",
        "pyspark.pandas.tests.test_dataframe",
        "pyspark.pandas.tests.test_dataframe_conversion",
        "pyspark.pandas.tests.test_dataframe_spark_io",
        "pyspark.pandas.tests.test_default_index",
        "pyspark.pandas.tests.test_expanding",
        "pyspark.pandas.tests.test_extension",
        "pyspark.pandas.tests.test_frame_spark",
        "pyspark.pandas.tests.test_groupby",
        "pyspark.pandas.tests.test_indexing",
        "pyspark.pandas.tests.test_indexops_spark",
        "pyspark.pandas.tests.test_internal",
        "pyspark.pandas.tests.test_namespace",
        "pyspark.pandas.tests.test_numpy_compat",
        "pyspark.pandas.tests.test_ops_on_diff_frames",
        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
        "pyspark.pandas.tests.test_repr",
        "pyspark.pandas.tests.test_reshape",
        "pyspark.pandas.tests.test_rolling",
        "pyspark.pandas.tests.test_series",
        "pyspark.pandas.tests.test_series_conversion",
        "pyspark.pandas.tests.test_series_datetime",
        "pyspark.pandas.tests.test_series_string",
        "pyspark.pandas.tests.test_sql",
        "pyspark.pandas.tests.test_stats",
        "pyspark.pandas.tests.test_typedef",
        "pyspark.pandas.tests.test_utils",
        "pyspark.pandas.tests.test_window",
    ],
    excluded_python_implementations=[
        # These tests need numpy, pandas and pyarrow, none of which are available under
        # PyPy, so skip them there.
        "PyPy",
    ],
)
| |
# SparkR: owns R/; changes here trigger the R tests.
sparkr = Module(
    name="sparkr",
    dependencies=[hive, mllib],
    source_file_regexes=["R/"],
    should_run_r_tests=True,
)
| |
| |
# Documentation sources under docs/; has no dependencies and declares no test goals.
docs = Module(
    name="docs",
    dependencies=[],
    source_file_regexes=["docs/"],
)
| |
# Build definition files: pom.xml files and dev/test-dependencies.sh.
# Changes here trigger the build tests.
build = Module(
    name="build",
    dependencies=[],
    source_file_regexes=[".*pom.xml", "dev/test-dependencies.sh"],
    should_run_build_tests=True,
)
| |
# YARN resource manager and its network shuffle service; built with the -Pyarn profile.
yarn = Module(
    name="yarn",
    dependencies=[],
    source_file_regexes=[
        "resource-managers/yarn/",
        "common/network-yarn/",
    ],
    build_profile_flags=["-Pyarn"],
    sbt_test_goals=[
        "yarn/test",
        "network-yarn/test",
    ],
    test_tags=["org.apache.spark.tags.ExtendedYarnTest"],
)
| |
# Mesos resource manager; built with the -Pmesos profile.
mesos = Module(
    name="mesos",
    dependencies=[],
    source_file_regexes=[
        "resource-managers/mesos/",
    ],
    build_profile_flags=[
        "-Pmesos",
    ],
    sbt_test_goals=[
        "mesos/test",
    ],
)
| |
# Kubernetes resource manager; built with the -Pkubernetes profile.
kubernetes = Module(
    name="kubernetes",
    dependencies=[],
    source_file_regexes=[
        "resource-managers/kubernetes",
    ],
    build_profile_flags=[
        "-Pkubernetes",
    ],
    sbt_test_goals=[
        "kubernetes/test",
    ],
)
| |
# Hadoop cloud integration; built with the -Phadoop-cloud profile.
hadoop_cloud = Module(
    name="hadoop-cloud",
    dependencies=[],
    source_file_regexes=[
        "hadoop-cloud",
    ],
    build_profile_flags=[
        "-Phadoop-cloud",
    ],
    sbt_test_goals=[
        "hadoop-cloud/test",
    ],
)
| |
# Ganglia metrics sink; built with the -Pspark-ganglia-lgpl profile, no test goals declared.
spark_ganglia_lgpl = Module(
    name="spark-ganglia-lgpl",
    dependencies=[],
    source_file_regexes=["external/spark-ganglia-lgpl"],
    build_profile_flags=["-Pspark-ganglia-lgpl"],
)
| |
# The root module is a dummy module which is used to run all of the tests.
# No other modules should directly depend on this module.
root = Module(
    name="root",
    dependencies=[build, core],  # Changes to build should trigger all tests.
    source_file_regexes=[],
    # In order to run all of the tests, enable every test profile. The flags collected from
    # all previously defined modules are de-duplicated and then sorted: iterating a bare set
    # of strings yields an order that can differ between interpreter runs, which would make
    # the generated build command line unstable.
    build_profile_flags=sorted(set(
        itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
    sbt_test_goals=[
        "test",
    ],
    # Every Python test goal of every previously defined module, in definition order.
    python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)),
    should_run_r_tests=True,
    should_run_build_tests=True
)