| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| from functools import total_ordering |
| import itertools |
| import os |
| import re |
| |
| all_modules = [] |
| |
| |
| @total_ordering |
| class Module(object): |
| """ |
| A module is the basic abstraction in our test runner script. Each module consists of a set |
| of source files, a set of test commands, and a set of dependencies on other modules. We use |
    modules to define a dependency graph that lets us determine which tests to run based on which
| files have changed. |
| """ |
| |
| def __init__( |
| self, |
| name, |
| dependencies, |
| source_file_regexes, |
| build_profile_flags=(), |
| environ=None, |
| sbt_test_goals=(), |
| python_test_goals=(), |
| excluded_python_implementations=(), |
| test_tags=(), |
| should_run_r_tests=False, |
| should_run_build_tests=False, |
| ): |
| """ |
| Define a new module. |
| |
| :param name: A short module name, for display in logging and error messages. |
| :param dependencies: A set of dependencies for this module. This should only include direct |
| dependencies; transitive dependencies are resolved automatically. |
        :param source_file_regexes: A set of regexes that match source files belonging to this
| module. These regexes are applied by attempting to match at the beginning of the |
| filename strings. |
| :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in |
| order to build and test this module (e.g. '-PprofileName'). |
| :param environ: A dict of environment variables that should be set when files in this |
| module are changed. |
| :param sbt_test_goals: A set of SBT test goals for testing this module. |
| :param python_test_goals: A set of Python test goals for testing this module. |
| :param excluded_python_implementations: A set of Python implementations that are not |
| supported by this module's Python components. The values in this set should match |
| strings returned by Python's `platform.python_implementation()`. |
        :param test_tags: A set of tags that will be excluded when running unit tests if the module
| is not explicitly changed. |
| :param should_run_r_tests: If true, changes in this module will trigger all R tests. |
| :param should_run_build_tests: If true, changes in this module will trigger build tests. |
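
        Example (illustrative only; the "foo" and "bar" modules below are hypothetical):

            foo = Module(name="foo", dependencies=[], source_file_regexes=["foo/"])
            bar = Module(name="bar", dependencies=[foo], source_file_regexes=["bar/"])
            bar.contains_file("bar/src/test/SomeSuite.scala")  # True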
| """ |
| self.name = name |
| self.dependencies = dependencies |
| self.source_file_prefixes = source_file_regexes |
| self.sbt_test_goals = sbt_test_goals |
| self.build_profile_flags = build_profile_flags |
| self.environ = environ or {} |
| self.python_test_goals = python_test_goals |
| self.excluded_python_implementations = excluded_python_implementations |
| self.test_tags = test_tags |
| self.should_run_r_tests = should_run_r_tests |
| self.should_run_build_tests = should_run_build_tests |
| |
| self.dependent_modules = set() |
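        # Record the reverse edge on each direct dependency so that, starting from a
        # changed module, its dependents can be found when deciding which tests to run.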
| for dep in dependencies: |
| dep.dependent_modules.add(self) |
| all_modules.append(self) |
| |
| def contains_file(self, filename): |
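        # Patterns are applied with re.match, i.e. anchored at the start of the path:
        # a prefix such as "common/tags/" matches "common/tags/pom.xml" but not
        # "foo/common/tags/pom.xml".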
| return any(re.match(p, filename) for p in self.source_file_prefixes) |
| |
| def __repr__(self): |
| return "Module<%s>" % self.name |
| |
| def __lt__(self, other): |
| return self.name < other.name |
| |
| def __eq__(self, other): |
| return self.name == other.name |
| |
| def __ne__(self, other): |
| return not (self.name == other.name) |
| |
| def __hash__(self): |
| return hash(self.name) |
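

# A rough sketch of how this graph is consumed by the test-runner scripts; the helper
# below is illustrative only and is not part of this file:
#
#     def changed_modules(changed_files):
#         return [m for m in all_modules if any(m.contains_file(f) for f in changed_files)]
#
# Starting from those modules, the runner can walk `dependent_modules` transitively to
# decide which downstream tests also need to run.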
| |
| |
| tags = Module( |
| name="tags", |
| dependencies=[], |
| source_file_regexes=[ |
| "common/tags/", |
| ], |
| ) |
| |
| utils_java = Module( |
| name="utils-java", |
| dependencies=[tags], |
| source_file_regexes=[ |
| "common/utils-java/", |
| ], |
| sbt_test_goals=[ |
| "common-utils-java/test", |
| ], |
| ) |
| |
| utils = Module( |
| name="utils", |
| dependencies=[tags, utils_java], |
| source_file_regexes=[ |
| "common/utils/", |
| ], |
| sbt_test_goals=[ |
| "common-utils/test", |
| ], |
| ) |
| |
| kvstore = Module( |
| name="kvstore", |
| dependencies=[tags], |
| source_file_regexes=[ |
| "common/kvstore/", |
| ], |
| sbt_test_goals=[ |
| "kvstore/test", |
| ], |
| ) |
| |
| network_common = Module( |
| name="network-common", |
| dependencies=[tags, utils_java], |
| source_file_regexes=[ |
| "common/network-common/", |
| ], |
| sbt_test_goals=[ |
| "network-common/test", |
| ], |
| ) |
| |
| network_shuffle = Module( |
| name="network-shuffle", |
| dependencies=[tags], |
| source_file_regexes=[ |
| "common/network-shuffle/", |
| ], |
| sbt_test_goals=[ |
| "network-shuffle/test", |
| ], |
| ) |
| |
| unsafe = Module( |
| name="unsafe", |
| dependencies=[tags, utils], |
| source_file_regexes=[ |
| "common/unsafe", |
| ], |
| sbt_test_goals=[ |
| "unsafe/test", |
| ], |
| ) |
| |
| launcher = Module( |
| name="launcher", |
| dependencies=[tags], |
| source_file_regexes=[ |
| "launcher/", |
| ], |
| sbt_test_goals=[ |
| "launcher/test", |
| ], |
| ) |
| |
| sketch = Module( |
| name="sketch", |
| dependencies=[tags], |
| source_file_regexes=[ |
| "common/sketch/", |
| ], |
| sbt_test_goals=["sketch/test"], |
| ) |
| |
| variant = Module( |
| name="variant", |
| dependencies=[tags], |
| source_file_regexes=[ |
| "common/variant/", |
| ], |
| sbt_test_goals=["variant/test"], |
| ) |
| |
| core = Module( |
| name="core", |
| dependencies=[kvstore, network_common, network_shuffle, unsafe, launcher, utils], |
| source_file_regexes=[ |
| "core/", |
| ], |
| sbt_test_goals=[ |
| "core/test", |
| ], |
| ) |
| |
| api = Module( |
| name="api", |
| dependencies=[utils, unsafe], |
| source_file_regexes=[ |
| "sql/api/", |
| ], |
| ) |
| |
| catalyst = Module( |
| name="catalyst", |
| dependencies=[tags, sketch, variant, core, api], |
| source_file_regexes=[ |
| "sql/catalyst/", |
| ], |
| sbt_test_goals=[ |
| "catalyst/test", |
| ], |
| environ=None |
| if "GITHUB_ACTIONS" not in os.environ |
| else {"ENABLE_DOCKER_INTEGRATION_TESTS": "1"}, |
| ) |
| |
| sql = Module( |
| name="sql", |
| dependencies=[catalyst], |
| source_file_regexes=[ |
| "sql/core/", |
| "python/pyspark/sql/worker/", # analyze_udtf is invoked and tested in JVM |
| ], |
| sbt_test_goals=[ |
| "sql/test", |
| ], |
| environ=None |
| if "GITHUB_ACTIONS" not in os.environ |
| else {"ENABLE_DOCKER_INTEGRATION_TESTS": "1"}, |
| ) |
| |
| hive = Module( |
| name="hive", |
| dependencies=[sql], |
| source_file_regexes=[ |
| "sql/hive/", |
| "bin/spark-sql", |
| ], |
| build_profile_flags=[ |
| "-Phive", |
| ], |
| sbt_test_goals=[ |
| "hive/test", |
| ], |
| test_tags=["org.apache.spark.tags.ExtendedHiveTest"], |
| ) |
| |
| repl = Module( |
| name="repl", |
| dependencies=[hive], |
| source_file_regexes=[ |
| "repl/", |
| ], |
| sbt_test_goals=[ |
| "repl/test", |
| ], |
| ) |
| |
| hive_thriftserver = Module( |
| name="hive-thriftserver", |
| dependencies=[hive], |
| source_file_regexes=[ |
| "sql/hive-thriftserver", |
| "sbin/start-thriftserver.sh", |
| ], |
| build_profile_flags=[ |
| "-Phive-thriftserver", |
| ], |
| sbt_test_goals=[ |
| "hive-thriftserver/test", |
| ], |
| ) |
| |
| avro = Module( |
| name="avro", |
| dependencies=[sql], |
| source_file_regexes=[ |
| "connector/avro", |
| ], |
| sbt_test_goals=[ |
| "avro/test", |
| ], |
| ) |
| |
| sql_kafka = Module( |
| name="sql-kafka-0-10", |
| dependencies=[sql], |
| source_file_regexes=[ |
| "connector/kafka-0-10-sql", |
| ], |
| sbt_test_goals=[ |
| "sql-kafka-0-10/test", |
| ], |
| ) |
| |
| profiler = Module( |
| name="profiler", |
| dependencies=[], |
| build_profile_flags=["-Pjvm-profiler"], |
| source_file_regexes=[ |
| "connector/profiler", |
| ], |
| ) |
| |
| protobuf = Module( |
| name="protobuf", |
| dependencies=[sql], |
| source_file_regexes=[ |
| "connector/protobuf", |
| ], |
| sbt_test_goals=[ |
| "protobuf/test", |
| ], |
| ) |
| |
| graphx = Module( |
| name="graphx", |
| dependencies=[tags, core], |
| source_file_regexes=[ |
| "graphx/", |
| ], |
| sbt_test_goals=["graphx/test"], |
| ) |
| |
| streaming = Module( |
| name="streaming", |
| dependencies=[tags, core], |
| source_file_regexes=[ |
| "streaming", |
| ], |
| sbt_test_goals=[ |
| "streaming/test", |
| ], |
| ) |
| |
| |
| # Don't set the dependencies because changes in other modules should not trigger Kinesis tests. |
# Kinesis tests depend on the external Amazon Kinesis service. We should run these tests only
# when files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage we
# don't fail other PRs.
| streaming_kinesis_asl = Module( |
| name="streaming-kinesis-asl", |
| dependencies=[tags, core], |
| source_file_regexes=[ |
| "connector/kinesis-asl/", |
| "connector/kinesis-asl-assembly/", |
| ], |
| build_profile_flags=[ |
| "-Pkinesis-asl", |
| ], |
| environ={"ENABLE_KINESIS_TESTS": "0"}, |
| sbt_test_goals=[ |
| "streaming-kinesis-asl/test", |
| ], |
| ) |
| |
| |
| streaming_kafka_0_10 = Module( |
| name="streaming-kafka-0-10", |
| dependencies=[streaming, core], |
| source_file_regexes=[ |
        # The trailing "/" is necessary; otherwise this would also match the "sql-kafka" sources.
| "connector/kafka-0-10/", |
| "connector/kafka-0-10-assembly", |
| "connector/kafka-0-10-token-provider", |
| ], |
| sbt_test_goals=["streaming-kafka-0-10/test", "token-provider-kafka-0-10/test"], |
| ) |
| |
| |
| mllib_local = Module( |
| name="mllib-local", |
| dependencies=[tags, core], |
| source_file_regexes=[ |
| "mllib-local", |
| ], |
| sbt_test_goals=[ |
| "mllib-local/test", |
| ], |
| ) |
| |
| |
| mllib = Module( |
| name="mllib", |
| dependencies=[mllib_local, streaming, sql], |
| source_file_regexes=[ |
| "data/mllib/", |
| "mllib/", |
| ], |
| sbt_test_goals=[ |
| "mllib/test", |
| ], |
| ) |
| |
| pipelines = Module( |
| name="pipelines", |
| dependencies=[], |
| source_file_regexes=["sql/pipelines"], |
| sbt_test_goals=[ |
| "pipelines/test", |
| ], |
| ) |
| |
| connect = Module( |
| name="connect", |
| dependencies=[hive, avro, protobuf, mllib], |
| source_file_regexes=[ |
| "sql/connect", |
| ], |
| sbt_test_goals=[ |
| "connect/test", |
| "connect-client-jvm/test", |
| ], |
| ) |
| |
| examples = Module( |
| name="examples", |
| dependencies=[graphx, mllib, streaming, hive], |
| source_file_regexes=[ |
| "examples/", |
| ], |
| sbt_test_goals=[ |
| "examples/test", |
| ], |
| ) |
| |
| pyspark_core = Module( |
| name="pyspark-core", |
| dependencies=[core], |
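    # Everything under python/ except the sub-packages that have dedicated test modules
    # below (ml, mllib, sql, streaming, pandas, resource, testing).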
| source_file_regexes=["python/(?!pyspark/(ml|mllib|sql|streaming|pandas|resource|testing))"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.conf", |
| "pyspark.core.rdd", |
| "pyspark.core.context", |
| "pyspark.core.broadcast", |
| "pyspark.accumulators", |
| "pyspark.core.files", |
| "pyspark.serializers", |
| "pyspark.profiler", |
| "pyspark.shuffle", |
| "pyspark.taskcontext", |
| "pyspark.util", |
| # unittests |
| "pyspark.tests.test_appsubmit", |
| "pyspark.tests.test_broadcast", |
| "pyspark.tests.test_conf", |
| "pyspark.tests.test_context", |
| "pyspark.tests.test_daemon", |
| "pyspark.tests.test_install_spark", |
| "pyspark.tests.test_join", |
| "pyspark.tests.test_memory_profiler", |
| "pyspark.tests.test_profiler", |
| "pyspark.tests.test_rdd", |
| "pyspark.tests.test_rddbarrier", |
| "pyspark.tests.test_rddsampler", |
| "pyspark.tests.test_readwrite", |
| "pyspark.tests.test_serializers", |
| "pyspark.tests.test_shuffle", |
| "pyspark.tests.test_statcounter", |
| "pyspark.tests.test_taskcontext", |
| "pyspark.tests.test_util", |
| "pyspark.tests.test_worker", |
| "pyspark.tests.test_stage_sched", |
| ], |
| ) |
| |
| pyspark_sql = Module( |
| name="pyspark-sql", |
| dependencies=[pyspark_core, hive, avro, protobuf], |
| source_file_regexes=["python/pyspark/sql"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.sql.types", |
| "pyspark.sql.context", |
| "pyspark.sql.session", |
| "pyspark.sql.conf", |
| "pyspark.sql.catalog", |
| "pyspark.sql.classic.column", |
| "pyspark.sql.classic.dataframe", |
| "pyspark.sql.classic.window", |
| "pyspark.sql.datasource", |
| "pyspark.sql.group", |
| "pyspark.sql.functions.builtin", |
| "pyspark.sql.functions.partitioning", |
| "pyspark.sql.merge", |
| "pyspark.sql.readwriter", |
| "pyspark.sql.streaming.query", |
| "pyspark.sql.streaming.readwriter", |
| "pyspark.sql.streaming.listener", |
| "pyspark.sql.udf", |
| "pyspark.sql.udtf", |
| "pyspark.sql.avro.functions", |
| "pyspark.sql.protobuf.functions", |
| "pyspark.sql.pandas.conversion", |
| "pyspark.sql.pandas.functions", |
| "pyspark.sql.pandas.map_ops", |
| "pyspark.sql.pandas.group_ops", |
| "pyspark.sql.pandas.types", |
| "pyspark.sql.pandas.serializers", |
| "pyspark.sql.pandas.typehints", |
| "pyspark.sql.pandas.utils", |
| "pyspark.sql.observation", |
| "pyspark.sql.tvf", |
| # unittests |
| "pyspark.sql.tests.test_artifact", |
| "pyspark.sql.tests.test_catalog", |
| "pyspark.sql.tests.test_column", |
| "pyspark.sql.tests.test_conf", |
| "pyspark.sql.tests.test_context", |
| "pyspark.sql.tests.test_dataframe", |
| "pyspark.sql.tests.test_collection", |
| "pyspark.sql.tests.test_creation", |
| "pyspark.sql.tests.test_conversion", |
| "pyspark.sql.tests.test_listener", |
| "pyspark.sql.tests.test_observation", |
| "pyspark.sql.tests.test_repartition", |
| "pyspark.sql.tests.test_stat", |
| "pyspark.sql.tests.test_datasources", |
| "pyspark.sql.tests.test_errors", |
| "pyspark.sql.tests.test_functions", |
| "pyspark.sql.tests.test_group", |
| "pyspark.sql.tests.test_sql", |
| "pyspark.sql.tests.test_job_cancellation", |
| "pyspark.sql.tests.arrow.test_arrow", |
| "pyspark.sql.tests.arrow.test_arrow_map", |
| "pyspark.sql.tests.arrow.test_arrow_cogrouped_map", |
| "pyspark.sql.tests.arrow.test_arrow_grouped_map", |
| "pyspark.sql.tests.arrow.test_arrow_python_udf", |
| "pyspark.sql.tests.arrow.test_arrow_udf", |
| "pyspark.sql.tests.arrow.test_arrow_udf_grouped_agg", |
| "pyspark.sql.tests.arrow.test_arrow_udf_scalar", |
| "pyspark.sql.tests.arrow.test_arrow_udf_window", |
| "pyspark.sql.tests.arrow.test_arrow_udf_typehints", |
| "pyspark.sql.tests.arrow.test_arrow_udtf", |
| "pyspark.sql.tests.pandas.test_pandas_cogrouped_map", |
| "pyspark.sql.tests.pandas.test_pandas_grouped_map", |
| "pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state", |
| "pyspark.sql.tests.pandas.test_pandas_map", |
| "pyspark.sql.tests.pandas.test_pandas_transform_with_state", |
| "pyspark.sql.tests.pandas.test_pandas_udf", |
| "pyspark.sql.tests.pandas.test_pandas_udf_grouped_agg", |
| "pyspark.sql.tests.pandas.test_pandas_udf_scalar", |
| "pyspark.sql.tests.pandas.test_pandas_udf_typehints", |
| "pyspark.sql.tests.pandas.test_pandas_udf_typehints_with_future_annotations", |
| "pyspark.sql.tests.pandas.test_pandas_udf_window", |
| "pyspark.sql.tests.pandas.test_pandas_sqlmetrics", |
| "pyspark.sql.tests.pandas.test_converter", |
| "pyspark.sql.tests.test_python_datasource", |
| "pyspark.sql.tests.test_python_streaming_datasource", |
| "pyspark.sql.tests.test_readwriter", |
| "pyspark.sql.tests.test_serde", |
| "pyspark.sql.tests.test_session", |
| "pyspark.sql.tests.streaming.test_streaming", |
| "pyspark.sql.tests.streaming.test_streaming_foreach", |
| "pyspark.sql.tests.streaming.test_streaming_foreach_batch", |
| "pyspark.sql.tests.streaming.test_streaming_listener", |
| "pyspark.sql.tests.test_subquery", |
| "pyspark.sql.tests.test_types", |
| "pyspark.sql.tests.test_udf", |
| "pyspark.sql.tests.test_udf_combinations", |
| "pyspark.sql.tests.test_udf_profiler", |
| "pyspark.sql.tests.test_unified_udf", |
| "pyspark.sql.tests.test_udtf", |
| "pyspark.sql.tests.test_tvf", |
| "pyspark.sql.tests.test_utils", |
| "pyspark.sql.tests.test_resources", |
| "pyspark.sql.tests.plot.test_frame_plot", |
| "pyspark.sql.tests.plot.test_frame_plot_plotly", |
| "pyspark.sql.tests.test_connect_compatibility", |
| "pyspark.sql.tests.udf_type_tests.test_udf_input_types", |
| "pyspark.sql.tests.udf_type_tests.test_udf_return_types", |
| ], |
| ) |
| |
| pyspark_testing = Module( |
| name="pyspark-testing", |
| dependencies=[pyspark_core, pyspark_sql], |
| source_file_regexes=["python/pyspark/testing"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.testing.utils", |
| "pyspark.testing.pandasutils", |
| # unittests |
| "pyspark.testing.tests.test_fail", |
| "pyspark.testing.tests.test_fail_in_set_up_class", |
| "pyspark.testing.tests.test_no_tests", |
| "pyspark.testing.tests.test_pass_all", |
| "pyspark.testing.tests.test_skip_all", |
| "pyspark.testing.tests.test_skip_class", |
| "pyspark.testing.tests.test_skip_set_up_class", |
| ], |
| ) |
| |
| pyspark_resource = Module( |
| name="pyspark-resource", |
| dependencies=[pyspark_core], |
| source_file_regexes=["python/pyspark/resource"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.resource.profile", |
| # unittests |
| "pyspark.resource.tests.test_resources", |
| "pyspark.resource.tests.test_connect_resources", |
| ], |
| ) |
| |
| |
| pyspark_streaming = Module( |
| name="pyspark-streaming", |
| dependencies=[pyspark_core, streaming, streaming_kinesis_asl], |
| source_file_regexes=["python/pyspark/streaming"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.streaming.util", |
| # unittests |
| "pyspark.streaming.tests.test_context", |
| "pyspark.streaming.tests.test_dstream", |
| "pyspark.streaming.tests.test_kinesis", |
| "pyspark.streaming.tests.test_listener", |
| ], |
| ) |
| |
| |
| pyspark_mllib = Module( |
| name="pyspark-mllib", |
| dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], |
| source_file_regexes=["python/pyspark/mllib"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.mllib.classification", |
| "pyspark.mllib.clustering", |
| "pyspark.mllib.evaluation", |
| "pyspark.mllib.feature", |
| "pyspark.mllib.fpm", |
| "pyspark.mllib.linalg.__init__", |
| "pyspark.mllib.linalg.distributed", |
| "pyspark.mllib.random", |
| "pyspark.mllib.recommendation", |
| "pyspark.mllib.regression", |
| "pyspark.mllib.stat._statistics", |
| "pyspark.mllib.stat.KernelDensity", |
| "pyspark.mllib.tree", |
| "pyspark.mllib.util", |
| # unittests |
| "pyspark.mllib.tests.test_algorithms", |
| "pyspark.mllib.tests.test_feature", |
| "pyspark.mllib.tests.test_linalg", |
| "pyspark.mllib.tests.test_stat", |
| "pyspark.mllib.tests.test_streaming_algorithms", |
| "pyspark.mllib.tests.test_util", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there |
| ], |
| ) |
| |
| |
| pyspark_ml = Module( |
| name="pyspark-ml", |
| dependencies=[pyspark_core, pyspark_mllib], |
| source_file_regexes=["python/pyspark/ml/"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.ml.classification", |
| "pyspark.ml.clustering", |
| "pyspark.ml.evaluation", |
| "pyspark.ml.feature", |
| "pyspark.ml.fpm", |
| "pyspark.ml.functions", |
| "pyspark.ml.image", |
| "pyspark.ml.linalg.__init__", |
| "pyspark.ml.recommendation", |
| "pyspark.ml.regression", |
| "pyspark.ml.stat", |
| "pyspark.ml.tuning", |
| # unittests |
| "pyspark.ml.tests.test_algorithms", |
| "pyspark.ml.tests.test_als", |
| "pyspark.ml.tests.test_fpm", |
| "pyspark.ml.tests.test_base", |
| "pyspark.ml.tests.test_evaluation", |
| "pyspark.ml.tests.test_feature", |
| "pyspark.ml.tests.test_functions", |
| "pyspark.ml.tests.test_image", |
| "pyspark.ml.tests.test_linalg", |
| "pyspark.ml.tests.test_model_cache", |
| "pyspark.ml.tests.test_param", |
| "pyspark.ml.tests.test_persistence", |
| "pyspark.ml.tests.test_pipeline", |
| "pyspark.ml.tests.test_tuning", |
| "pyspark.ml.tests.test_ovr", |
| "pyspark.ml.tests.test_stat", |
| "pyspark.ml.tests.test_training_summary", |
| "pyspark.ml.tests.tuning.test_tuning", |
| "pyspark.ml.tests.tuning.test_cv_io_basic", |
| "pyspark.ml.tests.tuning.test_cv_io_nested", |
| "pyspark.ml.tests.tuning.test_cv_io_pipeline", |
| "pyspark.ml.tests.tuning.test_tvs_io_basic", |
| "pyspark.ml.tests.tuning.test_tvs_io_nested", |
| "pyspark.ml.tests.tuning.test_tvs_io_pipeline", |
| "pyspark.ml.tests.test_util", |
| "pyspark.ml.tests.test_wrapper", |
| "pyspark.ml.torch.tests.test_distributor", |
| "pyspark.ml.torch.tests.test_log_communication", |
| "pyspark.ml.torch.tests.test_data_loader", |
| "pyspark.ml.deepspeed.tests.test_deepspeed_distributor", |
| "pyspark.ml.tests.connect.test_legacy_mode_summarizer", |
| "pyspark.ml.tests.connect.test_legacy_mode_evaluation", |
| "pyspark.ml.tests.connect.test_legacy_mode_feature", |
| "pyspark.ml.tests.connect.test_legacy_mode_classification", |
| "pyspark.ml.tests.connect.test_legacy_mode_pipeline", |
| "pyspark.ml.tests.connect.test_legacy_mode_tuning", |
| "pyspark.ml.tests.test_classification", |
| "pyspark.ml.tests.test_regression", |
| "pyspark.ml.tests.test_clustering", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there |
| ], |
| ) |
| |
| pyspark_pandas = Module( |
| name="pyspark-pandas", |
| dependencies=[pyspark_core, pyspark_sql], |
| source_file_regexes=["python/pyspark/pandas/"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.pandas.accessors", |
| "pyspark.pandas.base", |
| "pyspark.pandas.categorical", |
| "pyspark.pandas.config", |
| "pyspark.pandas.datetimes", |
| "pyspark.pandas.exceptions", |
| "pyspark.pandas.extensions", |
| "pyspark.pandas.groupby", |
| "pyspark.pandas.indexing", |
| "pyspark.pandas.internal", |
| "pyspark.pandas.mlflow", |
| "pyspark.pandas.namespace", |
| "pyspark.pandas.numpy_compat", |
| "pyspark.pandas.sql_processor", |
| "pyspark.pandas.sql_formatter", |
| "pyspark.pandas.strings", |
| "pyspark.pandas.supported_api_gen", |
| "pyspark.pandas.utils", |
| "pyspark.pandas.window", |
| "pyspark.pandas.indexes.base", |
| "pyspark.pandas.indexes.category", |
| "pyspark.pandas.indexes.datetimes", |
| "pyspark.pandas.indexes.timedelta", |
| "pyspark.pandas.indexes.multi", |
| "pyspark.pandas.spark.accessors", |
| "pyspark.pandas.spark.utils", |
| "pyspark.pandas.typedef.typehints", |
| # unittests |
| "pyspark.pandas.tests.test_categorical", |
| "pyspark.pandas.tests.test_config", |
| "pyspark.pandas.tests.test_extension", |
| "pyspark.pandas.tests.test_frame_spark", |
| "pyspark.pandas.tests.test_generic_functions", |
| "pyspark.pandas.tests.test_indexops_spark", |
| "pyspark.pandas.tests.test_internal", |
| "pyspark.pandas.tests.test_namespace", |
| "pyspark.pandas.tests.test_numpy_compat", |
| "pyspark.pandas.tests.test_repr", |
| "pyspark.pandas.tests.test_spark_functions", |
| "pyspark.pandas.tests.test_scalars", |
| "pyspark.pandas.tests.test_sql", |
| "pyspark.pandas.tests.test_typedef", |
| "pyspark.pandas.tests.test_utils", |
| "pyspark.pandas.tests.computation.test_any_all", |
| "pyspark.pandas.tests.computation.test_apply_func", |
| "pyspark.pandas.tests.computation.test_binary_ops", |
| "pyspark.pandas.tests.computation.test_combine", |
| "pyspark.pandas.tests.computation.test_compute", |
| "pyspark.pandas.tests.computation.test_corr", |
| "pyspark.pandas.tests.computation.test_corrwith", |
| "pyspark.pandas.tests.computation.test_cov", |
| "pyspark.pandas.tests.computation.test_cumulative", |
| "pyspark.pandas.tests.computation.test_describe", |
| "pyspark.pandas.tests.computation.test_eval", |
| "pyspark.pandas.tests.computation.test_melt", |
| "pyspark.pandas.tests.computation.test_missing_data", |
| "pyspark.pandas.tests.computation.test_pivot", |
| "pyspark.pandas.tests.computation.test_pivot_table", |
| "pyspark.pandas.tests.computation.test_pivot_table_adv", |
| "pyspark.pandas.tests.computation.test_pivot_table_multi_idx", |
| "pyspark.pandas.tests.computation.test_pivot_table_multi_idx_adv", |
| "pyspark.pandas.tests.computation.test_stats", |
| "pyspark.pandas.tests.data_type_ops.test_as_type", |
| "pyspark.pandas.tests.data_type_ops.test_base", |
| "pyspark.pandas.tests.data_type_ops.test_binary_ops", |
| "pyspark.pandas.tests.data_type_ops.test_boolean_ops", |
| "pyspark.pandas.tests.data_type_ops.test_categorical_ops", |
| "pyspark.pandas.tests.data_type_ops.test_complex_ops", |
| "pyspark.pandas.tests.data_type_ops.test_date_ops", |
| "pyspark.pandas.tests.data_type_ops.test_datetime_ops", |
| "pyspark.pandas.tests.data_type_ops.test_null_ops", |
| "pyspark.pandas.tests.data_type_ops.test_num_ops", |
| "pyspark.pandas.tests.data_type_ops.test_num_arithmetic", |
| "pyspark.pandas.tests.data_type_ops.test_num_mod", |
| "pyspark.pandas.tests.data_type_ops.test_num_mul_div", |
| "pyspark.pandas.tests.data_type_ops.test_num_pow", |
| "pyspark.pandas.tests.data_type_ops.test_num_reverse", |
| "pyspark.pandas.tests.data_type_ops.test_string_ops", |
| "pyspark.pandas.tests.data_type_ops.test_udt_ops", |
| "pyspark.pandas.tests.data_type_ops.test_timedelta_ops", |
| "pyspark.pandas.tests.plot.test_frame_plot", |
| "pyspark.pandas.tests.plot.test_frame_plot_matplotlib", |
| "pyspark.pandas.tests.plot.test_frame_plot_plotly", |
| "pyspark.pandas.tests.plot.test_series_plot", |
| "pyspark.pandas.tests.plot.test_series_plot_matplotlib", |
| "pyspark.pandas.tests.plot.test_series_plot_plotly", |
| "pyspark.pandas.tests.frame.test_interpolate", |
| "pyspark.pandas.tests.frame.test_interpolate_error", |
| "pyspark.pandas.tests.frame.test_attrs", |
| "pyspark.pandas.tests.frame.test_axis", |
| "pyspark.pandas.tests.frame.test_constructor", |
| "pyspark.pandas.tests.frame.test_conversion", |
| "pyspark.pandas.tests.frame.test_reindexing", |
| "pyspark.pandas.tests.frame.test_reshaping", |
| "pyspark.pandas.tests.frame.test_spark", |
| "pyspark.pandas.tests.frame.test_take", |
| "pyspark.pandas.tests.frame.test_take_adv", |
| "pyspark.pandas.tests.frame.test_time_series", |
| "pyspark.pandas.tests.frame.test_truncate", |
| "pyspark.pandas.tests.series.test_interpolate", |
| "pyspark.pandas.tests.resample.test_on", |
| "pyspark.pandas.tests.resample.test_error", |
| "pyspark.pandas.tests.resample.test_frame", |
| "pyspark.pandas.tests.resample.test_missing", |
| "pyspark.pandas.tests.resample.test_series", |
| "pyspark.pandas.tests.resample.test_timezone", |
| "pyspark.pandas.tests.reshape.test_get_dummies", |
| "pyspark.pandas.tests.reshape.test_get_dummies_kwargs", |
| "pyspark.pandas.tests.reshape.test_get_dummies_multiindex", |
| "pyspark.pandas.tests.reshape.test_get_dummies_object", |
| "pyspark.pandas.tests.reshape.test_get_dummies_prefix", |
| "pyspark.pandas.tests.reshape.test_merge_asof", |
| "pyspark.pandas.tests.window.test_expanding", |
| "pyspark.pandas.tests.window.test_expanding_adv", |
| "pyspark.pandas.tests.window.test_expanding_error", |
| "pyspark.pandas.tests.window.test_groupby_expanding", |
| "pyspark.pandas.tests.window.test_groupby_expanding_adv", |
| "pyspark.pandas.tests.window.test_ewm_error", |
| "pyspark.pandas.tests.window.test_ewm_mean", |
| "pyspark.pandas.tests.window.test_groupby_ewm_mean", |
| "pyspark.pandas.tests.window.test_missing", |
| "pyspark.pandas.tests.window.test_rolling", |
| "pyspark.pandas.tests.window.test_rolling_adv", |
| "pyspark.pandas.tests.window.test_rolling_count", |
| "pyspark.pandas.tests.window.test_rolling_error", |
| "pyspark.pandas.tests.window.test_groupby_rolling", |
| "pyspark.pandas.tests.window.test_groupby_rolling_adv", |
| "pyspark.pandas.tests.window.test_groupby_rolling_count", |
| "pyspark.pandas.tests.series.test_datetime", |
| "pyspark.pandas.tests.series.test_string_ops_adv", |
| "pyspark.pandas.tests.series.test_string_ops_basic", |
| "pyspark.pandas.tests.series.test_all_any", |
| "pyspark.pandas.tests.series.test_arg_ops", |
| "pyspark.pandas.tests.series.test_as_of", |
| "pyspark.pandas.tests.series.test_as_type", |
| "pyspark.pandas.tests.series.test_compute", |
| "pyspark.pandas.tests.series.test_conversion", |
| "pyspark.pandas.tests.series.test_cumulative", |
| "pyspark.pandas.tests.series.test_index", |
| "pyspark.pandas.tests.series.test_missing_data", |
| "pyspark.pandas.tests.series.test_series", |
| "pyspark.pandas.tests.series.test_sort", |
| "pyspark.pandas.tests.series.test_stat", |
| "pyspark.pandas.tests.io.test_io", |
| "pyspark.pandas.tests.io.test_csv", |
| "pyspark.pandas.tests.io.test_feather", |
| "pyspark.pandas.tests.io.test_stata", |
| "pyspark.pandas.tests.io.test_dataframe_conversion", |
| "pyspark.pandas.tests.io.test_dataframe_spark_io", |
| "pyspark.pandas.tests.io.test_series_conversion", |
| # fallback |
| "pyspark.pandas.tests.frame.test_asfreq", |
| "pyspark.pandas.tests.frame.test_asof", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and |
| # they aren't available there |
| ], |
| ) |
| |
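# Slower pandas-on-Spark doctests and test suites, split out of "pyspark-pandas" above so
# that they can be run separately.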
| pyspark_pandas_slow = Module( |
| name="pyspark-pandas-slow", |
| dependencies=[pyspark_core, pyspark_sql], |
| source_file_regexes=["python/pyspark/pandas/"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.pandas.frame", |
| "pyspark.pandas.generic", |
| "pyspark.pandas.series", |
| # unittests |
| "pyspark.pandas.tests.indexes.test_default", |
| "pyspark.pandas.tests.indexes.test_category", |
| "pyspark.pandas.tests.indexes.test_timedelta", |
| "pyspark.pandas.tests.indexes.test_basic", |
| "pyspark.pandas.tests.indexes.test_getattr", |
| "pyspark.pandas.tests.indexes.test_name", |
| "pyspark.pandas.tests.indexes.test_conversion", |
| "pyspark.pandas.tests.indexes.test_drop", |
| "pyspark.pandas.tests.indexes.test_level", |
| "pyspark.pandas.tests.indexes.test_missing", |
| "pyspark.pandas.tests.indexes.test_repeat", |
| "pyspark.pandas.tests.indexes.test_sort", |
| "pyspark.pandas.tests.indexes.test_stat", |
| "pyspark.pandas.tests.indexes.test_symmetric_diff", |
| "pyspark.pandas.tests.indexes.test_take", |
| "pyspark.pandas.tests.indexes.test_unique", |
| "pyspark.pandas.tests.indexes.test_asof", |
| "pyspark.pandas.tests.indexes.test_astype", |
| "pyspark.pandas.tests.indexes.test_delete", |
| "pyspark.pandas.tests.indexes.test_diff", |
| "pyspark.pandas.tests.indexes.test_insert", |
| "pyspark.pandas.tests.indexes.test_map", |
| "pyspark.pandas.tests.indexes.test_append", |
| "pyspark.pandas.tests.indexes.test_intersection", |
| "pyspark.pandas.tests.indexes.test_monotonic", |
| "pyspark.pandas.tests.indexes.test_union", |
| "pyspark.pandas.tests.indexes.test_datetime", |
| "pyspark.pandas.tests.indexes.test_datetime_at", |
| "pyspark.pandas.tests.indexes.test_datetime_between", |
| "pyspark.pandas.tests.indexes.test_datetime_ceil", |
| "pyspark.pandas.tests.indexes.test_datetime_floor", |
| "pyspark.pandas.tests.indexes.test_datetime_iso", |
| "pyspark.pandas.tests.indexes.test_datetime_map", |
| "pyspark.pandas.tests.indexes.test_datetime_property", |
| "pyspark.pandas.tests.indexes.test_datetime_round", |
| "pyspark.pandas.tests.indexes.test_align", |
| "pyspark.pandas.tests.indexes.test_indexing", |
| "pyspark.pandas.tests.indexes.test_indexing_adv", |
| "pyspark.pandas.tests.indexes.test_indexing_basic", |
| "pyspark.pandas.tests.indexes.test_indexing_iloc", |
| "pyspark.pandas.tests.indexes.test_indexing_loc", |
| "pyspark.pandas.tests.indexes.test_indexing_loc_2d", |
| "pyspark.pandas.tests.indexes.test_indexing_loc_multi_idx", |
| "pyspark.pandas.tests.indexes.test_reindex", |
| "pyspark.pandas.tests.indexes.test_rename", |
| "pyspark.pandas.tests.indexes.test_reset_index", |
| "pyspark.pandas.tests.groupby.test_aggregate", |
| "pyspark.pandas.tests.groupby.test_apply_func", |
| "pyspark.pandas.tests.groupby.test_corr", |
| "pyspark.pandas.tests.groupby.test_cumulative", |
| "pyspark.pandas.tests.groupby.test_describe", |
| "pyspark.pandas.tests.groupby.test_groupby", |
| "pyspark.pandas.tests.groupby.test_grouping", |
| "pyspark.pandas.tests.groupby.test_head_tail", |
| "pyspark.pandas.tests.groupby.test_index", |
| "pyspark.pandas.tests.groupby.test_missing", |
| "pyspark.pandas.tests.groupby.test_missing_data", |
| "pyspark.pandas.tests.groupby.test_nlargest_nsmallest", |
| "pyspark.pandas.tests.groupby.test_raises", |
| "pyspark.pandas.tests.groupby.test_rank", |
| "pyspark.pandas.tests.groupby.test_size", |
| "pyspark.pandas.tests.groupby.test_split_apply", |
| "pyspark.pandas.tests.groupby.test_split_apply_count", |
| "pyspark.pandas.tests.groupby.test_split_apply_first", |
| "pyspark.pandas.tests.groupby.test_split_apply_last", |
| "pyspark.pandas.tests.groupby.test_split_apply_min_max", |
| "pyspark.pandas.tests.groupby.test_split_apply_skew", |
| "pyspark.pandas.tests.groupby.test_split_apply_std", |
| "pyspark.pandas.tests.groupby.test_split_apply_var", |
| "pyspark.pandas.tests.groupby.test_stat", |
| "pyspark.pandas.tests.groupby.test_stat_adv", |
| "pyspark.pandas.tests.groupby.test_stat_ddof", |
| "pyspark.pandas.tests.groupby.test_stat_func", |
| "pyspark.pandas.tests.groupby.test_stat_prod", |
| "pyspark.pandas.tests.groupby.test_value_counts", |
| "pyspark.pandas.tests.diff_frames_ops.test_align", |
| "pyspark.pandas.tests.diff_frames_ops.test_arithmetic", |
| "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_ext", |
| "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_ext_float", |
| "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain", |
| "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain_ext", |
| "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain_ext_float", |
| "pyspark.pandas.tests.diff_frames_ops.test_assign_frame", |
| "pyspark.pandas.tests.diff_frames_ops.test_assign_series", |
| "pyspark.pandas.tests.diff_frames_ops.test_basic", |
| "pyspark.pandas.tests.diff_frames_ops.test_bitwise", |
| "pyspark.pandas.tests.diff_frames_ops.test_combine_first", |
| "pyspark.pandas.tests.diff_frames_ops.test_compare_series", |
| "pyspark.pandas.tests.diff_frames_ops.test_concat_inner", |
| "pyspark.pandas.tests.diff_frames_ops.test_concat_outer", |
| "pyspark.pandas.tests.diff_frames_ops.test_basic_slow", |
| "pyspark.pandas.tests.diff_frames_ops.test_cov", |
| "pyspark.pandas.tests.diff_frames_ops.test_corrwith", |
| "pyspark.pandas.tests.diff_frames_ops.test_dot_frame", |
| "pyspark.pandas.tests.diff_frames_ops.test_dot_series", |
| "pyspark.pandas.tests.diff_frames_ops.test_error", |
| "pyspark.pandas.tests.diff_frames_ops.test_index", |
| "pyspark.pandas.tests.diff_frames_ops.test_series", |
| "pyspark.pandas.tests.diff_frames_ops.test_setitem_frame", |
| "pyspark.pandas.tests.diff_frames_ops.test_setitem_series", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_aggregate", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_apply", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_cumulative", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_diff", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_diff_len", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_fillna", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_filter", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_shift", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_split_apply_combine", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_transform", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_adv", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_count", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv", |
| "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and |
| # they aren't available there |
| ], |
| ) |
| |
| pyspark_connect = Module( |
| name="pyspark-connect", |
| dependencies=[pyspark_sql, connect], |
| source_file_regexes=[ |
| "python/pyspark/sql/connect", |
| ], |
| python_test_goals=[ |
| # sql doctests |
| "pyspark.sql.connect.catalog", |
| "pyspark.sql.connect.conf", |
| "pyspark.sql.connect.group", |
| "pyspark.sql.connect.session", |
| "pyspark.sql.connect.window", |
| "pyspark.sql.connect.column", |
| "pyspark.sql.connect.merge", |
| "pyspark.sql.connect.readwriter", |
| "pyspark.sql.connect.dataframe", |
| "pyspark.sql.connect.functions.builtin", |
| "pyspark.sql.connect.functions.partitioning", |
| "pyspark.sql.connect.observation", |
| "pyspark.sql.connect.avro.functions", |
| "pyspark.sql.connect.protobuf.functions", |
| "pyspark.sql.connect.streaming.readwriter", |
| "pyspark.sql.connect.streaming.query", |
| "pyspark.sql.connect.tvf", |
| # sql unittests |
| "pyspark.sql.tests.connect.test_connect_plan", |
| "pyspark.sql.tests.connect.test_connect_basic", |
| "pyspark.sql.tests.connect.test_connect_dataframe_property", |
| "pyspark.sql.tests.connect.test_connect_channel", |
| "pyspark.sql.tests.connect.test_connect_error", |
| "pyspark.sql.tests.connect.test_connect_function", |
| "pyspark.sql.tests.connect.test_connect_collection", |
| "pyspark.sql.tests.connect.test_connect_column", |
| "pyspark.sql.tests.connect.test_connect_creation", |
| "pyspark.sql.tests.connect.test_connect_readwriter", |
| "pyspark.sql.tests.connect.test_connect_retry", |
| "pyspark.sql.tests.connect.test_connect_session", |
| "pyspark.sql.tests.connect.test_connect_stat", |
| "pyspark.sql.tests.connect.test_parity_datasources", |
| "pyspark.sql.tests.connect.test_parity_errors", |
| "pyspark.sql.tests.connect.test_parity_catalog", |
| "pyspark.sql.tests.connect.test_parity_conf", |
| "pyspark.sql.tests.connect.test_parity_serde", |
| "pyspark.sql.tests.connect.test_parity_functions", |
| "pyspark.sql.tests.connect.test_parity_group", |
| "pyspark.sql.tests.connect.test_parity_sql", |
| "pyspark.sql.tests.connect.test_parity_job_cancellation", |
| "pyspark.sql.tests.connect.test_parity_dataframe", |
| "pyspark.sql.tests.connect.test_parity_collection", |
| "pyspark.sql.tests.connect.test_parity_creation", |
| "pyspark.sql.tests.connect.test_parity_observation", |
| "pyspark.sql.tests.connect.test_parity_repartition", |
| "pyspark.sql.tests.connect.test_parity_stat", |
| "pyspark.sql.tests.connect.test_parity_subquery", |
| "pyspark.sql.tests.connect.test_parity_types", |
| "pyspark.sql.tests.connect.test_parity_column", |
| "pyspark.sql.tests.connect.test_parity_readwriter", |
| "pyspark.sql.tests.connect.test_parity_udf", |
| "pyspark.sql.tests.connect.test_parity_udf_combinations", |
| "pyspark.sql.tests.connect.test_parity_udf_profiler", |
| "pyspark.sql.tests.connect.test_parity_unified_udf", |
| "pyspark.sql.tests.connect.test_parity_memory_profiler", |
| "pyspark.sql.tests.connect.test_parity_udtf", |
| "pyspark.sql.tests.connect.test_parity_tvf", |
| "pyspark.sql.tests.connect.test_parity_python_datasource", |
| "pyspark.sql.tests.connect.test_parity_python_streaming_datasource", |
| "pyspark.sql.tests.connect.test_parity_frame_plot", |
| "pyspark.sql.tests.connect.test_parity_frame_plot_plotly", |
| "pyspark.sql.tests.connect.test_utils", |
| "pyspark.sql.tests.connect.client.test_artifact", |
| "pyspark.sql.tests.connect.client.test_artifact_localcluster", |
| "pyspark.sql.tests.connect.client.test_client", |
| "pyspark.sql.tests.connect.client.test_reattach", |
| "pyspark.sql.tests.connect.streaming.test_parity_streaming", |
| "pyspark.sql.tests.connect.streaming.test_parity_listener", |
| "pyspark.sql.tests.connect.streaming.test_parity_foreach", |
| "pyspark.sql.tests.connect.streaming.test_parity_foreach_batch", |
| "pyspark.sql.tests.connect.test_resources", |
| "pyspark.sql.tests.connect.shell.test_progress", |
| "pyspark.sql.tests.connect.test_df_debug", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow_map", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow_cogrouped_map", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow_python_udf", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow_udf", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow_udf_scalar", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow_udf_grouped_agg", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow_udf_window", |
| "pyspark.sql.tests.connect.arrow.test_parity_arrow_udtf", |
| "pyspark.sql.tests.connect.pandas.test_parity_pandas_map", |
| "pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map", |
| "pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map_with_state", |
| "pyspark.sql.tests.connect.pandas.test_parity_pandas_cogrouped_map", |
| "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf", |
| "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_scalar", |
| "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_grouped_agg", |
| "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_window", |
| "pyspark.sql.tests.connect.pandas.test_parity_pandas_transform_with_state", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and |
| # they aren't available there |
| ], |
| ) |
| |
| |
| pyspark_ml_connect = Module( |
| name="pyspark-ml-connect", |
| dependencies=[pyspark_connect, pyspark_ml], |
| source_file_regexes=[ |
| "python/pyspark/ml/connect", |
| ], |
| python_test_goals=[ |
| # ml doctests |
| "pyspark.ml.connect.functions", |
| # ml unittests |
| "pyspark.ml.tests.connect.test_connect_cache", |
| "pyspark.ml.tests.connect.test_connect_function", |
| "pyspark.ml.tests.connect.test_parity_torch_distributor", |
| "pyspark.ml.tests.connect.test_parity_torch_data_loader", |
| "pyspark.ml.tests.connect.test_connect_summarizer", |
| "pyspark.ml.tests.connect.test_connect_evaluation", |
| "pyspark.ml.tests.connect.test_connect_feature", |
| "pyspark.ml.tests.connect.test_connect_classification", |
| "pyspark.ml.tests.connect.test_connect_pipeline", |
| "pyspark.ml.tests.connect.test_connect_tuning", |
| "pyspark.ml.tests.connect.test_parity_als", |
| "pyspark.ml.tests.connect.test_parity_fpm", |
| "pyspark.ml.tests.connect.test_parity_classification", |
| "pyspark.ml.tests.connect.test_parity_regression", |
| "pyspark.ml.tests.connect.test_parity_clustering", |
| "pyspark.ml.tests.connect.test_parity_evaluation", |
| "pyspark.ml.tests.connect.test_parity_feature", |
| "pyspark.ml.tests.connect.test_parity_functions", |
| "pyspark.ml.tests.connect.test_parity_pipeline", |
| "pyspark.ml.tests.connect.test_parity_tuning", |
| "pyspark.ml.tests.connect.test_parity_ovr", |
| "pyspark.ml.tests.connect.test_parity_stat", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and |
| # they aren't available there |
| ], |
| ) |
| |
| |
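# The pandas-on-Spark parity tests for Spark Connect are split across several "part" modules
# so that each part can be run separately.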
| pyspark_pandas_connect_part0 = Module( |
| name="pyspark-pandas-connect-part0", |
| dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow], |
| source_file_regexes=[ |
| "python/pyspark/pandas", |
| ], |
| python_test_goals=[ |
| # unittests dedicated for Spark Connect |
| "pyspark.pandas.tests.connect.test_connect_plotting", |
| # pandas-on-Spark unittests |
| "pyspark.pandas.tests.connect.test_parity_categorical", |
| "pyspark.pandas.tests.connect.test_parity_config", |
| "pyspark.pandas.tests.connect.test_parity_extension", |
| "pyspark.pandas.tests.connect.test_parity_frame_spark", |
| "pyspark.pandas.tests.connect.test_parity_generic_functions", |
| "pyspark.pandas.tests.connect.test_parity_indexops_spark", |
| "pyspark.pandas.tests.connect.test_parity_internal", |
| "pyspark.pandas.tests.connect.test_parity_namespace", |
| "pyspark.pandas.tests.connect.test_parity_numpy_compat", |
| "pyspark.pandas.tests.connect.test_parity_repr", |
| "pyspark.pandas.tests.connect.test_parity_scalars", |
| "pyspark.pandas.tests.connect.test_parity_spark_functions", |
| "pyspark.pandas.tests.connect.test_parity_sql", |
| "pyspark.pandas.tests.connect.test_parity_typedef", |
| "pyspark.pandas.tests.connect.test_parity_utils", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_base", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_binary_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_boolean_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_categorical_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_complex_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_date_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_datetime_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_null_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_reverse", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_string_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_udt_ops", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_timedelta_ops", |
| "pyspark.pandas.tests.connect.plot.test_parity_frame_plot", |
| "pyspark.pandas.tests.connect.plot.test_parity_frame_plot_matplotlib", |
| "pyspark.pandas.tests.connect.plot.test_parity_frame_plot_plotly", |
| "pyspark.pandas.tests.connect.plot.test_parity_series_plot", |
| "pyspark.pandas.tests.connect.plot.test_parity_series_plot_matplotlib", |
| "pyspark.pandas.tests.connect.plot.test_parity_series_plot_plotly", |
| "pyspark.pandas.tests.connect.indexes.test_parity_default", |
| "pyspark.pandas.tests.connect.indexes.test_parity_category", |
| "pyspark.pandas.tests.connect.indexes.test_parity_timedelta", |
| "pyspark.pandas.tests.connect.indexes.test_parity_basic", |
| "pyspark.pandas.tests.connect.indexes.test_parity_getattr", |
| "pyspark.pandas.tests.connect.indexes.test_parity_name", |
| "pyspark.pandas.tests.connect.indexes.test_parity_conversion", |
| "pyspark.pandas.tests.connect.indexes.test_parity_drop", |
| "pyspark.pandas.tests.connect.indexes.test_parity_level", |
| "pyspark.pandas.tests.connect.indexes.test_parity_missing", |
| "pyspark.pandas.tests.connect.indexes.test_parity_repeat", |
| "pyspark.pandas.tests.connect.indexes.test_parity_sort", |
| "pyspark.pandas.tests.connect.indexes.test_parity_stat", |
| "pyspark.pandas.tests.connect.indexes.test_parity_symmetric_diff", |
| "pyspark.pandas.tests.connect.indexes.test_parity_take", |
| "pyspark.pandas.tests.connect.indexes.test_parity_unique", |
| "pyspark.pandas.tests.connect.indexes.test_parity_asof", |
| "pyspark.pandas.tests.connect.indexes.test_parity_astype", |
| "pyspark.pandas.tests.connect.indexes.test_parity_delete", |
| "pyspark.pandas.tests.connect.indexes.test_parity_diff", |
| "pyspark.pandas.tests.connect.indexes.test_parity_insert", |
| "pyspark.pandas.tests.connect.indexes.test_parity_map", |
| "pyspark.pandas.tests.connect.indexes.test_parity_align", |
| "pyspark.pandas.tests.connect.indexes.test_parity_indexing", |
| "pyspark.pandas.tests.connect.indexes.test_parity_indexing_adv", |
| "pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic", |
| "pyspark.pandas.tests.connect.indexes.test_parity_indexing_iloc", |
| "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc", |
| "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_2d", |
| "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_multi_idx", |
| "pyspark.pandas.tests.connect.indexes.test_parity_reindex", |
| "pyspark.pandas.tests.connect.indexes.test_parity_rename", |
| "pyspark.pandas.tests.connect.indexes.test_parity_reset_index", |
| "pyspark.pandas.tests.connect.indexes.test_parity_datetime", |
| "pyspark.pandas.tests.connect.indexes.test_parity_datetime_at", |
| "pyspark.pandas.tests.connect.indexes.test_parity_datetime_between", |
| "pyspark.pandas.tests.connect.computation.test_parity_any_all", |
| "pyspark.pandas.tests.connect.computation.test_parity_apply_func", |
| "pyspark.pandas.tests.connect.computation.test_parity_binary_ops", |
| "pyspark.pandas.tests.connect.computation.test_parity_combine", |
| "pyspark.pandas.tests.connect.computation.test_parity_compute", |
| "pyspark.pandas.tests.connect.computation.test_parity_cov", |
| "pyspark.pandas.tests.connect.computation.test_parity_corr", |
| "pyspark.pandas.tests.connect.computation.test_parity_corrwith", |
| "pyspark.pandas.tests.connect.computation.test_parity_cumulative", |
| "pyspark.pandas.tests.connect.computation.test_parity_describe", |
| "pyspark.pandas.tests.connect.computation.test_parity_eval", |
| "pyspark.pandas.tests.connect.computation.test_parity_melt", |
| "pyspark.pandas.tests.connect.computation.test_parity_missing_data", |
| "pyspark.pandas.tests.connect.groupby.test_parity_stat", |
| "pyspark.pandas.tests.connect.groupby.test_parity_stat_adv", |
| "pyspark.pandas.tests.connect.groupby.test_parity_stat_ddof", |
| "pyspark.pandas.tests.connect.groupby.test_parity_stat_func", |
| "pyspark.pandas.tests.connect.groupby.test_parity_stat_prod", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and |
| # they aren't available there |
| ], |
| ) |
| |
| pyspark_pandas_connect_part1 = Module( |
| name="pyspark-pandas-connect-part1", |
| dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow], |
| source_file_regexes=[ |
| "python/pyspark/pandas", |
| ], |
| python_test_goals=[ |
| # pandas-on-Spark unittests |
| "pyspark.pandas.tests.connect.frame.test_parity_attrs", |
| "pyspark.pandas.tests.connect.frame.test_parity_axis", |
| "pyspark.pandas.tests.connect.frame.test_parity_constructor", |
| "pyspark.pandas.tests.connect.frame.test_parity_conversion", |
| "pyspark.pandas.tests.connect.frame.test_parity_reindexing", |
| "pyspark.pandas.tests.connect.frame.test_parity_reshaping", |
| "pyspark.pandas.tests.connect.frame.test_parity_spark", |
| "pyspark.pandas.tests.connect.frame.test_parity_take", |
| "pyspark.pandas.tests.connect.frame.test_parity_take_adv", |
| "pyspark.pandas.tests.connect.frame.test_parity_time_series", |
| "pyspark.pandas.tests.connect.frame.test_parity_truncate", |
| "pyspark.pandas.tests.connect.groupby.test_parity_aggregate", |
| "pyspark.pandas.tests.connect.groupby.test_parity_apply_func", |
| "pyspark.pandas.tests.connect.groupby.test_parity_corr", |
| "pyspark.pandas.tests.connect.groupby.test_parity_cumulative", |
| "pyspark.pandas.tests.connect.groupby.test_parity_missing_data", |
| "pyspark.pandas.tests.connect.groupby.test_parity_split_apply", |
| "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_count", |
| "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_first", |
| "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_last", |
| "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max", |
| "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_skew", |
| "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_std", |
| "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_var", |
| "pyspark.pandas.tests.connect.series.test_parity_datetime", |
| "pyspark.pandas.tests.connect.series.test_parity_string_ops_adv", |
| "pyspark.pandas.tests.connect.series.test_parity_string_ops_basic", |
| "pyspark.pandas.tests.connect.series.test_parity_all_any", |
| "pyspark.pandas.tests.connect.series.test_parity_arg_ops", |
| "pyspark.pandas.tests.connect.series.test_parity_as_of", |
| "pyspark.pandas.tests.connect.series.test_parity_as_type", |
| "pyspark.pandas.tests.connect.series.test_parity_compute", |
| "pyspark.pandas.tests.connect.series.test_parity_conversion", |
| "pyspark.pandas.tests.connect.series.test_parity_cumulative", |
| "pyspark.pandas.tests.connect.series.test_parity_index", |
| "pyspark.pandas.tests.connect.series.test_parity_missing_data", |
| "pyspark.pandas.tests.connect.series.test_parity_series", |
| "pyspark.pandas.tests.connect.series.test_parity_sort", |
| "pyspark.pandas.tests.connect.series.test_parity_stat", |
| "pyspark.pandas.tests.connect.series.test_parity_interpolate", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mod", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mul_div", |
| "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_pow", |
| "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies", |
| "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_kwargs", |
| "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_multiindex", |
| "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_object", |
| "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_prefix", |
| "pyspark.pandas.tests.connect.reshape.test_parity_merge_asof", |
| "pyspark.pandas.tests.connect.indexes.test_parity_append", |
| "pyspark.pandas.tests.connect.indexes.test_parity_intersection", |
| "pyspark.pandas.tests.connect.indexes.test_parity_monotonic", |
| "pyspark.pandas.tests.connect.indexes.test_parity_union", |
| "pyspark.pandas.tests.connect.indexes.test_parity_datetime_ceil", |
| "pyspark.pandas.tests.connect.indexes.test_parity_datetime_floor", |
| "pyspark.pandas.tests.connect.indexes.test_parity_datetime_iso", |
| "pyspark.pandas.tests.connect.indexes.test_parity_datetime_map", |
| "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property", |
| "pyspark.pandas.tests.connect.indexes.test_parity_datetime_round", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_shift", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_transform", |
| # fallback |
| "pyspark.pandas.tests.connect.frame.test_parity_asfreq", |
| "pyspark.pandas.tests.connect.frame.test_parity_asof", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and |
| # they aren't available there |
| ], |
| ) |
| |
| |
| pyspark_pandas_connect_part2 = Module( |
| name="pyspark-pandas-connect-part2", |
| dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow], |
| source_file_regexes=[ |
| "python/pyspark/pandas", |
| ], |
| python_test_goals=[ |
| # pandas-on-Spark unittests |
| "pyspark.pandas.tests.connect.computation.test_parity_pivot", |
| "pyspark.pandas.tests.connect.computation.test_parity_pivot_table", |
| "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_adv", |
| "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx", |
| "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx_adv", |
| "pyspark.pandas.tests.connect.computation.test_parity_stats", |
| "pyspark.pandas.tests.connect.frame.test_parity_interpolate", |
| "pyspark.pandas.tests.connect.frame.test_parity_interpolate_error", |
| "pyspark.pandas.tests.connect.resample.test_parity_frame", |
| "pyspark.pandas.tests.connect.resample.test_parity_series", |
| "pyspark.pandas.tests.connect.resample.test_parity_error", |
| "pyspark.pandas.tests.connect.resample.test_parity_missing", |
| "pyspark.pandas.tests.connect.resample.test_parity_on", |
| "pyspark.pandas.tests.connect.resample.test_parity_timezone", |
| "pyspark.pandas.tests.connect.window.test_parity_ewm_error", |
| "pyspark.pandas.tests.connect.window.test_parity_ewm_mean", |
| "pyspark.pandas.tests.connect.window.test_parity_groupby_ewm_mean", |
| "pyspark.pandas.tests.connect.window.test_parity_missing", |
| "pyspark.pandas.tests.connect.window.test_parity_rolling", |
| "pyspark.pandas.tests.connect.window.test_parity_rolling_adv", |
| "pyspark.pandas.tests.connect.window.test_parity_rolling_count", |
| "pyspark.pandas.tests.connect.window.test_parity_rolling_error", |
| "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling", |
| "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_adv", |
| "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_count", |
| "pyspark.pandas.tests.connect.window.test_parity_expanding", |
| "pyspark.pandas.tests.connect.window.test_parity_expanding_adv", |
| "pyspark.pandas.tests.connect.window.test_parity_expanding_error", |
| "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding", |
| "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding_adv", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_frame", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_series", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_error", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_align", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic_slow", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_cov", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_corrwith", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_index", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_series", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_frame", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_series", |
| "pyspark.pandas.tests.connect.groupby.test_parity_index", |
| "pyspark.pandas.tests.connect.groupby.test_parity_describe", |
| "pyspark.pandas.tests.connect.groupby.test_parity_head_tail", |
| "pyspark.pandas.tests.connect.groupby.test_parity_groupby", |
| "pyspark.pandas.tests.connect.groupby.test_parity_grouping", |
| "pyspark.pandas.tests.connect.groupby.test_parity_missing", |
| "pyspark.pandas.tests.connect.groupby.test_parity_nlargest_nsmallest", |
| "pyspark.pandas.tests.connect.groupby.test_parity_raises", |
| "pyspark.pandas.tests.connect.groupby.test_parity_rank", |
| "pyspark.pandas.tests.connect.groupby.test_parity_size", |
| "pyspark.pandas.tests.connect.groupby.test_parity_value_counts", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and |
| # they aren't available there |
| ], |
| ) |
| |
| |
| pyspark_pandas_connect_part3 = Module( |
| name="pyspark-pandas-connect-part3", |
| dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow], |
| source_file_regexes=[ |
| "python/pyspark/pandas", |
| ], |
| python_test_goals=[ |
| # pandas-on-Spark unittests |
| "pyspark.pandas.tests.connect.io.test_parity_io", |
| "pyspark.pandas.tests.connect.io.test_parity_csv", |
| "pyspark.pandas.tests.connect.io.test_parity_feather", |
| "pyspark.pandas.tests.connect.io.test_parity_stata", |
| "pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion", |
| "pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io", |
| "pyspark.pandas.tests.connect.io.test_parity_series_conversion", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext_float", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain_ext", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain_ext_float", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_frame", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_series", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_bitwise", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_combine_first", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_compare_series", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_inner", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_outer", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_aggregate", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_apply", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_cumulative", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_diff", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_diff_len", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_fillna", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_filter", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_split_apply_combine", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_adv", |
| "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_count", |
| ], |
| excluded_python_implementations=[ |
| "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and |
| # they aren't available there |
| ], |
| ) |
| |
| |
| pyspark_errors = Module( |
| name="pyspark-errors", |
| dependencies=[], |
| source_file_regexes=[ |
        # SPARK-44544: Force the execution of pyspark_errors when there are any changes
        # in PySpark, since the Python Packaging Tests are only enabled within this module.
        # This is one of the smallest Python test modules; it contains only a few small test
        # files and normally takes < 2 seconds, so the additional cost is small.
| "python/", |
| "python/pyspark/errors", |
| ], |
| python_test_goals=[ |
| # unittests |
| "pyspark.errors.tests.test_connect_errors_conversion", |
| "pyspark.errors.tests.test_errors", |
| "pyspark.errors.tests.test_traceback", |
| "pyspark.errors.tests.connect.test_parity_traceback", |
| ], |
| ) |
| |
| pyspark_logger = Module( |
| name="pyspark-logger", |
| dependencies=[], |
| source_file_regexes=["python/pyspark/logger"], |
| python_test_goals=[ |
| # doctests |
| "pyspark.logger.logger", |
| # unittests |
| "pyspark.logger.tests.test_logger", |
| "pyspark.logger.tests.connect.test_parity_logger", |
| ], |
| ) |
| |
| pyspark_pipelines = Module( |
| name="pyspark-pipelines", |
| dependencies=[pyspark_core, pyspark_sql, pyspark_connect], |
| source_file_regexes=["python/pyspark/pipelines"], |
| python_test_goals=[ |
| "pyspark.pipelines.tests.test_block_connect_access", |
| "pyspark.pipelines.tests.test_block_session_mutations", |
| "pyspark.pipelines.tests.test_cli", |
| "pyspark.pipelines.tests.test_decorators", |
| "pyspark.pipelines.tests.test_graph_element_registry", |
| "pyspark.pipelines.tests.test_init_cli", |
| ], |
| ) |
| |
| sparkr = Module( |
| name="sparkr", |
| dependencies=[hive, mllib], |
| source_file_regexes=[ |
| "R/", |
| ], |
| should_run_r_tests=True, |
| ) |
| |
| |
| docs = Module( |
| name="docs", |
| dependencies=[], |
| source_file_regexes=[ |
| "docs/", |
| ], |
| ) |
| |
| build = Module( |
| name="build", |
| dependencies=[], |
| source_file_regexes=[ |
| ".*pom.xml", |
| "dev/test-dependencies.sh", |
| ], |
| should_run_build_tests=True, |
| ) |
| |
| yarn = Module( |
| name="yarn", |
| dependencies=[], |
| source_file_regexes=[ |
| "resource-managers/yarn/", |
| "common/network-yarn/", |
| ], |
| build_profile_flags=["-Pyarn"], |
| sbt_test_goals=[ |
| "yarn/test", |
| "network-yarn/test", |
| ], |
| test_tags=["org.apache.spark.tags.ExtendedYarnTest"], |
| ) |
| |
| kubernetes = Module( |
| name="kubernetes", |
| dependencies=[], |
| source_file_regexes=["resource-managers/kubernetes"], |
| build_profile_flags=["-Pkubernetes", "-Pvolcano"], |
| sbt_test_goals=["kubernetes/test"], |
| ) |
| |
| hadoop_cloud = Module( |
| name="hadoop-cloud", |
| dependencies=[], |
| source_file_regexes=["hadoop-cloud"], |
| build_profile_flags=["-Phadoop-cloud"], |
| sbt_test_goals=["hadoop-cloud/test"], |
| ) |
| |
| spark_ganglia_lgpl = Module( |
| name="spark-ganglia-lgpl", |
| dependencies=[], |
| build_profile_flags=["-Pspark-ganglia-lgpl"], |
| source_file_regexes=[ |
| "connector/spark-ganglia-lgpl", |
| ], |
| ) |
| |
| docker_integration_tests = Module( |
| name="docker-integration-tests", |
| dependencies=[sql], |
| build_profile_flags=["-Pdocker-integration-tests"], |
| source_file_regexes=["connector/docker-integration-tests"], |
| sbt_test_goals=["docker-integration-tests/test"], |
| environ=None |
| if "GITHUB_ACTIONS" not in os.environ |
| else {"ENABLE_DOCKER_INTEGRATION_TESTS": "1"}, |
| test_tags=["org.apache.spark.tags.DockerTest"], |
| ) |
| |
| # The root module is a dummy module which is used to run all of the tests. |
| # No other modules should directly depend on this module. |
| root = Module( |
| name="root", |
| dependencies=[build, core], # Changes to build should trigger all tests. |
| source_file_regexes=[], |
| # In order to run all of the tests, enable every test profile: |
| build_profile_flags=list( |
| set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules)) |
| ), |
| sbt_test_goals=[ |
| "test", |
| ], |
| python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)), |
| should_run_r_tests=True, |
| should_run_build_tests=True, |
| ) |