#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
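"""
Utilities for mapping a set of changed files to the Spark test modules whose tests need to run.
"""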
import os
import sys
import subprocess

from sparktestsupport import modules
from sparktestsupport.shellutils import run_cmd
from sparktestsupport.toposort import toposort_flatten


# -------------------------------------------------------------------------------------------------
# Functions for traversing module dependency graph
# -------------------------------------------------------------------------------------------------


def determine_modules_for_files(filenames):
"""
Given a list of filenames, return the set of modules that contain those files.
    If a file is not associated with a more specific submodule, this method considers the
    file to belong to the 'root' module. The `.github` directory is counted only when running
    under GitHub Actions, and `README.md` files are always ignored.

>>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"]))
['pyspark-core', 'pyspark-errors', 'sql']
>>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
['root']
>>> [x.name for x in determine_modules_for_files(["sql/README.md"])]
[]
"""
changed_modules = set()
for filename in filenames:
if filename.endswith("README.md"):
continue
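        # Global style configuration files are not tied to any particular module's tests.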
if filename in (
"scalastyle-config.xml",
"dev/checkstyle.xml",
"dev/checkstyle-suppressions.xml",
):
continue
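        # Changes under .github/ only affect CI workflows, so count them only in GitHub Actions.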
if ("GITHUB_ACTIONS" not in os.environ) and filename.startswith(".github"):
continue
matched_at_least_one_module = False
for module in modules.all_modules:
if module.contains_file(filename):
changed_modules.add(module)
matched_at_least_one_module = True
if not matched_at_least_one_module:
changed_modules.add(modules.root)
return changed_modules


def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None):
"""
    Given a patch commit and a target branch or ref, return the list of files changed in the
    diff between them; this is used to determine which modules' tests should be run.

>>> [x.name for x in determine_modules_for_files( \
identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))]
['graphx']
>>> 'root' in [x.name for x in determine_modules_for_files( \
identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
True
"""
    if target_branch is None and target_ref is None:
        raise ValueError("must specify either target_branch or target_ref")
    elif target_branch is not None and target_ref is not None:
        raise ValueError("must specify either target_branch or target_ref, not both")
if target_branch is not None:
diff_target = target_branch
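        # Fetch the branch from origin so that the diff below can resolve it locally.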
run_cmd(["git", "fetch", "origin", str(target_branch + ":" + target_branch)])
else:
diff_target = target_ref
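    # List the names of all files that differ between the patch and the diff target.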
raw_output = subprocess.check_output(
["git", "diff", "--name-only", patch_sha, diff_target], universal_newlines=True
)
# Remove any empty strings
return [f for f in raw_output.split("\n") if f]


def determine_modules_to_test(changed_modules, deduplicated=True):
"""
Given a set of modules that have changed, compute the transitive closure of those modules'
dependent modules in order to determine the set of modules that should be tested.
Returns a topologically-sorted list of modules (ties are broken by sorting on module names).
    If ``deduplicated`` is disabled, the modules are returned without taking deduplication
    by dependencies into account.

>>> [x.name for x in determine_modules_to_test([modules.root])]
['root']
>>> [x.name for x in determine_modules_to_test([modules.build])]
['root']
>>> [x.name for x in determine_modules_to_test([modules.core])]
['root']
>>> [x.name for x in determine_modules_to_test([modules.launcher])]
['root']
>>> [x.name for x in determine_modules_to_test([modules.graphx])]
['graphx', 'examples']
>>> sorted([x.name for x in determine_modules_to_test([modules.sql])])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib',
'pyspark-pandas', 'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1',
'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3', 'pyspark-pandas-slow',
'pyspark-pipelines', 'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql',
'sql-kafka-0-10']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sparkr, modules.sql], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib',
'pyspark-pandas', 'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1',
'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3', 'pyspark-pandas-slow',
'pyspark-pipelines', 'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql',
'sql-kafka-0-10']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sql, modules.core], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'catalyst', 'connect', 'core', 'docker-integration-tests', 'examples', 'graphx',
'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'protobuf', 'pyspark-connect',
'pyspark-core', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib', 'pyspark-pandas',
'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1', 'pyspark-pandas-connect-part2',
'pyspark-pandas-connect-part3', 'pyspark-pandas-slow', 'pyspark-pipelines', 'pyspark-resource',
'pyspark-sql', 'pyspark-streaming', 'pyspark-testing', 'repl', 'root', 'sparkr', 'sql',
'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
"""
modules_to_test = set()
for module in changed_modules:
modules_to_test = modules_to_test.union(
determine_modules_to_test(module.dependent_modules, deduplicated)
)
modules_to_test = modules_to_test.union(set(changed_modules))
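    # Without deduplication, return the full unsorted set, even when 'root' (which implies
    # everything) is present.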
if not deduplicated:
return modules_to_test
# If we need to run all of the tests, then we should short-circuit and return 'root'
if modules.root in modules_to_test:
return [modules.root]
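    # Topologically sort the modules against the dependency graph restricted to the modules
    # under test, so each module is listed after its dependencies (ties sorted by name).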
return toposort_flatten(
{m: set(m.dependencies).intersection(modules_to_test) for m in modules_to_test}, sort=True
)


def determine_tags_to_exclude(changed_modules):
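    """
    Return the test tags of every module that is not in ``changed_modules`` so that tests
    bearing those tags can be excluded from the run.
    """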
tags = []
for m in modules.all_modules:
if m not in changed_modules:
tags += m.test_tags
return tags


def _test():
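    # Run the doctests embedded in this module's docstrings.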
import doctest
failure_count = doctest.testmod()[0]
if failure_count:
sys.exit(-1)
if __name__ == "__main__":
_test()