dev-tools/update-license-files.py - storm - Git at Google

 #!/usr/bin/env python3

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """Usage: update-license-files.py [--skip-build-storm]

 Regenerates DEPENDENCY-LICENSES and the binary dependencies section of LICENSE-binary.
 Run this after changing dependencies to bring license files up to date.

 Depends on "requests" is NOT required. Only needs Maven and Python 3.
 """

 from contextlib import contextmanager
 from pathlib import Path
 import os
 import subprocess
 import shlex
 import shutil
 import filecmp
 import re
 import argparse

 project_root = Path(__file__).resolve().parent.parent
 update_dependency_licenses_cmd = ('mvn license:aggregate-add-third-party@generate-and-check-licenses -Dlicense.skipAggregateAddThirdParty=false -B')

 LICENSE_BINARY_SEPARATOR = '----------------------------END OF SOURCE NOTICES -------------------------------------------'

 LICENSE_BINARY_PREAMBLE = """

 The following dependencies are included in the binary Storm distributions, in addition to the source dependencies listed above.
 The license texts of these dependencies can be found in the licenses directory.
 """


 @contextmanager
 def cd(newdir):
     prevdir = Path.cwd()
     os.chdir(newdir.expanduser())
     try:
         yield
     finally:
         os.chdir(prevdir)


 def read_lines(path):
     with open(path) as f:
         return f.readlines()


 def build_storm():
     print("Building Storm")
     subprocess.check_call(shlex.split(
         'mvn clean install -B -DskipTests -Dcheckstyle.skip -Dpmd.skip'
     ))
     print("Done building Storm")


 def generate_dependency_licenses():
     """Generates DEPENDENCY-LICENSES in target/. The committed DEPENDENCY-LICENSES is not modified."""
     print('Generating DEPENDENCY-LICENSES')
     cmd = (update_dependency_licenses_cmd +
            ' -Dlicense.thirdPartyFilename=DEPENDENCY-LICENSES' +
            ' -Dlicense.outputDirectory=target')
     subprocess.check_call(shlex.split(cmd))
     print('Done generating DEPENDENCY-LICENSES')


 def generate_storm_dist_license_report():
     with cd(project_root / 'storm-dist' / 'binary'):
         print('')
         print('Generating storm-dist license report')
         subprocess.check_call(shlex.split(update_dependency_licenses_cmd))
         print('Done generating storm-dist license report')


 def extract_license_report_maven_coordinates(lines):
     """Extract Maven coordinates from license report lines.
     Lines like: ' * Checker Qual (org.checkerframework:checker-qual:2.5.2 - https://checkerframework.org)'
     """
     matches = map(lambda line: re.match(
         r'\s+\*.*\((?P<gav>.*) \- .*\).*', line), lines)
     return set(map(lambda match: match.group('gav'), filter(lambda match: match is not None, matches)))


 def extract_dependency_list_maven_coordinates(lines):
     """Extract Maven coordinates from 'mvn dependency:list' output.
     Lines like: '   com.google.code.findbugs:jsr305:jar:3.0.2 -- module jsr305 (auto)'
     """
     matches = map(lambda line: re.match(
         r'\s+(?P<group>\S*)\:(?P<artifact>\S*)\:(?P<type>\S*)\:(?P<version>\S*)', line), lines)
     return set(map(lambda match: match.group('group') + ':' + match.group('artifact') + ':' + match.group('version'),
                    filter(lambda match: match is not None, matches)))


 def get_shaded_dep_coordinates():
     """Gets the set of Maven coordinates for storm-shaded-deps (excluding Storm's own modules)."""
     with cd(project_root / 'storm-shaded-deps'):
         print("Generating dependency list for storm-shaded-deps")
         subprocess.check_call(shlex.split(
             'mvn dependency:list -DoutputFile=target/deps-list -Dmdep.outputScope=false -DincludeScope=compile -B'))
         print("Done generating dependency list for storm-shaded-deps")
     shaded_dep_coordinates = extract_dependency_list_maven_coordinates(
         read_lines(project_root / 'storm-shaded-deps' / 'target' / 'deps-list'))
     shaded_dep_coordinates = set(filter(lambda coordinate: 'org.apache.storm:' not in coordinate, shaded_dep_coordinates))
     print('storm-shaded-deps dependencies: ' + str(shaded_dep_coordinates))
     print('')
     return shaded_dep_coordinates


 def parse_grouped_license_file(lines):
     """Parse a license report file (DEPENDENCY-LICENSES format) into structured groups.

     Returns a list of (header_line, [entry_lines]) tuples, preserving blank lines and formatting.
     The header_line is the license group name (e.g. '    Apache License, Version 2.0\\n').
     entry_lines are the dependency lines under that group (e.g. '        * Foo (g:a:v - url)\\n').
     """
     groups = []
     current_header = None
     current_entries = []

     for line in lines:
         # Skip the file header (first few lines before any license group)
         stripped = line.rstrip('\n')

         # License group headers are indented with 4 spaces and have non-whitespace content
         if re.match(r'    \S', line) and not line.strip().startswith('*'):
             if current_header is not None:
                 groups.append((current_header, current_entries))
             current_header = line
             current_entries = []
         elif current_header is not None:
             current_entries.append(line)

     if current_header is not None:
         groups.append((current_header, current_entries))

     return groups


 def filter_groups_to_coordinates(groups, target_coordinates):
     """Filter license groups to only include entries whose coordinates are in the target set.

     Returns lines for the filtered license report section.
     """
     result_lines = []
     gav_pattern = re.compile(r'\s+\*.*\((?P<gav>.*) \- .*\).*')

     for header, entries in groups:
         filtered_entries = []
         for entry in entries:
             match = gav_pattern.match(entry)
             if match:
                 if match.group('gav') in target_coordinates:
                     filtered_entries.append(entry)
             # Keep blank lines between entries only if we have entries
         if filtered_entries:
             result_lines.append('\n')
             result_lines.append(header)
             result_lines.append('\n')
             for entry in filtered_entries:
                 result_lines.append(entry)

     return result_lines


 def update_dependency_licenses():
     """Copy target/DEPENDENCY-LICENSES to root DEPENDENCY-LICENSES.
     Returns True if the file changed."""
     src = project_root / 'target' / 'DEPENDENCY-LICENSES'
     dst = project_root / 'DEPENDENCY-LICENSES'
     if dst.exists() and filecmp.cmp(src, dst, shallow=False):
         print('DEPENDENCY-LICENSES is already up to date')
         return False
     shutil.copy2(src, dst)
     print('Updated DEPENDENCY-LICENSES')
     return True


 def merge_groups(base_groups, extra_groups, extra_coordinates):
     """Merge extra_groups entries into base_groups for coordinates in extra_coordinates
     that are not already present in base_groups.

     Returns a new list of (header, [entry_lines]) tuples.
     """
     # Collect all coordinates already in base_groups
     gav_pattern = re.compile(r'\s+\*.*\((?P<gav>.*) \- .*\).*')
     base_coords = set()
     for _, entries in base_groups:
         for entry in entries:
             match = gav_pattern.match(entry)
             if match:
                 base_coords.add(match.group('gav'))

     # Find extra entries to add (in extra_coordinates but not in base)
     missing_coords = extra_coordinates - base_coords
     if not missing_coords:
         return base_groups

     print(f'Adding {len(missing_coords)} shaded-deps entries from DEPENDENCY-LICENSES to LICENSE-binary')

     # Build a map: header_text -> list of entry lines to add
     extra_by_header = {}
     for header, entries in extra_groups:
         header_key = header.strip()
         for entry in entries:
             match = gav_pattern.match(entry)
             if match and match.group('gav') in missing_coords:
                 extra_by_header.setdefault(header_key, []).append(entry)

     # Merge into base_groups
     result = []
     seen_headers = set()
     for header, entries in base_groups:
         header_key = header.strip()
         seen_headers.add(header_key)
         merged_entries = list(entries)
         if header_key in extra_by_header:
             merged_entries.extend(extra_by_header[header_key])
         result.append((header, merged_entries))

     # Add any license groups that only exist in extra
     for header_key, extra_entries in extra_by_header.items():
         if header_key not in seen_headers:
             result.append(('    ' + header_key + '\n', extra_entries))

     return result


 def groups_to_lines(groups):
     """Convert parsed groups back to lines for writing."""
     result_lines = []
     for header, entries in sorted(groups, key=lambda g: g[0].strip().lower()):
         result_lines.append('\n')
         result_lines.append(header)
         result_lines.append('\n')
         for entry in entries:
             # Skip blank lines that were part of the original inter-group spacing
             if entry.strip():
                 result_lines.append(entry)
     return result_lines


 def update_license_binary(shaded_dep_coordinates):
     """Replace the binary dependencies section of LICENSE-binary.

     Uses storm-dist/binary THIRD-PARTY.txt as the base (already has all binary deps grouped),
     then merges in any storm-shaded-deps entries from DEPENDENCY-LICENSES that aren't already
     covered by the binary THIRD-PARTY.txt.

     Returns True if the file changed."""
     license_binary_path = project_root / 'LICENSE-binary'
     lines = read_lines(license_binary_path)

     # Find the separator line
     separator_idx = None
     for i, line in enumerate(lines):
         if LICENSE_BINARY_SEPARATOR in line:
             separator_idx = i
             break

     if separator_idx is None:
         print(f'ERROR: Could not find separator line in LICENSE-binary: "{LICENSE_BINARY_SEPARATOR}"')
         return False

     # Keep the static header (up to and including the separator)
     static_part = lines[:separator_idx + 1]

     # Use the binary THIRD-PARTY.txt as the base — it already contains all
     # storm-dist/binary dependencies in the correct grouped format
     binary_third_party_path = (project_root / 'storm-dist' / 'binary' / 'target' /
                                'generated-sources' / 'license' / 'THIRD-PARTY.txt')
     binary_groups = parse_grouped_license_file(read_lines(binary_third_party_path))

     # Merge in storm-shaded-deps entries from DEPENDENCY-LICENSES
     dep_licenses_lines = read_lines(project_root / 'target' / 'DEPENDENCY-LICENSES')
     dep_licenses_groups = parse_grouped_license_file(dep_licenses_lines)
     merged_groups = merge_groups(binary_groups, dep_licenses_groups, shaded_dep_coordinates)

     binary_section = groups_to_lines(merged_groups)

     # Compose the new file
     preamble_lines = LICENSE_BINARY_PREAMBLE.splitlines(keepends=True)
     new_content = static_part + preamble_lines + binary_section

     # Compare with current content
     if new_content == lines:
         print('LICENSE-binary is already up to date')
         return False

     with open(license_binary_path, 'w') as f:
         f.writelines(new_content)
     print('Updated LICENSE-binary')
     return True


 if __name__ == '__main__':
     with cd(project_root):
         parser = argparse.ArgumentParser(
             description='Update Storm license files (DEPENDENCY-LICENSES and LICENSE-binary)')
         parser.add_argument('--skip-build-storm', action='store_true',
                             help='skip building Storm (use if already built)')
         args = parser.parse_args()

         try:
             if not args.skip_build_storm:
                 build_storm()
             generate_dependency_licenses()
             generate_storm_dist_license_report()

             shaded_dep_coordinates = get_shaded_dep_coordinates()

             dep_changed = update_dependency_licenses()
             lic_changed = update_license_binary(shaded_dep_coordinates)

             if dep_changed or lic_changed:
                 print('\nLicense files were updated. Please review the changes.')
             else:
                 print('\nLicense files are already up to date. No changes made.')
         except subprocess.CalledProcessError as e:
             print(f'Command failed: {e}', flush=True)
             exit(1)
         except Exception as e:
             print(f'Error updating license files: {e}', flush=True)
             exit(1)
	#!/usr/bin/env python3

	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Usage: update-license-files.py [--skip-build-storm]

	Regenerates DEPENDENCY-LICENSES and the binary dependencies section of LICENSE-binary.
	Run this after changing dependencies to bring license files up to date.

	Depends on "requests" is NOT required. Only needs Maven and Python 3.
	"""

	from contextlib import contextmanager
	from pathlib import Path
	import os
	import subprocess
	import shlex
	import shutil
	import filecmp
	import re
	import argparse

	project_root = Path(__file__).resolve().parent.parent
	update_dependency_licenses_cmd = ('mvn license:aggregate-add-third-party@generate-and-check-licenses -Dlicense.skipAggregateAddThirdParty=false -B')

	LICENSE_BINARY_SEPARATOR = '----------------------------END OF SOURCE NOTICES -------------------------------------------'

	LICENSE_BINARY_PREAMBLE = """

	The following dependencies are included in the binary Storm distributions, in addition to the source dependencies listed above.
	The license texts of these dependencies can be found in the licenses directory.
	"""


	@contextmanager
	def cd(newdir):
	prevdir = Path.cwd()
	os.chdir(newdir.expanduser())
	try:
	yield
	finally:
	os.chdir(prevdir)


	def read_lines(path):
	with open(path) as f:
	return f.readlines()


	def build_storm():
	print("Building Storm")
	subprocess.check_call(shlex.split(
	'mvn clean install -B -DskipTests -Dcheckstyle.skip -Dpmd.skip'
	))
	print("Done building Storm")


	def generate_dependency_licenses():
	"""Generates DEPENDENCY-LICENSES in target/. The committed DEPENDENCY-LICENSES is not modified."""
	print('Generating DEPENDENCY-LICENSES')
	cmd = (update_dependency_licenses_cmd +
	' -Dlicense.thirdPartyFilename=DEPENDENCY-LICENSES' +
	' -Dlicense.outputDirectory=target')
	subprocess.check_call(shlex.split(cmd))
	print('Done generating DEPENDENCY-LICENSES')


	def generate_storm_dist_license_report():
	with cd(project_root / 'storm-dist' / 'binary'):
	print('')
	print('Generating storm-dist license report')
	subprocess.check_call(shlex.split(update_dependency_licenses_cmd))
	print('Done generating storm-dist license report')


	def extract_license_report_maven_coordinates(lines):
	"""Extract Maven coordinates from license report lines.
	Lines like: ' * Checker Qual (org.checkerframework:checker-qual:2.5.2 - https://checkerframework.org)'
	"""
	matches = map(lambda line: re.match(
	r'\s+\.\((?P<gav>.) \- .\).*', line), lines)
	return set(map(lambda match: match.group('gav'), filter(lambda match: match is not None, matches)))


	def extract_dependency_list_maven_coordinates(lines):
	"""Extract Maven coordinates from 'mvn dependency:list' output.
	Lines like: ' com.google.code.findbugs:jsr305:jar:3.0.2 -- module jsr305 (auto)'
	"""
	matches = map(lambda line: re.match(
	r'\s+(?P<group>\S)\:(?P<artifact>\S)\:(?P<type>\S)\:(?P<version>\S)', line), lines)
	return set(map(lambda match: match.group('group') + ':' + match.group('artifact') + ':' + match.group('version'),
	filter(lambda match: match is not None, matches)))


	def get_shaded_dep_coordinates():
	"""Gets the set of Maven coordinates for storm-shaded-deps (excluding Storm's own modules)."""
	with cd(project_root / 'storm-shaded-deps'):
	print("Generating dependency list for storm-shaded-deps")
	subprocess.check_call(shlex.split(
	'mvn dependency:list -DoutputFile=target/deps-list -Dmdep.outputScope=false -DincludeScope=compile -B'))
	print("Done generating dependency list for storm-shaded-deps")
	shaded_dep_coordinates = extract_dependency_list_maven_coordinates(
	read_lines(project_root / 'storm-shaded-deps' / 'target' / 'deps-list'))
	shaded_dep_coordinates = set(filter(lambda coordinate: 'org.apache.storm:' not in coordinate, shaded_dep_coordinates))
	print('storm-shaded-deps dependencies: ' + str(shaded_dep_coordinates))
	print('')
	return shaded_dep_coordinates


	def parse_grouped_license_file(lines):
	"""Parse a license report file (DEPENDENCY-LICENSES format) into structured groups.

	Returns a list of (header_line, [entry_lines]) tuples, preserving blank lines and formatting.
	The header_line is the license group name (e.g. ' Apache License, Version 2.0\\n').
	entry_lines are the dependency lines under that group (e.g. ' * Foo (g:a:v - url)\\n').
	"""
	groups = []
	current_header = None
	current_entries = []

	for line in lines:
	# Skip the file header (first few lines before any license group)
	stripped = line.rstrip('\n')

	# License group headers are indented with 4 spaces and have non-whitespace content
	if re.match(r' \S', line) and not line.strip().startswith('*'):
	if current_header is not None:
	groups.append((current_header, current_entries))
	current_header = line
	current_entries = []
	elif current_header is not None:
	current_entries.append(line)

	if current_header is not None:
	groups.append((current_header, current_entries))

	return groups


	def filter_groups_to_coordinates(groups, target_coordinates):
	"""Filter license groups to only include entries whose coordinates are in the target set.

	Returns lines for the filtered license report section.
	"""
	result_lines = []
	gav_pattern = re.compile(r'\s+\.\((?P<gav>.) \- .\).*')

	for header, entries in groups:
	filtered_entries = []
	for entry in entries:
	match = gav_pattern.match(entry)
	if match:
	if match.group('gav') in target_coordinates:
	filtered_entries.append(entry)
	# Keep blank lines between entries only if we have entries
	if filtered_entries:
	result_lines.append('\n')
	result_lines.append(header)
	result_lines.append('\n')
	for entry in filtered_entries:
	result_lines.append(entry)

	return result_lines


	def update_dependency_licenses():
	"""Copy target/DEPENDENCY-LICENSES to root DEPENDENCY-LICENSES.
	Returns True if the file changed."""
	src = project_root / 'target' / 'DEPENDENCY-LICENSES'
	dst = project_root / 'DEPENDENCY-LICENSES'
	if dst.exists() and filecmp.cmp(src, dst, shallow=False):
	print('DEPENDENCY-LICENSES is already up to date')
	return False
	shutil.copy2(src, dst)
	print('Updated DEPENDENCY-LICENSES')
	return True


	def merge_groups(base_groups, extra_groups, extra_coordinates):
	"""Merge extra_groups entries into base_groups for coordinates in extra_coordinates
	that are not already present in base_groups.

	Returns a new list of (header, [entry_lines]) tuples.
	"""
	# Collect all coordinates already in base_groups
	gav_pattern = re.compile(r'\s+\.\((?P<gav>.) \- .\).*')
	base_coords = set()
	for _, entries in base_groups:
	for entry in entries:
	match = gav_pattern.match(entry)
	if match:
	base_coords.add(match.group('gav'))

	# Find extra entries to add (in extra_coordinates but not in base)
	missing_coords = extra_coordinates - base_coords
	if not missing_coords:
	return base_groups

	print(f'Adding {len(missing_coords)} shaded-deps entries from DEPENDENCY-LICENSES to LICENSE-binary')

	# Build a map: header_text -> list of entry lines to add
	extra_by_header = {}
	for header, entries in extra_groups:
	header_key = header.strip()
	for entry in entries:
	match = gav_pattern.match(entry)
	if match and match.group('gav') in missing_coords:
	extra_by_header.setdefault(header_key, []).append(entry)

	# Merge into base_groups
	result = []
	seen_headers = set()
	for header, entries in base_groups:
	header_key = header.strip()
	seen_headers.add(header_key)
	merged_entries = list(entries)
	if header_key in extra_by_header:
	merged_entries.extend(extra_by_header[header_key])
	result.append((header, merged_entries))

	# Add any license groups that only exist in extra
	for header_key, extra_entries in extra_by_header.items():
	if header_key not in seen_headers:
	result.append((' ' + header_key + '\n', extra_entries))

	return result


	def groups_to_lines(groups):
	"""Convert parsed groups back to lines for writing."""
	result_lines = []
	for header, entries in sorted(groups, key=lambda g: g[0].strip().lower()):
	result_lines.append('\n')
	result_lines.append(header)
	result_lines.append('\n')
	for entry in entries:
	# Skip blank lines that were part of the original inter-group spacing
	if entry.strip():
	result_lines.append(entry)
	return result_lines


	def update_license_binary(shaded_dep_coordinates):
	"""Replace the binary dependencies section of LICENSE-binary.

	Uses storm-dist/binary THIRD-PARTY.txt as the base (already has all binary deps grouped),
	then merges in any storm-shaded-deps entries from DEPENDENCY-LICENSES that aren't already
	covered by the binary THIRD-PARTY.txt.

	Returns True if the file changed."""
	license_binary_path = project_root / 'LICENSE-binary'
	lines = read_lines(license_binary_path)

	# Find the separator line
	separator_idx = None
	for i, line in enumerate(lines):
	if LICENSE_BINARY_SEPARATOR in line:
	separator_idx = i
	break

	if separator_idx is None:
	print(f'ERROR: Could not find separator line in LICENSE-binary: "{LICENSE_BINARY_SEPARATOR}"')
	return False

	# Keep the static header (up to and including the separator)
	static_part = lines[:separator_idx + 1]

	# Use the binary THIRD-PARTY.txt as the base — it already contains all
	# storm-dist/binary dependencies in the correct grouped format
	binary_third_party_path = (project_root / 'storm-dist' / 'binary' / 'target' /
	'generated-sources' / 'license' / 'THIRD-PARTY.txt')
	binary_groups = parse_grouped_license_file(read_lines(binary_third_party_path))

	# Merge in storm-shaded-deps entries from DEPENDENCY-LICENSES
	dep_licenses_lines = read_lines(project_root / 'target' / 'DEPENDENCY-LICENSES')
	dep_licenses_groups = parse_grouped_license_file(dep_licenses_lines)
	merged_groups = merge_groups(binary_groups, dep_licenses_groups, shaded_dep_coordinates)

	binary_section = groups_to_lines(merged_groups)

	# Compose the new file
	preamble_lines = LICENSE_BINARY_PREAMBLE.splitlines(keepends=True)
	new_content = static_part + preamble_lines + binary_section

	# Compare with current content
	if new_content == lines:
	print('LICENSE-binary is already up to date')
	return False

	with open(license_binary_path, 'w') as f:
	f.writelines(new_content)
	print('Updated LICENSE-binary')
	return True


	if __name__ == '__main__':
	with cd(project_root):
	parser = argparse.ArgumentParser(
	description='Update Storm license files (DEPENDENCY-LICENSES and LICENSE-binary)')
	parser.add_argument('--skip-build-storm', action='store_true',
	help='skip building Storm (use if already built)')
	args = parser.parse_args()

	try:
	if not args.skip_build_storm:
	build_storm()
	generate_dependency_licenses()
	generate_storm_dist_license_report()

	shaded_dep_coordinates = get_shaded_dep_coordinates()

	dep_changed = update_dependency_licenses()
	lic_changed = update_license_binary(shaded_dep_coordinates)

	if dep_changed or lic_changed:
	print('\nLicense files were updated. Please review the changes.')
	else:
	print('\nLicense files are already up to date. No changes made.')
	except subprocess.CalledProcessError as e:
	print(f'Command failed: {e}', flush=True)
	exit(1)
	except Exception as e:
	print(f'Error updating license files: {e}', flush=True)
	exit(1)