blob: b46cc248c87b4ef4eec2bf9252ed6c3627159d61 [file]
#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Usage: update-license-files.py [--skip-build-storm]
Regenerates DEPENDENCY-LICENSES and the binary dependencies section of LICENSE-binary.
Run this after changing dependencies to bring license files up to date.
Depends on "requests" is NOT required. Only needs Maven and Python 3.
"""
from contextlib import contextmanager
from pathlib import Path
import os
import subprocess
import shlex
import shutil
import filecmp
import re
import argparse
project_root = Path(__file__).resolve().parent.parent
update_dependency_licenses_cmd = ('mvn license:aggregate-add-third-party@generate-and-check-licenses -Dlicense.skipAggregateAddThirdParty=false -B')
LICENSE_BINARY_SEPARATOR = '----------------------------END OF SOURCE NOTICES -------------------------------------------'
LICENSE_BINARY_PREAMBLE = """
The following dependencies are included in the binary Storm distributions, in addition to the source dependencies listed above.
The license texts of these dependencies can be found in the licenses directory.
"""
@contextmanager
def cd(newdir):
prevdir = Path.cwd()
os.chdir(newdir.expanduser())
try:
yield
finally:
os.chdir(prevdir)
def read_lines(path):
with open(path) as f:
return f.readlines()
def build_storm():
print("Building Storm")
subprocess.check_call(shlex.split(
'mvn clean install -B -DskipTests -Dcheckstyle.skip -Dpmd.skip'
))
print("Done building Storm")
def generate_dependency_licenses():
"""Generates DEPENDENCY-LICENSES in target/. The committed DEPENDENCY-LICENSES is not modified."""
print('Generating DEPENDENCY-LICENSES')
cmd = (update_dependency_licenses_cmd +
' -Dlicense.thirdPartyFilename=DEPENDENCY-LICENSES' +
' -Dlicense.outputDirectory=target')
subprocess.check_call(shlex.split(cmd))
print('Done generating DEPENDENCY-LICENSES')
def generate_storm_dist_license_report():
with cd(project_root / 'storm-dist' / 'binary'):
print('')
print('Generating storm-dist license report')
subprocess.check_call(shlex.split(update_dependency_licenses_cmd))
print('Done generating storm-dist license report')
def extract_license_report_maven_coordinates(lines):
"""Extract Maven coordinates from license report lines.
Lines like: ' * Checker Qual (org.checkerframework:checker-qual:2.5.2 - https://checkerframework.org)'
"""
matches = map(lambda line: re.match(
r'\s+\*.*\((?P<gav>.*) \- .*\).*', line), lines)
return set(map(lambda match: match.group('gav'), filter(lambda match: match is not None, matches)))
def extract_dependency_list_maven_coordinates(lines):
"""Extract Maven coordinates from 'mvn dependency:list' output.
Lines like: ' com.google.code.findbugs:jsr305:jar:3.0.2 -- module jsr305 (auto)'
"""
matches = map(lambda line: re.match(
r'\s+(?P<group>\S*)\:(?P<artifact>\S*)\:(?P<type>\S*)\:(?P<version>\S*)', line), lines)
return set(map(lambda match: match.group('group') + ':' + match.group('artifact') + ':' + match.group('version'),
filter(lambda match: match is not None, matches)))
def get_shaded_dep_coordinates():
"""Gets the set of Maven coordinates for storm-shaded-deps (excluding Storm's own modules)."""
with cd(project_root / 'storm-shaded-deps'):
print("Generating dependency list for storm-shaded-deps")
subprocess.check_call(shlex.split(
'mvn dependency:list -DoutputFile=target/deps-list -Dmdep.outputScope=false -DincludeScope=compile -B'))
print("Done generating dependency list for storm-shaded-deps")
shaded_dep_coordinates = extract_dependency_list_maven_coordinates(
read_lines(project_root / 'storm-shaded-deps' / 'target' / 'deps-list'))
shaded_dep_coordinates = set(filter(lambda coordinate: 'org.apache.storm:' not in coordinate, shaded_dep_coordinates))
print('storm-shaded-deps dependencies: ' + str(shaded_dep_coordinates))
print('')
return shaded_dep_coordinates
def parse_grouped_license_file(lines):
"""Parse a license report file (DEPENDENCY-LICENSES format) into structured groups.
Returns a list of (header_line, [entry_lines]) tuples, preserving blank lines and formatting.
The header_line is the license group name (e.g. ' Apache License, Version 2.0\\n').
entry_lines are the dependency lines under that group (e.g. ' * Foo (g:a:v - url)\\n').
"""
groups = []
current_header = None
current_entries = []
for line in lines:
# Skip the file header (first few lines before any license group)
stripped = line.rstrip('\n')
# License group headers are indented with 4 spaces and have non-whitespace content
if re.match(r' \S', line) and not line.strip().startswith('*'):
if current_header is not None:
groups.append((current_header, current_entries))
current_header = line
current_entries = []
elif current_header is not None:
current_entries.append(line)
if current_header is not None:
groups.append((current_header, current_entries))
return groups
def filter_groups_to_coordinates(groups, target_coordinates):
"""Filter license groups to only include entries whose coordinates are in the target set.
Returns lines for the filtered license report section.
"""
result_lines = []
gav_pattern = re.compile(r'\s+\*.*\((?P<gav>.*) \- .*\).*')
for header, entries in groups:
filtered_entries = []
for entry in entries:
match = gav_pattern.match(entry)
if match:
if match.group('gav') in target_coordinates:
filtered_entries.append(entry)
# Keep blank lines between entries only if we have entries
if filtered_entries:
result_lines.append('\n')
result_lines.append(header)
result_lines.append('\n')
for entry in filtered_entries:
result_lines.append(entry)
return result_lines
def update_dependency_licenses():
"""Copy target/DEPENDENCY-LICENSES to root DEPENDENCY-LICENSES.
Returns True if the file changed."""
src = project_root / 'target' / 'DEPENDENCY-LICENSES'
dst = project_root / 'DEPENDENCY-LICENSES'
if dst.exists() and filecmp.cmp(src, dst, shallow=False):
print('DEPENDENCY-LICENSES is already up to date')
return False
shutil.copy2(src, dst)
print('Updated DEPENDENCY-LICENSES')
return True
def merge_groups(base_groups, extra_groups, extra_coordinates):
"""Merge extra_groups entries into base_groups for coordinates in extra_coordinates
that are not already present in base_groups.
Returns a new list of (header, [entry_lines]) tuples.
"""
# Collect all coordinates already in base_groups
gav_pattern = re.compile(r'\s+\*.*\((?P<gav>.*) \- .*\).*')
base_coords = set()
for _, entries in base_groups:
for entry in entries:
match = gav_pattern.match(entry)
if match:
base_coords.add(match.group('gav'))
# Find extra entries to add (in extra_coordinates but not in base)
missing_coords = extra_coordinates - base_coords
if not missing_coords:
return base_groups
print(f'Adding {len(missing_coords)} shaded-deps entries from DEPENDENCY-LICENSES to LICENSE-binary')
# Build a map: header_text -> list of entry lines to add
extra_by_header = {}
for header, entries in extra_groups:
header_key = header.strip()
for entry in entries:
match = gav_pattern.match(entry)
if match and match.group('gav') in missing_coords:
extra_by_header.setdefault(header_key, []).append(entry)
# Merge into base_groups
result = []
seen_headers = set()
for header, entries in base_groups:
header_key = header.strip()
seen_headers.add(header_key)
merged_entries = list(entries)
if header_key in extra_by_header:
merged_entries.extend(extra_by_header[header_key])
result.append((header, merged_entries))
# Add any license groups that only exist in extra
for header_key, extra_entries in extra_by_header.items():
if header_key not in seen_headers:
result.append((' ' + header_key + '\n', extra_entries))
return result
def groups_to_lines(groups):
"""Convert parsed groups back to lines for writing."""
result_lines = []
for header, entries in sorted(groups, key=lambda g: g[0].strip().lower()):
result_lines.append('\n')
result_lines.append(header)
result_lines.append('\n')
for entry in entries:
# Skip blank lines that were part of the original inter-group spacing
if entry.strip():
result_lines.append(entry)
return result_lines
def update_license_binary(shaded_dep_coordinates):
"""Replace the binary dependencies section of LICENSE-binary.
Uses storm-dist/binary THIRD-PARTY.txt as the base (already has all binary deps grouped),
then merges in any storm-shaded-deps entries from DEPENDENCY-LICENSES that aren't already
covered by the binary THIRD-PARTY.txt.
Returns True if the file changed."""
license_binary_path = project_root / 'LICENSE-binary'
lines = read_lines(license_binary_path)
# Find the separator line
separator_idx = None
for i, line in enumerate(lines):
if LICENSE_BINARY_SEPARATOR in line:
separator_idx = i
break
if separator_idx is None:
print(f'ERROR: Could not find separator line in LICENSE-binary: "{LICENSE_BINARY_SEPARATOR}"')
return False
# Keep the static header (up to and including the separator)
static_part = lines[:separator_idx + 1]
# Use the binary THIRD-PARTY.txt as the base — it already contains all
# storm-dist/binary dependencies in the correct grouped format
binary_third_party_path = (project_root / 'storm-dist' / 'binary' / 'target' /
'generated-sources' / 'license' / 'THIRD-PARTY.txt')
binary_groups = parse_grouped_license_file(read_lines(binary_third_party_path))
# Merge in storm-shaded-deps entries from DEPENDENCY-LICENSES
dep_licenses_lines = read_lines(project_root / 'target' / 'DEPENDENCY-LICENSES')
dep_licenses_groups = parse_grouped_license_file(dep_licenses_lines)
merged_groups = merge_groups(binary_groups, dep_licenses_groups, shaded_dep_coordinates)
binary_section = groups_to_lines(merged_groups)
# Compose the new file
preamble_lines = LICENSE_BINARY_PREAMBLE.splitlines(keepends=True)
new_content = static_part + preamble_lines + binary_section
# Compare with current content
if new_content == lines:
print('LICENSE-binary is already up to date')
return False
with open(license_binary_path, 'w') as f:
f.writelines(new_content)
print('Updated LICENSE-binary')
return True
if __name__ == '__main__':
with cd(project_root):
parser = argparse.ArgumentParser(
description='Update Storm license files (DEPENDENCY-LICENSES and LICENSE-binary)')
parser.add_argument('--skip-build-storm', action='store_true',
help='skip building Storm (use if already built)')
args = parser.parse_args()
try:
if not args.skip_build_storm:
build_storm()
generate_dependency_licenses()
generate_storm_dist_license_report()
shaded_dep_coordinates = get_shaded_dep_coordinates()
dep_changed = update_dependency_licenses()
lic_changed = update_license_binary(shaded_dep_coordinates)
if dep_changed or lic_changed:
print('\nLicense files were updated. Please review the changes.')
else:
print('\nLicense files are already up to date. No changes made.')
except subprocess.CalledProcessError as e:
print(f'Command failed: {e}', flush=True)
exit(1)
except Exception as e:
print(f'Error updating license files: {e}', flush=True)
exit(1)