blob: 341f7cf344b0f1d884ca837348dc689d6c50840f [file]
#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
"""
Automates updating LICENSE-binary and NOTICE-binary for Apache Pinot releases.
Instead of the manual wiki process (temporarily hacking pinot-distribution/pom.xml,
generating HTML reports, copy-pasting from a browser, hacking POMs for shade plugin
NOTICE aggregation, etc.), this script:
LICENSE-binary:
1. Parses pinot-assembly.xml to find ALL modules shipped in the binary
2. Runs `mvn dependency:list` on each to get the real transitive dependency set
3. Parses the current LICENSE-binary
4. Diffs old vs new dependencies
5. Auto-detects licenses for new deps from Maven POM metadata
6. Generates an updated LICENSE-binary and a human-readable report
NOTICE-binary:
7. Opens each dependency JAR in ~/.m2/repository
8. Extracts META-INF/NOTICE files (replacing the shade plugin POM hack)
9. Deduplicates and merges into a new NOTICE-binary
Prerequisites:
- Build the full project first:
mvn clean install -DskipTests -T1C
- Python 3.7+
Usage:
cd /path/to/pinot
python3 scripts/update-release-binaries.py # both files (default)
python3 scripts/update-release-binaries.py --license-only # LICENSE-binary only
python3 scripts/update-release-binaries.py --notice-only # NOTICE-binary only
python3 scripts/update-release-binaries.py --report-only # just show diff
"""
import argparse
import os
import re
import subprocess
import sys
import xml.etree.ElementTree as ET
import zipfile
from collections import OrderedDict
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
def _version_key(version: str):
"""
Convert a version string into a tuple suitable for comparison,
so that "10.0" > "9.0" works correctly (unlike lexicographic comparison).
Non-numeric segments are compared as strings.
"""
parts = re.split(r"[.\-]", version)
key = []
for p in parts:
if p.isdigit():
key.append((0, int(p)))
else:
key.append((1, p))
return tuple(key)
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
class Dep:
    """A Maven dependency identified by its group:artifact:version coordinates.

    Equality and hashing are both derived from the "group:artifact:version"
    string form, so instances can be stored in sets and used as dict keys.
    """

    def __init__(self, group_id: str, artifact_id: str, version: str):
        self.group_id = group_id
        self.artifact_id = artifact_id
        self.version = version

    def key(self) -> Tuple[str, str]:
        """Return the version-independent identity ``(group_id, artifact_id)``."""
        return self.group_id, self.artifact_id

    def __str__(self):
        return "{}:{}:{}".format(self.group_id, self.artifact_id, self.version)

    def __eq__(self, other):
        if not isinstance(other, Dep):
            return False
        return str(other) == str(self)

    def __hash__(self):
        return hash(str(self))
class LicenseSection:
    """One license section of LICENSE-binary: a license name plus the deps listed under it."""

    def __init__(self, name: str, preamble_lines: List[str], deps: List[str]):
        # Section title, e.g. "Apache License Version 2.0", "MIT License", "BSD 3-Clause".
        self.name = name
        # Header/decoration lines of the section that are not dependency lines.
        self.preamble_lines = preamble_lines
        # Dependency entries as "group:artifact:version" strings.
        self.deps = deps

    def __repr__(self):
        return "LicenseSection({!r}, {} deps)".format(self.name, len(self.deps))
# ---------------------------------------------------------------------------
# Known license classification
# ---------------------------------------------------------------------------
# Maps normalized license names/URLs from Maven POMs to LICENSE-binary section names.
# Order matters for matching: first match wins.
LICENSE_CLASSIFIERS = [
    # Each entry is (regex, LICENSE-binary section name). classify_license()
    # lowercases the POM name/URL before searching, so every pattern here must
    # be written in lowercase or it can never match.
    # Apache 2.0
    (r"apache.*2", "Apache License Version 2.0"),
    (r"asf\s*2", "Apache License Version 2.0"),
    (r"apache\.org/licenses/license-2\.0", "Apache License Version 2.0"),
    # MIT-0 (must come before MIT: "mit 0 license" / "mit no attribution"
    # would otherwise be claimed by the plain \bmit\b pattern)
    (r"mit-0|mit.0.license|mit\s+no\s+attribution", "MIT-0 License"),
    # MIT
    (r"\bmit\b(?!-0)", "MIT License"),
    (r"opensource\.org/licenses/mit", "MIT License"),
    # BSD 2-Clause (must come before generic BSD)
    (r"bsd[\s._-]*2|2[\s._-]*clause.*bsd|simplified.*bsd|freebsd", "BSD 2-Clause"),
    # BSD 3-Clause (must come before generic BSD)
    (r"bsd[\s._-]*3|3[\s._-]*clause.*bsd|new.*bsd|revised.*bsd|modified.*bsd", "BSD 3-Clause"),
    # BSD (plain/unspecified) — fallback after specific variants
    (r"\bbsd\b", "BSD"),
    # CDDL 1.0
    (r"cddl.*1\.0|common.*development.*distribution.*1\.0", "Common Development and Distribution License (CDDL) 1.0"),
    # CDDL 1.1
    (r"cddl.*1\.1|common.*development.*distribution.*1\.1", "Common Development and Distribution License (CDDL) 1.1"),
    # EPL 1.0
    (r"eclipse.*public.*1\.0|epl.*1\.0", "Eclipse Public License (EPL) 1.0"),
    # EPL 2.0
    (r"eclipse.*public.*2\.0|epl.*2\.0", "Eclipse Public License (EPL) 2.0"),
    # EDL 1.0
    (r"eclipse.*distribution|edl", "Eclipse Distribution License (EDL) 1.0"),
    # LGPL
    (r"lgpl|lesser.*general.*public", "LGPL"),
    # Bouncy Castle
    (r"bouncy\s*castle", "Bouncy Castle License"),
    # ISC
    (r"\bisc\b", "ISC License"),
    # Go License
    (r"\bgo\b.*license|golang", "The Go License"),
    # WTFPL
    (r"wtfpl|do what the fuck", "WTFPL License"),
    # Public Domain
    (r"public\s*domain", "Public Domain"),
]
def classify_license(license_name: str, license_url: str = "") -> Optional[str]:
    """
    Map a POM license name/URL to a LICENSE-binary section name.

    The name and URL are joined with a space and lowercased, then searched
    against LICENSE_CLASSIFIERS in order; the section name of the first
    matching pattern is returned, or None when nothing matches.
    """
    haystack = "{} {}".format(license_name, license_url).lower()
    candidates = (
        section_name
        for pattern, section_name in LICENSE_CLASSIFIERS
        if re.search(pattern, haystack)
    )
    return next(candidates, None)
# ---------------------------------------------------------------------------
# Step 1: Parse assembly descriptor for shipped modules
# ---------------------------------------------------------------------------
def parse_assembly_modules(pinot_root: Path) -> List[Path]:
    """
    Collect the module directories shipped in the binary distribution.

    Reads pinot-distribution/pinot-assembly.xml and, for every <source>
    element whose text contains ``${pinot.root}/<module>/target/``, keeps
    ``<module>`` when that directory exists and contains a pom.xml. The
    pinot-distribution directory itself is always included when present.

    Prints the discovered modules and returns them sorted. Exits the
    process with status 1 if the assembly descriptor is missing.
    """
    distribution_dir = pinot_root / "pinot-distribution"
    descriptor = distribution_dir / "pinot-assembly.xml"
    if not descriptor.exists():
        print(f"ERROR: Assembly file not found: {descriptor}", file=sys.stderr)
        sys.exit(1)

    source_path_re = re.compile(r"\$\{pinot\.root\}/(.+?)/target/")
    shipped = set()
    for node in ET.parse(descriptor).getroot().iter():
        # Compare on the local tag name so a namespaced descriptor also works.
        if node.tag.rpartition("}")[2] != "source" or not node.text:
            continue
        found = source_path_re.search(node.text.strip())
        if found is None:
            continue
        candidate = pinot_root / found.group(1)
        if candidate.exists() and (candidate / "pom.xml").exists():
            shipped.add(candidate)

    # pinot-distribution is always part of the set (its deps: pinot-tools, pinot-jdbc-client).
    if distribution_dir.exists():
        shipped.add(distribution_dir)

    ordered = sorted(shipped)
    print(f"Found {len(ordered)} modules in assembly descriptor:")
    for module_dir in ordered:
        print(f" {module_dir.relative_to(pinot_root)}")
    print()
    return ordered
# ---------------------------------------------------------------------------
# Step 2: Get Maven artifact IDs for modules
# ---------------------------------------------------------------------------
def get_maven_artifact_id(module_dir: Path) -> Optional[str]:
    """
    Return the project's own artifactId from ``module_dir``/pom.xml.

    Only a direct child <artifactId> of the root element is considered,
    first in the Maven POM 4.0.0 namespace and then without a namespace.
    Returns None when the POM is missing or has no such element/text.
    """
    pom = module_dir / "pom.xml"
    if not pom.exists():
        return None

    project = ET.parse(pom).getroot()
    node = project.find("{http://maven.apache.org/POM/4.0.0}artifactId")
    if node is None:
        # POM written without the Maven namespace.
        node = project.find("artifactId")

    if node is None or not node.text:
        return None
    return node.text.strip()
# ---------------------------------------------------------------------------
# Step 3: Run mvn dependency:list
# ---------------------------------------------------------------------------
def _parse_dependency_list_line(line: str) -> Optional[Tuple[str, str, str]]:
    """Parse one `mvn dependency:list` output line into (group, artifact, version).

    Returns None for blank lines, "#" comments, lines that do not have 5 or 6
    colon-separated fields, and dependencies in test/provided/system scope.
    """
    line = line.strip()
    if not line or line.startswith("#"):
        return None
    parts = line.split(":")
    if len(parts) == 5:
        group_id, artifact_id, _pkg_type, version, scope = parts
    elif len(parts) == 6:
        group_id, artifact_id, _pkg_type, _classifier, version, scope = parts
    else:
        return None
    # The last field may carry extra text after the scope (the plugin can
    # append e.g. " -- module <name>"), so only its first token is compared.
    scope_tokens = scope.split()
    scope_name = scope_tokens[0].lower() if scope_tokens else ""
    if scope_name in ("test", "provided", "system"):
        return None
    return group_id.strip(), artifact_id.strip(), version.strip()


def run_dependency_list(pinot_root: Path, module_dirs: List[Path]) -> Set[Dep]:
    """
    Run `mvn dependency:list` on all shipped modules and return the merged dep set.

    Args:
        pinot_root: Repository root; Maven is executed with this as its cwd and
            writes its output to ``<pinot_root>/target/license-check-deps.txt``.
        module_dirs: Module directories whose artifactIds are passed to ``-pl``.

    Returns:
        The set of unique non-Pinot runtime dependencies reported by Maven.

    Exits the process with status 1 when no artifactId can be read, when Maven
    cannot be launched, times out or fails, or when no output file is produced.
    """
    artifact_ids = []
    for mod_dir in module_dirs:
        aid = get_maven_artifact_id(mod_dir)
        if aid:
            artifact_ids.append(aid)
        else:
            print(f" WARNING: Could not read artifactId from {mod_dir}/pom.xml",
                  file=sys.stderr)
    if not artifact_ids:
        print("ERROR: No module artifact IDs found", file=sys.stderr)
        sys.exit(1)

    pl_arg = ",".join(f":{aid}" for aid in artifact_ids)
    output_file = pinot_root / "target" / "license-check-deps.txt"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # -DappendOutput=true accumulates across modules, so start from a clean file.
    if output_file.exists():
        output_file.unlink()

    timeout_seconds = 600
    cmd = [
        "mvn", "dependency:list",
        "-pl", pl_arg,
        "-DincludeScope=runtime",
        "-DexcludeGroupIds=org.apache.pinot",
        f"-DoutputFile={output_file}",
        "-DappendOutput=true",
        "-DoutputAbsoluteArtifactFilename=false",
    ]
    print(f"Running mvn dependency:list for {len(artifact_ids)} modules...")
    print(" This may take a few minutes...\n")
    try:
        result = subprocess.run(
            cmd, cwd=pinot_root, capture_output=True, text=True, timeout=timeout_seconds
        )
    except FileNotFoundError as exc:
        # Report a launch failure like every other error path instead of a traceback.
        print(f"ERROR: Could not launch 'mvn' (is Maven installed and on PATH?): {exc}",
              file=sys.stderr)
        sys.exit(1)
    except subprocess.TimeoutExpired:
        print(f"ERROR: Maven command timed out after {timeout_seconds} seconds.",
              file=sys.stderr)
        sys.exit(1)

    if result.returncode != 0:
        print("ERROR: Maven command failed.", file=sys.stderr)
        # Show only the last part of the output for debugging.
        stderr_tail = result.stderr[-3000:]
        stdout_tail = result.stdout[-3000:]
        if stderr_tail.strip():
            print(f"STDERR:\n{stderr_tail}", file=sys.stderr)
        if stdout_tail.strip():
            print(f"STDOUT:\n{stdout_tail}", file=sys.stderr)
        sys.exit(1)

    if not output_file.exists():
        print("ERROR: Maven did not produce output file", file=sys.stderr)
        sys.exit(1)

    deps = set()
    with open(output_file) as f:
        for line in f:
            coords = _parse_dependency_list_line(line)
            if coords is not None:
                deps.add(Dep(*coords))
    print(f"Found {len(deps)} unique non-Pinot dependencies from Maven\n")
    return deps
# ---------------------------------------------------------------------------
# Step 4: Parse current LICENSE-binary
# ---------------------------------------------------------------------------
# Regex matching a complete dependency line: group.id:artifact-id:version
# Group and artifact must start with an ASCII letter; the version may also
# contain "+". The pattern is anchored at both ends, and the code in this file
# matches it against whitespace-stripped lines.
DEP_LINE_RE = re.compile(
    r"^[a-zA-Z][a-zA-Z0-9._-]*:[a-zA-Z][a-zA-Z0-9._-]*:[a-zA-Z0-9._+-]+$"
)
def parse_license_binary(license_file: Path) -> Tuple[List[str], List[LicenseSection], Dict[str, str]]:
    """
    Parse LICENSE-binary into its lines, its license sections and a dep lookup.

    Layout expected by the parser:
      * the Apache license text, ending with "END OF TERMS AND CONDITIONS";
      * a separator line of 20+ dashes, then the Apache 2.0 dependency list;
      * a second separator, after which each section is a header line
        underlined by a line of 3+ dashes, followed by dep lines and notes.

    Returns (file_lines, sections, dep_to_section):
      - file_lines: all original lines (for reconstruction)
      - sections: list of LicenseSection objects (deps sorted)
      - dep_to_section: maps "group:artifact:version" -> section name

    Exits the process with status 1 when the end of the license text or the
    first separator cannot be found.
    """
    with open(license_file) as f:
        lines = f.read().split("\n")

    sections: List[LicenseSection] = []
    dep_to_section: Dict[str, str] = {}

    def record_section(name: str, preamble: List[str], deps: List[str]) -> None:
        """Store one finished section and index its deps by section name."""
        sections.append(LicenseSection(
            name=name,
            preamble_lines=preamble,
            deps=sorted(deps),
        ))
        for dep in deps:
            dep_to_section[dep] = name

    def find_separator(start: int) -> Optional[int]:
        """Return the index of the first 20+ dash line at or after ``start``."""
        for idx in range(start, len(lines)):
            if re.match(r"^-{20,}$", lines[idx]):
                return idx
        return None

    # --- Identify the Apache 2.0 section ---
    license_text_end = None
    for i, line in enumerate(lines):
        if "END OF TERMS AND CONDITIONS" in line:
            license_text_end = i
            break
    if license_text_end is None:
        print("ERROR: Cannot find 'END OF TERMS AND CONDITIONS' in LICENSE-binary",
              file=sys.stderr)
        sys.exit(1)

    first_sep = find_separator(license_text_end)
    if first_sep is None:
        print("ERROR: Cannot find first separator in LICENSE-binary", file=sys.stderr)
        sys.exit(1)
    # The second separator ends the Apache 2.0 section (it may be absent).
    second_sep = find_separator(first_sep + 1)

    apache_preamble: List[str] = []
    apache_deps: List[str] = []
    for i in range(first_sep + 1, second_sep if second_sep else len(lines)):
        stripped = lines[i].strip()
        if DEP_LINE_RE.match(stripped):
            apache_deps.append(stripped)
        elif stripped:
            apache_preamble.append(lines[i])
    record_section("Apache License Version 2.0", apache_preamble, apache_deps)

    # --- Parse remaining sections ---
    if second_sep is None:
        return lines, sections, dep_to_section

    # A section starts at a dash-underline; its name is the closest non-blank
    # line above it. Text before the first underline (the intro paragraph)
    # belongs to no section and is ignored.
    current_section_name: Optional[str] = None
    current_preamble: List[str] = []
    current_deps: List[str] = []
    last_preamble_index: Optional[int] = None  # line index of current_preamble[-1]

    for i in range(second_sep + 1, len(lines)):
        line = lines[i]
        stripped = line.strip()

        if re.match(r"^-{3,}$", stripped):
            # Walk backward from the dash line to find the header text.
            header_index = None
            for j in range(i - 1, second_sep, -1):
                if lines[j].strip():
                    header_index = j
                    break
            header_line = lines[header_index].strip() if header_index is not None else ""

            if current_section_name is not None:
                # The header of the NEW section was read as an ordinary line of
                # the section being closed; drop it so it does not show up as
                # that section's preamble.
                if current_preamble and last_preamble_index == header_index:
                    current_preamble.pop()
                record_section(current_section_name, current_preamble, current_deps)

            current_section_name = header_line
            current_preamble = []
            current_deps = []
            last_preamble_index = None
            continue

        # Inside a section: dep lines go to deps, any other text is preamble.
        if current_section_name is not None:
            if DEP_LINE_RE.match(stripped):
                current_deps.append(stripped)
            elif stripped:
                current_preamble.append(line)
                last_preamble_index = i

    # Don't forget the last section
    if current_section_name is not None and (current_deps or current_preamble):
        record_section(current_section_name, current_preamble, current_deps)

    return lines, sections, dep_to_section
# ---------------------------------------------------------------------------
# Step 5: Compute diff
# ---------------------------------------------------------------------------
def compute_diff(
    old_sections: List[LicenseSection],
    new_deps: Set[Dep],
) -> Tuple[List[Dep], List[str], List[Tuple[str, str, str]]]:
    """
    Compare the deps listed in LICENSE-binary with the deps reported by Maven.

    Dependencies are matched on (group, artifact). Old entries that are not
    exactly "group:artifact:version" are ignored. If Maven reports several
    versions of one artifact, the highest (per _version_key) is used.

    Returns:
      - added: Dep objects whose (group, artifact) is not in the old file
      - removed: "group:artifact:version" strings no longer reported by Maven
      - changed: (group:artifact, old_version, new_version) tuples
    All three lists are sorted by (group, artifact).
    """
    # (group, artifact) -> version as currently listed in LICENSE-binary.
    old_versions: Dict[Tuple[str, str], str] = {}
    for section in old_sections:
        for entry in section.deps:
            fields = entry.split(":")
            if len(fields) != 3:
                continue
            group, artifact, version = fields
            old_versions[(group, artifact)] = version

    # (group, artifact) -> version as resolved by Maven.
    new_versions: Dict[Tuple[str, str], str] = {}
    for dep in new_deps:
        coords = dep.key()
        if (coords not in new_versions
                or _version_key(dep.version) > _version_key(new_versions[coords])):
            new_versions[coords] = dep.version

    only_new = sorted(new_versions.keys() - old_versions.keys())
    only_old = sorted(old_versions.keys() - new_versions.keys())
    in_both = sorted(old_versions.keys() & new_versions.keys())

    added = [Dep(g, a, new_versions[(g, a)]) for g, a in only_new]
    removed = [f"{g}:{a}:{old_versions[(g, a)]}" for g, a in only_old]
    changed = [
        (f"{g}:{a}", old_versions[(g, a)], new_versions[(g, a)])
        for g, a in in_both
        if old_versions[(g, a)] != new_versions[(g, a)]
    ]
    return added, removed, changed
# ---------------------------------------------------------------------------
# Step 6: Detect licenses for deps (new and version-bumped)
# ---------------------------------------------------------------------------
def detect_license_from_pom(dep: Dep, m2_repo: Path) -> Optional[str]:
    """
    Look up the license of ``dep`` from its POM in the local Maven repository.

    Only the <licenses> element that is a direct child of the POM root is read.

    Returns:
      - a LICENSE-binary section name when a license name or URL is classified;
      - "UNKNOWN (<first license name>)" when names exist but none classifies;
      - None when the POM is missing, unparsable, or declares no usable license.
    """
    pom_ns = "{http://maven.apache.org/POM/4.0.0}"
    pom_path = m2_repo.joinpath(
        dep.group_id.replace(".", "/"),
        dep.artifact_id,
        dep.version,
        f"{dep.artifact_id}-{dep.version}.pom",
    )
    if not pom_path.exists():
        return None

    def child(parent, tag):
        """Find a direct child by tag, namespaced first, then un-namespaced."""
        node = parent.find(pom_ns + tag)
        return node if node is not None else parent.find(tag)

    try:
        root = ET.parse(pom_path).getroot()
    except ET.ParseError:
        # Best effort: an unreadable POM is treated like a POM without licenses.
        return None

    licenses_node = child(root, "licenses")
    if licenses_node is None:
        return None

    # Collect all license names and URLs.
    names: List[str] = []
    urls: List[str] = []
    for entry in licenses_node:
        if entry.tag.rpartition("}")[2] != "license":
            continue
        name_node = child(entry, "name")
        url_node = child(entry, "url")
        if name_node is not None and name_node.text:
            names.append(name_node.text.strip())
        if url_node is not None and url_node.text:
            urls.append(url_node.text.strip())

    # Each name is classified together with every collected URL; only if no
    # name yields a section are the URLs tried on their own.
    all_urls = " ".join(urls)
    for name in names:
        section = classify_license(name, all_urls)
        if section:
            return section
    for url in urls:
        section = classify_license("", url)
        if section:
            return section

    # Return the raw name if we couldn't classify it.
    if names:
        return f"UNKNOWN ({names[0]})"
    return None
def check_version_bump_license_changes(
    changed: List[Tuple[str, str, str]],
    dep_to_section: Dict[str, str],
    m2_repo: Path,
) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str, str, str, str]]]:
    """
    Split version bumps into those that keep their license and those that don't.

    For every (group:artifact, old_ver, new_ver) the license of the NEW version
    is detected from its POM and compared with the section the OLD version is
    listed under. A bump counts as a license change only when both are known,
    the detected license is not "UNKNOWN ...", and the two differ.

    Returns:
      - safe_changed: version bumps where no license change was detected
      - license_changed: (group:artifact, old_ver, new_ver, old_section, new_license)
    """
    safe_changed = []
    license_changed = []
    for ga, old_ver, new_ver in changed:
        bump = (ga, old_ver, new_ver)
        pieces = ga.split(":")
        if len(pieces) != 2:
            # Not a plain group:artifact key; nothing to look up.
            safe_changed.append(bump)
            continue

        group, artifact = pieces
        previous_section = dep_to_section.get(f"{ga}:{old_ver}")
        detected = detect_license_from_pom(Dep(group, artifact, new_ver), m2_repo)

        comparable = bool(detected) and bool(previous_section)
        if comparable and not detected.startswith("UNKNOWN") and detected != previous_section:
            license_changed.append(bump + (previous_section, detected))
        else:
            safe_changed.append(bump)
    return safe_changed, license_changed
# ---------------------------------------------------------------------------
# Step 6b: Detect orphaned license files in licenses-binary/
# ---------------------------------------------------------------------------
def find_orphaned_license_files(
    pinot_root: Path,
    sections: List[LicenseSection],
    removed: List[str],
    added: List[Dep],
    changed: List[Tuple[str, str, str]],
    license_changed: List[Tuple[str, str, str, str, str]],
    license_map: Dict[str, str],
) -> Tuple[List[str], List[str]]:
    """
    Report license sections that end up empty and license files nobody references.

    Args:
        pinot_root: Repository root containing LICENSE-binary and licenses-binary/.
        sections: Sections parsed from the current LICENSE-binary.
        removed: "group:artifact:version" strings that will be deleted.
        added: New dependencies; counted under ``license_map[str(dep)]`` unless
            that license is missing or starts with "UNKNOWN".
        changed: Plain version bumps. Accepted for interface compatibility but
            not used: a bump does not change how many deps a section holds.
        license_changed: Bumps that move a dep from its old section to
            ``new_license``.
        license_map: Maps "group:artifact:version" -> detected license name.

    Returns:
      - empty_sections: section names that will have 0 deps after the updates
        (never the Apache 2.0 section, never a section with preamble lines)
      - orphaned_files: filenames in licenses-binary/ not referenced by
        LICENSE-binary, sorted by path
    """
    licenses_dir = pinot_root / "licenses-binary"

    # Dep strings that leave their current section: removed deps plus deps
    # whose license changed (they move to a different section).
    departing = set(removed)
    departing.update(
        f"{ga}:{old_ver}" for ga, old_ver, _new_ver, _old_section, _new_license in license_changed
    )

    # Final dep count per section after applying all changes.
    section_dep_counts: Dict[str, int] = {
        section.name: sum(1 for d in section.deps if d not in departing)
        for section in sections
    }
    for dep in added:
        lic = license_map.get(str(dep))
        if lic and not lic.startswith("UNKNOWN"):
            section_dep_counts[lic] = section_dep_counts.get(lic, 0) + 1
    for _ga, _old_ver, _new_ver, _old_section, new_license in license_changed:
        section_dep_counts[new_license] = section_dep_counts.get(new_license, 0) + 1

    # Sections that have preamble content (e.g., "(see licenses/...)") are kept even
    # with zero deps — they serve as license-text placeholders for their dep category.
    sections_with_preamble = {s.name for s in sections if s.preamble_lines}
    empty_sections = [
        name for name, count in section_dep_counts.items()
        if count == 0
        and name != "Apache License Version 2.0"  # never remove the main license
        and name not in sections_with_preamble  # keep sections that reference license files
    ]

    # License files explicitly referenced in LICENSE-binary. Both spellings of
    # the directory are accepted ("licenses/X" and "licenses-binary/X"), and a
    # sentence-ending "." after the file name is not treated as part of it.
    referenced_files: Set[str] = set()
    license_file_path = pinot_root / "LICENSE-binary"
    if license_file_path.exists():
        with open(license_file_path) as f:
            content = f.read()
        for match in re.finditer(r"licenses(?:-binary)?/([A-Za-z0-9._-]+)", content):
            referenced_name = match.group(1).rstrip(".")
            if referenced_name:
                referenced_files.add(referenced_name)

    # Files in licenses-binary/ that nothing refers to.
    orphaned_files: List[str] = []
    if licenses_dir.exists():
        orphaned_files = [
            entry.name
            for entry in sorted(licenses_dir.iterdir())
            if entry.is_file() and entry.name not in referenced_files
        ]
    return empty_sections, orphaned_files
# ---------------------------------------------------------------------------
# Step 7: Generate report
# ---------------------------------------------------------------------------
def print_report(
    added: List[Dep],
    removed: List[str],
    changed: List[Tuple[str, str, str]],
    license_changed: List[Tuple[str, str, str, str, str]],
    license_map: Dict[str, str],
    empty_sections: List[str],
    orphaned_files: List[str],
):
    """
    Print a human-readable LICENSE-binary diff report to stdout.

    Prints a summary of counts followed by one block per non-empty category:
    removed deps, plain version bumps, bumps that change license, added deps
    (grouped by detected license, with unclassified ones listed separately),
    empty sections and orphaned license files. When every input list is empty
    only an "up to date" message is printed.
    """
    rule = "-" * 78

    def heading(title: str) -> None:
        """Print a category title framed by two dash rules."""
        print(rule)
        print(title)
        print(rule)

    print("=" * 78)
    print("LICENSE-BINARY UPDATE REPORT")
    print("=" * 78)

    if not any((added, removed, changed, license_changed, empty_sections, orphaned_files)):
        print("\nNo changes detected. LICENSE-binary is up to date.")
        return

    # Summary
    print(f"\n Added: {len(added)}")
    print(f" Removed: {len(removed)}")
    print(f" Version changed: {len(changed)}")
    print(f" License changed: {len(license_changed)}")
    print(f" Empty sections: {len(empty_sections)}")
    print(f" Orphaned files: {len(orphaned_files)}")
    print()

    # Removed
    if removed:
        heading("REMOVED (delete these lines from LICENSE-binary):")
        for dep_str in removed:
            print(f" - {dep_str}")
        print()

    # Version changes (license unchanged — safe to update in place)
    if changed:
        heading("VERSION CHANGED (update version numbers — license unchanged):")
        for ga, old_v, new_v in changed:
            print(f" {ga}")
            print(f" {old_v} --> {new_v}")
        print()

    # License changed on version bump — needs manual attention
    if license_changed:
        heading("LICENSE CHANGED ON VERSION BUMP (must move to different section):")
        for ga, old_v, new_v, old_section, new_license in license_changed:
            print(f" {ga}")
            print(f" {old_v} --> {new_v}")
            print(f" MOVE: [{old_section}] --> [{new_license}]")
        print()

    # Added, grouped by detected license
    if added:
        heading("ADDED (new deps to add to LICENSE-binary):")
        by_license: Dict[str, List[Dep]] = OrderedDict()
        unknown = []
        for dep in added:
            lic = license_map.get(str(dep))
            if lic and not lic.startswith("UNKNOWN"):
                by_license.setdefault(lic, []).append(dep)
            else:
                unknown.append((dep, lic))

        # NOTE(review): every lic_name below was itself read from license_map,
        # so it is always contained in this set and the "NEW SECTION NEEDED"
        # marker can never be printed — confirm the intended source of the
        # existing section names.
        existing_sections = {
            s for s in license_map.values() if s and not s.startswith("UNKNOWN")
        }
        for lic_name in sorted(by_license):
            if lic_name in existing_sections:
                new_marker = ""
            else:
                new_marker = " ** NEW SECTION NEEDED **"
            print(f"\n [{lic_name}]{new_marker}")
            for dep in sorted(by_license[lic_name], key=str):
                print(f" + {dep}")

        if unknown:
            print("\n [REQUIRES MANUAL LICENSE LOOKUP]")
            print(" (Visit the project's website to determine the license,")
            print(" then add to the appropriate section. If it's a new license")
            print(" type, create a new file in licenses-binary/ and a new section.)")
            for dep, raw_lic in sorted(unknown, key=lambda pair: str(pair[0])):
                hint = f" (POM says: {raw_lic})" if raw_lic else ""
                print(f" ? {dep}{hint}")
        print()

    # Empty sections — may want to remove from LICENSE-binary
    if empty_sections:
        heading("EMPTY SECTIONS (consider removing from LICENSE-binary):")
        for section_name in sorted(empty_sections):
            print(f" - {section_name}")
        print()

    # Orphaned license files
    if orphaned_files:
        heading("ORPHANED LICENSE FILES (not referenced in LICENSE-binary — consider deleting):")
        for filename in orphaned_files:
            print(f" - licenses-binary/{filename}")
        print()
# ---------------------------------------------------------------------------
# Step 8: Generate updated LICENSE-binary
# ---------------------------------------------------------------------------
def generate_updated_license_binary(
    license_file: Path,
    sections: List[LicenseSection],
    added: List[Dep],
    removed: List[str],
    changed: List[Tuple[str, str, str]],
    license_changed: List[Tuple[str, str, str, str, str]],
    license_map: Dict[str, str],
    output_file: Path,
):
    """
    Generate an updated LICENSE-binary file.

    The original file is streamed line by line. Dependency lines inside a
    known section are buffered, with removed deps dropped and bumped versions
    rewritten, and are written back sorted together with the new deps for that
    section when the section ends. Everything else is passed through.

    Args:
        license_file: Existing LICENSE-binary to read.
        sections: Parsed sections; only their names are used here, to recognise
            section headers in the file.
        added: New deps; placed by ``license_map[str(dep)]`` unless that license
            is missing or starts with "UNKNOWN".
        removed: "group:artifact:version" lines to drop.
        changed: (group:artifact, old_version, new_version) bumps applied in place.
        license_changed: Bumps that move a dep: the old line is dropped and the
            new version is added under ``new_license``.
        license_map: Maps "group:artifact:version" -> detected license name.
        output_file: Destination path for the generated content.

    Deps that cannot be placed (unknown license, or a section not present in
    the file) are appended at the end under "TODO:" markers.
    """
    # Read original file
    with open(license_file) as f:
        original = f.read()
    # Build removed set and change map for quick lookup
    removed_set = set(removed)
    change_map = {}  # "group:artifact" -> new_version
    for ga, _old_v, new_v in changed:
        change_map[ga] = new_v
    # License-changed deps: remove old version from old section, add new version to new section
    for ga, old_ver, new_ver, _old_section, new_license in license_changed:
        removed_set.add(f"{ga}:{old_ver}")
    # Build a map from section name -> deps to add
    added_by_section: Dict[str, List[str]] = {}
    unclassified: List[Dep] = []
    for dep in added:
        dep_str = str(dep)
        lic = license_map.get(dep_str)
        if lic and not lic.startswith("UNKNOWN"):
            added_by_section.setdefault(lic, []).append(dep_str)
        else:
            unclassified.append(dep)
    # Also add license-changed deps to their new sections
    for ga, _old_ver, new_ver, _old_section, new_license in license_changed:
        added_by_section.setdefault(new_license, []).append(f"{ga}:{new_ver}")
    # Process the file line by line
    lines = original.split("\n")
    output_lines = []
    current_section_name = None
    # Track section boundaries so we can insert new deps at end of section
    # We'll buffer dep lines per section and flush them sorted
    in_dep_block = False
    dep_buffer: List[str] = []
    section_for_buffer: Optional[str] = None

    def flush_dep_buffer():
        """Sort and write buffered deps, including any additions for this section."""
        nonlocal dep_buffer, section_for_buffer
        # pop() consumes the additions, so whatever is left in added_by_section
        # after the scan belongs to sections that were never seen in the file.
        if section_for_buffer and section_for_buffer in added_by_section:
            dep_buffer.extend(added_by_section.pop(section_for_buffer))
        dep_buffer.sort()
        for d in dep_buffer:
            output_lines.append(d)
        dep_buffer = []
        section_for_buffer = None

    i = 0
    # We need to track which section we're in based on the parsed sections
    section_names = {s.name for s in sections}
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Detect section header (line of dashes following a non-empty line)
        if re.match(r"^-{3,}$", stripped) and i > 0:
            # Check if previous non-blank line is a section name
            # (looks back at most 4 lines)
            prev_text = ""
            for j in range(i - 1, max(i - 5, -1), -1):
                if lines[j].strip():
                    prev_text = lines[j].strip()
                    break
            if prev_text in section_names:
                # We're entering a new section; flush previous section's deps
                if in_dep_block:
                    flush_dep_buffer()
                    in_dep_block = False
                current_section_name = prev_text
                output_lines.append(line)
                i += 1
                in_dep_block = True
                section_for_buffer = current_section_name
                continue
        # Special case: Apache 2.0 section starts after first separator
        # Detect by "This project bundles" text
        if "This project bundles some components" in stripped:
            if in_dep_block:
                flush_dep_buffer()
            current_section_name = "Apache License Version 2.0"
            in_dep_block = True
            section_for_buffer = current_section_name
            output_lines.append(line)
            i += 1
            continue
        # If we hit a separator line while in a dep block, flush and exit block
        # NOTE(review): blank lines skipped after the last dep are not re-inserted
        # here, so the separator directly follows the flushed deps — confirm
        # this spacing is intended.
        if re.match(r"^-{20,}$", stripped) and in_dep_block:
            flush_dep_buffer()
            in_dep_block = False
            current_section_name = None
            output_lines.append(line)
            i += 1
            continue
        # Process dep lines
        if in_dep_block and DEP_LINE_RE.match(stripped):
            # Check if removed
            if stripped in removed_set:
                i += 1
                continue  # skip this line
            # Check if version changed
            parts = stripped.split(":")
            if len(parts) == 3:
                ga = f"{parts[0]}:{parts[1]}"
                if ga in change_map:
                    new_dep_str = f"{ga}:{change_map[ga]}"
                    dep_buffer.append(new_dep_str)
                    i += 1
                    continue
            dep_buffer.append(stripped)
            i += 1
            continue
        # If we were in a dep block and hit a non-dep, non-blank line
        # that's not a section header, this might be a note line (e.g., "(see licenses/...)")
        # or we've exited the dep area
        if in_dep_block and stripped and not DEP_LINE_RE.match(stripped):
            # Check if this could be a section header (next line might be dashes)
            if i + 1 < len(lines) and re.match(r"^-{3,}$", lines[i + 1].strip()):
                # It's a new section header; flush current deps
                flush_dep_buffer()
                in_dep_block = False
                current_section_name = None
                output_lines.append(line)
                i += 1
                continue
            else:
                # It's a note/preamble line within the section; flush deps first
                # then add this line. With an empty buffer the block stays open,
                # so deps that follow the note are still collected.
                if dep_buffer:
                    flush_dep_buffer()
                    in_dep_block = False
                output_lines.append(line)
                i += 1
                continue
        # Skip blank lines that trail deps in a section — the post-processor
        # re-inserts proper spacing between sections. Without this, blank
        # lines from the original file (between the last dep and the next
        # section header) end up before the flushed deps, creating
        # double-spacing. We only skip when the buffer already has deps;
        # blank lines before any deps (e.g., after a preamble like
        # "(see licenses/...)") are preserved.
        if in_dep_block and not stripped and dep_buffer:
            i += 1
            continue
        # Default: pass through
        output_lines.append(line)
        i += 1
    # Final flush
    if in_dep_block:
        flush_dep_buffer()
    # Handle unclassified deps: add a TODO section at the end
    if unclassified or added_by_section:
        output_lines.append("")
        if added_by_section:
            # Sections that weren't matched to existing sections in the file
            for section_name, deps in sorted(added_by_section.items()):
                output_lines.append("")
                output_lines.append(f"TODO: Add to [{section_name}] section:")
                for d in sorted(deps):
                    output_lines.append(d)
        if unclassified:
            output_lines.append("")
            output_lines.append("TODO: The following new deps need manual license lookup:")
            for dep in sorted(unclassified, key=str):
                output_lines.append(f" {dep}")
    # Post-process: ensure blank line separators between sections.
    # A section boundary is a non-blank, non-dep line followed by a line of dashes.
    # We need at least 2 blank lines before each section header.
    final_lines: List[str] = []
    for idx, line in enumerate(output_lines):
        is_section_header = (
            line.strip()
            and not DEP_LINE_RE.match(line.strip())
            and idx + 1 < len(output_lines)
            and re.match(r"^-{3,}$", output_lines[idx + 1].strip())
        )
        if is_section_header and final_lines:
            # Count existing trailing blank lines
            trailing_blanks = 0
            for prev in reversed(final_lines):
                if prev.strip() == "":
                    trailing_blanks += 1
                else:
                    break
            # Ensure at least 2 blank lines before the section header
            while trailing_blanks < 2:
                final_lines.append("")
                trailing_blanks += 1
        final_lines.append(line)
    output_lines = final_lines
    content = "\n".join(output_lines)
    # Clean up excessive blank lines (more than 2 consecutive)
    content = re.sub(r"\n{4,}", "\n\n\n", content)
    # Ensure trailing newline
    if not content.endswith("\n"):
        content += "\n"
    with open(output_file, "w") as f:
        f.write(content)
    print(f"Updated LICENSE-binary written to: {output_file}")
# ---------------------------------------------------------------------------
# Step 9: Generate NOTICE-binary
# ---------------------------------------------------------------------------
# Built-in NOTICE-binary header, returned by _read_notice_header() when no
# existing NOTICE-binary file is available to copy the header from.
_DEFAULT_NOTICE_HEADER = """\
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
// ------------------------------------------------------------------
// NOTICE file corresponding to the section 4d of The Apache License,
// Version 2.0, in this case for
// ------------------------------------------------------------------
"""
def _read_notice_header(notice_file: Path, num_lines: int = 7) -> str:
    """Read the header from an existing NOTICE-binary file.

    Returns the first ``num_lines`` lines of ``notice_file`` joined back
    together, line endings included. A file shorter than ``num_lines``
    simply yields fewer lines.

    Falls back to the built-in ``_DEFAULT_NOTICE_HEADER`` if the file
    doesn't exist.

    The file is decoded as UTF-8 (undecodable bytes are replaced) rather
    than with the platform's locale encoding, so the result does not depend
    on the machine the release is cut on. This mirrors how NOTICE entries
    read from JARs are decoded elsewhere in this script.
    """
    if not notice_file.exists():
        return _DEFAULT_NOTICE_HEADER
    with open(notice_file, encoding="utf-8", errors="replace") as f:
        # readline() returns "" once EOF is reached, so short files are fine.
        return "".join(f.readline() for _ in range(num_lines))
def extract_notices_from_jars(deps: Set[Dep], m2_repo: Path) -> List[Tuple[str, str]]:
    """
    For each dependency, open its JAR under ``m2_repo`` and extract
    META-INF/NOTICE (or NOTICE.txt / NOTICE.md, matched case-insensitively)
    if present.

    Dependencies are processed in ``str(dep)`` order so the result is
    deterministic. A dependency whose JAR is not at the conventional
    ``<group path>/<artifact>/<version>/<artifact>-<version>.jar`` location
    is skipped. A JAR that exists but cannot be read is reported on stderr
    and skipped, so a corrupt download cannot silently drop a notice.

    No boilerplate filtering happens here; notices are returned as found
    (whitespace-trimmed), and empty ones are omitted.

    Returns list of (dep_str, notice_text) tuples.
    """
    notice_entry_names = (
        "meta-inf/notice",
        "meta-inf/notice.txt",
        "meta-inf/notice.md",
    )
    notices: List[Tuple[str, str]] = []
    checked = 0
    found = 0
    for dep in sorted(deps, key=str):
        group_path = dep.group_id.replace(".", "/")
        jar_path = (
            m2_repo / group_path / dep.artifact_id / dep.version
            / f"{dep.artifact_id}-{dep.version}.jar"
        )
        if not jar_path.exists():
            continue
        checked += 1
        try:
            with zipfile.ZipFile(jar_path, "r") as zf:
                # First archive entry whose full (lower-cased) path is a
                # known NOTICE location wins.
                entry = next(
                    (name for name in zf.namelist()
                     if name.lower() in notice_entry_names),
                    None,
                )
                notice_text = (
                    zf.read(entry).decode("utf-8", errors="replace")
                    if entry is not None
                    else ""
                )
        except (zipfile.BadZipFile, KeyError, OSError) as exc:
            # Best effort: keep going, but tell the release manager that
            # this JAR's notice (if any) could not be collected.
            print(f" WARNING: could not read {jar_path}: {exc}", file=sys.stderr)
            continue
        notice_text = notice_text.strip()
        if notice_text:
            notices.append((str(dep), notice_text))
            found += 1
    print(f" Scanned {checked} JARs, found {found} with NOTICE files")
    return notices
def strip_asf_boilerplate(text: str) -> str:
    """
    Strip the common Apache Software Foundation boilerplate that appears in
    most Apache project NOTICE files, so it can be emitted once in the
    header instead of once per dependency.

    Two forms are removed, together with any blank lines directly after them:
    - the two-line form: "This product includes software developed at" (or
      "by") on one line, followed by a line starting with
      "The Apache Software Foundation"
    - the single-line form: "This product includes software developed at/by
      The Apache Software Foundation ..."

    A "developed at/by" line that is NOT followed by the ASF line is kept,
    since it credits somebody else. Lines are matched after trimming
    surrounding whitespace; kept lines are preserved verbatim.

    Returns the remaining text with leading/trailing whitespace removed
    (possibly the empty string).
    """
    two_line_opener = re.compile(
        r"^This product includes software developed (at|by)$"
    )
    asf_line = re.compile(r"^The Apache Software Foundation")
    one_line_form = re.compile(
        r"^This product includes software developed (at|by)\s+"
        r"The Apache Software Foundation"
    )

    lines = text.split("\n")
    total = len(lines)
    kept = []
    i = 0
    while i < total:
        line = lines[i].strip()
        if one_line_form.match(line):
            consumed = 1
        elif (
            two_line_opener.match(line)
            and i + 1 < total
            and asf_line.match(lines[i + 1].strip())
        ):
            consumed = 2
        else:
            kept.append(lines[i])
            i += 1
            continue
        i += consumed
        # Swallow the blank lines that trail the boilerplate
        while i < total and not lines[i].strip():
            i += 1
    return "\n".join(kept).strip()
def deduplicate_notices(notices: List[Tuple[str, str]]) -> str:
    """
    Collapse a list of (dep_str, notice_text) pairs into one merged text.

    Each notice first has the ASF boilerplate stripped; notices that end up
    empty are dropped. Two filters are then applied, in order:
      1. Exact duplicates: notices identical after collapsing every run of
         whitespace to a single space are emitted once.
      2. Same title: the "title" is the first line that is non-empty after
         removing leading '#' characters and surrounding whitespace. Among
         notices sharing a title only the longest text is kept (ties keep
         the earlier one), in the position where that title first appeared.
         A notice without any title is keyed by its dep string instead.

    The surviving texts are joined with a blank line between them.
    """
    exact_seen: Set[str] = set()
    # key (title or dep string) -> (dep string, longest text seen so far)
    longest_by_key: Dict[str, Tuple[str, str]] = OrderedDict()

    for origin, raw in notices:
        body = strip_asf_boilerplate(raw)
        if not body:
            continue

        fingerprint = re.sub(r"\s+", " ", body)
        if fingerprint in exact_seen:
            continue
        exact_seen.add(fingerprint)

        cleaned_lines = (ln.strip().lstrip("#").strip() for ln in body.split("\n"))
        heading = next((candidate for candidate in cleaned_lines if candidate), "")

        if not heading:
            # No usable title: fall back to the dependency string as the key
            longest_by_key[origin] = (origin, body)
            continue

        previous = longest_by_key.get(heading)
        if previous is None or len(body) > len(previous[1]):
            longest_by_key[heading] = (origin, body)

    return "\n\n".join(entry[1] for entry in longest_by_key.values())
def generate_notice_binary(
    deps: Set[Dep],
    m2_repo: Path,
    output_file: Path,
    existing_notice_file: Optional[Path] = None,
):
    """Generate NOTICE-binary by extracting and merging NOTICE files from dependency JARs.

    Args:
        deps: Dependencies whose JARs are scanned for NOTICE files.
        m2_repo: Root of the local Maven repository holding those JARs.
        output_file: Where the generated NOTICE-binary is written.
        existing_notice_file: Optional current NOTICE-binary whose leading
            lines are reused as the header; when omitted the built-in
            default header is used.

    If no dependency JAR yields a NOTICE, a warning is printed and nothing
    is written. The output is always encoded as UTF-8: notice text is
    decoded from the JARs as UTF-8 (with replacement characters), so
    writing it with a narrower locale encoding could fail.
    """
    print("\nGenerating NOTICE-binary...")
    notices = extract_notices_from_jars(deps, m2_repo)
    if not notices:
        print(" WARNING: No NOTICE files found in any dependency JARs.")
        print(" Make sure the project has been built (mvn install) first.")
        return

    if existing_notice_file:
        header = _read_notice_header(existing_notice_file)
    else:
        header = _DEFAULT_NOTICE_HEADER
    merged = deduplicate_notices(notices)
    content = header + "\n" + merged + "\n"

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(content)

    # "unique" counts distinct raw notice texts, before boilerplate stripping
    unique_count = len({text for _, text in notices})
    print(f" NOTICE-binary written to: {output_file}")
    print(f" Contains notices from {len(notices)} dependencies "
          f"({unique_count} unique)")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _load_deps_from_listing(listing_path: Path) -> "Set[Dep]":
    """Parse a saved dependency listing (the --skip-maven file) into deps.

    Blank lines and lines starting with '#' are ignored. Accepted shapes:
      group:artifact:type:version:scope             (5 fields)
      group:artifact:type:classifier:version:scope  (6 fields)
    Any other line is skipped; the number of skipped lines is reported on
    stderr so a malformed file does not silently shrink the dependency set.
    """
    deps = set()
    skipped = 0
    with open(listing_path, encoding="utf-8", errors="replace") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split(":")
            if len(parts) == 5:
                version = parts[3]
            elif len(parts) == 6:
                # a classifier sits between type and version
                version = parts[4]
            else:
                skipped += 1
                continue
            deps.add(Dep(parts[0].strip(), parts[1].strip(), version.strip()))
    if skipped:
        print(
            f"WARNING: ignored {skipped} unrecognized line(s) in {listing_path}",
            file=sys.stderr,
        )
    return deps


def main():
    """Command-line entry point: update LICENSE-binary and/or NOTICE-binary.

    Parses the CLI options, locates the Pinot checkout, collects the shipped
    dependency set (from Maven, or from a file with --skip-maven), then runs
    the LICENSE-binary and/or NOTICE-binary update. Exits with status 1 if
    the repository root or its LICENSE-binary cannot be found.
    """
    parser = argparse.ArgumentParser(
        description="Update LICENSE-binary and NOTICE-binary for Apache Pinot releases"
    )
    parser.add_argument(
        "--pinot-root",
        type=Path,
        default=None,
        help="Path to pinot repository root (auto-detected if not set)",
    )
    parser.add_argument(
        "--report-only",
        action="store_true",
        help="Only print the diff report; don't generate updated files",
    )
    scope_group = parser.add_mutually_exclusive_group()
    scope_group.add_argument(
        "--license-only",
        action="store_true",
        help="Only update LICENSE-binary (skip NOTICE-binary)",
    )
    scope_group.add_argument(
        "--notice-only",
        action="store_true",
        help="Only update NOTICE-binary (skip LICENSE-binary)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="Output path for updated LICENSE-binary (default: LICENSE-binary.updated)",
    )
    parser.add_argument(
        "--notice-output",
        type=Path,
        default=None,
        help="Output path for updated NOTICE-binary (default: NOTICE-binary.updated)",
    )
    parser.add_argument(
        "--m2-repo",
        type=Path,
        default=Path.home() / ".m2" / "repository",
        help="Path to local Maven repository (default: ~/.m2/repository)",
    )
    parser.add_argument(
        "--skip-maven",
        type=Path,
        default=None,
        help="Skip Maven invocation; read deps from this file instead "
             "(format: one group:artifact:type:version:scope per line)",
    )
    args = parser.parse_args()

    # The two flags are mutually exclusive, so at least one of these is True.
    update_license = not args.notice_only
    update_notice = not args.license_only

    # Determine pinot root
    if args.pinot_root:
        pinot_root = args.pinot_root.resolve()
    else:
        # Try to detect from script location or cwd
        script_dir = Path(__file__).resolve().parent
        candidate = script_dir.parent
        if (candidate / "LICENSE-binary").exists():
            pinot_root = candidate
        elif (Path.cwd() / "LICENSE-binary").exists():
            pinot_root = Path.cwd()
        else:
            print("ERROR: Cannot detect pinot root. Use --pinot-root.", file=sys.stderr)
            sys.exit(1)

    license_file = pinot_root / "LICENSE-binary"
    if not license_file.exists():
        print(f"ERROR: {license_file} not found", file=sys.stderr)
        sys.exit(1)

    print(f"Pinot root: {pinot_root}")
    print("Updating: ", end="")
    if update_license and update_notice:
        print("LICENSE-binary + NOTICE-binary")
    elif update_license:
        print("LICENSE-binary only")
    else:
        print("NOTICE-binary only")
    print()

    # Step 1: Parse assembly to find shipped modules
    modules = parse_assembly_modules(pinot_root)

    # Step 2-3: Get new deps from Maven (or from a pre-generated listing)
    if args.skip_maven:
        print(f"Reading deps from {args.skip_maven}...")
        new_deps = _load_deps_from_listing(args.skip_maven)
        print(f"Read {len(new_deps)} deps from file\n")
    else:
        new_deps = run_dependency_list(pinot_root, modules)

    # LICENSE-binary update
    if update_license:
        # Step 4: Parse current LICENSE-binary
        print("Parsing current LICENSE-binary...")
        _file_lines, sections, dep_to_section = parse_license_binary(license_file)
        total_old = sum(len(s.deps) for s in sections)
        print(f" Found {len(sections)} license sections with {total_old} total deps\n")
        for s in sections:
            print(f" {s.name}: {len(s.deps)} deps")
        print()

        # Step 5: Compute diff
        added, removed, changed = compute_diff(sections, new_deps)

        # Step 6: Detect licenses for new deps
        print("Detecting licenses for new dependencies...")
        license_map: Dict[str, str] = {}
        detected = 0
        for dep in added:
            lic = detect_license_from_pom(dep, args.m2_repo)
            if lic:
                license_map[str(dep)] = lic
                detected += 1
        print(f" Auto-detected licenses for {detected}/{len(added)} new deps")

        # Step 6a: Check if version-bumped deps changed their license
        print("Checking version-bumped deps for license changes...")
        changed, license_changed = check_version_bump_license_changes(
            changed, dep_to_section, args.m2_repo
        )
        if license_changed:
            print(f" WARNING: {len(license_changed)} deps changed license on version bump!")
        else:
            print(f" All {len(changed)} version bumps have unchanged licenses")

        # Step 6b: Detect orphaned license files
        print("Checking for orphaned license files...")
        empty_sections, orphaned_files = find_orphaned_license_files(
            pinot_root, sections, removed, added, changed, license_changed, license_map,
        )
        if empty_sections:
            print(f" {len(empty_sections)} sections will be empty after update")
        if orphaned_files:
            print(f" {len(orphaned_files)} license files not referenced in LICENSE-binary")
        print()

        # Step 7: Print report
        print_report(added, removed, changed, license_changed, license_map,
                     empty_sections, orphaned_files)

        # Step 8: Generate updated file
        if not args.report_only:
            output_path = args.output or (pinot_root / "LICENSE-binary.updated")
            generate_updated_license_binary(
                license_file, sections, added, removed, changed,
                license_changed, license_map, output_path,
            )
            print("\nReview the updated file, then:")
            print(f" diff {license_file} {output_path}")
            print(f" cp {output_path} {license_file}")
        else:
            print("(--report-only mode: no LICENSE-binary file written)")

    # NOTICE-binary update
    if update_notice:
        if args.report_only:
            print("(--report-only mode: no NOTICE-binary file written)")
        else:
            notice_output = args.notice_output or (pinot_root / "NOTICE-binary.updated")
            generate_notice_binary(
                new_deps, args.m2_repo, notice_output,
                existing_notice_file=pinot_root / "NOTICE-binary",
            )
            print("\n Review the updated NOTICE file, then:")
            print(f" diff {pinot_root / 'NOTICE-binary'} {notice_output}")
            print(f" cp {notice_output} {pinot_root / 'NOTICE-binary'}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()