#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Script to find files missing Apache 2 license headers in the Hamilton repository."""
import fnmatch
import sys
from pathlib import Path
# License header patterns to check for.
# ASF-specific header (our standard): a file counts as carrying it only when
# ALL of these substrings appear in the inspected leading lines of the file
# (see has_license_header).
ASF_LICENSE_PATTERNS = [
    "Licensed to the Apache Software Foundation (ASF)",
    "Apache License, Version 2.0",
]
# Third-party Apache 2.0 headers (also acceptable as an alternative to the
# ASF header). ALL of these substrings must appear in the inspected leading
# lines of the file (see has_license_header).
# Note: Some third-party headers may have spaces in the URL.
# NOTE(review): patterns are matched as exact substrings, so a URL that
# contains spaces would not match the second entry -- confirm this is intended.
THIRD_PARTY_APACHE_PATTERNS = [
    "Apache License, Version 2.0",
    "www.apache.org/licenses/LICENSE-2.0",
]
# File extensions to EXCLUDE from checking (based on what exists in this repo).
# Entries are compared against ``Path.suffix`` (the final extension, including
# the leading dot) in find_files_without_license.
EXCLUDED_EXTENSIONS = {
    # Python compiled/generated
    ".pyc",
    ".pyi",  # Type stubs
    ".pyx",  # Cython
    ".pxd",  # Cython headers
    ".pxi",  # Cython includes
    # Compiled binaries
    ".so",
    ".dylib",
    ".jar",
    # Images and media
    ".png",
    ".svg",
    ".ttf",
    ".afm",  # Adobe font metrics
    # Config/data files
    ".json",
    ".jsonl",
    ".yaml",
    ".yml",
    ".toml",
    ".cfg",  # setup.cfg, etc.
    ".conf",  # nginx.conf, etc.
    ".xml",  # Test data, config files
    ".csv",
    ".fwf",  # Fixed-width format test data
    ".dot",  # Graphviz DOT files
    ".npy",  # NumPy arrays
    ".mat",  # MATLAB data
    ".sav",  # SPSS data
    ".po",  # Gettext translations
    ".mo",  # Compiled translations
    ".template",  # Template config files
    # Build/generated files
    ".map",  # Source maps
    ".gz",
    ".log",
    ".typed",  # PEP 561 marker
    # Web assets (usually don't have license headers)
    ".css",
    ".scss",
    ".html",
    # JavaScript config files (these are code but often generated)
    # NOTE: a bare dotfile such as ".eslintrc" has an empty ``Path.suffix``,
    # so these entries only match names like "foo.eslintrc"; the dotfiles
    # themselves are filtered by the hidden-name rule in should_skip_path.
    ".eslintrc",
    ".nycrc",
    ".npmignore",
    ".editorconfig",
    # Template files
    ".j2",
    ".jinja2",
    # Documentation that doesn't need headers
    ".txt",
    ".rst",
    # Other
    ".gitkeep",
    ".asc",  # GPG keys
    ".cmd",  # Windows batch
    ".coffee",  # CoffeeScript (if any)
    ".mjs",  # ES modules (often generated)
    ".cjs",  # CommonJS modules (often generated)
    ".mts",  # TypeScript ES modules
    ".flow",  # Flow type definitions
    ".in",  # MANIFEST.in, etc.
}
# Specific filenames to exclude. Entries are compared for exact equality
# against ``Path.name`` (the final path component) in
# find_files_without_license.
EXCLUDED_FILENAMES = {
    # Lock files
    "package-lock.json",
    "yarn.lock",
    "poetry.lock",
    "uv.lock",
    # License/legal files
    "LICENSE",
    "NOTICE",
    "CHANGELOG",
    # OS files
    ".DS_Store",
}
# Directories to skip. should_skip_path tests every component of a path
# against these entries, so a match anywhere in the path excludes the file.
SKIP_DIRS = {
    ".git",
    "__pycache__",
    "node_modules",
    ".pytest_cache",
    ".mypy_cache",
    ".tox",
    "venv",
    ".venv",
    "build",
    "dist",
    # Glob-style entry: it only takes effect where entries are matched as
    # patterns rather than compared for equality.
    "*.egg-info",
    ".eggs",
    "htmlcov",
    ".coverage",
    ".claude",
}
def should_skip_path(path: Path) -> bool:
    """Return True when ``path`` should be excluded from the license check.

    A path is skipped when any of its components (directories or the file
    name itself) either

    * matches an entry of ``SKIP_DIRS`` -- entries are treated as
      case-sensitive shell-style patterns, so ``"*.egg-info"`` matches a
      directory such as ``foo.egg-info`` while plain names still match
      exactly; or
    * is hidden, i.e. starts with ``"."``.

    Documentation snippet files are skipped as well (they're embedded in the
    docs via literalinclude).

    Args:
        path: Path to test. Every component is inspected, so pass a path
            relative to the directory being scanned when ancestors outside
            that directory must not influence the result.

    Returns:
        True if the path should be skipped, False otherwise.
    """
    for part in path.parts:
        # ".." is a parent-directory reference, not a hidden directory.
        if part != ".." and part.startswith("."):
            return True
        # fnmatchcase keeps plain entries as exact, case-sensitive matches
        # while making glob entries such as "*.egg-info" effective.
        if any(fnmatch.fnmatchcase(part, pattern) for pattern in SKIP_DIRS):
            return True

    # Compare against the forward-slash form so the multi-component check
    # below also works where the native separator is a backslash.
    posix_path = path.as_posix()
    if "docs" in path.parts and "_snippets" in posix_path:
        return True
    if "docs/code-comparisons" in posix_path and "snippets" in posix_path:
        return True
    return False
def has_license_header(file_path: Path, num_lines: int = 20) -> bool:
    """Check if a file has an Apache 2 license header (ASF or third-party).

    The file is read as UTF-8 text and only its first ``num_lines`` lines are
    searched. A header is recognised when every string in
    ``ASF_LICENSE_PATTERNS`` or every string in
    ``THIRD_PARTY_APACHE_PATTERNS`` occurs in those lines.

    Args:
        file_path: File to inspect.
        num_lines: Number of leading lines to search for the header.

    Returns:
        True if a header was found, or if the file could not be read as text
        (not valid UTF-8, permission denied, broken symlink, vanished file,
        ...). Unreadable files are assumed to be fine to avoid false
        positives. False if the file was read and no header was found.
    """
    try:
        with open(file_path, encoding="utf-8") as f:
            # readlines() decodes the whole file, so invalid UTF-8 anywhere
            # in it (not only in the header) triggers the fallback below.
            content = "".join(f.readlines()[:num_lines])
    except (UnicodeDecodeError, OSError):
        # OSError covers PermissionError as well as FileNotFoundError
        # (e.g. a broken symlink) and IsADirectoryError, none of which should
        # abort the whole scan.
        return True

    # Check if all ASF license patterns are present
    has_asf_header = all(pattern in content for pattern in ASF_LICENSE_PATTERNS)
    # Check if all third-party Apache 2.0 patterns are present
    has_third_party_header = all(
        pattern in content for pattern in THIRD_PARTY_APACHE_PATTERNS
    )
    # Accept either ASF or third-party Apache 2.0 headers
    return has_asf_header or has_third_party_header
def find_files_without_license(root_dir: Path) -> list[Path]:
    """Find all files without Apache 2 license headers.

    Uses an exclusion-based approach: checks all files except those with
    excluded extensions or filenames, those under skipped paths, editor
    backup files, and extension-less files other than ``Dockerfile*`` and
    ``README``.

    Args:
        root_dir: Root directory to search

    Returns:
        Sorted list of file paths without license headers. The paths are the
        ones produced by ``root_dir.rglob``, i.e. they include ``root_dir``.
    """
    files_without_license = []
    for file_path in root_dir.rglob("*"):
        # Only regular files can carry a header. This skips directories and
        # also broken symlinks, FIFOs and sockets, which must not be opened.
        if not file_path.is_file():
            continue

        # Apply the skip rules to the path relative to root_dir. Otherwise a
        # hidden or "build"/"venv"/... ancestor of root_dir itself would
        # cause every file to be skipped and the scan to report success.
        if should_skip_path(file_path.relative_to(root_dir)):
            continue

        # Skip if extension is in exclusion list
        if file_path.suffix in EXCLUDED_EXTENSIONS:
            continue

        name = file_path.name

        # Skip if filename is in exclusion list
        if name in EXCLUDED_FILENAMES:
            continue

        # Skip editor backup files (emacs, vim, etc.)
        if name.startswith("#") or name.endswith(("~", "#")):
            continue

        # Skip files without extensions that aren't special files
        if (
            not file_path.suffix
            and not name.startswith("Dockerfile")
            and name != "README"
        ):
            continue

        # Check for license header
        if not has_license_header(file_path):
            files_without_license.append(file_path)

    return sorted(files_without_license)
def main():
    """Scan the repository and report files lacking a license header.

    Prints a short summary of the exclusion settings, then either a success
    message or the list of offending files (relative to the repository root
    where possible).

    Returns:
        0 when every checked file has a license header, 1 otherwise.
    """
    # The repository root is two levels above this file.
    repo_root = Path(__file__).parent.parent

    banner = (
        f"Checking for Apache 2 license headers in {repo_root}",
        "Mode: Checking all files except excluded types",
        f"Excluded extensions: {len(EXCLUDED_EXTENSIONS)} types",
        f"Excluded filenames: {len(EXCLUDED_FILENAMES)} patterns",
    )
    for banner_line in banner:
        print(banner_line)
    print()

    unlicensed = find_files_without_license(repo_root)
    if unlicensed:
        print(f"Found {len(unlicensed)} files without Apache 2 license headers:\n")
        for offender in unlicensed:
            # Prefer the path relative to the repository root; fall back to
            # the path as found when it cannot be made relative.
            try:
                shown = offender.relative_to(repo_root)
                print(f" {shown}")
            except ValueError:
                print(f" {offender}")
        return 1

    print("✓ All files have license headers!")
    return 0
# Run the check when executed as a script; main()'s return value is passed to
# sys.exit() and so becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())