blob: fc3ef04266248212d5d7cbe07697e318b5871655 [file]
#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
"""
Validates and fixes ASF license headers across the TinkerPop repository.
Handles all comment styles found in the repo:
- Java/Go/C# block comments (* prefix inside /* ... */)
- Double-slash (// prefix)
- Hash (# prefix)
- AsciiDoc block comment (content inside //// ... ////)
- HTML/XML block comment (content inside <!-- ... -->)
- Batch files (:: prefix)
- RST files (.. prefix)
Respects the rat-plugin exclusion list from the root pom.xml.
Usage:
python3 bin/fix-license-headers.py # report issues only
python3 bin/fix-license-headers.py --fix # report and fix issues
python3 bin/fix-license-headers.py --verbose # show per-file details
"""
import os
import re
import sys
import fnmatch
from collections import Counter
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# ---------------------------------------------------------------------------
# Canonical license text
# ---------------------------------------------------------------------------
# Lines of the license body with no comment prefix.
CANONICAL_LINES = [
"Licensed to the Apache Software Foundation (ASF) under one",
"or more contributor license agreements. See the NOTICE file",
"distributed with this work for additional information",
"regarding copyright ownership. The ASF licenses this file",
"to you under the Apache License, Version 2.0 (the",
'"License"); you may not use this file except in compliance',
"with the License. You may obtain a copy of the License at",
"",
" http://www.apache.org/licenses/LICENSE-2.0",
"",
"Unless required by applicable law or agreed to in writing,",
"software distributed under the License is distributed on an",
'"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY',
"KIND, either express or implied. See the License for the",
"specific language governing permissions and limitations",
"under the License.",
]
# Full AsciiDoc block (content between //// delimiters, inclusive).
CANONICAL_ASCIIDOC = """\
////
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
////"""
# Full HTML/XML comment block (content between <!-- and -->, inclusive).
CANONICAL_HTML = """\
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->"""
# Phrases that mark the end of any Apache license block (stripped of prefix).
# The standard form ends with "under the License." on its own line; the
# paragraph-wrapped form ends with "limitations under the License."
LICENSE_END_PHRASES = ("under the License.", "limitations under the License.")
# ---------------------------------------------------------------------------
# Exclusion patterns (mirrors rat-plugin <excludes> in root pom.xml)
# ---------------------------------------------------------------------------
EXCLUDE_PATTERNS = [
".mailmap", ".asf.yaml", ".travis.yml", ".travis.*.sh", ".dockerignore",
".github/**",
"**/.classpath", "**/.project", "**/.settings/**", "**/.idea/**",
".repository/**", "**/target/**",
"data/*.txt",
"**/bin/gremlin.sh", "gremlin-console/bin/gremlin.sh",
"docs/static/**", "docs/original/**", "docs/site/home/css/**", "docs/site/home/js/**",
"docs/gremlint/build/**", "docs/gremlint/public/CNAME",
"**/AGENTS.md",
"**/*.kryo", "**/*.gbin", "**/*.iml", "**/*.json", "**/*.xml",
"**/*.ldjson", "**/*.graffle", "**/*.svg", "**/*.trx", "**/*.sln",
"**/*.user", "**/*.csproj", "**/*.nuspec",
"**/goal.txt",
"**/src/main/resources/META-INF/services/**",
"**/src/test/resources/mockito-extensions/**",
"**/src/test/resources/META-INF/services/**",
"**/src/test/resources/cucumber.properties",
"**/src/test/resources/incorrect-traversals.txt",
"**/src/test/resources/org/apache/tinkerpop/gremlin/console/groovy/plugin/script-customizer-*.groovy",
"**/src/test/resources/org/apache/tinkerpop/gremlin/jsr223/script-customizer-*.groovy",
"**/src/test/resources/org/apache/tinkerpop/gremlin/console/jsr223/script-customizer-*.groovy",
"**/src/main/resources/org/apache/tinkerpop/gremlin/structure/io/script/*.txt",
"**/src/main/ext/**", "**/src/main/static/**",
"**/_bsp/**",
"DEPENDENCIES", "**/.glv",
"**/Debug/**", "**/Release/**", "**/obj/**",
"**/.vs/**", "**/NuGet.Config", "**/BenchmarkDotNet.Artifacts/**",
"**/.nvmrc", "**/.yarnrc.yml", "**/yarn.lock",
"**/node/**", "**/node_modules/**", "**/npm-debug.log",
"**/build/**", "**/doc/**", "**/lib/**",
"**/.env", "**/.prettierrc", "**/_site/**",
"**/.pytest_cache/**", "**/venv/**", "**/.venv/**", "**/.eggs/**",
"**/gremlinpython.egg-info/**", "**/docfx/**",
"**/go.sum", "**/coverage.out", "**/gremlinconsoletest.egg-info/**",
]
# Directories never descended into (faster than pattern matching every file).
PRUNE_DIRS = {'.git', 'target', 'node_modules', 'build', 'venv', '.venv',
'.eggs', 'doc', 'lib', 'Debug', 'Release', 'obj', 'docfx',
'_site', '__pycache__', '.pytest_cache', 'BenchmarkDotNet.Artifacts'}
# ---------------------------------------------------------------------------
# Pattern matching
# ---------------------------------------------------------------------------
def matches_exclude_pattern(rel_path, patterns):
rel_path = rel_path.replace(os.sep, "/")
for pattern in patterns:
pattern = pattern.replace(os.sep, "/")
if "/" not in pattern:
if fnmatch.fnmatch(os.path.basename(rel_path), pattern):
return True
elif "**" in pattern:
if pattern.startswith("**/"):
inner = pattern[3:]
parts = rel_path.split("/")
for i in range(len(parts)):
if fnmatch.fnmatch("/".join(parts[i:]), inner):
return True
if fnmatch.fnmatch(rel_path, pattern):
return True
else:
if fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch(os.path.basename(rel_path), pattern):
return True
return False
# ---------------------------------------------------------------------------
# AsciiDoc handling (//// ... ////)
# ---------------------------------------------------------------------------
def process_asciidoc(filepath, fix):
with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
content = f.read()
if 'Licensed to the Apache Software Foundation' not in content:
return 'no_license', []
lines = content.splitlines(keepends=True)
first = next((i for i, l in enumerate(lines) if l.rstrip('\r\n') == '////'), None)
if first is None:
return 'unparseable', ['no opening //// delimiter found']
second = next((i for i in range(first + 1, len(lines)) if lines[i].rstrip('\r\n') == '////'), None)
if second is None:
return 'unparseable', ['no closing //// delimiter found']
body = '\n'.join(l.rstrip('\r\n') for l in lines[first:second + 1])
if body == CANONICAL_ASCIIDOC:
return 'ok', []
if fix:
new_content = CANONICAL_ASCIIDOC + '\n' + ''.join(lines[second + 1:])
with open(filepath, 'w', encoding='utf-8') as f:
f.write(new_content)
return 'fixed', ['license block replaced with canonical form']
return 'has_issues', ['license block does not match canonical form']
# ---------------------------------------------------------------------------
# HTML/XML comment handling (<!-- ... -->)
# ---------------------------------------------------------------------------
def process_html_comment(filepath, fix, lines, open_idx):
"""Handle a <!-- --> comment block starting at open_idx."""
close_idx = next(
(i for i in range(open_idx + 1, len(lines)) if lines[i].rstrip('\r\n') == '-->'),
None
)
if close_idx is None:
return 'unparseable', ['no closing --> delimiter found']
body = '\n'.join(l.rstrip('\r\n') for l in lines[open_idx:close_idx + 1])
if body == CANONICAL_HTML:
return 'ok', []
if fix:
new_content = (
''.join(lines[:open_idx])
+ CANONICAL_HTML + '\n'
+ ''.join(lines[close_idx + 1:])
)
with open(filepath, 'w', encoding='utf-8') as f:
f.write(new_content)
return 'fixed', ['html/xml comment block replaced with canonical form']
return 'has_issues', ['html/xml comment block does not match canonical form']
# ---------------------------------------------------------------------------
# Generic comment-style handling
# ---------------------------------------------------------------------------
def detect_comment_style(lines, start_idx):
"""
Detect the comment style from the line at start_idx.
Returns (style, base_prefix). Styles:
block_star — ' * ' prefix (Java/JS/C# block comments)
double_slash — '// ' prefix
hash — '# ' prefix
double_colon — ':: ' prefix (batch files)
double_dot — '.. ' prefix (RST)
plain — no prefix
"""
line = lines[start_idx].rstrip('\n').rstrip('\r')
m = re.match(r'^(\s*\*\s+)', line)
if m:
return 'block_star', m.group(1)
m = re.match(r'^(//\s*)', line)
if m:
return 'double_slash', m.group(1)
m = re.match(r'^(::\s*)', line)
if m:
return 'double_colon', m.group(1)
m = re.match(r'^(\.\.\s+)', line)
if m:
return 'double_dot', m.group(1)
m = re.match(r'^(#\s*)', line)
if m:
first_prefix = m.group(1)
prefixes_seen = [first_prefix]
for j in range(start_idx + 1, min(start_idx + 10, len(lines))):
next_line = lines[j].rstrip('\n').rstrip('\r')
if next_line.rstrip() == '#':
continue
m2 = re.match(r'^(#\s*)', next_line)
if m2 and next_line.strip():
prefixes_seen.append(m2.group(1))
prefix_counter = Counter(prefixes_seen)
return 'hash', prefix_counter.most_common(1)[0][0]
m = re.match(r'^(\s+)', line)
if m and line.strip().startswith('Licensed'):
return 'space_indent', m.group(1)
return 'plain', ''
def get_line_content(line, style, base_prefix):
"""Strip the comment prefix and return the bare content of a line."""
raw = line.rstrip('\n').rstrip('\r')
if style == 'block_star':
m = re.match(r'^(\s*\*)(.*)', raw)
if m:
full = m.group(1) + m.group(2)
return full[len(base_prefix):].rstrip() if len(full) >= len(base_prefix) else ""
return raw.rstrip()
if style in ('double_slash', 'hash'):
char = '//' if style == 'double_slash' else '#'
m = re.match(r'^(' + re.escape(char) + r')(.*)', raw)
if m:
full = char + m.group(2)
return full[len(base_prefix):].rstrip() if len(full) >= len(base_prefix) else ""
return raw.strip()
if style == 'double_colon':
if raw.rstrip() == '::':
return ""
m = re.match(r'^(::\s*)(.*)', raw)
if m:
full = '::' + raw[2:]
return full[len(base_prefix):].rstrip() if len(full) >= len(base_prefix) else m.group(2).rstrip()
return raw.strip()
if style == 'double_dot':
if raw.rstrip() == '..':
return ""
if not raw.strip():
return ""
full = '..' + raw[2:]
return full[len(base_prefix):].rstrip() if len(full) >= len(base_prefix) else raw.strip()
if style == 'space_indent':
if not raw.strip():
return ""
return raw[len(base_prefix):].rstrip() if len(raw) >= len(base_prefix) else raw.rstrip()
return raw.rstrip() # plain
def find_license_block(lines, style, base_prefix):
"""
Locate start and end line indices of the license block.
Accepts both the standard ending ('under the License.') and the
paragraph-wrapped ending ('limitations under the License.').
"""
start_idx = next(
(i for i, l in enumerate(lines) if 'Licensed to the Apache Software Foundation' in l),
None
)
if start_idx is None:
return None
end_idx = None
for i in range(start_idx, min(start_idx + 55, len(lines))):
content = get_line_content(lines[i], style, base_prefix).rstrip()
if content in LICENSE_END_PHRASES:
end_idx = i
break
if end_idx is None:
return None
num_lines = end_idx - start_idx + 1
if not (14 <= num_lines <= 22):
return None
return start_idx, end_idx
def reconstruct_license_lines(style, base_prefix):
"""Build the corrected license block lines (with newlines)."""
new_lines = []
for canonical in CANONICAL_LINES:
if canonical == "":
if style == 'block_star':
blank = re.match(r'^(\s*\*)', base_prefix)
new_lines.append((blank.group(1) if blank else "") + "\n")
elif style == 'double_slash':
new_lines.append("//\n")
elif style == 'hash':
new_lines.append("#\n")
elif style == 'double_colon':
new_lines.append("::\n")
elif style == 'double_dot':
new_lines.append("..\n")
else:
new_lines.append("\n")
else:
new_lines.append((base_prefix if style != 'plain' else "") + canonical + "\n")
return new_lines
def process_generic(filepath, fix):
try:
with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
content = f.read()
except Exception as e:
return 'error', [str(e)]
if 'Licensed to the Apache Software Foundation' not in content:
return 'no_license', []
lines = content.splitlines(keepends=True)
licensed_idx = next(
(i for i, l in enumerate(lines) if 'Licensed to the Apache Software Foundation' in l),
None
)
if licensed_idx is None:
return 'no_license', []
# Check for HTML/XML comment block (<!-- on the line before Licensed)
if licensed_idx > 0 and lines[licensed_idx - 1].rstrip('\r\n') == '<!--':
return process_html_comment(filepath, fix, lines, licensed_idx - 1)
style, base_prefix = detect_comment_style(lines, licensed_idx)
result = find_license_block(lines, style, base_prefix)
if result is None:
return 'unparseable', ['could not locate complete license block']
start_idx, end_idx = result
extracted = [get_line_content(lines[i], style, base_prefix) for i in range(start_idx, end_idx + 1)]
mismatches = []
if len(extracted) != len(CANONICAL_LINES):
mismatches.append(f"line count: got {len(extracted)}, expected {len(CANONICAL_LINES)}")
for i in range(min(len(extracted), len(CANONICAL_LINES))):
if extracted[i] != CANONICAL_LINES[i]:
mismatches.append(f"line {i}: got {repr(extracted[i])}, expected {repr(CANONICAL_LINES[i])}")
if not mismatches:
return 'ok', []
if fix:
try:
new_lines = (
lines[:start_idx]
+ reconstruct_license_lines(style, base_prefix)
+ lines[end_idx + 1:]
)
with open(filepath, 'w', encoding='utf-8', errors='replace') as f:
f.writelines(new_lines)
return 'fixed', mismatches
except Exception as e:
return 'error', [f"fix failed: {e}"]
return 'has_issues', mismatches
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
fix_mode = '--fix' in sys.argv
verbose_mode = '--verbose' in sys.argv
print(f"Repository: {REPO_ROOT}")
print(f"Mode: {'fix' if fix_mode else 'check only'}")
print()
stats = Counter()
problems = {} # rel_path -> list of issue strings
unparseable = []
for dirpath, dirnames, filenames in os.walk(REPO_ROOT):
dirnames[:] = sorted(d for d in dirnames if d not in PRUNE_DIRS)
for filename in filenames:
filepath = os.path.join(dirpath, filename)
rel_path = os.path.relpath(filepath, REPO_ROOT).replace(os.sep, '/')
if matches_exclude_pattern(rel_path, EXCLUDE_PATTERNS):
continue
if filename.endswith('.asciidoc'):
status, issues = process_asciidoc(filepath, fix_mode)
else:
status, issues = process_generic(filepath, fix_mode)
stats[status] += 1
if status == 'has_issues':
problems[rel_path] = issues
elif status == 'unparseable':
unparseable.append((rel_path, issues))
elif status in ('fixed', 'ok') and verbose_mode:
print(f"{'FIXED' if status == 'fixed' else 'OK '} {rel_path}")
total = sum(stats.values())
print(f"Files scanned: {total}")
print(f" No license: {stats['no_license']}")
print(f" OK: {stats['ok']}")
print(f" Fixed: {stats['fixed']}")
print(f" Has issues: {stats['has_issues']}")
print(f" Unparseable: {stats['unparseable']}")
print(f" Errors: {stats['error']}")
if problems:
print(f"\n=== FILES WITH LICENSE ISSUES ({len(problems)}) ===")
for rel_path, issues in sorted(problems.items()):
print(f"\n {rel_path}:")
for issue in issues[:5]:
print(f" {issue}")
if unparseable:
print(f"\n=== UNPARSEABLE LICENSE BLOCKS ({len(unparseable)}) ===")
for rel_path, issues in unparseable:
print(f" {rel_path}: {issues[0]}")
return len(problems)
if __name__ == '__main__':
sys.exit(main())