blob: d5b3759c73885671deaa709416e612cb62e94e20 [file]
#!/usr/bin/env python3
"""
apply-renames.py — Mechanical bulk URL/text substitution pass for juneau-docs.
Reads renames-table.json and applies substitutions to:
- juneau-docs/pages/**/*.md (excluding historical release-notes pre-10.x)
- juneau-docs/README.md
- juneau-docs/src/pages/downloads.md
Rules:
- Skips code-fence regions (``` blocks) to avoid false positives.
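- Skips inline-code spans (`...`) to avoid false positives, except when the span is the entire label of a link ([`Name`](url)); those labels are rewritten together with their URLs.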
- Skips release-notes files that do NOT start with "10." (9.x and older are historical).
- Is idempotent — safe to re-run.
- Prints a per-file change count summary at the end.
"""
import json
import os
import re
import sys
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Directory holding this script; the rename table is read from the same place.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Parent of the script directory: the juneau-docs/ root.
DOCS_ROOT = os.path.dirname(SCRIPT_DIR)
RENAMES_TABLE = os.path.join(SCRIPT_DIR, "renames-table.json")

# Tree that is walked recursively for *.md files.
PAGES_DIR = os.path.join(DOCS_ROOT, "pages")

# Individual files outside PAGES_DIR that are processed as well.
EXTRA_FILES = [
    os.path.join(DOCS_ROOT, *parts)
    for parts in (
        ("README.md",),
        ("src", "pages", "downloads.md"),
    )
]

# Carve-out: pages being renamed/deleted by a concurrent agent, skipped
# entirely. Entries are "/"-separated paths relative to PAGES_DIR; every one
# of them currently lives under topics/.
CARVE_OUT_REL = {
    f"topics/{page}"
    for page in (
        "JuneauPetstoreOverview.md",
        "02.34.BestPractices.md",
        "03.Module-juneau-marshall-rdf.md",
        "03.01.RdfBasics.md",
        "03.02.RdfSerializers.md",
        "03.03.RdfParsers.md",
        "16.01.MyJettyMicroserviceBasics.md",
        "16.02.MyJettyMicroserviceInstalling.md",
        "16.03.MyJettyMicroserviceRunning.md",
        "16.04.MyJettyMicroserviceBuilding.md",
        "22.01.V9.0-migration-guide.md",
        "24.01.V10.0-migration-guide.md",
    )
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def load_rules(path):
    """
    Load the substitution rules stored as a JSON array at ``path``.

    Each array element is an object with the keys ``pattern``,
    ``replacement`` and ``is_regex`` (all required) and ``comment``
    (optional, defaults to "").

    Returns a list of ``(matcher, replacement, comment, is_regex)`` tuples in
    file order. ``matcher`` is a compiled ``re.Pattern`` when ``is_regex`` is
    truthy and the raw pattern string otherwise; the fourth element is always
    a plain ``True``/``False``.

    Raises ``KeyError`` when a required key is missing and ``re.error`` when
    a regex pattern does not compile.
    """
    def to_rule(entry):
        note = entry.get("comment", "")
        substitute = entry["replacement"]
        uses_regex = bool(entry["is_regex"])
        source = entry["pattern"]
        matcher = re.compile(source) if uses_regex else source
        return (matcher, substitute, note, uses_regex)

    with open(path, "r", encoding="utf-8") as fh:
        entries = json.load(fh)
    return [to_rule(entry) for entry in entries]
def mask_code(text):
    """
    Hide code regions of a Markdown document behind placeholder tokens.

    Returns ``(masked_text, restore_map)``:

    - ``masked_text`` is ``text`` with every fenced block (``` ... ```) and
      every single-line inline-code span (` ... `) replaced by a token of the
      form ``"\\x00MASKED_<n>\\x00"``.
    - ``restore_map`` is a list of ``(token, original)`` pairs, in the order
      the tokens were created, that lets the caller undo the masking.

    Exception: an inline-code span that is the entire content of a bracketed
    label, [` ... `], is left in place so that backtick-formatted link labels
    are rewritten together with their URLs.
    """
    restore_map = []

    def stash(original):
        # The current map length is a unique, monotonically increasing id.
        token = f"\x00MASKED_{len(restore_map)}\x00"
        restore_map.append((token, original))
        return token

    # Pass 1: fenced code blocks. Done first so backticks inside a fence are
    # never interpreted as inline-code delimiters.
    text = re.sub(
        r"```.*?```", lambda m: stash(m.group(0)), text, flags=re.DOTALL
    )

    # Pass 2: inline code and link labels in ONE left-to-right scan.
    # Exempting labels in a separate earlier pass is unsafe: in text such as
    #     `x[` and `]y`
    # the "[" and "]" belong to two different code spans, yet a standalone
    # label regex matches "[` and `]" across them. The temporary marker it
    # leaves behind then gets captured inside a masked span and is never
    # restored, corrupting the file. Scanning once means a span that has
    # already been consumed as inline code cannot also donate its brackets
    # to a bogus label match.
    def mask_inline(m):
        if m.group("label") is not None:
            return m.group(0)  # keep link labels visible to the rename rules
        return stash(m.group(0))

    text = re.sub(r"(?P<label>\[`[^`\n]+`\])|`[^`\n]+`", mask_inline, text)
    return text, restore_map
def restore_code(text, restore_map):
    """
    Undo masking: substitute every placeholder in ``text`` with its original.

    ``restore_map`` is a list of ``(placeholder, original)`` pairs in the
    order the placeholders were created. Returns the restored text.

    Pairs are applied newest-first. A later placeholder's saved original may
    itself contain an earlier placeholder (an inline-code span wrapped around
    an already-masked fence, e.g. "` ```x``` `"). Restoring oldest-first
    would re-introduce that inner placeholder after its turn had passed and
    leave it in the output.
    """
    for placeholder, original in reversed(restore_map):
        text = text.replace(placeholder, original)
    return text
def apply_rules_to_text(text, rules):
    """
    Run every rule over ``text`` in order and report how many hits occurred.

    ``rules`` is an iterable of ``(matcher, replacement, comment, is_regex)``
    tuples. When ``is_regex`` is true, ``matcher`` is a compiled pattern and
    is applied with ``subn``; otherwise it is a literal string applied with
    ``str.replace``. Each rule sees the output of the previous one.

    Returns ``(new_text, total_changes)``.
    """
    hits = 0
    for matcher, substitute, _note, uses_regex in rules:
        if not uses_regex:
            # str.count and str.replace both work on non-overlapping
            # occurrences, so the count matches what replace() rewrites.
            found = text.count(matcher)
            text = text.replace(matcher, substitute)
        else:
            text, found = matcher.subn(substitute, text)
        hits += found
    return text, hits
def process_file(path, rules, carve_out_abs):
    """
    Apply ``rules`` to one file in place, leaving code regions untouched.

    Parameters:
        path: absolute path of the file to rewrite.
        rules: rule tuples as produced by ``load_rules``.
        carve_out_abs: collection of absolute paths that must never be
            touched.

    Returns ``(changed, n_substitutions)``. The file is written only when the
    resulting text differs from what was read; in every skip case the result
    is ``(False, 0)``.

    The file is skipped when it is carved out, when it sits under
    ``release-notes/`` in PAGES_DIR and its name does not start with "10.",
    when it is not valid UTF-8, or when restoring the masked code regions
    left a placeholder behind.
    """
    if path in carve_out_abs:
        return False, 0

    # Release notes older than 10.x are historical and stay as they are.
    rel_pages = os.path.relpath(path, PAGES_DIR)
    if rel_pages.startswith("release-notes" + os.sep):
        if not os.path.basename(path).startswith("10."):
            return False, 0

    try:
        # newline="" turns off universal-newline translation. Without it, any
        # file that gets modified would also have every line ending rewritten
        # to the platform default, burying the real change in whitespace
        # noise.
        with open(path, "r", encoding="utf-8", newline="") as fh:
            original = fh.read()
    except UnicodeDecodeError:
        print(f" WARNING: cannot decode {path} as UTF-8 — skipping", file=sys.stderr)
        return False, 0

    masked, restore_map = mask_code(original)
    updated, n = apply_rules_to_text(masked, rules)
    restored = restore_code(updated, restore_map)

    if restored == original:
        return False, 0

    # Mask placeholders are delimited by NUL characters. If the input had
    # none but the output does, a placeholder failed to round-trip; refuse to
    # write a corrupted file.
    if "\x00" in restored and "\x00" not in original:
        print(f" WARNING: unrestored mask placeholder in {path} — skipping", file=sys.stderr)
        return False, 0

    with open(path, "w", encoding="utf-8", newline="") as fh:
        fh.write(restored)
    return True, n
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """
    Entry point: load the rename table, rewrite every target file, and print
    a per-file summary.

    Targets are all ``*.md`` files under PAGES_DIR plus whichever EXTRA_FILES
    exist, visited in sorted path order. Carved-out pages and release notes
    whose file name does not start with "10." are counted but not processed.
    """
    rules = load_rules(RENAMES_TABLE)
    print(f"Loaded {len(rules)} substitution rules from {RENAMES_TABLE}")

    # Carve-out entries use "/" separators; convert to native absolute paths.
    carve_out_abs = {
        os.path.join(PAGES_DIR, rel.replace("/", os.sep))
        for rel in CARVE_OUT_REL
    }

    # Gather every candidate file, then fix the processing order.
    candidates = [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(PAGES_DIR)
        for name in names
        if name.endswith(".md")
    ]
    candidates.extend(extra for extra in EXTRA_FILES if os.path.isfile(extra))
    candidates = sorted(candidates)

    release_notes_prefix = "release-notes" + os.sep
    modified = []
    carved_out = 0
    historical = 0
    grand_total = 0

    for path in candidates:
        if path in carve_out_abs:
            carved_out += 1
            continue

        in_release_notes = os.path.relpath(path, PAGES_DIR).startswith(
            release_notes_prefix
        )
        if in_release_notes and not os.path.basename(path).startswith("10."):
            historical += 1
            continue

        changed, n = process_file(path, rules, carve_out_abs)
        if not changed:
            continue
        modified.append((os.path.relpath(path, DOCS_ROOT), n))
        grand_total += n

    # Summary report.
    banner = "=" * 72
    print()
    print(banner)
    print("SUBSTITUTION SUMMARY")
    print(banner)
    if not modified:
        print("\n (no files modified)")
    else:
        print(f"\nFiles modified ({len(modified)}):")
        for rel, n in modified:
            print(f" {n:4d} {rel}")
    print(f"\nTotal substitutions: {grand_total}")
    print(f"Carve-out files skipped: {carved_out}")
    print(f"Historical release-notes skipped: {historical}")
    print()


if __name__ == "__main__":
    main()