scripts/apply-renames.py - juneau - Git at Google

 #!/usr/bin/env python3
 """
 apply-renames.py — Mechanical bulk URL/text substitution pass for juneau-docs.

 Reads renames-table.json and applies substitutions to:
   - juneau-docs/pages/**/*.md  (excluding historical release-notes pre-10.x)
   - juneau-docs/README.md
   - juneau-docs/src/pages/downloads.md

 Rules:
   - Skips code-fence regions (``` blocks) to avoid false positives.
   - Skips inline-code spans (`...`) to avoid false positives.
   - Skips release-notes files that do NOT start with "10." (9.x and older are historical).
   - Is idempotent — safe to re-run.
   - Prints a per-file change count summary at the end.
 """

 import json
 import os
 import re
 import sys

 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
 # Directory that holds this script; the rules table is looked up beside it.
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 # Parent of the script directory, used as the documentation root.
 DOCS_ROOT = os.path.dirname(SCRIPT_DIR)  # juneau-docs/
 # JSON rules file that main() passes to load_rules().
 RENAMES_TABLE = os.path.join(SCRIPT_DIR, "renames-table.json")

 # Files/directories to walk
 # PAGES_DIR is walked recursively by main() for files ending in ".md".
 PAGES_DIR = os.path.join(DOCS_ROOT, "pages")
 # Individual files processed in addition to the walk; main() skips any that
 # do not exist on disk.
 EXTRA_FILES = [
     os.path.join(DOCS_ROOT, "README.md"),
     os.path.join(DOCS_ROOT, "src", "pages", "downloads.md"),
 ]

 # Carve-out: files being renamed/deleted by a concurrent agent — skip entirely
 # (relative to PAGES_DIR)
 # Entries use "/" separators; main() converts them to os.sep before joining
 # them onto PAGES_DIR.
 CARVE_OUT_REL = {
     "topics/JuneauPetstoreOverview.md",
     "topics/22.01.V9.0-migration-guide.md",
     "topics/24.01.V10.0-migration-guide.md",
     "topics/03.Module-juneau-marshall-rdf.md",
     "topics/03.01.RdfBasics.md",
     "topics/03.02.RdfSerializers.md",
     "topics/03.03.RdfParsers.md",
     "topics/02.34.BestPractices.md",
     "topics/16.01.MyJettyMicroserviceBasics.md",
     "topics/16.02.MyJettyMicroserviceInstalling.md",
     "topics/16.03.MyJettyMicroserviceRunning.md",
     "topics/16.04.MyJettyMicroserviceBuilding.md",
 }


 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------

 def load_rules(path):
     """
     Load the substitution rules from a UTF-8 JSON file.

     The file must contain a JSON array of objects. Each object needs the keys
     "pattern", "replacement" and "is_regex"; "comment" is optional and
     defaults to "".

     Returns a list of 4-tuples (matcher, replacement, comment, is_regex) in
     file order. ``matcher`` is a compiled regular expression when the rule's
     "is_regex" value is truthy, otherwise the raw pattern string.

     Raises KeyError when a required key is missing and re.error when a regex
     pattern does not compile.
     """
     with open(path, "r", encoding="utf-8") as handle:
         entries = json.load(handle)

     def build(entry):
         # Key access order is kept: comment, replacement, is_regex, pattern.
         note = entry.get("comment", "")
         substitute = entry["replacement"]
         use_regex = bool(entry["is_regex"])
         source = entry["pattern"]
         matcher = re.compile(source) if use_regex else source
         return (matcher, substitute, note, use_regex)

     return [build(entry) for entry in entries]


 def mask_code(text):
     """
     Swap markdown code regions for opaque tokens so rename rules skip them.

     Three passes run in a fixed order:
       1. Every ``` ... ``` span (it may cross lines) becomes a token. A
          ``` with no closing ``` is not matched and stays as-is.
       2. Single-line backtick spans that are the entire content of square
          brackets, i.e. [`...`], are set aside and end up unmasked so that
          rules can still rewrite backtick-formatted link labels. The
          brackets alone trigger this; the pattern does not require a
          following (url).
       3. Every other single-line backtick span becomes a token.

     Tokens are NUL-delimited strings of the form MASKED_<n>, numbered in
     creation order starting at 0.

     Returns (masked_text, restore_map) where restore_map is a list of
     (token, original_text) pairs in creation order.
     """
     restore_map = []

     def stash(match):
         # The next token index equals the number of regions stashed so far.
         token = f"\x00MASKED_{len(restore_map)}\x00"
         restore_map.append((token, match.group(0)))
         return token

     # Pass 1: fenced blocks go first so backticks inside them are never
     # treated as inline spans.
     text = re.sub(r"```.*?```", stash, text, flags=re.DOTALL)

     # Pass 2: park [`...`] labels behind backtick-free sentinels so the
     # general inline pass below cannot see them.
     parked = []

     def park_label(match):
         sentinel = f"\x00LINKLABEL_{len(parked)}\x00"
         # Only the backtick span is stored; the brackets are re-emitted.
         parked.append((sentinel, match.group(1)))
         return f"[{sentinel}]"

     text = re.sub(r"\[(`[^`\n]+`)\]", park_label, text)

     # Pass 3: mask whatever inline code spans are left.
     text = re.sub(r"`[^`\n]+`", stash, text)

     # Pass 4: put the parked labels back, unmasked, so rules apply to them.
     for sentinel, span in parked:
         text = text.replace(f"[{sentinel}]", f"[{span}]")

     return text, restore_map


 def restore_code(text, restore_map):
     """
     Undo masking by substituting each placeholder with its original text.

     ``restore_map`` is a list of (placeholder, original) pairs in the order
     the placeholders were created. A later placeholder's original text can
     itself contain an earlier placeholder (an inline span that swallowed an
     already-masked fence), so the pairs are applied newest-first; applying
     them oldest-first would leave the nested placeholder in the result.

     Returns the text with all placeholders expanded.
     """
     for placeholder, original in reversed(restore_map):
         text = text.replace(placeholder, original)
     return text


 def apply_rules_to_text(text, rules):
     """
     Run every rule over ``text`` in sequence.

     ``rules`` is an iterable of (matcher, replacement, comment, is_regex)
     tuples. Regex rules use the compiled pattern's subn(); literal rules use
     str.count() followed by str.replace(). Each rule sees the output of the
     previous one.

     Returns (new_text, total_changes), where total_changes is the sum of the
     per-rule match counts.
     """
     hits = 0
     for matcher, substitute, _note, uses_regex in rules:
         if not uses_regex:
             # Count before replacing; str.replace() does not report a count.
             count = text.count(matcher)
             text = text.replace(matcher, substitute)
         else:
             text, count = matcher.subn(substitute, text)
         hits += count
     return text, hits


 def process_file(path, rules, carve_out_abs):
     """
     Apply the rename rules to one file, rewriting it in place if it changes.

     The file is skipped when ``path`` is in ``carve_out_abs``, when it lives
     under PAGES_DIR/release-notes and its name does not start with "10.",
     or when it cannot be decoded as UTF-8 (a warning goes to stderr).
     Code regions are masked before the rules run and restored afterwards.

     Returns (changed, n_substitutions); (False, 0) whenever nothing was
     written.
     """
     unchanged = (False, 0)

     if path in carve_out_abs:
         return unchanged

     # Release notes: only the 10.x files are eligible for rewriting.
     in_release_notes = os.path.relpath(path, PAGES_DIR).startswith(
         "release-notes" + os.sep
     )
     if in_release_notes and not os.path.basename(path).startswith("10."):
         return unchanged

     try:
         with open(path, "r", encoding="utf-8") as src:
             before = src.read()
     except UnicodeDecodeError:
         print(f"  WARNING: cannot decode {path} as UTF-8 — skipping", file=sys.stderr)
         return unchanged

     masked_text, restore_map = mask_code(before)
     rewritten, hit_count = apply_rules_to_text(masked_text, rules)
     after = restore_code(rewritten, restore_map)

     # Write only on a real difference so untouched files are left alone.
     if after != before:
         with open(path, "w", encoding="utf-8") as dst:
             dst.write(after)
         return True, hit_count
     return unchanged


 # ---------------------------------------------------------------------------
 # Main
 # ---------------------------------------------------------------------------

 def main():
     """
     Run the whole rename pass and print a summary.

     Loads the rules from RENAMES_TABLE, gathers every ".md" file under
     PAGES_DIR plus the EXTRA_FILES that exist, skips carve-out files and
     non-10.x release notes, hands the rest to process_file(), and prints
     per-file substitution counts followed by totals.
     """
     rules = load_rules(RENAMES_TABLE)
     print(f"Loaded {len(rules)} substitution rules from {RENAMES_TABLE}")

     # Carve-out entries are stored with "/" separators; normalise to os.sep.
     carve_out_abs = {
         os.path.join(PAGES_DIR, rel.replace("/", os.sep)) for rel in CARVE_OUT_REL
     }

     # Markdown files under pages/, then whichever extra files exist.
     target_files = [
         os.path.join(dirpath, fn)
         for dirpath, _dirnames, filenames in os.walk(PAGES_DIR)
         for fn in filenames
         if fn.endswith(".md")
     ]
     target_files.extend(extra for extra in EXTRA_FILES if os.path.isfile(extra))
     target_files.sort()

     changed_files = []
     skipped_carveout = []
     skipped_historical = []
     release_prefix = "release-notes" + os.sep

     for path in target_files:
         if path in carve_out_abs:
             skipped_carveout.append(path)
             continue

         # Release notes other than 10.x are historical and left untouched.
         is_release_note = os.path.relpath(path, PAGES_DIR).startswith(release_prefix)
         if is_release_note and not os.path.basename(path).startswith("10."):
             skipped_historical.append(path)
             continue

         changed, n = process_file(path, rules, carve_out_abs)
         if not changed:
             continue
         changed_files.append((os.path.relpath(path, DOCS_ROOT), n))

     total_subs = sum(n for _rel, n in changed_files)

     # Summary
     divider = "=" * 72
     print()
     print(divider)
     print("SUBSTITUTION SUMMARY")
     print(divider)
     if changed_files:
         print(f"\nFiles modified ({len(changed_files)}):")
         for rel, n in changed_files:
             print(f"  {n:4d}  {rel}")
     else:
         print("\n  (no files modified)")

     print(f"\nTotal substitutions: {total_subs}")
     print(f"Carve-out files skipped: {len(skipped_carveout)}")
     print(f"Historical release-notes skipped: {len(skipped_historical)}")
     print()

 # Run the rename pass only when executed as a script, not when imported.
 if __name__ == "__main__":
     main()
	#!/usr/bin/env python3
	"""
	apply-renames.py — Mechanical bulk URL/text substitution pass for juneau-docs.

	Reads renames-table.json and applies substitutions to:
	- juneau-docs/pages/**/*.md (excluding historical release-notes pre-10.x)
	- juneau-docs/README.md
	- juneau-docs/src/pages/downloads.md

	Rules:
	- Skips code-fence regions (``` blocks) to avoid false positives.
	- Skips inline-code spans (`...`) to avoid false positives.
	- Skips release-notes files that do NOT start with "10." (9.x and older are historical).
	- Is idempotent — safe to re-run.
	- Prints a per-file change count summary at the end.
	"""

	import json
	import os
	import re
	import sys

	# ---------------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------------
	# Directory that holds this script; the rules table is looked up beside it.
	SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
	# Parent of the script directory, used as the documentation root.
	DOCS_ROOT = os.path.dirname(SCRIPT_DIR) # juneau-docs/
	# JSON rules file that main() passes to load_rules().
	RENAMES_TABLE = os.path.join(SCRIPT_DIR, "renames-table.json")

	# Files/directories to walk
	# PAGES_DIR is walked recursively by main() for files ending in ".md".
	PAGES_DIR = os.path.join(DOCS_ROOT, "pages")
	# Individual files processed in addition to the walk; main() skips any that
	# do not exist on disk.
	EXTRA_FILES = [
	os.path.join(DOCS_ROOT, "README.md"),
	os.path.join(DOCS_ROOT, "src", "pages", "downloads.md"),
	]

	# Carve-out: files being renamed/deleted by a concurrent agent — skip entirely
	# (relative to PAGES_DIR)
	# Entries use "/" separators; main() converts them to os.sep before joining
	# them onto PAGES_DIR.
	CARVE_OUT_REL = {
	"topics/JuneauPetstoreOverview.md",
	"topics/22.01.V9.0-migration-guide.md",
	"topics/24.01.V10.0-migration-guide.md",
	"topics/03.Module-juneau-marshall-rdf.md",
	"topics/03.01.RdfBasics.md",
	"topics/03.02.RdfSerializers.md",
	"topics/03.03.RdfParsers.md",
	"topics/02.34.BestPractices.md",
	"topics/16.01.MyJettyMicroserviceBasics.md",
	"topics/16.02.MyJettyMicroserviceInstalling.md",
	"topics/16.03.MyJettyMicroserviceRunning.md",
	"topics/16.04.MyJettyMicroserviceBuilding.md",
	}


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	def load_rules(path):
	    """
	    Load the substitution rules from a UTF-8 JSON file.

	    The file must contain a JSON array of objects. Each object needs the keys
	    "pattern", "replacement" and "is_regex"; "comment" is optional and
	    defaults to "".

	    Returns a list of 4-tuples (matcher, replacement, comment, is_regex) in
	    file order. ``matcher`` is a compiled regular expression when the rule's
	    "is_regex" value is truthy, otherwise the raw pattern string.

	    Raises KeyError when a required key is missing and re.error when a regex
	    pattern does not compile.
	    """
	    with open(path, "r", encoding="utf-8") as fh:
	        raw = json.load(fh)
	    compiled = []
	    for rule in raw:
	        comment = rule.get("comment", "")
	        replacement = rule["replacement"]
	        if rule["is_regex"]:
	            compiled.append((re.compile(rule["pattern"]), replacement, comment, True))
	        else:
	            compiled.append((rule["pattern"], replacement, comment, False))
	    return compiled


	def mask_code(text):
	    """
	    Swap markdown code regions for opaque tokens so rename rules skip them.

	    Three passes run in a fixed order:
	      1. Every ``` ... ``` span (it may cross lines) becomes a token. A
	         ``` with no closing ``` is not matched and stays as-is.
	      2. Single-line backtick spans that are the entire content of square
	         brackets, i.e. [`...`], are set aside and end up unmasked so that
	         rules can still rewrite backtick-formatted link labels. The
	         brackets alone trigger this; the pattern does not require a
	         following (url).
	      3. Every other single-line backtick span becomes a token.

	    Tokens are NUL-delimited strings of the form MASKED_<n>, numbered in
	    creation order starting at 0.

	    Returns (masked_text, restore_map) where restore_map is a list of
	    (token, original_text) pairs in creation order.
	    """
	    restore_map = []
	    counter = [0]

	    def make_placeholder(original):
	        token = f"\x00MASKED_{counter[0]}\x00"
	        counter[0] += 1
	        restore_map.append((token, original))
	        return token

	    # Mask fenced code blocks first (``` ... ```) so backticks inside them
	    # are never treated as inline spans.
	    def mask_fence(m):
	        return make_placeholder(m.group(0))

	    text = re.sub(r"```.*?```", mask_fence, text, flags=re.DOTALL)

	    # Step 1: temporarily replace [` ... `] link-label backtick spans with a
	    # backtick-free sentinel that the general inline mask cannot match.
	    link_label_spans = {}
	    link_sentinel_counter = [0]

	    def exempt_link_label(m):
	        sentinel = f"\x00LINKLABEL_{link_sentinel_counter[0]}\x00"
	        link_sentinel_counter[0] += 1
	        # Store only the inner backtick span (group 1), not the brackets.
	        link_label_spans[sentinel] = m.group(1)
	        return f"[{sentinel}]"

	    text = re.sub(r"\[(`[^`\n]+`)\]", exempt_link_label, text)

	    # Step 2: mask all remaining inline code spans.
	    def mask_inline(m):
	        return make_placeholder(m.group(0))

	    text = re.sub(r"`[^`\n]+`", mask_inline, text)

	    # Step 3: restore the exempt link labels so substitutions apply to them.
	    for sentinel, inner in link_label_spans.items():
	        text = text.replace(f"[{sentinel}]", f"[{inner}]")

	    return text, restore_map


	def restore_code(text, restore_map):
	    """
	    Undo masking by substituting each placeholder with its original text.

	    ``restore_map`` is a list of (placeholder, original) pairs in the order
	    the placeholders were created. A later placeholder's original text can
	    itself contain an earlier placeholder (an inline span that swallowed an
	    already-masked fence), so the pairs are applied newest-first; applying
	    them oldest-first would leave the nested placeholder in the result.

	    Returns the text with all placeholders expanded.
	    """
	    for placeholder, original in reversed(restore_map):
	        text = text.replace(placeholder, original)
	    return text


	def apply_rules_to_text(text, rules):
	    """
	    Run every rule over ``text`` in sequence.

	    ``rules`` is an iterable of (matcher, replacement, comment, is_regex)
	    tuples. Regex rules use the compiled pattern's subn(); literal rules use
	    str.count() followed by str.replace(). Each rule sees the output of the
	    previous one.

	    Returns (new_text, total_changes), where total_changes is the sum of the
	    per-rule match counts.
	    """
	    total = 0
	    for pattern, replacement, _comment, is_regex in rules:
	        if is_regex:
	            new_text, n = pattern.subn(replacement, text)
	        else:
	            # Literal replacement — count occurrences, then replace.
	            n = text.count(pattern)
	            new_text = text.replace(pattern, replacement)
	        total += n
	        text = new_text
	    return text, total


	def process_file(path, rules, carve_out_abs):
	    """
	    Apply the rename rules to one file, rewriting it in place if it changes.

	    The file is skipped when ``path`` is in ``carve_out_abs``, when it lives
	    under PAGES_DIR/release-notes and its name does not start with "10.",
	    or when it cannot be decoded as UTF-8 (a warning goes to stderr).
	    Code regions are masked before the rules run and restored afterwards.

	    Returns (changed, n_substitutions); (False, 0) whenever nothing was
	    written.
	    """
	    if path in carve_out_abs:
	        return False, 0

	    # Check release-notes skip: only process 10.x files.
	    rel_pages = os.path.relpath(path, PAGES_DIR)
	    if rel_pages.startswith("release-notes" + os.sep):
	        fname = os.path.basename(path)
	        if not fname.startswith("10."):
	            return False, 0

	    try:
	        with open(path, "r", encoding="utf-8") as fh:
	            original = fh.read()
	    except UnicodeDecodeError:
	        print(f" WARNING: cannot decode {path} as UTF-8 — skipping", file=sys.stderr)
	        return False, 0

	    masked, restore_map = mask_code(original)
	    updated, n = apply_rules_to_text(masked, rules)
	    restored = restore_code(updated, restore_map)

	    # Write only on a real difference so untouched files are left alone.
	    if restored == original:
	        return False, 0

	    with open(path, "w", encoding="utf-8") as fh:
	        fh.write(restored)
	    return True, n


	# ---------------------------------------------------------------------------
	# Main
	# ---------------------------------------------------------------------------

	def main():
	    """
	    Run the whole rename pass and print a summary.

	    Loads the rules from RENAMES_TABLE, gathers every ".md" file under
	    PAGES_DIR plus the EXTRA_FILES that exist, skips carve-out files and
	    non-10.x release notes, hands the rest to process_file(), and prints
	    per-file substitution counts followed by totals.
	    """
	    rules = load_rules(RENAMES_TABLE)
	    print(f"Loaded {len(rules)} substitution rules from {RENAMES_TABLE}")

	    # Build carve-out absolute paths ("/" in the entries becomes os.sep).
	    carve_out_abs = set()
	    for rel in CARVE_OUT_REL:
	        carve_out_abs.add(os.path.join(PAGES_DIR, rel.replace("/", os.sep)))

	    # Collect all target files
	    target_files = []

	    # Walk pages/
	    for dirpath, _dirnames, filenames in os.walk(PAGES_DIR):
	        for fn in filenames:
	            if fn.endswith(".md"):
	                target_files.append(os.path.join(dirpath, fn))

	    # Extra top-level files (missing ones are ignored)
	    for extra in EXTRA_FILES:
	        if os.path.isfile(extra):
	            target_files.append(extra)

	    target_files.sort()

	    # Process
	    changed_files = []
	    skipped_carveout = []
	    skipped_historical = []
	    total_subs = 0

	    for path in target_files:
	        # Carve-out check
	        if path in carve_out_abs:
	            skipped_carveout.append(path)
	            continue

	        # Release-notes historical check
	        rel_pages = os.path.relpath(path, PAGES_DIR)
	        if rel_pages.startswith("release-notes" + os.sep):
	            fname = os.path.basename(path)
	            if not fname.startswith("10."):
	                skipped_historical.append(path)
	                continue

	        changed, n = process_file(path, rules, carve_out_abs)
	        if changed:
	            rel = os.path.relpath(path, DOCS_ROOT)
	            changed_files.append((rel, n))
	            total_subs += n

	    # Summary
	    print()
	    print("=" * 72)
	    print("SUBSTITUTION SUMMARY")
	    print("=" * 72)
	    if changed_files:
	        print(f"\nFiles modified ({len(changed_files)}):")
	        for rel, n in changed_files:
	            print(f" {n:4d} {rel}")
	    else:
	        print("\n (no files modified)")

	    print(f"\nTotal substitutions: {total_subs}")
	    print(f"Carve-out files skipped: {len(skipped_carveout)}")
	    print(f"Historical release-notes skipped: {len(skipped_historical)}")
	    print()


	# Run the rename pass only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()