scripts/check_dead_links.py - doris-website - Git at Google

 # Detect global dead links (no auto fix)
 #
 # Core logic:
 # Walk every document under the search directories, extract its Markdown links, and treat a relative link as dead when its target path does not exist on disk.
 # If it is a dead link, print:
 #   ❌ 目标文档 xxxx/xxxx.md
 #   Broken link ${target_link}
 #
 # This version:
 #   - skips links inside inline code and fenced code blocks
 #   - ignores anchors (#xxx)
 #   - checks .md and .mdx variants
 #   - never modifies files
 #   - counts and prints total broken links at the end

 import argparse
 import os
 import re
 import sys
 from urllib.parse import urlparse

 # Directories (relative to the working directory) whose documents are scanned.
 search_dirs = [
     "docs",
     "i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1",
     "i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x",
     "i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x",
     "versioned_docs/version-2.1",
     "versioned_docs/version-3.x",
     "versioned_docs/version-4.x",
     "community",
 ]
 # Global count of dead links found so far.
 broken_count = 0

 # Markdown inline link: [text](target)
 _LINK_PATTERN = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
 # Fenced ``` blocks (possibly multi-line) and inline `code` spans
 _CODE_PATTERN = re.compile(r"(```.*?```|`[^`]*`)", re.DOTALL)
 # Optional quoted title ending a link body: (path "title") or (path 'title')
 _LINK_TITLE_PATTERN = re.compile(r"""\s+(?:"[^"]*"|'[^']*')$""")


 def _link_destination(raw_target):
     """Return the destination of a link body without its title or ``<>`` wrapping."""
     destination = _LINK_TITLE_PATTERN.sub("", raw_target)
     if len(destination) >= 2 and destination.startswith("<") and destination.endswith(">"):
         destination = destination[1:-1]
     return destination


 def _link_target_exists(base_dir, link_path):
     """Return True when ``link_path`` resolves to an existing path under ``base_dir``.

     The path is tried as written and percent-decoded, each with no suffix,
     ``.md`` and ``.mdx`` appended.
     """
     from urllib.parse import unquote  # local import: only this helper needs it

     candidates = [link_path]
     decoded = unquote(link_path)
     if decoded != link_path:
         candidates.append(decoded)

     for candidate in candidates:
         resolved = os.path.normpath(os.path.join(base_dir, candidate))
         if any(os.path.exists(resolved + suffix) for suffix in ("", ".md", ".mdx")):
             return True
     return False


 def process_md_file(file_path):
     """Report every dead relative link found in one Markdown/MDX file.

     Links that start inside a fenced code block or an inline code span are
     ignored, as are links with a URL scheme (http, https, mailto, ...),
     absolute paths and pure ``#anchor`` links. A relative link is alive when
     its path, with any ``#anchor`` removed, exists relative to the directory
     of ``file_path``, either as written or with ``.md`` / ``.mdx`` appended.
     A quoted link title, ``<...>`` wrapping and percent-encoding are
     tolerated so that valid links are not reported as dead.

     Every dead link is printed and added to the module-level ``broken_count``.

     Args:
         file_path: Path of the Markdown/MDX file to scan; it is read as UTF-8.
     """
     global broken_count

     with open(file_path, "r", encoding="utf-8") as f:
         content = f.read()

     # Character ranges occupied by code; links starting inside them are skipped.
     code_spans = [m.span() for m in _CODE_PATTERN.finditer(content)]
     base_dir = os.path.dirname(file_path)

     for match in _LINK_PATTERN.finditer(content):
         link_target = match.group(2).strip()
         position = match.start()
         if any(start <= position < end for start, end in code_spans):
             continue

         destination = _link_destination(link_target)

         # External links and absolute paths cannot be resolved on disk here.
         if urlparse(destination).scheme or os.path.isabs(destination):
             continue

         # Drop the anchor; an empty remainder means a same-page link.
         link_path = destination.split("#", 1)[0]
         if not link_path:
             continue

         if not _link_target_exists(base_dir, link_path):
             print(f"目标文档 {file_path}\nBroken link {link_target}\n")
             broken_count += 1

 def travel(root_path: str):
     """Walk ``root_path`` recursively and check every ``.md`` / ``.mdx`` file in it."""
     markdown_suffixes = (".md", ".mdx")
     for current_dir, _subdirs, filenames in os.walk(root_path):
         for filename in filenames:
             if not filename.endswith(markdown_suffixes):
                 continue
             process_md_file(os.path.join(current_dir, filename))

 if __name__ == "__main__":
     # Script entry point: scan each configured directory, then print a summary.
     parser = argparse.ArgumentParser(description="Detect broken md links (no auto fix)")
     # --commit-id is accepted but currently has no effect on the scan.
     parser.add_argument("--commit-id", type=str, help="Optional Git commit id (ignored for now)", default=None)
     args = parser.parse_args()

     print("🔍 Scanning for broken links...\n")

     # Directories that do not exist in the working directory are skipped.
     # The loop variable is not called `dir` so the builtin stays usable.
     for search_dir in search_dirs:
         if os.path.exists(search_dir):
             travel(search_dir)

     if broken_count > 0:
         print(f"❗ 共发现 {broken_count} 个死链")
     else:
         print("✅ 未发现死链")

     print("✅ Scan complete.")
	# Detect global dead links (no auto fix)
	#
	# Core logic:
	# Walk every document under the search directories, extract its Markdown links, and treat a relative link as dead when its target path does not exist on disk.
	# If it is a dead link, print:
	# ❌ 目标文档 xxxx/xxxx.md
	# Broken link ${target_link}
	#
	# This version:
	# - skips links inside inline code and fenced code blocks
	# - ignores anchors (#xxx)
	# - checks .md and .mdx variants
	# - never modifies files
	# - counts and prints total broken links at the end

	import argparse
	import os
	import re
	import sys
	from urllib.parse import urlparse

	# Directories (relative to the working directory) whose documents are scanned.
	search_dirs = [
	    "docs",
	    "i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1",
	    "i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x",
	    "i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x",
	    "versioned_docs/version-2.1",
	    "versioned_docs/version-3.x",
	    "versioned_docs/version-4.x",
	    "community",
	]
	# Global count of dead links found so far.
	broken_count = 0

	# Markdown inline link: [text](target)
	_LINK_PATTERN = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
	# Fenced ``` blocks (possibly multi-line) and inline `code` spans
	_CODE_PATTERN = re.compile(r"(```.*?```|`[^`]*`)", re.DOTALL)
	# Optional quoted title ending a link body: (path "title") or (path 'title')
	_LINK_TITLE_PATTERN = re.compile(r"""\s+(?:"[^"]*"|'[^']*')$""")


	def _link_destination(raw_target):
	    """Return the destination of a link body without its title or ``<>`` wrapping."""
	    destination = _LINK_TITLE_PATTERN.sub("", raw_target)
	    if len(destination) >= 2 and destination.startswith("<") and destination.endswith(">"):
	        destination = destination[1:-1]
	    return destination


	def _link_target_exists(base_dir, link_path):
	    """Return True when ``link_path`` resolves to an existing path under ``base_dir``.

	    The path is tried as written and percent-decoded, each with no suffix,
	    ``.md`` and ``.mdx`` appended.
	    """
	    from urllib.parse import unquote  # local import: only this helper needs it

	    candidates = [link_path]
	    decoded = unquote(link_path)
	    if decoded != link_path:
	        candidates.append(decoded)

	    for candidate in candidates:
	        resolved = os.path.normpath(os.path.join(base_dir, candidate))
	        if any(os.path.exists(resolved + suffix) for suffix in ("", ".md", ".mdx")):
	            return True
	    return False


	def process_md_file(file_path):
	    """Report every dead relative link found in one Markdown/MDX file.

	    Links that start inside a fenced code block or an inline code span are
	    ignored, as are links with a URL scheme (http, https, mailto, ...),
	    absolute paths and pure ``#anchor`` links. A relative link is alive when
	    its path, with any ``#anchor`` removed, exists relative to the directory
	    of ``file_path``, either as written or with ``.md`` / ``.mdx`` appended.
	    A quoted link title, ``<...>`` wrapping and percent-encoding are
	    tolerated so that valid links are not reported as dead.

	    Every dead link is printed and added to the module-level ``broken_count``.

	    Args:
	        file_path: Path of the Markdown/MDX file to scan; it is read as UTF-8.
	    """
	    global broken_count

	    with open(file_path, "r", encoding="utf-8") as f:
	        content = f.read()

	    # Character ranges occupied by code; links starting inside them are skipped.
	    code_spans = [m.span() for m in _CODE_PATTERN.finditer(content)]
	    base_dir = os.path.dirname(file_path)

	    for match in _LINK_PATTERN.finditer(content):
	        link_target = match.group(2).strip()
	        position = match.start()
	        if any(start <= position < end for start, end in code_spans):
	            continue

	        destination = _link_destination(link_target)

	        # External links and absolute paths cannot be resolved on disk here.
	        if urlparse(destination).scheme or os.path.isabs(destination):
	            continue

	        # Drop the anchor; an empty remainder means a same-page link.
	        link_path = destination.split("#", 1)[0]
	        if not link_path:
	            continue

	        if not _link_target_exists(base_dir, link_path):
	            print(f"目标文档 {file_path}\nBroken link {link_target}\n")
	            broken_count += 1

	def travel(root_path: str):
	    """Walk ``root_path`` recursively and check every ``.md`` / ``.mdx`` file in it.

	    Args:
	        root_path: Directory to walk; each matching file is passed to
	            ``process_md_file``.
	    """
	    markdown_suffixes = (".md", ".mdx")
	    for current_dir, _subdirs, filenames in os.walk(root_path):
	        for filename in filenames:
	            if filename.endswith(markdown_suffixes):
	                process_md_file(os.path.join(current_dir, filename))

	if __name__ == "__main__":
	    # Script entry point: scan each configured directory, then print a summary.
	    parser = argparse.ArgumentParser(description="Detect broken md links (no auto fix)")
	    # --commit-id is accepted but currently has no effect on the scan.
	    parser.add_argument("--commit-id", type=str, help="Optional Git commit id (ignored for now)", default=None)
	    args = parser.parse_args()

	    print("🔍 Scanning for broken links...\n")

	    # Directories that do not exist in the working directory are skipped.
	    # The loop variable is not called `dir` so the builtin stays usable.
	    for search_dir in search_dirs:
	        if os.path.exists(search_dir):
	            travel(search_dir)

	    if broken_count > 0:
	        print(f"❗ 共发现 {broken_count} 个死链")
	    else:
	        print("✅ 未发现死链")

	    print("✅ Scan complete.")