docs/_bin/broken-link-check.py - druid - Git at Google

 #!/usr/bin/env python3

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import json
 import os
 import re
 import sys

 #
 # Checks for broken redirects (in _redirects.json) and links from markdown files to
 # nonexistent pages. Does _not_ check for links to anchors that don't exist.
 #

 # Targets to these 'well known' pages are OK.
 WELL_KNOWN_PAGES = ["/libraries.html", "/downloads.html", "/community/", "/thanks.html"]

 def normalize_link(source, target):
   """Resolve a link target relative to the file that contains the link.

   `target` is joined onto the directory portion of `source`, and the result
   is collapsed with os.path.normpath so that '.' and '..' segments are
   resolved. An absolute `target` replaces the directory entirely, as
   os.path.join does.
   """
   containing_dir, _ = os.path.split(source)
   return os.path.normpath(os.path.join(containing_dir, target))

 def verify_redirects(docs_directory, redirect_json):
   """Check that every redirect listed in `redirect_json` has an existing target.

   The JSON file is read as a sequence of objects with "source" and "target"
   keys. Targets listed in WELL_KNOWN_PAGES are accepted without a filesystem
   check. Every other target is resolved relative to its source, has a
   trailing '.html' (plus optional '#anchor') swapped for '.md', and is then
   looked up under `docs_directory`.

   Each missing target is reported on stderr. Returns True when all targets
   were found, False otherwise.
   """
   with open(redirect_json, 'r') as handle:
     entries = json.loads(handle.read())

   all_found = True
   for entry in entries:
     destination = entry["target"]
     if destination in WELL_KNOWN_PAGES:
       continue

     # Map the published .html page (and any named anchor) back to its .md source.
     resolved = normalize_link(entry["source"], destination)
     markdown_path = re.sub(r'\.html(#.*)?$', '.md', resolved)
     if os.path.exists(os.path.join(docs_directory, markdown_path)):
       continue

     sys.stderr.write('Redirect [' + entry["source"] + '] target does not exist: ' + destination + "\n")
     all_found = False

   return all_found

 def verify_markdown(docs_directory):
   """Check that links in the markdown files under `docs_directory` resolve.

   Walks `docs_directory` for '*.md' files and scans each one for inline
   markdown links of the form [text](target) or [text](target "title").
   Targets in WELL_KNOWN_PAGES are accepted as-is. Every other target has a
   leading '/docs/VERSION/' and any '#anchor' removed, a trailing '.html'
   turned into '.md', and a trailing '/' turned into '/index.md'. Targets
   that end up empty or that start with http:// or https:// are not checked;
   the rest are resolved relative to the linking file and must exist on disk.

   Each missing target is reported on stderr. Returns True when every checked
   link resolved, False otherwise.
   """
   markdown_paths = [
     os.path.join(folder, filename)
     for folder, _subdirs, filenames in os.walk(docs_directory)
     for filename in filenames
     if filename.endswith('.md')
   ]

   # Applied in order to turn a published link into a path to its markdown source.
   rewrites = (
     (r'^/docs/VERSION/', ''),
     (r'#.*$', ''),
     (r'\.html$', '.md'),
     (r'/$', '/index.md'),
   )

   all_found = True
   for page in markdown_paths:
     with open(page, 'r') as handle:
       text = handle.read()

     for link in re.finditer(r'\[([^\[]*?)\]\((.*?)(?: \"[^\"]+\")?\)', text):
       raw_target = link.group(2)

       if raw_target in WELL_KNOWN_PAGES:
         continue

       # Hack to support the fact that rule examples in druid-kerberos docs look sort of like markdown links.
       if page.endswith("/druid-kerberos.md") and raw_target in ['regexp', 'druid@EXAMPLE.COM']:
         continue

       cleaned = raw_target
       for pattern, replacement in rewrites:
         cleaned = re.sub(pattern, replacement, cleaned)

       # Pure-anchor links (now empty) and external URLs are not checked.
       if not cleaned or cleaned.startswith('http://') or cleaned.startswith('https://'):
         continue

       if not os.path.exists(normalize_link(page, cleaned)):
         sys.stderr.write('Page     [' + page + '] target does not exist: ' + raw_target + "\n")
         all_found = False

   return all_found

 def main():
   """Command-line entry point: check redirects and markdown links.

   Expects exactly two arguments: the docs directory and the path to the
   redirects JSON file. Prints a usage message to stderr and exits with
   status 1 when the argument count is wrong. Exits with status 1 when either
   check reports a broken redirect or link; returns normally otherwise.
   """
   if len(sys.argv) != 3:
     sys.stderr.write('usage: program <docs dir> <redirect.json>\n')
     sys.exit(1)

   docs_directory, redirect_json = sys.argv[1], sys.argv[2]

   # Run both checks unconditionally so a single run reports every problem,
   # rather than stopping after the first failing check.
   redirects_ok = verify_redirects(docs_directory, redirect_json)
   markdown_ok = verify_markdown(docs_directory)
   if not (redirects_ok and markdown_ok):
     sys.exit(1)

 # Only run the checks when executed as a script, so that importing this module
 # (e.g. from a test) does not parse sys.argv or call sys.exit().
 if __name__ == "__main__":
   main()
	#!/usr/bin/env python3

	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import json
	import os
	import re
	import sys

	#
	# Checks for broken redirects (in _redirects.json) and links from markdown files to
	# nonexistent pages. Does _not_ check for links to anchors that don't exist.
	#

	# Targets to these 'well known' pages are OK.
	WELL_KNOWN_PAGES = ["/libraries.html", "/downloads.html", "/community/", "/thanks.html"]

	def normalize_link(source, target):
	  """Resolve `target` relative to the directory that contains `source`.

	  The joined path is collapsed with os.path.normpath, so '.' and '..'
	  segments are resolved before the path is returned.
	  """
	  dirname = os.path.dirname(source)
	  normalized = os.path.normpath(os.path.join(dirname, target))
	  return normalized

	def verify_redirects(docs_directory, redirect_json):
	  """Check that every redirect in `redirect_json` points at an existing page.

	  Each entry's "target" is accepted outright when it is in WELL_KNOWN_PAGES.
	  Otherwise it is resolved relative to the entry's "source", its '.html'
	  suffix (and optional '#anchor') is replaced with '.md', and the result must
	  exist under `docs_directory`.

	  Missing targets are reported on stderr. Returns True when all targets
	  exist, False otherwise.
	  """
	  ok = True

	  with open(redirect_json, 'r') as f:
	    redirects = json.loads(f.read())

	  for redirect in redirects:
	    if redirect["target"] in WELL_KNOWN_PAGES:
	      continue

	    # Replace .html and named anchors with .md, and check the file on the filesystem.
	    target = re.sub(r'\.html(#.*)?$', '.md', normalize_link(redirect["source"], redirect["target"]))
	    if not os.path.exists(os.path.join(docs_directory, target)):
	      sys.stderr.write('Redirect [' + redirect["source"] + '] target does not exist: ' + redirect["target"] + "\n")
	      ok = False

	  return ok

	def verify_markdown(docs_directory):
	  """Check that links in the markdown files under `docs_directory` resolve.

	  Scans every '*.md' file for inline links of the form [text](target) or
	  [text](target "title"). Targets in WELL_KNOWN_PAGES are accepted as-is.
	  Other targets have a leading '/docs/VERSION/' and any '#anchor' stripped,
	  '.html' mapped to '.md' and a trailing '/' mapped to '/index.md'; targets
	  that are then empty or start with http:// or https:// are not checked. The
	  remaining targets are resolved relative to the linking file and must exist.

	  Missing targets are reported on stderr. Returns True when every checked
	  link resolved, False otherwise.
	  """
	  ok = True

	  # Get list of markdown files.
	  markdowns = []
	  for root, dirs, files in os.walk(docs_directory):
	    for name in files:
	      if name.endswith('.md'):
	        markdowns.append(os.path.join(root, name))

	  for markdown_file in markdowns:
	    with open(markdown_file, 'r') as f:
	      content = f.read()

	    # Non-greedy '*?' quantifiers are required here: without them the pattern
	    # only matches links whose text and target are at most one character long.
	    for m in re.finditer(r'\[([^\[]*?)\]\((.*?)(?: \"[^\"]+\")?\)', content):
	      target = m.group(2)

	      if target in WELL_KNOWN_PAGES:
	        continue

	      if markdown_file.endswith("/druid-kerberos.md") and target in ['regexp', 'druid@EXAMPLE.COM']:
	        # Hack to support the fact that rule examples in druid-kerberos docs look sort of like markdown links.
	        continue

	      target = re.sub(r'^/docs/VERSION/', '', target)
	      target = re.sub(r'#.*$', '', target)
	      target = re.sub(r'\.html$', '.md', target)
	      target = re.sub(r'/$', '/index.md', target)
	      if target and not (target.startswith('http://') or target.startswith('https://')):
	        target_normalized = normalize_link(markdown_file, target)

	        if not os.path.exists(target_normalized):
	          sys.stderr.write('Page [' + markdown_file + '] target does not exist: ' + m.group(2) + "\n")
	          ok = False

	  return ok

	def main():
	  """Command-line entry point: check redirects and markdown links.

	  Expects exactly two arguments: the docs directory and the redirects JSON
	  file. Writes a usage message to stderr and exits with status 1 when the
	  argument count is wrong, and exits with status 1 when either check fails.
	  """
	  if len(sys.argv) != 3:
	    sys.stderr.write('usage: program <docs dir> <redirect.json>\n')
	    sys.exit(1)

	  # Both checks always run; 'and ok' is placed last so that a redirect
	  # failure does not short-circuit the markdown check.
	  ok = verify_redirects(sys.argv[1], sys.argv[2])
	  ok = verify_markdown(sys.argv[1]) and ok
	  if not ok:
	    sys.exit(1)

	# Only run the checks when executed as a script, so that importing this
	# module does not parse sys.argv or call sys.exit().
	if __name__ == "__main__":
	  main()