blob: 1a663f80f9d988ce34159e2386f7b4cc8e3609b7 [file] [log] [blame]
Generates heading IDs, element IDs, and permalinks.
First, find all specified IDs and classes; ensure each ID and permalink is unique.
Next, find all headings missing IDs; ensure each ID and permalink is unique.
Finally, generate a Table of Contents.
# from __future__ import unicode_literals
import html
import re
import sys
import traceback
import unicodedata

from bs4 import BeautifulSoup, Comment
import pelican.contents
import pelican.plugins.signals
Based on
Which is BSD licensed, but is very much rewritten.
# Default plugin settings, registered under the 'ASF_GENID' settings key.
ASF_GENID = {
    'unsafe_tags': True,  # fix script, style, and iframe html that gfm filters as unsafe
    'metadata': True,  # {{ metadata }} inclusion of data in the html.
    'elements': True,  # {#id} and {.class} annotations.
    'headings': True,  # add slugified id to headings missing id. Can be overridden by page metadata.
    'headings_re': r'^h[1-6]',  # regex for which headings to check.
    'permalinks': True,  # add permalinks to elements and headings when id is added.
    'toc': True,  # check for [TOC] and add Table of Content if present.
    'toc_headers': r'h[1-6]',  # regex for which headings to include in the [TOC]
    'tables': True,  # add class="table" for tables missing class.
    'debug': False,  # print extra diagnostics while processing
}
# Fixup tuples for HTML that GFM makes into text.
# Each entry is (compiled pattern, replacement text).
FIXUP_UNSAFE = [
    (re.compile(r'&lt;script'), '<script'),
    (re.compile(r'&lt;/script'), '</script'),
    (re.compile(r'&lt;style'), '<style'),
    (re.compile(r'&lt;/style'), '</style'),
    (re.compile(r'&lt;iframe'), '<iframe'),
    (re.compile(r'&lt;/iframe'), '</iframe'),
]
# Find {{ metadata }} inclusions
METADATA_RE = re.compile(r'{{\s*(?P<meta>[-_:a-zA-Z0-9]+)\s*}}')

# Find {#id} or {.class} elementid annotations
ELEMENTID_RE = re.compile(r'(?:[ \t]*[{\[][ \t]*(?P<type>[#.])(?P<id>[-._:a-zA-Z0-9 ]+)[}\]])(\n|$)')

# ID duplicates match
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')

# For permalinks
# NOTE(review): the LINK_CHAR definition was missing from the damaged source.
# The pilcrow is assumed from the PARA_MAP name below - confirm against upstream.
LINK_CHAR = '\u00b6'

# strip permalink chars from headings for ToC
PARA_MAP = {
    ord(LINK_CHAR): None
}

# Find table tags - to check for ones without class attribute.
TABLE_RE = re.compile(r'^table')
# An item in a Table of Contents - from
class HtmlTreeNode(object):
    """One entry in the Table of Contents tree.

    The root node (no parent) renders as the ``<div id='toc'>`` wrapper;
    every other node renders as an ``<li>`` holding a link to its heading,
    followed by a nested ``<ul>`` of its children.
    """

    def __init__(self, parent, header, level, id):
        """Create a node.

        parent -- the enclosing HtmlTreeNode, or None for the root
        header -- the text displayed for this entry
        level  -- heading level as a tag name ('h1'..'h6'); compared as strings
        id     -- id attribute of the heading, used as the link target
        """
        self.children = []
        self.parent = parent
        self.header = header
        self.level = level
        self.id = id

    def add(self, new_header):
        """Insert the heading tag *new_header* relative to this node.

        A deeper heading becomes a child, an equal one a sibling, and a
        shallower one is handed to the parent.  Returns the tuple
        ``(new_node, new_header)``.
        """
        # NOTE(review): the right-hand side was missing in the damaged source;
        # the tag name is assumed because levels are compared with 'h0'-style strings.
        new_level = new_header.name
        new_string = new_header.string
        new_id = new_header.attrs.get('id')

        if not new_string:
            # no single string child: join every text node except comments
            new_string = new_header.find_all(
                string=lambda t: not isinstance(t, Comment),
                recursive=True)
            new_string = ''.join(new_string)
        # drop the permalink character from the displayed text
        new_string = new_string.translate(PARA_MAP)

        if self.level < new_level:
            new_node = HtmlTreeNode(self, new_string, new_level, new_id)
            self.children += [new_node]
            return new_node, new_header
        elif self.level == new_level:
            new_node = HtmlTreeNode(self.parent, new_string, new_level, new_id)
            self.parent.children += [new_node]
            return new_node, new_header
        elif self.level > new_level:
            return self.parent.add(new_header)

    def __str__(self):
        """Render this node and its descendants as an HTML fragment."""
        ret = ''
        if self.parent is not None:
            # Escape the values: they land inside single-quoted attributes, so a
            # heading containing an apostrophe would otherwise end the attribute early.
            safe_id = html.escape(str(self.id), quote=True)
            safe_header = html.escape(str(self.header), quote=True)
            ret = "<a class='toc-href' href='#{0}' title='{1}'>{1}</a>".format(safe_id, safe_header)

        if self.children:
            ret += "<ul>{}</ul>".format(''.join(str(child) for child in self.children))

        if self.parent is not None:
            ret = "<li>{}</li>".format(ret)
        else:
            ret = "<div id='toc'>{}</div>".format(ret)
        return ret
# assure configuration
def init_default_config(pelican):
    """Register the default ASF_GENID settings on the Pelican object.

    ``setdefault`` is used, so an 'ASF_GENID' entry already present in
    ``pelican.settings`` is left untouched.
    """
    # NOTE(review): DEFAULT_CONFIG is imported but unused in the visible code;
    # a statement using it may have been lost - confirm against upstream.
    from pelican.settings import DEFAULT_CONFIG

    pelican.settings.setdefault('ASF_GENID', ASF_GENID)
# from Apache CMS markdown/extensions/ - slugify in the same way as the Apache CMS
def slugify(value, separator):
    """ Slugify a string, to make it URL friendly.

    value     -- text to convert
    separator -- string that replaces each run of whitespace/separator characters

    Non-ASCII characters are decomposed (NFKD) and anything that does not
    survive ASCII encoding is dropped; remaining characters other than word
    characters, whitespace and '-' are removed; the result is stripped and
    lower-cased.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    value = re.sub(r'[^\w\s-]', '', value.decode('ascii')).strip().lower()
    # Escape the separator: unescaped, a value such as '^' or ']' would invert or
    # break the character class.  A callable replacement keeps it literal as well.
    return re.sub(r'[%s\s]+' % re.escape(separator), lambda _m: separator, value)
# Ensure an id is unique in a set of ids. Append '_1', '_2'... if not
def unique(id, ids):
    """Return *id*, or a numbered variant of it that is not yet in *ids*.

    id  -- the candidate id (may be empty)
    ids -- set of ids already taken; the returned id is added to it

    While the candidate is empty or already taken, a message is printed and
    the candidate gets '_1' appended, or its existing '_N' suffix incremented.
    """
    while id in ids or not id:
        m = IDCOUNT_RE.match(id)
        print(f'id="{id}" is a duplicate')
        if m:
            # already numbered: bump the counter
            id = '%s_%d' % (m.group(1), int(m.group(2)) + 1)
        else:
            id = '%s_%d' % (id, 1)
    # NOTE(review): this statement was missing from the damaged source; callers
    # discard the return value and rely on the set being filled - confirm.
    ids.add(id)
    return id
# append a permalink
def permalink(soup, mod_element):
    """Add a 'headerlink' anchor that points at *mod_element*'s own id.

    soup        -- object whose ``new_tag`` is used to create the anchor
    mod_element -- element that already carries an 'id' attribute
    """
    new_tag = soup.new_tag('a', href='#' + mod_element['id'])
    new_tag['class'] = 'headerlink'
    new_tag['title'] = 'Permalink'
    new_tag.string = LINK_CHAR
    # NOTE(review): the attaching statement was missing from the damaged source;
    # appending to the element is assumed from the "append a permalink" comment.
    mod_element.append(new_tag)
# fixup cmark content - note that this may be too hungry. It may need to occur later and skipped in codeblock and pre tags.
def fixup_content(content):
    """Apply every FIXUP_UNSAFE substitution to ``content._content``.

    ``content._content`` is only reassigned when at least one pattern matched.
    """
    text = content._content
    modified = False
    # Find messed up html
    for regex, replace in FIXUP_UNSAFE:
        # subn reports the number of replacements, so no separate search is needed
        text, count = regex.subn(replace, text)
        if count:
            modified = True
    if modified:
        content._content = text
# expand metadata found in {{ key }}
def expand_metadata(tag, metadata):
    """Expand each ``{{ key }}`` in the tag's string from *metadata*.

    tag      -- node whose ``.string`` contains METADATA_RE matches
    metadata -- mapping used as keyword arguments for ``str.format``

    A key that cannot be formatted is reported and left as ``{key}``.
    """
    this_string = str(tag.string)
    modified = False
    m = METADATA_RE.search(this_string)
    while m:
        key = m.group('meta')
        format_string = '{{{0}}}'.format(key)
        try:
            new_string = format_string.format(**metadata)
            print(f'{{{{{key}}}}} -> {new_string}')
        except Exception:
            # the data expression was not found
            print(f'{{{{{key}}}}} is not found')
            new_string = format_string
        # Replace exactly this match by slicing: the value is inserted literally,
        # whereas re.sub would interpret backslashes in it as template escapes.
        this_string = this_string[:m.start()] + new_string + this_string[m.end():]
        modified = True
        m = METADATA_RE.search(this_string)
    if modified:
        # NOTE(review): the write-back statement was missing from the damaged
        # source; replacing the tag's string is assumed - confirm against upstream.
        tag.string.replace_with(this_string)
# do elementid transformation for {#id} and {.class} attribute annotations.
def elementid_transform(ids, soup, tag, permalinks, perma_set, debug):
    """Turn a trailing ``{#id}`` or ``{.class}`` annotation into an attribute.

    ids        -- set of ids in use (passed to ``unique``)
    soup       -- the document, used to create permalink tags
    tag        -- text node containing the annotation
    permalinks -- when true, add a permalink for every id assigned
    perma_set  -- set of permalink ids (passed to ``unique``)
    debug      -- when true, print diagnostics

    The attribute is set on the text node's parent element.  Text inside the
    document root, ``code`` and ``pre`` is left alone.
    """
    # NOTE(review): several expressions in this function were truncated in the
    # damaged source (the names printed, the regex call, the text replacement);
    # they are reconstructed from context - confirm against upstream.
    tagnav = tag.parent
    this_string = str(tag.string)
    if debug:
        print(f'name = {tagnav.name}, string = {this_string}')
    if tagnav.name in ['[document]', 'code', 'pre']:
        return

    m = ELEMENTID_RE.search(this_string)
    if not m:
        return

    # this replacement could be better it truncates and likely drops additional annotations
    tag.string.replace_with(this_string[:m.start()])
    if m.group('type') == '#':
        # id attribute annotation
        tagnav['id'] = unique(m.group('id'), ids)
        if permalinks:
            permalink(soup, tagnav)
            unique(tagnav['id'], perma_set)
        if debug:
            print(f'# insertion {tagnav}')
    else:
        # class attribute annotation (regex only recognizes the two types)
        tagnav['class'] = m.group('id')
        if debug:
            print(f'Class {tagnav.name} : {tagnav["class"]}')
# generate id for a heading
def headingid_transform(ids, soup, tag, permalinks, perma_set):
    """Give a heading an id derived from its text.

    ids        -- set of ids in use (passed to ``unique``)
    soup       -- the document, used to create permalink tags
    tag        -- heading element without an id
    permalinks -- when true, also add a permalink to the heading
    perma_set  -- set of permalink ids (passed to ``unique``)
    """
    new_string = tag.string
    if not new_string:
        # roll up strings if no immediate string
        new_string = tag.find_all(
            string=lambda t: not isinstance(t, Comment),
            recursive=True)
        new_string = ''.join(new_string)

    # don't have an id create it from text
    new_id = slugify(new_string, '-')
    tag['id'] = unique(new_id, ids)
    if permalinks:
        permalink(soup, tag)
        # inform if there is a duplicate permalink
        unique(tag['id'], perma_set)
# generate table of contents from headings after [TOC] content
def generate_toc(content, tags, title, toc_headers):
    """Build the Table of Contents and put it in place of the first [TOC].

    content     -- page object; receives the rendered ToC as ``content.toc``
    tags        -- non-empty list of the [TOC] marker elements
    title       -- page title, used as the header of the root node
    toc_headers -- regex selecting which heading tags go into the ToC

    Only headings after the last [TOC] marker are included.  Any further
    [TOC] markers are replaced with nothing.
    """
    settoc = False
    tree = node = HtmlTreeNode(None, title, 'h0', '')
    # find the last [TOC]
    taglast = tags[-1]
    # find all headings after the final [TOC]
    heading_re = re.compile(toc_headers)
    for header in taglast.find_all_next(heading_re):
        # we have heading content for the ToC
        settoc = True
        # add the heading.
        node, _new_header = node.add(header)

    # convert the ToC to Beautiful Soup
    tree_soup = ''
    if settoc:
        print(' ToC')
        # convert the HtmlTreeNode into Beautiful Soup
        tree_string = '{}'.format(tree)
        tree_soup = BeautifulSoup(tree_string, 'html.parser')
        # Make the ToC available to the theme's template
        content.toc = tree_soup.decode(formatter='html')

    # replace the first [TOC] with the generated table of contents
    for tag in tags:
        # NOTE(review): the replacement statement was missing from the damaged
        # source; reconstructed from the surrounding comments - confirm.
        tag.replace_with(tree_soup)
        # replace additional [TOC] with nothing
        tree_soup = ''
# create breadcrumb html
def make_breadcrumbs(rel_source_path, title):
    """Return breadcrumb HTML for a page.

    rel_source_path -- '/'-separated source path; the last part (the file
                       name) is not turned into a crumb
    title           -- text for the final crumb, which links to '#'

    The trail starts with a "Home" link to '/', followed by one capitalized
    link per directory, each pointing at the cumulative path so far.
    """
    separator = '&nbsp;&raquo&nbsp;'
    crumbs = ['<a href="/">Home</a>' + separator]
    url = '/'
    # don't process the filename part
    for part in rel_source_path.split('/')[:-1]:
        url = f"{url}{part}/"
        crumbs.append(f'<a href="{url}">{part.capitalize()}</a>{separator}')
    crumbs.append(f'<a href="#">{title}</a>')
    return ''.join(crumbs)
# add the asfdata metadata into GFM content.
def add_data(content):
    """ Mix in ASF data as metadata """
    # if the reader is 'asf' then the asf metadata is already in place during asfreader plugin.
    if content.metadata.get('reader') != 'asf':
        asf_metadata = content.settings.get('ASF_DATA', {}).get('metadata')
        if asf_metadata:
            # NOTE(review): the body of this branch was missing from the damaged
            # source; merging into content.metadata is assumed from the docstring.
            content.metadata.update(asf_metadata)
# main worker transforming the html
def generate_id(content):
    """Transform one page's HTML: ids, classes, permalinks, tables and ToC.

    content -- page object; its ``_content``, ``metadata`` and ``toc`` are
               updated in place.  Static content is skipped.

    The numbered steps below run in order, each gated by its entry in
    ``content.settings['ASF_GENID']``.
    """
    if isinstance(content, pelican.contents.Static):
        # NOTE(review): the body of this branch was missing from the damaged
        # source; an early return is assumed - confirm against upstream.
        return

    # get plugin settings
    asf_genid = content.settings['ASF_GENID']
    debug = asf_genid['debug']
    # asf_headings setting may be overridden
    asf_headings = content.metadata.get('asf_headings', str(asf_genid['headings']))

    # show active plugins
    if debug:
        print('asfgenid:\nshow plugins in case one is processing before this one')
        for name in content.settings['PLUGINS']:
            print(f'plugin: {name}')

    # track the id tags
    ids = set()
    # track permalinks
    permalinks = set()

    # step 1 - fixup html that cmark marks unsafe - move to later?
    if asf_genid['unsafe_tags']:
        # NOTE(review): call reconstructed; the statement was missing from the source.
        fixup_content(content)

    # step 2 - prepare for genid processes
    # parse html content into BeautifulSoup4
    soup = BeautifulSoup(content._content, 'html.parser')
    # page title
    title = content.metadata.get('title', 'Title')
    # assure relative source path is in the metadata
    content.metadata['relative_source_path'] = rel_source_path = content.relative_source_path
    # create breadcrumb html
    content.metadata['breadcrumbs'] = breadcrumbs = make_breadcrumbs(rel_source_path, title)
    # display output path and title
    print(f'{content.relative_source_path} - {title}')
    # if debug display breadcrumb html
    if debug:
        print(f' {breadcrumbs}')
    # enhance metadata if done by asfreader
    # NOTE(review): call reconstructed; the statement was missing from the source.
    add_data(content)

    # step 3 - metadata expansion
    if asf_genid['metadata']:
        if debug:
            print(f'metadata expansion: {content.relative_source_path}')
        for tag in soup.find_all(string=METADATA_RE):
            expand_metadata(tag, content.metadata)

    # step 4 - find all id attributes already present
    for tag in soup.find_all(id=True):
        unique(tag['id'], ids)
        # don't change existing ids

    # step 5 - find all {#id} and {.class} text and assign attributes
    if asf_genid['elements']:
        if debug:
            print(f'elementid: {content.relative_source_path}')
        for tag in soup.find_all(string=ELEMENTID_RE):
            elementid_transform(ids, soup, tag, asf_genid['permalinks'], permalinks, debug)

    # step 6 - find all headings w/o ids already present or assigned with {#id} text
    # (the value is compared as a string because page metadata supplies strings)
    if asf_headings == 'True':
        if debug:
            print(f'headings: {content.relative_source_path}')
        # Find heading tags
        heading_re = re.compile(asf_genid['headings_re'])
        for tag in soup.find_all(heading_re, id=False):
            headingid_transform(ids, soup, tag, asf_genid['permalinks'], permalinks)

    # step 7 - find all tables without class
    if asf_genid['tables']:
        if debug:
            print(f'tables: {content.relative_source_path}')
        # class_ is the keyword for the class attribute; the former _class filtered
        # on a non-existent "_class" attribute, so tables that already had a class
        # were matched too and had it overwritten.
        for tag in soup.find_all(TABLE_RE, class_=False):
            tag['class'] = 'table'

    # step 8 - find TOC tag and generate Table of Contents
    if asf_genid['toc']:
        tags = soup('p', string='[TOC]')
        if tags:
            generate_toc(content, tags, title, asf_genid['toc_headers'])

    # step 9 - reset the html content
    content._content = soup.decode(formatter='html')

    # step 10 - output all of the permalinks created
    for tag in permalinks:
        print(f' #{tag}')
def tb_connect(pel_ob):
    """Print any exception, before Pelican chews it into nothingness."""
    # NOTE(review): the try body, the traceback print and the exit call were
    # missing from the damaged source.  They are reconstructed from the
    # docstring, the trailing comment and the file's sys/traceback imports;
    # the exit status in particular is a guess - confirm against upstream.
    try:
        generate_id(pel_ob)
    except Exception:
        print('-----', file=sys.stderr)
        print('FATAL: %s' % (pel_ob.relative_source_path), file=sys.stderr)
        traceback.print_exc()
        # if we have errors in this module then we want to quit to avoid erasing the site
        sys.exit(4)
def register():