| ''' |
| asfgenid |
| =================================== |
| Generates HeadingIDs, ElementID, and PermaLinks |
| First find all specified IDs and classes. Assure unique ID and permalink |
| Next find all headings missing IDs. Assure unique ID and permalink |
Generates a Table of Contents
| ''' |
| |
| # from __future__ import unicode_literals |
| |
| import sys |
| import traceback |
| import re |
| import unicodedata |
| |
| from bs4 import BeautifulSoup, Comment |
| |
| import pelican.contents |
| import pelican.plugins.signals |
| |
| ''' |
| Based on |
| https://github.com/waylan/Python-Markdown/blob/master/markdown/extensions/headerid.py |
| Which is BSD licensed, but is very much rewritten. |
| ''' |
| |
# Default plugin configuration.  Installed into Pelican's settings by
# init_default_config(); every key may be overridden by an ASF_GENID entry
# in the site's pelicanconf, and 'headings' may additionally be overridden
# per-page via 'asf_headings' metadata (see generate_id).
ASF_GENID = {
    'unsafe_tags': True,  # fix script, style, and iframe html that gfm filters as unsafe
    'metadata': True,  # {{ metadata }} inclusion of data in the html.
    'elements': True,  # {#id} and {.class} annotations.
    'headings': True,  # add slugified id to headings missing id. Can be overridden by page metadata.
    'headings_re': r'^h[1-6]',  # regex for which headings to check.
    'permalinks': True,  # add permalinks to elements and headings when id is added.
    'toc': True,  # check for [TOC] and add Table of Content if present.
    'toc_headers': r'h[1-6]',  # regex for which headings to include in the [TOC]
    'tables': True,  # add class="table" for tables missing class.
    'debug': False
}
| |
# Fixup tuples for HTML that GFM escapes into text.
# cmark-gfm's tagfilter extension renders <script>, <style>, and <iframe>
# as '&lt;script' etc.; these patterns turn them back into real tags.
# (As previously written each pattern was identical to its replacement —
# the '&lt;' entities had been lost — making the whole table a no-op.)
# Also fix up the [ and ] that download templates use for ezt.
FIXUP_UNSAFE = [
    (re.compile(r'&lt;script'), '<script'),
    (re.compile(r'&lt;/script'), '</script'),
    (re.compile(r'&lt;style'), '<style'),
    (re.compile(r'&lt;/style'), '</style'),
    (re.compile(r'&lt;iframe'), '<iframe'),
    (re.compile(r'&lt;/iframe'), '</iframe'),
    (re.compile(r'%5B'), '['),
    (re.compile(r'%5D'), ']'),
]
| |
# Find {{ metadata }} inclusions
METADATA_RE = re.compile(r'{{\s*(?P<meta>[-_:a-zA-Z0-9]+)\s*}}')

# Find {#id} or {.class} elementid annotations
ELEMENTID_RE = re.compile(r'(?:[ \t]*[{\[][ \t]*(?P<type>[#.])(?P<id>[-._:a-zA-Z0-9 ]+)[}\]])(\n|$)')

# ID duplicates match
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')

# For permalinks: a single pilcrow sign (U+00B6), written as an escape so a
# bad file encoding cannot garble it again.  (The previous mojibake'd
# two-character value made the ord() call below raise TypeError at import.)
LINK_CHAR = '\u00b6'

# strip permalink chars from headings for ToC
PARA_MAP = {
    ord(LINK_CHAR): None
}

# Find table tags - to check for ones without class attribute.
TABLE_RE = re.compile(r'^table')
| |
| |
# An item in a Table of Contents - from toc.py
class HtmlTreeNode(object):
    """One node of the ToC tree; renders itself and its subtree as nested html."""

    def __init__(self, parent, header, level, tag_id):
        # parent: enclosing HtmlTreeNode, or None for the root node
        # header: display text of the heading
        # level: heading tag name ('h1'..'h6'); the root uses 'h0'
        # tag_id: id attribute the ToC entry links to ('' for the root)
        self.children = []
        self.parent = parent
        self.header = header
        self.level = level
        self.tag_id = tag_id

    def add(self, new_header):
        """Insert a bs4 heading tag at the right depth relative to this node.

        Returns (node, new_header) where node is the position for the next
        insertion.  Levels are tag names compared as strings — 'h1' < 'h2'
        < ... < 'h6' — which orders h1-h6 correctly (lexicographic compare).
        """
        new_level = new_header.name
        new_string = new_header.string
        new_id = new_header.attrs.get('id')

        if not new_string:
            # heading contains nested markup: join all non-comment text fragments
            new_string = new_header.find_all(
                text=lambda t: not isinstance(t, Comment),
                recursive=True)
            new_string = ''.join(new_string)
        # strip any permalink pilcrow picked up from the heading text
        new_string = new_string.translate(PARA_MAP)

        if self.level < new_level:
            # deeper heading: becomes a child of this node
            new_node = HtmlTreeNode(self, new_string, new_level, new_id)
            self.children += [new_node]
            return new_node, new_header
        elif self.level == new_level:
            # same depth: attach as a sibling, under the shared parent
            new_node = HtmlTreeNode(self.parent, new_string, new_level, new_id)
            self.parent.children += [new_node]
            return new_node, new_header
        elif self.level > new_level:
            # shallower heading: climb until an ancestor can accept it
            return self.parent.add(new_header)

    def __str__(self):
        """Render as html: the root becomes <div id='toc'>, others <li> anchors."""
        ret = ''
        if self.parent:
            ret = "<a class='toc-href' href='#{0}' title='{1}'>{1}</a>".format(
                self.tag_id, self.header)

        if self.children:
            # build one '{}' placeholder per child, then format the children
            # in — each child is rendered recursively via this __str__
            ret += "<ul>{}</ul>".format('{}' * len(self.children)).format(
                *self.children)

        if self.parent:
            ret = "<li>{}</li>".format(ret)

        if not self.parent:
            ret = "<div id='toc'>{}</div>".format(ret)

        return ret
| |
| |
# assure configuration
def init_default_config(pelican):
    """Seed the ASF_GENID defaults into Pelican's global and instance settings.

    Note: the parameter deliberately shadows the module-level 'pelican'
    import; it is the Pelican instance delivered by the initialized signal.
    """
    from pelican.settings import DEFAULT_CONFIG

    DEFAULT_CONFIG.setdefault('ASF_GENID', ASF_GENID)
    if pelican:
        pelican.settings.setdefault('ASF_GENID', ASF_GENID)
| |
| |
# from Apache CMS markdown/extensions/headerid.py - slugify in the same way as the Apache CMS
def slugify(value, separator):
    """Make a string URL friendly: fold to ASCII, drop punctuation,
    lowercase, and collapse whitespace/separator runs into the separator."""
    ascii_text = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text).strip().lower()
    return re.sub(r'[%s\s]+' % separator, separator, cleaned)
| |
| |
# Ensure an id is unique in a set of ids. Append '_1', '_2'... if not
def unique(tag_id, ids):
    """Return tag_id made unique against ids (suffixing _1, _2, ...) and record it."""
    while tag_id in ids or not tag_id:
        print(f'WARNING: id="{tag_id}" is a duplicate')
        m = re.match(r'^(.*)_([0-9]+)$', tag_id)
        if m:
            # already numbered: bump the trailing counter
            tag_id = f'{m.group(1)}_{int(m.group(2)) + 1}'
        else:
            # first collision (or empty id): start the counter at 1
            tag_id = f'{tag_id}_1'
    ids.add(tag_id)
    return tag_id
| |
| |
# append a permalink
def permalink(soup, mod_element):
    """Append a pilcrow anchor to mod_element that links to its own id."""
    anchor = soup.new_tag('a', href='#' + mod_element['id'])
    anchor['class'] = 'headerlink'
    anchor['title'] = 'Permalink'
    anchor.string = LINK_CHAR
    mod_element.append(anchor)
| |
| |
# fixup cmark content - note that this may be too hungry. It may need to occur later and skipped in codeblock and pre tags.
def fixup_content(content):
    """Reverse the FIXUP_UNSAFE escapings in the raw html of content.

    Writes content._content back only if at least one pattern matched.
    """
    text = content._content
    changed = False
    for pattern, replacement in FIXUP_UNSAFE:
        if pattern.search(text):
            text = pattern.sub(replacement, text)
            changed = True
    if changed:
        content._content = text
| |
| |
# expand metadata found in {{ key }}
def expand_metadata(tag, metadata, debug):
    """Expand every {{ key }} reference in tag's text from the page metadata.

    Keys that cannot be resolved are left behind as '{key}' so the failure
    is visible in the generated page.  tag.string is replaced only when at
    least one {{ key }} pattern was processed.
    """
    this_string = str(tag.string)
    modified = False
    while True:
        m = METADATA_RE.search(this_string)
        if not m:
            break
        this_data = m.group(1).strip()
        format_string = '{{{0}}}'.format(this_data)
        try:
            new_string = format_string.format(**metadata)
            if debug:
                print(f'{{{{{m.group(1)}}}}} -> {new_string}')
        except Exception:
            # the data expression was not found in the metadata
            print(f'{{{{{m.group(1)}}}}} is not found')
            new_string = format_string
        # Replace only the first occurrence.  Use a callable replacement so
        # that backslashes or '\g<...>' sequences in the metadata value are
        # inserted literally; passing new_string directly to re.sub would
        # treat it as a replacement template and raise 'bad escape' (or
        # corrupt the output) for values containing backslashes.
        this_string = METADATA_RE.sub(lambda _m: new_string, this_string, count=1)
        modified = True
    if modified:
        tag.string.replace_with(this_string)
| |
| |
# do elementid transformation for {#id} and {.class} attribute annotations.
def elementid_transform(ids, soup, tag, permalinks, perma_set, debug):
    """Turn a trailing {#id} / {.class} text annotation into a real
    attribute on the annotated text's parent element."""
    target = tag.parent
    text = str(tag.string)
    if debug:
        print(f'name = {target.name}, string = {text}')
    # never rewrite annotations at document level or inside code/pre
    if target.name in ('[document]', 'code', 'pre'):
        return
    m = ELEMENTID_RE.search(tag.string)
    if not m:
        return
    # strip the annotation text; this truncates at the match and will
    # drop any additional annotations that followed it
    tag.string.replace_with(text[:m.start()])
    if m.group('type') == '#':
        # {#id} annotation: assign a unique id, optionally with permalink
        target['id'] = unique(m.group('id'), ids)
        if permalinks:
            permalink(soup, target)
            unique(target['id'], perma_set)
        if debug:
            print(f'# insertion {target}')
    else:
        # {.class} annotation (the regex only admits '#' and '.')
        target['class'] = m.group('id')
        if debug:
            print(f'Class {tag.name} : {target["class"]}')
| |
| |
# generate id for a heading
def headingid_transform(ids, soup, tag, permalinks, perma_set):
    """Give a heading that lacks an id a slugified one; optionally permalink it."""
    heading_text = tag.string
    if not heading_text:
        # no single string child: concatenate all non-comment text fragments
        fragments = tag.find_all(
            text=lambda t: not isinstance(t, Comment),
            recursive=True)
        heading_text = ''.join(fragments)

    # derive the id from the heading text and make it unique
    tag['id'] = unique(slugify(heading_text, '-'), ids)
    if permalinks:
        permalink(soup, tag)
        # inform if there is a duplicate permalink
        unique(tag['id'], perma_set)
| |
| |
# generate table of contents from headings after [TOC] content
def generate_toc(content, tags, title, toc_headers, debug):
    """Replace [TOC] paragraphs with a table of contents built from the
    headings that follow the last [TOC] marker."""
    heading_re = re.compile(toc_headers)
    tree = node = HtmlTreeNode(None, title, 'h0', '')
    # only headings after the final [TOC] are collected
    last_tag = tags[-1]
    have_headings = False
    for header in last_tag.findAllNext(heading_re):
        have_headings = True
        node, _ = node.add(header)

    soup_fragment = ''
    if have_headings:
        if debug:
            print(' ToC')
        # render the HtmlTreeNode tree and re-parse it as a soup fragment
        soup_fragment = BeautifulSoup('{}'.format(tree), 'html.parser')
        # make the ToC available to the theme's template
        content.toc = soup_fragment.decode(formatter='html')
    # the first [TOC] becomes the table of contents; later ones are emptied
    for tag in tags:
        tag.replaceWith(soup_fragment)
        soup_fragment = ''
| |
| |
# create breadcrumb html
def make_breadcrumbs(rel_source_path, title):
    """Build breadcrumb html from a page's relative source path.

    Each directory component becomes a capitalized link to its cumulative
    url; the final crumb is the page title linked to '#'.  The separator is
    a right guillemet '»' (previously a mojibake'd two-character sequence).
    """
    parts = rel_source_path.split('/')
    crumbs = ['<a href="/">Home</a> \u00bb ']
    url = '/'
    # every component except the filename itself becomes a crumb
    for part in parts[:-1]:
        url = f'{url}{part}/'
        crumbs.append(f'<a href="{url}">{part.capitalize()}</a> \u00bb ')
    crumbs.append(f'<a href="#">{title}</a>')
    return ''.join(crumbs)
| |
| |
# add the asfdata metadata into GFM content.
def add_data(content):
    """Mix in ASF data as metadata (skipped when the asf reader already did)."""
    # the 'asf' reader injects this metadata itself during asfreader plugin processing
    if content.metadata.get('reader') == 'asf':
        return
    asf_metadata = content.settings.get('ASF_DATA', {}).get('metadata')
    if asf_metadata:
        content.metadata.update(asf_metadata)
| |
| |
# main worker transforming the html
def generate_id(content):
    """Transform a content object's html in place: fix unsafe tags, expand
    {{ metadata }}, apply {#id}/{.class} annotations, give headings ids and
    permalinks, add table classes, and expand [TOC] (steps 1-10 below).

    Static content is left untouched.  Configuration comes from the
    ASF_GENID settings dict; 'headings' may be overridden per-page via
    'asf_headings' metadata.
    """
    if isinstance(content, pelican.contents.Static):
        return

    # get plugin settings
    asf_genid = content.settings['ASF_GENID']
    # asf_headings setting may be overridden; metadata values are strings,
    # so the default is str()-ified and compared against 'True' below
    asf_headings = content.metadata.get('asf_headings', str(asf_genid['headings']))

    # show active plugins
    if asf_genid['debug']:
        print('asfgenid:\nshow plugins in case one is processing before this one')
        for name in content.settings['PLUGINS']:
            print(f'plugin: {name}')

    # track the id tags assigned so far (for uniqueness)
    ids = set()
    # track permalinks (duplicate detection only)
    permalinks = set()

    # step 1 - fixup html that cmark marks unsafe - move to later?
    if asf_genid['unsafe_tags']:
        fixup_content(content)

    # step 2 - prepare for genid processes
    # parse html content into BeautifulSoup4
    soup = BeautifulSoup(content._content, 'html.parser')
    # page title
    title = content.metadata.get('title', 'Title')
    # assure relative source path is in the metadata
    content.metadata['relative_source_path'] = rel_source_path = content.relative_source_path
    # create breadcrumb html
    content.metadata['breadcrumbs'] = breadcrumbs = make_breadcrumbs(rel_source_path, title)
    # display output path and title
    print(f'{content.relative_source_path} - {title}')
    # if debug display breadcrumb html
    if asf_genid['debug']:
        print(f' {breadcrumbs}')
    # enhance metadata if done by asfreader
    add_data(content)

    # step 3 - metadata expansion
    if asf_genid['metadata']:
        if asf_genid['debug']:
            print(f'metadata expansion: {content.relative_source_path}')
        for tag in soup.findAll(string=METADATA_RE):
            expand_metadata(tag, content.metadata, asf_genid['debug'])

    # step 4 - find all id attributes already present
    for tag in soup.findAll(id=True):
        unique(tag['id'], ids)
        # don't change existing ids

    # step 5 - find all {#id} and {.class} text and assign attributes
    if asf_genid['elements']:
        if asf_genid['debug']:
            print(f'elementid: {content.relative_source_path}')
        for tag in soup.findAll(string=ELEMENTID_RE):
            elementid_transform(ids, soup, tag, asf_genid['permalinks'], permalinks, asf_genid['debug'])

    # step 6 - find all headings w/o ids already present or assigned with {#id} text
    if asf_headings == 'True':
        if asf_genid['debug']:
            print(f'headings: {content.relative_source_path}')
        # Find heading tags (id=False restricts to headings without an id)
        HEADING_RE = re.compile(asf_genid['headings_re'])
        for tag in soup.findAll(HEADING_RE, id=False):
            headingid_transform(ids, soup, tag, asf_genid['permalinks'], permalinks)

    # step 7 - find all tables without class
    if asf_genid['tables']:
        if asf_genid['debug']:
            print(f'tables: {content.relative_source_path}')
        # NOTE(review): '_class=False' filters on an attribute literally
        # named '_class'; bs4's filter for "no class attribute" is
        # 'class_=False' — confirm whether this matches the intent.
        for tag in soup.findAll(TABLE_RE, _class=False):
            tag['class'] = 'table'

    # step 8 - find TOC tag and generate Table of Contents
    if asf_genid['toc']:
        tags = soup('p', text='[TOC]')
        if tags:
            generate_toc(content, tags, title, asf_genid['toc_headers'], asf_genid['debug'])

    # step 9 - reset the html content from the modified soup
    content._content = soup.decode(formatter='html')

    # step 10 - output all of the permalinks created
    if asf_genid['debug']:
        for tag in permalinks:
            print(f' #{tag}')
| |
| |
def tb_connect(pel_ob):
    """Print any exception, before Pelican chews it into nothingness."""
    try:
        generate_id(pel_ob)
    except Exception:
        err = sys.stderr
        print('-----', file=err)
        print(f'FATAL: {pel_ob.relative_source_path}', file=err)
        traceback.print_exc()
        # quit immediately: continuing after an error here could erase the site
        sys.exit(4)
| |
| |
def register():
    """Pelican plugin entry point: seed default settings at initialization."""
    pelican.plugins.signals.initialized.connect(init_default_config)


# NOTE(review): connected at import time rather than inside register() —
# the html transform runs for every content object as it is initialized.
pelican.plugins.signals.content_object_init.connect(tb_connect)