# blob: 02be2fb6cb3fafa3f76c5ba88c0622287cd3b2af [file] [log] [blame]
#########################################################################
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#########################################################################
## Markdown has these types of paragraph: heading, text, list item (bullet or numbered),
## codeblock, table, and block quote.
##
## This script fixes up differences in Markdown dialect, between Github-MD and doxia-markdown.
## Specifically, it fixes these problems:
## 1. In Github-MD, bullets and codeblock starts are self-delimiting. In doxia-markdown, they
## must be separated from preceding text or (in the case of codeblocks) bullets, by a blank line.
## Failure to do so causes the bullet or codeblock delimiter to be interpreted as ordinary text,
## and the content gets munched into the preceding paragraph. The codeblock delimiter (```) as text
## gets interpreted as a codephrase delimiter (`) plus a preceding or following empty codephrase (``).
## 2. Github-MD is liberal in regard to what an 'indent' is, allowing 1, 2, 4, or 8 blanks, or
## a tab. We mostly use 2 blanks. Doxia-markdown requires strictly 4 spaces or a tab. Failure
## to adhere to this requirement causes indents to be ignored or misinterpreted, leading again to
## paragraph munching and delimiter ignoring.
## 3. In Doxia-markdown, if you indent below a header or text paragraph, it is interpreted as
## an implicit codeblock start. In Github-MD, we only start codeblocks with the explicit
## codeblock delimiter (```) and sometimes indent below text just for visual emphasis, so the
## doxia-markdown interpretation is unwelcome. Thus, in our rewrite, we disallow indenting below
## text or headers. This may make the text less pretty than the Github-MD presentation, but it
## avoids the incorrect codeblocking.
## 4. In Doxia-markdown, the indent of the end-codeblock delimiter must match that of the
## begin-codeblock delimiter, or it won't be recognized and the codeblock will run on.
## 5. Relative links need to be re-written. '.md' files need to be changed to '.html', and
## as best we can we will re-write named anchors referring to tags autogenerated from headers.
## The problem with generated tags is that Github-MD forces header text to lower-case, and
## replaces blank spaces with hyphens, while doxia-markdown leaves case unchanged, and replaces
## blanks with underscores. Fortunately we seem to have a culture of using link references that
## are typographically the same as the header text, so we have some basis for fixing most links.
## 6. H1 headers don't get named anchors generated, unlike H2 and lower headers. We don't know
## why doxia-markdown has this deficiency; perhaps it assumes H1 will only be used once at the
## beginning of the doc. To fix this, we insert an explicit anchor next to each H1 header: just
## after the first H1 in a file, and just before every later H1.
##
## So far, we're ignoring tables and block quotes.
##
## This script also manages the re-writing of named files to *.tmp, then mv to replace the original file.
import sys
import os
import inspect
import re
# Characters that each Markdown dialect drops when it auto-generates anchor text from a Heading.
# Whitespace is not handled by these patterns; the code that applies them converts whitespace
# (to "-" or "_") first.
EXCLUDED_CHARS_REGEX_GHM = r'[^\w\-]' # Github-MD: strip everything except word characters (alphanumerics and "_") and "-".
EXCLUDED_CHARS_REGEX_DOX = r'[^\w\.\-]' # doxia-markdown: strip everything except word characters, "-", and ".".
def report_error(s):
    """Write an error report to stderr and terminate the script.

    Args:
        s: Human-readable description of the problem.

    The report names the current input position using the module globals
    FNR (line number), FILENAME, and inputline (the offending line).
    Those globals only exist once file processing has started, so they are
    looked up defensively: an error raised earlier (for example, when no
    file names were given on the command line) still prints its message
    instead of dying with a NameError.

    Raises:
        SystemExit: always, with exit status 1.
    """
    context = globals()
    sys.stderr.write("ERROR: " + s + "\n")
    if "FILENAME" in context:
        sys.stderr.write("on line: " + str(context.get("FNR", -1)) +
                         " in file: " + str(context["FILENAME"]) + "\n")
    if "inputline" in context:
        sys.stderr.write(str(context["inputline"]) + "\n")
    # sys.exit rather than the bare exit(), which is only injected by the
    # 'site' module and is not guaranteed to exist.
    sys.exit(1)
def trace(msg):
    """Write a debug message to stderr when the module global TRACE is truthy.

    Each message is tagged with the name of the calling function and with
    the current input line number (module global FNR).
    """
    if not TRACE:
        return
    # f_back is the frame of whoever called trace(); its code object carries the function name.
    caller = inspect.currentframe().f_back.f_code.co_name
    sys.stderr.write("TRACE: " + caller + " : InputLine " + str(FNR) + " : " + msg + "\n")
class INDENT_STACK:
    """Maintain the indent stack during doc parsing.

    The stack is a list of records, each a dict with three keys:
    'physical' (the indent width as found in the input), 'logical' (the
    nesting depth assigned to that level) and 'type' (the paragraph type
    currently owning the level). The bottom record, at indent 0 with type
    'none', is a sentinel that is never popped.
    """

    def __init__(self):
        self.my_stack = [dict(physical=0, logical=0, type='none')]

    def init_indent(self):
        """Throw away all levels and start again with only the sentinel record."""
        self.my_stack = [dict(physical=0, logical=0, type='none')]

    def push_indent(self, n, new_type):
        """Open a new level with physical indent n, owned by paragraph type new_type."""
        # The logical depth grows only when nesting under a bullet. This fixes problem #3:
        # indenting below plain text or a header must not create a deeper level.
        depth = self.logical_indent_level()
        if self.current_type() == "bullet":
            depth += 1
        self.my_stack.append(dict(physical=n, logical=depth, type=new_type))

    def set_current_type(self, new_type):
        """Change the paragraph type recorded on the topmost level."""
        self.my_stack[-1]['type'] = new_type

    def pop_indent(self):
        """Remove the topmost level and return its physical indent.

        The sentinel record is never removed; when it is the only record
        left, nothing is popped and 0 is returned.
        """
        if len(self.my_stack) <= 1:
            return 0
        return self.my_stack.pop()['physical']

    def current_indent(self):
        """Return the physical indent of the topmost level."""
        return self.my_stack[-1]['physical']

    def logical_indent_level(self):
        """Return the logical depth of the topmost level."""
        return self.my_stack[-1]['logical']

    def current_type(self):
        """Return the paragraph type of the topmost level."""
        return self.my_stack[-1]['type']
## End class INDENT_STACK
# The single, module-wide indent stack shared by the parsing code in this file.
# (No 'global' statement is needed to bind a name at module level.)
indent_stack = INDENT_STACK()
def convert_tabs(s):
    """Return s with every tab expanded to blanks, using tab stops every 4 columns.

    str.expandtabs is column-aware, so blanks followed by a tab land on the
    next tab stop rather than growing by a fixed 4 blanks. If that ever
    proves wrong for our input, the fallback is to error out on blanks
    followed by tabs (" \\t").
    """
    trace("orig length {0}".format(len(s)))
    tab_count = s.count("\t")  # only needed for the trace message below
    expanded = s.expandtabs(4)
    trace("after {0} tab substitutions, end length is {1}".format(tab_count, len(expanded)))
    return expanded
def fix_prefix_blanks(new_type):
    """Standardize the indenting (prefix blanks) of the global inputline. This fixes problem #2.

    Args:
        new_type: Paragraph type of the current line; it is recorded on the
            indent stack, which this function both reads and updates.

    The leading whitespace of inputline is measured (after tab expansion),
    the indent stack is adjusted to match, and the leading whitespace is
    then replaced by 4 blanks per logical indent level. Blank lines are
    filtered out by the caller before this function is called.
    """
    global inputline
    # The pattern can match the empty string, so a match object always exists here.
    prefix_blanks = re.match(r'[\s]*', inputline).group()
    trace("After prefix-blanks match, prefix_blanks is |" + prefix_blanks + "| length is " + str(len(prefix_blanks)))
    prefix_blanks = convert_tabs(prefix_blanks)
    trace("After convert_tabs, prefix_blanks is |" + prefix_blanks + "| length is " + str(len(prefix_blanks)))
    # indent_width is now the 'physical' indent of the current paragraph, after tab substitution.
    indent_width = len(prefix_blanks)

    # The indent of this paragraph may be > or == to the previous paragraph. Those are the easy cases.
    # If the indent is less than previous, is it equal to the indent of the next lower indented object?
    # Or of a lower yet object? Or is it intermediate between two lower objects currently in the stack?
    # The latter case is an anomaly, but there's no enforcement in Github-MD.
    # The following logic is an empirical reverse engineering, that seems adequate so far.
    # It basically says, find a prior level of indent that this is not less than, and then pretend that
    # the objects between it and this object weren't there.
    trace("current logical_indent_level is {0} and current_indent is {1}".format(
        indent_stack.logical_indent_level(), indent_stack.current_indent()))
    # Terminates because the bottom stack record has physical indent 0 and is never popped.
    while indent_width < indent_stack.current_indent():
        indent_stack.pop_indent()
    if indent_width > indent_stack.current_indent():
        indent_stack.push_indent(indent_width, new_type)
    else:  # indent_width == indent_stack.current_indent()
        indent_stack.set_current_type(new_type)
    trace(("After evaluating this line's prefix-blanks and prev_type, new logical_indent_level() is {0} " +
           "and current_indent is {1}").format(indent_stack.logical_indent_level(), indent_stack.current_indent()))

    # Now whack off the prefix blanks, and replace with a standardized string of blanks appropriate to
    # the logical indent level.
    # NOTE: the replacement is sliced out of the module-level BLANKS string, so BLANKS must be at
    # least 4 * (logical indent level) characters long; a shorter BLANKS silently truncates the indent.
    trace("Orig line is " + inputline)
    standard_indent = BLANKS[0 : 4 * indent_stack.logical_indent_level()]
    inputline = re.sub(r'^[\s]*', standard_indent, inputline, count=1)
    trace("New line is " + inputline)
def rewrite_relative_links():
    """Fix up the relative links in the global inputline. This fixes problem #5.

    For every complete inline link "[label](href)" on the line:
      * absolute URLs (anything with a URI scheme, or starting with "//") and
        parameterized URLs (containing "?") are left alone;
      * implicit index references (directory names) get an explicit "index.html";
      * ".md" targets become ".html", and README.md becomes index.html;
      * a named anchor that equals the Github-MD auto-generated form of the
        label text is replaced by the doxia-markdown form of the same label.

    Lines containing nested link labels using '!' are left untouched. The
    function terminates the script through report_error() when a link
    appears to be split across lines, cannot be parsed, or has more than
    one '#' in its href.
    """
    global inputline
    trace("entering with line: " + inputline)
    num_links = inputline.count("](")
    links = re.findall(r'\[[^\]]+\]\([^)]+\)', inputline)
    num_whole_links = len(links)
    trace("num_links = {0}, num_whole_links = {1}".format(num_links, num_whole_links))
    if num_links != num_whole_links:
        if re.search(r'\[[^\][!]*\![\s]*\[', inputline):
            # Nested link label expressions, with '!'.
            # Special case where a link value is inlined into the link label,
            # as in the first line of the base README.md file. Bail on such lines.
            trace("WARNING: Found nested link label expressions.")
            return
        report_error("Found link split across multiple lines. We can't process this.")

    for linkitem in links:
        parsed = re.search(r'(\[[\s`]*)([^\]]*[^\s`\]])([\s`]*\]\([\s]*)([^\s]+)([\s]*\))', linkitem)
        if parsed is None:
            # E.g. a label made only of blanks/backticks, or an href with embedded
            # whitespace such as [text](url "title"). Fail with file and line context
            # instead of an AttributeError traceback.
            report_error("Unable to split link into label and href: " + linkitem)
        pieces = parsed.groups()
        trace("Link: " + linkitem)
        trace("Pieces: " + " ".join(pieces))
        labeltext = pieces[1]
        href = pieces[3]
        trace("Extracted labeltext is: " + labeltext)
        trace("Extracted href is: " + href)

        # A leading URI scheme ("http:", "https:", "mailto:", ...) or "//" marks an absolute URL.
        # Testing for the scheme syntax rather than the bare prefix "http" keeps relative paths
        # such as "http_api.md" rewritable.
        if re.search(r'^[A-Za-z][A-Za-z0-9+.\-]*:|^//|\?', href):
            # Don't rewrite absolute or parameterized URLs; neither is native to this markdown book.
            trace("skipping absolute or parameterized URL")
            continue

        # Rewrite implicit index references to explicit, so the book will work as well
        # with 'file:///' preview as with a real web server.
        # We are only concerned with file path names here, so split at '#' if present.
        num_sharps = href.count("#")
        if num_sharps >= 2:
            report_error("Multiple #'s in a single link href.")
        elif num_sharps == 1:
            # Implicit index references are directory names, which seldom have a filetype suffix.
            # On the other hand, explicit file references must have filetype, else the browser
            # won't know what to do with it. So if no filetype extension, assume is a directory
            # and add 'index.html'. Skip if this is an intra-document link.
            if not re.search(r'^#|\.[^/#]+#', href):
                if not href.count("/#"):
                    href = re.sub(r'#', "/#", href, count=1)
                href = re.sub(r'/#', "/index.html#", href, count=1)
            # Fix up '.md' references.
            href = re.sub(r'^README\.md#', "index.html#", href)
            href = re.sub(r'/README\.md#', "/index.html#", href)
            href = re.sub(r'\.md#', ".html#", href)
        else:  # num_sharps == 0
            # Same logic as above, just at $ instead of #.
            if not re.search(r'\.[^/]+$', href):
                if not href.endswith("/"):
                    href = href + "/"
                href = re.sub(r'/$', "/index.html", href)
            # Fix up '.md' references.
            href = re.sub(r'^README\.md$', "index.html", href)
            href = re.sub(r'/README\.md$', "/index.html", href)
            href = re.sub(r'\.md$', ".html", href)
        trace("After .md fixup, href is: " + href)

        # Re-write named anchors referring to generated tags.
        sharp = href.find("#")
        if sharp >= 0:
            named_anchor = href[sharp + 1:]
            trace('named_anchor = "' + named_anchor + '"')
            trace('labeltext = "' + labeltext + '"')
            ghm_anchor = labeltext.lower()  # Github-MD forces all anchors to lowercase
            ghm_anchor = re.sub(r'[\s]', "-", ghm_anchor)  # convert whitespace to "-"
            ghm_anchor = re.sub(EXCLUDED_CHARS_REGEX_GHM, "", ghm_anchor)  # strip non-alphanumerics
            if ghm_anchor == named_anchor:
                trace("Found a rewritable case")
                dox_anchor = labeltext  # Doxia-markdown doesn't change case
                dox_anchor = re.sub(r'[\s]', "_", dox_anchor)  # convert whitespace to "_"
                dox_anchor = re.sub(EXCLUDED_CHARS_REGEX_DOX, "", dox_anchor)  # strip non-alphanumerics except "."
                # The anchor is everything after the single '#', so plain slicing replaces it
                # without interpreting the anchor text as a regular expression.
                href = href[:sharp + 1] + dox_anchor
        trace("After anchor rewrite, href is: " + href)

        # Now swap out the bad href for the fixed one in inputline.
        if href != pieces[3]:
            # Assemble the full link string to prevent similar substrings (to href) in different contexts being substituted.
            fixed_link = pieces[0] + pieces[1] + pieces[2] + href + pieces[4]
            trace("Fixed link text is: " + fixed_link)
            trace("linkitem is still: " + linkitem)
            # Replace only the first remaining occurrence; identical links later on the
            # line are handled by their own loop iterations.
            inputline = inputline.replace(linkitem, fixed_link, 1)
            trace("Fixed inputline is: " + inputline)
################################################
# begin state machine
# Module-level state read by the functions defined earlier in this file:
#   inputline   - the line being processed (rewritten in place by the fix-up functions)
#   active_type - paragraph type the state machine is currently inside
#   FNR         - current line number within FILENAME, used in trace and error messages
# Pool of blanks from which standardized indents are sliced (4 blanks per logical indent level).
# It must be at least as long as the deepest indent, otherwise the slice is silently truncated.
BLANKS = " " * 256
TRACE = 0
FNR = -1
trace("Starting trace")

# Github uses relative indents, but doxia wants only and exactly multiples of 4.
# To turn the more forgiving into more regular, we must track both logical and actual indents.
indent_stack.init_indent()

# Paragraph type can be none, text, bullet, code, or heading.
# Note 'current_type()' used in managing the logical indent level on the indent stack,
# and 'active_type' used in the pattern recognition state machine, are deliberately different.
active_type = "none"

if len(sys.argv) <= 1:
    # Reported directly rather than through report_error(): there is no current file or
    # input line to describe yet.
    sys.stderr.write("ERROR: Please provide names of files to be processed, as command line arguments.\n")
    sys.exit(1)

for FILENAME in sys.argv[1:]:
    # Every file starts from a clean state; nothing may leak over from the previous file.
    active_type = "none"
    indent_stack.init_indent()
    FNR = 0
    H1_COUNT = 0

    # The result is written to FILENAME.tmp, which replaces the original once the file is done.
    with open(FILENAME, 'r') as infile, open(FILENAME + ".tmp", 'w') as outfile:
        # Note: order of the below 'if' clauses is critically important for the state machine.
        # Don't change the order.
        for inputline in infile:
            FNR += 1
            inputline = inputline.rstrip("\n")

            if '](' in inputline:
                # Detect lines with hyperlinks in them, and re-write them if necessary and possible.
                # This is the only fall-through block, and we put it at the very beginning.
                # NOTE(review): because it comes first, it also runs on lines inside codeblocks.
                rewrite_relative_links()  # in inputline
                # Fall through for further processing.

            if active_type == "code":
                if "```" not in inputline:
                    trace("in codeblock, regular line")
                    # what happens in the codeblock, stays in the codeblock
                    # Put this case first (after link detection), so we don't have to test it in all the other cases.
                    outfile.write(inputline + "\n")
                    continue

                trace("in codeblock, end delimiter line")
                # detect end of codeblock
                # This must be the second case.
                if re.search(r'```[\s]*[^\s]', inputline):
                    # If there's text following the end-``` on the same line, error out and fix it in the source file.
                    report_error("Text following codeblock end delimiter (```) on same line.")
                if re.search(r'```.*```', inputline):
                    # If there are two sets of triple-ticks on the same line, that's a problem too.
                    report_error("Two sets of codeblock delimiters (```) on same line.")
                active_type = "none"
                # Force the indenting of the end-``` to match the beginning. This fixes problem #4.
                inputline = re.sub(r'^[\s]*', BLANKS[0 : 4 * indent_stack.logical_indent_level()], inputline)
                outfile.write(inputline + "\n")
                continue

            # From here on we are known to be outside a codeblock.
            if "```" in inputline:
                trace("start codeblock, delimiter line")
                # detect start of codeblock
                if re.search(r'[^\s][\s]*```', inputline):
                    # If there's text preceding the begin-``` on the same line, error out and fix it in the source file.
                    report_error("Text preceding codeblock start delimiter (```) on same line.")
                if re.search(r'```.*```', inputline):
                    # If there are two sets of triple-ticks on the same line, that's a problem too.
                    report_error("Two sets of codeblock delimiters (```) on same line.")
                if active_type == "text" or active_type == "bullet":
                    outfile.write("\n")  # Need preceding blank line before codeblock, in doxia.
                active_type = "code"
                fix_prefix_blanks(active_type)  # in inputline
                outfile.write(inputline + "\n")
                continue

            if re.search(r'^[\s]*$', inputline):
                trace("blank line")
                # detect blank lines
                active_type = "none"
                outfile.write(inputline + "\n")  # Perhaps this should write an empty line instead?
                continue

            if re.search(r'^[\s]*([*+-]|[\d]+\.)[\s]', inputline):
                trace("bullet line")
                # detect bullet line (numbered or not)
                if active_type == "text":
                    outfile.write("\n")  # Need preceding blank line between text and bullet, in doxia. This fixes problem #1.
                active_type = "bullet"
                fix_prefix_blanks(active_type)  # in inputline
                outfile.write(inputline + "\n")
                continue

            if inputline.startswith("#"):
                trace("header line")
                # detects header lines, which are self-delimiting, and cannot have indenting
                # Header line resets the indenting as well as current type
                active_type = "none"
                indent_stack.init_indent()
                if re.search(r'^#[^#]', inputline):
                    # First-level headers ("H1") need explicit anchor inserted (Doxia style). This fixes problem #6.
                    anchor_name = re.sub(r' ', "_", inputline[1:].strip())
                    anchor_name = re.sub(EXCLUDED_CHARS_REGEX_DOX, "", anchor_name)
                    anchor_text = '<a name="' + anchor_name + '"></a>'
                    if H1_COUNT == 0:
                        # Treat the first header differently - put the header after instead of before
                        # This is necessary to preserve document metadata titling in generated html.
                        # However, it means the title itself gets hidden above the top of window, when the link is used.
                        H1_COUNT = 1
                        outfile.write(inputline + "\n")
                        outfile.write(anchor_text + "\n")
                        outfile.write("\n")  # Anchors aren't self-delimiting, so insert a blank line after.
                    else:
                        outfile.write("\n")  # Anchors aren't self-delimiting, so insert a blank line first.
                        outfile.write(anchor_text + "\n")
                        outfile.write(inputline + "\n")
                else:
                    # H2 or deeper level of header, doxia auto-generates anchor.
                    outfile.write(inputline + "\n")
                continue

            if re.search(r'^[\s]*#', inputline):
                trace("header line, bad")
                report_error("Header specification character (#) detected with indenting. This is presumed to be an error, since it will render as text. If intentional, put a period or other printable character before it.")

            ## default action -- last case in state machine switch
            trace("text line")
            # Everything else is text-like, and therefore continues active_type, unless none.
            if active_type == "none":
                # Start new text paragraph.
                active_type = "text"
                fix_prefix_blanks(active_type)  # in inputline
            # Otherwise this is just a continuation of current text or bullet; indenting is irrelevant.
            outfile.write(inputline + "\n")
        ## end loop on inputlines

    # Both files are closed at this point.
    if active_type == "code":
        report_error("Unmatched codeblock delimiter (```) detected.")
    os.rename(FILENAME + ".tmp", FILENAME)
## end loop on FILENAMEs
trace("ending trace")