| ######################################################################### |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| ######################################################################### |
| |
| |
| ## Markdown has these types of paragraph: heading, text, list item (bullet or numbered), |
| ## codeblock, table, and block quote. |
| ## |
| ## This script fixes up differences in Markdown dialect, between Github-MD and doxia-markdown. |
| ## Specifically, it fixes these problems: |
| ## 1. In Github-MD, bullets and codeblock starts are self-delimiting. In doxia-markdown, they |
| ## must be separated from preceding text or (in the case of codeblocks) bullets, by a blank line. |
| ## Failure to do so causes the bullet or codeblock delimiter to be interpreted as ordinary text, |
| ## and the content gets munched into the preceding paragraph. The codeblock delimiter (```) as text |
| ## gets interpreted as a codephrase delimiter (`) plus a preceding or following empty codephrase (``). |
| ## 2. Github-MD is liberal in regard to what an 'indent' is, allowing 1, 2, 4, or 8 blanks, or |
| ## a tab. We mostly use 2 blanks. Doxia-markdown requires strictly 4 spaces or a tab. Failure |
| ## to adhere to this requirement causes indents to be ignored or misinterpreted, leading again to |
| ## paragraph munching and delimiter ignoring. |
| ## 3. In Doxia-markdown, if you indent below a header or text paragraph, it is interpreted as |
| ## an implicit codeblock start. In Github-MD, we only start codeblocks with the explicit |
| ## codeblock delimiter (```) and sometimes indent below text just for visual emphasis, so the |
| ## doxia-markdown interpretation is unwelcome. Thus, in our rewrite, we disallow indenting below |
| ## text or headers. This may make the text less pretty than the Github-MD presentation, but it |
| ## avoids the incorrect codeblocking. |
| ## 4. In Doxia-markdown, the indent of the end-codeblock delimiter must match that of the |
| ## begin-codeblock delimiter, or it won't be recognized and the codeblock will run on. |
| ## 5. Relative links need to be re-written. '.md' files need to be changed to '.html', and |
| ## as best we can we will re-write named anchors referring to tags autogenerated from headers. |
| ## The problem with generated tags is that Github-MD forces header text to lower-case, and |
| ## replaces blank spaces with hyphens, while doxia-markdown leaves case unchanged, and replaces |
| ## blanks with underscores. Fortunately we seem to have a culture of using link references that |
| ## are typographically the same as the header text, so we have some basis for fixing most links. |
| ## 6. H1 headers don't get named anchors generated, unlike H2 and lower headers. Don't know |
| ## why doxia-markdown has this deficiency, perhaps it assumes H1 will only be used once at the |
| ## beginning of the doc. We will insert an explicit anchor just before the H1 headers, to fix. |
| ## |
| ## So far, we're ignoring tables and block quotes. |
| ## |
| ## This script also manages the re-writing of named files to *.tmp, then mv to replace the original file. |
| |
| |
| import sys |
| import os |
| import inspect |
| import re |
| |
# Characters that each Markdown dialect strips from the anchor text it
# auto-generates for headings.  Both patterns are applied after whitespace has
# already been converted ("-" for Github-MD, "_" for doxia-markdown).
EXCLUDED_CHARS_REGEX_GHM = r'[^\w\-]'    # strip everything except alphanumerics, "_" and "-"
EXCLUDED_CHARS_REGEX_DOX = r'[^\w\.\-]'  # strip everything except alphanumerics, "_", "-" and "."
| |
def report_error(s):
    """Print an error report to stderr and terminate with exit status 1.

    The report consists of the message, then (when known) the current line
    number and file name, then the offending input line.

    Args:
        s: Human-readable description of the problem.

    Raises:
        SystemExit: always, with code 1.
    """
    # FNR, FILENAME and inputline are module globals that only come into
    # existence once the main loop starts reading a file.  This function is
    # also used for the "no file names given" usage error, which happens
    # before that, so look the context up defensively instead of dying with
    # a NameError half way through the report.
    context = globals()
    sys.stderr.write("ERROR: " + s + "\n")
    if "FILENAME" in context:
        sys.stderr.write("on line: " + str(context.get("FNR", -1)) +
                         " in file: " + context["FILENAME"] + "\n")
    if "inputline" in context:
        sys.stderr.write(context["inputline"] + "\n")
    # sys.exit rather than the interactive helper exit(), which is only
    # present when the site module has been loaded.
    sys.exit(1)
| |
| |
def trace(msg):
    """Write a debug message to stderr when the module-level TRACE flag is set.

    Each message is prefixed with the name of the calling function and the
    current input line number (module global FNR).

    Args:
        msg: Text to emit.
    """
    if not TRACE:
        return
    # currentframe() can return None on interpreters without stack-frame
    # support; fall back to a placeholder instead of raising.
    frame = inspect.currentframe()
    caller = frame.f_back if frame is not None else None
    caller_name = caller.f_code.co_name if caller is not None else "<unknown>"
    # Use write() rather than the Python-2-only "print >>stream" statement so
    # the function behaves the same under Python 2 and Python 3.
    sys.stderr.write("TRACE: " + caller_name + " : InputLine " + str(FNR) + " : " + msg + "\n")
| |
class INDENT_STACK:
    """Stack of indentation frames maintained during doc parsing.

    Each frame is a dict with three keys:
      'physical' -- the indent width that was pushed for the frame,
      'logical'  -- the nesting depth used when indentation is regenerated,
      'type'     -- the paragraph type recorded for the frame.
    The bottom frame (physical 0, logical 0, type 'none') is never popped.
    """

    def __init__(self):
        self.init_indent()

    def init_indent(self):
        """Reset the stack to the single base frame."""
        self.my_stack = [{'physical': 0, 'logical': 0, 'type': 'none'}]

    def push_indent(self, n, new_type):
        """Push a new frame with physical indent ``n`` and type ``new_type``.

        The logical depth grows by one only when the frame currently on top
        is a bullet; indenting under anything else keeps the same depth.
        This fixes problem #3.
        """
        level = self.logical_indent_level()
        if self.current_type() == "bullet":
            level += 1
        self.my_stack.append({'physical': n, 'logical': level, 'type': new_type})

    def set_current_type(self, new_type):
        """Overwrite the type recorded in the topmost frame."""
        self.my_stack[-1]['type'] = new_type

    def pop_indent(self):
        """Pop the top frame and return its physical indent.

        The base frame is left in place; 0 is returned when only it remains.
        """
        if len(self.my_stack) > 1:
            return self.my_stack.pop()['physical']
        return 0

    def current_indent(self):
        """Return the physical indent of the topmost frame."""
        return self.my_stack[-1]['physical']

    def logical_indent_level(self):
        """Return the logical depth of the topmost frame."""
        return self.my_stack[-1]['logical']

    def current_type(self):
        """Return the type of the topmost frame."""
        return self.my_stack[-1]['type']

## End class INDENT_STACK
| |
# Single shared instance, used by fix_prefix_blanks() and by the main loop.
# (A "global" statement is unnecessary here: at module scope every assignment
# already creates a module global.)
indent_stack = INDENT_STACK()
| |
| |
def convert_tabs(s):
    """Return ``s`` with every tab expanded to 4-column tab stops.

    str.expandtabs() is column-aware, so a tab that follows some blanks only
    advances to the next tab stop.  If that ever proves inadequate, the
    fallback is to reject prefixes made of spaces followed by tabs.
    """
    tab_count = s.count("\t")
    trace("orig length {0}".format(len(s)))
    expanded = s.expandtabs(4)
    trace("after {0} tab substitutions, end length is {1}".format(tab_count, len(expanded)))
    return expanded
| |
| |
def fix_prefix_blanks(new_type):
    """Normalize the leading indentation of the global ``inputline``.

    This fixes problem #2: whatever mixture of blanks and tabs prefixes the
    line is replaced by exactly four blanks per logical indent level.  The
    function both consults and updates the shared ``indent_stack``, which is
    why the paragraph type of the line must be supplied.  Blank lines are
    filtered out by the caller before this is invoked.

    Args:
        new_type: Paragraph type of the current line, recorded on the
            indent stack.

    Returns:
        None.  The result is stored back into the global ``inputline``.
    """
    global inputline

    # A '*' pattern always matches (possibly the empty string), so the match
    # object never needs a None check.
    prefix_match = re.match(r'[\s]*', inputline)
    prefix_blanks = prefix_match.group()
    trace("After prefix-blanks match, prefix_blanks is |" + prefix_blanks + "| length is " + str(len(prefix_blanks)))
    prefix_blanks = convert_tabs(prefix_blanks)
    trace("After convert_tabs, prefix_blanks is |" + prefix_blanks + "| length is " + str(len(prefix_blanks)))

    # prefix_blanks now holds the 'physical' indent of the current paragraph,
    # after tab substitution.  An indent greater than or equal to the previous
    # paragraph's is the easy case.  If it is smaller, it may equal the indent
    # of some enclosing object, or fall between two of them.  The latter is an
    # anomaly, but Github-MD does not enforce anything here.  The following
    # logic is an empirical reverse engineering that seems adequate so far:
    # find a prior level of indent that this one is not less than, and then
    # pretend the objects between it and this one were never there.
    physical = len(prefix_blanks)
    trace("current logical_indent_level is {0} and current_indent is {1}".format(
        indent_stack.logical_indent_level(), indent_stack.current_indent()))
    while physical < indent_stack.current_indent():
        indent_stack.pop_indent()
    if physical > indent_stack.current_indent():
        indent_stack.push_indent(physical, new_type)
    else:  # physical == indent_stack.current_indent()
        indent_stack.set_current_type(new_type)

    trace(("After evaluating this line's prefix-blanks and prev_type, new logical_indent_level() is {0} " +
           "and current_indent is {1}").format(indent_stack.logical_indent_level(), indent_stack.current_indent()))

    # Whack off the original prefix and substitute the standardized one.
    # The blanks are generated on demand rather than sliced out of a
    # fixed-length constant, so deep nesting can never yield a short indent.
    trace("Orig line is " + inputline)
    inputline = " " * (4 * indent_stack.logical_indent_level()) + inputline[prefix_match.end():]
    trace("New line is " + inputline)
| |
| |
def rewrite_relative_links():
    """Rewrite the relative Markdown links found in the global ``inputline``.

    This fixes problem #5.  For every ``[label](href)`` on the line, unless
    the href starts with "http" or contains "?":

    * an href without a file extension is taken to be a directory and gets
      an explicit "index.html" (intra-document "#anchor" links are left
      alone);
    * "README.md" becomes "index.html" and any other ".md" becomes ".html";
    * a "#anchor" equal to the Github-MD slug of the label text is replaced
      by the doxia-markdown slug of the same label.

    Takes no arguments and returns None; the line is read from and written
    back to the module global ``inputline``.  Lines that cannot be processed
    are reported through report_error(), which terminates the program.
    """
    global inputline
    trace("entering with line: " + inputline)

    # Every "](" should be part of one complete [label](href) on this line.
    num_links = inputline.count("](")
    links = re.findall(r'\[[^\]]+\]\([^)]+\)', inputline)
    num_whole_links = len(links)
    trace("num_links = {0}, num_whole_links = {1}".format(num_links, num_whole_links))
    if num_links != num_whole_links:
        if re.search(r'\[[^\][!]*\![\s]*\[', inputline):
            # Nested link label expressions, with '!'.
            # Special case where a link value is inlined into the link label,
            # as in the first line of the base README.md file.  Bail on such lines.
            trace("WARNING: Found nested link label expressions.")
            return
        report_error("Found link split across multiple lines. We can't process this.")

    for linkitem in links:
        # Groups: 0 = "[" plus leading blanks/backticks, 1 = label text,
        # 2 = trailing blanks/backticks plus "](", 3 = href, 4 = closing ")".
        parsed = re.search(r'(\[[\s`]*)([^\]]*[^\s`\]])([\s`]*\]\([\s]*)([^\s]+)([\s]*\))', linkitem)
        if parsed is None:
            # E.g. a label made only of blanks/backticks, or whitespace inside
            # the parentheses (such as a link title).  Report it properly
            # instead of crashing on None.groups().
            report_error("Can't parse link '" + linkitem + "'. We can't process this.")
        pieces = parsed.groups()
        trace("Link: " + linkitem)
        trace("Pieces: " + " ".join(pieces))
        labeltext = pieces[1]
        href = pieces[3]
        trace("Extracted labeltext is: " + labeltext)
        trace("Extracted href is: " + href)
        if re.search(r'^http|\?', href):
            # Don't rewrite absolute or parameterized URLs; neither is native to this markdown book.
            trace("skipping absolute or parameterized URL")
            continue

        # Rewrite implicit index references to explicit, so the book will work as well
        # with 'file:///' preview as with a real web server.
        # We are only concerned with file path names here, so split at '#' if present.
        num_sharps = href.count("#")
        if num_sharps >= 2:
            report_error("Multiple #'s in a single link href.")
        elif num_sharps == 1:
            # Implicit index references are directory names, which seldom have a filetype suffix.
            # On the other hand, explicit file references must have filetype, else the browser
            # won't know what to do with it.  So if no filetype extension, assume is a directory
            # and add 'index.html'.  Skip if this is an intra-document link.
            if not re.search(r'^#|\.[^/#]+#', href):
                if not href.count("/#"):
                    href = re.sub(r'#', "/#", href, 1)
                href = re.sub(r'/#', "/index.html#", href, 1)

            # Fix up '.md' references.
            href = re.sub(r'^README\.md#', "index.html#", href)
            href = re.sub(r'/README\.md#', "/index.html#", href)
            href = re.sub(r'\.md#', ".html#", href)
        else:  # num_sharps == 0
            # Same logic as above, just at end-of-string instead of '#'.
            if not re.search(r'\.[^/]+$', href):
                if not href.endswith("/"):
                    href = href + "/"
                href = re.sub(r'/$', "/index.html", href)

            # Fix up '.md' references.
            href = re.sub(r'^README\.md$', "index.html", href)
            href = re.sub(r'/README\.md$', "/index.html", href)
            href = re.sub(r'\.md$', ".html", href)

        trace("After .md fixup, href is: " + href)

        # Re-write named anchors referring to generated tags.
        sharp = href.find("#")
        if sharp >= 0:
            named_anchor = href[sharp + 1:]
            trace('named_anchor = "' + named_anchor + '"')
            trace('labeltext = "' + labeltext + '"')
            ghm_slug = labeltext.lower()                                  # Github-MD forces all anchors to lowercase
            ghm_slug = re.sub(r'[\s]', "-", ghm_slug)                     # convert whitespace to "-"
            ghm_slug = re.sub(EXCLUDED_CHARS_REGEX_GHM, "", ghm_slug)     # strip non-alphanumerics
            if ghm_slug == named_anchor:
                trace("Found a rewritable case")
                dox_slug = re.sub(r'[\s]', "_", labeltext)                # doxia keeps case; whitespace becomes "_"
                dox_slug = re.sub(EXCLUDED_CHARS_REGEX_DOX, "", dox_slug) # strip non-alphanumerics except "."
                # The href holds exactly one '#', so everything after it is
                # the anchor; splice the new one in without going through a
                # regex built from the anchor text.
                href = href[:sharp + 1] + dox_slug

        trace("After anchor rewrite, href is: " + href)

        # Now swap out the bad href for the fixed one in inputline.
        if href != pieces[3]:
            # Assemble the full link string to prevent similar substrings (to href)
            # in different contexts being substituted.
            fixed_link = pieces[0] + pieces[1] + pieces[2] + href + pieces[4]
            trace("Fixed link text is: " + fixed_link)
            trace("linkitem is still: " + linkitem)
            k = inputline.find(linkitem)
            inputline = inputline[:k] + fixed_link + inputline[k + len(linkitem):]
            trace("Fixed inputline is: " + inputline)
| |
| |
| |
| ################################################ |
| # begin state machine |
| |
# Main script: for every file named on the command line, run each line through
# the paragraph-type state machine below, write the result to "<file>.tmp",
# then rename the .tmp file over the original.
#
# inputline, FNR, FILENAME and active_type are deliberately module globals:
# the helper functions above read them, and rewrite_relative_links() and
# fix_prefix_blanks() assign inputline.  This code must therefore stay at
# module level.

# Source of indent strings for code that slices BLANKS[0 : 4*level]; kept long
# enough that such a slice does not come up short.
BLANKS = " " * 128
TRACE = 0
FNR = -1
trace("Starting trace")

# Github uses relative indents, but doxia wants only and exactly multiples of 4.
# To turn the more forgiving into more regular, we must track both logical and actual indents.
indent_stack.init_indent()

# Paragraph type can be none, text, bullet, code, or heading.
# Note 'current_type()' used in managing the logical indent level on the indent stack,
# and 'active_type' used in the pattern recognition state machine, are deliberately different.
active_type = "none"

if len(sys.argv) <= 1:
    # Reported directly: there is no file or input line to give as context yet.
    sys.stderr.write("ERROR: Please provide names of files to be processed, as command line arguments.\n")
    sys.exit(1)

for FILENAME in sys.argv[1:]:
    # Start every file from a clean parser state, so that a paragraph left
    # open at the end of one file cannot affect the first lines of the next.
    active_type = "none"
    indent_stack.init_indent()
    FNR = 0
    H1_COUNT = 0

    # 'with' closes both files even when report_error() exits part way through.
    with open(FILENAME, 'r') as infile, open(FILENAME + ".tmp", 'w') as outfile:
        # Note: order of the below 'if' clauses is critically important for the state machine.
        # Don't change the order.
        for inputline in infile:
            FNR += 1
            inputline = inputline.rstrip("\n")

            if '](' in inputline:
                # Detect lines with hyperlinks in them, and re-write them if necessary and possible.
                # This is the only fall-through block, and we put it at the very beginning.
                rewrite_relative_links()  # in inputline
                # Fall through for further processing.

            if (active_type == "code") and ("```" not in inputline):
                trace("in codeblock, regular line")
                # what happens in the codeblock, stays in the codeblock
                # Put this case first (after link detection), so we don't have to test it in all the other cases.
                outfile.write(inputline + "\n")
                continue

            if (active_type == "code") and ("```" in inputline):
                trace("in codeblock, end delimiter line")
                # detect end of codeblock
                # This must be the second case.
                if re.search(r'```[\s]*[^\s]', inputline):
                    # If there's text following the end-``` on the same line, error out and fix it in the source file.
                    report_error("Text following codeblock end delimiter (```) on same line.")

                if re.search(r'```.*```', inputline):
                    # If there are two sets of triple-ticks on the same line, that's a problem too.
                    report_error("Two sets of codeblock delimiters (```) on same line.")

                active_type = "none"
                # Force the indenting of the end-``` to match the beginning. This fixes problem #4.
                inputline = re.sub(r'^[\s]*', " " * (4 * indent_stack.logical_indent_level()), inputline, 1)
                outfile.write(inputline + "\n")
                continue

            if (active_type != "code") and ("```" in inputline):
                trace("start codeblock, delimiter line")
                # detect start of codeblock
                if re.search(r'[^\s][\s]*```', inputline):
                    # If there's text preceding the begin-``` on the same line, error out and fix it in the source file.
                    report_error("Text preceding codeblock start delimiter (```) on same line.")

                if re.search(r'```.*```', inputline):
                    # If there are two sets of triple-ticks on the same line, that's a problem too.
                    report_error("Two sets of codeblock delimiters (```) on same line.")

                if active_type == "text" or active_type == "bullet":
                    outfile.write("\n")  # Need preceding blank line before codeblock, in doxia.

                active_type = "code"
                fix_prefix_blanks(active_type)  # in inputline
                outfile.write(inputline + "\n")
                continue

            if re.search(r'^[\s]*$', inputline):
                trace("blank line")
                # detect blank lines
                active_type = "none"
                outfile.write(inputline + "\n")  # Perhaps this should write an empty line instead?
                continue

            if re.search(r'^[\s]*([*+-]|[\d]+\.)[\s]', inputline):
                trace("bullet line")
                # detect bullet line (numbered or not)
                if active_type == "text":
                    # Need preceding blank line between text and bullet, in doxia. This fixes problem #1.
                    outfile.write("\n")

                active_type = "bullet"
                fix_prefix_blanks(active_type)  # in inputline
                outfile.write(inputline + "\n")
                continue

            if inputline.startswith("#"):
                trace("header line")
                # detects header lines, which are self-delimiting, and cannot have indenting
                # Header line resets the indenting as well as current type
                active_type = "none"
                indent_stack.init_indent()
                if re.search(r'^#[^#]', inputline):
                    # First-level headers ("H1") need explicit anchor inserted (Doxia style). This fixes problem #6.
                    anchor_name = re.sub(r' ', "_", inputline[1:].strip())
                    anchor_name = re.sub(EXCLUDED_CHARS_REGEX_DOX, "", anchor_name)
                    anchor_text = '<a name="' + anchor_name + '"></a>'
                    if H1_COUNT == 0:
                        # Treat the first header differently - put the header after instead of before
                        # This is necessary to preserve document metadata titling in generated html.
                        # However, it means the title itself gets hidden above the top of window, when the link is used.
                        H1_COUNT = 1
                        outfile.write(inputline + "\n")
                        outfile.write(anchor_text + "\n")
                        outfile.write("\n")  # Anchors aren't self-delimiting, so insert a blank line after.
                    else:
                        outfile.write("\n")  # Anchors aren't self-delimiting, so insert a blank line first.
                        outfile.write(anchor_text + "\n")
                        outfile.write(inputline + "\n")
                else:
                    # H2 or deeper level of header, doxia auto-generates anchor.
                    outfile.write(inputline + "\n")
                continue

            if re.search(r'^[\s]*#', inputline):
                trace("header line, bad")
                report_error("Header specification character (#) detected with indenting. This is presumed to be an error, since it will render as text. If intentional, put a period or other printable character before it.")

            ## default action -- last case in state machine switch
            trace("text line")
            # Everything else is text-like, and therefore continues active_type, unless none.
            if active_type == "none":
                # Start new text paragraph.
                active_type = "text"
                fix_prefix_blanks(active_type)  # in inputline
                outfile.write(inputline + "\n")
                continue
            else:
                # This is just a continuation of current text or bullet.
                # Indenting is irrelevant.
                outfile.write(inputline + "\n")
                continue

        ## end loop on inputlines
        if active_type == "code":
            report_error("Unmatched codeblock delimiter (```) detected.")

    # Both files are closed at this point; replace the original with the rewrite.
    os.rename(FILENAME + ".tmp", FILENAME)

## end loop on FILENAMEs
trace("ending trace")