 # site-book/bin/fix-md-dialect.py - metron - Git at Google

 #########################################################################
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #########################################################################


 ## Markdown has these types of paragraph: heading, text, list item (bullet or numbered),
 ## codeblock, table, and block quote.
 ##
 ## This script fixes up differences in Markdown dialect, between Github-MD and doxia-markdown.
 ## Specifically, it fixes these problems:
 ##     1. In Github-MD, bullets and codeblock starts are self-delimiting.  In doxia-markdown, they
 ## must be separated from preceding text or (in the case of codeblocks) bullets, by a blank line.
 ## Failure to do so causes the bullet or codeblock delimiter to be interpreted as ordinary text,
 ## and the content gets munched into the preceding paragraph.  The codeblock delimiter (```) as text
 ## gets interpreted as a codephrase delimiter (`) plus a preceding or following empty codephrase (``).
 ##     2. Github-MD is liberal in regard to what an 'indent' is, allowing 1, 2, 4, or 8 blanks, or
 ## a tab.  We mostly use 2 blanks.  Doxia-markdown requires strictly 4 spaces or a tab.  Failure
 ## to adhere to this requirement causes indents to be ignored or misinterpreted, leading again to
 ## paragraph munching and delimiter ignoring.
 ##     3. In Doxia-markdown, if you indent below a header or text paragraph, it is interpreted as
 ## an implicit codeblock start.  In Github-MD, we only start codeblocks with the explicit
 ## codeblock delimiter (```) and sometimes indent below text just for visual emphasis, so the
 ## doxia-markdown interpretation is unwelcome.  Thus, in our rewrite, we disallow indenting below
 ## text or headers.  This may make the text less pretty than the Github-MD presentation, but it
 ## avoids the incorrect codeblocking.
 ##     4. In Doxia-markdown, the indent of the end-codeblock delimiter must match that of the
 ## begin-codeblock delimiter, or it won't be recognized and the codeblock will run on.
 ##     5. Relative links need to be re-written.  '.md' files need to be changed to '.html', and
 ## as best we can we will re-write named anchors referring to tags autogenerated from headers.
 ## The problem with generated tags is that Github-MD forces header text to lower-case, and
 ## replaces blank spaces with hyphens, while doxia-markdown leaves case unchanged, and replaces
 ## blanks with underscores.  Fortunately we seem to have a culture of using link references that
 ## are typographically the same as the header text, so we have some basis for fixing most links.
 ##     6. H1 headers don't get named anchors generated, unlike H2 and lower headers. Don't know
 ## why doxia-markdown has this deficiency, perhaps it assumes H1 will only be used once at the
 ## beginning of the doc.  We will insert an explicit anchor just before each H1 header to fix
 ## this, except for the first H1 of a file, where the anchor is placed just after the header.
 ##
 ## So far, we're ignoring tables and block quotes.
 ##
 ## This script also manages the re-writing of named files to *.tmp, then mv to replace the original file.


 import sys
 import os
 import inspect
 import re

 # Regular expressions matching the characters that each Markdown dialect excludes from the
 # anchor text it auto-generates for Headings (GHM = Github-MD, DOX = doxia-markdown).
 # Callers convert whitespace to "-" or "_" before applying these patterns.
 EXCLUDED_CHARS_REGEX_GHM = r'[^\w\-]'   # all non-alphanumerics except "-" and "_".
 EXCLUDED_CHARS_REGEX_DOX = r'[^\w\.\-]'   # all non-alphanumerics except "-", "_", and ".".

 def report_error(s) :
     """Print an error message with its input context to stderr, then exit with status 1.

     s is the human-readable description of the problem.  The current line number (FNR),
     file name (FILENAME) and line text (inputline) are taken from the module globals.
     They are looked up defensively, because this function may be called before any
     input file has been opened, when FILENAME and inputline do not exist yet.
     """
     context = globals()
     # sys.stderr.write() behaves the same under Python 2 and Python 3, unlike 'print >>'.
     sys.stderr.write("ERROR: " + s + "\n")
     sys.stderr.write("on line: " + str(context.get("FNR", -1)) +
                      " in file: " + str(context.get("FILENAME", "(none)")) + "\n")
     sys.stderr.write(str(context.get("inputline", "")) + "\n")
     # sys.exit() is always available; the bare exit() helper is only installed by the site module.
     sys.exit(1)


 def trace(msg) :
     """Write a debugging message to stderr when the module-level TRACE flag is true.

     The message is prefixed with the name of the calling function (taken from the
     caller's stack frame) and the current input line number FNR.
     """
     if not TRACE :
         return
     caller = inspect.currentframe().f_back.f_code.co_name
     # sys.stderr.write() behaves the same under Python 2 and Python 3, unlike 'print >>'.
     sys.stderr.write("TRACE: " + caller + " : InputLine " + str(FNR) + " : " + msg + "\n")

 class INDENT_STACK :
     """Stack of indent frames maintained during doc parsing.

     Each frame is a dict holding the 'physical' indent (the column count found in the
     input), the 'logical' indent (the nesting depth used for output), and the paragraph
     'type' recorded for that level.  The bottom frame is always the zero-indent frame
     of type 'none'; it is never popped.
     """

     def __init__(self) :
         self.init_indent()

     def init_indent(self) :
         # Throw away whatever was tracked and start over from the zero-indent base frame.
         self.my_stack = [ dict(physical=0, logical=0, type='none') ]

     def push_indent(self, n, new_type) :
         # The logical depth grows only when nesting under a bullet type. This fixes problem #3.
         depth = self.logical_indent_level()
         if self.current_type() == "bullet" :
             depth = depth + 1
         frame = {'physical' : n, 'logical' : depth, 'type' : new_type}
         self.my_stack.append(frame)

     def set_current_type(self, new_type) :
         # Overwrite the type recorded in the topmost frame.
         top = self.my_stack[-1]
         top['type'] = new_type

     def pop_indent(self) :
         # Remove the top frame and return its physical indent.
         # The base frame is kept; asking to pop it yields 0.
         if len(self.my_stack) <= 1 :
             return 0
         removed = self.my_stack.pop()
         return removed['physical']

     def current_indent(self) :
         # Physical indent of the topmost frame.
         top = self.my_stack[-1]
         return top['physical']

     def logical_indent_level(self) :
         # Logical indent of the topmost frame.
         top = self.my_stack[-1]
         return top['logical']

     def current_type(self) :
         # Paragraph type of the topmost frame.
         top = self.my_stack[-1]
         return top['type']

     ## End class INDENT_STACK

 # The single indent stack instance shared by the functions below and the main loop.
 indent_stack = INDENT_STACK()


 def convert_tabs(s) :
     """Return s with every tab expanded to blanks, using tab stops every 4 columns."""
     # str.expandtabs does a real column-aware expansion, so blanks followed by tabs still land
     # on the right tab stop.  If that proves wrong, the fallback is to error on " \t" instead.
     tab_total = s.count("\t")
     trace("orig length {0}".format(len(s)) )
     expanded = s.expandtabs(4)
     trace("after {0} tab substitutions, end length is {1}".format(tab_total, len(expanded)) )
     return expanded


 def fix_prefix_blanks(new_type) :
     """Standardize the indenting (prefix blanks) of the global inputline.  This fixes problem #2.

     new_type is the paragraph type of the current line.  It is needed because this
     function both uses and maintains the indent stack.

     The physical indent of the line (after tab expansion) is compared against the
     indent stack to arrive at a logical indent level, and the line's prefix is then
     replaced with exactly 4 blanks per logical level.  Blank lines are filtered out
     by the caller before this function is invoked.
     """
     global inputline
     # '^[\s]*' also matches the empty string, so this search always succeeds; an
     # unindented line simply yields an empty prefix.
     prefix_blanks = re.search(r'^[\s]*', inputline).group()
     trace("After prefix-blanks match, prefix_blanks is |" + prefix_blanks + "| length is " + str(len(prefix_blanks)) )
     prefix_blanks = convert_tabs(prefix_blanks)
     trace("After convert_tabs, prefix_blanks is |" + prefix_blanks + "| length is " + str(len(prefix_blanks)) )

     # This is the 'physical' indent of the current paragraph, after tab substitution.
     physical_indent = len(prefix_blanks)

     # The indent of this paragraph may be > or == to the previous paragraph.  Those are the easy cases.
     # If the indent is less than previous, is it equal to the indent of the next lower indented object?
     # Or of a lower yet object?  Or is it intermediate between two lower objects currently in the stack?
     # The latter case is an anomaly, but there's no enforcement in Github-MD.
     # The following logic is an empirical reverse engineering, that seems adequate so far.
     # It basically says, find a prior level of indent that this is not less than, and then pretend that
     # the objects between it and this object weren't there.

     trace("current logical_indent_level is {0} and current_indent is {1}".format(
             indent_stack.logical_indent_level(), indent_stack.current_indent() ))
     while physical_indent < indent_stack.current_indent() :
         indent_stack.pop_indent()
     if physical_indent > indent_stack.current_indent() :
         indent_stack.push_indent(physical_indent, new_type)
     else :  # physical_indent == indent_stack.current_indent()
         indent_stack.set_current_type(new_type)

     trace(("After evaluating this line's prefix-blanks and prev_type, new logical_indent_level() is {0} " +
            "and current_indent is {1}").format(indent_stack.logical_indent_level(), indent_stack.current_indent() ))

     # Now whack off the prefix blanks, and replace with a standardized string of blanks appropriate to
     # the logical indent level.  The string is built to the exact length needed, so that a very deep
     # nesting level can never be cut short by the length of a pre-built constant.
     trace("Orig line is " + inputline)
     inputline = re.sub(r'^[\s]*', " " * (4 * indent_stack.logical_indent_level()), inputline, 1)
     trace("New line is  " + inputline)


 def rewrite_relative_links() :
     """Fix up the relative Markdown links in the global inputline.  This fixes problem #5.

     For every '[label](href)' link on the line whose href neither starts with 'http'
     nor contains '?':
       * an href without a filetype extension is taken to be a directory and gets an
         explicit 'index.html',
       * 'README.md' becomes 'index.html' and any other '.md' target becomes '.html',
       * a named anchor equal to the Github-MD form of the link label (lower-cased,
         whitespace to '-') is replaced by the doxia-markdown form of that label
         (case kept, whitespace to '_').

     Reads and rewrites the global 'inputline'.  Calls report_error(), which exits,
     when a link appears to be split across lines, when a link cannot be taken apart
     into label and href, or when an href holds more than one '#'.
     """
     global inputline
     trace("entering with line: " + inputline)
     num_links = inputline.count("](")
     links = re.findall(r'\[[^\]]+\]\([^)]+\)', inputline)
     num_whole_links = len(links)
     trace("num_links = {0}, num_whole_links = {1}".format(num_links, num_whole_links))
     if (num_links != num_whole_links) :
         if re.search(r'\[[^\][!]*\![\s]*\[', inputline) :
             # Nested link label expressions, with '!'.
             # Special case where a link value is inlined into the link label,
             # as in the first line of the base README.md file.  Bail on such lines.
             trace("WARNING: Found nested link label expressions.")
             return
         else :
             report_error("Found link split across multiple lines.  We can't process this.")

     for linkitem in links :
         parsed = re.search(r'(\[[\s`]*)([^\]]*[^\s`\]])([\s`]*\]\([\s]*)([^\s]+)([\s]*\))', linkitem)
         if parsed is None :
             # Happens e.g. for a label made only of blanks/backticks, or an href followed by a
             # quoted title.  Fail with a clear message rather than an AttributeError on None.
             report_error("Unable to separate link into label and href: " + linkitem)
         pieces = parsed.groups()
         trace("Link: " + linkitem)
         trace("Pieces: " + " ".join( (pieces[0],pieces[1],pieces[2],pieces[3],pieces[4]) ))
         labeltext = pieces[1]
         href = pieces[3]
         trace("Extracted labeltext is: " + labeltext)
         trace("Extracted href is: " + href)
         if re.search(r'^http|\?', href) :
             # Don't rewrite absolute or parameterized URLs; neither is native to this markdown book.
             trace("skipping absolute or parameterized URL")
             continue

         # Rewrite implicit index references to explicit, so the book will work as well
         # with 'file:///' preview as with a real web server.
         # We are only concerned with file path names here, so split at '#' if present.
         num_sharps = href.count("#")
         if (num_sharps >= 2) :
             report_error("Multiple #'s in a single link href.")
         elif (num_sharps == 1) :
             # Implicit index references are directory names, which seldom have a filetype suffix.
             # On the other hand, explicit file references must have filetype, else the browser
             # won't know what to do with it.  So if no filetype extension, assume is a directory
             # and add 'index.html'.  Skip if this is an intra-document link.
             if not re.search(r'^#|\.[^/#]+#', href) :
                 if not href.count("/#") :
                     href = re.sub(r'#', "/#", href, 1)
                 href = re.sub(r'/#', "/index.html#", href, 1)

             # Fix up '.md' references.
             href = re.sub(r'^README\.md#', "index.html#", href)
             href = re.sub(r'/README\.md#', "/index.html#", href)
             href = re.sub(r'\.md#', ".html#", href)

         else :  # num_sharps == 0
             # Same logic as above, just at $ instead of #.
             if not re.search(r'\.[^/]+$', href) :
                 if not href.endswith("/") :
                     href = href + "/"
                 href = re.sub(r'/$', "/index.html", href)

             # Fix up '.md' references.
             href = re.sub(r'^README\.md$', "index.html", href)
             href = re.sub(r'/README\.md$', "/index.html", href)
             href = re.sub(r'\.md$', ".html", href)

         trace("After .md fixup, href is: " + href)

         # Re-write named anchors referring to generated tags.
         sharp = href.find("#")
         if (sharp >= 0) :
             named_anchor = href[sharp+1 : ]
             trace('named_anchor = "' + named_anchor + '"')
             trace('labeltext = "' + labeltext + '"')
             scratch = labeltext.lower()                  # Github-MD forces all anchors to lowercase
             scratch = re.sub(r'[\s]', "-", scratch)      # convert whitespace to "-"
             scratch = re.sub(EXCLUDED_CHARS_REGEX_GHM, "", scratch)  # strip non-alphanumerics
             if (scratch == named_anchor) :
                 trace("Found a rewritable case")
                 scratch = labeltext                      # Doxia-markdown doesn't change case
                 scratch = re.sub(r'[\s]', "_", scratch)  # convert whitespace to "_"
                 scratch = re.sub(EXCLUDED_CHARS_REGEX_DOX, "", scratch)  # strip non-alphanumerics except "."
                 # The href holds exactly one '#', so the anchor is everything after it.
                 # Splice by position; the anchor is plain text, not a regular expression.
                 href = href[ : sharp+1] + scratch

         trace("After anchor rewrite, href is: " + href)

         # Now swap out the bad href for the fixed one in inputline.
         if (href != pieces[3]) :
             # Assemble the full link string to prevent similar substrings (to href) in different contexts being substituted.
             scratch = pieces[0] + pieces[1] + pieces[2] + href + pieces[4]
             trace("Fixed link text is: " + scratch)
             trace("linkitem is still:  " + linkitem)
             k = inputline.find(linkitem)
             inputline = inputline[ : k] + scratch + inputline[ k + len(linkitem) : ]
             trace("Fixed inputline is: " + inputline)


 ################################################
 # begin state machine

 # Script-wide state shared with the functions defined earlier in this file:
 #   BLANKS    - run of blanks, sliced to build a standardized indent
 #   TRACE     - set to a true value to enable trace() output on stderr
 #   FNR       - number of the input line being processed, counted from 1 (-1 before any file is opened)
 #   inputline - text of the current line; the helper functions rewrite it in place
 BLANKS = "                                                                                    "
 TRACE = 0
 FNR = -1
 trace("Starting trace")

 # Note: order of the below 'if' clauses is critically important for the state machine.
 # Don't change the order.

 if len(sys.argv) <= 1 :
     # No file or line context exists yet, so write the usage error directly
     # rather than through report_error().
     sys.stderr.write("ERROR: Please provide names of files to be processed, as command line arguments.\n")
     sys.exit(1)

 for FILENAME in sys.argv[1:] :
     # Start every file from a clean state, so that an unterminated paragraph or a deep
     # indent at the end of one file cannot influence how the next file is parsed.

     # Github uses relative indents, but doxia wants only and exactly multiples of 4.
     # To turn the more forgiving into more regular, we must track both logical and actual indents.
     indent_stack.init_indent()

     # Paragraph type can be none, text, bullet, code, or heading.
     # Note 'current_type()' used in managing the logical indent level on the indent stack,
     # and 'active_type' used in the pattern recognition state machine, are deliberately different.
     active_type = "none"

     FNR = 0
     H1_COUNT = 0
     # The rewritten text goes to FILENAME.tmp, which replaces the original once the file is done.
     with open(FILENAME, 'r') as infile, open(FILENAME + ".tmp", 'w') as outfile :
         for inputline in infile :
             FNR += 1
             inputline = inputline.rstrip("\n")

             if '](' in inputline :
                 # Detect lines with hyperlinks in them, and re-write them if necessary and possible.
                 # This is the only fall-through block, and we put it at the very beginning.
                 rewrite_relative_links()  # in inputline
                 # Fall through for further processing.

             if (active_type == "code") and ("```" not in inputline) :
                 trace("in codeblock, regular line")
                 # what happens in the codeblock, stays in the codeblock
                 # Put this case first (after link detection), so we don't have to test it in all the other cases.
                 outfile.write(inputline + "\n")
                 continue

             if (active_type == "code") and ("```" in inputline) :
                 trace("in codeblock, end delimiter line")
                 # detect end of codeblock
                 # This must be the second case.
                 if re.search(r'```[\s]*[^\s]', inputline) :
                     # If there's text following the end-``` on the same line, error out and fix it in the source file.
                     report_error("Text following codeblock end delimiter (```) on same line.")

                 if re.search(r'```.*```', inputline) :
                     # If there are two sets of triple-ticks on the same line, that's a problem too.
                     report_error("Two sets of codeblock delimiters (```) on same line.")

                 active_type = "none"
                 # Force the indenting of the end-``` to match the beginning. This fixes problem #4.
                 inputline = re.sub(r'^[\s]*', BLANKS[0 : 4*indent_stack.logical_indent_level()], inputline)
                 outfile.write(inputline + "\n")
                 continue

             if (active_type != "code") and ("```" in inputline) :
                 trace("start codeblock, delimiter line")
                 # detect start of codeblock
                 if re.search(r'[^\s][\s]*```', inputline) :
                     # If there's text preceding the begin-``` on the same line, error out and fix it in the source file.
                     report_error("Text preceding codeblock start delimiter (```) on same line.")

                 if re.search(r'```.*```', inputline) :
                     # If there are two sets of triple-ticks on the same line, that's a problem too.
                     report_error("Two sets of codeblock delimiters (```) on same line.")

                 if active_type == "text" or active_type == "bullet" :
                     outfile.write("\n")   # Need preceding blank line before codeblock, in doxia.

                 active_type = "code"
                 fix_prefix_blanks(active_type)  # in inputline
                 outfile.write(inputline + "\n")
                 continue

             if re.search(r'^[\s]*$', inputline) :
                 trace("blank line")
                 # detect blank lines
                 active_type = "none"
                 outfile.write(inputline + "\n")  # Perhaps this should write an empty line instead?
                 continue

             if re.search(r'^[\s]*([*+-]|[\d]+\.)[\s]', inputline) :
                 trace("bullet line")
                 # detect bullet line (numbered or not)
                 if (active_type == "text") :
                     outfile.write("\n")  # Need preceding blank line between text and bullet, in doxia. This fixes problem #1.

                 active_type = "bullet"
                 fix_prefix_blanks(active_type)  # in inputline
                 outfile.write(inputline + "\n")
                 continue

             if inputline.startswith("#") :
                 trace("header line")
                 # detects header lines, which are self-delimiting, and cannot have indenting
                 # Header line resets the indenting as well as current type
                 active_type = "none"
                 indent_stack.init_indent()
                 if re.search(r'^#[^#]', inputline) :
                     # First-level headers ("H1") need explicit anchor inserted (Doxia style).  This fixes problem #6.
                     anchor_name = re.sub(r' ', "_", inputline[1:].strip())
                     anchor_name = re.sub(EXCLUDED_CHARS_REGEX_DOX, "", anchor_name)
                     anchor_text = '<a name="' + anchor_name + '"></a>'
                     if H1_COUNT == 0 :
                         # Treat the first header differently - put the header after instead of before
                         # This is necessary to preserve document metadata titling in generated html.
                         # However, it means the title itself gets hidden above the top of window, when the link is used.
                         H1_COUNT = 1
                         outfile.write(inputline + "\n")
                         outfile.write(anchor_text + "\n")
                         outfile.write("\n")  # Anchors aren't self-delimiting, so insert a blank line after.
                     else :
                         outfile.write("\n")  # Anchors aren't self-delimiting, so insert a blank line first.
                         outfile.write(anchor_text + "\n")
                         outfile.write(inputline + "\n")
                 else :
                     # H2 or deeper level of header, doxia auto-generates anchor.
                     outfile.write(inputline + "\n")
                 continue

             if re.search(r'^[\s]*#', inputline) :
                 trace("header line, bad")
                 report_error("Header specification character (#) detected with indenting.  This is presumed to be an error, since it will render as text. If intentional, put a period or other printable character before it.")

             ## default action -- last case in state machine switch
             trace("text line")
             # Everything else is text-like, and therefore continues active_type, unless none.
             if (active_type == "none") :
                 # Start new text paragraph.
                 active_type = "text"
                 fix_prefix_blanks(active_type)  # in inputline
                 outfile.write(inputline + "\n")
                 continue
             else :
                 # This is just a continuation of current text or bullet.
                 # Indenting is irrelevant.
                 outfile.write(inputline + "\n")
                 continue

         ## end loop on inputlines
         if (active_type == "code") :
             report_error("Unmatched codeblock delimiter (```) detected.")

     # Both files are closed at this point; replace the original with the rewritten text.
     os.rename(FILENAME + ".tmp", FILENAME)

 ## end loop on FILENAMEs
 trace("ending trace")
	#########################################################################
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#########################################################################


	## Markdown has these types of paragraph: heading, text, list item (bullet or numbered),
	## codeblock, table, and block quote.
	##
	## This script fixes up differences in Markdown dialect, between Github-MD and doxia-markdown.
	## Specifically, it fixes these problems:
	## 1. In Github-MD, bullets and codeblock starts are self-delimiting. In doxia-markdown, they
	## must be separated from preceding text or (in the case of codeblocks) bullets, by a blank line.
	## Failure to do so causes the bullet or codeblock delimiter to be interpreted as ordinary text,
	## and the content gets munched into the preceding paragraph. The codeblock delimiter (```) as text
	## gets interpreted as a codephrase delimiter (`) plus a preceding or following empty codephrase (``).
	## 2. Github-MD is liberal in regard to what an 'indent' is, allowing 1, 2, 4, or 8 blanks, or
	## a tab. We mostly use 2 blanks. Doxia-markdown requires strictly 4 spaces or a tab. Failure
	## to adhere to this requirement causes indents to be ignored or misinterpreted, leading again to
	## paragraph munching and delimiter ignoring.
	## 3. In Doxia-markdown, if you indent below a header or text paragraph, it is interpreted as
	## an implicit codeblock start. In Github-MD, we only start codeblocks with the explicit
	## codeblock delimiter (```) and sometimes indent below text just for visual emphasis, so the
	## doxia-markdown interpretation is unwelcome. Thus, in our rewrite, we disallow indenting below
	## text or headers. This may make the text less pretty than the Github-MD presentation, but it
	## avoids the incorrect codeblocking.
	## 4. In Doxia-markdown, the indent of the end-codeblock delimiter must match that of the
	## begin-codeblock delimiter, or it won't be recognized and the codeblock will run on.
	## 5. Relative links need to be re-written. '.md' files need to be changed to '.html', and
	## as best we can we will re-write named anchors referring to tags autogenerated from headers.
	## The problem with generated tags is that Github-MD forces header text to lower-case, and
	## replaces blank spaces with hyphens, while doxia-markdown leaves case unchanged, and replaces
	## blanks with underscores. Fortunately we seem to have a culture of using link references that
	## are typographically the same as the header text, so we have some basis for fixing most links.
	## 6. H1 headers don't get named anchors generated, unlike H2 and lower headers. Don't know
	## why doxia-markdown has this deficiency, perhaps it assumes H1 will only be used once at the
	## beginning of the doc. We will insert an explicit anchor just before each H1 header to fix
	## this, except for the first H1 of a file, where the anchor is placed just after the header.
	##
	## So far, we're ignoring tables and block quotes.
	##
	## This script also manages the re-writing of named files to *.tmp, then mv to replace the original file.


	import sys
	import os
	import inspect
	import re

	# Regular expressions matching the characters that each Markdown dialect excludes from the
	# anchor text it auto-generates for Headings (GHM = Github-MD, DOX = doxia-markdown).
	# Callers convert whitespace to "-" or "_" before applying these patterns.
	EXCLUDED_CHARS_REGEX_GHM = r'[^\w\-]' # all non-alphanumerics except "-" and "_".
	EXCLUDED_CHARS_REGEX_DOX = r'[^\w\.\-]' # all non-alphanumerics except "-", "_", and ".".

	def report_error(s) :
	    """Print an error message with its input context to stderr, then exit with status 1.

	    s is the human-readable description of the problem.  The current line number (FNR),
	    file name (FILENAME) and line text (inputline) are taken from the module globals.
	    They are looked up defensively, because this function may be called before any
	    input file has been opened, when FILENAME and inputline do not exist yet.
	    """
	    context = globals()
	    # sys.stderr.write() behaves the same under Python 2 and Python 3, unlike 'print >>'.
	    sys.stderr.write("ERROR: " + s + "\n")
	    sys.stderr.write("on line: " + str(context.get("FNR", -1)) +
	                     " in file: " + str(context.get("FILENAME", "(none)")) + "\n")
	    sys.stderr.write(str(context.get("inputline", "")) + "\n")
	    # sys.exit() is always available; the bare exit() helper is only installed by the site module.
	    sys.exit(1)


	def trace(msg) :
	    """Write a debugging message to stderr when the module-level TRACE flag is true.

	    The message is prefixed with the name of the calling function (taken from the
	    caller's stack frame) and the current input line number FNR.
	    """
	    if not TRACE :
	        return
	    caller = inspect.currentframe().f_back.f_code.co_name
	    # sys.stderr.write() behaves the same under Python 2 and Python 3, unlike 'print >>'.
	    sys.stderr.write("TRACE: " + caller + " : InputLine " + str(FNR) + " : " + msg + "\n")

	class INDENT_STACK :
	    """Stack of indent frames maintained during doc parsing.

	    Each frame is a dict holding the 'physical' indent (the column count found in the
	    input), the 'logical' indent (the nesting depth used for output), and the paragraph
	    'type' recorded for that level.  The bottom frame is always the zero-indent frame
	    of type 'none'; it is never popped.
	    """

	    def __init__(self) :
	        self.init_indent()

	    def init_indent(self) :
	        # Throw away whatever was tracked and start over from the zero-indent base frame.
	        self.my_stack = [ dict(physical=0, logical=0, type='none') ]

	    def push_indent(self, n, new_type) :
	        # The logical depth grows only when nesting under a bullet type. This fixes problem #3.
	        depth = self.logical_indent_level()
	        if self.current_type() == "bullet" :
	            depth = depth + 1
	        frame = {'physical' : n, 'logical' : depth, 'type' : new_type}
	        self.my_stack.append(frame)

	    def set_current_type(self, new_type) :
	        # Overwrite the type recorded in the topmost frame.
	        top = self.my_stack[-1]
	        top['type'] = new_type

	    def pop_indent(self) :
	        # Remove the top frame and return its physical indent.
	        # The base frame is kept; asking to pop it yields 0.
	        if len(self.my_stack) <= 1 :
	            return 0
	        removed = self.my_stack.pop()
	        return removed['physical']

	    def current_indent(self) :
	        # Physical indent of the topmost frame.
	        top = self.my_stack[-1]
	        return top['physical']

	    def logical_indent_level(self) :
	        # Logical indent of the topmost frame.
	        top = self.my_stack[-1]
	        return top['logical']

	    def current_type(self) :
	        # Paragraph type of the topmost frame.
	        top = self.my_stack[-1]
	        return top['type']

	    ## End class INDENT_STACK

	# The single indent stack instance shared by the functions below.
	indent_stack = INDENT_STACK()


	def convert_tabs(s) :
	    """Return s with every tab expanded to blanks, using tab stops every 4 columns."""
	    # str.expandtabs does a real column-aware expansion, so blanks followed by tabs still land
	    # on the right tab stop.  If that proves wrong, the fallback is to error on " \t" instead.
	    tab_total = s.count("\t")
	    trace("orig length {0}".format(len(s)) )
	    expanded = s.expandtabs(4)
	    trace("after {0} tab substitutions, end length is {1}".format(tab_total, len(expanded)) )
	    return expanded


	def fix_prefix_blanks(new_type) :
	    """Standardize the indenting (prefix blanks) of the global inputline.  This fixes problem #2.

	    new_type is the paragraph type of the current line.  It is needed because this
	    function both uses and maintains the indent stack.

	    The physical indent of the line (after tab expansion) is compared against the
	    indent stack to arrive at a logical indent level, and the line's prefix is then
	    replaced with exactly 4 blanks per logical level.  Blank lines are filtered out
	    by the caller before this function is invoked.
	    """
	    global inputline
	    # '^[\s]*' also matches the empty string, so this search always succeeds; an
	    # unindented line simply yields an empty prefix.
	    prefix_blanks = re.search(r'^[\s]*', inputline).group()
	    trace("After prefix-blanks match, prefix_blanks is |" + prefix_blanks + "| length is " + str(len(prefix_blanks)) )
	    prefix_blanks = convert_tabs(prefix_blanks)
	    trace("After convert_tabs, prefix_blanks is |" + prefix_blanks + "| length is " + str(len(prefix_blanks)) )

	    # This is the 'physical' indent of the current paragraph, after tab substitution.
	    physical_indent = len(prefix_blanks)

	    # The indent of this paragraph may be > or == to the previous paragraph.  Those are the easy cases.
	    # If the indent is less than previous, is it equal to the indent of the next lower indented object?
	    # Or of a lower yet object?  Or is it intermediate between two lower objects currently in the stack?
	    # The latter case is an anomaly, but there's no enforcement in Github-MD.
	    # The following logic is an empirical reverse engineering, that seems adequate so far.
	    # It basically says, find a prior level of indent that this is not less than, and then pretend that
	    # the objects between it and this object weren't there.

	    trace("current logical_indent_level is {0} and current_indent is {1}".format(
	            indent_stack.logical_indent_level(), indent_stack.current_indent() ))
	    while physical_indent < indent_stack.current_indent() :
	        indent_stack.pop_indent()
	    if physical_indent > indent_stack.current_indent() :
	        indent_stack.push_indent(physical_indent, new_type)
	    else :  # physical_indent == indent_stack.current_indent()
	        indent_stack.set_current_type(new_type)

	    trace(("After evaluating this line's prefix-blanks and prev_type, new logical_indent_level() is {0} " +
	           "and current_indent is {1}").format(indent_stack.logical_indent_level(), indent_stack.current_indent() ))

	    # Now whack off the prefix blanks, and replace with a standardized string of blanks appropriate to
	    # the logical indent level.  The string is built to the exact length needed, so that a very deep
	    # nesting level can never be cut short by the length of a pre-built constant.
	    trace("Orig line is " + inputline)
	    inputline = re.sub(r'^[\s]*', " " * (4 * indent_stack.logical_indent_level()), inputline, 1)
	    trace("New line is  " + inputline)


	def rewrite_relative_links() :
	    """Fix up the relative Markdown links in the global inputline.  This fixes problem #5.

	    For every '[label](href)' link on the line whose href neither starts with 'http'
	    nor contains '?':
	      * an href without a filetype extension is taken to be a directory and gets an
	        explicit 'index.html',
	      * 'README.md' becomes 'index.html' and any other '.md' target becomes '.html',
	      * a named anchor equal to the Github-MD form of the link label (lower-cased,
	        whitespace to '-') is replaced by the doxia-markdown form of that label
	        (case kept, whitespace to '_').

	    Reads and rewrites the global 'inputline'.  Calls report_error(), which exits,
	    when a link appears to be split across lines, when a link cannot be taken apart
	    into label and href, or when an href holds more than one '#'.
	    """
	    global inputline
	    trace("entering with line: " + inputline)
	    num_links = inputline.count("](")
	    links = re.findall(r'\[[^\]]+\]\([^)]+\)', inputline)
	    num_whole_links = len(links)
	    trace("num_links = {0}, num_whole_links = {1}".format(num_links, num_whole_links))
	    if (num_links != num_whole_links) :
	        if re.search(r'\[[^\][!]*\![\s]*\[', inputline) :
	            # Nested link label expressions, with '!'.
	            # Special case where a link value is inlined into the link label,
	            # as in the first line of the base README.md file.  Bail on such lines.
	            trace("WARNING: Found nested link label expressions.")
	            return
	        else :
	            report_error("Found link split across multiple lines.  We can't process this.")

	    for linkitem in links :
	        parsed = re.search(r'(\[[\s`]*)([^\]]*[^\s`\]])([\s`]*\]\([\s]*)([^\s]+)([\s]*\))', linkitem)
	        if parsed is None :
	            # Happens e.g. for a label made only of blanks/backticks, or an href followed by a
	            # quoted title.  Fail with a clear message rather than an AttributeError on None.
	            report_error("Unable to separate link into label and href: " + linkitem)
	        pieces = parsed.groups()
	        trace("Link: " + linkitem)
	        trace("Pieces: " + " ".join( (pieces[0],pieces[1],pieces[2],pieces[3],pieces[4]) ))
	        labeltext = pieces[1]
	        href = pieces[3]
	        trace("Extracted labeltext is: " + labeltext)
	        trace("Extracted href is: " + href)
	        if re.search(r'^http|\?', href) :
	            # Don't rewrite absolute or parameterized URLs; neither is native to this markdown book.
	            trace("skipping absolute or parameterized URL")
	            continue

	        # Rewrite implicit index references to explicit, so the book will work as well
	        # with 'file:///' preview as with a real web server.
	        # We are only concerned with file path names here, so split at '#' if present.
	        num_sharps = href.count("#")
	        if (num_sharps >= 2) :
	            report_error("Multiple #'s in a single link href.")
	        elif (num_sharps == 1) :
	            # Implicit index references are directory names, which seldom have a filetype suffix.
	            # On the other hand, explicit file references must have filetype, else the browser
	            # won't know what to do with it.  So if no filetype extension, assume is a directory
	            # and add 'index.html'.  Skip if this is an intra-document link.
	            if not re.search(r'^#|\.[^/#]+#', href) :
	                if not href.count("/#") :
	                    href = re.sub(r'#', "/#", href, 1)
	                href = re.sub(r'/#', "/index.html#", href, 1)

	            # Fix up '.md' references.
	            href = re.sub(r'^README\.md#', "index.html#", href)
	            href = re.sub(r'/README\.md#', "/index.html#", href)
	            href = re.sub(r'\.md#', ".html#", href)

	        else :  # num_sharps == 0
	            # Same logic as above, just at $ instead of #.
	            if not re.search(r'\.[^/]+$', href) :
	                if not href.endswith("/") :
	                    href = href + "/"
	                href = re.sub(r'/$', "/index.html", href)

	            # Fix up '.md' references.
	            href = re.sub(r'^README\.md$', "index.html", href)
	            href = re.sub(r'/README\.md$', "/index.html", href)
	            href = re.sub(r'\.md$', ".html", href)

	        trace("After .md fixup, href is: " + href)

	        # Re-write named anchors referring to generated tags.
	        sharp = href.find("#")
	        if (sharp >= 0) :
	            named_anchor = href[sharp+1 : ]
	            trace('named_anchor = "' + named_anchor + '"')
	            trace('labeltext = "' + labeltext + '"')
	            scratch = labeltext.lower()                  # Github-MD forces all anchors to lowercase
	            scratch = re.sub(r'[\s]', "-", scratch)      # convert whitespace to "-"
	            scratch = re.sub(EXCLUDED_CHARS_REGEX_GHM, "", scratch)  # strip non-alphanumerics
	            if (scratch == named_anchor) :
	                trace("Found a rewritable case")
	                scratch = labeltext                      # Doxia-markdown doesn't change case
	                scratch = re.sub(r'[\s]', "_", scratch)  # convert whitespace to "_"
	                scratch = re.sub(EXCLUDED_CHARS_REGEX_DOX, "", scratch)  # strip non-alphanumerics except "."
	                # The href holds exactly one '#', so the anchor is everything after it.
	                # Splice by position; the anchor is plain text, not a regular expression.
	                href = href[ : sharp+1] + scratch

	        trace("After anchor rewrite, href is: " + href)

	        # Now swap out the bad href for the fixed one in inputline.
	        if (href != pieces[3]) :
	            # Assemble the full link string to prevent similar substrings (to href) in different contexts being substituted.
	            scratch = pieces[0] + pieces[1] + pieces[2] + href + pieces[4]
	            trace("Fixed link text is: " + scratch)
	            trace("linkitem is still:  " + linkitem)
	            k = inputline.find(linkitem)
	            inputline = inputline[ : k] + scratch + inputline[ k + len(linkitem) : ]
	            trace("Fixed inputline is: " + inputline)



	################################################
# begin state machine
#
# Script driver.  For every file named on the command line: read it line by line,
# classify each line (codeblock line, codeblock delimiter, blank, bullet, header, or text),
# apply the dialect fix-ups, write the result to "<file>.tmp", and finally rename the
# temporary file over the original.
#
# The names assigned here (inputline, active_type, BLANKS, TRACE, FNR, FILENAME, H1_COUNT,
# infile, outfile) are deliberately kept at module level, exactly as before.
# NOTE(review): helper functions defined elsewhere in this file presumably read some of
# them as globals -- confirm before moving this code into a function.

global inputline, active_type
# NOTE(review): BLANKS holds a single blank here, so slicing it cannot yield a multi-column
# indent.  The end-of-codeblock fix-up below builds its indent string directly instead of
# slicing BLANKS.  Verify how other helpers in this file use BLANKS.
BLANKS = " "
TRACE = 0
FNR = -1
trace("Starting trace")

# Github uses relative indents, but doxia wants only and exactly multiples of 4.
# To turn the more forgiving into more regular, we must track both logical and actual indents.
indent_stack.init_indent()

# Paragraph type can be none, text, bullet, code, or heading.
# Note 'current_type()' used in managing the logical indent level on the indent stack,
# and 'active_type' used in the pattern recognition state machine, are deliberately different.
active_type = "none"

# Note: order of the below 'if' clauses is critically important for the state machine.
# Don't change the order.

if len(sys.argv) <= 1:
    report_error("Please provide names of files to be processed, as command line arguments.")

for FILENAME in sys.argv[1:]:
    # Context managers guarantee both files are closed even if report_error() aborts
    # processing part-way through a file.
    with open(FILENAME, 'r') as infile:
        with open(FILENAME + ".tmp", 'w') as outfile:
            FNR = 0
            H1_COUNT = 0
            for inputline in infile:
                FNR += 1
                inputline = inputline.rstrip("\n")

                if '](' in inputline:
                    # Detect lines with hyperlinks in them, and re-write them if necessary and possible.
                    # This is the only fall-through block, and we put it at the very beginning.
                    rewrite_relative_links()  # in inputline
                    # Fall through for further processing.

                if active_type == "code":
                    if "```" not in inputline:
                        trace("in codeblock, regular line")
                        # what happens in the codeblock, stays in the codeblock
                        # Put this case first (after link detection), so we don't have to
                        # test it in all the other cases.
                        outfile.write(inputline + "\n")
                        continue

                    trace("in codeblock, end delimiter line")
                    # detect end of codeblock
                    # This must be the second case.
                    if re.search(r'```[\s]*[^\s]', inputline):
                        # If there's text following the end-``` on the same line, error out and fix it in the source file.
                        report_error("Text following codeblock end delimiter (```) on same line.")

                    if re.search(r'```.*```', inputline):
                        # If there are two sets of triple-ticks on the same line, that's a problem too.
                        report_error("Two sets of codeblock delimiters (```) on same line.")

                    active_type = "none"
                    # Force the indenting of the end-``` to match the beginning. This fixes problem #4.
                    # Replace ALL leading whitespace (possibly none) with exactly
                    # 4 blanks per logical indent level.
                    indent = " " * (4 * indent_stack.logical_indent_level())
                    inputline = re.sub(r'^[\s]*', indent, inputline)
                    outfile.write(inputline + "\n")
                    continue

                if "```" in inputline:
                    # Not currently in a codeblock (handled above), so this starts one.
                    trace("start codeblock, delimiter line")
                    # detect start of codeblock
                    if re.search(r'[^\s][\s]*```', inputline):
                        # If there's text preceding the begin-``` on the same line, error out and fix it in the source file.
                        report_error("Text preceding codeblock start delimiter (```) on same line.")

                    if re.search(r'```.*```', inputline):
                        # If there are two sets of triple-ticks on the same line, that's a problem too.
                        report_error("Two sets of codeblock delimiters (```) on same line.")

                    if active_type in ("text", "bullet"):
                        outfile.write("\n")  # Need preceding blank line before codeblock, in doxia.

                    active_type = "code"
                    fix_prefix_blanks(active_type)  # in inputline
                    outfile.write(inputline + "\n")
                    continue

                if re.search(r'^[\s]*$', inputline):
                    trace("blank line")
                    # detect blank lines
                    active_type = "none"
                    outfile.write(inputline + "\n")  # Perhaps this should write "" instead?
                    continue

                # A bullet line is optional indenting, then either a bullet mark (*, + or -)
                # or a number followed by '.', then at least one whitespace character.
                if re.search(r'^[\s]*([*+-]|[\d]+\.)[\s]', inputline):
                    trace("bullet line")
                    # detect bullet line (numbered or not)
                    if active_type == "text":
                        # Need preceding blank line between text and bullet, in doxia. This fixes problem #1.
                        outfile.write("\n")

                    active_type = "bullet"
                    fix_prefix_blanks(active_type)  # in inputline
                    outfile.write(inputline + "\n")
                    continue

                if inputline.startswith("#"):
                    trace("header line")
                    # detects header lines, which are self-delimiting, and cannot have indenting
                    # Header line resets the indenting as well as current type
                    active_type = "none"
                    indent_stack.init_indent()
                    if re.search(r'^#[^#]', inputline):
                        # First-level headers ("H1") need explicit anchor inserted (Doxia style). This fixes problem #6.
                        anchor_name = re.sub(r' ', "_", inputline[1:].strip())
                        anchor_name = re.sub(EXCLUDED_CHARS_REGEX_DOX, "", anchor_name)
                        anchor_text = '<a name="' + anchor_name + '"></a>'
                        if H1_COUNT == 0:
                            # Treat the first header differently - put the header after instead of before
                            # This is necessary to preserve document metadata titling in generated html.
                            # However, it means the title itself gets hidden above the top of window, when the link is used.
                            H1_COUNT = 1
                            outfile.write(inputline + "\n")
                            outfile.write(anchor_text + "\n")
                            outfile.write("\n")  # Anchors aren't self-delimiting, so insert a blank line after.
                        else:
                            outfile.write("\n")  # Anchors aren't self-delimiting, so insert a blank line first.
                            outfile.write(anchor_text + "\n")
                            outfile.write(inputline + "\n")
                    else:
                        # H2 or deeper level of header, doxia auto-generates anchor.
                        outfile.write(inputline + "\n")
                    continue

                if re.search(r'^[\s]*#', inputline):
                    trace("header line, bad")
                    report_error("Header specification character (#) detected with indenting. This is presumed to be an error, since it will render as text. If intentional, put a period or other printable character before it.")

                ## default action -- last case in state machine switch
                trace("text line")
                # Everything else is text-like, and therefore continues active_type, unless none.
                if active_type == "none":
                    # Start new text paragraph.
                    active_type = "text"
                    fix_prefix_blanks(active_type)  # in inputline
                # Otherwise this is just a continuation of current text or bullet,
                # and indenting is irrelevant.  Either way the line is written out.
                outfile.write(inputline + "\n")

            ## end loop on inputlines
            if active_type == "code":
                report_error("Unmatched codeblock delimiter (```) detected.")

    # Both files are closed at this point; replace the original with the fixed copy.
    os.rename(FILENAME + ".tmp", FILENAME)

## end loop on FILENAMEs
trace("ending trace")