|  | #!/usr/bin/env python | 
|  |  | 
|  | """\ | 
|  | This tool regenerates and replaces the ToC in an HTML file from the actual | 
|  | structure of <div>s and <h[2345]>s present in the body of the document. | 
|  | The section to be overwritten is identified as the XML subtree | 
|  | rooted at <ol id="toc">. | 
|  |  | 
|  | Usage: ./toctool.py filename... | 
|  | """ | 
|  |  | 
|  | import sys | 
|  | import os | 
|  | import xml.parsers.expat | 
|  |  | 
|  |  | 
class Index:
    """In-memory table of contents built up one nesting level at a time.

    `tree` is a list of nodes.  Each node is itself a list whose first
    element is an (id, title) tuple and whose remaining elements are the
    node's child nodes.  `title` holds the document title (None until
    somebody assigns it).
    """

    def __init__(self):
        self.title = None
        self.tree = []
        # Stack of the lists currently being filled in; the last entry is
        # the list that receives the next new level.
        self._ptr_stack = [self.tree]

    def addLevel(self, id, title):
        """Open a new level (id, title) beneath the current one and enter it."""
        entry = [(id, title)]
        parent = self._ptr_stack[-1]
        parent.append(entry)
        self._ptr_stack.append(entry)

    def upLevel(self):
        """Leave the current level and return to its parent."""
        self._ptr_stack.pop()

    def prettyString(self):
        """Return the tree as indented text, one (id, title) tuple per line."""
        def walk(depth, node):
            # Leaves are the (id, title) tuples; everything else is a list.
            if not isinstance(node, list):
                return ["%s%s" % ("  " * depth, node)]
            lines = []
            for child in node:
                lines.extend(walk(depth + 1, child))
            return lines

        # Starting at -2 puts the top-level entries at indentation zero: one
        # step for the tree list itself and one for each node's own list.
        return "\n".join(walk(-2, self.tree))

    def renderXML(self):
        """Return the tree rendered as a nested <ol id="toc"> element."""
        def render(depth, node):
            pad = '  ' * depth
            link = '%s<li><a href="#%s">%s</a>' % (pad, node[0][0], node[0][1])
            children = node[1:]
            if not children:
                return [link + '</li>']
            lines = [link, '%s<ol>' % pad]
            for child in children:
                lines.extend(render(depth + 1, child))
            lines.append('%s</ol>' % pad)
            # The trailing comment names the entry that this </li> closes.
            lines.append('%s</li> <!-- %s -->' % (pad, node[0][0]))
            return lines

        lines = ['<ol id="toc">']
        for node in self.tree:
            lines.extend(render(1, node))
        lines.append('</ol>')
        return "\n".join(lines)
|  |  | 
|  |  | 
class ExpatParseJob:
    """Base class that runs an expat parse using the subclass's handlers.

    A subclass defines methods named after expat handler attributes (every
    attribute of the instance whose name ends in 'Handler' is installed on
    the parser) and must set self._ordered_attributes before parse() is
    called.
    """

    def parse(self, file):
        """Parse FILE, an open file object, with a freshly created parser."""
        p = xml.parsers.expat.ParserCreate()
        p.ordered_attributes = self._ordered_attributes
        try:
            # Ask for byte strings rather than unicode objects.
            p.returns_unicode = False
        except AttributeError:
            # Parsers without this attribute reject the assignment; carry
            # on with whatever string type the parser delivers.
            pass
        p.specified_attributes = True
        for name in dir(self):
            if name.endswith('Handler'):
                setattr(p, name, getattr(self, name))
        p.ParseFile(file)
|  |  | 
|  |  | 
class IndexBuildParse(ExpatParseJob):
    """First pass over the document: build an Index from its headings.

    A <div> whose class attribute is one of the names in `keys` opens a new
    ToC level.  The text of the next element with that same name (for
    example the <h2> following <div class="h2">) becomes the entry's title
    and the <div>'s id attribute becomes its link target.  The text of
    <title> is stored in index.title.
    """

    # Used as a set: only the keys matter.
    keys = {'h2':None, 'h3':None, 'h4':None, 'h5':None}

    def __init__(self):
        self.index = Index()
        self.keyptr = 0  # not read anywhere in this class
        # False, or the name of the element whose text is being collected.
        self.collecting_text = False
        self.text = ''
        # Name of the heading element expected after a matching <div>.
        self.waiting_for_elt = None
        self.saved_id = None
        # Stack of (element name, closes_a_toc_level) for open elements.
        self.elt_stack = []
        self._ordered_attributes = False

    def StartElementHandler(self, name, attrs):
        """Track open elements and note where a title or heading begins."""
        if name == 'div':
            cl = attrs.get('class')
            if cl in self.keys:
                self.waiting_for_elt = cl
                self.saved_id = attrs.get('id')
                self.elt_stack.append((name, True))
                return
        elif name == 'title':
            self.collecting_text = name
            self.text = ''
        elif name == self.waiting_for_elt:
            self.waiting_for_elt = None
            self.collecting_text = name
            self.text = ''
        self.elt_stack.append((name, False))

    def EndElementHandler(self, name):
        """Record collected text and close a ToC level when its <div> ends.

        Raises RuntimeError if some other element ends while the text of a
        title or heading is still being collected.
        """
        if self.collecting_text:
            if name != self.collecting_text:
                raise RuntimeError(
                    'unexpected </%s> while collecting the text of <%s>; '
                    'markup nested inside titles and headings is not '
                    'supported' % (name, self.collecting_text))
            if name == 'title':
                self.index.title = self.text
            else:
                self.index.addLevel(self.saved_id, self.text)
                self.saved_id = None
            self.collecting_text = False
        eltinfo = self.elt_stack.pop()
        assert eltinfo[0] == name
        if eltinfo[1]:
            self.index.upLevel()

    def DefaultHandler(self, data):
        """Accumulate DATA while a title or heading is being collected."""
        if self.collecting_text:
            self.text += data
|  |  | 
|  |  | 
def attrlist_to_dict(l):
    """Convert a flat [name, value, name, value, ...] list into a dict.

    This is the shape in which expat delivers attributes when
    ordered_attributes is enabled.  If a name occurs more than once the
    last value wins.  A trailing name without a value is ignored.
    """
    # Pair the even-indexed names with the odd-indexed values.
    return dict(zip(l[::2], l[1::2]))
|  |  | 
|  |  | 
def escape_entities(s):
    """Return S with XML markup characters replaced by entity references.

    '&' is replaced first so that the ampersands introduced by the other
    replacements are not escaped a second time.  The double quote is
    escaped as well because the result is written inside double-quoted
    attribute values.
    """
    return (s.replace('&', '&amp;')
             .replace('<', '&lt;')
             .replace('>', '&gt;')
             .replace('"', '&quot;'))
|  |  | 
|  |  | 
class IndexInsertParse(ExpatParseJob):
    """Second pass: copy the document to OUTFP with a regenerated ToC.

    Every element is re-serialized as it is parsed, except the subtree
    rooted at <ol id="toc">, which is replaced by index.renderXML().
    Output is buffered one line at a time as (token, tag_name) pairs so
    that long runs of tags can be wrapped when the line is written.
    """

    def __init__(self, index, outfp):
        self._ordered_attributes = True
        self.index = index
        self.outfp = outfp
        # Stack of (element name, is_toc_root) for the open elements.
        self.elt_stack = []
        self.skipping_toc = False

        # Tokens of the output line currently being assembled.
        self._line_in_progress = []
        # Name of a start tag whose closing '>' or ' />' is still pending.
        self._element_open = None
        self.linepos = 0    # not read anywhere in this class
        self.indentpos = 0  # not read anywhere in this class

        # The three dicts below are used as sets: only the keys matter.
        self.do_not_minimize = {'script':None}
        self.do_not_indent = {'div':None, 'a':None, 'strong':None, 'em':None}
        self.do_not_wrap = {'div':None, 'strong':None, 'em':None, 'li':None}

        if self.index.title == 'Subversion Design':
            self.do_not_wrap['a'] = None

    def parse(self, file):
        """Parse FILE, then write out any line left without a newline."""
        ExpatParseJob.parse(self, file)
        # Lines are normally written when a newline arrives; without this
        # the tokens of a final unterminated line would be dropped.
        if self._line_in_progress:
            self.done_line()

    def put_token(self, token, tag_name):
        """Queue TOKEN on the current line.

        TAG_NAME is the element the token belongs to, or None for data
        that is not part of a tag.
        """
        self._line_in_progress.append((token, tag_name))

    def _flush_tokens(self, end_of_line):
        """Write the queued tokens, wrapping between tags past column 79.

        A break is inserted only between two consecutive tag tokens, and
        never before a token of an element listed in do_not_wrap.  The
        continuation is indented by two spaces unless the element is listed
        in do_not_indent.  A newline is appended when END_OF_LINE is true.
        """
        linepos = 0
        last_was_tag = False
        outq = []
        for token, tag_name in self._line_in_progress:
            is_tag = tag_name is not None and tag_name not in self.do_not_wrap
            no_indent_if_wrap = tag_name in self.do_not_indent
            linepos += len(token)
            if linepos > 79 and is_tag and last_was_tag:
                token = token.lstrip(' ')
                if no_indent_if_wrap:
                    linepos = len(token)
                    outq.append('\n')
                else:
                    linepos = len(token) + 2
                    outq.append('\n  ')
            outq.append(token)
            last_was_tag = is_tag
        if end_of_line:
            outq.append('\n')
        self.outfp.writelines(outq)
        del self._line_in_progress[:]

    def done_line(self):
        """Write the current line followed by a newline and start a new one."""
        self._flush_tokens(True)

    def _finish_pending(self, minimized_form):
        """Close the start tag that is still open, if there is one.

        Returns True only when the tag was closed in the minimized ' />'
        form, in which case the caller must not emit an end tag.
        """
        if self._element_open is None:
            return False
        name = self._element_open
        self._element_open = None
        if minimized_form:
            self.put_token(' />', name)
            return True
        self.put_token('>', name)
        return False

    def StartElementHandler(self, name, attrs):
        """Queue a start tag, or emit the new ToC at <ol id="toc">."""
        self._finish_pending(False)
        if name == 'ol' and attrlist_to_dict(attrs).get('id') == 'toc':
            # The ToC bypasses the line buffer, so write whatever is
            # already queued first; otherwise text that precedes the ToC on
            # the same line would come out after it.
            self._flush_tokens(False)
            self.outfp.write(self.index.renderXML())
            self.skipping_toc = True
            self.elt_stack.append((name, True))
            return
        if not self.skipping_toc:
            self.put_token("<%s" % name, name)
            # ATTRS is a flat [name, value, name, value, ...] list.
            for i in range(0, len(attrs), 2):
                aval = escape_entities(attrs[i + 1])
                self.put_token(' %s="%s"' % (attrs[i], aval), name)
            self._element_open = name
        self.elt_stack.append((name, False))

    def EndElementHandler(self, name):
        """Queue an end tag; stop skipping when the old ToC's <ol> ends."""
        if not self.skipping_toc:
            # An element with no content is closed as ' />' unless it is
            # listed in do_not_minimize.
            if not self._finish_pending(name not in self.do_not_minimize):
                self.put_token("</%s>" % name, name)
        eltinfo = self.elt_stack.pop()
        assert eltinfo[0] == name
        if eltinfo[1]:
            self.skipping_toc = False

    def DefaultHandler(self, data):
        """Queue DATA that is not an element tag; a lone newline ends a line."""
        if self.skipping_toc:
            return
        self._finish_pending(False)
        # This makes an unsafe assumption that expat will pass '\n' as
        # individual characters to this function.  Seems to work at the
        # moment.  Will almost certainly break later.
        if data == '\n':
            self.done_line()
        else:
            self.put_token(data, None)
|  |  | 
|  |  | 
def process(fn):
    """Regenerate the ToC of the file named FN in place.

    The file is parsed twice: once to build the index and once to write a
    copy with the new ToC to FN + '.new'.  The original is then renamed to
    FN + '.toctool-backup~' and the new file takes its place.  If either
    parse raises, both files are closed and the original is left untouched.
    """
    new_fn = fn + '.new'
    with open(fn, 'r') as infp:
        builder = IndexBuildParse()
        builder.parse(infp)

        # Rewind so the second pass reads the same input from the start.
        infp.seek(0)
        with open(new_fn, 'w') as outfp:
            inserter = IndexInsertParse(builder.index, outfp)
            inserter.parse(infp)

    # Both files are closed by now, so the renames see complete contents.
    os.rename(fn, fn + '.toctool-backup~')
    os.rename(new_fn, fn)
|  |  | 
|  |  | 
def main():
    """Regenerate the ToC of every file named on the command line."""
    filenames = sys.argv[1:]
    for filename in filenames:
        process(filename)


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()