| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import logging |
| import re |
| import sys |
| import json |
| import traceback |
| from urllib import quote, unquote |
| from urlparse import urljoin, urlsplit |
| |
| try: |
| from forgeimporters.base import ProjectExtractor |
| urlopen = ProjectExtractor.urlopen |
| except ImportError: |
| try: |
| from allura.lib.helpers import urlopen |
| except ImportError: |
| from urllib2 import urlopen |
| |
| try: |
    # html2text is optional; it is only needed by the 'html2text' converter
| import html2text |
| except ImportError: |
| pass |
| |
| from BeautifulSoup import BeautifulSoup |
| |
| log = logging.getLogger(__name__) |
| |
| |
| class WikiExporter(object): |
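    """Export the pages of a Trac wiki as JSON.

    The page list is scraped from the wiki's TitleIndex, each page is
    fetched, and its content is converted to Markdown either with the
    html2text package or with plain regex substitutions, depending on
    the configured converter.
    """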
| |
| PAGE_LIST_URL = 'wiki/TitleIndex' |
| PAGE_URL = 'wiki/%s' |
| CONTENT_DIV_ATTRS = {'class': 'wikipage searchable'} |
| EXCLUDE_PAGES = [ |
| 'CamelCase', |
| 'InterMapTxt', |
| 'InterTrac', |
| 'InterWiki', |
| 'PageTemplates', |
| 'SandBox', |
| 'TitleIndex', |
| 'TracAccessibility', |
| 'TracAdmin', |
| 'TracBackup', |
| 'TracBrowser', |
| 'TracChangeset', |
| 'TracEnvironment', |
| 'TracFineGrainedPermissions', |
| 'TracGuide', |
| 'TracImport', |
| 'TracIni', |
| 'TracInterfaceCustomization', |
| 'TracLinks', |
| 'TracLogging', |
| 'TracNavigation', |
| 'TracNotification', |
| 'TracPermissions', |
| 'TracPlugins', |
| 'TracQuery', |
| 'TracReports', |
| 'TracRevisionLog', |
| 'TracRoadmap', |
| 'TracRss', |
| 'TracSearch', |
| 'TracSupport', |
| 'TracSyntaxColoring', |
| 'TracTickets', |
| 'TracTicketsCustomFields', |
| 'TracTimeline', |
| 'TracUnicode', |
| 'TracWiki', |
| 'TracWorkflow', |
| 'WikiDeletePage', |
| 'WikiFormatting', |
| 'WikiHtml', |
| 'WikiMacros', |
| 'WikiNewPage', |
| 'WikiPageNames', |
| 'WikiProcessors', |
| 'WikiRestructuredText', |
| 'WikiRestructuredTextLinks', |
| 'RecentChanges', |
| ] |
| RENAME_PAGES = { |
        'WikiStart': 'Home',  # Trac's start page becomes Allura's 'Home'
        'Home': 'WikiStart',  # swap any existing 'Home' page out of the way
| } |
| |
| def __init__(self, base_url, options): |
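        """`base_url` is the root URL of the Trac instance (it must end
        with a slash so urljoin resolves subpaths correctly); `options`
        must provide `verbose` and `converter` attributes."""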
| self.base_url = base_url |
| self.options = options |
| |
| def export(self, out): |
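        """Fetch every wiki page and write the list to `out` as JSON."""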
| pages = [] |
| for title in self.page_list(): |
| try: |
| pages.append(self.get_page(title)) |
            except Exception:
| self.log('Cannot fetch page %s. Skipping' % title) |
| self.log(traceback.format_exc()) |
| continue |
| out.write(json.dumps(pages, indent=2, sort_keys=True)) |
| out.write('\n') |
| |
| def log(self, msg): |
| log.info(msg) |
| if self.options.verbose: |
| print >>sys.stderr, msg |
| |
| def url(self, suburl, type=None): |
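        """Join `suburl` onto the base URL; if `type` is given, append a
        format=<type> query parameter (Trac serves raw wiki markup for
        format=txt)."""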
| url = urljoin(self.base_url, suburl) |
| if type is None: |
| return url |
| glue = '&' if '?' in suburl else '?' |
| return url + glue + 'format=' + type |
| |
| def fetch(self, url): |
| return urlopen(url) |
| |
| def page_list(self): |
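        """Scrape the page titles from Trac's TitleIndex page, skipping
        the stock help pages listed in EXCLUDE_PAGES."""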
| url = urljoin(self.base_url, self.PAGE_LIST_URL) |
| self.log('Fetching list of pages from %s' % url) |
| r = self.fetch(url) |
        html = BeautifulSoup(r.read())
| pages = html.find('div', attrs=self.CONTENT_DIV_ATTRS) \ |
| .find('ul').findAll('li') |
| pages = [page.find('a').text |
| for page in pages |
| if page.find('a') |
| and page.find('a').text not in self.EXCLUDE_PAGES] |
        # Remove duplicate entries by converting the page list to a set.
        # Since every listed page gets fetched anyway, losing the
        # original order is harmless.
| return set(pages) |
| |
| def get_page(self, title): |
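        """Fetch and convert a single page, returning a dict with the
        converted 'title', Markdown 'text', and empty 'labels'; the
        fetch and conversion strategy is chosen by self.options.converter
        ('html2text' or 'regex')."""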
| title = quote(title) |
| convert_method = '_get_page_' + self.options.converter |
| content = getattr(self, convert_method)(title) |
| page = { |
| 'title': self.convert_title(title), |
| 'text': self.convert_content(content), |
| 'labels': '', |
| } |
| return page |
| |
| def _get_page_html2text(self, title): |
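        """Fetch the rendered HTML page and return its content <div>."""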
| url = self.url(self.PAGE_URL % title) |
| self.log('Fetching page %s' % url) |
| r = self.fetch(url) |
        html = BeautifulSoup(r.read())
| return html.find('div', attrs=self.CONTENT_DIV_ATTRS) |
| |
| def _get_page_regex(self, title): |
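        """Fetch the page as raw Trac wiki markup (format=txt)."""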
| url = self.url(self.PAGE_URL % title, 'txt') |
| self.log('Fetching page %s' % url) |
| r = self.fetch(url) |
        return r.read()
| |
| def convert_title(self, title): |
| title = self.RENAME_PAGES.get(title, title) |
| title = title.replace('/', '-') # Handle subpages |
        title = title.rstrip('?')  # Links to non-existent pages end with '?'
| return title |
| |
| def convert_content(self, content): |
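        """Dispatch to the _convert_content_* method matching the
        configured converter."""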
| convert_method = '_convert_content_' + self.options.converter |
| return getattr(self, convert_method)(content) |
| |
| def _convert_wiki_toc_to_markdown(self, content): |
| """ |
| Removes contents of div.wiki-toc elements and replaces them with |
| the '[TOC]' markdown macro. |
| """ |
| for toc in content('div', attrs={'class': 'wiki-toc'}): |
| toc.string = '[TOC]' |
| return content |
| |
| def _convert_content_html2text(self, content): |
| html2text.BODY_WIDTH = 0 # Don't wrap lines |
| content = self._convert_wiki_toc_to_markdown(content) |
| content = html2text.html2text(unicode(content)) |
| # Convert internal links |
| internal_url = urlsplit(self.base_url).path + 'wiki/' |
        internal_link_re = r'\[([^]]+)\]\(%s([^)]*)\)' % re.escape(internal_url)
| internal_link = re.compile(internal_link_re, re.UNICODE) |
| def sub(match): |
| caption = match.group(1) |
| page = self.convert_title(match.group(2)) |
| if caption == page: |
| link = '[%s]' % unquote(page) |
| else: |
| link = '[%s](%s)' % (caption, page) |
| return link |
| return internal_link.sub(sub, content) |
| |
| def _convert_content_regex(self, text): |
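        """Convert raw Trac wiki markup to Markdown using regex
        substitutions only: code blocks, headings, bullet and numbered
        lists, links, CamelCase escapes, and bold/italic markup. Rougher
        than html2text, but needs no extra packages."""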
| # https://gist.github.com/sgk/1286682 |
| text = re.sub('\r\n', '\n', text) |
| text = re.sub(r'{{{(.*?)}}}', r'`\1`', text) |
| |
        def indent4(m):
            return '\n    ' + m.group(1).replace('\n', '\n    ')
| |
| text = re.sub(r'(?sm){{{\n(.*?)\n}}}', indent4, text) |
| text = re.sub(r'(?m)^====\s+(.*?)\s+====$', r'#### \1', text) |
| text = re.sub(r'(?m)^===\s+(.*?)\s+===$', r'### \1', text) |
| text = re.sub(r'(?m)^==\s+(.*?)\s+==$', r'## \1', text) |
| text = re.sub(r'(?m)^=\s+(.*?)\s+=$', r'# \1', text) |
        text = re.sub(r'(?m)^       \* ', r'**** ', text)
        text = re.sub(r'(?m)^     \* ', r'*** ', text)
        text = re.sub(r'(?m)^   \* ', r'** ', text)
        text = re.sub(r'(?m)^ \* ', r'* ', text)
        text = re.sub(r'(?m)^ \d+\. ', r'1. ', text)
| a = [] |
| for line in text.split('\n'): |
| if not line.startswith(' '): |
| line = re.sub(r'\[(https?://[^\s\[\]]+)\s([^\[\]]+)\]', r'[\2](\1)', line) |
| line = re.sub(r'\[(wiki:[^\s\[\]]+)\s([^\[\]]+)\]', r'[\2](/\1/)', line) |
| line = re.sub(r'\!(([A-Z][a-z0-9]+){2,})', r'\1', line) |
| line = re.sub(r'\'\'\'(.*?)\'\'\'', r'*\1*', line) |
| line = re.sub(r'\'\'(.*?)\'\'', r'_\1_', line) |
| a.append(line) |
| return '\n'.join(a) |
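

if __name__ == '__main__':
    # Example driver: a minimal sketch, not part of the original script.
    # The real importer builds its options from its own command-line
    # parsing; this namedtuple is a hypothetical stand-in exposing the
    # only two attributes WikiExporter reads: `verbose` and `converter`.
    from collections import namedtuple
    Options = namedtuple('Options', ['verbose', 'converter'])
    exporter = WikiExporter('http://trac.example.com/myproject/',
                            Options(verbose=True, converter='regex'))
    exporter.export(sys.stdout)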