# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import logging
import re
import sys
import json
import traceback
from urllib import quote, unquote
from urlparse import urljoin, urlsplit
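# Prefer Allura's urlopen (used by the forge importers) when available;
# fall back to plain urllib2 when running standalone.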
try:
from forgeimporters.base import ProjectExtractor
urlopen = ProjectExtractor.urlopen
except ImportError:
try:
from allura.lib.helpers import urlopen
except ImportError:
from urllib2 import urlopen
try:
    # html2text is optional; without it only the 'regex' converter works
import html2text
except ImportError:
pass
from BeautifulSoup import BeautifulSoup
log = logging.getLogger(__name__)
class WikiExporter(object):
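    """Export the pages of a Trac wiki as a JSON list of dicts with
    'title', 'text' (converted to Markdown), and 'labels' keys.
    """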
PAGE_LIST_URL = 'wiki/TitleIndex'
PAGE_URL = 'wiki/%s'
CONTENT_DIV_ATTRS = {'class': 'wikipage searchable'}
EXCLUDE_PAGES = [
'CamelCase',
'InterMapTxt',
'InterTrac',
'InterWiki',
'PageTemplates',
'SandBox',
'TitleIndex',
'TracAccessibility',
'TracAdmin',
'TracBackup',
'TracBrowser',
'TracChangeset',
'TracEnvironment',
'TracFineGrainedPermissions',
'TracGuide',
'TracImport',
'TracIni',
'TracInterfaceCustomization',
'TracLinks',
'TracLogging',
'TracNavigation',
'TracNotification',
'TracPermissions',
'TracPlugins',
'TracQuery',
'TracReports',
'TracRevisionLog',
'TracRoadmap',
'TracRss',
'TracSearch',
'TracSupport',
'TracSyntaxColoring',
'TracTickets',
'TracTicketsCustomFields',
'TracTimeline',
'TracUnicode',
'TracWiki',
'TracWorkflow',
'WikiDeletePage',
'WikiFormatting',
'WikiHtml',
'WikiMacros',
'WikiNewPage',
'WikiPageNames',
'WikiProcessors',
'WikiRestructuredText',
'WikiRestructuredTextLinks',
'RecentChanges',
]
RENAME_PAGES = {
        'WikiStart': 'Home',  # Trac's front page becomes the wiki's Home page
        'Home': 'WikiStart',  # move an existing Home page aside to avoid a name clash
}
def __init__(self, base_url, options):
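        # `options` must provide at least `verbose` (bool) and `converter`
        # ('html2text' or 'regex'), e.g. an argparse namespace.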
self.base_url = base_url
self.options = options
def export(self, out):
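        """Fetch every wiki page and write the result to `out` as JSON."""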
pages = []
for title in self.page_list():
try:
pages.append(self.get_page(title))
            except Exception:
self.log('Cannot fetch page %s. Skipping' % title)
self.log(traceback.format_exc())
continue
out.write(json.dumps(pages, indent=2, sort_keys=True))
out.write('\n')
def log(self, msg):
log.info(msg)
if self.options.verbose:
print >>sys.stderr, msg
def url(self, suburl, type=None):
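        """Build an absolute URL for `suburl`; if `type` is given, append
        Trac's `format` query parameter (e.g. format=txt for raw markup).
        """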
url = urljoin(self.base_url, suburl)
if type is None:
return url
glue = '&' if '?' in suburl else '?'
return url + glue + 'format=' + type
def fetch(self, url):
return urlopen(url)
def page_list(self):
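        """Scrape page titles from Trac's TitleIndex, skipping Trac's
        built-in documentation pages (EXCLUDE_PAGES).
        """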
url = urljoin(self.base_url, self.PAGE_LIST_URL)
self.log('Fetching list of pages from %s' % url)
r = self.fetch(url)
        html = BeautifulSoup(r.read())
pages = html.find('div', attrs=self.CONTENT_DIV_ATTRS) \
.find('ul').findAll('li')
pages = [page.find('a').text
for page in pages
if page.find('a')
and page.find('a').text not in self.EXCLUDE_PAGES]
# Remove duplicate entries by converting page list to a set.
# As we're going to fetch all listed pages,
# it's safe to destroy the original order of pages.
return set(pages)
def get_page(self, title):
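        """Fetch one page and return a dict with its title, converted text,
        and (empty) labels.
        """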
title = quote(title)
convert_method = '_get_page_' + self.options.converter
content = getattr(self, convert_method)(title)
page = {
'title': self.convert_title(title),
'text': self.convert_content(content),
'labels': '',
}
return page
def _get_page_html2text(self, title):
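        # Fetch the rendered HTML of the page and keep only the content div;
        # it is converted to Markdown later by _convert_content_html2text().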
url = self.url(self.PAGE_URL % title)
self.log('Fetching page %s' % url)
r = self.fetch(url)
        html = BeautifulSoup(r.read())
return html.find('div', attrs=self.CONTENT_DIV_ATTRS)
def _get_page_regex(self, title):
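        # Fetch the page as raw Trac wiki markup (format=txt).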
url = self.url(self.PAGE_URL % title, 'txt')
self.log('Fetching page %s' % url)
r = self.fetch(url)
        return r.read()
def convert_title(self, title):
title = self.RENAME_PAGES.get(title, title)
title = title.replace('/', '-') # Handle subpages
        title = title.rstrip('?')  # Trac marks links to non-existent pages with a trailing '?'
return title
def convert_content(self, content):
convert_method = '_convert_content_' + self.options.converter
return getattr(self, convert_method)(content)
def _convert_wiki_toc_to_markdown(self, content):
"""
Removes contents of div.wiki-toc elements and replaces them with
the '[TOC]' markdown macro.
"""
for toc in content('div', attrs={'class': 'wiki-toc'}):
toc.string = '[TOC]'
return content
def _convert_content_html2text(self, content):
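        # Convert the HTML fragment to Markdown, then rewrite links pointing
        # back into this Trac wiki as internal wiki links.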
html2text.BODY_WIDTH = 0 # Don't wrap lines
content = self._convert_wiki_toc_to_markdown(content)
content = html2text.html2text(unicode(content))
# Convert internal links
internal_url = urlsplit(self.base_url).path + 'wiki/'
internal_link_re = r'\[([^]]+)\]\(%s([^)]*)\)' % internal_url
internal_link = re.compile(internal_link_re, re.UNICODE)
def sub(match):
caption = match.group(1)
page = self.convert_title(match.group(2))
if caption == page:
link = '[%s]' % unquote(page)
else:
link = '[%s](%s)' % (caption, page)
return link
return internal_link.sub(sub, content)
def _convert_content_regex(self, text):
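        # Translate raw Trac wiki markup to Markdown using regex substitutions.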
# https://gist.github.com/sgk/1286682
text = re.sub('\r\n', '\n', text)
text = re.sub(r'{{{(.*?)}}}', r'`\1`', text)
def indent4(m):
            return '\n    ' + m.group(1).replace('\n', '\n    ')
text = re.sub(r'(?sm){{{\n(.*?)\n}}}', indent4, text)
text = re.sub(r'(?m)^====\s+(.*?)\s+====$', r'#### \1', text)
text = re.sub(r'(?m)^===\s+(.*?)\s+===$', r'### \1', text)
text = re.sub(r'(?m)^==\s+(.*?)\s+==$', r'## \1', text)
text = re.sub(r'(?m)^=\s+(.*?)\s+=$', r'# \1', text)
        # Convert Trac's indented bullet lists, deepest nesting level first.
        text = re.sub(r'(?m)^             \* ', r'**** ', text)
        text = re.sub(r'(?m)^         \* ', r'*** ', text)
        text = re.sub(r'(?m)^     \* ', r'** ', text)
        text = re.sub(r'(?m)^ \* ', r'* ', text)
        text = re.sub(r'(?m)^ \d+\. ', r'1. ', text)
a = []
for line in text.split('\n'):
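            # Leave indented lines untouched: they are the code blocks
            # produced by indent4() above.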
            if not line.startswith('    '):
line = re.sub(r'\[(https?://[^\s\[\]]+)\s([^\[\]]+)\]', r'[\2](\1)', line)
line = re.sub(r'\[(wiki:[^\s\[\]]+)\s([^\[\]]+)\]', r'[\2](/\1/)', line)
line = re.sub(r'\!(([A-Z][a-z0-9]+){2,})', r'\1', line)
line = re.sub(r'\'\'\'(.*?)\'\'\'', r'*\1*', line)
line = re.sub(r'\'\'(.*?)\'\'', r'_\1_', line)
a.append(line)
return '\n'.join(a)
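

# Example usage (a minimal sketch). `options` just needs `verbose` and
# `converter` attributes; the Trac URL below is hypothetical.
#
#     import sys
#     from argparse import Namespace
#
#     options = Namespace(verbose=True, converter='regex')
#     WikiExporter('http://trac.example.com/myproject/', options).export(sys.stdout)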