[#4186] ticket:85 Import from MediaWiki improvements.
Extract and import page edit history.
Extract and import talk pages.
Extract and import page attachments.
Convert MediaWiki attachment links to ForgeWiki markdown format.
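
The extractor writes an intermediate dump under the dump directory, which the
loader then reads back. Per wiki page the layout is roughly:

    pages/<page_id>/history/<rev_timestamp>.json
    pages/<page_id>/discussion.json
    pages/<page_id>/attachments/<filename>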
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py b/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
index 20deaf6..fd0b112 100644
--- a/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
+++ b/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
@@ -13,7 +13,7 @@
parser = WikiCommand.standard_parser(verbose=True)
parser.add_option('-e', '--extract-only', action='store_true',
dest='extract',
- help='Store data from the mediawiki-dump'
+ help='Store data from the mediawiki-dump '
'on the local filesystem; not load into Allura')
parser.add_option('-l', '--load-only', action='store_true', dest='load',
help='Load into Allura previously-extracted data')
@@ -35,6 +35,9 @@
help='User for database connection')
parser.add_option('--password', dest='password', default='',
help='Password for database connection')
+ parser.add_option('-a', '--attachments-dir', dest='attachments_dir',
+ help='Path to directory with mediawiki attachments dump',
+ default='')
def command(self):
self.basic_setup()
@@ -47,7 +50,7 @@
self.loader.load()
def handle_options(self):
- if self.options.dump_dir == '':
+ if not self.options.dump_dir:
allura_base.log.error('You must specify directory for dump files')
exit(2)
@@ -72,3 +75,8 @@
else:
allura_base.log.error('You must specify valid data source')
exit(2)
+
+ if not self.options.attachments_dir:
+            allura_base.log.error('You must specify the path to the directory '
+                                  'with the mediawiki attachments dump.')
+ exit(2)
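
The net effect of handle_options: the command refuses to run unless both the
dump directory and the new attachments directory are supplied. A minimal
sketch of that check with a bare optparse parser (the -d/--dump-dir flag is
assumed from handle_options; only -a is defined in this hunk):

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-d', '--dump-dir', dest='dump_dir', default='')
    parser.add_option('-a', '--attachments-dir', dest='attachments_dir',
                      default='')
    options, args = parser.parse_args(['-d', '/tmp/w2m_dump'])
    # handle_options() would log an error and exit(2) here, since
    # --attachments-dir was not supplied
    assert options.dump_dir and not options.attachments_dir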
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py b/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
index 48637c5..4753673 100644
--- a/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
+++ b/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
@@ -2,6 +2,8 @@
import os
import shutil
import json
+import hashlib
+
from allura.command import base as allura_base
@@ -16,21 +18,7 @@
os.makedirs(self.options.dump_dir)
def extract(self):
- self.extract_pages()
- self.extract_history()
- self.extract_talk()
- self.extract_attachments()
-
- def extract_pages(self):
- raise NotImplementedError("subclass must override this")
-
- def extract_history(self):
- raise NotImplementedError("subclass must override this")
-
- def extract_talk(self):
- raise NotImplementedError("subclass must override this")
-
- def extract_attachments(self):
+        """Extract pages with history, talk pages, attachments, etc."""
raise NotImplementedError("subclass must override this")
@@ -64,33 +52,116 @@
with open(out_file, 'w') as out:
out.write(content.encode('utf-8'))
+ def _save_attachment(self, filepath, *paths):
+ """Save attachment in dump directory.
+
+ Copy from mediawiki dump directory to our internal dump directory.
+
+ args:
+            filepath - path to the attachment in the mediawiki dump.
+            *paths - path components within our internal dump directory.
+ """
+ out_dir = os.path.join(self.options.dump_dir, *paths)
+ if not os.path.exists(out_dir):
+ os.makedirs(out_dir)
+ shutil.copy(filepath, out_dir)
+
def _pages(self):
"""Yield page_data for next wiki page"""
c = self.connection().cursor()
- c.execute('select page.page_id, page.page_title, text.old_text '
+ c.execute('select page.page_id, page.page_title '
+ 'from page where page.page_namespace = 0')
+ for row in c:
+ _id, title = row
+ page_data = {
+ 'page_id': _id,
+ 'title': title,
+ }
+ yield page_data
+
+ def _history(self, page_id):
+ """Yield page_data for next revision of wiki page"""
+ c = self.connection().cursor()
+ c.execute('select revision.rev_timestamp, text.old_text '
+ 'from revision '
+ 'left join text on revision.rev_text_id = text.old_id '
+                  'where revision.rev_page = %s', (page_id,))
+ for row in c:
+ timestamp, text = row
+ page_data = {
+ 'timestamp': timestamp,
+ 'text': text or ''
+ }
+ yield page_data
+
+ def _talk(self, page_title):
+        """Return page_data for the talk page titled `page_title`"""
+ c = self.connection().cursor()
+        query_attrs = (page_title, 1)  # page_namespace 1 is the Talk: namespace
+ c.execute('select text.old_text '
'from page '
'left join revision on revision.rev_id = page.page_latest '
'left join text on text.old_id = revision.rev_text_id '
- 'where page.page_namespace = 0')
+ 'where page.page_title = %s and page.page_namespace = %s '
+ 'limit 1', query_attrs)
+
+ row = c.fetchone()
+ if row:
+ text = row[0]
+ return {'text': text}
+
+ def _attachments(self, page_id):
+        """Yield the path to the next file attached to the wiki page"""
+ c = self.connection().cursor()
+ c.execute('select il_to from imagelinks '
+                  'where il_from = %s', (page_id,))
for row in c:
- _id, title, text = row
- page_data = {
- 'title': title,
- 'text': text or ''
- }
- yield _id, page_data
+ name = row[0]
+            # mediawiki stores attachments in subdirectories
+            # based on the md5 hash of the filename,
+            # so we need to build the path to the file as follows
+ md5 = hashlib.md5(name).hexdigest()
+ path = os.path.join(self.options.attachments_dir,
+ md5[:1], md5[:2], name)
+ if os.path.isfile(path):
+ yield path
+
+ def extract(self):
+ self.extract_pages()
def extract_pages(self):
allura_base.log.info('Extracting pages...')
- for _id, page_data in self._pages():
- self._save(json.dumps(page_data), 'pages', str(_id) + '.json')
+ for page in self._pages():
+ self.extract_history(page)
+ self.extract_talk(page)
+ self.extract_attachments(page)
allura_base.log.info('Extracting pages done')
- def extract_history(self):
- allura_base.log.info('extract_history not implemented yet. Skip.')
+ def extract_history(self, page):
+ page_id = page['page_id']
+ for page_data in self._history(page_id):
+ page_data.update(page)
+ self._save(json.dumps(page_data), 'pages', str(page_id),
+ 'history', str(page_data['timestamp']) + '.json')
+ allura_base.log.info('Extracted history for page %s (%s)'
+ % (page_id, page['title']))
- def extract_talk(self):
- allura_base.log.info('extract_talk not implemented yet. Skip.')
+ def extract_talk(self, page):
+ page_id = page['page_id']
+ talk_page_data = self._talk(page['title'])
+ if talk_page_data:
+ self._save(json.dumps(talk_page_data), 'pages', str(page_id),
+ 'discussion.json')
+ allura_base.log.info('Extracted talk for page %s (%s)'
+ % (page_id, page['title']))
- def extract_attachments(self):
- allura_base.log.info('extract_attachments not implemented yet. Skip.')
+        else:
+            allura_base.log.info('No talk for page %s (%s)'
+                                 % (page_id, page['title']))
+
+ def extract_attachments(self, page):
+ page_id = page['page_id']
+ for filepath in self._attachments(page_id):
+ self._save_attachment(filepath, 'pages', str(page_id),
+ 'attachments')
+ allura_base.log.info('Extracted attachments for page %s (%s)'
+ % (page_id, page['title']))
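
Note on the attachment lookup above: MediaWiki's default hashed upload layout
stores a file under <images>/<x>/<xy>/<name>, where <x> and <xy> are prefixes
of the md5 hex digest of the file name. A minimal standalone sketch of that
path construction (mediawiki_upload_path is a hypothetical helper, not part of
this patch; assumes the default $wgHashedUploadDirectory setting):

    import hashlib
    import os

    def mediawiki_upload_path(images_dir, name):
        """Return images_dir/<x>/<xy>/<name> for a MediaWiki upload."""
        md5 = hashlib.md5(name.encode('utf-8')).hexdigest()
        return os.path.join(images_dir, md5[:1], md5[:2], name)

    # prints something like '/dumps/images/a/ab/Image.png',
    # depending on the digest of the name
    print(mediawiki_upload_path('/dumps/images', 'Image.png'))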
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py b/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
index 29e3617..87e0031 100644
--- a/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
+++ b/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
@@ -6,8 +6,10 @@
from allura import model as M
from forgewiki import model as WM
from forgewiki.converters import mediawiki2markdown
+from forgewiki.converters import mediawiki_internal_links2markdown
from allura.command import base as allura_base
from allura.lib import helpers as h
+from allura.lib import utils
class MediawikiLoader(object):
@@ -33,47 +35,114 @@
allura_base.log.error("Can't find wiki app in given project")
exit(2)
+ h.set_context(self.project.shortname, 'wiki', neighborhood=self.nbhd)
+
def load(self):
self.load_pages()
- self.load_history()
- self.load_talk()
- self.load_attachments()
def _pages(self):
- """Yield page_data for next wiki page"""
- h.set_context(self.project.shortname, 'wiki', neighborhood=self.nbhd)
+ """Yield path to page dump directory for next wiki page"""
pages_dir = os.path.join(self.options.dump_dir, 'pages')
- page_files = []
- if os.path.isdir(pages_dir):
- page_files = os.listdir(pages_dir)
- for filename in page_files:
- file_path = os.path.join(pages_dir, filename)
- with open(file_path, 'r') as pages_file:
+        if not os.path.isdir(pages_dir):
+            return
+        pages = os.listdir(pages_dir)
+ for directory in pages:
+ dir_path = os.path.join(pages_dir, directory)
+ if os.path.isdir(dir_path):
+ yield dir_path
+
+ def _history(self, page_dir):
+        """Yield page_data for each revision in the page's edit history"""
+ page_dir = os.path.join(page_dir, 'history')
+ if not os.path.isdir(page_dir):
+ return
+ pages = os.listdir(page_dir)
+        pages.sort()  # ensure revisions are loaded in chronological order
+ for page in pages:
+ fn = os.path.join(page_dir, page)
+ with open(fn, 'r') as pages_file:
page_data = json.load(pages_file)
yield page_data
+ def _talk(self, page_dir):
+ """Return talk data from json dump"""
+ filename = os.path.join(page_dir, 'discussion.json')
+ if not os.path.isfile(filename):
+ return
+ with open(filename, 'r') as talk_file:
+ talk_data = json.load(talk_file)
+ return talk_data
+
+ def _attachments(self, page_dir):
+ """Yield (filename, full path) to next attachment for given page."""
+ attachments_dir = os.path.join(page_dir, 'attachments')
+ if not os.path.isdir(attachments_dir):
+ return
+ attachments = os.listdir(attachments_dir)
+ for filename in attachments:
+ yield filename, os.path.join(attachments_dir, filename)
+
def load_pages(self):
+ """Load pages with edit history from json to Allura wiki tool"""
allura_base.log.info('Loading pages into allura...')
- for page in self._pages():
+ for page_dir in self._pages():
+ for page in self._history(page_dir):
+ p = WM.Page.upsert(page['title'])
+ p.viewable_by = ['all']
+ p.text = mediawiki_internal_links2markdown(
+ mediawiki2markdown(page['text']),
+ page['title'])
+ p.commit()
+
+ # set home to main page
if page['title'] == 'Main_Page':
gl = WM.Globals.query.get(app_config_id=self.wiki.config._id)
if gl is not None:
gl.root = page['title']
- p = WM.Page.upsert(page['title'])
- p.viewable_by = ['all']
- p.text = mediawiki2markdown(page['text'])
- if not p.history().first():
- p.commit()
+ allura_base.log.info('Loaded history of page %s (%s)'
+ % (page['page_id'], page['title']))
+
+ self.load_talk(page_dir, page['title'])
+ self.load_attachments(page_dir, page['title'])
ThreadLocalORMSession.flush_all()
ThreadLocalORMSession.close_all()
allura_base.log.info('Loading pages done')
- def load_history(self):
- allura_base.log.info('load_history not implemented yet. Skip.')
+ def load_talk(self, page_dir, page_title):
+ """Load talk for page.
- def load_talk(self):
- allura_base.log.info('load_talk not implemented yet. Skip.')
+ page_dir - path to directory with page dump.
+        page_title - page title in the Allura wiki.
+ """
+ talk_data = self._talk(page_dir)
+ if not talk_data:
+ return
+ text = mediawiki2markdown(talk_data['text'])
+ page = WM.Page.query.get(app_config_id=self.wiki.config._id,
+ title=page_title)
+ if not page:
+ return
+ thread = M.Thread.query.get(ref_id=page.index_id())
+ if not thread:
+ return
+ thread.add_post(
+ text=text,
+ discussion_id=thread.discussion_id,
+ thread_id=thread._id,
+ ignore_security=True)
+ allura_base.log.info('Loaded talk for page %s' % page_title)
- def load_attachments(self):
- allura_base.log.info('load_attachments not implemented yet. Skip.')
+ def load_attachments(self, page_dir, page_title):
+ """Load attachments for page.
+
+        page_dir - path to directory with page dump.
+        page_title - page title in the Allura wiki.
+        """
+        page = WM.Page.query.get(app_config_id=self.wiki.config._id,
+                                 title=page_title)
+        if not page:
+            return
+        for filename, path in self._attachments(page_dir):
+            with open(path, 'rb') as fp:
+ page.attach(filename, fp,
+ content_type=utils.guess_mime_type(filename))
+ allura_base.log.info('Loaded attachments for page %s.' % page_title)
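
For context: replaying revisions oldest-first through Page.upsert()/commit()
is what builds the page's version history in Allura. A rough sketch of the
effect (assumes an active wiki context, as set up by MediawikiLoader):

    p = WM.Page.upsert('Test title')
    p.text = 'rev 1'
    p.commit()    # stored as version 1
    p.text = 'rev 2'
    p.commit()    # stored as version 2
    assert p.get_version(1).text == 'rev 1'
    assert p.text == 'rev 2'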
diff --git a/ForgeWiki/forgewiki/converters.py b/ForgeWiki/forgewiki/converters.py
index bbc1ac9..3ee7c5f 100644
--- a/ForgeWiki/forgewiki/converters.py
+++ b/ForgeWiki/forgewiki/converters.py
@@ -1,12 +1,41 @@
#-*- python -*-
import html2text
+import re
# https://github.com/zikzakmedia/python-mediawiki.git
from mediawiki import wiki2html
html2text.BODY_WIDTH = 0
+_inline_img = re.compile(r'\[\[(File|Image):([^\]|]+).*\]\]', re.UNICODE)
+_inline_img_markdown = r'[[img src=\2]]'
+_link_to_attach = re.compile(r'\[\[Media:([^\]|]+)\|?(.*)\]\]', re.UNICODE)
+
+
+def _link_to_attach_markdown(page_title):
+ pattern = r'[%s](%s/attachment/%s)'
+
+ def replacement(match):
+ if match.group(2):
+ return pattern % (match.group(2), page_title, match.group(1))
+ return pattern % (match.group(1), page_title, match.group(1))
+
+ return replacement
+
def mediawiki2markdown(source):
wiki_content = wiki2html(source, True)
markdown_text = html2text.html2text(wiki_content)
return markdown_text
+
+
+def mediawiki_internal_links2markdown(markdown_text, page_title):
+ """Convert MediaWiki internal links to attachments to ForgeWiki format.
+
+ args:
+ markdown_text - text, converted by mediawiki2markdown convertor.
+ page_title - title of ForgeWiki page.
+ Used for constructing proper links to attachments.
+ """
+ output = _inline_img.sub(_inline_img_markdown, markdown_text)
+ output = _link_to_attach.sub(_link_to_attach_markdown(page_title), output)
+ return output
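
A quick illustration of the converter above, matching the behaviour pinned
down by the tests below (note that both regexes only match within a single
line, since '.' does not cross newlines):

    from forgewiki.converters import mediawiki_internal_links2markdown

    text = 'See [[Media:spec.pdf|the spec]]\nand [[File:logo.png]]'
    print(mediawiki_internal_links2markdown(text, 'Home'))
    # See [the spec](Home/attachment/spec.pdf)
    # and [[img src=logo.png]]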
diff --git a/ForgeWiki/forgewiki/tests/test_converters.py b/ForgeWiki/forgewiki/tests/test_converters.py
index 1a29b9d..8787898 100644
--- a/ForgeWiki/forgewiki/tests/test_converters.py
+++ b/ForgeWiki/forgewiki/tests/test_converters.py
@@ -30,3 +30,18 @@
assert "**bold** _italics_" in mediawiki_output
assert "## Getting started" in mediawiki_output
assert "* [MediaWiki FAQ](http://www.mediawiki.org/wiki/Manual:FAQ)" in mediawiki_output
+
+
+def test_mediawiki_internal_links2markdown():
+ text = """Example page!
+Inline image: [[File:image.png]]
+Link to file: [[Media:attach.pdf|Att]]
+File: [[Media:attach.pdf]]
+Inline image in old format: [[Image:image.png]]
+"""
+ output = converters.mediawiki_internal_links2markdown(text, 'Example page')
+ assert 'Example page!' in output
+ assert 'Inline image: [[img src=image.png]]' in output
+ assert 'Link to file: [Att](Example page/attachment/attach.pdf)' in output
+ assert 'File: [attach.pdf](Example page/attachment/attach.pdf)' in output
+ assert 'Inline image in old format: [[img src=image.png]]' in output
diff --git a/ForgeWiki/forgewiki/tests/test_wiki2markdown.py b/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
index 845a33b..4671707 100644
--- a/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
+++ b/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
@@ -1,4 +1,5 @@
import mock
+import json
from forgewiki.command.wiki2markdown.extractors import MySQLExtractor
from forgewiki.command.wiki2markdown.loaders import MediawikiLoader
@@ -21,28 +22,134 @@
# monkey-patch MySQLExtractor for test
def pages(self):
- # yield (page_id, page_data)
- yield 1, {'title': 'Test title', 'text': 'Test Text'}
- yield 2, {'title': 'Main_Page', 'text': 'Main_page text'}
- yield 3, {'title': 'Test', 'text': ''}
+ yield {'page_id': 1, 'title': 'Test title'}
+ yield {'page_id': 2, 'title': 'Main_Page'}
+ yield {'page_id': 3, 'title': 'Test'}
+
+ def history(self, page_id):
+ data = {
+ 1: [
+ {'timestamp': 1, 'text': "Test"},
+ {'timestamp': 2, 'text': "Test Text"}
+ ],
+ 2: [
+ {'timestamp': 1, 'text': "Main_Page"},
+ {'timestamp': 2, 'text': "Main_Page text"}
+ ],
+ 3: [
+ {'timestamp': 1, 'text': "Some test text"},
+ {'timestamp': 2, 'text': ""}
+ ]
+ }
+ revisions = data[page_id]
+ for rev in revisions:
+ yield rev
+
+ def talk(self, page_title):
+ return {'text': 'Talk for page %s.' % page_title}
+
+ def attachments(self, *args, **kwargs):
+ # make 'empty' iterator
+ if False:
+ yield
MySQLExtractor._pages = pages
+ MySQLExtractor._history = history
+ MySQLExtractor._talk = talk
+ MySQLExtractor._attachments = attachments
self.extractor = MySQLExtractor(self.options)
def test_extract_pages(self):
+ """Test that pages and edit history extracted properly"""
self.extractor.extract_pages()
- with open('/tmp/w2m_test/pages/1.json', 'r') as f:
- json_page = f.read()
- assert json_page == '{"text": "Test Text", "title": "Test title"}'
+ # rev 1 of page 1
+ with open('/tmp/w2m_test/pages/1/history/1.json', 'r') as f:
+ page = json.load(f)
+ res_page = {
+ 'timestamp': 1,
+ 'text': 'Test',
+ 'page_id': 1,
+ 'title': 'Test title'
+ }
+ assert page == res_page
- with open('/tmp/w2m_test/pages/2.json', 'r') as f:
- json_page = f.read()
- assert json_page == '{"text": "Main_page text", "title": "Main_Page"}'
+ # rev 2 of page 1
+ with open('/tmp/w2m_test/pages/1/history/2.json', 'r') as f:
+ page = json.load(f)
+ res_page = {
+ 'timestamp': 2,
+ 'text': 'Test Text',
+ 'page_id': 1,
+ 'title': 'Test title'
+ }
+ assert page == res_page
- with open('/tmp/w2m_test/pages/3.json', 'r') as f:
- json_page = f.read()
- assert json_page == '{"text": "", "title": "Test"}'
+ # rev 1 of page 2
+ with open('/tmp/w2m_test/pages/2/history/1.json', 'r') as f:
+ page = json.load(f)
+ res_page = {
+ 'timestamp': 1,
+ 'text': 'Main_Page',
+ 'page_id': 2,
+ 'title': 'Main_Page'
+ }
+ assert page == res_page
+
+ # rev 2 of page 2
+ with open('/tmp/w2m_test/pages/2/history/2.json', 'r') as f:
+ page = json.load(f)
+ res_page = {
+ 'timestamp': 2,
+ 'text': 'Main_Page text',
+ 'page_id': 2,
+ 'title': 'Main_Page'
+ }
+ assert page == res_page
+
+ # rev 1 of page 3
+ with open('/tmp/w2m_test/pages/3/history/1.json', 'r') as f:
+ page = json.load(f)
+ res_page = {
+ 'timestamp': 1,
+ 'text': 'Some test text',
+ 'page_id': 3,
+ 'title': 'Test'
+ }
+ assert page == res_page
+
+ # rev 2 of page 3
+ with open('/tmp/w2m_test/pages/3/history/2.json', 'r') as f:
+ page = json.load(f)
+ res_page = {
+ 'timestamp': 2,
+ 'text': '',
+ 'page_id': 3,
+ 'title': 'Test'
+ }
+ assert page == res_page
+
+ def test_extract_talk(self):
+ """Test that talk pages extracted properly."""
+ pages = [
+ {'page_id': 1, 'title': 'Test 1'},
+ {'page_id': 2, 'title': 'Test 2'},
+ {'page_id': 3, 'title': 'Test 3'},
+ ]
+ for page in pages:
+ self.extractor.extract_talk(page)
+
+ with open('/tmp/w2m_test/pages/1/discussion.json', 'r') as f:
+ page = json.load(f)
+ assert page == {'text': 'Talk for page Test 1.'}
+
+ with open('/tmp/w2m_test/pages/2/discussion.json', 'r') as f:
+ page = json.load(f)
+ assert page == {'text': 'Talk for page Test 2.'}
+
+ with open('/tmp/w2m_test/pages/3/discussion.json', 'r') as f:
+ page = json.load(f)
+ assert page == {'text': 'Talk for page Test 3.'}
class TestMediawikiLoader(object):
@@ -59,25 +166,93 @@
# monkey-patch MediawikiLoader for test
def pages(self):
- yield {'title': 'Test title', 'text': "'''bold''' ''italics''"}
- yield {'title': 'Main', 'text': "main"}
- yield {'title': 'Test', 'text': 'test'}
+ yield 1
+ yield 2
+
+ def history(self, page_dir):
+ data = {
+ 1: [
+ {
+ 'title': 'Test title',
+ 'text': "'''bold''' ''italics''",
+ 'page_id': 1,
+ 'timestamp': 1
+ },
+ {
+ 'title': 'Test title',
+ 'text': "'''bold'''",
+ 'page_id': 1,
+ 'timestamp': 2
+ },
+ ],
+ 2: [
+ {
+ 'title': 'Main',
+ 'text': "Main text rev 1",
+ 'page_id': 2,
+ 'timestamp': 1
+ },
+ {
+ 'title': 'Main',
+ 'text': "Main text rev 2",
+ 'page_id': 2,
+ 'timestamp': 2
+ },
+
+ ],
+ }
+ for page in data[page_dir]:
+ yield page
+
+ def talk(self, page_dir):
+ data = {
+ 1: {'text': "''Talk page'' for page 1."},
+ 2: {'text': "''Talk page'' for page 2."},
+ }
+ return data[page_dir]
+
+ def attachments(self, *args, **kwargs):
+ # make 'empty' iterator
+ if False:
+ yield
MediawikiLoader._pages = pages
+ MediawikiLoader._history = history
+ MediawikiLoader._talk = talk
+ MediawikiLoader._attachments = attachments
self.loader = MediawikiLoader(self.options)
def get_page(self, title):
return WM.Page.query.get(app_config_id=context.app.config._id,
title=title)
+ def get_post(self, title):
+ page = self.get_page(title)
+ thread = M.Thread.query.get(ref_id=page.index_id())
+ return M.Post.query.get(discussion_id=thread.discussion_id,
+ thread_id=thread._id)
+
def test_load_pages(self):
+ """Test that pages, edit history and talk loaded properly"""
self.loader.load_pages()
page = self.get_page('Test title')
+
+ assert '**bold**' in page.text
+        # _italics_ should appear only in the first revision of the page
+        assert '_italics_' not in page.text
+
+ page = page.get_version(1)
assert '**bold** _italics_' in page.text
page = self.get_page('Main')
- assert 'main' in page.text
+ assert 'Main text rev 2' in page.text
- page = self.get_page('Test')
- print page.text, len(page.text)
- assert 'test' in page.text
+ page = page.get_version(1)
+ assert 'Main text rev 1' in page.text
+
+ # Check that talk pages loaded
+ post = self.get_post('Test title')
+ assert '_Talk page_ for page 1.' in post.text
+
+ post = self.get_post('Main')
+ assert '_Talk page_ for page 2.' in post.text