[#4186] ticket:85 Import from MediaWiki improvements.

Extract and import page history.
Extract and import talk pages.
Extract and import attachments for pages.
Convert MediaWiki attachment links to ForgeWiki markdown format.
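
The extractor writes one directory per page under the dump directory
(layout as produced by the _save/_save_attachment calls below and checked
in the tests):

    <dump_dir>/pages/<page_id>/history/<rev_timestamp>.json   # one file per revision
    <dump_dir>/pages/<page_id>/discussion.json                # latest talk-page text, if any
    <dump_dir>/pages/<page_id>/attachments/<filename>         # files copied from the mediawiki dump

The new -a/--attachments-dir option points at the directory holding the
mediawiki attachments dump (typically the wiki's images/ directory).
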
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py b/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
index 20deaf6..fd0b112 100644
--- a/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
+++ b/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
@@ -13,7 +13,7 @@
     parser = WikiCommand.standard_parser(verbose=True)
     parser.add_option('-e', '--extract-only', action='store_true',
                       dest='extract',
-                      help='Store data from the mediawiki-dump'
+                      help='Store data from the mediawiki-dump '
                       'on the local filesystem; not load into Allura')
     parser.add_option('-l', '--load-only', action='store_true', dest='load',
                 help='Load into Allura previously-extracted data')
@@ -35,6 +35,9 @@
                 help='User for database connection')
     parser.add_option('--password', dest='password', default='',
                 help='Password for database connection')
+    parser.add_option('-a', '--attachments-dir', dest='attachments_dir',
+                help='Path to directory with mediawiki attachments dump',
+                default='')
 
     def command(self):
         self.basic_setup()
@@ -47,7 +50,7 @@
             self.loader.load()
 
     def handle_options(self):
-        if self.options.dump_dir == '':
+        if not self.options.dump_dir:
             allura_base.log.error('You must specify directory for dump files')
             exit(2)
 
@@ -72,3 +75,8 @@
             else:
                 allura_base.log.error('You must specify valid data source')
                 exit(2)
+
+            if not self.options.attachments_dir:
+                allura_base.log.error('You must specify path to directory '
+                                      'with mediawiki attachments dump.')
+                exit(2)
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py b/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
index 48637c5..4753673 100644
--- a/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
+++ b/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
@@ -2,6 +2,8 @@
 import os
 import shutil
 import json
+import hashlib
+
 from allura.command import base as allura_base
 
 
@@ -16,21 +18,7 @@
         os.makedirs(self.options.dump_dir)
 
     def extract(self):
-        self.extract_pages()
-        self.extract_history()
-        self.extract_talk()
-        self.extract_attachments()
-
-    def extract_pages(self):
-        raise NotImplementedError("subclass must override this")
-
-    def extract_history(self):
-        raise NotImplementedError("subclass must override this")
-
-    def extract_talk(self):
-        raise NotImplementedError("subclass must override this")
-
-    def extract_attachments(self):
+        """Extract pages with history, attachments, talk-pages, etc"""
         raise NotImplementedError("subclass must override this")
 
 
@@ -64,33 +52,116 @@
         with open(out_file, 'w') as out:
             out.write(content.encode('utf-8'))
 
+    def _save_attachment(self, filepath, *paths):
+        """Save attachment in dump directory.
+
+        Copy from mediawiki dump directory to our internal dump directory.
+
+        args:
+        filepath - path to attachment in mediawiki dump.
+        *paths - path components inside our internal dump directory.
+        """
+        out_dir = os.path.join(self.options.dump_dir, *paths)
+        if not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+        shutil.copy(filepath, out_dir)
+
     def _pages(self):
         """Yield page_data for next wiki page"""
         c = self.connection().cursor()
-        c.execute('select page.page_id, page.page_title, text.old_text '
+        c.execute('select page.page_id, page.page_title '
+                  'from page where page.page_namespace = 0')
+        for row in c:
+            _id, title = row
+            page_data = {
+                'page_id': _id,
+                'title': title,
+            }
+            yield page_data
+
+    def _history(self, page_id):
+        """Yield page_data for next revision of wiki page"""
+        c = self.connection().cursor()
+        c.execute('select revision.rev_timestamp, text.old_text '
+                  'from revision '
+                  'left join text on revision.rev_text_id = text.old_id '
+                  'where revision.rev_page = %s', page_id)
+        for row in c:
+            timestamp, text = row
+            page_data = {
+                'timestamp': timestamp,
+                'text': text or ''
+            }
+            yield page_data
+
+    def _talk(self, page_title):
+        """Return page_data for talk page with `page_title` title"""
+        c = self.connection().cursor()
+        query_attrs = (page_title, 1)  # page_namespace == 1 - talk pages
+        c.execute('select text.old_text '
                   'from page '
                   'left join revision on revision.rev_id = page.page_latest '
                   'left join text on text.old_id = revision.rev_text_id '
-                  'where page.page_namespace = 0')
+                  'where page.page_title = %s and page.page_namespace = %s '
+                  'limit 1', query_attrs)
+
+        row = c.fetchone()
+        if row:
+            text = row[0]
+            return {'text': text}
+
+    def _attachments(self, page_id):
+        """Yield path to nexe file attached to wiki page"""
+        c = self.connection().cursor()
+        c.execute('select il_to from imagelinks '
+                  'where il_from = %s', page_id)
         for row in c:
-            _id, title, text = row
-            page_data = {
-                'title': title,
-                'text': text or ''
-            }
-            yield _id, page_data
+            name = row[0]
+            # mediawiki stores attachments in subdirectories
+            # named after the md5 hash of the filename,
+            # so build the path to the file accordingly
+            md5 = hashlib.md5(name).hexdigest()
+            path = os.path.join(self.options.attachments_dir,
+                               md5[:1], md5[:2], name)
+            if os.path.isfile(path):
+                yield path
+
+    def extract(self):
+        self.extract_pages()
 
     def extract_pages(self):
         allura_base.log.info('Extracting pages...')
-        for _id, page_data in self._pages():
-            self._save(json.dumps(page_data), 'pages', str(_id) + '.json')
+        for page in self._pages():
+            self.extract_history(page)
+            self.extract_talk(page)
+            self.extract_attachments(page)
         allura_base.log.info('Extracting pages done')
 
-    def extract_history(self):
-        allura_base.log.info('extract_history not implemented yet. Skip.')
+    def extract_history(self, page):
+        page_id = page['page_id']
+        for page_data in self._history(page_id):
+            page_data.update(page)
+            self._save(json.dumps(page_data), 'pages', str(page_id),
+                       'history', str(page_data['timestamp']) + '.json')
+        allura_base.log.info('Extracted history for page %s (%s)'
+                             % (page_id, page['title']))
 
-    def extract_talk(self):
-        allura_base.log.info('extract_talk not implemented yet. Skip.')
+    def extract_talk(self, page):
+        page_id = page['page_id']
+        talk_page_data = self._talk(page['title'])
+        if talk_page_data:
+            self._save(json.dumps(talk_page_data), 'pages', str(page_id),
+                       'discussion.json')
+            allura_base.log.info('Extracted talk for page %s (%s)'
+                                 % (page_id, page['title']))
+            return
 
-    def extract_attachments(self):
-        allura_base.log.info('extract_attachments not implemented yet. Skip.')
+        allura_base.log.info('No talk for page %s (%s)'
+                             % (page_id, page['title']))
+
+    def extract_attachments(self, page):
+        page_id = page['page_id']
+        for filepath in self._attachments(page_id):
+            self._save_attachment(filepath, 'pages', str(page_id),
+                                  'attachments')
+        allura_base.log.info('Extracted attachments for page %s (%s)'
+                             % (page_id, page['title']))
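
Note on _attachments() above: MediaWiki stores uploaded files in two levels of
subdirectories derived from the md5 hash of the file name. A minimal sketch of
the path construction (file name and attachments directory here are made-up
examples):

    import hashlib
    import os

    name = 'image.png'
    md5 = hashlib.md5(name).hexdigest()
    # the first one and two hex characters of the hash pick the subdirectories,
    # e.g. a hash starting with 'ab' gives <attachments_dir>/a/ab/image.png
    path = os.path.join('/path/to/mediawiki/images', md5[:1], md5[:2], name)
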
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py b/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
index 29e3617..87e0031 100644
--- a/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
+++ b/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
@@ -6,8 +6,10 @@
 from allura import model as M
 from forgewiki import model as WM
 from forgewiki.converters import mediawiki2markdown
+from forgewiki.converters import mediawiki_internal_links2markdown
 from allura.command import base as allura_base
 from allura.lib import helpers as h
+from allura.lib import utils
 
 
 class MediawikiLoader(object):
@@ -33,47 +35,114 @@
             allura_base.log.error("Can't find wiki app in given project")
             exit(2)
 
+        h.set_context(self.project.shortname, 'wiki', neighborhood=self.nbhd)
+
     def load(self):
         self.load_pages()
-        self.load_history()
-        self.load_talk()
-        self.load_attachments()
 
     def _pages(self):
-        """Yield page_data for next wiki page"""
-        h.set_context(self.project.shortname, 'wiki', neighborhood=self.nbhd)
+        """Yield path to page dump directory for next wiki page"""
         pages_dir = os.path.join(self.options.dump_dir, 'pages')
-        page_files = []
-        if os.path.isdir(pages_dir):
-            page_files = os.listdir(pages_dir)
-        for filename in page_files:
-            file_path = os.path.join(pages_dir, filename)
-            with open(file_path, 'r') as pages_file:
+        if not os.path.isdir(pages_dir):
+            return
+        pages = os.listdir(pages_dir)
+        for directory in pages:
+            dir_path = os.path.join(pages_dir, directory)
+            if os.path.isdir(dir_path):
+                yield dir_path
+
+    def _history(self, page_dir):
+        """Yield page_data for next wiki page in edit history"""
+        page_dir = os.path.join(page_dir, 'history')
+        if not os.path.isdir(page_dir):
+            return
+        pages = os.listdir(page_dir)
+        pages.sort()  # ensure that history is loaded in chronological order
+        for page in pages:
+            fn = os.path.join(page_dir, page)
+            with open(fn, 'r') as pages_file:
                 page_data = json.load(pages_file)
             yield page_data
 
+    def _talk(self, page_dir):
+        """Return talk data from json dump"""
+        filename = os.path.join(page_dir, 'discussion.json')
+        if not os.path.isfile(filename):
+            return
+        with open(filename, 'r') as talk_file:
+            talk_data = json.load(talk_file)
+        return talk_data
+
+    def _attachments(self, page_dir):
+        """Yield (filename, full path) to next attachment for given page."""
+        attachments_dir = os.path.join(page_dir, 'attachments')
+        if not os.path.isdir(attachments_dir):
+            return
+        attachments = os.listdir(attachments_dir)
+        for filename in attachments:
+            yield filename, os.path.join(attachments_dir, filename)
+
     def load_pages(self):
+        """Load pages with edit history from json to Allura wiki tool"""
         allura_base.log.info('Loading pages into allura...')
-        for page in self._pages():
+        for page_dir in self._pages():
+            for page in self._history(page_dir):
+                p = WM.Page.upsert(page['title'])
+                p.viewable_by = ['all']
+                p.text = mediawiki_internal_links2markdown(
+                            mediawiki2markdown(page['text']),
+                            page['title'])
+                p.commit()
+
+            # set the wiki home page to MediaWiki's Main_Page
             if page['title'] == 'Main_Page':
                 gl = WM.Globals.query.get(app_config_id=self.wiki.config._id)
                 if gl is not None:
                     gl.root = page['title']
-            p = WM.Page.upsert(page['title'])
-            p.viewable_by = ['all']
-            p.text = mediawiki2markdown(page['text'])
-            if not p.history().first():
-                p.commit()
+            allura_base.log.info('Loaded history of page %s (%s)'
+                                 % (page['page_id'], page['title']))
+
+            self.load_talk(page_dir, page['title'])
+            self.load_attachments(page_dir, page['title'])
 
         ThreadLocalORMSession.flush_all()
         ThreadLocalORMSession.close_all()
         allura_base.log.info('Loading pages done')
 
-    def load_history(self):
-        allura_base.log.info('load_history not implemented yet. Skip.')
+    def load_talk(self, page_dir, page_title):
+        """Load talk for page.
 
-    def load_talk(self):
-        allura_base.log.info('load_talk not implemented yet. Skip.')
+        page_dir - path to directory with page dump.
+        page_title - page title in Allura Wiki
+        """
+        talk_data = self._talk(page_dir)
+        if not talk_data:
+            return
+        text = mediawiki2markdown(talk_data['text'])
+        page = WM.Page.query.get(app_config_id=self.wiki.config._id,
+                                 title=page_title)
+        if not page:
+            return
+        thread = M.Thread.query.get(ref_id=page.index_id())
+        if not thread:
+            return
+        thread.add_post(
+            text=text,
+            discussion_id=thread.discussion_id,
+            thread_id=thread._id,
+            ignore_security=True)
+        allura_base.log.info('Loaded talk for page %s' % page_title)
 
-    def load_attachments(self):
-        allura_base.log.info('load_attachments not implemented yet. Skip.')
+    def load_attachments(self, page_dir, page_title):
+        """Load attachments for page.
+
+        page_dir - path to directory with page dump.
+        page_title - page title in Allura Wiki.
+        """
+        page = WM.Page.query.get(app_config_id=self.wiki.config._id,
+                                 title=page_title)
+        for filename, path in self._attachments(page_dir):
+            with open(path) as fp:
+                page.attach(filename, fp,
+                            content_type=utils.guess_mime_type(filename))
+        allura_base.log.info('Loaded attachments for page %s.' % page_title)
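
Note on _history() in the loader: MediaWiki revision timestamps are fixed-width
14-character strings (YYYYMMDDHHMMSS, assuming the standard rev_timestamp
format), so the plain lexicographic sort of the dumped file names is also
chronological:

    sorted(['20120101120000.json', '20110615093000.json'])
    # -> ['20110615093000.json', '20120101120000.json']
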
diff --git a/ForgeWiki/forgewiki/converters.py b/ForgeWiki/forgewiki/converters.py
index bbc1ac9..3ee7c5f 100644
--- a/ForgeWiki/forgewiki/converters.py
+++ b/ForgeWiki/forgewiki/converters.py
@@ -1,12 +1,41 @@
 #-*- python -*-
 import html2text
+import re
 # https://github.com/zikzakmedia/python-mediawiki.git
 from mediawiki import wiki2html
 
 html2text.BODY_WIDTH = 0
 
+_inline_img = re.compile(r'\[\[(File|Image):([^\]|]+).*\]\]', re.UNICODE)
+_inline_img_markdown = r'[[img src=\2]]'
+_link_to_attach = re.compile(r'\[\[Media:([^\]|]+)\|?(.*)\]\]', re.UNICODE)
+
+
+def _link_to_attach_markdown(page_title):
+    pattern = r'[%s](%s/attachment/%s)'
+
+    def replacement(match):
+        if match.group(2):
+            return pattern % (match.group(2), page_title, match.group(1))
+        return pattern % (match.group(1), page_title, match.group(1))
+
+    return replacement
+
 
 def mediawiki2markdown(source):
     wiki_content = wiki2html(source, True)
     markdown_text = html2text.html2text(wiki_content)
     return markdown_text
+
+
+def mediawiki_internal_links2markdown(markdown_text, page_title):
+    """Convert MediaWiki internal links to attachments to ForgeWiki format.
+
+    args:
+    markdown_text - text, converted by mediawiki2markdown convertor.
+    page_title - title of ForgeWiki page.
+                 Used for constructing proper links to attachments.
+    """
+    output = _inline_img.sub(_inline_img_markdown, markdown_text)
+    output = _link_to_attach.sub(_link_to_attach_markdown(page_title), output)
+    return output
diff --git a/ForgeWiki/forgewiki/tests/test_converters.py b/ForgeWiki/forgewiki/tests/test_converters.py
index 1a29b9d..8787898 100644
--- a/ForgeWiki/forgewiki/tests/test_converters.py
+++ b/ForgeWiki/forgewiki/tests/test_converters.py
@@ -30,3 +30,18 @@
     assert "**bold** _italics_" in mediawiki_output
     assert "## Getting started" in mediawiki_output
     assert "* [MediaWiki FAQ](http://www.mediawiki.org/wiki/Manual:FAQ)" in mediawiki_output
+
+
+def test_mediawiki_internal_links2markdown():
+    text = """Example page!
+Inline image: [[File:image.png]]
+Link to file: [[Media:attach.pdf|Att]]
+File: [[Media:attach.pdf]]
+Inline image in old format: [[Image:image.png]]
+"""
+    output = converters.mediawiki_internal_links2markdown(text, 'Example page')
+    assert 'Example page!' in output
+    assert 'Inline image: [[img src=image.png]]' in output
+    assert 'Link to file: [Att](Example page/attachment/attach.pdf)' in output
+    assert 'File: [attach.pdf](Example page/attachment/attach.pdf)' in output
+    assert 'Inline image in old format: [[img src=image.png]]' in output
diff --git a/ForgeWiki/forgewiki/tests/test_wiki2markdown.py b/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
index 845a33b..4671707 100644
--- a/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
+++ b/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
@@ -1,4 +1,5 @@
 import mock
+import json
 
 from forgewiki.command.wiki2markdown.extractors import MySQLExtractor
 from forgewiki.command.wiki2markdown.loaders import MediawikiLoader
@@ -21,28 +22,134 @@
 
         # monkey-patch MySQLExtractor for test
         def pages(self):
-            # yield (page_id, page_data)
-            yield 1, {'title': 'Test title', 'text': 'Test Text'}
-            yield 2, {'title': 'Main_Page', 'text': 'Main_page text'}
-            yield 3, {'title': 'Test', 'text': ''}
+            yield {'page_id': 1, 'title': 'Test title'}
+            yield {'page_id': 2, 'title': 'Main_Page'}
+            yield {'page_id': 3, 'title': 'Test'}
+
+        def history(self, page_id):
+            data = {
+                1: [
+                    {'timestamp': 1, 'text': "Test"},
+                    {'timestamp': 2, 'text': "Test Text"}
+                ],
+                2: [
+                    {'timestamp': 1, 'text': "Main_Page"},
+                    {'timestamp': 2, 'text': "Main_Page text"}
+                ],
+                3: [
+                    {'timestamp': 1, 'text': "Some test text"},
+                    {'timestamp': 2, 'text': ""}
+                ]
+            }
+            revisions = data[page_id]
+            for rev in revisions:
+                yield rev
+
+        def talk(self, page_title):
+            return {'text': 'Talk for page %s.' % page_title}
+
+        def attachments(self, *args, **kwargs):
+            # make 'empty' iterator
+            if False:
+                yield
 
         MySQLExtractor._pages = pages
+        MySQLExtractor._history = history
+        MySQLExtractor._talk = talk
+        MySQLExtractor._attachments = attachments
         self.extractor = MySQLExtractor(self.options)
 
     def test_extract_pages(self):
+        """Test that pages and edit history extracted properly"""
         self.extractor.extract_pages()
 
-        with open('/tmp/w2m_test/pages/1.json', 'r') as f:
-            json_page = f.read()
-        assert json_page == '{"text": "Test Text", "title": "Test title"}'
+        # rev 1 of page 1
+        with open('/tmp/w2m_test/pages/1/history/1.json', 'r') as f:
+            page = json.load(f)
+        res_page = {
+            'timestamp': 1,
+            'text': 'Test',
+            'page_id': 1,
+            'title': 'Test title'
+        }
+        assert page == res_page
 
-        with open('/tmp/w2m_test/pages/2.json', 'r') as f:
-            json_page = f.read()
-        assert json_page == '{"text": "Main_page text", "title": "Main_Page"}'
+        # rev 2 of page 1
+        with open('/tmp/w2m_test/pages/1/history/2.json', 'r') as f:
+            page = json.load(f)
+        res_page = {
+            'timestamp': 2,
+            'text': 'Test Text',
+            'page_id': 1,
+            'title': 'Test title'
+        }
+        assert page == res_page
 
-        with open('/tmp/w2m_test/pages/3.json', 'r') as f:
-            json_page = f.read()
-        assert json_page == '{"text": "", "title": "Test"}'
+        # rev 1 of page 2
+        with open('/tmp/w2m_test/pages/2/history/1.json', 'r') as f:
+            page = json.load(f)
+        res_page = {
+            'timestamp': 1,
+            'text': 'Main_Page',
+            'page_id': 2,
+            'title': 'Main_Page'
+        }
+        assert page == res_page
+
+        # rev 2 of page 2
+        with open('/tmp/w2m_test/pages/2/history/2.json', 'r') as f:
+            page = json.load(f)
+        res_page = {
+            'timestamp': 2,
+            'text': 'Main_Page text',
+            'page_id': 2,
+            'title': 'Main_Page'
+        }
+        assert page == res_page
+
+        # rev 1 of page 3
+        with open('/tmp/w2m_test/pages/3/history/1.json', 'r') as f:
+            page = json.load(f)
+        res_page = {
+            'timestamp': 1,
+            'text': 'Some test text',
+            'page_id': 3,
+            'title': 'Test'
+        }
+        assert page == res_page
+
+        # rev 2 of page 3
+        with open('/tmp/w2m_test/pages/3/history/2.json', 'r') as f:
+            page = json.load(f)
+        res_page = {
+            'timestamp': 2,
+            'text': '',
+            'page_id': 3,
+            'title': 'Test'
+        }
+        assert page == res_page
+
+    def test_extract_talk(self):
+        """Test that talk pages extracted properly."""
+        pages = [
+            {'page_id': 1, 'title': 'Test 1'},
+            {'page_id': 2, 'title': 'Test 2'},
+            {'page_id': 3, 'title': 'Test 3'},
+        ]
+        for page in pages:
+            self.extractor.extract_talk(page)
+
+        with open('/tmp/w2m_test/pages/1/discussion.json', 'r') as f:
+            page = json.load(f)
+        assert page == {'text': 'Talk for page Test 1.'}
+
+        with open('/tmp/w2m_test/pages/2/discussion.json', 'r') as f:
+            page = json.load(f)
+        assert page == {'text': 'Talk for page Test 2.'}
+
+        with open('/tmp/w2m_test/pages/3/discussion.json', 'r') as f:
+            page = json.load(f)
+        assert page == {'text': 'Talk for page Test 3.'}
 
 
 class TestMediawikiLoader(object):
@@ -59,25 +166,93 @@
 
         # monkey-patch MediawikiLoader for test
         def pages(self):
-            yield {'title': 'Test title', 'text': "'''bold''' ''italics''"}
-            yield {'title': 'Main', 'text': "main"}
-            yield {'title': 'Test', 'text': 'test'}
+            yield 1
+            yield 2
+
+        def history(self, page_dir):
+            data = {
+                1: [
+                    {
+                        'title': 'Test title',
+                        'text': "'''bold''' ''italics''",
+                        'page_id': 1,
+                        'timestamp': 1
+                    },
+                    {
+                        'title': 'Test title',
+                        'text': "'''bold'''",
+                        'page_id': 1,
+                        'timestamp': 2
+                    },
+                ],
+                2: [
+                    {
+                        'title': 'Main',
+                        'text': "Main text rev 1",
+                        'page_id': 2,
+                        'timestamp': 1
+                    },
+                    {
+                        'title': 'Main',
+                        'text': "Main text rev 2",
+                        'page_id': 2,
+                        'timestamp': 2
+                    },
+
+                ],
+            }
+            for page in data[page_dir]:
+                yield page
+
+        def talk(self, page_dir):
+            data = {
+                1: {'text': "''Talk page'' for page 1."},
+                2: {'text': "''Talk page'' for page 2."},
+            }
+            return data[page_dir]
+
+        def attachments(self, *args, **kwargs):
+            # make 'empty' iterator
+            if False:
+                yield
 
         MediawikiLoader._pages = pages
+        MediawikiLoader._history = history
+        MediawikiLoader._talk = talk
+        MediawikiLoader._attachments = attachments
         self.loader = MediawikiLoader(self.options)
 
     def get_page(self, title):
         return WM.Page.query.get(app_config_id=context.app.config._id,
                                  title=title)
 
+    def get_post(self, title):
+        page = self.get_page(title)
+        thread = M.Thread.query.get(ref_id=page.index_id())
+        return M.Post.query.get(discussion_id=thread.discussion_id,
+                                thread_id=thread._id)
+
     def test_load_pages(self):
+        """Test that pages, edit history and talk loaded properly"""
         self.loader.load_pages()
         page = self.get_page('Test title')
+
+        assert '**bold**' in page.text
+        # _italics_ should only be in the first revision of the page
+        assert '_italics_' not in page.text
+
+        page = page.get_version(1)
         assert '**bold** _italics_' in page.text
 
         page = self.get_page('Main')
-        assert 'main' in page.text
+        assert 'Main text rev 2' in page.text
 
-        page = self.get_page('Test')
-        print page.text, len(page.text)
-        assert 'test' in page.text
+        page = page.get_version(1)
+        assert 'Main text rev 1' in page.text
+
+        # Check that talk pages loaded
+        post = self.get_post('Test title')
+        assert '_Talk page_ for page 1.' in post.text
+
+        post = self.get_post('Main')
+        assert '_Talk page_ for page 2.' in post.text