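"""Extract MediaWiki data (pages with history, talk pages and attachments)
from a MySQL database into a local json dump directory."""
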
import MySQLdb
import os
import shutil
import json
import hashlib
from MySQLdb import DatabaseError
from allura.command import base as allura_base


class MediawikiExtractor(object):
    """Base class for MediaWiki data providers"""

    def __init__(self, options):
        self.options = options
        if os.path.exists(self.options.dump_dir):
            # clear dump_dir before extraction (there may be old data)
            shutil.rmtree(self.options.dump_dir)
        os.makedirs(self.options.dump_dir)

    def extract(self):
        """Extract pages with history, attachments, talk pages, etc."""
        raise NotImplementedError("subclass must override this")


class MySQLExtractor(MediawikiExtractor):
    """Extract MediaWiki data to json.

    Uses a connection to the MySQL database as the data source.
    """

    def __init__(self, options):
        super(MySQLExtractor, self).__init__(options)
        self._connection = None
        self.db_options = {
            'host': self.options.host or 'localhost',
            'user': self.options.user,
            'passwd': self.options.password,
            'db': self.options.db_name,
            'port': self.options.port or 3306
        }

    def connection(self):
        if not self._connection:
            try:
                self._connection = MySQLdb.connect(**self.db_options)
            except DatabaseError as e:
                allura_base.log.error("Can't connect to database: %s" % str(e))
                exit(2)
        return self._connection

    def _save(self, content, *paths):
        """Save json to file in local filesystem"""
        out_file = os.path.join(self.options.dump_dir, *paths)
        if not os.path.exists(os.path.dirname(out_file)):
            os.makedirs(os.path.dirname(out_file))
        with open(out_file, 'w') as out:
            out.write(content.encode('utf-8'))

    def _save_attachment(self, filepath, *paths):
        """Save attachment in dump directory.

        Copy from mediawiki dump directory to our internal dump directory.

        args:
            filepath - path to attachment in mediawiki dump.
            *paths - path to internal dump directory.
        """
        out_dir = os.path.join(self.options.dump_dir, *paths)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        shutil.copy(filepath, out_dir)

    def _pages(self):
        """Yield page_data for next wiki page"""
        c = self.connection().cursor()
        c.execute('select page.page_id, page.page_title '
                  'from page where page.page_namespace = 0')
        for row in c:
            _id, title = row
            page_data = {
                'page_id': _id,
                'title': title,
            }
            yield page_data

    def _history(self, page_id):
        """Yield page_data for next revision of wiki page"""
        c = self.connection().cursor()
        c.execute('select revision.rev_timestamp, text.old_text '
                  'from revision '
                  'left join text on revision.rev_text_id = text.old_id '
                  'where revision.rev_page = %s', page_id)
        for row in c:
            timestamp, text = row
            page_data = {
                'timestamp': timestamp,
                'text': text or ''
            }
            yield page_data

    def _talk(self, page_title):
        """Return page_data for talk page with `page_title` title"""
        c = self.connection().cursor()
        query_attrs = (page_title, 1)  # page_namespace == 1 - talk pages
        c.execute('select text.old_text '
                  'from page '
                  'left join revision on revision.rev_id = page.page_latest '
                  'left join text on text.old_id = revision.rev_text_id '
                  'where page.page_title = %s and page.page_namespace = %s '
                  'limit 1', query_attrs)
        row = c.fetchone()
        if row:
            text = row[0]
            return {'text': text}

    def _attachments(self, page_id):
        """Yield path to next file attached to wiki page"""
        c = self.connection().cursor()
        c.execute('select il_to from imagelinks '
                  'where il_from = %s', (page_id,))
        for row in c:
            name = row[0]
            # mediawiki stores attachments in subdirectories derived from the
            # md5 hash of the filename (e.g. a name hashing to 'ab12...' is
            # stored under <attachments_dir>/a/ab/), so build the path accordingly
            md5 = hashlib.md5(name).hexdigest()
            path = os.path.join(self.options.attachments_dir,
                                md5[:1], md5[:2], name)
            if os.path.isfile(path):
                yield path

    def extract(self):
        self.extract_pages()

    def extract_pages(self):
        allura_base.log.info('Extracting pages...')
        for page in self._pages():
            self.extract_history(page)
            self.extract_talk(page)
            self.extract_attachments(page)
        allura_base.log.info('Extracting pages done')

    def extract_history(self, page):
        page_id = page['page_id']
        for page_data in self._history(page_id):
            page_data.update(page)
            self._save(json.dumps(page_data), 'pages', str(page_id),
                       'history', str(page_data['timestamp']) + '.json')
        allura_base.log.info('Extracted history for page %s (%s)'
                             % (page_id, page['title']))

    def extract_talk(self, page):
        page_id = page['page_id']
        talk_page_data = self._talk(page['title'])
        if talk_page_data:
            self._save(json.dumps(talk_page_data), 'pages', str(page_id),
                       'discussion.json')
            allura_base.log.info('Extracted talk for page %s (%s)'
                                 % (page_id, page['title']))
        else:
            allura_base.log.info('No talk for page %s (%s)'
                                 % (page_id, page['title']))

    def extract_attachments(self, page):
        page_id = page['page_id']
        for filepath in self._attachments(page_id):
            self._save_attachment(filepath, 'pages', str(page_id),
                                  'attachments')
        allura_base.log.info('Extracted attachments for page %s (%s)'
                             % (page_id, page['title']))
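

# A minimal usage sketch, assuming an options object exposing the attributes
# referenced above (dump_dir, attachments_dir, host, port, user, password,
# db_name). The values below are placeholders, not the importer's real
# defaults; in Allura these options normally come from the import command.
if __name__ == '__main__':
    from argparse import Namespace

    opts = Namespace(
        dump_dir='/tmp/mediawiki-dump',               # where json is written
        attachments_dir='/var/lib/mediawiki/images',  # mediawiki upload dir
        host='localhost',
        port=3306,
        user='wiki_user',
        password='secret',
        db_name='wikidb',
    )
    MySQLExtractor(opts).extract()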