import json
import os
import shutil

import MySQLdb

from allura.command import base as allura_base


class MediawikiExtractor(object):
    """Base class for MediaWiki data providers"""

    def __init__(self, options):
        self.options = options
        if os.path.exists(self.options.dump_dir):
            # clear dump_dir before extraction (there may be old data)
            shutil.rmtree(self.options.dump_dir)
        os.makedirs(self.options.dump_dir)

    def extract(self):
        self.extract_pages()
        self.extract_history()
        self.extract_talk()
        self.extract_attachments()

    def extract_pages(self):
        raise NotImplementedError("subclass must override this")

    def extract_history(self):
        raise NotImplementedError("subclass must override this")

    def extract_talk(self):
        raise NotImplementedError("subclass must override this")

    def extract_attachments(self):
        raise NotImplementedError("subclass must override this")


class MySQLExtractor(MediawikiExtractor):
    """Extract MediaWiki data to json.

    Uses a connection to a MySQL database as the data source.
    """

    def __init__(self, options):
        super(MySQLExtractor, self).__init__(options)
        self._connection = None
        self.db_options = {
            'host': self.options.host or 'localhost',
            'user': self.options.user,
            'passwd': self.options.password,
            'db': self.options.db_name,
            'port': self.options.port or 3306
        }

    def connection(self):
        """Return an open MySQL connection, creating it on first use."""
        if not self._connection:
            self._connection = MySQLdb.connect(**self.db_options)
        return self._connection
    def _save(self, content, *paths):
        """Save json to a file in the local filesystem"""
        out_file = os.path.join(self.options.dump_dir, *paths)
        if not os.path.exists(os.path.dirname(out_file)):
            os.makedirs(os.path.dirname(out_file))
        with open(out_file, 'w') as out:
            out.write(content.encode('utf-8'))
    def _pages(self):
        """Yield (page_id, page_data) for each wiki page in the main namespace"""
        c = self.connection().cursor()
        c.execute('select page.page_id, page.page_title, text.old_text '
                  'from page '
                  'left join revision on revision.rev_id = page.page_latest '
                  'left join text on text.old_id = revision.rev_text_id '
                  'where page.page_namespace = 0')
        for row in c:
            _id, title, text = row
            page_data = {
                'title': title,
                'text': text or ''
            }
            yield _id, page_data

    def extract_pages(self):
        allura_base.log.info('Extracting pages...')
        for _id, page_data in self._pages():
            self._save(json.dumps(page_data), 'pages', str(_id) + '.json')
        allura_base.log.info('Extracting pages done')
    def extract_history(self):
        allura_base.log.info('extract_history not implemented yet. Skip.')

    def extract_talk(self):
        allura_base.log.info('extract_talk not implemented yet. Skip.')

    def extract_attachments(self):
        allura_base.log.info('extract_attachments not implemented yet. Skip.')
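

# Usage sketch: the extractor only relies on an ``options`` object exposing
# the attributes read above (dump_dir, host, user, password, db_name, port),
# so any simple namespace with placeholder values will do -- the values below
# are purely illustrative:
#
#     from argparse import Namespace
#
#     options = Namespace(dump_dir='/tmp/mediawiki-dump', host='localhost',
#                         user='wiki', password='secret', db_name='wikidb',
#                         port=3306)
#     MySQLExtractor(options).extract()
#
# extract() would then write one JSON file per wiki page to
# <dump_dir>/pages/<page_id>.json; history, talk and attachments are skipped
# until the corresponding methods are implemented.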