blob: 9f44fd4e74411bf562f907ab177d90419b6da802 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from time import mktime
from datetime import datetime
import re
import feedparser
from bson import ObjectId
import base
from allura.command import base as allura_base
from ming.orm import session
from pylons import tmpl_context as c
from allura import model as M
from forgeblog import model as BM
from forgeblog import version
from forgeblog.main import ForgeBlogApp
from allura.lib import exceptions
from allura.lib.decorators import exceptionless
## Everything in this file depends on html2text,
## so import attempt is placed in global scope.
try:
import html2text
except ImportError:
raise ImportError("""Importing RSS feeds requires GPL library "html2text":
https://github.com/brondsem/html2text""")
html2text.BODY_WIDTH = 0
re_amp = re.compile(r'''
[&] # amp
(?= # look ahead for:
([a-zA-Z0-9]+;) # named HTML entity
|
(\#[0-9]+;) # decimal entity
|
(\#x[0-9A-F]+;) # hex entity
)
''', re.VERBOSE)
re_leading_spaces = re.compile(r'^[\t ]+', re.MULTILINE)
re_preserve_spaces = re.compile(r'''
[ ] # space
(?=[ ]) # lookahead for a space
''', re.VERBOSE)
re_angle_bracket_open = re.compile('<')
re_angle_bracket_close = re.compile('>')
def plain2markdown(text, preserve_multiple_spaces=False, has_html_entities=False):
if not has_html_entities:
# prevent &foo; and &#123; from becoming HTML entities
text = re_amp.sub('&amp;', text)
# avoid accidental 4-space indentations creating code blocks
if preserve_multiple_spaces:
text = text.replace('\t', ' ' * 4)
text = re_preserve_spaces.sub('&nbsp;', text)
else:
text = re_leading_spaces.sub('', text)
# use html2text for most of the escaping
text = html2text.escape_md_section(text, snob=True)
# prevent < and > from becoming tags
text = re_angle_bracket_open.sub('&lt;', text)
text = re_angle_bracket_close.sub('&gt;', text)
return text
class RssFeedsCommand(base.BlogCommand):
summary = 'Rss feed client'
parser = base.BlogCommand.standard_parser(verbose=True)
parser.add_option('-a', '--appid', dest='appid', default='',
help='application id')
parser.add_option('-u', '--username', dest='username', default='root',
help='poster username')
def command(self):
# If this script creates a new BlogPost, it will create an
# activitystream activity for that post. During the saving of the
# activity, User.url() will be called. This method defers to an
# AuthenticationProvider, which depends on a request being setup in
# the current thread. So, we set one up here.
import pylons, webob
pylons.request._push_object(webob.Request.blank('/'))
self.basic_setup()
self.process_feed = exceptionless(None, log=allura_base.log)(self.process_feed)
self.process_entry = exceptionless(None, log=allura_base.log)(self.process_entry)
user = M.User.query.get(username=self.options.username)
c.user = user
self.prepare_feeds()
for appid in self.feed_dict:
for feed_url in self.feed_dict[appid]:
self.process_feed(appid, feed_url)
def prepare_feeds(self):
feed_dict = {}
if self.options.appid != '':
gl_app = BM.Globals.query.get(app_config_id=ObjectId(self.options.appid))
if not gl_app:
raise exceptions.NoSuchGlobalsError("The globals %s " \
"could not be found in the database" % self.options.appid)
if len(gl_app.external_feeds) > 0:
feed_dict[gl_app.app_config_id] = gl_app.external_feeds
else:
for gl_app in BM.Globals.query.find().all():
if len(gl_app.external_feeds) > 0:
feed_dict[gl_app.app_config_id] = gl_app.external_feeds
self.feed_dict = feed_dict
def process_feed(self, appid, feed_url):
appconf = M.AppConfig.query.get(_id=appid)
if not appconf:
return
c.project = appconf.project
app = ForgeBlogApp(c.project, appconf)
c.app = app
allura_base.log.info("Get feed: %s" % feed_url)
f = feedparser.parse(feed_url)
if f.bozo:
allura_base.log.exception("%s: %s" % (feed_url, f.bozo_exception))
return
for e in f.entries:
self.process_entry(e, appid)
session(BM.BlogPost).flush()
def process_entry(self, e, appid):
title = e.title
allura_base.log.info(" ...entry '%s'", title)
parsed_content = filter(None, e.get('content') or [e.get('summary_detail')])
if parsed_content:
content = u''
for ct in parsed_content:
if ct.type != 'text/html':
content += plain2markdown(ct.value)
else:
html2md = html2text.HTML2Text(baseurl=e.link)
html2md.escape_snob = True
markdown_content = html2md.handle(ct.value)
content += markdown_content
else:
content = plain2markdown(getattr(e, 'summary',
getattr(e, 'subtitle',
getattr(e, 'title'))))
content += u' [link](%s)' % e.link
updated = datetime.utcfromtimestamp(mktime(e.updated_parsed))
base_slug = BM.BlogPost.make_base_slug(title, updated)
b_count = BM.BlogPost.query.find(dict(slug=base_slug, app_config_id=appid)).count()
if b_count == 0:
post = BM.BlogPost(title=title, text=content, timestamp=updated,
app_config_id=appid,
tool_version={'blog': version.__version__},
state='published')
post.neighborhood_id=c.project.neighborhood_id
post.make_slug()
post.commit()