# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import re
import urllib
import logging
from urlparse import urlparse, urljoin
from collections import defaultdict
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from BeautifulSoup import BeautifulSoup

from allura import model as M
from forgeimporters.base import ProjectExtractor

log = logging.getLogger(__name__)


class GoogleCodeProjectExtractor(ProjectExtractor):
    BASE_URL = 'http://code.google.com'
    RE_REPO_TYPE = re.compile(r'(svn|hg|git)')

    PAGE_MAP = {
        'project_info': BASE_URL + '/p/%s/',
        'source_browse': BASE_URL + '/p/%s/source/browse/',
    }

    LICENSE_MAP = defaultdict(lambda: 'Other/Proprietary License', {
        'Apache License 2.0': 'Apache Software License',
        'Artistic License/GPL': 'Artistic License',
        'Eclipse Public License 1.0': 'Eclipse Public License',
        'GNU GPL v2': 'GNU General Public License (GPL)',
        'GNU GPL v3': 'GNU General Public License (GPL)',
        'GNU Lesser GPL': 'GNU Library or Lesser General Public License (LGPL)',
        'MIT License': 'MIT License',
        'Mozilla Public License 1.1': 'Mozilla Public License 1.1 (MPL 1.1)',
        'New BSD License': 'BSD License',
        'Other Open Source': 'Other/Proprietary License',
    })
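
    # Illustration of the defaultdict fallback above: a known Google Code
    # license name maps to its trove name, and anything else falls back to
    # the factory default.
    #
    #     LICENSE_MAP['GNU GPL v2']      # -> 'GNU General Public License (GPL)'
    #     LICENSE_MAP['Some License']    # -> 'Other/Proprietary License'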

    DEFAULT_ICON = 'http://www.gstatic.com/codesite/ph/images/defaultlogo.png'

    def __init__(self, allura_project, gc_project_name, page=None):
        self.project = allura_project
        self.gc_project_name = gc_project_name
        self._page_cache = {}
        self.url = None
        self.page = None
        if page:
            self.get_page(page)

    def get_page(self, page_name_or_url):
        """Return a BeautifulSoup object for the given page name or url.

        If a page name is provided, the associated url is looked up in
        :attr:`PAGE_MAP`.

        Results are cached so that subsequent calls for the same page name or
        url will return the cached result rather than making another HTTP
        request.
        """
        if page_name_or_url in self._page_cache:
            return self._page_cache[page_name_or_url]
        self.url = (self.get_page_url(page_name_or_url)
                    if page_name_or_url in self.PAGE_MAP
                    else page_name_or_url)
        self.page = self._page_cache[page_name_or_url] = \
            BeautifulSoup(self.urlopen(self.url))
        return self.page
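
    # A minimal usage sketch of the cache behavior above ('my-project' is a
    # hypothetical Google Code project name; urlopen is presumably provided
    # by the ProjectExtractor base class). The second call hits _page_cache,
    # so only one HTTP request is made:
    #
    #     extractor = GoogleCodeProjectExtractor(project, 'my-project')
    #     soup = extractor.get_page('project_info')
    #     assert extractor.get_page('project_info') is soup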

    def get_page_url(self, page_name):
        """Return the url associated with ``page_name``.

        Raises KeyError if ``page_name`` is not in :attr:`PAGE_MAP`.
        """
        return self.PAGE_MAP[page_name] % urllib.quote(self.gc_project_name)
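
    # For example, with a gc_project_name of 'my-project' (hypothetical):
    #
    #     extractor.get_page_url('project_info')
    #     # -> 'http://code.google.com/p/my-project/'
    #
    # urllib.quote percent-encodes any unsafe characters in the name before
    # it is substituted into the URL template.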

    def get_short_description(self):
        """Scrape the short description from the project page and set it
        on the Allura project.
        """
        page = self.get_page('project_info')
        description = page.find(itemprop='description')
        self.project.short_description = description.string.strip()

    def get_icon(self):
        """Download the project icon and attach it to the Allura project,
        unless it is the Google Code default icon.
        """
        page = self.get_page('project_info')
        icon_url = urljoin(self.url, page.find(itemprop='image').attrMap['src'])
        if icon_url == self.DEFAULT_ICON:
            return
        icon_name = urllib.unquote(urlparse(icon_url).path).split('/')[-1]
        fp_ish = self.urlopen(icon_url)
        fp = StringIO(fp_ish.read())
        M.ProjectFile.save_image(
            icon_name, fp,
            fp_ish.info()['content-type'].split(';')[0],  # strip off charset=x extra param
            square=True, thumbnail_size=(48, 48),
            thumbnail_meta={'project_id': self.project._id, 'category': 'icon'})
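
    # Hypothetical illustration of the icon_name extraction above: for an
    # icon_url of 'http://example.com/images/my%20logo.png?r=2', urlparse
    # keeps only the path, unquote decodes the percent-escapes, and the
    # final path segment becomes the filename:
    #
    #     urllib.unquote(urlparse(icon_url).path).split('/')[-1]
    #     # -> 'my logo.png'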

    def get_license(self):
        """Map the project's code license to a trove category and add it
        to the Allura project.
        """
        page = self.get_page('project_info')
        license = page.find(text='Code license').findNext().find('a').string.strip()
        trove = M.TroveCategory.query.get(fullname=self.LICENSE_MAP[license])
        self.project.trove_license.append(trove._id)
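
    # The scraping above assumes project-page markup along these lines (a
    # simplified, hypothetical example):
    #
    #     <th>Code license</th>
    #     <td><a href="...">New BSD License</a></td>
    #
    # find(text='Code license') locates the label text, findNext() advances
    # to the following element, and find('a').string yields the license
    # name, which LICENSE_MAP translates to a trove fullname ('BSD License'
    # in this case).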

    def get_repo_type(self):
        """Return the repo type ('svn', 'hg', or 'git') detected from the
        project's source browse page.
        """
        page = self.get_page('source_browse')
        repo_type = page.find(id="crumb_root")
        if not repo_type:
            raise Exception("Couldn't detect repo type: no #crumb_root in "
                            "{0}".format(self.url))
        re_match = self.RE_REPO_TYPE.match(repo_type.text.lower())
        if re_match:
            return re_match.group(0)
        else:
            raise Exception("Unknown repo type: {0}".format(repo_type.text))