| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import re |
| import urllib |
| from urllib2 import HTTPError |
| from urlparse import urlparse, urljoin, parse_qs |
| from collections import defaultdict |
| import logging |
| import os |
| |
| import requests |
| from BeautifulSoup import BeautifulSoup |
| from formencode import validators as fev |
| |
| from allura.lib import helpers as h |
| from allura import model as M |
| from forgeimporters.base import ProjectExtractor |
| from forgeimporters.base import File |
| |
| |
| log = logging.getLogger(__name__) |
| |
| |
| def _as_text(node, chunks=None): |
| """ |
| Similar to node.text, but preserves whitespace around tags, |
| and converts <br/>s to \n. |
| """ |
| if chunks is None: |
| chunks = [] |
| for n in node: |
| if isinstance(n, basestring): |
| chunks.append(n) |
| elif n.name == 'br': |
| chunks.append('\n') |
| else: |
| _as_text(n, chunks) |
| return ''.join(chunks) |
| |
| |
| def _as_markdown(tag, project_name): |
| fragments = [] |
| for fragment in tag: |
| if getattr(fragment, 'name', None) == 'a': |
| href = urlparse(fragment['href']) |
| qs = parse_qs(href.query) |
| gc_link = not href.netloc or href.netloc == 'code.google.com' |
| path_parts = href.path.split('/') |
| target_project = None |
| if gc_link: |
| if len(path_parts) >= 5 and path_parts[1] == 'a': |
| target_project = '/'.join(path_parts[1:5]) |
| elif len(path_parts) >= 3: |
| target_project = path_parts[2] |
| internal_link = target_project == project_name |
| if gc_link and internal_link and 'id' in qs: |
| # rewrite issue 123 project-internal issue links |
| fragment = '[%s](#%s)' % (fragment.text, qs['id'][0]) |
| elif gc_link and internal_link and 'r' in qs: |
| # rewrite r123 project-internal revision links |
| fragment = '[r%s]' % qs['r'][0] |
| elif gc_link: |
| # preserve GC-internal links (probably issue PROJECT:123 |
| # inter-project issue links) |
| fragment = '[%s](%s)' % ( |
| h.plain2markdown( |
| fragment.text, preserve_multiple_spaces=True, has_html_entities=True), |
| # possibly need to adjust this URL for /a/ hosted domain URLs, |
| # but it seems fragment['href'] always starts with / so it replaces the given path |
| urljoin('https://code.google.com/p/%s/issues/' % |
| project_name, fragment['href']), |
| ) |
| else: |
| # convert all other links to Markdown syntax |
| fragment = '[%s](%s)' % (fragment.text, fragment['href']) |
| elif getattr(fragment, 'name', None) == 'i': |
| # preserve styling of "(No comment was entered for this change.)" |
| # messages |
| fragment = '*%s*' % h.plain2markdown(fragment.text, |
| preserve_multiple_spaces=True, has_html_entities=True) |
| elif getattr(fragment, 'name', None) == 'b': |
| # preserve styling of issue template |
| fragment = '**%s**' % h.plain2markdown(fragment.text, |
| preserve_multiple_spaces=True, has_html_entities=True) |
| elif getattr(fragment, 'name', None) == 'br': |
| # preserve forced line-breaks |
| fragment = '\n' |
| else: |
| # convert all others to plain MD |
| fragment = h.plain2markdown( |
| unicode(fragment), preserve_multiple_spaces=True, has_html_entities=True) |
| fragments.append(fragment) |
| return ''.join(fragments).strip() |
| |
| |
| def csv_parser(page): |
| lines = page.readlines() |
| if not lines: |
| return [] |
| # skip CSV header |
| lines = lines[1:] |
| # skip "next page here" info footer |
| if not lines[-1].startswith('"'): |
| lines.pop() |
| # remove CSV wrapping (quotes, commas, newlines) |
| return [line.strip('",\n') for line in lines] |
| |
| |
| class GoogleCodeProjectNameValidator(fev.FancyValidator): |
| not_empty = True |
| messages = { |
| 'invalid': 'Please enter a project URL, or a project name containing ' |
| 'only letters, numbers, and dashes.', |
| 'unavailable': 'This project is unavailable for import', |
| } |
| |
| def _to_python(self, value, state=None): |
| project_name_re = re.compile(r'^[a-z0-9][a-z0-9-]{,61}$') |
| if project_name_re.match(value): |
| # just a name |
| project_name = value |
| else: |
| # try as a URL |
| project_name = None |
| project_name_simple = None |
| url = urlparse(value.strip()) |
| if url.netloc.endswith('.googlecode.com'): |
| project_name = url.netloc.split('.')[0] |
| elif url.netloc == 'code.google.com': |
| path_parts = url.path.lstrip('/').split('/') |
| if len(path_parts) >= 2 and path_parts[0] == 'p': |
| project_name = path_parts[1] |
| elif len(path_parts) >= 4 and path_parts[0] == 'a' and path_parts[2] == 'p': |
| project_name_simple = path_parts[3] |
| project_name = '/'.join(path_parts[0:4]) |
| |
| if not project_name_simple: |
| project_name_simple = project_name |
| |
| if not project_name or not project_name_re.match(project_name_simple): |
| raise fev.Invalid(self.message('invalid', state), value, state) |
| |
| if not GoogleCodeProjectExtractor(project_name).check_readable(): |
| raise fev.Invalid(self.message('unavailable', state), value, state) |
| return project_name |
| |
| |
| def split_project_name(project_name): |
| ''' |
| For hosted projects, the project_name includes the hosted domain. Split, like: |
| |
| :param str project_name: "a/eclipselabs.org/p/restclient-tool" |
| :return: ``("/a/eclipselabs.org", "restclient-tool")`` |
| ''' |
| if project_name.startswith('a/'): |
| hosted_domain_prefix = '/a/' + project_name.split('/')[1] |
| project_name = project_name.split('/')[3] |
| else: |
| hosted_domain_prefix = '' |
| project_name = project_name |
| return hosted_domain_prefix, project_name |
| |
| |
| class GoogleCodeProjectExtractor(ProjectExtractor): |
| BASE_URL = 'http://code.google.com' |
| RE_REPO_TYPE = re.compile(r'(svn|hg|git)') |
| |
| PAGE_MAP = { |
| 'project_info': BASE_URL + '{hosted_domain_prefix}/p/{project_name}/', |
| 'source_browse': BASE_URL + '{hosted_domain_prefix}/p/{project_name}/source/browse/', |
| 'issues_csv': BASE_URL + '{hosted_domain_prefix}/p/{project_name}/issues/csv?can=1&colspec=ID&sort=ID&start={start}', |
| 'issue': BASE_URL + '{hosted_domain_prefix}/p/{project_name}/issues/detail?id={issue_id}', |
| } |
| |
| LICENSE_MAP = defaultdict(lambda: 'Other/Proprietary License', { |
| 'Apache License 2.0': 'Apache License V2.0', |
| 'Artistic License/GPL': 'Artistic License', |
| 'Eclipse Public License 1.0': 'Eclipse Public License', |
| 'GNU GPL v2': 'GNU General Public License version 2.0 (GPLv2)', |
| 'GNU GPL v3': 'GNU General Public License version 3.0 (GPLv3)', |
| 'GNU Lesser GPL': 'GNU Library or Lesser General Public License version 2.0 (LGPLv2)', |
| 'MIT License': 'MIT License', |
| 'Mozilla Public License 1.1': 'Mozilla Public License 1.1 (MPL 1.1)', |
| 'New BSD License': 'BSD License', |
| 'Other Open Source': 'Open Software License', |
| }) |
| |
| DEFAULT_ICON = 'http://www.gstatic.com/codesite/ph/images/defaultlogo.png' |
| |
| def get_page_url(self, page_name, **kw): |
| # override, to handle hosted domains |
| hosted_domain_prefix, project_name = split_project_name(self.project_name) |
| return self.PAGE_MAP[page_name].format( |
| project_name=urllib.quote(project_name), |
| hosted_domain_prefix=hosted_domain_prefix, |
| **kw) |
| |
| def check_readable(self): |
| resp = requests.head(self.get_page_url('project_info')) |
| return resp.status_code == 200 |
| |
| def get_short_description(self, project): |
| page = self.get_page('project_info') |
| project.short_description = page.find( |
| itemprop='description').text.strip() |
| |
| def get_icon(self, project): |
| page = self.get_page('project_info') |
| icon_url = urljoin(self.url, page.find(itemprop='image').get('src')) |
| if icon_url == self.DEFAULT_ICON: |
| return |
| icon_name = urllib.unquote(urlparse(icon_url).path).split('/')[-1] |
| icon = File(icon_url, icon_name) |
| filetype = icon.type |
| # work around Google Code giving us bogus file type |
| if filetype.startswith('text/html'): |
| filetype = 'image/png' |
| M.ProjectFile.save_image( |
| icon_name, icon.file, filetype, |
| square=True, thumbnail_size=(48, 48), |
| thumbnail_meta={'project_id': project._id, 'category': 'icon'}) |
| |
| def get_license(self, project): |
| page = self.get_page('project_info') |
| license = page.find(text='Code license').findNext().find( |
| 'a').text.strip() |
| trove = M.TroveCategory.query.get(fullname=self.LICENSE_MAP[license]) |
| project.trove_license.append(trove._id) |
| |
| def get_repo_type(self): |
| page = self.get_page('source_browse') |
| repo_type = page.find(id="crumb_root") |
| if not repo_type: |
| raise Exception("Couldn't detect repo type: no #crumb_root in " |
| "{0}".format(self.url)) |
| re_match = self.RE_REPO_TYPE.match(repo_type.text.lower()) |
| if re_match: |
| return re_match.group(0) |
| else: |
| raise Exception("Unknown repo type: {0}".format(repo_type.text)) |
| |
| @classmethod |
| def iter_issues(cls, project_name): |
| """ |
| Iterate over all issues for a project, |
| using paging to keep the responses reasonable. |
| """ |
| extractor = cls(project_name) |
| issue_ids = extractor.get_issue_ids(start=0) |
| while issue_ids: |
| for issue_id in sorted(issue_ids): |
| try: |
| yield (int(issue_id), cls(project_name, 'issue', issue_id=issue_id)) |
| except HTTPError as e: |
| if e.code == 404: |
| log.warn('Unable to load GC issue: %s #%s: %s: %s', |
| project_name, issue_id, e, e.url) |
| continue |
| else: |
| raise |
| # get any new issues that were created while importing |
| # (jumping back a few in case some were deleted and new ones added) |
| new_ids = extractor.get_issue_ids(start=len(issue_ids) - 10) |
| issue_ids = new_ids - issue_ids |
| |
| def get_issue_ids(self, start=0): |
| limit = 100 |
| |
| issue_ids = set() |
| page = self.get_page('issues_csv', parser=csv_parser, start=start) |
| while page: |
| if len(page) <= 0: |
| return |
| issue_ids.update(page) |
| start += limit |
| page = self.get_page('issues_csv', parser=csv_parser, start=start) |
| return issue_ids |
| |
| def get_issue_summary(self): |
| text = self.page.find(id='issueheader').findAll( |
| 'td', limit=2)[1].span.text.strip() |
| bs = BeautifulSoup(text, convertEntities=BeautifulSoup.HTML_ENTITIES) |
| return bs.text |
| |
| def get_issue_description(self): |
| return _as_markdown(self.page.find(id='hc0').pre, self.project_name) |
| |
| def get_issue_created_date(self): |
| return self.page.find(id='hc0').find('span', 'date').get('title') |
| |
| def get_issue_mod_date(self): |
| comments = self.page.findAll('div', 'issuecomment') |
| if comments: |
| last_update = Comment(comments[-1], self.project_name) |
| return last_update.created_date |
| else: |
| return self.get_issue_created_date() |
| |
| def get_issue_creator(self): |
| a = self.page.find(id='hc0').find(True, 'userlink') |
| return UserLink(a) |
| |
| def get_issue_status(self): |
| tag = self.page.find(id='issuemeta').find( |
| 'th', text=re.compile('Status:')).findNext().span |
| if tag: |
| return tag.text.strip() |
| else: |
| return '' |
| |
| def get_issue_owner(self): |
| tag = self.page.find(id='issuemeta').find( |
| 'th', text=re.compile('Owner:')).findNext().find(True, 'userlink') |
| if tag: |
| return UserLink(tag) |
| else: |
| return None |
| |
| def get_issue_labels(self): |
| label_nodes = self.page.find(id='issuemeta').findAll('a', 'label') |
| return [_as_text(l) for l in label_nodes] |
| |
| def get_issue_attachments(self): |
| return _get_attachments(self.page.find(id='hc0')) |
| |
| def get_issue_stars(self): |
| stars_re = re.compile(r'(\d+) (person|people) starred this issue') |
| stars = self.page.find(id='issueheader').find(text=stars_re) |
| if stars: |
| return int(stars_re.search(stars).group(1)) |
| return 0 |
| |
| def iter_comments(self): |
| for comment in self.page.findAll('div', 'issuecomment'): |
| yield Comment(comment, self.project_name) |
| |
| |
| class UserLink(object): |
| |
| def __init__(self, tag): |
| self.name = tag.text.strip() |
| if tag.get('href'): |
| self.url = urljoin( |
| GoogleCodeProjectExtractor.BASE_URL, tag.get('href')) |
| else: |
| self.url = None |
| |
| def __str__(self): |
| if self.url: |
| return '[{name}]({url})'.format(name=self.name, url=self.url) |
| else: |
| return self.name |
| |
| |
| def _get_attachments(tag): |
| attachment_links = tag.find('div', 'attachments') |
| if attachment_links: |
| attachments = [] |
| for a in attachment_links.findAll('a', text='Download'): |
| url = a.parent.get('href') |
| try: |
| attachment = Attachment(url) |
| except Exception: |
| log.exception('Could not get attachment: %s', url) |
| else: |
| attachments.append(attachment) |
| return attachments |
| else: |
| return [] |
| |
| |
| class Comment(object): |
| |
| def __init__(self, tag, project_name): |
| self.author = UserLink( |
| tag.find('span', 'author').find(True, 'userlink')) |
| self.created_date = tag.find('span', 'date').get('title') |
| self.body = _as_markdown(tag.find('pre'), project_name) |
| self._get_updates(tag) |
| self.attachments = _get_attachments(tag) |
| |
| def _get_updates(self, tag): |
| _updates = tag.find('div', 'updates') |
| self.updates = { |
| b.text: b.nextSibling.strip() |
| for b in _updates.findAll('b')} if _updates else {} |
| |
| @property |
| def annotated_text(self): |
| text = ( |
| u'*Originally posted by:* {author}\n' |
| u'\n' |
| u'{body}\n' |
| u'\n' |
| u'{updates}' |
| ).format( |
| author=self.author, |
| body=self.body, |
| updates='\n'.join( |
| '**%s** %s' % (k, v) |
| for k, v in self.updates.items() |
| ), |
| ) |
| return text |
| |
| |
| class Attachment(File): |
| |
| def __init__(self, url): |
| url = urljoin(GoogleCodeProjectExtractor.BASE_URL, url) |
| filename = parse_qs(urlparse(url).query)['name'][0] |
| super(Attachment, self).__init__(url, filename) |