| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| """ |
| Generate Allura sitemap xml files. You will need to configure your webserver to serve the files. |
| |
| This takes a while to run on a prod-sized data set. There are a couple of |
| things that would make it faster, if we need/want to. |
| |
| 1. Monkeypatch forgetracker.model.ticket.Globals.bin_count to skip the |
| refresh (Solr search) and just return zero for everything, since we don't |
| need bin counts for the sitemap. |
| |
| 2. Use multiprocessing to distribute the offsets to n subprocesses. |
| """ |
| |
| import os |
| from datetime import datetime |
| import argparse |
| |
| from jinja2 import Template |
| import pylons |
| import webob |
| from pylons import tmpl_context as c |
| from ming.orm import ThreadLocalORMSession |
| from tg import config |
| |
| from allura import model as M |
| from allura.lib import security, utils |
| from allura.scripts import ScriptTask |
| |
| |
# Per-file URL cap; 50,000 is the limit imposed by the sitemaps.org protocol
# (the schema referenced in the templates below).
MAX_SITEMAP_URLS = 50000

# Jinja2 template for the sitemap index file (sitemap.xml), which lists the
# location and last-modified date of every generated sitemap-<n>.xml file.
INDEX_TEMPLATE = """\
<?xml version="1.0" encoding="utf-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for sitemap in sitemaps -%}
  <sitemap>
    <loc>{{ sitemap }}</loc>
    <lastmod>{{ now }}</lastmod>
  </sitemap>
{%- endfor %}
</sitemapindex>
"""

# Jinja2 template for an individual sitemap file; each entry in `locs` is a
# dict with 'url' and 'date' keys (see CreateSitemapFiles.execute).
SITEMAP_TEMPLATE = """\
<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for loc in locs -%}
  <url>
    <loc>{{ loc.url }}</loc>
    <lastmod>{{ loc.date }}</lastmod>
    <changefreq>daily</changefreq>
  </url>
{% endfor %}
</urlset>
"""
| |
| |
class CreateSitemapFiles(ScriptTask):
    """Generate sitemap XML files for all live (non-deleted) projects.

    Writes ``sitemap-<n>.xml`` files plus a ``sitemap.xml`` index into
    ``options.output_dir``; the web server must be configured to serve them.
    """

    @classmethod
    def execute(cls, options):
        """Walk every project and write out sitemap files plus an index.

        :param options: parsed argparse namespace from :meth:`parser`
        :raises Exception: if ``options.output_dir`` already exists
        """
        # This script will indirectly call app.sidebar_menu() for every app in
        # every project. Some of the sidebar_menu methods expect the
        # pylons.request threadlocal object to be present. So, we're faking it.
        #
        # The fact that this isn't a 'real' request doesn't matter for the
        # purposes of the sitemap.
        pylons.request._push_object(webob.Request.blank('/'))

        output_path = options.output_dir
        # Refuse to clobber a previous run's output.
        if os.path.exists(output_path):
            raise Exception('%s directory already exists.' % output_path)
        os.mkdir(output_path)

        now = datetime.utcnow().date()
        sitemap_content_template = Template(SITEMAP_TEMPLATE)

        def write_sitemap(urls, file_no):
            # Render one numbered sitemap file containing `urls`.
            sitemap_content = sitemap_content_template.render(dict(now=now, locs=urls))
            with open(os.path.join(output_path, 'sitemap-%d.xml' % file_no), 'w') as f:
                f.write(sitemap_content)

        creds = security.Credentials.get()
        locs = []
        file_count = 0

        # Translate excluded neighborhood url prefixes (e.g. 'u' -> '/u/')
        # into neighborhood ids for the project query below.
        excl_nbhd_ids = []
        if options.exclude_neighborhoods:
            prefix = ['/%s/' % n for n in options.exclude_neighborhoods]
            excl_nbhd_ids = [nbhd._id for nbhd in M.Neighborhood.query.find({'url_prefix': {'$in': prefix}})]

        # write sitemap files, MAX_SITEMAP_URLS per file
        for chunk in utils.chunked_find(M.Project, {'deleted': False, 'neighborhood_id': {'$nin': excl_nbhd_ids}}):
            for p in chunk:
                c.project = p
                try:
                    for s in p.sitemap(excluded_tools=options.exclude_tools):
                        # Site-relative urls get the site prefix; anything
                        # else is assumed absolute and used as-is.
                        # (startswith is safe even if s.url is empty.)
                        url = config['base_url'] + s.url if s.url.startswith('/') else s.url
                        locs.append({'url': url,
                                     'date': p.last_updated.strftime("%Y-%m-%d")})
                except Exception as e:
                    # Best-effort: one broken project must not abort the run.
                    print("Error creating sitemap for project '%s': %s" %
                          (p.shortname, e))
                creds.clear()
                if len(locs) >= options.urls_per_file:
                    write_sitemap(locs[:options.urls_per_file], file_count)
                    del locs[:options.urls_per_file]
                    file_count += 1
                # Clear ODM sessions so memory doesn't grow over the full
                # project set.
                M.main_orm_session.clear()
                ThreadLocalORMSession.close_all()
        # Flush any leftover urls that didn't fill a whole file.
        while locs:
            write_sitemap(locs[:options.urls_per_file], file_count)
            del locs[:options.urls_per_file]
            file_count += 1
        # write sitemap index file pointing at every sitemap-<n>.xml
        if file_count:
            sitemap_index_vars = dict(
                now=now,
                sitemaps=[
                    '%s%s/sitemap-%d.xml' % (config['base_url'], options.url_dir, n)
                    for n in range(file_count)])
            sitemap_index_content = Template(
                INDEX_TEMPLATE).render(sitemap_index_vars)
            with open(os.path.join(output_path, 'sitemap.xml'), 'w') as f:
                f.write(sitemap_index_content)

    @classmethod
    def parser(cls):
        """Build and return the command-line argument parser for this script."""
        class Validate(argparse.Action):
            # Cap --urls-per-file at the protocol's per-file limit.
            def __call__(self, parser, namespace, value, option_string=None):
                value = min(value, MAX_SITEMAP_URLS)
                setattr(namespace, self.dest, value)

        parser = argparse.ArgumentParser(description=__doc__,
                                         formatter_class=argparse.RawDescriptionHelpFormatter)
        parser.add_argument('-o', '--output-dir',
                            dest='output_dir',
                            default='/tmp/allura_sitemap',
                            help='Output directory (absolute path). '
                                 '[default: %(default)s]')
        parser.add_argument('-u', '--urls-per-file', dest='urls_per_file',
                            default=10000, type=int,
                            help='Number of URLs per sitemap file. [default: %(default)s, max: ' +
                                 str(MAX_SITEMAP_URLS) + ']',
                            action=Validate)
        parser.add_argument('--exclude-neighborhood', '-n', '--neighborhood', dest='exclude_neighborhoods',
                            help="URL prefix of neighborhood(s) to exclude. Example: u",
                            default=None, nargs='*', metavar='N')
        parser.add_argument('--exclude-tools', dest='exclude_tools',
                            help="Name(s) of tool(s) to exclude. [default: %(default)s]",
                            default=['link', 'git', 'hg', 'svn'],
                            nargs='*', metavar='TOOL')
        parser.add_argument('--url-dir', dest='url_dir',
                            default='/allura_sitemap',
                            help='URL directory in which the files will be served from')
        return parser
| |
| |
def get_parser():
    """Expose the script's argument parser at module level."""
    return CreateSitemapFiles.parser()
| |
| |
if __name__ == '__main__':
    # Stand-alone entry point; main() is provided by the ScriptTask base class.
    CreateSitemapFiles.main()