# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import re
import sys
import logging
from hashlib import sha1
from itertools import chain
from datetime import datetime
from collections import defaultdict, OrderedDict
from difflib import SequenceMatcher, unified_diff
import bson
from pylons import tmpl_context as c
import pymongo.errors
from ming import Field, collection, Index
from ming import schema as S
from ming.base import Object
from ming.utils import LazyProperty
from ming.orm import mapper, session
from allura.lib import utils
from allura.lib import helpers as h
from allura.lib.security import has_access
from .auth import User
from .project import AppConfig, Project
from .session import main_doc_session, project_doc_session
from .session import repository_orm_session
from .timeline import ActivityObject
log = logging.getLogger(__name__)
# Some schema types
SUser = dict(name=str, email=str, date=datetime)
SObjType = S.OneOf('blob', 'tree', 'submodule')
# Batch size used when issuing queries with $in
QSIZE = 100
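# Illustrative batching sketch (plain slicing; names like `ids` are
# hypothetical):
#   for i in range(0, len(ids), QSIZE):
#       for ci in CommitDoc.m.find(dict(_id={'$in': ids[i:i + QSIZE]})):
#           ...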
README_RE = re.compile(r'^README(\.[^.]*)?$', re.IGNORECASE)
VIEWABLE_EXTENSIONS = [
    '.php', '.py', '.js', '.java', '.html', '.htm', '.yaml', '.sh',
    '.rb', '.phtml', '.txt', '.bat', '.ps1', '.xhtml', '.css', '.cfm',
    '.jsp', '.jspx', '.pl', '.php4', '.php3', '.rhtml', '.svg',
    '.markdown', '.json', '.ini', '.tcl', '.vbs', '.xsl']
PYPELINE_EXTENSIONS = utils.MARKDOWN_EXTENSIONS + ['.rst']
DIFF_SIMILARITY_THRESHOLD = .5 # used for determining file renames
# Basic commit information
# One of these for each commit in the physical repo on disk. The _id is the
# hexsha of the commit (for Git and Hg).
CommitDoc = collection(
'repo_ci', main_doc_session,
Field('_id', str),
Field('tree_id', str),
Field('committed', SUser),
Field('authored', SUser),
Field('message', str),
Field('parent_ids', [str], index=True),
Field('child_ids', [str], index=True),
Field('repo_ids', [ S.ObjectId() ], index=True))
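# Illustrative document shape (all values are made-up examples):
#   {'_id': '1e146e67b6...', 'tree_id': '8f2a94c0...', 'message': 'Fix typo',
#    'authored': {'name': 'Dev One', 'email': 'dev@example.com', 'date': ...},
#    'committed': {...}, 'parent_ids': ['9fceb02c...'], 'child_ids': [],
#    'repo_ids': [ObjectId('...')]}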
# Basic tree information (also see TreesDoc)
TreeDoc = collection(
'repo_tree', main_doc_session,
Field('_id', str),
Field('tree_ids', [dict(name=str, id=str)]),
Field('blob_ids', [dict(name=str, id=str)]),
Field('other_ids', [dict(name=str, id=str, type=SObjType)]))
LastCommitDoc_old = collection(
'repo_last_commit', project_doc_session,
Field('_id', str),
Field('object_id', str, index=True),
Field('name', str),
Field('commit_info', dict(
id=str,
date=datetime,
author=str,
author_email=str,
author_url=str,
shortlink=str,
summary=str)))
# Information about the last commit to touch a tree
LastCommitDoc = collection(
'repo_last_commit', main_doc_session,
Field('_id', S.ObjectId()),
Field('commit_id', str),
Field('path', str),
Index('commit_id', 'path'),
Field('entries', [dict(
name=str,
commit_id=str)]))
# List of all trees contained within a commit
# TreesDoc._id = CommitDoc._id
# TreesDoc.tree_ids = [ TreeDoc._id, ... ]
TreesDoc = collection(
'repo_trees', main_doc_session,
Field('_id', str),
Field('tree_ids', [str]))
# Information about which things were added/removed in commit
# DiffInfoDoc._id = CommitDoc._id
DiffInfoDoc = collection(
'repo_diffinfo', main_doc_session,
Field('_id', str),
Field(
'differences',
[ dict(name=str, lhs_id=str, rhs_id=str)]))
# List of commit runs (a run is a linear series of single-parent commits)
# CommitRunDoc.commit_ids = [ CommitDoc._id, ... ]
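# (e.g., commits A <- B <- C, where each commit in the chain has a single
# parent, collapse into one run containing A, B, and C)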
CommitRunDoc = collection(
'repo_commitrun', main_doc_session,
Field('_id', str),
Field('parent_commit_ids', [str], index=True),
Field('commit_ids', [str], index=True),
Field('commit_times', [datetime]))
class RepoObject(object):
def __repr__(self): # pragma no cover
return '<%s %s>' % (
self.__class__.__name__, self._id)
def primary(self):
return self
def index_id(self):
'''Globally unique artifact identifier. Used for
SOLR ID, shortlinks, and maybe elsewhere
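        Example (illustrative; assumes this module is allura.model.repo):
            'allura/model/repo/Commit#1e146e67b6...'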
'''
id = '%s.%s#%s' % (
self.__class__.__module__,
self.__class__.__name__,
self._id)
return id.replace('.', '/')
@classmethod
def upsert(cls, id, **kwargs):
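        '''Return (object, isnew): fetch the document with the given _id, or
        create and flush a new one, falling back to a re-fetch if a
        concurrent insert wins the race.

        Illustrative usage (hypothetical hexsha):
            ci, isnew = Commit.upsert('1e146e67b6...')
        '''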
isnew = False
r = cls.query.get(_id=id)
if r is not None: return r, isnew
try:
r = cls(_id=id, **kwargs)
session(r).flush(r)
isnew = True
except pymongo.errors.DuplicateKeyError: # pragma no cover
session(r).expunge(r)
r = cls.query.get(_id=id)
return r, isnew
class Commit(RepoObject, ActivityObject):
type_s = 'Commit'
# Ephemeral attrs
    repo = None
@property
def activity_name(self):
return self.shorthand_id()
@property
def activity_extras(self):
d = ActivityObject.activity_extras.fget(self)
d.update(summary=self.summary)
if self.repo:
d.update(app_config_id=self.repo.app.config._id)
return d
def has_activity_access(self, perm, user, activity):
"""Commits have no ACLs and are therefore always viewable by any user.
"""
app_config_id = activity.obj.activity_extras.get('app_config_id')
if app_config_id:
app_config = AppConfig.query.get(_id=app_config_id)
if app_config:
project = Project.query.get(_id=app_config.project_id)
app = app_config.load()(project, app_config)
return has_access(app, perm, user, project)
return True
def set_context(self, repo):
self.repo = repo
@LazyProperty
def author_url(self):
u = User.by_email_address(self.authored.email)
if u: return u.url()
@LazyProperty
def committer_url(self):
u = User.by_email_address(self.committed.email)
if u: return u.url()
@LazyProperty
def tree(self):
return self.get_tree(create=True)
def get_tree(self, create=True):
if self.tree_id is None and create:
self.tree_id = self.repo.compute_tree_new(self)
if self.tree_id is None:
return None
cache = getattr(c, 'model_cache', '') or ModelCache()
t = cache.get(Tree, dict(_id=self.tree_id))
if t is None and create:
self.tree_id = self.repo.compute_tree_new(self)
t = Tree.query.get(_id=self.tree_id)
cache.set(Tree, dict(_id=self.tree_id), t)
if t is not None:
t.set_context(self)
return t
@LazyProperty
def summary(self):
message = h.really_unicode(self.message)
first_line = message.split('\n')[0]
return h.text.truncate(first_line, 50)
def shorthand_id(self):
if self.repo is None: self.repo = self.guess_repo()
if self.repo is None: return repr(self)
return self.repo.shorthand_for_commit(self._id)
@LazyProperty
def symbolic_ids(self):
return self.repo.symbolics_for_commit(self)
def get_parent(self, index=0):
'''Get the parent of this commit.
If there is no parent commit, or if an invalid index is given,
returns None.
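        Example:
            first_parent = commit.get_parent()    # None for a root commit
            second_parent = commit.get_parent(1)  # e.g. the merged-in branch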
'''
try:
cache = getattr(c, 'model_cache', '') or ModelCache()
ci = cache.get(Commit, dict(_id=self.parent_ids[index]))
if not ci:
return None
ci.set_context(self.repo)
return ci
        except IndexError:
return None
def climb_commit_tree(self, predicate=None):
'''
Returns a generator that walks up the commit tree along
        the first-parent ancestry, starting with this commit,
optionally filtering by a predicate.'''
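        # Illustrative usage (hypothetical predicate):
        #   readme_commits = commit.climb_commit_tree(
        #       lambda ci: 'README' in ci.changed_paths)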
ancestor = self
while ancestor:
if predicate is None or predicate(ancestor):
yield ancestor
ancestor = ancestor.get_parent()
def url(self):
if self.repo is None: self.repo = self.guess_repo()
if self.repo is None: return '#'
return self.repo.url_for_commit(self)
def guess_repo(self):
import traceback
log.error('guess_repo: should not be called: %s' % ''.join(traceback.format_stack()))
for ac in c.project.app_configs:
try:
app = c.project.app_instance(ac)
if app.repo._id in self.repo_ids:
return app.repo
except AttributeError:
pass
return None
def link_text(self):
'''The link text that will be used when a shortlink to this artifact
is expanded into an <a></a> tag.
By default this method returns shorthand_id(). Subclasses should
override this method to provide more descriptive link text.
'''
return self.shorthand_id()
def context(self):
result = dict(prev=None, next=None)
if self.parent_ids:
result['prev'] = self.query.find(dict(_id={'$in': self.parent_ids })).all()
for ci in result['prev']:
ci.set_context(self.repo)
if self.child_ids:
result['next'] = self.query.find(dict(_id={'$in': self.child_ids })).all()
for ci in result['next']:
ci.set_context(self.repo)
return result
@LazyProperty
def diffs(self):
return self.paged_diffs()
def paged_diffs(self, start=0, end=None):
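        '''Return an Object with `added`, `removed`, `changed`, and `copied`
        file lists for this commit, sliced to differences[start:end], plus
        `total`, the full difference count (useful for paging).

        Illustrative usage:
            page = commit.paged_diffs(start=0, end=25)
            page.added, page.total
        '''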
di = DiffInfoDoc.m.get(_id=self._id)
if di is None:
return Object(added=[], removed=[], changed=[], copied=[], total=0)
added = []
removed = []
changed = []
copied = []
for change in di.differences[start:end]:
if change.rhs_id is None:
removed.append(change.name)
elif change.lhs_id is None:
added.append(change.name)
else:
changed.append(change.name)
copied = self._diffs_copied(added, removed)
return Object(
added=added, removed=removed,
changed=changed, copied=copied,
total=len(di.differences))
def _diffs_copied(self, added, removed):
        '''Return a list of file-rename diffs. Also mutates the `added` and
        `removed` lists, dropping entries that were detected as renames.
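        Each returned dict has keys 'old', 'new', 'ratio', and 'diff' (a
        unified diff when the match was not exact, else an empty string).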
'''
def _blobs_similarity(removed_blob, added):
best = dict(ratio=0, name='', blob=None)
for added_name in added:
added_blob = self.tree.get_obj_by_path(added_name)
if not isinstance(added_blob, Blob):
continue
diff = SequenceMatcher(None, removed_blob.text,
added_blob.text)
ratio = diff.quick_ratio()
if ratio > best['ratio']:
best['ratio'] = ratio
best['name'] = added_name
best['blob'] = added_blob
if ratio == 1:
                    break  # we won't find better similarity than 100% :)
if best['ratio'] > DIFF_SIMILARITY_THRESHOLD:
diff = ''
if best['ratio'] < 1:
added_blob = best['blob']
rpath = ('a' + removed_blob.path()).encode('utf-8')
apath = ('b' + added_blob.path()).encode('utf-8')
diff = ''.join(unified_diff(list(removed_blob),
list(added_blob),
rpath, apath))
return dict(new=best['name'],
ratio=best['ratio'], diff=diff)
def _trees_similarity(removed_tree, added):
for added_name in added:
added_tree = self.tree.get_obj_by_path(added_name)
if not isinstance(added_tree, Tree):
continue
if removed_tree._id == added_tree._id:
return dict(new=added_name,
ratio=1, diff='')
if not removed:
return []
copied = []
prev_commit = self.get_parent()
for removed_name in removed[:]:
removed_blob = prev_commit.tree.get_obj_by_path(removed_name)
rename_info = None
if isinstance(removed_blob, Blob):
rename_info = _blobs_similarity(removed_blob, added)
elif isinstance(removed_blob, Tree):
rename_info = _trees_similarity(removed_blob, added)
if rename_info is not None:
rename_info['old'] = removed_name
copied.append(rename_info)
removed.remove(rename_info['old'])
added.remove(rename_info['new'])
return copied
def get_path(self, path, create=True):
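        '''Return the Tree or Blob at `path` within this commit, e.g.
        commit.get_path('/docs/index.rst'). Raises KeyError if any
        component of the path does not exist.
        '''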
path = path.lstrip('/')
parts = path.split('/')
cur = self.get_tree(create)
if cur is not None:
for part in parts:
if part != '':
cur = cur[part]
return cur
def has_path(self, path):
try:
self.get_path(path)
return True
except KeyError:
return False
@LazyProperty
def changed_paths(self):
'''
Returns a list of paths changed in this commit.
Leading and trailing slashes are removed, and
the list is complete, meaning that if a sub-path
is changed, all of the parent paths are included
(including '' to represent the root path).
Example:
If the file /foo/bar is changed in the commit,
this would return ['', 'foo', 'foo/bar']
'''
diff_info = DiffInfoDoc.m.get(_id=self._id)
diffs = set()
if diff_info:
for d in diff_info.differences:
node = d.name.strip('/')
diffs.add(node)
node_path = os.path.dirname(node)
while node_path:
diffs.add(node_path)
node_path = os.path.dirname(node_path)
diffs.add('') # include '/' if there are any changes
return diffs
@LazyProperty
def added_paths(self):
'''
Returns a list of paths added in this commit.
Leading and trailing slashes are removed, and
the list is complete, meaning that if a directory
with subdirectories is added, all of the child
paths are included (this relies on the DiffInfoDoc
being complete).
Example:
If the directory /foo/bar/ is added in the commit
which contains a subdirectory /foo/bar/baz/ with
the file /foo/bar/baz/qux.txt, this would return:
['foo/bar', 'foo/bar/baz', 'foo/bar/baz/qux.txt']
'''
diff_info = DiffInfoDoc.m.get(_id=self._id)
diffs = set()
if diff_info:
for d in diff_info.differences:
if d.lhs_id is None:
diffs.add(d.name.strip('/'))
return diffs
@LazyProperty
def info(self):
return dict(
id=self._id,
author=self.authored.name,
author_email=self.authored.email,
date=self.authored.date,
author_url=self.author_url,
shortlink=self.shorthand_id(),
summary=self.summary
)
class Tree(RepoObject):
# Ephemeral attrs
    repo = None
    commit = None
    parent = None
    name = None
def compute_hash(self):
'''Compute a hash based on the contents of the tree. Note that this
hash does not necessarily correspond to any actual DVCS hash.
'''
lines = (
[ 'tree' + x.name + x.id for x in self.tree_ids ]
+ [ 'blob' + x.name + x.id for x in self.blob_ids ]
+ [ x.type + x.name + x.id for x in self.other_ids ])
sha_obj = sha1()
for line in sorted(lines):
sha_obj.update(line)
return sha_obj.hexdigest()
def __getitem__(self, name):
cache = getattr(c, 'model_cache', '') or ModelCache()
obj = self.by_name[name]
if obj['type'] == 'blob':
return Blob(self, name, obj['id'])
if obj['type'] == 'submodule':
log.info('Skipping submodule "%s"' % name)
            raise KeyError(name)
obj = cache.get(Tree, dict(_id=obj['id']))
if obj is None:
oid = self.repo.compute_tree_new(self.commit, self.path() + name + '/')
obj = cache.get(Tree, dict(_id=oid))
            if obj is None: raise KeyError(name)
obj.set_context(self, name)
return obj
def get_obj_by_path(self, path):
if hasattr(path, 'get'):
path = path['new']
if path.startswith('/'):
path = path[1:]
path = path.split('/')
obj = self
for p in path:
try:
obj = obj[p]
except KeyError:
return None
return obj
def get_blob_by_path(self, path):
obj = self.get_obj_by_path(path)
return obj if isinstance(obj, Blob) else None
def set_context(self, commit_or_tree, name=None):
assert commit_or_tree is not self
self.repo = commit_or_tree.repo
if name:
self.commit = commit_or_tree.commit
self.parent = commit_or_tree
self.name = name
else:
self.commit = commit_or_tree
def readme(self):
        'returns (filename, unicode text) if a readme file is found, else (None, None)'
for x in self.blob_ids:
if README_RE.match(x.name):
name = x.name
blob = self[name]
return (x.name, h.really_unicode(blob.text))
return None, None
def ls(self):
'''
List the entries in this tree, with historical commit info for
each node. Eventually, ls_old can be removed and this can be
replaced with the following:
return self._lcd_map(LastCommit.get(self))
'''
# look for existing new format first
last_commit = LastCommit.get(self, create=False)
if last_commit:
return self._lcd_map(last_commit)
# otherwise, try old format
old_style_results = self.ls_old()
if old_style_results:
log.info('Using old-style results from ls_old()')
return old_style_results
        # finally, use the new implementation that auto-vivifies
last_commit = LastCommit.get(self, create=True)
# ensure that the LCD is saved, even if
# there is an error later in the request
if last_commit:
session(last_commit).flush(last_commit)
return self._lcd_map(last_commit)
else:
return []
def _lcd_map(self, lcd):
if lcd is None:
return []
commit_ids = [e.commit_id for e in lcd.entries]
commits = list(Commit.query.find(dict(_id={'$in': commit_ids})))
for commit in commits:
commit.set_context(self.repo)
commit_infos = {c._id: c.info for c in commits}
tree_names = sorted([n.name for n in self.tree_ids])
blob_names = sorted([n.name for n in chain(self.blob_ids, self.other_ids)])
results = []
for type, names in (('DIR', tree_names), ('BLOB', blob_names)):
for name in names:
commit_info = commit_infos.get(lcd.by_name.get(name))
if not commit_info:
commit_info = defaultdict(str)
elif 'id' in commit_info:
commit_info['href'] = self.repo.url_for_commit(commit_info['id'])
results.append(dict(
kind=type,
name=name,
href=name,
last_commit=dict(
author=commit_info['author'],
author_email=commit_info['author_email'],
author_url=commit_info['author_url'],
date=commit_info.get('date'),
href=commit_info.get('href',''),
shortlink=commit_info['shortlink'],
summary=commit_info['summary'],
),
))
return results
def ls_old(self):
# Load last commit info
id_re = re.compile("^{0}:{1}:".format(
self.repo._id,
re.escape(h.really_unicode(self.path()).encode('utf-8'))))
lc_index = dict(
(lc.name, lc.commit_info)
for lc in LastCommitDoc_old.m.find(dict(_id=id_re)))
# FIXME: Temporarily fall back to old, semi-broken lookup behavior until refresh is done
oids = [ x.id for x in chain(self.tree_ids, self.blob_ids, self.other_ids) ]
id_re = re.compile("^{0}:".format(self.repo._id))
lc_index.update(dict(
(lc.object_id, lc.commit_info)
for lc in LastCommitDoc_old.m.find(dict(_id=id_re, object_id={'$in': oids}))))
# /FIXME
if not lc_index:
# allow fallback to new method instead
# of showing a bunch of Nones
return []
results = []
def _get_last_commit(name, oid):
lc = lc_index.get(name, lc_index.get(oid, None))
if lc is None:
lc = dict(
author=None,
author_email=None,
author_url=None,
date=None,
id=None,
href=None,
shortlink=None,
summary=None)
if 'href' not in lc:
lc['href'] = self.repo.url_for_commit(lc['id'])
return lc
for x in sorted(self.tree_ids, key=lambda x:x.name):
results.append(dict(
kind='DIR',
name=x.name,
href=x.name + '/',
last_commit=_get_last_commit(x.name, x.id)))
for x in sorted(self.blob_ids, key=lambda x:x.name):
results.append(dict(
kind='FILE',
name=x.name,
href=x.name,
last_commit=_get_last_commit(x.name, x.id)))
for x in sorted(self.other_ids, key=lambda x:x.name):
results.append(dict(
kind=x.type,
name=x.name,
href=None,
last_commit=_get_last_commit(x.name, x.id)))
return results
def path(self):
if self.parent:
assert self.parent is not self
return self.parent.path() + self.name + '/'
else:
return '/'
def url(self):
return self.commit.url() + 'tree' + self.path()
@LazyProperty
def by_name(self):
d = Object((x.name, x) for x in self.other_ids)
d.update(
(x.name, Object(x, type='tree'))
for x in self.tree_ids)
d.update(
(x.name, Object(x, type='blob'))
for x in self.blob_ids)
return d
def is_blob(self, name):
return self.by_name[name]['type'] == 'blob'
def get_blob(self, name):
x = self.by_name[name]
return Blob(self, name, x.id)
class Blob(object):
'''Lightweight object representing a file in the repo'''
def __init__(self, tree, name, _id):
self._id = _id
self.tree = tree
self.name = name
self.repo = tree.repo
self.commit = tree.commit
fn, ext = os.path.splitext(self.name)
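        # for dotfiles such as '.gitignore', splitext() returns the whole
        # name as `fn` with an empty `ext`, so fall back to the full name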
self.extension = ext or fn
def path(self):
return self.tree.path() + h.really_unicode(self.name)
def url(self):
return self.tree.url() + h.really_unicode(self.name)
@LazyProperty
def prev_commit(self):
pcid = LastCommit._prev_commit_id(self.commit, self.path().strip('/'))
if pcid:
return self.repo.commit(pcid)
return None
@LazyProperty
def next_commit(self):
try:
path = self.path()
cur = self.commit
next = cur.context()['next']
while next:
cur = next[0]
next = cur.context()['next']
other_blob = cur.get_path(path, create=False)
if other_blob is None or other_blob._id != self._id:
return cur
        except Exception:
log.exception('Lookup next_commit')
return None
@LazyProperty
def _content_type_encoding(self):
return self.repo.guess_type(self.name)
@LazyProperty
def content_type(self):
return self._content_type_encoding[0]
@LazyProperty
def content_encoding(self):
return self._content_type_encoding[1]
@property
def has_pypeline_view(self):
if README_RE.match(self.name) or self.extension in PYPELINE_EXTENSIONS:
return True
return False
@property
def has_html_view(self):
if (self.content_type.startswith('text/') or
self.extension in VIEWABLE_EXTENSIONS or
self.extension in PYPELINE_EXTENSIONS or
self.extension in self.repo._additional_viewable_extensions or
utils.is_text_file(self.text)):
return True
return False
@property
def has_image_view(self):
return self.content_type.startswith('image/')
def context(self):
path = self.path()
prev = self.prev_commit
next = self.next_commit
if prev is not None:
try:
prev = prev.get_path(path, create=False)
            except KeyError:
prev = None
if next is not None:
try:
next = next.get_path(path, create=False)
            except KeyError:
next = None
return dict(
prev=prev,
next=next)
def open(self):
return self.repo.open_blob(self)
def __iter__(self):
return iter(self.open())
@LazyProperty
def size(self):
return self.repo.blob_size(self)
@LazyProperty
def text(self):
return self.open().read()
@classmethod
def diff(cls, v0, v1):
        differ = SequenceMatcher(None, v0, v1)
return differ.get_opcodes()
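    # Illustrative (inputs are sequences of lines):
    #   Blob.diff(['one\n', 'two\n'], ['one\n', 'three\n'])
    #   -> [('equal', 0, 1, 0, 1), ('replace', 1, 2, 1, 2)]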
class LastCommit(RepoObject):
def __repr__(self):
return '<LastCommit /%s %s>' % (self.path, self.commit_id)
@classmethod
def _last_commit_id(cls, commit, path):
try:
rev = commit.repo.log(commit._id, path, id_only=True).next()
return commit.repo.rev_to_commit_id(rev)
except StopIteration:
log.error('Tree node not recognized by SCM: %s @ %s', path, commit._id)
return commit._id
@classmethod
def _prev_commit_id(cls, commit, path):
if not commit.parent_ids or path in commit.added_paths:
return None # new paths by definition have no previous LCD
lcid_cache = getattr(c, 'lcid_cache', '')
if lcid_cache != '' and path in lcid_cache:
return lcid_cache[path]
try:
log_iter = commit.repo.log(commit._id, path, id_only=True)
log_iter.next()
rev = log_iter.next()
return commit.repo.rev_to_commit_id(rev)
except StopIteration:
return None
@classmethod
def get(cls, tree, create=True):
'''Find or build the LastCommitDoc for the given tree.'''
cache = getattr(c, 'model_cache', '') or ModelCache()
path = tree.path().strip('/')
last_commit_id = cls._last_commit_id(tree.commit, path)
lcd = cache.get(cls, {'path': path, 'commit_id': last_commit_id})
if lcd is None and create:
commit = cache.get(Commit, {'_id': last_commit_id})
commit.set_context(tree.repo)
lcd = cls._build(commit.get_path(path))
return lcd
@classmethod
def _build(cls, tree):
'''
Build the LCD record, presuming that this tree is where it was most
recently changed.
'''
model_cache = getattr(c, 'model_cache', '') or ModelCache()
path = tree.path().strip('/')
entries = []
prev_lcd = None
prev_lcd_cid = cls._prev_commit_id(tree.commit, path)
if prev_lcd_cid:
prev_lcd = model_cache.get(cls, {'path': path, 'commit_id': prev_lcd_cid})
entries = {}
nodes = set([node.name for node in chain(tree.tree_ids, tree.blob_ids, tree.other_ids)])
changed = set([node for node in nodes if os.path.join(path, node) in tree.commit.changed_paths])
unchanged = [os.path.join(path, node) for node in nodes - changed]
if prev_lcd:
# get unchanged entries from previously computed LCD
entries = prev_lcd.by_name
elif unchanged:
# no previously computed LCD, so get unchanged entries from SCM
# (but only ask for the ones that we know we need)
entries = tree.commit.repo.last_commit_ids(tree.commit, unchanged)
if entries is None:
# something strange went wrong; still show the list of files
# and possibly try again later
entries = {}
# paths are fully-qualified; shorten them back to just node names
entries = {os.path.basename(path):commit_id for path,commit_id in entries.iteritems()}
# update with the nodes changed in this tree's commit
entries.update({node: tree.commit._id for node in changed})
# convert to a list of dicts, since mongo doesn't handle arbitrary keys well (i.e., . and $ not allowed)
entries = [{'name':name, 'commit_id':value} for name,value in entries.iteritems()]
lcd = cls(
commit_id=tree.commit._id,
path=path,
entries=entries,
)
model_cache.set(cls, {'path': path, 'commit_id': tree.commit._id}, lcd)
return lcd
@LazyProperty
def by_name(self):
return {n.name: n.commit_id for n in self.entries}
mapper(Commit, CommitDoc, repository_orm_session)
mapper(Tree, TreeDoc, repository_orm_session)
mapper(LastCommit, LastCommitDoc, repository_orm_session)
class ModelCache(object):
'''
Cache model instances based on query params passed to get.
'''
def __init__(self, max_instances=None, max_queries=None):
'''
By default, each model type can have 2000 instances and
8000 queries. You can override these for specific model
types by passing in a dict() for either max_instances or
max_queries keyed by the class(es) with the max values.
        Classes not in the dict() fall back to the 2000/8000 defaults.
If you pass in a number instead of a dict, that value will
be used as the max for all classes.
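        Example (illustrative):
            cache = ModelCache()  # 2000 instances / 8000 queries per class
            cache = ModelCache(max_instances={Commit: 4000},
                               max_queries=16000)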
'''
max_instances_default = 2000
max_queries_default = 8000
if isinstance(max_instances, int):
max_instances_default = max_instances
if isinstance(max_queries, int):
max_queries_default = max_queries
self._max_instances = defaultdict(lambda:max_instances_default)
self._max_queries = defaultdict(lambda:max_queries_default)
if hasattr(max_instances, 'items'):
self._max_instances.update(max_instances)
if hasattr(max_queries, 'items'):
self._max_queries.update(max_queries)
self._query_cache = defaultdict(OrderedDict) # keyed by query, holds _id
self._instance_cache = defaultdict(OrderedDict) # keyed by _id
self._synthetic_ids = defaultdict(set)
self._synthetic_id_queries = defaultdict(set)
def _normalize_query(self, query):
_query = query
if not isinstance(_query, tuple):
_query = tuple(sorted(_query.items(), key=lambda k: k[0]))
return _query
def _model_query(self, cls):
if hasattr(cls, 'query'):
return cls.query
elif hasattr(cls, 'm'):
return cls.m
else:
raise AttributeError('%s has neither "query" nor "m" attribute' % cls)
def get(self, cls, query):
_query = self._normalize_query(query)
self._touch(cls, _query)
if _query not in self._query_cache[cls]:
val = self._model_query(cls).get(**query)
self.set(cls, _query, val)
return val
_id = self._query_cache[cls][_query]
if _id is None:
return None
if _id not in self._instance_cache[cls]:
val = self._model_query(cls).get(**query)
self.set(cls, _query, val)
return val
return self._instance_cache[cls][_id]
def set(self, cls, query, val):
_query = self._normalize_query(query)
if val is not None:
_id = getattr(val, '_model_cache_id',
getattr(val, '_id',
self._query_cache[cls].get(_query,
None)))
if _id is None:
_id = val._model_cache_id = bson.ObjectId()
self._synthetic_ids[cls].add(_id)
            if _id in self._synthetic_ids[cls]:
self._synthetic_id_queries[cls].add(_query)
self._query_cache[cls][_query] = _id
self._instance_cache[cls][_id] = val
else:
self._query_cache[cls][_query] = None
self._touch(cls, _query)
self._check_sizes(cls)
def _touch(self, cls, query):
'''
        Move the entry to the most-recently-used end of its cache,
        preventing duplicates; eviction then discards entries in
        least-recently-used order.
'''
_query = self._normalize_query(query)
if _query not in self._query_cache[cls]:
return
_id = self._query_cache[cls].pop(_query)
self._query_cache[cls][_query] = _id
if _id not in self._instance_cache[cls]:
return
val = self._instance_cache[cls].pop(_id)
self._instance_cache[cls][_id] = val
def _check_sizes(self, cls):
if self.num_queries(cls) > self._max_queries[cls]:
_id = self._remove_least_recently_used(self._query_cache[cls])
if _id in self._instance_cache[cls]:
instance = self._instance_cache[cls][_id]
self._try_flush(instance, expunge=False)
if self.num_instances(cls) > self._max_instances[cls]:
instance = self._remove_least_recently_used(self._instance_cache[cls])
self._try_flush(instance, expunge=True)
def _try_flush(self, instance, expunge=False):
try:
inst_session = session(instance)
except AttributeError:
inst_session = None
if inst_session:
inst_session.flush(instance)
if expunge:
inst_session.expunge(instance)
def _remove_least_recently_used(self, cache):
        # most-recently-used entries are kept at the end, so the first
        # item popped is the least-recently-used one
key, val = cache.popitem(last=False)
return val
def expire_new_instances(self, cls):
'''
Expire any instances that were "new" or had no _id value.
If a lot of new instances of a class are being created, it's possible
for a query to pull a copy from mongo when a copy keyed by the synthetic
ID is still in the cache, potentially causing de-sync between the copies
leading to one with missing data overwriting the other. Clear new
instances out of the cache relatively frequently (depending on the query
and instance cache sizes) to avoid this.
'''
for _query in self._synthetic_id_queries[cls]:
self._query_cache[cls].pop(_query)
self._synthetic_id_queries[cls] = set()
for _id in self._synthetic_ids[cls]:
instance = self._instance_cache[cls].pop(_id)
self._try_flush(instance, expunge=True)
self._synthetic_ids[cls] = set()
def num_queries(self, cls=None):
if cls is None:
return sum([len(c) for c in self._query_cache.values()])
else:
return len(self._query_cache[cls])
def num_instances(self, cls=None):
if cls is None:
return sum([len(c) for c in self._instance_cache.values()])
else:
return len(self._instance_cache[cls])
def instance_ids(self, cls):
return self._instance_cache[cls].keys()
def batch_load(self, cls, query, attrs=None):
'''
Load multiple results given a query.
Optionally takes a list of attribute names to use
as the cache key. If not given, uses the keys of
the given query.
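        Illustrative usage (commit_ids is hypothetical):
            cache.batch_load(Commit, {'_id': {'$in': commit_ids}})
            # later cache.get(Commit, {'_id': cid}) calls become cache hits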
'''
if attrs is None:
attrs = query.keys()
for result in self._model_query(cls).find(query):
keys = {a: getattr(result, a) for a in attrs}
self.set(cls, keys, result)