tools/cvs2svn/cvs2svn.py - subversion - Git at Google

 #!/usr/bin/env python
 #
 # cvs2svn: ...
 #

 # $LastChangedRevision$

 import rcsparse
 import os
 import sys
 import sha
 import re
 import time
 import fileinput
 import string
 import getopt
 import stat
 import md5
 import shutil
 import anydbm
 import marshal

 # Make sure this Python is recent enough.
 import sys
 if sys.hexversion < 0x2000000:
   sys.stderr.write('Python 2.0 or higher is required; see www.python.org.')
   sys.exit(1)

 # Don't settle for less.
 if anydbm._defaultmod.__name__ == 'dumbdbm':
   print 'ERROR: your installation of Python does not contain a proper'
   print '  DBM module. This script cannot continue.'
   print '  to solve: see http://python.org/doc/current/lib/module-anydbm.html'
   print '  for details.'
   sys.exit(1)

 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')

 SVNADMIN = 'svnadmin'      # Location of the svnadmin binary.
 DATAFILE = 'cvs2svn-data'
 DUMPFILE = 'cvs2svn-dump'  # The "dumpfile" we create to load into the repos

 # Skeleton version of an svn filesystem.
 SVN_REVISIONS_DB = 'cvs2svn-revisions.db'
 NODES_DB = 'cvs2svn-nodes.db'

 # See class SymbolicNameTracker for details.
 SYMBOLIC_NAMES_DB = "cvs2svn-sym-names.db"

 REVS_SUFFIX = '.revs'
 CLEAN_REVS_SUFFIX = '.c-revs'
 SORTED_REVS_SUFFIX = '.s-revs'
 RESYNC_SUFFIX = '.resync'

 SVNROOT = 'svnroot'
 ATTIC = os.sep + 'Attic'

 SVN_INVALID_REVNUM = -1

 COMMIT_THRESHOLD = 5 * 60	# flush a commit if a 5 minute gap occurs

 OP_ADD    = 'A'
 OP_DELETE = 'D'
 OP_CHANGE = 'C'

 DIGEST_END_IDX = 9 + (sha.digestsize * 2)

 verbose = 1


 class CollectData(rcsparse.Sink):
   def __init__(self, cvsroot, log_fname_base):
     self.cvsroot = cvsroot
     self.revs = open(log_fname_base + REVS_SUFFIX, 'w')
     self.resync = open(log_fname_base + RESYNC_SUFFIX, 'w')

   def set_fname(self, fname):
     "Prepare to receive data for a new file."
     self.fname = fname

     # revision -> [timestamp, author, operation, old-timestamp]
     self.rev_data = { }
     self.prev = { }
     self.branch_names = {}
     self.taglist = {}
     self.branchlist = {}

   def set_branch_name(self, revision, name):
     """Record that REVISION is the branch number for BRANCH_NAME.
     REVISION is an RCS branch number with an odd number of components,
     for example '1.7.2' (never '1.7.0.2')."""
     self.branch_names[revision] = name

   def get_branch_name(self, revision):
     """Return the name of the branch whose branch number is REVISION.
     REVISION is an RCS branch number with an odd number of components,
     for example '1.7.2' (never '1.7.0.2')."""
     brev = revision[:revision.rindex(".")]
     if not self.branch_names.has_key(brev):
       return None
     return self.branch_names[brev]

   def add_branch_point(self, revision, branch_name):
     """Record that BRANCH_NAME sprouts from REVISION.
     REVISION is a non-branch revision number with an even number of
     components, for example '1.7' (never '1.7.2' nor '1.7.0.2')."""
     if not self.branchlist.has_key(revision):
       self.branchlist[revision] = []
     self.branchlist[revision].append(branch_name)

   def add_cvs_branch(self, revision, branch_name):
     """Record the root revision and branch revision for BRANCH_NAME,
     based on REVISION.  REVISION is a CVS branch number having an even
     number of components where the second-to-last is '0'.  For
     example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
     from 1.7 and has branch number 1.7.2."""
     last_dot = revision.rfind(".")
     branch_rev = revision[:last_dot]
     last2_dot = branch_rev.rfind(".")
     branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
     self.set_branch_name(branch_rev, branch_name)
     self.add_branch_point(branch_rev[:last2_dot], branch_name)

   def get_tags(self, revision):
     """Return a list of all tag names attached to REVISION.
     REVISION is a regular revision number like '1.7', and the result
     never includes branch names, only plain tags."""
     if self.taglist.has_key(revision):
       return self.taglist[revision]
     else:
       return []

   def get_branches(self, revision):
     """Return a list of all branch names that sprout from REVISION.
     REVISION is a regular revision number like '1.7'."""
     if self.branchlist.has_key(revision):
       return self.branchlist[revision]
     else:
       return []

   def define_tag(self, name, revision):
     """Record a bidirectional mapping between symbolic NAME and REVISION
     REVISION is an unprocessed revision number from the RCS file's
     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
     This function will determine what kind of symbolic name it is by
     inspection, and record it in the right places."""
     if branch_tag.match(revision):
       self.add_cvs_branch(revision, name)
     elif vendor_tag.match(revision):
       self.set_branch_name(revision, name)
       self.add_branch_point(revision[:revision.rfind(".")], name)
     else:
       if not self.taglist.has_key(revision):
         self.taglist[revision] = []
       self.taglist[revision].append(name)

   def define_revision(self, revision, timestamp, author, state,
                       branches, next):
     ### what else?
     if state == 'dead':
       op = OP_DELETE
     else:
       op = OP_CHANGE

     # store the rev_data as a list in case we have to jigger the timestamp
     # print "KFF: revision %s of '%s'" % (revision, self.fname)
     self.rev_data[revision] = [int(timestamp), author, op, None]

     # record the previous revision for sanity checking later
     if trunk_rev.match(revision):
       self.prev[revision] = next
     elif next:
       self.prev[next] = revision
     for b in branches:
       self.prev[b] = revision

   def tree_completed(self):
     "The revision tree has been parsed. Analyze it for consistency."

     # Our algorithm depends upon the timestamps on the revisions occuring
     # monotonically over time. That is, we want to see rev 1.34 occur in
     # time before rev 1.35. If we inserted 1.35 *first* (due to the time-
     # sorting), and then tried to insert 1.34, we'd be screwed.

     # to perform the analysis, we'll simply visit all of the 'previous'
     # links that we have recorded and validate that the timestamp on the
     # previous revision is before the specified revision

     # if we have to resync some nodes, then we restart the scan. just keep
     # looping as long as we need to restart.
     while 1:
       for current, prev in self.prev.items():
         if not prev:
           # no previous revision exists (i.e. the initial revision)
           continue
         t_c = self.rev_data[current][0]
         t_p = self.rev_data[prev][0]
         if t_p >= t_c:
           # the previous revision occurred later than the current revision.
           # shove the previous revision back in time (and any before it that
           # may need to shift).
           while t_p >= t_c:
             self.rev_data[prev][0] = t_c - 1	# new timestamp
             self.rev_data[prev][3] = t_p	# old timestamp

             print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
                   % (relative_name(self.cvsroot, self.fname),
                      prev, time.ctime(t_p), time.ctime(t_c - 1))

             current = prev
             prev = self.prev[current]
             if not prev:
               break
             t_c = t_c - 1		# self.rev_data[current][0]
             t_p = self.rev_data[prev][0]

           # break from the for-loop
           break
       else:
         # finished the for-loop (no resyncing was performed)
         return

   def set_revision_info(self, revision, log, text):
     # kff fooo
     # if revision == "1.1" and self.rev_data.has_key("1.1.1.1"):
     #   return
     # print "KFF: writing %s of '%s'" % (revision, self.fname)
     timestamp, author, op, old_ts = self.rev_data[revision]
     digest = sha.new(log + '\0' + author).hexdigest()
     if old_ts:
       # the timestamp on this revision was changed. log it for later
       # resynchronization of other files's revisions that occurred
       # for this time and log message.
       self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))

     branch_name = self.get_branch_name(revision)

     write_revs_line(self.revs, timestamp, digest, op, revision, self.fname,
                     branch_name, self.get_tags(revision),
                     self.get_branches(revision))


 def make_path(ctx, path, branch_name = None, tag_name = None):
   """Return the trunk path, branch path, or tag path for PATH.
   CTX holds the name of the branches or tags directory, which is found
   under PATH's first component.

   It is an error to pass both a BRANCH_NAME and a TAG_NAME."""

   # For a while, we treated each top-level subdir of the CVS
   # repository as a "project root" and interpolated the appropriate
   # genealogy (trunk|tag|branch) in according to the official
   # recommended layout.  For example, the path '/foo/bar/baz.c' on
   # branch 'Rel2' would become
   #
   #   /foo/branches/Rel2/bar/baz.c
   #
   # and on trunk it would become
   #
   #   /foo/trunk/bar/baz.c
   #
   # However, we went back to the older and simpler method of just
   # prepending the genealogy to the front, instead of interpolating.
   # So now we produce:
   #
   #   /branches/Rel2/foo/bar/baz.c
   #   /trunk/foo/bar/baz.c
   #
   # Why?  Well, Jack Repenning pointed out that this way is much
   # friendlier to "anonymously rooted subtrees" (that's a tree where
   # the name of the top level dir doesn't matter, the point is that if
   # you cd into it and, say, run 'make', something good will happen).
   # By interpolating, we made it impossible to point cvs2svn at some
   # subdir in the CVS repository and convert it as a project, because
   # we'd treat every subdir underneath it as an independent project
   # root, which is probably not what the user wanted.
   #
   # Also, see Blair Zajac's post
   #
   #    http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
   #
   # and the surrounding thread, for why what people really want is a
   # way of specifying an in-repository prefix path, not interpolation.

   if branch_name and tag_name:
     sys.stderr.write('make_path() miscalled, both branch and tag given')
     sys.exit(1)

   if branch_name:
     return ctx.branches_base + '/' + branch_name + '/' + path
   elif tag_name:
     return ctx.tags_base + '/' + tag_name + '/' + path
   else:
     return ctx.trunk_base + '/' + path


 def relative_name(cvsroot, fname):
   l = len(cvsroot)
   if fname[:l] == cvsroot:
     if fname[l] == '/':
       return fname[l+1:]
     return fname[l:]
   return l


 def visit_file(arg, dirname, files):
   cd, p, stats = arg
   for fname in files:
     if fname[-2:] != ',v':
       continue
     pathname = os.path.join(dirname, fname)
     if dirname[-6:] == ATTIC:
       # drop the 'Attic' portion from the pathname
       ### we should record this so we can easily insert it back in
       cd.set_fname(os.path.join(dirname[:-6], fname))
     else:
       cd.set_fname(pathname)
     if verbose:
       print pathname
     p.parse(open(pathname), cd)
     stats[0] = stats[0] + 1


 class RevInfoParser(rcsparse.Sink):
   def __init__(self):
     self.authors = { }	# revision -> author
     self.logs = { }	# revision -> log message

   def define_revision(self, revision, timestamp, author, state,
                       branches, next):
     self.authors[revision] = author

   def set_revision_info(self, revision, log, text):
     self.logs[revision] = log

   def parse_cvs_file(self, rcs_pathname):
     try:
       rcsfile = open(rcs_pathname, 'r')
     except:
       try:
         dirname, fname = os.path.split(rcs_pathname)
         rcs_pathname = os.path.join(dirname, "Attic", fname)
         rcsfile = open(rcs_pathname, 'r')
       except:
         ### should use a better error
         raise RuntimeError, ('error: %s appeared to be under CVS control, '
                              'but the RCS file is inaccessible.'
                              % rcs_pathname)

     rcsparse.Parser().parse(rcsfile, self)


 # Return a string that has not been returned by gen_key() before.
 gen_key_base = 0L
 def gen_key():
   global gen_key_base
   key = '%x' % gen_key_base
   gen_key_base = gen_key_base + 1
   return key


 class RepositoryMirror:
   def __init__(self):
     self.revs_db_file = SVN_REVISIONS_DB
     self.revs_db = anydbm.open(self.revs_db_file, 'n')
     self.nodes_db_file = NODES_DB
     self.nodes_db = anydbm.open(self.nodes_db_file, 'n')

     # These keys could never be real directory entries.
     self.mutable_flag = "/mutable"
     self.symbolic_names = "/sym_names"
     # This could represent a new mutable directory or file.
     self.empty_mutable_thang = { self.mutable_flag : 1 }

     # Init a root directory with no entries at revision 0.
     self.youngest = 0
     self.revs_db[str(self.youngest)] = gen_key()
     self.nodes_db[self.revs_db[str(self.youngest)]] = marshal.dumps({})

   def new_revision(self):
     """Stabilize the current revision, then start the next one.
     (Increments youngest.)"""
     self.stabilize_youngest()
     self.revs_db[str(self.youngest + 1)] \
                                       = self.revs_db[str(self.youngest)]
     self.youngest = self.youngest + 1

   def _stabilize_directory(self, key):
     """Close the directory whose node key is KEY."""
     dir = marshal.loads(self.nodes_db[key])
     if dir.has_key(self.mutable_flag):
       del dir[self.mutable_flag]
       for entry_key in dir.keys():
         if not entry_key[0] == '/':
           self._stabilize_directory(dir[entry_key])
       self.nodes_db[key] = marshal.dumps(dir)

   def stabilize_youngest(self):
     """Stabilize the current revision by removing mutable flags."""
     root_key = self.revs_db[str(self.youngest)]
     self._stabilize_directory(root_key)

   def probe_path(self, path, revision=-1, debugging=None):
     """If PATH exists in REVISION of the svn repository mirror,
     return its leaf value, else return None.
     If DEBUGGING is true, then print trace output to stdout.
     REVISION defaults to youngest, and PATH must not start with '/'."""
     components = string.split(path, '/')
     if revision == -1:
       revision = self.youngest

     if debugging: print "PROBING path: '%s' in %d" % (path, revision)

     parent_key = self.revs_db[str(revision)]
     parent = marshal.loads(self.nodes_db[parent_key])
     previous_component = "/"

     i = 1
     for component in components:

       if debugging:
         for n in range(i): print "  ",
         print "'%s' key: %s, val:" % (previous_component, parent_key), parent

       if not parent.has_key(component):
         if debugging:
           print "  PROBE ABANDONED: '%s' does not contain '%s'" \
                 % (previous_component, component)
         return None

       this_entry_key = parent[component]
       this_entry_val = marshal.loads(self.nodes_db[this_entry_key])
       parent_key = this_entry_key
       parent = this_entry_val
       previous_component = component
       i = i + 1

     if debugging:
       for n in range(i): print "  ",
       print "parent_key: %s, val:" % parent_key, parent

     # It's not actually a parent at this point, it's the leaf node.
     return parent

   def change_path(self, path, tags, branches, intermediate_dir_func=None):
     """Record a change to PATH.  PATH may not have a leading slash.

     Return a tuple (op, (closed_names)), where op is 'A' if the
     path was added or 'C' if it already existed, and (closed_names) is
     a tuple of symbolic names closed off by this change -- that is,
     tags or branches which could be rooted in the previous revision of
     PATH, but not in this revision, because this rev changes PATH.

     TAGS are any tags that sprout from this revision of PATH, BRANCHES
     are any branches that sprout from this revision of PATH.

     If INTERMEDIATE_DIR_FUNC is not None, then invoke it once on
     each full path to each missing intermediate directory in PATH, in
     order from shortest to longest."""

     components = string.split(path, '/')
     path_so_far = None

     # print "KFF change_path: '%s'" % path
     # print "    revision:  '%d'" % self.youngest
     # print "    tags:      ", tags
     # print "    branches:  ", branches

     parent_key = self.revs_db[str(self.youngest)]
     parent = marshal.loads(self.nodes_db[parent_key])
     if not parent.has_key(self.mutable_flag):
       parent_key = gen_key()
       parent[self.mutable_flag] = 1
       self.nodes_db[parent_key] = marshal.dumps(parent)
       self.revs_db[str(self.youngest)] = parent_key

     for component in components[:-1]:
       # parent is always mutable at the top of the loop

       if path_so_far: path_so_far = path_so_far + '/' + component
       else:           path_so_far = component

       # Ensure that the parent has an entry for this component.
       if not parent.has_key(component):
         new_child_key = gen_key()
         parent[component] = new_child_key
         self.nodes_db[new_child_key] = marshal.dumps(self.empty_mutable_thang)
         self.nodes_db[parent_key] = marshal.dumps(parent)
         if intermediate_dir_func:
           intermediate_dir_func(path_so_far)

       # One way or another, parent dir now has an entry for component,
       # so grab it, see if it's mutable, and DTRT if it's not.  (Note
       # it's important to reread the entry value from the db, even
       # though we might have just written it -- if we tweak existing
       # data structures, we could modify self.empty_mutable_thang,
       # which must not happen.)
       this_entry_key = parent[component]
       this_entry_val = marshal.loads(self.nodes_db[this_entry_key])
       if not this_entry_val.has_key(self.mutable_flag):
         this_entry_val[self.mutable_flag] = 1
         this_entry_key = gen_key()
         parent[component] = this_entry_key
         self.nodes_db[this_entry_key] = marshal.dumps(this_entry_val)
         self.nodes_db[parent_key] = marshal.dumps(parent)

       parent_key = this_entry_key
       parent = this_entry_val

     # Now change the last node, the versioned file.  Just like at the
     # top of the above loop, parent is already mutable.
     op = OP_ADD
     old_names = ()
     last_component = components[-1]
     if parent.has_key(last_component):
       # Sanity check.
       child = marshal.loads(self.nodes_db[parent[last_component]])
       if child.has_key(self.mutable_flag):
         sys.stderr.write("'%s' has already been changed in revision %d;\n" \
                          "can't change it again in the same revision."     \
                          % (path, self.youngest))
         sys.exit(1)
       # Okay, passed the sanity check.
       op = OP_CHANGE
       old_names = child[self.symbolic_names]

     leaf_key = gen_key()
     parent[last_component] = leaf_key
     self.nodes_db[parent_key] = marshal.dumps(parent)
     new_names = tuple(tags + branches)
     new_val = { self.symbolic_names : new_names,
                 self.mutable_flag   : 1 }
     s = marshal.dumps(new_val)
     self.nodes_db[leaf_key] = marshal.dumps(new_val)
     return (op, old_names)

   def delete_path(self, path, tags, branches, prune=None):
     """Delete PATH from the tree.  PATH may not have a leading slash.

     Return a tuple (path_deleted, (closed_names)), where path_deleted
     is the path actually deleted or None if PATH did not exist, and
     (closed_names) is a tuple of symbolic names closed off by this
     deletion -- that is, tags or branches which could be rooted in the
     previous revision of PATH, but not in this revision, because this
     rev changes PATH.  If path_deleted is None, then closed_names will
     be empty.

     TAGS are any tags that sprout from this revision of PATH, BRANCHES
     are any branches that sprout from this revision of PATH.  (I can't
     imagine that there are any of either, what to do if there are?)

     If PRUNE is not None, then delete the highest possible directory,
     which means the returned path may differ from PATH.  In other
     words, if PATH was the last entry in its parent, then delete
     PATH's parent, unless it too is the last entry in *its* parent, in
     which case delete that parent, and and so on up the chain, until a
     directory is encountered that has an entry which is not a member
     of the parent stack of the original target.

     PRUNE is like the -P option to 'cvs checkout'."""

     components = string.split(path, '/')
     path_so_far = None

     # print "KFF change_path: '%s'" % path
     # print "    revision:  '%d'" % self.youngest
     # print "    tags:      ", tags
     # print "    branches:  ", branches

     # Start out assuming that we will delete it.  The for-loop may
     # change this to None, if it turns out we can't even reach the
     # path (i.e., it is already deleted).
     retval = path

     parent_key = self.revs_db[str(self.youngest)]
     parent = marshal.loads(self.nodes_db[parent_key])

     # As we walk down to find the dest, we remember each parent
     # directory's name and db key, in reverse order: push each new key
     # onto the front of the list, so that by the time we reach the
     # destination node, the zeroth item in the list is the parent of
     # that destination.
     #
     # Then if we actually do the deletion, we walk the list from left
     # to right, replacing as appropriate.
     #
     # The root directory has name None.
     parent_chain = [ ]
     parent_chain.insert(0, (None, parent_key))

     def is_prunable(dir):
       """Return true if DIR, a dictionary representing a directory,
       has just zero or one non-special entry, else return false.
       (In a pure world, we'd just ask len(DIR) > 1; it's only
       because the directory might have mutable flags and other special
       entries that we need this function at all.)"""
       num_items = len(dir)
       if num_items > 3:
         return None
       if num_items == 3 or num_items == 2:
         real_entries = 0
         for key in dir.keys():
           if not key[0] == '/': real_entries = real_entries + 1
         if real_entries > 1: return None
         else:                return 1
       else:
         return 1

     for component in components[:-1]:
       # parent is always mutable at the top of the loop

       if path_so_far: path_so_far = path_so_far + '/' + component
       else:           path_so_far = component

       # If we can't reach the dest, then we don't need to do anything.
       if not parent.has_key(component):
         return (None, ())

       # Otherwise continue downward, dropping breadcrumbs.
       this_entry_key = parent[component]
       this_entry_val = marshal.loads(self.nodes_db[this_entry_key])
       parent_key = this_entry_key
       parent = this_entry_val
       parent_chain.insert(0, (component, parent_key))

     # If the target is not present in its parent, then we're done.
     last_component = components[-1]
     old_names = ()
     if not parent.has_key(last_component):
       return (None, ())
     else:
       child = marshal.loads(self.nodes_db[parent[last_component]])
       old_names = child[self.symbolic_names]

     # The target is present, so remove it and bubble up, making a new
     # mutable path and/or pruning as necessary.
     pruned_count = 0
     prev_entry_name = last_component
     new_key = None
     for parent_item in parent_chain:
       pkey = parent_item[1]
       pval = marshal.loads(self.nodes_db[pkey])
       if prune:
         if (new_key == None) and is_prunable(pval):
           pruned_count = pruned_count + 1
           pass
           # Do nothing more.  All the action takes place when we hit a
           # non-prunable parent.
         else:
           # We hit a non-prunable, so bubble up the new gospel.
           pval[self.mutable_flag] = 1
           if new_key == None:
             del pval[prev_entry_name]
           else:
             pval[prev_entry_name] = new_key
           new_key = gen_key()
       else:
         pval[self.mutable_flag] = 1
         if new_key:
           pval[prev_entry_name] = new_key
         else:
           del pval[prev_entry_name]
         new_key = gen_key()

       prev_entry_name = parent_item[0]
       if new_key:
         self.nodes_db[new_key] = marshal.dumps(pval)

     if new_key == None:
       new_key = gen_key()
       self.nodes_db[new_key] = marshal.dumps(self.empty_mutable_thang)

     # Install the new root entry.
     self.revs_db[str(self.youngest)] = new_key

     if pruned_count > len(components):
       sys.stderr.write("Error: deleting '%s' tried to prune %d components."
                        % (path, pruned_count))
       exit(1)

     if pruned_count:
       if pruned_count == len(components):
         # We never prune away the root directory, so back up one component.
         pruned_count = pruned_count - 1
       retpath = string.join(components[:0 - pruned_count], '/')
       return (retpath, old_names)
     else:
       return (path, old_names)

     ### We've no place to put tags + branches.  Suspect we just
     ### shouldn't be taking them as arguments, which the doc string
     ### implies already.  Ponder.

   def close(self):
     # Just stabilize the last revision.  This may or may not affect
     # anything, but if we end up using the mirror for anything after
     # this, it's nice to know the '/mutable' entries are gone.
     self.stabilize_youngest()


 class Dumper:
   def __init__(self, dumpfile_path):
     'Open DUMPFILE_PATH, and initialize revision to REVISION.'
     self.dumpfile_path = dumpfile_path
     self.revision = 0
     self.dumpfile = open(dumpfile_path, 'wb')
     self.repos_mirror = RepositoryMirror()

     # Initialize the dumpfile with the standard headers:
     #
     # The CVS repository doesn't have a UUID, and the Subversion
     # repository will be created with one anyway.  So when we load
     # the dumpfile, we'll tell svnadmin to ignore the UUID below.
     self.dumpfile.write('SVN-fs-dump-format-version: 2\n'
                         '\n'
                         'UUID: ????????-????-????-????-????????????\n'
                         '\n')

   def start_revision(self, props):
     """Write the next revision, with properties, to the dumpfile.
     Return the newly started revision."""

     self.revision = self.revision + 1

     # A revision typically looks like this:
     #
     #   Revision-number: 1
     #   Prop-content-length: 129
     #   Content-length: 129
     #
     #   K 7
     #   svn:log
     #   V 27
     #   Log message for revision 1.
     #   K 10
     #   svn:author
     #   V 7
     #   jrandom
     #   K 8
     #   svn:date
     #   V 27
     #   2003-04-22T22:57:58.132837Z
     #   PROPS-END
     #
     # Notice that the length headers count everything -- not just the
     # length of the data but also the lengths of the lengths, including
     # the 'K ' or 'V ' prefixes.
     #
     # The reason there are both Prop-content-length and Content-length
     # is that the former includes just props, while the latter includes
     # everything.  That's the generic header form for any entity in a
     # dumpfile.  But since revisions only have props, the two lengths
     # are always the same for revisions.

     # Calculate the total length of the props section.
     total_len = 10  # len('PROPS-END\n')
     for propname in props.keys():
       klen = len(propname)
       klen_len = len('K %d' % klen)
       vlen = len(props[propname])
       vlen_len = len('V %d' % vlen)
       # + 4 for the four newlines within a given property's section
       total_len = total_len + klen + klen_len + vlen + vlen_len + 4

     # Print the revision header and props
     self.dumpfile.write('Revision-number: %d\n'
                         'Prop-content-length: %d\n'
                         'Content-length: %d\n'
                         '\n'
                         % (self.revision, total_len, total_len))

     for propname in props.keys():
       self.dumpfile.write('K %d\n'
                           '%s\n'
                           'V %d\n'
                           '%s\n' % (len(propname),
                                     propname,
                                     len(props[propname]),
                                     props[propname]))

     self.dumpfile.write('PROPS-END\n')
     self.dumpfile.write('\n')

     self.repos_mirror.new_revision()
     return self.revision

   def add_dir(self, path):
     self.dumpfile.write("Node-path: %s\n"
                         "Node-kind: dir\n"
                         "Node-action: add\n"
                         "Prop-content-length: 10\n"
                         "Content-length: 10\n"
                         "\n"
                         "PROPS-END\n"
                         "\n"
                         "\n" % path)

   def probe_path(self, path):
     """Return true if PATH exists in the youngest tree of the svn
     repository, else return None.  PATH does not start with '/'."""
     if self.repos_mirror.probe_path(path) == None: return None
     else:                                          return 1

   def copy_path(self, svn_src_path, svn_src_rev, svn_dst_path):
     # We don't need to include "Node-kind:" for copies; the loader
     # ignores it anyway and just uses the source kind instead.
     self.dumpfile.write('Node-path: %s\n'
                         'Node-action: add\n'
                         'Node-copyfrom-rev: %d\n'
                         'Node-copyfrom-path: /%s\n'
                         '\n'
                         % (svn_dst_path, svn_src_rev, svn_src_path))

   def add_or_change_path(self, cvs_path, svn_path, cvs_rev, rcs_file,
                          tags, branches):

     # figure out the real file path for "co"
     try:
       f_st = os.stat(rcs_file)
     except os.error:
       dirname, fname = os.path.split(rcs_file)
       rcs_file = os.path.join(dirname, 'Attic', fname)
       f_st = os.stat(rcs_file)

     if f_st[0] & stat.S_IXUSR:
       is_executable = 1
       # "K 14\n" + "svn:executable\n" + "V 1\n" + "*\n" + "PROPS-END\n"
       props_len = 36
     else:
       is_executable = 0
       # just "PROPS-END\n"
       props_len = 10

     ### FIXME: We ought to notice the -kb flag set on the RCS file and
     ### use it to set svn:mime-type.

     basename = os.path.basename(rcs_file[:-2])
     pipe = os.popen('co -q -p%s \'%s\'' % (cvs_rev, rcs_file), 'r', 102400)

     # You might think we could just test
     #
     #   if cvs_rev[-2:] == '.1':
     #
     # to determine if this path exists in head yet.  But that wouldn't
     # be perfectly reliable, both because of 'cvs commit -r', and also
     # the possibility of file resurrection.
     op, closed_names = self.repos_mirror.change_path(svn_path,
                                                      tags, branches,
                                                      self.add_dir)
     if op == OP_ADD:
       action = 'add'
     else:
       action = 'change'

     self.dumpfile.write('Node-path: %s\n'
                         'Node-kind: file\n'
                         'Node-action: %s\n'
                         'Prop-content-length: %d\n'
                         'Text-content-length: '
                         % (svn_path, action, props_len))

     pos = self.dumpfile.tell()

     self.dumpfile.write('0000000000000000\n'
                         'Text-content-md5: 00000000000000000000000000000000\n'
                         'Content-length: 0000000000000000\n'
                         '\n')

     if is_executable:
       self.dumpfile.write('K 14\n'
                           'svn:executable\n'
                           'V 1\n'
                           '*\n')

     self.dumpfile.write('PROPS-END\n')

     # Insert the rev contents, calculating length and checksum as we go.
     checksum = md5.new()
     length = 0
     buf = pipe.read()
     while buf:
       checksum.update(buf)
       length = length + len(buf)
       self.dumpfile.write(buf)
       buf = pipe.read()
     pipe.close()

     # Go back to patch up the length and checksum headers:
     self.dumpfile.seek(pos, 0)
     # We left 16 zeros for the text length; replace them with the real
     # length, padded on the left with spaces:
     self.dumpfile.write('%16d' % length)
     # 16... + 1 newline + len('Text-content-md5: ') == 35
     self.dumpfile.seek(pos + 35, 0)
     self.dumpfile.write(checksum.hexdigest())
     # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
     self.dumpfile.seek(pos + 84, 0)
     # The content length is the length of property data, text data,
     # and any metadata around/inside around them.
     self.dumpfile.write('%16d' % (length + props_len))
     # Jump back to the end of the stream
     self.dumpfile.seek(0, 2)

     # This record is done.
     self.dumpfile.write('\n')
     return closed_names

   def delete_path(self, svn_path, tags, branches, prune=None):
     """If SVN_PATH exists in the head mirror, output the deletion to
     the dumpfile, else output nothing to the dumpfile.

     Return a tuple (path_deleted, (closed_names)), where path_deleted
     is the path deleted if any or None if no deletion was necessary,
     and (closed_names) is a tuple of symbolic names closed off by this
     deletion -- that is, tags or branches which could be rooted in the
     previous revision of PATH, but not in this revision, because this
     rev changes PATH.  If path_deleted is None, then closed_names will
     be empty.

     Iff PRUNE is true, then the path deleted can be not None, yet
     shorter than SVN_PATH because of pruning."""
     deleted_path, closed_names = self.repos_mirror.delete_path(svn_path,
                                                                tags, branches,
                                                                prune)
     if deleted_path:
       print '    (deleted %s)' % deleted_path
       self.dumpfile.write('Node-path: %s\n'
                           'Node-action: delete\n'
                           '\n' % deleted_path)
     return (deleted_path, closed_names)

   def close(self):
     self.repos_mirror.close()
     self.dumpfile.close()


 def format_date(date):
   """Return an svn-compatible date string for DATE (seconds since epoch)."""
   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))


 class SymbolicNameTracker:
   """Track the Subversion path/revision ranges of CVS symbolic names.
   This is done in a .db file, representing a tree in the usual way.
   In addition to directory entries, each object in the database stores
   the earliest revision from which it could be copied, and the first
   revision from which it could no longer be copied.  Intermediate
   directories go one step farther: they record counts for the various
   revisions from which items under them could have been copied, and
   counts for the cutoff revisions.  For example:

                                .----------.
                                |  sub1    | [(2, 1), (3, 3)]
                                |  /       | [(5, 1), (17, 2), (50, 1)]
                                | /        |
                                |/ sub2    |
                                /    \     |
                               /|_____\____|
                              /        \
                       ______/          \_________
                      /                           \
                     /                             \
                    /                               \
               .---------.                     .---------.
               |  file1  |                     |  file3  |
               |   /     | [(3, 2)]            |     \   | [(2, 1), (3, 1)]
               |  /      | [(17, 1), (50, 1)]  |      \  | [(5, 1), (10, 1)]
               | /       |                     |       \ |
               |/ file2  |                     |  file4 \|
               /    \    |                     |    /    \
              /|_____\___|                     |___/_____|\
             /        \                           /        \
            /          \                         /          \
           /            \                       /            \
          /              +                     /              +
     +======+            |                 +======+           |
     |      | [(3, 1)]   |                 |      | [(2, 1)]  |
     |      | [(17, 1)]  |                 |      | [(5, 1)]  |
     |      |            |                 |      |           |
     +======+            |                 +======+           |
                     +======+                             +======+
                     |      | [(3, 1)]                    |      | [(3, 1)]
                     |      | [(50, 1)]                   |      | [(17, 1)]
                     |      |                             |      |
                     +======+                             +======+

   The two lists to the right of each node represent the 'opening' and
   'closing' revisions respectively.  Each tuple in a list is of the
   form (REV, COUNT).  For leaf nodes, COUNT is always 1, of course.
   For intermediate nodes, the counts are the sums of the corresponding
   counts of child nodes.

   These revision scores are used to determine the optimal copy
   revisions for each tree/subtree at branch or tag creation time.

   The svn path input will most often be a trunk path, because the
   path/rev information recorded here is about where and when the given
   symbolic name could be rooted, *not* a path/rev for which commits
   along that symbolic name take place (of course, commits only happen on
   branches anyway)."""

   def __init__(self):
     self.db_file = SYMBOLIC_NAMES_DB
     self.db = anydbm.open(self.db_file, 'n')
     self.root_key = gen_key()
     self.db[self.root_key] = marshal.dumps({})
     # The keys for the opening and closing revision lists attached to
     # each directory or file.  Includes "/" so as never to conflict
     # with any real entry.
     self.opening_revs_key = "/opening"
     self.closing_revs_key = "/closing"
     self.copyfrom_rev_key = "/copyfrom-rev"

   def probe_path(self, symbolic_name, path, debugging=None):
     """If 'SYMBOLIC_NAME/PATH' exists in the symbolic name tree,
     return the value of its last component, else return None.
     PATH may be None, but may not start with '/'.
     If DEBUGGING is true, then print trace output to stdout."""
     if path:
       components = [symbolic_name] + string.split(path, '/')
     else:
       components = [symbolic_name]

     if debugging: print "PROBING SYMBOLIC NAME:\n", components

     parent_key = self.root_key
     parent = marshal.loads(self.db[parent_key])
     last_component = "/"
     i = 1
     for component in components:
       if debugging:
         for n in range(i): print "  ",
         print "'%s' key: %s, val:" % (last_component, parent_key), parent

       if not parent.has_key(component):
         sys.stderr.write("SYM PROBE FAILED: '%s' does not contain '%s'\n" \
                          % (last_component, component))
         sys.exit(1)

       this_entry_key = parent[component]
       this_entry_val = marshal.loads(self.db[this_entry_key])
       parent_key = this_entry_key
       parent = this_entry_val
       last_component = component
       i = i + 1

     if debugging:
       for n in range(i): print "  ",
       print "parent_key: %s, val:" % parent_key, parent

     # It's not actually a parent at this point, it's the leaf node.
     return parent

   def bump_rev_count(self, item_key, rev, revlist_key):
     """Increment REV's count in opening or closing list under KEY.
     REVLIST_KEY is self.opening_revs_key or self.closing_revs_key, and
     indicates which rev list to increment REV's count in.

     For example, if REV is 7, REVLIST_KEY is self.opening_revs_key,
     and the entry's opening revs list looks like this

          [(2, 5), (7, 2), (10, 15)]

     then afterwards it would look like this:

          [(2, 5), (7, 3), (10, 15)]

     But if no tuple for revision 7 were present, then one would be
     added, for example

          [(2, 5), (10, 15)]

     would become

          [(2, 5), (7, 1), (10, 15)]

     The list is sorted by ascending revision both before and after."""

     entry_val = marshal.loads(self.db[item_key])

     if not entry_val.has_key(revlist_key):
       entry_val[revlist_key] = [(rev, 1)]
     else:
       rev_counts = entry_val[revlist_key]
       done = None
       for i in range(len(rev_counts)):
         this_rev, this_count = rev_counts[i]
         if rev == this_rev:
           rev_counts[i] = (this_rev, this_count + 1)
           done = 1
           break
         elif this_rev > rev:
           if i > 0: i = i - 1
           rev_counts.insert(i, (rev, 1))
           done = 1
           break
       if not done: rev_counts.append((rev, 1))
       entry_val[revlist_key] = rev_counts

     self.db[item_key] = marshal.dumps(entry_val)

   # The verb form of "root" is "root", but that would be misleading in
   # this case; and the opposite of "uproot" is presumably "downroot",
   # but that wouldn't exactly clarify either.  Hence, "enroot" :-).
   def enroot_names(self, svn_path, svn_rev, tags, branches):
     """Record SVN_PATH at SVN_REV as the earliest point from which the
     symbolic names in TAGS and BRANCHES could be copied.  SVN_PATH
     does not start with '/'."""
     if not (tags or branches): return  # early out
     for name in tags + branches:
       components = [name] + string.split(svn_path, '/')
       # print "KFF enrooting ('%s') " % name, components

       parent_key = self.root_key
       for component in components:
         self.bump_rev_count(parent_key, svn_rev, self.opening_revs_key)
         parent = marshal.loads(self.db[parent_key])
         if not parent.has_key(component):
           new_child_key = gen_key()
           parent[component] = new_child_key
           self.db[new_child_key] = marshal.dumps({})
           self.db[parent_key] = marshal.dumps(parent)
         # One way or another, parent now has an entry for component.
         this_entry_key = parent[component]
         this_entry_val = marshal.loads(self.db[this_entry_key])
         # Swaparoo.
         parent_key = this_entry_key
         parent = this_entry_val

       self.bump_rev_count(parent_key, svn_rev, self.opening_revs_key)

   def close_names(self, svn_path, svn_rev, names):
     """Record that as of SVN_REV, SVN_PATH could no longer be the
     source from which any of symbolic names in NAMES could be copied.
     SVN_PATH does not start with '/'."""
     if not names: return  # early_out
     for name in names:
       components = [name] + string.split(svn_path, '/')
       parent_key = self.root_key
       for component in components:
         self.bump_rev_count(parent_key, svn_rev, self.closing_revs_key)
         parent = marshal.loads(self.db[parent_key])
         if not parent.has_key(component):
           sys.stderr.write("In path '%s', value for parent key '%s' "
                            "does not have entry '%s'\n" \
                            % (svn_path, parent_key, component))
           sys.exit(1)
         this_entry_key = parent[component]
         this_entry_val = marshal.loads(self.db[this_entry_key])
         # Swaparoo.
         parent_key = this_entry_key
         parent = this_entry_val

       self.bump_rev_count(parent_key, svn_rev, self.closing_revs_key)

   def score_revisions(self, openings, closings):
     """Return a list of revisions and scores based on OPENINGS and
     CLOSINGS.  The returned list looks like:

        [(REV1 SCORE1), (REV2 SCORE2), ...]

     where REV2 > REV1 and all scores are > 0.  OPENINGS and CLOSINGS
     are the values of self.opening_revs_key and self.closing_revs_key
     from some file or directory node, or None.

     Each score indicates that copying the corresponding revision of
     the object in question would yield that many correct paths at or
     underneath the object.  There may be other paths underneath it
     which are not correct and need to be deleted or recopied; those
     can only be detected by descending and examining their scores.

     If OPENINGS is false, return the empty list, else if CLOSINGS is
     false, return OPENINGS."""

     # First look for easy outs.
     if not openings: return []
     if not closings: return openings

     # No easy out, so wish for lexical closures and calculate the scores :-).
     scores = []
     opening_score_accum = 0
     for i in range(len(openings)):
       pair = openings[i]
       opening_score_accum = opening_score_accum + pair[1]
       scores.append((pair[0], opening_score_accum))
     min = 0
     for i in range(len(closings)):
       closing_rev   = closings[i][0]
       closing_score = closings[i][1]
       for j in range(min, len(scores)):
         opening_pair = scores[j]
         if closing_rev <= opening_pair[0]:
           scores[j] = (opening_pair[0], opening_pair[1] - closing_score)
         else:
           min = j + 1
     return scores

   def best_rev(self, scores):
     """Return the revision with the highest score from SCORES, a list
     returned by score_revisions()."""
     max_score = 0
     rev = SVN_INVALID_REVNUM
     for pair in scores:
       if pair[1] > max_score:
         max_score = pair[1]
         rev = pair[0]
     return rev

   def fill_branch(self, dumper, ctx, branch, svn_rev, svn_path):
     """Use DUMPER to create all currently available parts of BRANCH
     that have not been created already, and make sure that SVN_REV of
     SVN_PATH is in the branch afterwards."""
     parent_key = self.root_key
     parent = marshal.loads(self.db[parent_key])

     if not parent.has_key(branch):
       sys.stderr.write("No origin records for branch '%s'." % branch)
       sys.exit(1)

     def copy_descend(dumper, ctx, branch, parent, entry_name,
                      path_so_far, parent_rev):
       key = parent[entry_name]
       val = marshal.loads(self.db[key])
       scores = self.score_revisions(val.get(self.opening_revs_key),
                                     val.get(self.closing_revs_key))
       rev = self.best_rev(scores)
       if ((rev != parent_rev) and not val.has_key(self.copyfrom_rev_key)):
         parent_rev = rev
         # print "KFF parent:", parent
         # print "KFF entry_name: '%s'" % entry_name
         # print "KFF openings:", val.get(self.opening_revs_key)
         # print "KFF closings:", val.get(self.closing_revs_key)
         # print "KFF scores:", scores
         # print "KFF copyfrom-path: '%s'" % path_so_far
         # print "KFF copyfrom-rev: '%s'" % parent_rev
         ### FIXME: todo -- working here
         ### This path is all wrong, of course.  We need to get
         ### the source and dest to start at the same "level" in the
         ### copy descent, so first we need to fix path generation for
         ### projects.
         dst_path = make_path(ctx, path_so_far, branch)
         # print "KFF dst_path: '%s'" % dst_path
         dumper.copy_path(path_so_far, parent_rev, dst_path)
         # Record that this copy is done.
         val[self.copyfrom_rev_key] = parent_rev
         # print "KFF new val:", val
         self.db[key] = marshal.dumps(val)
       else:
         # print "KFF filled by implication: '%s'" % path_so_far
         pass
       for ent in val.keys():
         if not ent[0] == '/':
           if path_so_far: next_path = path_so_far + '/' + ent
           else:           next_path = ent
           copy_descend(dumper, ctx, branch, val, ent, next_path, parent_rev)

     # print ""
     # print "KFF filling path: '%s'" % svn_path
     copy_descend(dumper, ctx, branch, parent, branch, "", SVN_INVALID_REVNUM)
     # print ""


 class Commit:
   def __init__(self):
     self.files = { }
     self.changes = [ ]
     self.deletes = [ ]
     self.t_min = 1<<30
     self.t_max = 0

   def has_file(self, fname):
     return self.files.has_key(fname)

   def add(self, t, op, file, rev, branch_name, tags, branches):
     # Record the time range of this commit.
     #
     # ### ISSUE: It's possible, though unlikely, that the time range
     # of a commit could get gradually expanded to be arbitrarily
     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
     # problem, and anyway deciding where to break it up would be a
     # judgement call. For now, we just print a warning in commit() if
     # this happens.
     if t < self.t_min:
       self.t_min = t
     if t > self.t_max:
       self.t_max = t

     if op == OP_CHANGE:
       self.changes.append((file, rev, branch_name, tags, branches))
     else:
       # OP_DELETE
       self.deletes.append((file, rev, branch_name, tags, branches))
     self.files[file] = 1

   def get_metadata(self):
     # by definition, the author and log message must be the same for all
     # items that went into this commit. therefore, just grab any item from
     # our record of changes/deletes.
     if self.changes:
       file, rev, br, tags, branches = self.changes[0]
     else:
       # there better be one...
       file, rev, br, tags, branches = self.deletes[0]

     # now, fetch the author/log from the ,v file
     rip = RevInfoParser()
     rip.parse_cvs_file(file)
     author = rip.authors[rev]
     log = rip.logs[rev]
     # and we already have the date, so just format it
     date = format_date(self.t_max)

     return author, log, date

   def commit(self, dumper, ctx, sym_tracker):
     # commit this transaction
     seconds = self.t_max - self.t_min
     print 'committing: %s, over %d seconds' % (time.ctime(self.t_min), seconds)
     if seconds > COMMIT_THRESHOLD:
       print 'WARNING: commit spans more than %d seconds' % COMMIT_THRESHOLD

     if ctx.dry_run:
       for f, r, br, tags, branches in self.changes:
         # compute a repository path, dropping the ,v from the file name
         svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
         print '    adding or changing %s : %s' % (r, svn_path)
       for f, r, br, tags, branches in self.deletes:
         # compute a repository path, dropping the ,v from the file name
         svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
         print '    deleting %s : %s' % (r, svn_path)
       print '    (skipped; dry run enabled)'
       return

     do_copies = [ ]

     # get the metadata for this commit
     author, log, date = self.get_metadata()
     try:
       ### FIXME: The 'replace' behavior should be an option, like
       ### --encoding is.
       unicode_author = unicode(author, ctx.encoding, 'replace')
       unicode_log = unicode(log, ctx.encoding, 'replace')
       props = { 'svn:author' : unicode_author.encode('utf8'),
                 'svn:log' : unicode_log.encode('utf8'),
                 'svn:date' : date }
     except UnicodeError:
       print 'Problem encoding author or log message:'
       print "  author: '%s'" % author
       print "  log:    '%s'" % log
       print "  date:   '%s'" % date
       for rcs_file, cvs_rev, br, tags, branches in self.changes:
         print "    rev %s of '%s'" % (cvs_rev, rcs_file)
       print 'Try rerunning with (for example) \"--encoding=latin1\".'
       sys.exit(1)

     ### FIXME: Until we handle branches and tags, there's a
     ### possibility that none of the code below will get used.  For
     ### example, if the CVS file was added on a branch, then its
     ### revision 1.1 will start out in state "dead", and the RCS file
     ### will be in the Attic/.  If that file is the only item in the
     ### commit, then we won't hit the `self.changes' case at all, and
     ### we won't do anything in the `self.deletes' case, since we
     ### don't handle the branch right now, and we special-case
     ### revision 1.1.
     ###
     ### So among other things, this variable tells us whether we
     ### actually wrote anything to the dumpfile.
     svn_rev = SVN_INVALID_REVNUM

     for rcs_file, cvs_rev, br, tags, branches in self.changes:
       # compute a repository path, dropping the ,v from the file name
       cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
       svn_path = make_path(ctx, cvs_path, br)
       print '    adding or changing %s : %s' % (cvs_rev, svn_path)
       if svn_rev == SVN_INVALID_REVNUM:
         svn_rev = dumper.start_revision(props)
       sym_tracker.enroot_names(svn_path, svn_rev, tags, branches)
       if br:
         ### FIXME: Here is an obvious optimization point.  Probably
         ### dump.probe_path(PATH) is kind of slow, because it does N
         ### database lookups for the N components in PATH.  If this
         ### turns out to be a performance bottleneck, we can just
         ### maintain a database mirroring just the head tree, but
         ### keyed on full paths, to reduce the check to a quick
         ### constant time query.
         if not dumper.probe_path(svn_path):
           sym_tracker.fill_branch(dumper, ctx, br, svn_rev, svn_path)
       closed_names = dumper.add_or_change_path(cvs_path, svn_path,
                                                cvs_rev, rcs_file,
                                                tags, branches)
       sym_tracker.close_names(svn_path, svn_rev, closed_names)

     for rcs_file, cvs_rev, br, tags, branches in self.deletes:
       # compute a repository path, dropping the ,v from the file name
       cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
       svn_path = make_path(ctx, cvs_path, br)
       print '    deleting %s : %s' % (cvs_rev, svn_path)
       if cvs_rev != '1.1':
         if svn_rev == SVN_INVALID_REVNUM:
           svn_rev = dumper.start_revision(props)
         # Uh, can this even happen on a deleted path?  Hmmm.  If not,
         # there's no risk, since tags and branches would just be empty
         # and therefore enrooting would be a no-op.  Still, it would
         # be clearer to know for sure and simply not call it.
         sym_tracker.enroot_names(svn_path, svn_rev, tags, branches)
         ### FIXME: this will return path_deleted == None if no path
         ### was deleted.  But we'll already have started the revision
         ### by then, so it's a bit late to use the knowledge!  Need to
         ### reorganize things so that starting the revision is a
         ### callback with its own internal conditional, so anyone can
         ### just invoke when they know they're really about to do
         ### something.
         ###
         ### Right now what happens is we get an empty revision
         ### (assuming nothing else happened in this revision), so it
         ### won't show up 'svn log' output, even when invoked on the
         ### root -- because no paths changed!  That needs to be fixed,
         ### regardless of whether cvs2svn creates such revisions.
         path_deleted, closed_names = dumper.delete_path(svn_path,
                                                         tags, branches,
                                                         ctx.prune)
         sym_tracker.close_names(svn_path, svn_rev, closed_names)

     if svn_rev != SVN_INVALID_REVNUM:
       print '    new revision:', svn_rev
     else:
       print '    no new revision created, as nothing to do'


 def read_resync(fname):
   "Read the .resync file into memory."

   ### note that we assume that we can hold the entire resync file in
   ### memory. really large repositories with whacky timestamps could
   ### bust this assumption. should that ever happen, then it is possible
   ### to split the resync file into pieces and make multiple passes,
   ### using each piece.

   #
   # A digest maps to a sequence of lists which specify a lower and upper
   # time bound for matching up the commit. We keep a sequence of these
   # because a number of checkins with the same log message (e.g. an empty
   # log message) could need to be remapped. We also make them a list because
   # we will dynamically expand the lower/upper bound as we find commits
   # that fall into a particular msg and time range.
   #
   # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
   #
   resync = { }

   for line in fileinput.FileInput(fname):
     t1 = int(line[:8], 16)
     digest = line[9:DIGEST_END_IDX]
     t2 = int(line[DIGEST_END_IDX+1:], 16)
     t1_l = t1 - COMMIT_THRESHOLD/2
     t1_u = t1 + COMMIT_THRESHOLD/2
     if resync.has_key(digest):
       resync[digest].append([t1_l, t1_u, t2])
     else:
       resync[digest] = [ [t1_l, t1_u, t2] ]
   return resync


 def parse_revs_line(line):
   data = line.split(' ', 6)
   timestamp = int(data[0], 16)
   id = data[1]
   op = data[2]
   rev = data[3]
   branch_name = data[4]
   if branch_name == "*":
     branch_name = None
   ntags = int(data[5])
   tags = data[6].split(' ', ntags + 1)
   nbranches = int(tags[ntags])
   branches = tags[ntags + 1].split(' ', nbranches)
   fname = branches[nbranches][:-1]  # strip \n
   tags = tags[:ntags]
   branches = branches[:nbranches]

   return timestamp, id, op, rev, fname, branch_name, tags, branches


 def write_revs_line(output, timestamp, digest, op, revision, fname,
                     branch_name, tags, branches):
   output.write('%08lx %s %s %s ' % (timestamp, digest, op, revision))
   if not branch_name:
     branch_name = "*"
   output.write('%s ' % branch_name)
   output.write('%d ' % (len(tags)))
   for tag in tags:
     output.write('%s ' % (tag))
   output.write('%d ' % (len(branches)))
   for branch in branches:
     output.write('%s ' % (branch))
   output.write('%s\n' % fname)


 def pass1(ctx):
   cd = CollectData(ctx.cvsroot, DATAFILE)
   p = rcsparse.Parser()
   stats = [ 0 ]
   os.path.walk(ctx.cvsroot, visit_file, (cd, p, stats))
   if ctx.verbose:
     print 'processed', stats[0], 'files'


 def pass2(ctx):
   "Pass 2: clean up the revision information."

   # We may have recorded some changes in revisions' timestamp. We need to
   # scan for any other files which may have had the same log message and
   # occurred at "the same time" and change their timestamps, too.

   # read the resync data file
   resync = read_resync(ctx.log_fname_base + RESYNC_SUFFIX)

   output = open(ctx.log_fname_base + CLEAN_REVS_SUFFIX, 'w')

   # process the revisions file, looking for items to clean up
   for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
     timestamp, digest, op, rev, fname, branch_name, tags, branches = \
       parse_revs_line(line)
     if not resync.has_key(digest):
       output.write(line)
       continue

     # we have a hit. see if this is "near" any of the resync records we
     # have recorded for this digest [of the log message].
     for record in resync[digest]:
       if record[0] <= timestamp <= record[1]:
         # bingo! remap the time on this (record[2] is the new time).
         write_revs_line(output, record[2], digest, op, rev, fname,
                         branch_name, tags, branches)

         print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
               % (relative_name(ctx.cvsroot, fname),
                  rev, time.ctime(timestamp), time.ctime(record[2]))

         # adjust the time range. we want the COMMIT_THRESHOLD from the
         # bounds of the earlier/latest commit in this group.
         record[0] = min(record[0], timestamp - COMMIT_THRESHOLD/2)
         record[1] = max(record[1], timestamp + COMMIT_THRESHOLD/2)

         # stop looking for hits
         break
     else:
       # the file/rev did not need to have its time changed.
       output.write(line)


 def pass3(ctx):
   # sort the log files
   os.system('sort %s > %s' % (ctx.log_fname_base + CLEAN_REVS_SUFFIX,
                               ctx.log_fname_base + SORTED_REVS_SUFFIX))


 def pass4(ctx):
   # create the target repository
   if not ctx.dry_run:
     if ctx.create_repos:
       os.system('%s create %s' % (ctx.svnadmin, ctx.target))
   else:
     t_fs = t_repos = None

   sym_tracker = SymbolicNameTracker()

   # A dictionary of Commit objects, keyed by digest.  Each object
   # represents one logical commit, which may involve multiple files.
   #
   # The reason this is a dictionary, not a single object, is that
   # there may be multiple commits interleaved in time.  A commit can
   # span up to COMMIT_THRESHOLD seconds, which leaves plenty of time
   # for parts of some other commit to occur.  Since the s-revs file is
   # sorted by timestamp first, then by digest within each timestamp,
   # it's quite easy to have interleaved commits.
   commits = { }

   # The number of separate commits processed in a given flush.  This
   # is used only for printing statistics, it does not affect the
   # results in the repository.
   count = 0

   # Start the dumpfile object.
   dumper = Dumper(ctx.dumpfile)

   # process the logfiles, creating the target
   for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
     timestamp, id, op, rev, fname, branch_name, tags, branches = \
       parse_revs_line(line)

     ### for now, only handle changes on the trunk until we get the tag
     ### and branch processing to stop making so many copies
     if not trunk_rev.match(rev):
       ### note this could/should have caused a flush, but the next item
       ### will take care of that for us
       ###
       ### TODO: working here.  Because of this condition, we're not
       ### seeing tags and branches rooted in initial revisions (CVS's
       ### infamous "1.1.1.1").
       ###
       ### See http://www.cs.uh.edu/~wjin/cvs/train/cvstrain-7.4.4.html
       ### for excellent clarification of the vendor branch thang.
       continue
       # pass

     # Each time we read a new line, we scan the commits we've
     # accumulated so far to see if any are ready for processing now.
     process = [ ]
     for scan_id, scan_c in commits.items():

       # ### ISSUE: the has_file() check below is not optimal.
       # It does fix the dataloss bug where revisions would get lost
       # if checked in too quickly, but it can also break apart the
       # commits. The correct fix would require tracking the dependencies
       # between change sets and committing them in proper order.
       if scan_c.t_max + COMMIT_THRESHOLD < timestamp or \
          scan_c.has_file(fname):
         process.append((scan_c.t_max, scan_c))
         del commits[scan_id]

     # If there are any elements in 'process' at this point, they need
     # to be committed, because this latest rev couldn't possibly be
     # part of any of them.  Sort them into time-order, then commit 'em.
     process.sort()
     for t_max, c in process:
       c.commit(dumper, ctx, sym_tracker)
     count = count + len(process)

     # Add this item into the set of still-available commits.
     if commits.has_key(id):
       c = commits[id]
     else:
       c = commits[id] = Commit()
     c.add(timestamp, op, fname, rev, branch_name, tags, branches)

   # End of the sorted revs file.  Flush any remaining commits:
   if commits:
     process = [ ]
     for id, c in commits.items():
       process.append((c.t_max, c))
     process.sort()
     for t_max, c in process:
       c.commit(dumper, ctx, sym_tracker)
     count = count + len(process)

   dumper.close()

   if ctx.verbose:
     print count, 'commits processed.'


 def pass5(ctx):
   if not ctx.dry_run:
     # ### FIXME: Er, does this "<" stuff work under Windows?
     # ### If not, then in general how do we load dumpfiles under Windows?
     print 'loading %s into %s' % (ctx.dumpfile, ctx.target)
     os.system('%s load --ignore-uuid %s < %s'
               % (ctx.svnadmin, ctx.target, ctx.dumpfile))


 _passes = [
   pass1,
   pass2,
   pass3,
   pass4,
   pass5,
   ]


 class _ctx:
   pass


 def convert(ctx, start_pass=1):
   "Convert a CVS repository to an SVN repository."

   times = [ None ] * len(_passes)
   for i in range(start_pass - 1, len(_passes)):
     times[i] = time.time()
     if verbose:
       print '----- pass %d -----' % (i + 1)
     _passes[i](ctx)
   times.append(time.time())

   if verbose:
     for i in range(start_pass, len(_passes)+1):
       print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
     print ' total:', int(times[len(_passes)] - times[start_pass-1]), 'seconds'


 def usage(ctx):
   print 'USAGE: %s [-n] [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
         % os.path.basename(sys.argv[0])
   print '  -n               dry run; parse CVS repos, but do not construct SVN repos'
   print '  -v               verbose'
   print '  -s PATH          path for SVN repos'
   print '  -p NUM           start at pass NUM of %d' % len(_passes)
   print '  --create         create a new SVN repository'
   print '  --dumpfile=PATH  name of intermediate svn dumpfile'
   print '  --svnadmin=PATH  path to the svnadmin program'
   print '  --trunk=PATH     path for trunk (default: %s)' % ctx.trunk_base
   # print '  --branches=PATH  path for branches (default: %s)' % ctx.branches_base
   # print '  --tags=PATH      path for tags (default: %s)' % ctx.tags_base
   print '  --no-prune       don\'t prune empty directories'
   print '  --encoding=ENC   encoding of log messages in CVS repos (default: %s)' % ctx.encoding
   sys.exit(1)


 def main():
   # prepare the operation context
   ctx = _ctx()
   ctx.cvsroot = None
   ctx.target = SVNROOT
   ctx.log_fname_base = DATAFILE
   ctx.dumpfile = DUMPFILE
   ctx.verbose = 0
   ctx.dry_run = 0
   ctx.prune = 1
   ctx.create_repos = 0
   ctx.trunk_base = "trunk"
   ctx.tags_base = "tags"
   ctx.branches_base = "branches"
   ctx.encoding = "ascii"
   ctx.svnadmin = "svnadmin"

   try:
     opts, args = getopt.getopt(sys.argv[1:], 'p:s:vn',
                                [ "create", "trunk=",
                                  "branches=", "tags=", "encoding=",
                                  "no-prune"])
   except getopt.GetoptError:
     usage(ctx)
   if len(args) != 1:
     usage(ctx)

   ctx.cvsroot = args[0]
   start_pass = 1

   for opt, value in opts:
     if opt == '-p':
       start_pass = int(value)
       if start_pass < 1 or start_pass > len(_passes):
         print 'ERROR: illegal value (%d) for starting pass. ' \
               'must be 1 through %d.' % (start_pass, len(_passes))
         sys.exit(1)
     elif opt == '-v':
       ctx.verbose = 1
     elif opt == '-n':
       ctx.dry_run = 1
     elif opt == '-s':
       ctx.target = value
     elif opt == '--create':
       ctx.create_repos = 1
     elif opt == '--dumpfile':
       ctx.dumpfile = value
     elif opt == '--svnadmin':
       ctx.svnadmin = value
     elif opt == '--trunk':
       ctx.trunk_base = value
     elif opt == '--branches':
       ctx.branches_base = value
     elif opt == '--tags':
       ctx.tags_base = value
     elif opt == '--no-prune':
       ctx.prune = None
     elif opt == '--encoding':
       ctx.encoding = value

   convert(ctx, start_pass=start_pass)


 if __name__ == '__main__':
   main()