#!/usr/bin/env python
#
# cvs2svn: ...
#
import rcsparse
import os
import sys
import sha
import re
import time
import fileinput
import string
import getopt
import statcache
from svn import fs, util, _delta, _repos
### these should go somewhere else. should have SWIG export them.
svn_node_none = 0
svn_node_file = 1
svn_node_dir = 2
svn_node_unknown = 3
trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
DATAFILE = 'cvs2svn-data'
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
TAGS_SUFFIX = '.tags'
RESYNC_SUFFIX = '.resync'
SVNROOT = 'svnroot'
ATTIC = os.sep + 'Attic'
COMMIT_THRESHOLD = 5 * 60 # flush a commit if a 5 minute gap occurs
OP_DELETE = 'D'
OP_CHANGE = 'C'
DIGEST_END_IDX = 9 + (sha.digestsize * 2)
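# The data files written by pass 1 use fixed-width fields so that later
# passes can slice them by offset (see DIGEST_END_IDX above and
# parse_revs_line() below).  A sketch of the two layouts, using a made-up
# digest and path purely for illustration:
#
#   .revs   line: '<8-hex-time> <40-hex-digest> <op> <rev> <fname>\n'
#                 e.g. '3b9aca00 <40-hex-digest> C 1.3 /cvs/proj/foo.c,v'
#   .resync line: '<8-hex-old-time> <40-hex-digest> <8-hex-new-time>\n'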
verbose = 1
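
# A quick sketch of the conversion flow, summarized from the pass functions
# below (see pass1 through pass4 and convert()):
#
#   pass 1: walk the CVS repository, parsing each ,v file and writing one
#           line per revision to the .revs file (plus .tags and .resync)
#   pass 2: apply the .resync records, rewriting timestamps so that commits
#           sharing a log message line up in time (.c-revs)
#   pass 3: sort the cleaned-up revisions by time (.s-revs)
#   pass 4: aggregate the sorted revisions into commits and replay them
#           into a new Subversion repository (trunk-only, per the note there)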
class CollectData(rcsparse.Sink):
def __init__(self, cvsroot, log_fname_base):
self.cvsroot = cvsroot
    self.revs = open(log_fname_base + REVS_SUFFIX, 'w')
    self.tags = open(log_fname_base + TAGS_SUFFIX, 'w')
    self.resync = open(log_fname_base + RESYNC_SUFFIX, 'w')
def set_fname(self, fname):
"Prepare to receive data for a new file."
self.fname = fname
# revision -> [timestamp, author, operation, old-timestamp]
self.rev_data = { }
self.prev = { }
def define_tag(self, name, revision):
self.tags.write('%s %s %s\n' % (name, revision, self.fname))
def define_revision(self, revision, timestamp, author, state,
branches, next):
### what else?
if state == 'dead':
op = OP_DELETE
else:
op = OP_CHANGE
# store the rev_data as a list in case we have to jigger the timestamp
self.rev_data[revision] = [int(timestamp), author, op, None]
# record the previous revision for sanity checking later
if trunk_rev.match(revision):
self.prev[revision] = next
elif next:
self.prev[next] = revision
for b in branches:
self.prev[b] = revision
def tree_completed(self):
"The revision tree has been parsed. Analyze it for consistency."
    # Our algorithm depends upon the timestamps on the revisions occurring
# monotonically over time. That is, we want to see rev 1.34 occur in
# time before rev 1.35. If we inserted 1.35 *first* (due to the time-
# sorting), and then tried to insert 1.34, we'd be screwed.
# to perform the analysis, we'll simply visit all of the 'previous'
# links that we have recorded and validate that the timestamp on the
# previous revision is before the specified revision
# if we have to resync some nodes, then we restart the scan. just keep
# looping as long as we need to restart.
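    #
    # For example (a sketch): if rev 1.34 was recorded at t=1000 but its
    # successor 1.35 at t=990, we retime 1.34 to 989 (one second before
    # 1.35), then walk further back: if 1.33 was at t=995, it gets retimed
    # to 988, and so on until the timestamps decrease monotonically.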
while 1:
for current, prev in self.prev.items():
if not prev:
# no previous revision exists (i.e. the initial revision)
continue
t_c = self.rev_data[current][0]
t_p = self.rev_data[prev][0]
if t_p >= t_c:
# the previous revision occurred later than the current revision.
# shove the previous revision back in time (and any before it that
# may need to shift).
while t_p >= t_c:
self.rev_data[prev][0] = t_c - 1 # new timestamp
self.rev_data[prev][3] = t_p # old timestamp
print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
% (relative_name(self.cvsroot, self.fname),
prev, time.ctime(t_p), time.ctime(t_c - 1))
current = prev
prev = self.prev[current]
if not prev:
break
            t_c = t_c - 1  # the new timestamp just assigned to current,
                           # i.e. self.rev_data[current][0]
t_p = self.rev_data[prev][0]
# break from the for-loop
break
else:
# finished the for-loop (no resyncing was performed)
return
def set_revision_info(self, revision, log, text):
timestamp, author, op, old_ts = self.rev_data[revision]
digest = sha.new(log + '\0' + author).hexdigest()
if old_ts:
# the timestamp on this revision was changed. log it for later
      # resynchronization of other files' revisions that occurred
# for this time and log message.
self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
self.revs.write('%08lx %s %s %s %s\n' % (timestamp, digest,
op, revision, self.fname))
def relative_name(cvsroot, fname):
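  "Return fname with the leading cvsroot portion (e.g. '/cvs/') stripped."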
l = len(cvsroot)
if fname[:l] == cvsroot:
if fname[l] == '/':
return fname[l+1:]
return fname[l:]
  # fname is not under cvsroot; return it unchanged
  return fname
def visit_file(arg, dirname, files):
cd, p, stats = arg
for fname in files:
if fname[-2:] != ',v':
continue
pathname = os.path.join(dirname, fname)
if dirname[-6:] == ATTIC:
# drop the 'Attic' portion from the pathname
### we should record this so we can easily insert it back in
cd.set_fname(os.path.join(dirname[:-6], fname))
else:
cd.set_fname(pathname)
if verbose:
print pathname
p.parse(open(pathname), cd)
stats[0] = stats[0] + 1
class RevInfoParser(rcsparse.Sink):
def __init__(self):
self.authors = { } # revision -> author
self.logs = { } # revision -> log message
def define_revision(self, revision, timestamp, author, state,
branches, next):
self.authors[revision] = author
def set_revision_info(self, revision, log, text):
self.logs[revision] = log
def parse_cvs_file(self, rcs_pathname):
try:
rcsfile = open(rcs_pathname, 'r')
except:
try:
dirname, fname = os.path.split(rcs_pathname)
rcs_pathname = os.path.join(dirname, "Attic", fname)
rcsfile = open(rcs_pathname, 'r')
except:
### should use a better error
raise RuntimeError, ('error: %s appeared to be under CVS control, '
'but the RCS file is inaccessible.'
% rcs_pathname)
rcsparse.Parser().parse(rcsfile, self)
class BuildRevision(rcsparse.Sink):
  # the delete and add commands that appear in an RCS delta, e.g. "d1 2"
  d_command = re.compile('^d(\\d+)\\s(\\d+)')
  a_command = re.compile('^a(\\d+)\\s(\\d+)')

  def __init__(self, rev, get_metadata=0):
    self.rev = rev
    self.get_metadata = get_metadata
    self.prev_delta = { }  # revision -> the revision whose delta is based on it
    self.result = None
def define_revision(self, revision, timestamp, author, state,
branches, next):
for branch in branches:
self.prev_delta[branch] = revision
if next:
self.prev_delta[next] = revision
if self.get_metadata and revision == self.rev:
self.author = author
def tree_completed(self):
path = [ ]
revision = self.rev
while revision:
path.append(revision)
revision = self.prev_delta.get(revision)
path.reverse()
self.collect = path
def set_revision_info(self, revision, log, text):
if not self.collect:
# nothing more to do
### would be nice to halt the file parsing...
return
# NOTE: we assume that the deltas appear in the proper order within
# the RCS file, for streaming application. Thus, our max size is the
# largest revision of all involved (rather than the revision plus all
# diff entries).
    if revision != self.collect[0]:
      # not something we are interested in
      return
    # consume this revision; the next delta we want is the following one
    del self.collect[0]
    if self.get_metadata and revision == self.rev:
      self.log = log
    if self.result is None:
      self.result = string.split(text, '\n')
    else:
      adjust = 0
      add_lines_remaining = 0
      diffs = string.split(text, '\n')
for command in diffs:
if add_lines_remaining > 0:
# Insertion lines from a prior "a" command
self.result.insert(start_line + adjust, command)
add_lines_remaining = add_lines_remaining - 1
adjust = adjust + 1
else:
dmatch = self.d_command.match(command)
amatch = self.a_command.match(command)
if dmatch:
# "d" - Delete command
start_line = string.atoi(dmatch.group(1))
count = string.atoi(dmatch.group(2))
begin = start_line + adjust - 1
del self.result[begin:begin + count]
adjust = adjust - count
elif amatch:
# "a" - Add command
start_line = string.atoi(amatch.group(1))
count = string.atoi(amatch.group(2))
add_lines_remaining = count
else:
raise RuntimeError, 'Error parsing diff commands'
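
# A worked example of the delta application above (a sketch): applying the
# RCS delta text "d1 1\na1 1\nccc" to the lines ['aaa', 'bbb'] proceeds as
#
#   d1 1  ->  delete 1 line starting at old line 1    ->  ['bbb']  (adjust -1)
#   a1 1  ->  insert 1 following line after old line 1
#   ccc   ->  inserted at index 1 + adjust = 0        ->  ['ccc', 'bbb']
#
# Line numbers in the commands refer to the revision being patched, which is
# why the running 'adjust' offset is maintained.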
class Commit:
def __init__(self):
self.changes = [ ]
self.deletes = [ ]
self.t_min = 1<<30
self.t_max = 0
def add(self, t, op, file, rev):
# record the time range of this commit
if t < self.t_min:
self.t_min = t
if t > self.t_max:
self.t_max = t
if op == OP_CHANGE:
self.changes.append((file, rev))
else:
# OP_DELETE
self.deletes.append((file, rev))
def get_metadata(self, pool):
# by definition, the author and log message must be the same for all
# items that went into this commit. therefore, just grab any item from
# our record of changes/deletes.
if self.changes:
file, rev = self.changes[0]
else:
# there better be one...
file, rev = self.deletes[0]
# now, fetch the author/log from the ,v file
rip = RevInfoParser()
rip.parse_cvs_file(file)
author = rip.authors[rev]
log = rip.logs[rev]
# format the date properly
a_t = util.apr_time_ansi_put(self.t_max)[1]
date = util.svn_time_to_nts(a_t, pool)
return author, log, date
def commit(self, t_fs, ctx):
# commit this transaction
print 'committing: %s, over %d seconds' % (time.ctime(self.t_min),
self.t_max - self.t_min)
# create a pool for the entire commit
c_pool = util.svn_pool_create(ctx.pool)
rev = fs.youngest_rev(t_fs, c_pool)
txn = fs.begin_txn(t_fs, rev, c_pool)
root = fs.txn_root(txn, c_pool)
lastcommit = (None, None)
# create a pool for each file; it will be cleared on each iteration
f_pool = util.svn_pool_create(c_pool)
for f, r in self.changes:
# compute a repository path. ensure we have a leading "/" and drop
# the ,v from the file name
repos_path = '/' + relative_name(ctx.cvsroot, f[:-2])
#print 'DEBUG:', repos_path
print ' changing %s : %s' % (r, repos_path)
### hmm. need to clarify OS path separators vs FS path separators
dirname = os.path.dirname(repos_path)
if dirname != '/':
# get the components of the path (skipping the leading '/')
parts = string.split(dirname[1:], os.sep)
for i in range(1, len(parts) + 1):
# reassemble the pieces, adding a leading slash
parent_dir = '/' + string.join(parts[:i], '/')
if fs.check_path(root, parent_dir, f_pool) == svn_node_none:
print ' making dir:', parent_dir
fs.make_dir(root, parent_dir, f_pool)
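      # for example: for repos_path '/proj/sub/foo.c', the loop above checks
      # (and creates, if missing) '/proj' and then '/proj/sub'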
if fs.check_path(root, repos_path, f_pool) == svn_node_none:
created_file = 1
fs.make_file(root, repos_path, f_pool)
else:
created_file = 0
handler, baton = fs.apply_textdelta(root, repos_path, f_pool)
# figure out the real file path for "co"
try:
statcache.stat(f)
except os.error:
dirname, fname = os.path.split(f)
f = os.path.join(dirname, 'Attic', fname)
statcache.stat(f)
pipe = os.popen('co -q -p%s %s' % (r, f), 'r', 102400)
# if we just made the file, we can send it in one big hunk, rather
# than streaming it in.
### we should watch out for file sizes here; we don't want to yank
### in HUGE files...
if created_file:
_delta.svn_txdelta_send_string(pipe.read(), handler, baton, f_pool)
else:
# open an SVN stream onto the pipe
stream2 = util.svn_stream_from_stdio(pipe, f_pool)
# Get the current file contents from the repo, or, if we have
# multiple CVS revisions to the same file being done in this
# single commit, then get the contents of the previous
# revision from co, or else the delta won't be correct because
# the contents in the repo won't have changed yet.
if repos_path == lastcommit[0]:
infile2 = os.popen("co -q -p%s %s" % (lastcommit[1], f), "r", 102400)
stream1 = util.svn_stream_from_stdio(infile2, f_pool)
else:
stream1 = fs.file_contents(root, repos_path, f_pool)
txstream = _delta.svn_txdelta(stream1, stream2, f_pool)
_delta.svn_txdelta_send_txstream(txstream, handler, baton, f_pool)
# shut down the previous-rev pipe, if we opened it
infile2 = None
# shut down the current-rev pipe
pipe.close()
# wipe the pool. this will get rid of the pipe streams and the delta
# stream, and anything the FS may have done.
util.svn_pool_clear(f_pool)
# remember what we just did, for the next iteration
lastcommit = (repos_path, r)
for f, r in self.deletes:
# compute a repository path. ensure we have a leading "/" and drop
# the ,v from the file name
repos_path = '/' + relative_name(ctx.cvsroot, f[:-2])
print ' deleting %s : %s' % (r, repos_path)
# If the file was initially added on a branch, the first mainline
# revision will be marked dead, and thus, attempts to delete it will
# fail, since it doesn't really exist.
if r != '1.1':
### need to discriminate between OS paths and FS paths
fs.delete(root, repos_path, f_pool)
# wipe the pool, in case the delete loads it up
util.svn_pool_clear(f_pool)
# get the metadata for this commit
author, log, date = self.get_metadata(c_pool)
fs.change_txn_prop(txn, 'svn:author', author, c_pool)
fs.change_txn_prop(txn, 'svn:log', log, c_pool)
conflicts, new_rev = fs.commit_txn(txn)
# set the time to the proper (past) time
fs.change_rev_prop(t_fs, new_rev, 'svn:date', date, c_pool)
### how come conflicts is a newline?
if conflicts != '\n':
print ' CONFLICTS:', `conflicts`
print ' new revision:', new_rev
# done with the commit and file pools
util.svn_pool_destroy(c_pool)
def read_resync(fname):
"Read the .resync file into memory."
### note that we assume that we can hold the entire resync file in
### memory. really large repositories with whacky timestamps could
### bust this assumption. should that ever happen, then it is possible
### to split the resync file into pieces and make multiple passes,
### using each piece.
#
# A digest maps to a sequence of lists which specify a lower and upper
# time bound for matching up the commit. We keep a sequence of these
# because a number of checkins with the same log message (e.g. an empty
# log message) could need to be remapped. We also make them a list because
# we will dynamically expand the lower/upper bound as we find commits
# that fall into a particular msg and time range.
#
# resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
#
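  # For example (a sketch): with COMMIT_THRESHOLD at 300 seconds, a resync
  # line recording old time t1 and new time t2 starts out as the record
  # [t1 - 150, t1 + 150, t2]; pass 2 widens those bounds as it matches
  # commits (see the min/max adjustment there).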
resync = { }
for line in fileinput.FileInput(fname):
t1 = int(line[:8], 16)
digest = line[9:DIGEST_END_IDX]
t2 = int(line[DIGEST_END_IDX+1:], 16)
t1_l = t1 - COMMIT_THRESHOLD/2
t1_u = t1 + COMMIT_THRESHOLD/2
if resync.has_key(digest):
resync[digest].append([t1_l, t1_u, t2])
else:
resync[digest] = [ [t1_l, t1_u, t2] ]
return resync
def parse_revs_line(line):
timestamp = int(line[:8], 16)
id = line[9:DIGEST_END_IDX]
op = line[DIGEST_END_IDX + 1]
idx = string.find(line, ' ', DIGEST_END_IDX + 3)
rev = line[DIGEST_END_IDX+3:idx]
fname = line[idx+1:-1]
return timestamp, id, op, rev, fname
def pass1(ctx):
cd = CollectData(ctx.cvsroot, DATAFILE)
p = rcsparse.Parser()
stats = [ 0 ]
os.path.walk(ctx.cvsroot, visit_file, (cd, p, stats))
if ctx.verbose:
print 'processed', stats[0], 'files'
def pass2(ctx):
"Pass 2: clean up the revision information."
# We may have recorded some changes in revisions' timestamp. We need to
# scan for any other files which may have had the same log message and
# occurred at "the same time" and change their timestamps, too.
# read the resync data file
resync = read_resync(ctx.log_fname_base + RESYNC_SUFFIX)
output = open(ctx.log_fname_base + CLEAN_REVS_SUFFIX, 'w')
# process the revisions file, looking for items to clean up
for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
timestamp, digest, op, rev, fname = parse_revs_line(line)
if not resync.has_key(digest):
output.write(line)
continue
# we have a hit. see if this is "near" any of the resync records we
# have recorded for this digest [of the log message].
for record in resync[digest]:
if record[0] <= timestamp <= record[1]:
# bingo! remap the time on this (record[2] is the new time).
output.write('%08lx %s %s %s %s\n'
% (record[2], digest, op, rev, fname))
print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
% (relative_name(ctx.cvsroot, fname),
rev, time.ctime(timestamp), time.ctime(record[2]))
# adjust the time range. we want the COMMIT_THRESHOLD from the
# bounds of the earlier/latest commit in this group.
record[0] = min(record[0], timestamp - COMMIT_THRESHOLD/2)
record[1] = max(record[1], timestamp + COMMIT_THRESHOLD/2)
# stop looking for hits
break
else:
# the file/rev did not need to have its time changed.
output.write(line)
def pass3(ctx):
# sort the log files
os.system('sort %s > %s' % (ctx.log_fname_base + CLEAN_REVS_SUFFIX,
ctx.log_fname_base + SORTED_REVS_SUFFIX))
def pass4(ctx):
# create the target repository
t_repos = _repos.svn_repos_create(ctx.target, ctx.pool)
t_fs = _repos.svn_repos_fs(t_repos)
# process the logfiles, creating the target
commits = { }
count = 0
for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
timestamp, id, op, rev, fname = parse_revs_line(line)
### only handle changes on the trunk for now
if not trunk_rev.match(rev):
### technically, the timestamp on this could/should cause a flush.
### don't worry about it; the next item will handle it
continue
if commits.has_key(id):
c = commits[id]
else:
c = commits[id] = Commit()
c.add(timestamp, op, fname, rev)
# scan for commits to process
process = [ ]
for id, c in commits.items():
if c.t_max + COMMIT_THRESHOLD < timestamp:
process.append((c.t_max, c))
del commits[id]
process.sort()
for t_max, c in process:
c.commit(t_fs, ctx)
count = count + len(process)
# if there are any pending commits left, then flush them
if commits:
process = [ ]
for id, c in commits.items():
process.append((c.t_max, c))
process.sort()
for t_max, c in process:
c.commit(t_fs, ctx)
count = count + len(process)
if ctx.verbose:
print count, 'commits processed.'
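
# A sketch of the aggregation above: suppose the sorted input holds two
# revisions with digest A at t=1000 and t=1100, then one with digest B at
# t=1500.  The two A revisions accumulate into a single Commit (t_max=1100).
# When the B line is read, A's commit satisfies
# t_max + COMMIT_THRESHOLD = 1100 + 300 = 1400 < 1500, so it is flushed as
# one Subversion revision; B's commit is flushed by the final loop.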
_passes = [
pass1,
pass2,
pass3,
pass4,
]
class _ctx:
pass
def convert(pool, cvsroot,
target=SVNROOT, log_fname_base=DATAFILE, start_pass=1, verbose=0):
"Convert a CVS repository to an SVN repository."
# prepare the operation context
ctx = _ctx()
ctx.pool = pool
ctx.cvsroot = cvsroot
ctx.target = target
ctx.log_fname_base = log_fname_base
ctx.verbose = verbose
times = [ None ] * len(_passes)
for i in range(start_pass - 1, len(_passes)):
times[i] = time.time()
if verbose:
print '----- pass %d -----' % (i + 1)
_passes[i](ctx)
times.append(time.time())
if verbose:
for i in range(start_pass, len(_passes)+1):
print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
print ' total:', int(times[len(_passes)] - times[start_pass-1]), 'seconds'
def usage():
print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] repository-path' \
% sys.argv[0]
sys.exit(1)
def main():
opts, args = getopt.getopt(sys.argv[1:], 'p:s:v')
if len(args) != 1:
usage()
verbose = 0
start_pass = 1
target = SVNROOT
for opt, value in opts:
if opt == '-p':
start_pass = int(value)
if start_pass < 1 or start_pass > len(_passes):
        print 'ERROR: illegal value (%d) for starting pass; ' \
              'it must be 1 through %d.' % (start_pass, len(_passes))
sys.exit(1)
elif opt == '-v':
verbose = 1
elif opt == '-s':
target = value
util.run_app(convert, args[0],
start_pass=start_pass, verbose=verbose, target=target)
if __name__ == '__main__':
main()