#!/usr/bin/env python
#
# cvs2svn: ...
#
import rcsparse
import os
import sys
import sha
import re
import time
import fileinput
import string
import getopt
import statcache
from svn import fs, util, _delta, _repos
### these should go somewhere else. should have SWIG export them.
svn_node_none = 0
svn_node_file = 1
svn_node_dir = 2
svn_node_unknown = 3
trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
DATAFILE = 'cvs2svn-data'
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
TAGS_SUFFIX = '.tags'
RESYNC_SUFFIX = '.resync'
SVNROOT = 'svnroot'
ATTIC = os.sep + 'Attic'
COMMIT_THRESHOLD = 5 * 60 # flush a commit if a 5 minute gap occurs
OP_DELETE = 'D'
OP_CHANGE = 'C'
DIGEST_END_IDX = 9 + (sha.digestsize * 2)
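# The data files written by pass 1 use fixed-width fields so that later
# passes can slice them by offset (see DIGEST_END_IDX above and
# parse_revs_line() below).  A sketch of the two layouts, using a made-up
# digest and path purely for illustration:
#
#   .revs   line: '<8-hex-time> <40-hex-digest> <op> <rev> <fname>\n'
#                 e.g. '3b9aca00 <40-hex-digest> C 1.3 /cvs/proj/foo.c,v'
#   .resync line: '<8-hex-old-time> <40-hex-digest> <8-hex-new-time>\n'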
verbose = 1
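
# A quick sketch of the conversion flow, summarized from the pass functions
# below (see pass1 through pass4 and convert()):
#
#   pass 1: walk the CVS repository, parsing each ,v file and writing one
#           line per revision to the .revs file (plus .tags and .resync)
#   pass 2: apply the .resync records, rewriting timestamps so that commits
#           sharing a log message line up in time (.c-revs)
#   pass 3: sort the cleaned-up revisions by time (.s-revs)
#   pass 4: aggregate the sorted revisions into commits and replay them
#           into a new Subversion repository (trunk-only, per the note there)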
class CollectData(rcsparse.Sink):
def __init__(self, cvsroot, log_fname_base):
self.cvsroot = cvsroot
    self.revs = open(log_fname_base + REVS_SUFFIX, 'w')
    self.tags = open(log_fname_base + TAGS_SUFFIX, 'w')
    self.resync = open(log_fname_base + RESYNC_SUFFIX, 'w')
def set_fname(self, fname):
"Prepare to receive data for a new file."
self.fname = fname
# revision -> [timestamp, author, operation, old-timestamp]
self.rev_data = { }
self.prev = { }
def define_tag(self, name, revision):
self.tags.write('%s %s %s\n' % (name, revision, self.fname))
def define_revision(self, revision, timestamp, author, state,
branches, next):
### what else?
if state == 'dead':
op = OP_DELETE
else:
op = OP_CHANGE
# store the rev_data as a list in case we have to jigger the timestamp
self.rev_data[revision] = [int(timestamp), author, op, None]
# record the previous revision for sanity checking later
if trunk_rev.match(revision):
self.prev[revision] = next
elif next:
self.prev[next] = revision
for b in branches:
self.prev[b] = revision
def tree_completed(self):
"The revision tree has been parsed. Analyze it for consistency."
    # Our algorithm depends upon the timestamps on the revisions occurring
# monotonically over time. That is, we want to see rev 1.34 occur in
# time before rev 1.35. If we inserted 1.35 *first* (due to the time-
# sorting), and then tried to insert 1.34, we'd be screwed.
# to perform the analysis, we'll simply visit all of the 'previous'
# links that we have recorded and validate that the timestamp on the
# previous revision is before the specified revision
# if we have to resync some nodes, then we restart the scan. just keep
# looping as long as we need to restart.
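    #
    # For example (a sketch): if rev 1.34 was recorded at t=1000 but its
    # successor 1.35 at t=990, we retime 1.34 to 989 (one second before
    # 1.35), then walk further back: if 1.33 was at t=995, it gets retimed
    # to 988, and so on until the timestamps decrease monotonically.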
while 1:
for current, prev in self.prev.items():
if not prev:
# no previous revision exists (i.e. the initial revision)
continue
t_c = self.rev_data[current][0]
t_p = self.rev_data[prev][0]
if t_p >= t_c:
# the previous revision occurred later than the current revision.
# shove the previous revision back in time (and any before it that
# may need to shift).
while t_p >= t_c:
self.rev_data[prev][0] = t_c - 1 # new timestamp
self.rev_data[prev][3] = t_p # old timestamp
print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
% (relative_name(self.cvsroot, self.fname),
prev, time.ctime(t_p), time.ctime(t_c - 1))
current = prev
prev = self.prev[current]
if not prev:
break
            t_c = t_c - 1  # the new timestamp just assigned to current,
                           # i.e. self.rev_data[current][0]
t_p = self.rev_data[prev][0]
# break from the for-loop
break
else:
# finished the for-loop (no resyncing was performed)
return
def set_revision_info(self, revision, log, text):
timestamp, author, op, old_ts = self.rev_data[revision]
digest = sha.new(log + '\0' + author).hexdigest()
if old_ts:
# the timestamp on this revision was changed. log it for later
      # resynchronization of other files' revisions that occurred
# for this time and log message.
self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
self.revs.write('%08lx %s %s %s %s\n' % (timestamp, digest,
op, revision, self.fname))
def relative_name(cvsroot, fname):
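  "Return fname with the leading cvsroot portion (e.g. '/cvs/') stripped."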
l = len(cvsroot)
if fname[:l] == cvsroot:
if fname[l] == '/':
return fname[l+1:]
return fname[l:]
  # fname is not under cvsroot; return it unchanged
  return fname
def visit_file(arg, dirname, files):
cd, p, stats = arg
for fname in files:
if fname[-2:] != ',v':
continue
pathname = os.path.join(dirname, fname)
if dirname[-6:] == ATTIC:
# drop the 'Attic' portion from the pathname
### we should record this so we can easily insert it back in
cd.set_fname(os.path.join(dirname[:-6], fname))
else:
cd.set_fname(pathname)
if verbose:
print pathname
p.parse(open(pathname), cd)
stats[0] = stats[0] + 1
class RevInfoParser(rcsparse.Sink):
def __init__(self):
self.authors = { } # revision -> author
self.logs = { } # revision -> log message
def define_revision(self, revision, timestamp, author, state,
branches, next):
self.authors[revision] = author
def set_revision_info(self, revision, log, text):
self.logs[revision] = log
def parse_cvs_file(self, rcs_pathname):
try:
rcsfile = open(rcs_pathname, 'r')
except:
try:
dirname, fname = os.path.split(rcs_pathname)
rcs_pathname = os.path.join(dirname, "Attic", fname)
rcsfile = open(rcs_pathname, 'r')
except:
### should use a better error
raise RuntimeError, ('error: %s appeared to be under CVS control, '
'but the RCS file is inaccessible.'
% rcs_pathname)
rcsparse.Parser().parse(rcsfile, self)
class BuildRevision(rcsparse.Sink):
  # the delete and add commands that appear in an RCS delta, e.g. "d1 2"
  d_command = re.compile('^d(\\d+)\\s(\\d+)')
  a_command = re.compile('^a(\\d+)\\s(\\d+)')

  def __init__(self, rev, get_metadata=0):
    self.rev = rev
    self.get_metadata = get_metadata
    self.prev_delta = { }  # revision -> the revision whose delta is based on it
    self.result = None
def define_revision(self, revision, timestamp, author, state,
branches, next):
for branch in branches:
self.prev_delta[branch] = revision
if next:
self.prev_delta[next] = revision
if self.get_metadata and revision == self.rev:
self.author = author
def tree_completed(self):
path = [ ]
revision = self.rev
while revision:
path.append(revision)
revision = self.prev_delta.get(revision)
path.reverse()
self.collect = path
def set_revision_info(self, revision, log, text):
if not self.collect:
# nothing more to do
### would be nice to halt the file parsing...
return
# NOTE: we assume that the deltas appear in the proper order within
# the RCS file, for streaming application. Thus, our max size is the
# largest revision of all involved (rather than the revision plus all
# diff entries).
    if revision != self.collect[0]:
      # not something we are interested in
      return
    # consume this revision; the next delta we want is the following one
    del self.collect[0]
    if self.get_metadata and revision == self.rev:
      self.log = log
    if self.result is None:
      self.result = string.split(text, '\n')
    else:
      adjust = 0
      add_lines_remaining = 0
      diffs = string.split(text, '\n')
for command in diffs:
if add_lines_remaining > 0:
# Insertion lines from a prior "a" command
self.result.insert(start_line + adjust, command)
add_lines_remaining = add_lines_remaining - 1
adjust = adjust + 1
else:
dmatch = self.d_command.match(command)
amatch = self.a_command.match(command)
if dmatch:
# "d" - Delete command
start_line = string.atoi(dmatch.group(1))
count = string.atoi(dmatch.group(2))
begin = start_line + adjust - 1
del self.result[begin:begin + count]
adjust = adjust - count
elif amatch:
# "a" - Add command
start_line = string.atoi(amatch.group(1))
count = string.atoi(amatch.group(2))
add_lines_remaining = count
else:
raise RuntimeError, 'Error parsing diff commands'
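
# A worked example of the delta application above (a sketch): applying the
# RCS delta text "d1 1\na1 1\nccc" to the lines ['aaa', 'bbb'] proceeds as
#
#   d1 1  ->  delete 1 line starting at old line 1    ->  ['bbb']  (adjust -1)
#   a1 1  ->  insert 1 following line after old line 1
#   ccc   ->  inserted at index 1 + adjust = 0        ->  ['ccc', 'bbb']
#
# Line numbers in the commands refer to the revision being patched, which is
# why the running 'adjust' offset is maintained.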
class Commit:
def __init__(self):
self.changes = [ ]
self.deletes = [ ]
self.t_min = 1<<30
self.t_max = 0
def add(self, t, op, file, rev):
# record the time range of this commit
if t < self.t_min:
self.t_min = t
if t > self.t_max:
self.t_max = t
if op == OP_CHANGE:
self.changes.append((file, rev))
else:
# OP_DELETE
self.deletes.append((file, rev))
def get_metadata(self, pool):
# by definition, the author and log message must be the same for all
# items that went into this commit. therefore, just grab any item from
# our record of changes/deletes.
if self.changes:
file, rev = self.changes[0]
else:
# there better be one...
file, rev = self.deletes[0]
# now, fetch the author/log from the ,v file
rip = RevInfoParser()
rip.parse_cvs_file(file)
author = rip.authors[rev]
log = rip.logs[rev]
# format the date properly
a_t = util.apr_time_ansi_put(self.t_max)[1]
date = util.svn_time_to_nts(a_t, pool)
return author, log, date
def commit(self, t_fs, ctx):
# commit this transaction
print 'committing: %s, over %d seconds' % (time.ctime(self.t_min),
self.t_max - self.t_min)
# create a pool for the entire commit
c_pool = util.svn_pool_create(ctx.pool)
rev = fs.youngest_rev(t_fs, c_pool)
txn = fs.begin_txn(t_fs, rev, c_pool)
root = fs.txn_root(txn, c_pool)
lastcommit = (None, None)
# create a pool for each file; it will be cleared on each iteration
f_pool = util.svn_pool_create(c_pool)
for f, r in self.changes:
# compute a repository path. ensure we have a leading "/" and drop
# the ,v from the file name
repos_path = '/' + relative_name(ctx.cvsroot, f[:-2])
#print 'DEBUG:', repos_path
print ' changing %s : %s' % (r, repos_path)
### hmm. need to clarify OS path separators vs FS path separators
dirname = os.path.dirname(repos_path)
if dirname != '/':
# get the components of the path (skipping the leading '/')
parts = string.split(dirname[1:], os.sep)
for i in range(1, len(parts) + 1):
# reassemble the pieces, adding a leading slash
parent_dir = '/' + string.join(parts[:i], '/')
if fs.check_path(root, parent_dir, f_pool) == svn_node_none:
print ' making dir:', parent_dir
fs.make_dir(root, parent_dir, f_pool)
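      # for example: for repos_path '/proj/sub/foo.c', the loop above checks
      # (and creates, if missing) '/proj' and then '/proj/sub'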
if fs.check_path(root, repos_path, f_pool) == svn_node_none:
created_file = 1
fs.make_file(root, repos_path, f_pool)
else:
created_file = 0
handler, baton = fs.apply_textdelta(root, repos_path, f_pool)
# figure out the real file path for "co"
try:
statcache.stat(f)
except os.error:
dirname, fname = os.path.split(f)
f = os.path.join(dirname, 'Attic', fname)
statcache.stat(f)
pipe = os.popen('co -q -p%s %s' % (r, f), 'r', 102400)
# if we just made the file, we can send it in one big hunk, rather
# than streaming it in.
### we should watch out for file sizes here; we don't want to yank
### in HUGE files...
if created_file:
_delta.svn_txdelta_send_string(pipe.read(), handler, baton, f_pool)
else:
# open an SVN stream onto the pipe
stream2 = util.svn_stream_from_stdio(pipe, f_pool)
# Get the current file contents from the repo, or, if we have
# multiple CVS revisions to the same file being done in this
# single commit, then get the contents of the previous
# revision from co, or else the delta won't be correct because
# the contents in the repo won't have changed yet.
if repos_path == lastcommit[0]:
infile2 = os.popen("co -q -p%s %s" % (lastcommit[1], f), "r", 102400)
stream1 = util.svn_stream_from_stdio(infile2, f_pool)
else:
stream1 = fs.file_contents(root, repos_path, f_pool)
txstream = _delta.svn_txdelta(stream1, stream2, f_pool)
_delta.svn_txdelta_send_txstream(txstream, handler, baton, f_pool)
# shut down the previous-rev pipe, if we opened it
infile2 = None
# shut down the current-rev pipe
pipe.close()
# wipe the pool. this will get rid of the pipe streams and the delta
# stream, and anything the FS may have done.
util.svn_pool_clear(f_pool)
# remember what we just did, for the next iteration
lastcommit = (repos_path, r)
for f, r in self.deletes:
# compute a repository path. ensure we have a leading "/" and drop
# the ,v from the file name
repos_path = '/' + relative_name(ctx.cvsroot, f[:-2])
print ' deleting %s : %s' % (r, repos_path)
# If the file was initially added on a branch, the first mainline
# revision will be marked dead, and thus, attempts to delete it will
# fail, since it doesn't really exist.
if r != '1.1':
### need to discriminate between OS paths and FS paths
fs.delete(root, repos_path, f_pool)
# wipe the pool, in case the delete loads it up
util.svn_pool_clear(f_pool)
# get the metadata for this commit
author, log, date = self.get_metadata(c_pool)
fs.change_txn_prop(txn, 'svn:author', author, c_pool)
fs.change_txn_prop(txn, 'svn:log', log, c_pool)
conflicts, new_rev = fs.commit_txn(txn)
# set the time to the proper (past) time
fs.change_rev_prop(t_fs, new_rev, 'svn:date', date, c_pool)
### how come conflicts is a newline?
if conflicts != '\n':
print ' CONFLICTS:', `conflicts`
print ' new revision:', new_rev
# done with the commit and file pools
util.svn_pool_destroy(c_pool)
def read_resync(fname):
"Read the .resync file into memory."
### note that we assume that we can hold the entire resync file in
### memory. really large repositories with whacky timestamps could
### bust this assumption. should that ever happen, then it is possible
### to split the resync file into pieces and make multiple passes,
### using each piece.
#
# A digest maps to a sequence of lists which specify a lower and upper
# time bound for matching up the commit. We keep a sequence of these
# because a number of checkins with the same log message (e.g. an empty
# log message) could need to be remapped. We also make them a list because
# we will dynamically expand the lower/upper bound as we find commits
# that fall into a particular msg and time range.
#
# resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
#
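  # For example (a sketch): with COMMIT_THRESHOLD at 300 seconds, a resync
  # line recording old time t1 and new time t2 starts out as the record
  # [t1 - 150, t1 + 150, t2]; pass 2 widens those bounds as it matches
  # commits (see the min/max adjustment there).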
resync = { }
for line in fileinput.FileInput(fname):
t1 = int(line[:8], 16)
digest = line[9:DIGEST_END_IDX]
t2 = int(line[DIGEST_END_IDX+1:], 16)
t1_l = t1 - COMMIT_THRESHOLD/2
t1_u = t1 + COMMIT_THRESHOLD/2
if resync.has_key(digest):
resync[digest].append([t1_l, t1_u, t2])
else:
resync[digest] = [ [t1_l, t1_u, t2] ]
return resync
def parse_revs_line(line):
timestamp = int(line[:8], 16)
id = line[9:DIGEST_END_IDX]
op = line[DIGEST_END_IDX + 1]
idx = string.find(line, ' ', DIGEST_END_IDX + 3)
rev = line[DIGEST_END_IDX+3:idx]
fname = line[idx+1:-1]
return timestamp, id, op, rev, fname
def pass1(ctx):
cd = CollectData(ctx.cvsroot, DATAFILE)
p = rcsparse.Parser()
stats = [ 0 ]
os.path.walk(ctx.cvsroot, visit_file, (cd, p, stats))
if ctx.verbose:
print 'processed', stats[0], 'files'
def pass2(ctx):
"Pass 2: clean up the revision information."
# We may have recorded some changes in revisions' timestamp. We need to
# scan for any other files which may have had the same log message and
# occurred at "the same time" and change their timestamps, too.
# read the resync data file
resync = read_resync(ctx.log_fname_base + RESYNC_SUFFIX)
output = open(ctx.log_fname_base + CLEAN_REVS_SUFFIX, 'w')
# process the revisions file, looking for items to clean up
for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
timestamp, digest, op, rev, fname = parse_revs_line(line)
if not resync.has_key(digest):
output.write(line)
continue
# we have a hit. see if this is "near" any of the resync records we
# have recorded for this digest [of the log message].
for record in resync[digest]:
if record[0] <= timestamp <= record[1]:
# bingo! remap the time on this (record[2] is the new time).
output.write('%08lx %s %s %s %s\n'
% (record[2], digest, op, rev, fname))
print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
% (relative_name(ctx.cvsroot, fname),
rev, time.ctime(timestamp), time.ctime(record[2]))
# adjust the time range. we want the COMMIT_THRESHOLD from the
# bounds of the earlier/latest commit in this group.
record[0] = min(record[0], timestamp - COMMIT_THRESHOLD/2)
record[1] = max(record[1], timestamp + COMMIT_THRESHOLD/2)
# stop looking for hits
break
else:
# the file/rev did not need to have its time changed.
output.write(line)
def pass3(ctx):
# sort the log files
os.system('sort %s > %s' % (ctx.log_fname_base + CLEAN_REVS_SUFFIX,
ctx.log_fname_base + SORTED_REVS_SUFFIX))
def pass4(ctx):
# create the target repository
t_repos = _repos.svn_repos_create(ctx.target, ctx.pool)
t_fs = _repos.svn_repos_fs(t_repos)
# process the logfiles, creating the target
commits = { }
count = 0
for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
timestamp, id, op, rev, fname = parse_revs_line(line)
### only handle changes on the trunk for now
if not trunk_rev.match(rev):
### technically, the timestamp on this could/should cause a flush.
### don't worry about it; the next item will handle it
continue
if commits.has_key(id):
c = commits[id]
else:
c = commits[id] = Commit()
c.add(timestamp, op, fname, rev)
# scan for commits to process
process = [ ]
for id, c in commits.items():
if c.t_max + COMMIT_THRESHOLD < timestamp:
process.append((c.t_max, c))
del commits[id]
process.sort()
for t_max, c in process:
c.commit(t_fs, ctx)
count = count + len(process)
# if there are any pending commits left, then flush them
if commits:
process = [ ]
for id, c in commits.items():
process.append((c.t_max, c))
process.sort()
for t_max, c in process:
c.commit(t_fs, ctx)
count = count + len(process)
if ctx.verbose:
print count, 'commits processed.'
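
# A sketch of the aggregation above: suppose the sorted input holds two
# revisions with digest A at t=1000 and t=1100, then one with digest B at
# t=1500.  The two A revisions accumulate into a single Commit (t_max=1100).
# When the B line is read, A's commit satisfies
# t_max + COMMIT_THRESHOLD = 1100 + 300 = 1400 < 1500, so it is flushed as
# one Subversion revision; B's commit is flushed by the final loop.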
_passes = [
pass1,
pass2,
pass3,
pass4,
]
class _ctx:
pass
def convert(pool, cvsroot,
target=SVNROOT, log_fname_base=DATAFILE, start_pass=1, verbose=0):
"Convert a CVS repository to an SVN repository."
# prepare the operation context
ctx = _ctx()
ctx.pool = pool
ctx.cvsroot = cvsroot
ctx.target = target
ctx.log_fname_base = log_fname_base
ctx.verbose = verbose
times = [ None ] * len(_passes)
for i in range(start_pass - 1, len(_passes)):
times[i] = time.time()
if verbose:
print '----- pass %d -----' % (i + 1)
_passes[i](ctx)
times.append(time.time())
if verbose:
for i in range(start_pass, len(_passes)+1):
print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
print ' total:', int(times[len(_passes)] - times[start_pass-1]), 'seconds'
def usage():
print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] repository-path' \
% sys.argv[0]
sys.exit(1)
def main():
opts, args = getopt.getopt(sys.argv[1:], 'p:s:v')
if len(args) != 1:
usage()
verbose = 0
start_pass = 1
target = SVNROOT
for opt, value in opts:
if opt == '-p':
start_pass = int(value)
if start_pass < 1 or start_pass > len(_passes):
        print 'ERROR: illegal value (%d) for starting pass; ' \
              'it must be 1 through %d.' % (start_pass, len(_passes))
sys.exit(1)
elif opt == '-v':
verbose = 1
elif opt == '-s':
target = value
util.run_app(convert, args[0],
start_pass=start_pass, verbose=verbose, target=target)
if __name__ == '__main__':
main()