#!/usr/bin/env python

"""Whitewash the contents of a Subversion file and its successors.

Usage: svn-obliterate.py REPOS_PATH PATH REVISION
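
This script works directly on the Berkeley DB tables ('nodes',
'strings', 'representations') under REPOS_PATH/db, so it is only
meaningful for a BDB-backed repository.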
| """ |
| |
| import sys |
| import os |
| import string |
| import re |
| import bsddb3 |
| from svn import repos, fs, core |
| |
| ### TODO: Clean out the transactions table. |
| ### TODO: Clean out the other stuff (maybe). |
| |
| def die(msg): |
| sys.stderr.write(msg + '\n') |
| sys.exit(1) |
| |
| |
| def get_rep_keys(skel): |
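    """Return the (PROP-KEY, DATA-KEY) pair parsed from a directory
    node-revision skel (one beginning with '((dir ')."""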
    # PROP-KEY and DATA-KEY (and maybe EDIT-KEY) follow the header,
    # again with possible atom size bits.
    size, rest = string.split(skel[6:], ' ', 1)
    path = rest[0:int(size)]
    rest = rest[int(size) + 1:]
    end_header = string.find(rest, ')')
    pieces = string.split(rest[end_header + 2:-1], ' ')
    prop_key = None
    data_key = None
    if pieces[0][0] in string.digits:
        del pieces[0]
    if pieces[0]:
        prop_key = pieces[0]
    if pieces[1][0] in string.digits:
        del pieces[1]
    if pieces[1]:
        data_key = pieces[1]
    return prop_key, data_key


def read_string(strings_db, string_key):
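    """Return the full contents of the string stored under STRING_KEY,
    concatenating the consecutive STRINGS_DB records that share that
    key."""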
    string_data = ''
    key, value = strings_db.set_location(string_key)
    while key == string_key:
        string_data = string_data + value
        key, value = strings_db.next()
    return string_data


def unparse_dirent_skel(entries):
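    """Convert ENTRIES, a mapping of entry names to node-revision IDs,
    back into a directory entry skel: '((LEN NAME LEN ID) ...)'."""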
    items = ''
    first_one = 1
    for name, id in entries.items():
        if not first_one:
            items = items + ' '
        first_one = 0
        items = items + '(%d %s %d %s)' % (len(name), name, len(id), id)
    return '(%s)' % items


def parse_dirent_skel(skel):
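    """Parse a directory entry skel into a dictionary mapping entry
    names to node-revision IDs."""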
    skel = skel[1:-1]
    entries = {}
    while 1:
        if not len(skel) or skel[0] != '(':
            break
        token, rest = string.split(skel[1:], ' ', 1)
        if skel[1] in string.digits:
            size = token
            name = rest[0:int(size)]
            rest = skel[1 + len(size) + 1 + int(size) + 1:]
        else:
            name = token
        match = re.match('([0-9]+ )?([a-zA-Z0-9]+\.[a-zA-Z0-9]+\.[a-zA-Z0-9]+)\)',
                         rest)
        if not match:
            break
        id = match.group(2)
        entries[name] = id
        skel = rest[len(match.group(0)) + 1:]
    return entries


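# Matches the leading '((fulltext ...' portion of a representation
# skel, up to and including the start of its '(md5 ' checksum atom.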
_fulltext_re = re.compile('^(\(\(fulltext [^\(]+)\(md5 (16 )?')
def fix_affected_dirlists(node, reps_db, strings_db, affected_nodes, dirlists):
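    """Remove from NODE's directory entry list every entry whose
    node-revision ID appears in AFFECTED_NODES, rewriting the affected
    string and representation records in place. Return the number of
    entries removed."""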
    prop_key, data_key = get_rep_keys(node)
    if not data_key:
        return 0
    data_rep = reps_db[data_key]

    # See if this is a fulltext rep. If so, the STRING-KEY is a
    # pretty easy find. We'll wipe that STRING-KEY, and clear the
    # checksum from the REPRESENTATION.
    match = _fulltext_re.match(data_rep)
    if not match:
        die('Unable to handle non-fulltext dirent list "%s"' % data_key)

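    # Skip the 16 raw MD5 bytes plus the ')) ' that closes the checksum
    # and header; what remains (minus the trailing ')') is the
    # STRING-KEY atom, possibly preceded by a length marker.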
    rep_rest = data_rep[len(match.group(0)) + 16 + 3:-1]
    pieces = string.split(rep_rest, ' ')
    string_key = pieces[-1]
    string_data = read_string(strings_db, string_key)
    entries = parse_dirent_skel(string_data)
    kill_count = 0
    for name, id in entries.items():
        if id in affected_nodes:
            kill_count = kill_count + 1
            del(entries[name])
    if kill_count:
        ### begin txn!
        del(strings_db[string_key])
        strings_db[string_key] = unparse_dirent_skel(entries)
        reps_db[data_key] = match.group(1) + \
                            '(md5 16 \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0)) ' + \
                            str(len(string_key)) + ' ' + string_key + ')'
        ### end txn!
    return kill_count


def parse_node_skel(skel):
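    """Return (PREV-ID, IS-DIR) parsed from the node-revision skel
    SKEL. PREV-ID is None when the node revision has no predecessor."""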
    # PREV-ID immediately follows the COMMITTED-PATH, unless there is
    # a skel atom size marker in there first.
    is_dir = 0
    if skel[0:7] == '((file ':
        size, rest = string.split(skel[7:], ' ', 1)
    elif skel[0:6] == '((dir ':
        is_dir = 1
        size, rest = string.split(skel[6:], ' ', 1)
    else:
        die("Unable to parse skel '%s'" % skel)
    path = rest[0:int(size)]
    rest = rest[int(size) + 1:]
    rest = rest[:string.find(rest, ')')]
    pieces = string.split(rest, ' ')
    prev_id = None
    if pieces[0][0] in string.digits:
        del pieces[0]
    if pieces[0] != '':
        prev_id = pieces[0]
    return prev_id, is_dir


def get_node_id(pool, repos_path, path, revision):
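    """Return the unparsed node-revision ID (as a string) of PATH in
    REVISION of the repository at REPOS_PATH. Invoked via
    core.run_app, which supplies POOL."""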
    # Open the repository and filesystem.
    repos_ptr = repos.open(repos_path, pool)
    fs_ptr = repos.fs(repos_ptr)

    # Fetch the node revision ID of interest
    rev_root = fs.revision_root(fs_ptr, int(revision), pool)
    return fs.unparse_id(fs.node_id(rev_root, path, pool), pool)


def append_successors(nodes, node_id, affected_nodes):
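    """Recursively append NODE_ID and all of its successors (as
    recorded in the NODES map) to AFFECTED_NODES."""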
    node = nodes[node_id]
    affected_nodes.append(node_id)
    for succ_id in node[2]:
        append_successors(nodes, succ_id, affected_nodes)


def main():
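    # kill_preds means "purge the whole line of history": walk back to
    # the oldest predecessor of the target node, then remove it and all
    # of its successors.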
    kill_preds = 1

    ### Until this thing learns to purge the 'changes' table, it is
    ### basically useless (because dumps/loads are entirely
    ### 'changes'-table driven). So just bail.

    print "This script will, at the moment, destroy your repository."
    print "You don't really want that, right?"
    sys.exit(0)

    # Parse the command-line arguments.
    argc = len(sys.argv)
    if argc < 4:
        print __doc__
        sys.exit(1)
    repos_path, path, revision = sys.argv[1:4]

    # Fetch the NODE-REV-ID of the PATH@REV which holds our interest.
    sys.stdout.write('Harvesting info for "%s" in r%s.\n' % \
                     (path, revision))
    sys.stdout.write('-- Determining node revision ID... ')
    sys.stdout.flush()
    node_id = core.run_app(get_node_id, repos_path, path, revision)
    sys.stdout.write('done. [%s]\n' % node_id)

    # Scan the nodes table, parsing skels and building a node tree.
    nodes = {}
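    # Each value in NODES is a list: [prev_id, is_dir, successor_ids].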
    sys.stdout.write('-- Building node tree... ')
    sys.stdout.flush()
    nodes_table = os.path.join(repos_path, 'db', 'nodes')
    nodes_db = bsddb3.btopen(nodes_table, 'w')
    for key in nodes_db.keys():
        if key == 'next-key':
            continue
        value = nodes_db[key]
        prev_id, is_dir = parse_node_skel(value)
        nodes[key] = [prev_id, is_dir, []]
    for key in nodes.keys():
        value = nodes[key]
        if value[0]:
            prev_value = nodes[value[0]]
            prev_value[2].append(key)
            nodes[value[0]] = prev_value
    sys.stdout.write('done. [found %d node(s)]\n' % len(nodes.keys()))

    # Determine the nodes we wish to purge.
    affected_nodes = []
    sys.stdout.write('-- Building node purge list... ')
    sys.stdout.flush()
    if kill_preds:
        prev_id = node_id
        while nodes[prev_id][0]:
            prev_id = nodes[prev_id][0]
        append_successors(nodes, prev_id, affected_nodes)
    sys.stdout.write('done. [found %d node(s)]\n' % len(affected_nodes))
    for id in affected_nodes:
        sys.stdout.write(' -- %s\n' % id)

    # Now, the hard part. We need to find every directory listing
    # that contains one of our to-be-purged nodes, and then remove
    # those nodes from the entries list.
    dirlists = []
    sys.stdout.write('-- Fixing affected directory entries lists... ')
    sys.stdout.flush()
    strings_table = os.path.join(repos_path, 'db', 'strings')
    strings_db = bsddb3.btopen(strings_table, 'w')
    reps_table = os.path.join(repos_path, 'db', 'representations')
    reps_db = bsddb3.btopen(reps_table, 'w')
    dirs_fixed = 0
    entries_fixed = 0
    for key in nodes.keys():
        value = nodes[key]
        if value[1]:
            node = nodes_db[key]
            kill_count = fix_affected_dirlists(node, reps_db, strings_db,
                                               affected_nodes, dirlists)
            if kill_count:
                sys.stdout.write(' -- %s\n' % key)
                dirs_fixed = dirs_fixed + 1
                entries_fixed = entries_fixed + kill_count
    sys.stdout.write('done. [fixed %d entries in %d dirs]\n' \
                     % (entries_fixed, dirs_fixed))

    sys.stdout.write('-- Removing deleted nodes... ')
    sys.stdout.flush()
    for key in affected_nodes:
        del(nodes_db[key])
    sys.stdout.write('done. [removed %d nodes]\n' % len(affected_nodes))

    # Clean up after ourselves.
    strings_db.sync()
    nodes_db.sync()
    reps_db.sync()
    strings_db.close()
    reps_db.close()
    nodes_db.close()


if __name__ == '__main__':
    main()