#!/usr/bin/env python

"""Whitewash the contents of a Subversion file and its successors.

Usage: svn-obliterate.py REPOS_PATH PATH REVISION
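
This script works directly on the Berkeley DB tables ('nodes',
'strings', 'representations') under REPOS_PATH/db, so it is only
meaningful for a BDB-backed repository.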
| """ |
| |
| import sys |
| import os |
| import string |
| import re |
| import bsddb3 |
| from svn import repos, fs, core |
| |
| ### TODO: Clean out the transactions table. |
| ### TODO: Clean out the other stuff (maybe). |
| |
| def die(msg): |
| sys.stderr.write(msg + '\n') |
| sys.exit(1) |
| |
| |
| def get_rep_keys(skel): |
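    """Return the (PROP-KEY, DATA-KEY) pair parsed from a directory
    node-revision skel (one beginning with '((dir ')."""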
    # PROP-KEY and DATA-KEY (and maybe EDIT-KEY) follow the header,
    # again with possible atom size bits.
    size, rest = string.split(skel[6:], ' ', 1)
    path = rest[0:int(size)]
    rest = rest[int(size) + 1:]
    end_header = string.find(rest, ')')
    pieces = string.split(rest[end_header + 2:-1], ' ')
    prop_key = None
    data_key = None
    if pieces[0][0] in string.digits:
        del pieces[0]
    if pieces[0]:
        prop_key = pieces[0]
    if pieces[1][0] in string.digits:
        del pieces[1]
    if pieces[1]:
        data_key = pieces[1]
    return prop_key, data_key


def read_string(strings_db, string_key):
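    """Return the full contents of the string stored under STRING_KEY,
    concatenating the consecutive STRINGS_DB records that share that
    key."""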
    string_data = ''
    key, value = strings_db.set_location(string_key)
    while key == string_key:
        string_data = string_data + value
        key, value = strings_db.next()
    return string_data


def unparse_dirent_skel(entries):
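    """Convert ENTRIES, a mapping of entry names to node-revision IDs,
    back into a directory entry skel: '((LEN NAME LEN ID) ...)'."""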
    items = ''
    first_one = 1
    for name, id in entries.items():
        if not first_one:
            items = items + ' '
        first_one = 0
        items = items + '(%d %s %d %s)' % (len(name), name, len(id), id)
    return '(%s)' % items


def parse_dirent_skel(skel):
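    """Parse a directory entry skel into a dictionary mapping entry
    names to node-revision IDs."""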
    skel = skel[1:-1]
    entries = {}
    while 1:
        if not len(skel) or skel[0] != '(':
            break
        token, rest = string.split(skel[1:], ' ', 1)
        if skel[1] in string.digits:
            size = token
            name = rest[0:int(size)]
            rest = skel[1 + len(size) + 1 + int(size) + 1:]
        else:
            name = token
        match = re.match('([0-9]+ )?([a-zA-Z0-9]+\.[a-zA-Z0-9]+\.[a-zA-Z0-9]+)\)',
                         rest)
        if not match:
            break
        id = match.group(2)
        entries[name] = id
        skel = rest[len(match.group(0)) + 1:]
    return entries


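# Matches the leading '((fulltext ...' portion of a representation
# skel, up to and including the start of its '(md5 ' checksum atom.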
_fulltext_re = re.compile('^(\(\(fulltext [^\(]+)\(md5 (16 )?')
def fix_affected_dirlists(node, reps_db, strings_db, affected_nodes, dirlists):
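    """Remove from NODE's directory entry list every entry whose
    node-revision ID appears in AFFECTED_NODES, rewriting the affected
    string and representation records in place. Return the number of
    entries removed."""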
    prop_key, data_key = get_rep_keys(node)
    if not data_key:
        return 0
    data_rep = reps_db[data_key]

    # See if this is a fulltext rep. If so, the STRING-KEY is a
    # pretty easy find. We'll wipe that STRING-KEY, and clear the
    # checksum from the REPRESENTATION.
    match = _fulltext_re.match(data_rep)
    if not match:
        die('Unable to handle non-fulltext dirent list "%s"' % data_key)

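    # Skip the 16 raw MD5 bytes plus the ')) ' that closes the checksum
    # and header; what remains (minus the trailing ')') is the
    # STRING-KEY atom, possibly preceded by a length marker.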
    rep_rest = data_rep[len(match.group(0)) + 16 + 3:-1]
    pieces = string.split(rep_rest, ' ')
    string_key = pieces[-1]
    string_data = read_string(strings_db, string_key)
    entries = parse_dirent_skel(string_data)
    kill_count = 0
    for name, id in entries.items():
        if id in affected_nodes:
            kill_count = kill_count + 1
            del(entries[name])
    if kill_count:
        ### begin txn!
        del(strings_db[string_key])
        strings_db[string_key] = unparse_dirent_skel(entries)
        reps_db[data_key] = match.group(1) + \
                            '(md5 16 \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0)) ' + \
                            str(len(string_key)) + ' ' + string_key + ')'
        ### end txn!
    return kill_count


def parse_node_skel(skel):
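    """Return (PREV-ID, IS-DIR) parsed from the node-revision skel
    SKEL. PREV-ID is None when the node revision has no predecessor."""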
    # PREV-ID immediately follows the COMMITTED-PATH, unless there is
    # a skel atom size marker in there first.
    is_dir = 0
    if skel[0:7] == '((file ':
        size, rest = string.split(skel[7:], ' ', 1)
    elif skel[0:6] == '((dir ':
        is_dir = 1
        size, rest = string.split(skel[6:], ' ', 1)
    else:
        die("Unable to parse skel '%s'" % skel)
    path = rest[0:int(size)]
    rest = rest[int(size) + 1:]
    rest = rest[:string.find(rest, ')')]
    pieces = string.split(rest, ' ')
    prev_id = None
    if pieces[0][0] in string.digits:
        del pieces[0]
    if pieces[0] != '':
        prev_id = pieces[0]
    return prev_id, is_dir


def get_node_id(pool, repos_path, path, revision):
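    """Return the unparsed node-revision ID (as a string) of PATH in
    REVISION of the repository at REPOS_PATH. Invoked via
    core.run_app, which supplies POOL."""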
    # Open the repository and filesystem.
    repos_ptr = repos.open(repos_path, pool)
    fs_ptr = repos.fs(repos_ptr)

    # Fetch the node revision ID of interest
    rev_root = fs.revision_root(fs_ptr, int(revision), pool)
    return fs.unparse_id(fs.node_id(rev_root, path, pool), pool)


def append_successors(nodes, node_id, affected_nodes):
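    """Recursively append NODE_ID and all of its successors (as
    recorded in the NODES map) to AFFECTED_NODES."""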
    node = nodes[node_id]
    affected_nodes.append(node_id)
    for succ_id in node[2]:
        append_successors(nodes, succ_id, affected_nodes)


def main():
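    # kill_preds means "purge the whole line of history": walk back to
    # the oldest predecessor of the target node, then remove it and all
    # of its successors.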
    kill_preds = 1

    ### Until this thing learns to purge the 'changes' table, it is
    ### basically useless (because dumps/loads are entirely
    ### 'changes'-table driven). So just bail.

    print "This script will, at the moment, destroy your repository."
    print "You don't really want that, right?"
    sys.exit(0)

    # Parse the command-line arguments.
    argc = len(sys.argv)
    if argc < 4:
        print __doc__
        sys.exit(1)
    repos_path, path, revision = sys.argv[1:4]

    # Fetch the NODE-REV-ID of the PATH@REV which holds our interest.
    sys.stdout.write('Harvesting info for "%s" in r%s.\n' % \
                     (path, revision))
    sys.stdout.write('-- Determining node revision ID... ')
    sys.stdout.flush()
    node_id = core.run_app(get_node_id, repos_path, path, revision)
    sys.stdout.write('done. [%s]\n' % node_id)

    # Scan the nodes table, parsing skels and building a node tree.
    nodes = {}
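    # Each value in NODES is a list: [prev_id, is_dir, successor_ids].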
    sys.stdout.write('-- Building node tree... ')
    sys.stdout.flush()
    nodes_table = os.path.join(repos_path, 'db', 'nodes')
    nodes_db = bsddb3.btopen(nodes_table, 'w')
    for key in nodes_db.keys():
        if key == 'next-key':
            continue
        value = nodes_db[key]
        prev_id, is_dir = parse_node_skel(value)
        nodes[key] = [prev_id, is_dir, []]
    for key in nodes.keys():
        value = nodes[key]
        if value[0]:
            prev_value = nodes[value[0]]
            prev_value[2].append(key)
            nodes[value[0]] = prev_value
    sys.stdout.write('done. [found %d node(s)]\n' % len(nodes.keys()))

    # Determine the nodes we wish to purge.
    affected_nodes = []
    sys.stdout.write('-- Building node purge list... ')
    sys.stdout.flush()
    if kill_preds:
        prev_id = node_id
        while nodes[prev_id][0]:
            prev_id = nodes[prev_id][0]
        append_successors(nodes, prev_id, affected_nodes)
    sys.stdout.write('done. [found %d node(s)]\n' % len(affected_nodes))
    for id in affected_nodes:
        sys.stdout.write(' -- %s\n' % id)

    # Now, the hard part. We need to find every directory listing
    # that contains one of our to-be-purged nodes, and then remove
    # those nodes from the entries list.
    dirlists = []
    sys.stdout.write('-- Fixing affected directory entries lists... ')
    sys.stdout.flush()
    strings_table = os.path.join(repos_path, 'db', 'strings')
    strings_db = bsddb3.btopen(strings_table, 'w')
    reps_table = os.path.join(repos_path, 'db', 'representations')
    reps_db = bsddb3.btopen(reps_table, 'w')
    dirs_fixed = 0
    entries_fixed = 0
    for key in nodes.keys():
        value = nodes[key]
        if value[1]:
            node = nodes_db[key]
            kill_count = fix_affected_dirlists(node, reps_db, strings_db,
                                               affected_nodes, dirlists)
            if kill_count:
                sys.stdout.write(' -- %s\n' % key)
                dirs_fixed = dirs_fixed + 1
                entries_fixed = entries_fixed + kill_count
    sys.stdout.write('done. [fixed %d entries in %d dirs]\n' \
                     % (entries_fixed, dirs_fixed))

    sys.stdout.write('-- Removing deleted nodes... ')
    sys.stdout.flush()
    for key in affected_nodes:
        del(nodes_db[key])
    sys.stdout.write('done. [removed %d nodes]\n' % len(affected_nodes))

    # Clean up after ourselves.
    strings_db.sync()
    nodes_db.sync()
    reps_db.sync()
    strings_db.close()
    reps_db.close()
    nodes_db.close()


if __name__ == '__main__':
    main()