blob: 6a1b34c4163d996f9963c4bb7e9933f33a8e4f58 [file] [log] [blame]
#!/usr/bin/env python
usage = """
Usage:
$0 REPO-DIR FSFS-ID-WITH-BAD-OFFSET
-- Find the correct FSFS node-rev id, given one that is correct except for
its byte-offset part.
$0 REPO-DIR REV SIZE
-- Find a rep header that matches REV and SIZE.
Example:
Result of running 'svnadmin verify':
svnadmin: Corrupt node-revision '5-12302.1-12953.r12953/29475'
Invocation of this script:
$ $0 svn-repo 5-12302.1-12953.r12953/29475
Output of this script:
5-12302.1-12953.r12953/29255
"""
import os, sys
from fixer_config import *
class FixError(Exception):
"""An exception for any kind of inablility to repair the repository."""
pass
def parse_id(id):
"""Return the (NODEREV, REV, OFFSET) of ID, where ID is of the form
"NODE_ID.COPY_ID.rREV/OFFSET" and NODEREV is "NODE_ID.COPY_ID.rREV".
"""
node_id, copy_id, txn_id = id.split('.')
rev, offset = txn_id[1:].split('/')
noderev = node_id + '.' + copy_id + '.r' + rev
return noderev, rev, offset
class Repository:
"""Encapsulates key information of a repository:
its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV.
"""
def _read_repo_file(self, filename):
"""Read and return all lines from FILENAME in REPO.
"""
f = open(os.path.join(self.path, filename), "rb")
lines = f.readlines()
f.close()
return lines
def _read_config(self, filename):
""" Read and return all lines from FILENAME.
This will be used to read 'format', 'current' etc. . """
if filename not in self.db_config:
f = open(os.path.join(self.path, 'db', filename), "rb")
self.db_config[filename] = f.readlines()
f.close()
return self.db_config[filename]
def __init__(self, path_or_parent, name=None):
"""Constructor collecting everything we need to know about
the repository at path PATH_OR_PARENT or NAME within PARENT folder.
"""
if name is None:
self.name = os.path.basename(path_or_parent)
self.path = path_or_parent
else:
self.name = name
self.path = os.path.join(path_or_parent, name)
self.db_config = {}
self.repo_format = int(self._read_repo_file('format')[0])
self.fs_type = self._read_config('fs-type')[0].rstrip()
self.db_format = int(self._read_config('format')[0])
try:
self.shard_size = int(self._read_config('format')[1].split(' ')[2])
except IndexError:
self.shard_size = 0
if self.db_format >= 4:
self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])
else:
self.min_unpacked_rev = 0
self.head = int(self._read_config('current')[0])
def rev_file_path(self, rev):
"""Return the path to the revision file in the repository at REPO_DIR
(a path string) for revision number REV (int or string).
"""
if isinstance(rev, str):
rev = int(rev)
if self.shard_size > 0:
shard = int(rev) / self.shard_size
if rev < self.min_unpacked_rev:
path = os.path.join(self.path, 'db', 'revs', str(shard) + '.pack', 'pack')
else:
path = os.path.join(self.path, 'db', 'revs', str(shard), str(rev))
else:
path = os.path.join(self.path, 'db', 'revs', str(rev))
return path
def rev_file_indexes(self, rev):
"""Return (ids, texts), where IDS is a dictionary of all node-rev ids
defined in revision REV of the repo at REPO_DIR, in the form
{noderev: full_id}, and TEXTS is an array of
(offset, size, expanded-size, csum [,sha1-csum, uniquifier]) tuples
taken from all the "text: REV ..." representation lines
in revision REV.
Here, NODEREV is the node-revision id minus the /offset part, and
FULL_ID is the full node-revision id (including the /offset part).
"""
if isinstance(rev, str):
rev = int(rev)
ids = {}
texts = []
for line in open(self.rev_file_path(rev)):
if line.startswith('id: '):
id = line.replace('id: ', '').rstrip()
id_noderev, id_rev, _ = parse_id(id)
id_rev = int(id_rev)
# all ids in an unpacked rev file should match its rev number
if rev >= self.min_unpacked_rev:
assert id_rev == rev
# in a pre-f7 pack, revs are ordered so after REV we can stop looking
if id_rev > rev:
break
if id_rev == rev:
ids[id_noderev] = id
elif line.startswith('text: ' + str(rev) + ' '): # also 'props:' lines?
fields = line.split()
texts.append(tuple(fields[2:]))
return ids, texts
def find_good_id(repo, bad_id):
"""Return the node-rev id that is like BAD_ID but has the byte-offset
part corrected, by looking in the revision file in the repository
at REPO_DIR.
### TODO: Parsing of the rev file should skip over node-content data
when searching for a line matching "id: <id>", to avoid the
possibility of a false match.
"""
if isinstance(repo, str):
repo = Repository(repo)
noderev, rev, bad_offset = parse_id(bad_id)
ids, _ = repo.rev_file_indexes(rev)
if noderev not in ids:
raise FixError("NodeRev Id '" + noderev + "' not found in r" + rev)
return ids[noderev]
def find_good_rep_header(repo, rev, size):
"""Find a rep header that matches REV and SIZE.
Return the correct offset."""
if isinstance(repo, str):
repo = Repository(repo)
_, texts = repo.rev_file_indexes(rev)
n_matches = 0
for fields in texts:
if fields[1] == size:
offset = fields[0]
n_matches += 1
if n_matches != 1:
raise FixError("%d matches for r%s, size %s" % (n_matches, rev, size))
return offset
if __name__ == '__main__':
if len(sys.argv) == 4:
repo_dir = sys.argv[1]
rev = sys.argv[2]
size = sys.argv[3]
repo = Repository(repo_dir)
print("Good rep header offset:", find_good_rep_header(repo, rev, size))
sys.exit(0)
if len(sys.argv) != 3:
sys.stderr.write(usage + "\n")
sys.exit(1)
repo_dir = sys.argv[1]
bad_id = sys.argv[2]
repo = Repository(repo_dir)
good_id = find_good_id(repo, bad_id)
# Replacement ID must be the same length, otherwise I don't know how to
# reconstruct the file so as to preserve all offsets.
# ### TODO: This check should be in the caller rather than here.
if len(good_id) != len(bad_id):
sys.stderr.write("warning: the good ID has a different length: " + \
"bad id '" + bad_id + "', good id '" + good_id + "'\n")
print(good_id)