contrib/server-side/fsfsfixer/fixer/find_good_id.py - subversion - Git at Google

 #!/usr/bin/env python

 usage = """
 Usage:
   $0 REPO-DIR FSFS-ID-WITH-BAD-OFFSET
     -- Find the correct FSFS node-rev id, given one that is correct except for
        its byte-offset part.
   $0 REPO-DIR REV SIZE
     -- Find a rep header that matches REV and SIZE.
 Example:
   Result of running 'svnadmin verify':
     svnadmin: Corrupt node-revision '5-12302.1-12953.r12953/29475'
   Invocation of this script:
     $ $0 svn-repo 5-12302.1-12953.r12953/29475
   Output of this script:
     5-12302.1-12953.r12953/29255
 """

 import os, sys
 from fixer_config import *

 class FixError(Exception):
   """An exception for any kind of inablility to repair the repository."""
   pass

 def parse_id(id):
   """Return the (NODEREV, REV, OFFSET) of ID, where ID is of the form
      "NODE_ID.COPY_ID.rREV/OFFSET" and NODEREV is "NODE_ID.COPY_ID.rREV".
   """
   node_id, copy_id, txn_id = id.split('.')
   rev, offset = txn_id[1:].split('/')
   noderev = node_id + '.' + copy_id + '.r' + rev
   return noderev, rev, offset

 class Repository:
   """Encapsulates key information of a repository:
      its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV.
      """

   def _read_repo_file(self, filename):
     """Read and return all lines from FILENAME in REPO.
     """

     f = open(os.path.join(self.path, filename), "rb")
     lines = f.readlines()
     f.close()
     return lines

   def _read_config(self, filename):
     """ Read and return all lines from FILENAME.
         This will be used to read 'format', 'current' etc. . """

     if filename not in self.db_config:
       f = open(os.path.join(self.path, 'db', filename), "rb")
       self.db_config[filename] = f.readlines()
       f.close()

     return self.db_config[filename]

   def __init__(self, path_or_parent, name=None):
     """Constructor collecting everything we need to know about
        the repository at path PATH_OR_PARENT or NAME within PARENT folder.
     """

     if name is None:
       self.name = os.path.basename(path_or_parent)
       self.path = path_or_parent
     else:
       self.name = name
       self.path = os.path.join(path_or_parent, name)

     self.db_config = {}
     self.repo_format = int(self._read_repo_file('format')[0])
     self.fs_type = self._read_config('fs-type')[0].rstrip()
     self.db_format = int(self._read_config('format')[0])
     try:
       self.shard_size = int(self._read_config('format')[1].split(' ')[2])
     except IndexError:
       self.shard_size = 0
     if self.db_format >= 4:
       self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])
     else:
       self.min_unpacked_rev = 0
     self.head = int(self._read_config('current')[0])

   def rev_file_path(self, rev):
     """Return the path to the revision file in the repository at REPO_DIR
        (a path string) for revision number REV (int or string).
        """
     if isinstance(rev, str):
       rev = int(rev)
     if self.shard_size > 0:
       shard = int(rev) / self.shard_size
       if rev < self.min_unpacked_rev:
           path = os.path.join(self.path, 'db', 'revs', str(shard) + '.pack', 'pack')
       else:
           path = os.path.join(self.path, 'db', 'revs', str(shard), str(rev))
     else:
       path = os.path.join(self.path, 'db', 'revs', str(rev))
     return path

   def rev_file_indexes(self, rev):
     """Return (ids, texts), where IDS is a dictionary of all node-rev ids
        defined in revision REV of the repo at REPO_DIR, in the form
        {noderev: full_id}, and TEXTS is an array of
        (offset, size, expanded-size, csum [,sha1-csum, uniquifier]) tuples
        taken from all the "text: REV ..." representation lines
        in revision REV.

        Here, NODEREV is the node-revision id minus the /offset part, and
        FULL_ID is the full node-revision id (including the /offset part).
        """
     if isinstance(rev, str):
       rev = int(rev)
     ids = {}
     texts = []
     for line in open(self.rev_file_path(rev)):
       if line.startswith('id: '):
         id = line.replace('id: ', '').rstrip()
         id_noderev, id_rev, _ = parse_id(id)
         id_rev = int(id_rev)
         # all ids in an unpacked rev file should match its rev number
         if rev >= self.min_unpacked_rev:
           assert id_rev == rev
         # in a pre-f7 pack, revs are ordered so after REV we can stop looking
         if id_rev > rev:
           break
         if id_rev == rev:
           ids[id_noderev] = id
       elif line.startswith('text: ' + str(rev) + ' '):  # also 'props:' lines?
         fields = line.split()
         texts.append(tuple(fields[2:]))
     return ids, texts

 def find_good_id(repo, bad_id):
   """Return the node-rev id that is like BAD_ID but has the byte-offset
      part corrected, by looking in the revision file in the repository
      at REPO_DIR.

      ### TODO: Parsing of the rev file should skip over node-content data
          when searching for a line matching "id: <id>", to avoid the
          possibility of a false match.
   """

   if isinstance(repo, str):
     repo = Repository(repo)
   noderev, rev, bad_offset = parse_id(bad_id)
   ids, _ = repo.rev_file_indexes(rev)

   if noderev not in ids:
     raise FixError("NodeRev Id '" + noderev + "' not found in r" + rev)
   return ids[noderev]

 def find_good_rep_header(repo, rev, size):
   """Find a rep header that matches REV and SIZE.
      Return the correct offset."""
   if isinstance(repo, str):
     repo = Repository(repo)
   _, texts = repo.rev_file_indexes(rev)
   n_matches = 0
   for fields in texts:
     if fields[1] == size:
       offset = fields[0]
       n_matches += 1
   if n_matches != 1:
     raise FixError("%d matches for r%s, size %s" % (n_matches, rev, size))
   return offset


 if __name__ == '__main__':

   if len(sys.argv) == 4:
     repo_dir = sys.argv[1]
     rev = sys.argv[2]
     size = sys.argv[3]
     repo = Repository(repo_dir)
     print("Good rep header offset:", find_good_rep_header(repo, rev, size))
     sys.exit(0)

   if len(sys.argv) != 3:
     sys.stderr.write(usage + "\n")
     sys.exit(1)

   repo_dir = sys.argv[1]
   bad_id = sys.argv[2]

   repo = Repository(repo_dir)
   good_id = find_good_id(repo, bad_id)

   # Replacement ID must be the same length, otherwise I don't know how to
   # reconstruct the file so as to preserve all offsets.
   # ### TODO: This check should be in the caller rather than here.
   if len(good_id) != len(bad_id):
     sys.stderr.write("warning: the good ID has a different length: " + \
                      "bad id '" + bad_id + "', good id '" + good_id + "'\n")

   print(good_id)
	#!/usr/bin/env python

	usage = """
	Usage:
	$0 REPO-DIR FSFS-ID-WITH-BAD-OFFSET
	-- Find the correct FSFS node-rev id, given one that is correct except for
	its byte-offset part.
	$0 REPO-DIR REV SIZE
	-- Find a rep header that matches REV and SIZE.
	Example:
	Result of running 'svnadmin verify':
	svnadmin: Corrupt node-revision '5-12302.1-12953.r12953/29475'
	Invocation of this script:
	$ $0 svn-repo 5-12302.1-12953.r12953/29475
	Output of this script:
	5-12302.1-12953.r12953/29255
	"""

	import os, sys
	from fixer_config import *

	class FixError(Exception):
	"""An exception for any kind of inablility to repair the repository."""
	pass

	def parse_id(id):
	"""Return the (NODEREV, REV, OFFSET) of ID, where ID is of the form
	"NODE_ID.COPY_ID.rREV/OFFSET" and NODEREV is "NODE_ID.COPY_ID.rREV".
	"""
	node_id, copy_id, txn_id = id.split('.')
	rev, offset = txn_id[1:].split('/')
	noderev = node_id + '.' + copy_id + '.r' + rev
	return noderev, rev, offset

	class Repository:
	"""Encapsulates key information of a repository:
	its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV.
	"""

	def _read_repo_file(self, filename):
	"""Read and return all lines from FILENAME in REPO.
	"""

	f = open(os.path.join(self.path, filename), "rb")
	lines = f.readlines()
	f.close()
	return lines

	def _read_config(self, filename):
	""" Read and return all lines from FILENAME.
	This will be used to read 'format', 'current' etc. . """

	if filename not in self.db_config:
	f = open(os.path.join(self.path, 'db', filename), "rb")
	self.db_config[filename] = f.readlines()
	f.close()

	return self.db_config[filename]

	def __init__(self, path_or_parent, name=None):
	"""Constructor collecting everything we need to know about
	the repository at path PATH_OR_PARENT or NAME within PARENT folder.
	"""

	if name is None:
	self.name = os.path.basename(path_or_parent)
	self.path = path_or_parent
	else:
	self.name = name
	self.path = os.path.join(path_or_parent, name)

	self.db_config = {}
	self.repo_format = int(self._read_repo_file('format')[0])
	self.fs_type = self._read_config('fs-type')[0].rstrip()
	self.db_format = int(self._read_config('format')[0])
	try:
	self.shard_size = int(self._read_config('format')[1].split(' ')[2])
	except IndexError:
	self.shard_size = 0
	if self.db_format >= 4:
	self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])
	else:
	self.min_unpacked_rev = 0
	self.head = int(self._read_config('current')[0])

	def rev_file_path(self, rev):
	"""Return the path to the revision file in the repository at REPO_DIR
	(a path string) for revision number REV (int or string).
	"""
	if isinstance(rev, str):
	rev = int(rev)
	if self.shard_size > 0:
	shard = int(rev) / self.shard_size
	if rev < self.min_unpacked_rev:
	path = os.path.join(self.path, 'db', 'revs', str(shard) + '.pack', 'pack')
	else:
	path = os.path.join(self.path, 'db', 'revs', str(shard), str(rev))
	else:
	path = os.path.join(self.path, 'db', 'revs', str(rev))
	return path

	def rev_file_indexes(self, rev):
	"""Return (ids, texts), where IDS is a dictionary of all node-rev ids
	defined in revision REV of the repo at REPO_DIR, in the form
	{noderev: full_id}, and TEXTS is an array of
	(offset, size, expanded-size, csum [,sha1-csum, uniquifier]) tuples
	taken from all the "text: REV ..." representation lines
	in revision REV.

	Here, NODEREV is the node-revision id minus the /offset part, and
	FULL_ID is the full node-revision id (including the /offset part).
	"""
	if isinstance(rev, str):
	rev = int(rev)
	ids = {}
	texts = []
	for line in open(self.rev_file_path(rev)):
	if line.startswith('id: '):
	id = line.replace('id: ', '').rstrip()
	id_noderev, id_rev, _ = parse_id(id)
	id_rev = int(id_rev)
	# all ids in an unpacked rev file should match its rev number
	if rev >= self.min_unpacked_rev:
	assert id_rev == rev
	# in a pre-f7 pack, revs are ordered so after REV we can stop looking
	if id_rev > rev:
	break
	if id_rev == rev:
	ids[id_noderev] = id
	elif line.startswith('text: ' + str(rev) + ' '): # also 'props:' lines?
	fields = line.split()
	texts.append(tuple(fields[2:]))
	return ids, texts

	def find_good_id(repo, bad_id):
	"""Return the node-rev id that is like BAD_ID but has the byte-offset
	part corrected, by looking in the revision file in the repository
	at REPO_DIR.

	### TODO: Parsing of the rev file should skip over node-content data
	when searching for a line matching "id: <id>", to avoid the
	possibility of a false match.
	"""

	if isinstance(repo, str):
	repo = Repository(repo)
	noderev, rev, bad_offset = parse_id(bad_id)
	ids, _ = repo.rev_file_indexes(rev)

	if noderev not in ids:
	raise FixError("NodeRev Id '" + noderev + "' not found in r" + rev)
	return ids[noderev]

	def find_good_rep_header(repo, rev, size):
	"""Find a rep header that matches REV and SIZE.
	Return the correct offset."""
	if isinstance(repo, str):
	repo = Repository(repo)
	_, texts = repo.rev_file_indexes(rev)
	n_matches = 0
	for fields in texts:
	if fields[1] == size:
	offset = fields[0]
	n_matches += 1
	if n_matches != 1:
	raise FixError("%d matches for r%s, size %s" % (n_matches, rev, size))
	return offset


	if __name__ == '__main__':

	if len(sys.argv) == 4:
	repo_dir = sys.argv[1]
	rev = sys.argv[2]
	size = sys.argv[3]
	repo = Repository(repo_dir)
	print("Good rep header offset:", find_good_rep_header(repo, rev, size))
	sys.exit(0)

	if len(sys.argv) != 3:
	sys.stderr.write(usage + "\n")
	sys.exit(1)

	repo_dir = sys.argv[1]
	bad_id = sys.argv[2]

	repo = Repository(repo_dir)
	good_id = find_good_id(repo, bad_id)

	# Replacement ID must be the same length, otherwise I don't know how to
	# reconstruct the file so as to preserve all offsets.
	# ### TODO: This check should be in the caller rather than here.
	if len(good_id) != len(bad_id):
	sys.stderr.write("warning: the good ID has a different length: " + \
	"bad id '" + bad_id + "', good id '" + good_id + "'\n")

	print(good_id)