packages/rpm/mandrake-9.0/rcsparse.py - subversion - Git at Google

 #
 # Copyright (C) 2000-2001 The ViewCVS Group. All Rights Reserved.
 #
 # By using this file, you agree to the terms and conditions set forth in
 # the LICENSE.html file which can be found at the top level of the ViewCVS
 # distribution or at http://viewcvs.sourceforge.net/license-1.html.
 #
 # Contact information:
 #   Greg Stein, PO Box 760, Palo Alto, CA, 94302
 #   gstein@lyra.org, http://viewcvs.sourceforge.net/
 #
 # -----------------------------------------------------------------------
 #
 # This software is being maintained as part of the ViewCVS project.
 # Information is available at:
 #    http://viewcvs.sourceforge.net/
 #
 # This file was originally based on portions of the blame.py script by
 # Curt Hagenlocher.
 #
 # -----------------------------------------------------------------------

 import string
 import time


 class _TokenStream:
   """Tokenizer for the text of an RCS file.

   get() yields three kinds of tokens: the ';' separator, bare words
   (ended by whitespace or ';'), and the contents of "@"-delimited strings
   with each doubled "@@" collapsed to a single "@". Input is pulled from
   the file CHUNK_SIZE characters at a time, so every scanning loop has to
   cope with a token that straddles two chunks.
   """

   token_term = string.whitespace + ';'

   # the algorithm is about the same speed for any CHUNK_SIZE chosen.
   # grab a good-sized chunk, but not too large to overwhelm memory.
   CHUNK_SIZE  = 100000

   # CHUNK_SIZE  = 5	# for debugging, make the function grind...

   def __init__(self, file):
     """Start tokenizing FILE, any object with a read(size) method.

     Raises RuntimeError('EOF') when the file is empty.
     """
     self.rcsfile = file
     self.idx = 0
     self.buf = self.rcsfile.read(self.CHUNK_SIZE)
     if self.buf == '':
       raise RuntimeError('EOF')

   def get(self):
     """Get the next token from the RCS file.

     Returns ';', a bare word, or the unescaped contents of an "@" string.
     Returns None at end of file; the buffer is deleted at that point, so
     a further call fails. Raises RuntimeError('EOF') when the file ends
     inside an unterminated "@" string.
     """

     # Note: we can afford to loop within Python, examining individual
     # characters. For the whitespace and tokens, the number of iterations
     # is typically quite small. Thus, a simple iterative loop will beat
     # out more complex solutions.

     buf = self.buf
     idx = self.idx

     # skip leading whitespace, refilling the buffer as needed
     while 1:
       if idx == len(buf):
         buf = self.rcsfile.read(self.CHUNK_SIZE)
         if buf == '':
           # signal EOF by returning None as the token
           del self.buf	# so we fail if get() is called again
           return None
         idx = 0

       if buf[idx] not in string.whitespace:
         break

       idx = idx + 1

     if buf[idx] == ';':
       self.buf = buf
       self.idx = idx + 1
       return ';'

     if buf[idx] != '@':
       end = idx + 1
       token = ''
       while 1:
         # find token characters in the current buffer
         while end < len(buf) and buf[end] not in self.token_term:
           end = end + 1
         token = token + buf[idx:end]

         if end < len(buf):
           # we stopped before the end, so we have a full token
           idx = end
           break

         # we stopped at the end of the buffer, so we may have a partial token
         buf = self.rcsfile.read(self.CHUNK_SIZE)
         idx = end = 0
         if buf == '':
           # end of file terminates the token; without this check the loop
           # would keep reading an exhausted file forever
           break

       self.buf = buf
       self.idx = idx
       return token

     # a "string" which starts with the "@" character. we'll skip it when we
     # search for content.
     idx = idx + 1

     chunks = [ ]

     while 1:
       if idx == len(buf):
         idx = 0
         buf = self.rcsfile.read(self.CHUNK_SIZE)
         if buf == '':
           raise RuntimeError('EOF')
       i = buf.find('@', idx)
       if i == -1:
         # no "@" in the rest of this chunk: it is all string content
         chunks.append(buf[idx:])
         idx = len(buf)
         continue
       if i == len(buf) - 1:
         # the "@" is the last character of the chunk, so we cannot yet tell
         # a closing "@" from the first half of an "@@" escape
         chunks.append(buf[idx:i])
         more = self.rcsfile.read(self.CHUNK_SIZE)
         if more == '':
           # nothing follows: the "@" closed the string at end of file
           self.buf = ''
           self.idx = 0
           return ''.join(chunks)
         # re-examine the "@" together with the text that follows it
         idx = 0
         buf = '@' + more
         continue
       if buf[i + 1] == '@':
         # "@@" escape: keep one "@" and continue after the pair
         chunks.append(buf[idx:i+1])
         idx = i + 2
         continue

       # a lone "@" closes the string
       chunks.append(buf[idx:i])

       self.buf = buf
       self.idx = i + 1

       return ''.join(chunks)

   # Token tracing for debugging (enable by uncommenting all five lines):
   #  _get = get
   #  def get(self):
   #    token = self._get()
   #    print 'T:', `token`
   #    return token

   def match(self, match):
     """Try to match the next token from the input buffer.

     Raises RuntimeError when the next token differs from MATCH.
     """

     token = self.get()
     if token != match:
       raise RuntimeError('Unexpected parsing error in RCS file.\n' +
                          'Expected token: %s, but saw: %s' % (match, token))

   def unget(self, token):
     "Put this token back, for the next get() to return."

     # Override the class' .get method with a function which clears the
     # overridden method then returns the pushed token. Since this function
     # will not be looked up via the class mechanism, it should be a "normal"
     # function, meaning it won't have "self" automatically inserted.
     # Therefore, we need to pass both self and the token thru via defaults.

     # note: we don't put this into the input buffer because it may have been
     # @-unescaped already.

     def give_it_back(self=self, token=token):
       del self.get
       return token

     self.get = give_it_back

 class Parser:
   """Parse an RCS file and report what is found to a sink object.

   parse() runs the phases in file order (admin section, delta tree,
   description, deltatext) and calls the matching sink methods as each
   item is recognized. Malformed or truncated input raises RuntimeError.
   """

   def _words_to_semicolon(self):
     "Return the tokens before the next ';' as a list; the ';' is consumed."
     words = [ ]
     while 1:
       token = self.ts.get()
       if token is None:
         raise RuntimeError("Unexpected EOF")
       if token == ';':
         return words
       words.append(token)

   def parse_rcs_admin(self):
     """Parse the admin section: head, branch, symbols, comment.

     Stops at the first token that starts with a digit (the first delta)
     or is 'desc' (a file without deltas); that token is pushed back.
     Raises RuntimeError("Unexpected EOF") if the file ends first.
     """
     while 1:
       # Read initial token at beginning of line
       token = self.ts.get()
       if token is None:
         raise RuntimeError("Unexpected EOF")

       # We're done once we reach the description of the RCS tree
       if token == 'desc' or (token and token[0] in string.digits):
         self.ts.unget(token)
         return

       if token == "head":
         self.sink.set_head_revision(self.ts.get())
         self.ts.match(';')
       elif token == "branch":
         self.sink.set_principal_branch(self.ts.get())
         self.ts.match(';')
       elif token == "symbols":
         # each entry has the form NAME:REVISION
         for tag in self._words_to_semicolon():
           (tag_name, tag_rev) = tag.split(':')
           self.sink.define_tag(tag_name, tag_rev)
       elif token == "comment":
         self.sink.set_comment(self.ts.get())
         self.ts.match(';')

       # Ignore all other fields ("locks", "strict", "expand", "access" and
       # any "newphrase") - We don't care about them. Their values must be
       # skipped as a unit; otherwise a value that starts with a digit would
       # be mistaken for the first revision number.
       elif token != ';':
         self._words_to_semicolon()

   def parse_rcs_tree(self):
     """Parse the delta entries, calling sink.define_revision() for each.

     Stops at 'desc', which is pushed back for parse_rcs_description().
     """
     while 1:
       revision = self.ts.get()
       if revision is None:
         raise RuntimeError("Unexpected EOF")

       # End of RCS tree description ?
       if revision == 'desc':
         self.ts.unget(revision)
         return

       # Parse date
       self.ts.match('date')
       date = self.ts.get()
       self.ts.match(';')

       # Convert date into timestamp. The three appended zeros fill the
       # weekday, yearday and DST slots of the 9-item time tuple.
       date_fields = [int(field) for field in date.split('.') + ['0', '0', '0']]
       if date_fields[0] < 100:
         date_fields[0] = date_fields[0] + 1900
       # NOTE(review): mktime() interprets the fields as local time; if RCS
       # stores dates in UTC this is off by the local offset -- confirm
       # before changing, since sinks receive this value.
       timestamp = time.mktime(tuple(date_fields))

       # Parse author
       self.ts.match('author')
       author = self.ts.get()
       self.ts.match(';')

       # Parse state (possibly several words, joined by single spaces)
       self.ts.match('state')
       state = ' '.join(self._words_to_semicolon())

       # Parse branches
       self.ts.match('branches')
       branches = self._words_to_semicolon()

       # Parse revision of next delta in chain
       self.ts.match('next')
       next_rev = self.ts.get()
       if next_rev == ';':
         next_rev = None
       else:
         self.ts.match(';')

       # there are some files with extra tags in them. for example:
       #    owner	640;
       #    group	15;
       #    permissions	644;
       #    hardlinks	@configure.in@;
       # this is "newphrase" in RCSFILE(5). we just want to skip over these.
       while 1:
         token = self.ts.get()
         if token is None:
           raise RuntimeError("Unexpected EOF")
         if token == 'desc' or (token and token[0] in string.digits):
           self.ts.unget(token)
           break
         # consume everything up to the semicolon
         self._words_to_semicolon()

       self.sink.define_revision(revision, timestamp, author, state, branches,
                                 next_rev)

   def parse_rcs_description(self):
     "Parse 'desc' and pass the description string to the sink."
     self.ts.match('desc')
     self.sink.set_description(self.ts.get())

   def parse_rcs_deltatext(self):
     """Parse the log/text entries until end of file.

     Calls sink.set_revision_info(revision, log, text) for each entry.
     """
     while 1:
       revision = self.ts.get()
       if revision is None:
         # EOF
         break
       self.ts.match('log')
       log = self.ts.get()

       # chew up any "newphrase" entries between the log and the text
       while 1:
         token = self.ts.get()
         if token is None:
           raise RuntimeError("Unexpected EOF")
         if token == 'text':
           break
         if token != ';':
           self._words_to_semicolon()

       text = self.ts.get()
       if text is None:
         raise RuntimeError("Unexpected EOF")
       self.sink.set_revision_info(revision, log, text)

   def parse(self, file, sink):
     """Parse the RCS data read from FILE, reporting everything to SINK.

     FILE needs a read(size) method. The references to the token stream
     and the sink are dropped when parsing ends, whether or not it failed.
     """
     self.ts = _TokenStream(file)
     self.sink = sink

     try:
       self.parse_rcs_admin()
       self.parse_rcs_tree()

       # many sinks want to know when the tree has been completed so they can
       # do some work to prep for the arrival of the deltatext
       self.sink.tree_completed()

       self.parse_rcs_description()
       self.parse_rcs_deltatext()

       # easiest for us to tell the sink it is done, rather than worry about
       # higher level software doing it.
       self.sink.parse_completed()
     finally:
       self.ts = self.sink = None


 class Sink:
   """Receiver of parse events whose methods all do nothing.

   Parser calls these methods while it reads an RCS file. Every method
   here ignores its arguments and returns None.
   """

   def set_head_revision(self, revision):
     "Called with the value of the 'head' field."

   def set_principal_branch(self, branch_name):
     "Called with the value of the 'branch' field."

   def define_tag(self, name, revision):
     "Called once per 'symbols' entry with its name and revision."

   def set_comment(self, comment):
     "Called with the value of the 'comment' field."

   def set_description(self, description):
     "Called with the string that follows 'desc'."

   def define_revision(self, revision, timestamp, author, state,
                       branches, next):
     "Called once per delta with the fields parsed from the tree section."

   def set_revision_info(self, revision, log, text):
     "Called once per deltatext entry with its log message and text."

   def tree_completed(self):
     "Called after the last delta, before the description is parsed."

   def parse_completed(self):
     "Called after the last deltatext entry has been reported."

 # --------------------------------------------------------------------------
 #
 # TESTING AND DEBUGGING TOOLS
 #

 class DebugSink:
   """Sink that prints each parse event to stdout with a label.

   Each print call takes a single pre-formatted string so that the same
   source runs under both the print statement and the print function.
   """

   def set_head_revision(self, revision):
     print('head: %s' % (revision,))

   def set_principal_branch(self, branch_name):
     print('branch: %s' % (branch_name,))

   def define_tag(self, name, revision):
     print('tag: %s = %s' % (name, revision))

   def set_comment(self, comment):
     print('comment: %s' % (comment,))

   def set_description(self, description):
     print('description: %s' % (description,))

   def define_revision(self, revision, timestamp, author, state,
                       branches, next):
     print('revision: %s' % (revision,))
     print('    timestamp: %s' % (timestamp,))
     print('    author: %s' % (author,))
     print('    state: %s' % (state,))
     print('    branches: %s' % (branches,))
     print('    next: %s' % (next,))

   def set_revision_info(self, revision, log, text):
     print('revision: %s' % (revision,))
     print('    log: %s' % (log,))
     # only the first 100 characters of the text are shown
     print('    text: %s ...' % (text[:100],))

   def tree_completed(self):
     "Nothing to print; defined because Parser.parse() calls it on every sink."

   def parse_completed(self):
     "Nothing to print; defined because Parser.parse() calls it on every sink."

 class DumpSink:
   """Dump all the parse information directly to stdout.

   The output is relatively unformatted and untagged. It is intended as a
   raw dump of the data in the RCS file. A copy can be saved, then changes
   made to the parsing engine, then a comparison of the new output against
   the old output.

   Log messages and revision texts are printed as SHA-1 hex digests.
   """
   def __init__(self):
     # The hash module is imported only when a DumpSink is created.
     # hashlib is preferred; the old "sha" module is the fallback for
     # interpreters that predate hashlib.
     try:
       import hashlib
       self._sha1 = hashlib.sha1
     except ImportError:
       import sha
       self._sha1 = sha.new

   def _hexdigest(self, data):
     "Return the SHA-1 hex digest of DATA."
     try:
       return self._sha1(data).hexdigest()
     except TypeError:
       # text strings must be encoded before they can be hashed
       return self._sha1(data.encode('utf-8')).hexdigest()

   def set_head_revision(self, revision):
     print(revision)

   def set_principal_branch(self, branch_name):
     print(branch_name)

   def define_tag(self, name, revision):
     print('%s %s' % (name, revision))

   def set_comment(self, comment):
     print(comment)

   def set_description(self, description):
     print(description)

   def define_revision(self, revision, timestamp, author, state,
                       branches, next):
     print('%s %s %s %s %s %s' % (revision, timestamp, author, state,
                                  branches, next))

   def set_revision_info(self, revision, log, text):
     print('%s %s %s' % (revision, self._hexdigest(log),
                         self._hexdigest(text)))

   def tree_completed(self):
     print('tree_completed')

   def parse_completed(self):
     print('parse_completed')

 def dump_file(fname):
   "Parse the RCS file FNAME and dump everything found to stdout."
   f = open(fname)
   try:
     Parser().parse(f, DumpSink())
   finally:
     # close the file even when parsing fails
     f.close()

 def time_file(fname):
   "Parse the RCS file FNAME with a do-nothing sink and print the seconds taken."
   import time
   f = open(fname)
   try:
     parse = Parser().parse
     sink = Sink()
     # only the parse itself is timed, not the open or the setup
     start = time.time()
     parse(f, sink)
     elapsed = time.time() - start
   finally:
     # close the file even when parsing fails
     f.close()
   print(elapsed)

 def _usage():
   "Print the command-line help to stdout and exit with status 1."
   # imported here because the module only imports sys when run as a script
   import sys
   print('This is normally a module for importing, but it has a couple')
   print('features for testing as an executable script.')
   print('USAGE: %s COMMAND filename,v' % sys.argv[0])
   print('  where COMMAND is one of:')
   print('    dump: filename is "dumped" to stdout')
   print('    time: filename is parsed with the time written to stdout')
   sys.exit(1)

 # Script entry point: "dump FILE" or "time FILE"; anything else prints usage.
 if __name__ == '__main__':
   import sys
   if len(sys.argv) != 3:
     # the helper is named _usage; calling "usage" raised NameError
     _usage()
   if sys.argv[1] == 'dump':
     dump_file(sys.argv[2])
   elif sys.argv[1] == 'time':
     time_file(sys.argv[2])
   else:
     _usage()
	#
	# Copyright (C) 2000-2001 The ViewCVS Group. All Rights Reserved.
	#
	# By using this file, you agree to the terms and conditions set forth in
	# the LICENSE.html file which can be found at the top level of the ViewCVS
	# distribution or at http://viewcvs.sourceforge.net/license-1.html.
	#
	# Contact information:
	# Greg Stein, PO Box 760, Palo Alto, CA, 94302
	# gstein@lyra.org, http://viewcvs.sourceforge.net/
	#
	# -----------------------------------------------------------------------
	#
	# This software is being maintained as part of the ViewCVS project.
	# Information is available at:
	# http://viewcvs.sourceforge.net/
	#
	# This file was originally based on portions of the blame.py script by
	# Curt Hagenlocher.
	#
	# -----------------------------------------------------------------------

	import string
	import time


	class _TokenStream:
	  """Tokenizer for the text of an RCS file.

	  get() yields three kinds of tokens: the ';' separator, bare words
	  (ended by whitespace or ';'), and the contents of "@"-delimited strings
	  with each doubled "@@" collapsed to a single "@". Input is pulled from
	  the file CHUNK_SIZE characters at a time, so every scanning loop has to
	  cope with a token that straddles two chunks.
	  """

	  token_term = string.whitespace + ';'

	  # the algorithm is about the same speed for any CHUNK_SIZE chosen.
	  # grab a good-sized chunk, but not too large to overwhelm memory.
	  CHUNK_SIZE = 100000

	  # CHUNK_SIZE = 5 # for debugging, make the function grind...

	  def __init__(self, file):
	    """Start tokenizing FILE, any object with a read(size) method.

	    Raises RuntimeError('EOF') when the file is empty.
	    """
	    self.rcsfile = file
	    self.idx = 0
	    self.buf = self.rcsfile.read(self.CHUNK_SIZE)
	    if self.buf == '':
	      raise RuntimeError('EOF')

	  def get(self):
	    """Get the next token from the RCS file.

	    Returns ';', a bare word, or the unescaped contents of an "@" string.
	    Returns None at end of file; the buffer is deleted at that point, so
	    a further call fails. Raises RuntimeError('EOF') when the file ends
	    inside an unterminated "@" string.
	    """

	    # Note: we can afford to loop within Python, examining individual
	    # characters. For the whitespace and tokens, the number of iterations
	    # is typically quite small. Thus, a simple iterative loop will beat
	    # out more complex solutions.

	    buf = self.buf
	    idx = self.idx

	    # skip leading whitespace, refilling the buffer as needed
	    while 1:
	      if idx == len(buf):
	        buf = self.rcsfile.read(self.CHUNK_SIZE)
	        if buf == '':
	          # signal EOF by returning None as the token
	          del self.buf # so we fail if get() is called again
	          return None
	        idx = 0

	      if buf[idx] not in string.whitespace:
	        break

	      idx = idx + 1

	    if buf[idx] == ';':
	      self.buf = buf
	      self.idx = idx + 1
	      return ';'

	    if buf[idx] != '@':
	      end = idx + 1
	      token = ''
	      while 1:
	        # find token characters in the current buffer
	        while end < len(buf) and buf[end] not in self.token_term:
	          end = end + 1
	        token = token + buf[idx:end]

	        if end < len(buf):
	          # we stopped before the end, so we have a full token
	          idx = end
	          break

	        # we stopped at the end of the buffer, so we may have a partial token
	        buf = self.rcsfile.read(self.CHUNK_SIZE)
	        idx = end = 0
	        if buf == '':
	          # end of file terminates the token; without this check the loop
	          # would keep reading an exhausted file forever
	          break

	      self.buf = buf
	      self.idx = idx
	      return token

	    # a "string" which starts with the "@" character. we'll skip it when we
	    # search for content.
	    idx = idx + 1

	    chunks = [ ]

	    while 1:
	      if idx == len(buf):
	        idx = 0
	        buf = self.rcsfile.read(self.CHUNK_SIZE)
	        if buf == '':
	          raise RuntimeError('EOF')
	      i = buf.find('@', idx)
	      if i == -1:
	        # no "@" in the rest of this chunk: it is all string content
	        chunks.append(buf[idx:])
	        idx = len(buf)
	        continue
	      if i == len(buf) - 1:
	        # the "@" is the last character of the chunk, so we cannot yet tell
	        # a closing "@" from the first half of an "@@" escape
	        chunks.append(buf[idx:i])
	        more = self.rcsfile.read(self.CHUNK_SIZE)
	        if more == '':
	          # nothing follows: the "@" closed the string at end of file
	          self.buf = ''
	          self.idx = 0
	          return ''.join(chunks)
	        # re-examine the "@" together with the text that follows it
	        idx = 0
	        buf = '@' + more
	        continue
	      if buf[i + 1] == '@':
	        # "@@" escape: keep one "@" and continue after the pair
	        chunks.append(buf[idx:i+1])
	        idx = i + 2
	        continue

	      # a lone "@" closes the string
	      chunks.append(buf[idx:i])

	      self.buf = buf
	      self.idx = i + 1

	      return ''.join(chunks)

	  # Token tracing for debugging (enable by uncommenting all five lines):
	  #  _get = get
	  #  def get(self):
	  #    token = self._get()
	  #    print 'T:', `token`
	  #    return token

	  def match(self, match):
	    """Try to match the next token from the input buffer.

	    Raises RuntimeError when the next token differs from MATCH.
	    """

	    token = self.get()
	    if token != match:
	      raise RuntimeError('Unexpected parsing error in RCS file.\n' +
	                         'Expected token: %s, but saw: %s' % (match, token))

	  def unget(self, token):
	    "Put this token back, for the next get() to return."

	    # Override the class' .get method with a function which clears the
	    # overridden method then returns the pushed token. Since this function
	    # will not be looked up via the class mechanism, it should be a "normal"
	    # function, meaning it won't have "self" automatically inserted.
	    # Therefore, we need to pass both self and the token thru via defaults.

	    # note: we don't put this into the input buffer because it may have been
	    # @-unescaped already.

	    def give_it_back(self=self, token=token):
	      del self.get
	      return token

	    self.get = give_it_back

	class Parser:
	  """Parse an RCS file and report what is found to a sink object.

	  parse() runs the phases in file order (admin section, delta tree,
	  description, deltatext) and calls the matching sink methods as each
	  item is recognized. Malformed or truncated input raises RuntimeError.
	  """

	  def _words_to_semicolon(self):
	    "Return the tokens before the next ';' as a list; the ';' is consumed."
	    words = [ ]
	    while 1:
	      token = self.ts.get()
	      if token is None:
	        raise RuntimeError("Unexpected EOF")
	      if token == ';':
	        return words
	      words.append(token)

	  def parse_rcs_admin(self):
	    """Parse the admin section: head, branch, symbols, comment.

	    Stops at the first token that starts with a digit (the first delta)
	    or is 'desc' (a file without deltas); that token is pushed back.
	    Raises RuntimeError("Unexpected EOF") if the file ends first.
	    """
	    while 1:
	      # Read initial token at beginning of line
	      token = self.ts.get()
	      if token is None:
	        raise RuntimeError("Unexpected EOF")

	      # We're done once we reach the description of the RCS tree
	      if token == 'desc' or (token and token[0] in string.digits):
	        self.ts.unget(token)
	        return

	      if token == "head":
	        self.sink.set_head_revision(self.ts.get())
	        self.ts.match(';')
	      elif token == "branch":
	        self.sink.set_principal_branch(self.ts.get())
	        self.ts.match(';')
	      elif token == "symbols":
	        # each entry has the form NAME:REVISION
	        for tag in self._words_to_semicolon():
	          (tag_name, tag_rev) = tag.split(':')
	          self.sink.define_tag(tag_name, tag_rev)
	      elif token == "comment":
	        self.sink.set_comment(self.ts.get())
	        self.ts.match(';')

	      # Ignore all other fields ("locks", "strict", "expand", "access" and
	      # any "newphrase") - We don't care about them. Their values must be
	      # skipped as a unit; otherwise a value that starts with a digit would
	      # be mistaken for the first revision number.
	      elif token != ';':
	        self._words_to_semicolon()

	  def parse_rcs_tree(self):
	    """Parse the delta entries, calling sink.define_revision() for each.

	    Stops at 'desc', which is pushed back for parse_rcs_description().
	    """
	    while 1:
	      revision = self.ts.get()
	      if revision is None:
	        raise RuntimeError("Unexpected EOF")

	      # End of RCS tree description ?
	      if revision == 'desc':
	        self.ts.unget(revision)
	        return

	      # Parse date
	      self.ts.match('date')
	      date = self.ts.get()
	      self.ts.match(';')

	      # Convert date into timestamp. The three appended zeros fill the
	      # weekday, yearday and DST slots of the 9-item time tuple.
	      date_fields = [int(field) for field in date.split('.') + ['0', '0', '0']]
	      if date_fields[0] < 100:
	        date_fields[0] = date_fields[0] + 1900
	      # NOTE(review): mktime() interprets the fields as local time; if RCS
	      # stores dates in UTC this is off by the local offset -- confirm
	      # before changing, since sinks receive this value.
	      timestamp = time.mktime(tuple(date_fields))

	      # Parse author
	      self.ts.match('author')
	      author = self.ts.get()
	      self.ts.match(';')

	      # Parse state (possibly several words, joined by single spaces)
	      self.ts.match('state')
	      state = ' '.join(self._words_to_semicolon())

	      # Parse branches
	      self.ts.match('branches')
	      branches = self._words_to_semicolon()

	      # Parse revision of next delta in chain
	      self.ts.match('next')
	      next_rev = self.ts.get()
	      if next_rev == ';':
	        next_rev = None
	      else:
	        self.ts.match(';')

	      # there are some files with extra tags in them. for example:
	      #    owner 640;
	      #    group 15;
	      #    permissions 644;
	      #    hardlinks @configure.in@;
	      # this is "newphrase" in RCSFILE(5). we just want to skip over these.
	      while 1:
	        token = self.ts.get()
	        if token is None:
	          raise RuntimeError("Unexpected EOF")
	        if token == 'desc' or (token and token[0] in string.digits):
	          self.ts.unget(token)
	          break
	        # consume everything up to the semicolon
	        self._words_to_semicolon()

	      self.sink.define_revision(revision, timestamp, author, state, branches,
	                                next_rev)

	  def parse_rcs_description(self):
	    "Parse 'desc' and pass the description string to the sink."
	    self.ts.match('desc')
	    self.sink.set_description(self.ts.get())

	  def parse_rcs_deltatext(self):
	    """Parse the log/text entries until end of file.

	    Calls sink.set_revision_info(revision, log, text) for each entry.
	    """
	    while 1:
	      revision = self.ts.get()
	      if revision is None:
	        # EOF
	        break
	      self.ts.match('log')
	      log = self.ts.get()

	      # chew up any "newphrase" entries between the log and the text
	      while 1:
	        token = self.ts.get()
	        if token is None:
	          raise RuntimeError("Unexpected EOF")
	        if token == 'text':
	          break
	        if token != ';':
	          self._words_to_semicolon()

	      text = self.ts.get()
	      if text is None:
	        raise RuntimeError("Unexpected EOF")
	      self.sink.set_revision_info(revision, log, text)

	  def parse(self, file, sink):
	    """Parse the RCS data read from FILE, reporting everything to SINK.

	    FILE needs a read(size) method. The references to the token stream
	    and the sink are dropped when parsing ends, whether or not it failed.
	    """
	    self.ts = _TokenStream(file)
	    self.sink = sink

	    try:
	      self.parse_rcs_admin()
	      self.parse_rcs_tree()

	      # many sinks want to know when the tree has been completed so they can
	      # do some work to prep for the arrival of the deltatext
	      self.sink.tree_completed()

	      self.parse_rcs_description()
	      self.parse_rcs_deltatext()

	      # easiest for us to tell the sink it is done, rather than worry about
	      # higher level software doing it.
	      self.sink.parse_completed()
	    finally:
	      self.ts = self.sink = None


	class Sink:
	  """Receiver of parse events whose methods all do nothing.

	  Parser calls these methods while it reads an RCS file. Every method
	  here ignores its arguments and returns None.
	  """

	  def set_head_revision(self, revision):
	    "Called with the value of the 'head' field."

	  def set_principal_branch(self, branch_name):
	    "Called with the value of the 'branch' field."

	  def define_tag(self, name, revision):
	    "Called once per 'symbols' entry with its name and revision."

	  def set_comment(self, comment):
	    "Called with the value of the 'comment' field."

	  def set_description(self, description):
	    "Called with the string that follows 'desc'."

	  def define_revision(self, revision, timestamp, author, state,
	                      branches, next):
	    "Called once per delta with the fields parsed from the tree section."

	  def set_revision_info(self, revision, log, text):
	    "Called once per deltatext entry with its log message and text."

	  def tree_completed(self):
	    "Called after the last delta, before the description is parsed."

	  def parse_completed(self):
	    "Called after the last deltatext entry has been reported."

	# --------------------------------------------------------------------------
	#
	# TESTING AND DEBUGGING TOOLS
	#

	class DebugSink:
	  """Sink that prints each parse event to stdout with a label.

	  Each print call takes a single pre-formatted string so that the same
	  source runs under both the print statement and the print function.
	  """

	  def set_head_revision(self, revision):
	    print('head: %s' % (revision,))

	  def set_principal_branch(self, branch_name):
	    print('branch: %s' % (branch_name,))

	  def define_tag(self, name, revision):
	    print('tag: %s = %s' % (name, revision))

	  def set_comment(self, comment):
	    print('comment: %s' % (comment,))

	  def set_description(self, description):
	    print('description: %s' % (description,))

	  def define_revision(self, revision, timestamp, author, state,
	                      branches, next):
	    print('revision: %s' % (revision,))
	    print('    timestamp: %s' % (timestamp,))
	    print('    author: %s' % (author,))
	    print('    state: %s' % (state,))
	    print('    branches: %s' % (branches,))
	    print('    next: %s' % (next,))

	  def set_revision_info(self, revision, log, text):
	    print('revision: %s' % (revision,))
	    print('    log: %s' % (log,))
	    # only the first 100 characters of the text are shown
	    print('    text: %s ...' % (text[:100],))

	  def tree_completed(self):
	    "Nothing to print; defined because Parser.parse() calls it on every sink."

	  def parse_completed(self):
	    "Nothing to print; defined because Parser.parse() calls it on every sink."

	class DumpSink:
	  """Dump all the parse information directly to stdout.

	  The output is relatively unformatted and untagged. It is intended as a
	  raw dump of the data in the RCS file. A copy can be saved, then changes
	  made to the parsing engine, then a comparison of the new output against
	  the old output.

	  Log messages and revision texts are printed as SHA-1 hex digests.
	  """
	  def __init__(self):
	    # The hash module is imported only when a DumpSink is created.
	    # hashlib is preferred; the old "sha" module is the fallback for
	    # interpreters that predate hashlib.
	    try:
	      import hashlib
	      self._sha1 = hashlib.sha1
	    except ImportError:
	      import sha
	      self._sha1 = sha.new

	  def _hexdigest(self, data):
	    "Return the SHA-1 hex digest of DATA."
	    try:
	      return self._sha1(data).hexdigest()
	    except TypeError:
	      # text strings must be encoded before they can be hashed
	      return self._sha1(data.encode('utf-8')).hexdigest()

	  def set_head_revision(self, revision):
	    print(revision)

	  def set_principal_branch(self, branch_name):
	    print(branch_name)

	  def define_tag(self, name, revision):
	    print('%s %s' % (name, revision))

	  def set_comment(self, comment):
	    print(comment)

	  def set_description(self, description):
	    print(description)

	  def define_revision(self, revision, timestamp, author, state,
	                      branches, next):
	    print('%s %s %s %s %s %s' % (revision, timestamp, author, state,
	                                 branches, next))

	  def set_revision_info(self, revision, log, text):
	    print('%s %s %s' % (revision, self._hexdigest(log),
	                        self._hexdigest(text)))

	  def tree_completed(self):
	    print('tree_completed')

	  def parse_completed(self):
	    print('parse_completed')

	def dump_file(fname):
	  "Parse the RCS file FNAME and dump everything found to stdout."
	  f = open(fname)
	  try:
	    Parser().parse(f, DumpSink())
	  finally:
	    # close the file even when parsing fails
	    f.close()

	def time_file(fname):
	  "Parse the RCS file FNAME with a do-nothing sink and print the seconds taken."
	  import time
	  f = open(fname)
	  try:
	    parse = Parser().parse
	    sink = Sink()
	    # only the parse itself is timed, not the open or the setup
	    start = time.time()
	    parse(f, sink)
	    elapsed = time.time() - start
	  finally:
	    # close the file even when parsing fails
	    f.close()
	  print(elapsed)

	def _usage():
	  "Print the command-line help to stdout and exit with status 1."
	  # imported here because the module only imports sys when run as a script
	  import sys
	  print('This is normally a module for importing, but it has a couple')
	  print('features for testing as an executable script.')
	  print('USAGE: %s COMMAND filename,v' % sys.argv[0])
	  print('  where COMMAND is one of:')
	  print('    dump: filename is "dumped" to stdout')
	  print('    time: filename is parsed with the time written to stdout')
	  sys.exit(1)

	# Script entry point: "dump FILE" or "time FILE"; anything else prints usage.
	if __name__ == '__main__':
	  import sys
	  if len(sys.argv) != 3:
	    # the helper is named _usage; calling "usage" raised NameError
	    _usage()
	  if sys.argv[1] == 'dump':
	    dump_file(sys.argv[2])
	  elif sys.argv[1] == 'time':
	    time_file(sys.argv[2])
	  else:
	    _usage()