| # |
| # Copyright (C) 2000-2001 The ViewCVS Group. All Rights Reserved. |
| # |
| # By using this file, you agree to the terms and conditions set forth in |
| # the LICENSE.html file which can be found at the top level of the ViewCVS |
| # distribution or at http://viewcvs.sourceforge.net/license-1.html. |
| # |
| # Contact information: |
| # Greg Stein, PO Box 760, Palo Alto, CA, 94302 |
| # gstein@lyra.org, http://viewcvs.sourceforge.net/ |
| # |
| # ----------------------------------------------------------------------- |
| # |
| # This software is being maintained as part of the ViewCVS project. |
| # Information is available at: |
| # http://viewcvs.sourceforge.net/ |
| # |
| # This file was originally based on portions of the blame.py script by |
| # Curt Hagenlocher. |
| # |
| # ----------------------------------------------------------------------- |
| |
| import string |
| import time |
| |
| |
| class _TokenStream: |
| token_term = string.whitespace + ';' |
| |
| # the algorithm is about the same speed for any CHUNK_SIZE chosen. |
| # grab a good-sized chunk, but not too large to overwhelm memory. |
| CHUNK_SIZE = 100000 |
| |
| # CHUNK_SIZE = 5 # for debugging, make the function grind... |
| |
| def __init__(self, file): |
| self.rcsfile = file |
| self.idx = 0 |
| self.buf = self.rcsfile.read(self.CHUNK_SIZE) |
| if self.buf == '': |
| raise RuntimeError, 'EOF' |
| |
| def get(self): |
| "Get the next token from the RCS file." |
| |
| # Note: we can afford to loop within Python, examining individual |
| # characters. For the whitespace and tokens, the number of iterations |
| # is typically quite small. Thus, a simple iterative loop will beat |
| # out more complex solutions. |
| |
| buf = self.buf |
| idx = self.idx |
| |
| while 1: |
| if idx == len(buf): |
| buf = self.rcsfile.read(self.CHUNK_SIZE) |
| if buf == '': |
| # signal EOF by returning None as the token |
| del self.buf # so we fail if get() is called again |
| return None |
| idx = 0 |
| |
| if buf[idx] not in string.whitespace: |
| break |
| |
| idx = idx + 1 |
| |
| if buf[idx] == ';': |
| self.buf = buf |
| self.idx = idx + 1 |
| return ';' |
| |
| if buf[idx] != '@': |
| end = idx + 1 |
| token = '' |
| while 1: |
| # find token characters in the current buffer |
| while end < len(buf) and buf[end] not in self.token_term: |
| end = end + 1 |
| token = token + buf[idx:end] |
| |
| if end < len(buf): |
| # we stopped before the end, so we have a full token |
| idx = end |
| break |
| |
| # we stopped at the end of the buffer, so we may have a partial token |
| buf = self.rcsfile.read(self.CHUNK_SIZE) |
| idx = end = 0 |
| |
| self.buf = buf |
| self.idx = idx |
| return token |
| |
| # a "string" which starts with the "@" character. we'll skip it when we |
| # search for content. |
| idx = idx + 1 |
| |
| chunks = [ ] |
| |
| while 1: |
| if idx == len(buf): |
| idx = 0 |
| buf = self.rcsfile.read(self.CHUNK_SIZE) |
| if buf == '': |
| raise RuntimeError, 'EOF' |
| i = string.find(buf, '@', idx) |
| if i == -1: |
| chunks.append(buf[idx:]) |
| idx = len(buf) |
| continue |
| if i == len(buf) - 1: |
| chunks.append(buf[idx:i]) |
| idx = 0 |
| buf = '@' + self.rcsfile.read(self.CHUNK_SIZE) |
| if buf == '@': |
| raise RuntimeError, 'EOF' |
| continue |
| if buf[i + 1] == '@': |
| chunks.append(buf[idx:i+1]) |
| idx = i + 2 |
| continue |
| |
| chunks.append(buf[idx:i]) |
| |
| self.buf = buf |
| self.idx = i + 1 |
| |
| return string.join(chunks, '') |
| |
| # _get = get |
| # def get(self): |
| token = self._get() |
| print 'T:', `token` |
| return token |
| |
| def match(self, match): |
| "Try to match the next token from the input buffer." |
| |
| token = self.get() |
| if token != match: |
| raise RuntimeError, ('Unexpected parsing error in RCS file.\n' + |
| 'Expected token: %s, but saw: %s' % (match, token)) |
| |
| def unget(self, token): |
| "Put this token back, for the next get() to return." |
| |
| # Override the class' .get method with a function which clears the |
| # overridden method then returns the pushed token. Since this function |
| # will not be looked up via the class mechanism, it should be a "normal" |
| # function, meaning it won't have "self" automatically inserted. |
| # Therefore, we need to pass both self and the token thru via defaults. |
| |
| # note: we don't put this into the input buffer because it may have been |
| # @-unescaped already. |
| |
| def give_it_back(self=self, token=token): |
| del self.get |
| return token |
| |
| self.get = give_it_back |
| |
| class Parser: |
| |
| def parse_rcs_admin(self): |
| while 1: |
| # Read initial token at beginning of line |
| token = self.ts.get() |
| |
| # We're done once we reach the description of the RCS tree |
| if token[0] in string.digits: |
| self.ts.unget(token) |
| return |
| |
| if token == "head": |
| self.sink.set_head_revision(self.ts.get()) |
| self.ts.match(';') |
| elif token == "branch": |
| self.sink.set_principal_branch(self.ts.get()) |
| self.ts.match(';') |
| elif token == "symbols": |
| while 1: |
| tag = self.ts.get() |
| if tag == ';': |
| break |
| (tag_name, tag_rev) = string.split(tag, ':') |
| self.sink.define_tag(tag_name, tag_rev) |
| elif token == "comment": |
| self.sink.set_comment(self.ts.get()) |
| self.ts.match(';') |
| |
| # Ignore all these other fields - We don't care about them. Also chews |
| # up "newphrase". |
| elif token in ("locks", "strict", "expand", "access"): |
| while 1: |
| tag = self.ts.get() |
| if tag == ';': |
| break |
| else: |
| pass |
| # warn("Unexpected RCS token: $token\n") |
| |
| raise RuntimeError, "Unexpected EOF"; |
| |
| def parse_rcs_tree(self): |
| while 1: |
| revision = self.ts.get() |
| |
| # End of RCS tree description ? |
| if revision == 'desc': |
| self.ts.unget(revision) |
| return |
| |
| # Parse date |
| self.ts.match('date') |
| date = self.ts.get() |
| self.ts.match(';') |
| |
| # Convert date into timestamp |
| date_fields = string.split(date, '.') + ['0', '0', '0'] |
| date_fields = map(string.atoi, date_fields) |
| if date_fields[0] < 100: |
| date_fields[0] = date_fields[0] + 1900 |
| timestamp = time.mktime(tuple(date_fields)) |
| |
| # Parse author |
| self.ts.match('author') |
| author = self.ts.get() |
| self.ts.match(';') |
| |
| # Parse state |
| self.ts.match('state') |
| state = '' |
| while 1: |
| token = self.ts.get() |
| if token == ';': |
| break |
| state = state + token + ' ' |
| state = state[:-1] # toss the trailing space |
| |
| # Parse branches |
| self.ts.match('branches') |
| branches = [ ] |
| while 1: |
| token = self.ts.get() |
| if token == ';': |
| break |
| branches.append(token) |
| |
| # Parse revision of next delta in chain |
| self.ts.match('next') |
| next = self.ts.get() |
| if next == ';': |
| next = None |
| else: |
| self.ts.match(';') |
| |
| # there are some files with extra tags in them. for example: |
| # owner 640; |
| # group 15; |
| # permissions 644; |
| # hardlinks @configure.in@; |
| # this is "newphrase" in RCSFILE(5). we just want to skip over these. |
| while 1: |
| token = self.ts.get() |
| if token == 'desc' or token[0] in string.digits: |
| self.ts.unget(token) |
| break |
| # consume everything up to the semicolon |
| while self.ts.get() != ';': |
| pass |
| |
| self.sink.define_revision(revision, timestamp, author, state, branches, |
| next) |
| |
| def parse_rcs_description(self): |
| self.ts.match('desc') |
| self.sink.set_description(self.ts.get()) |
| |
| def parse_rcs_deltatext(self): |
| while 1: |
| revision = self.ts.get() |
| if revision is None: |
| # EOF |
| break |
| self.ts.match('log') |
| log = self.ts.get() |
| ### need to add code to chew up "newphrase" |
| self.ts.match('text') |
| text = self.ts.get() |
| self.sink.set_revision_info(revision, log, text) |
| |
| def parse(self, file, sink): |
| self.ts = _TokenStream(file) |
| self.sink = sink |
| |
| self.parse_rcs_admin() |
| self.parse_rcs_tree() |
| |
| # many sinks want to know when the tree has been completed so they can |
| # do some work to prep for the arrival of the deltatext |
| self.sink.tree_completed() |
| |
| self.parse_rcs_description() |
| self.parse_rcs_deltatext() |
| |
| # easiest for us to tell the sink it is done, rather than worry about |
| # higher level software doing it. |
| self.sink.parse_completed() |
| |
| self.ts = self.sink = None |
| |
| |
| class Sink: |
| def set_head_revision(self, revision): |
| pass |
| def set_principal_branch(self, branch_name): |
| pass |
| def define_tag(self, name, revision): |
| pass |
| def set_comment(self, comment): |
| pass |
| def set_description(self, description): |
| pass |
| def define_revision(self, revision, timestamp, author, state, |
| branches, next): |
| pass |
| def set_revision_info(self, revision, log, text): |
| pass |
| def tree_completed(self): |
| pass |
| def parse_completed(self): |
| pass |
| |
| # -------------------------------------------------------------------------- |
| # |
| # TESTING AND DEBUGGING TOOLS |
| # |
| |
| class DebugSink: |
| def set_head_revision(self, revision): |
| print 'head:', revision |
| |
| def set_principal_branch(self, branch_name): |
| print 'branch:', branch_name |
| |
| def define_tag(self, name, revision): |
| print 'tag:', name, '=', revision |
| |
| def set_comment(self, comment): |
| print 'comment:', comment |
| |
| def set_description(self, description): |
| print 'description:', description |
| |
| def define_revision(self, revision, timestamp, author, state, |
| branches, next): |
| print 'revision:', revision |
| print ' timestamp:', timestamp |
| print ' author:', author |
| print ' state:', state |
| print ' branches:', branches |
| print ' next:', next |
| |
| def set_revision_info(self, revision, log, text): |
| print 'revision:', revision |
| print ' log:', log |
| print ' text:', text[:100], '...' |
| |
| class DumpSink: |
| """Dump all the parse information directly to stdout. |
| |
| The output is relatively unformatted and untagged. It is intended as a |
| raw dump of the data in the RCS file. A copy can be saved, then changes |
| made to the parsing engine, then a comparison of the new output against |
| the old output. |
| """ |
| def __init__(self): |
| global sha |
| import sha |
| |
| def set_head_revision(self, revision): |
| print revision |
| |
| def set_principal_branch(self, branch_name): |
| print branch_name |
| |
| def define_tag(self, name, revision): |
| print name, revision |
| |
| def set_comment(self, comment): |
| print comment |
| |
| def set_description(self, description): |
| print description |
| |
| def define_revision(self, revision, timestamp, author, state, |
| branches, next): |
| print revision, timestamp, author, state, branches, next |
| |
| def set_revision_info(self, revision, log, text): |
| print revision, sha.new(log).hexdigest(), sha.new(text).hexdigest() |
| |
| def tree_completed(self): |
| print 'tree_completed' |
| |
| def parse_completed(self): |
| print 'parse_completed' |
| |
| def dump_file(fname): |
| Parser().parse(open(fname), DumpSink()) |
| |
| def time_file(fname): |
| import time |
| p = Parser().parse |
| f = open(fname) |
| s = Sink() |
| t = time.time() |
| p(f, s) |
| t = time.time() - t |
| print t |
| |
| def _usage(): |
| print 'This is normally a module for importing, but it has a couple' |
| print 'features for testing as an executable script.' |
| print 'USAGE: %s COMMAND filename,v' % sys.argv[0] |
| print ' where COMMAND is one of:' |
| print ' dump: filename is "dumped" to stdout' |
| print ' time: filename is parsed with the time written to stdout' |
| sys.exit(1) |
| |
| if __name__ == '__main__': |
| import sys |
| if len(sys.argv) != 3: |
| usage() |
| if sys.argv[1] == 'dump': |
| dump_file(sys.argv[2]) |
| elif sys.argv[1] == 'time': |
| time_file(sys.argv[2]) |
| else: |
| usage() |