core/sqf/sql/scripts/analyzeMessageGuide.py - trafodion - Git at Google

 # @@@ START COPYRIGHT @@@
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 # @@@ END COPYRIGHT @@@
 #
 #  This script checks the consistency of the Messages Guide
 #  against the code. It looks for messages whose text differs
 #  between the Messages Guide and code. It looks for messages
 #  that are no longer in use. It looks for messages that are
 #  in use but not in the Messages Guide. It produces a report
 #  of these.
 #
 #  The report includes detailed information for each message
 #  that differs between the Messages Guide and code. It lists
 #  the enum used for the message if there is one. It lists
 #  the code modules that may have a reference to the message
 #  (it uses grep to determine this, so there may be false
 #  positives or more rarely false negatives). It lists the
 #  regression test expected files that contain examples of the
 #  error message.
 #
 import os
 import sys
 import subprocess
 import sets
 import datetime
 import argparse  # requires Python 2.7


 # The script does the following:
 #
 # 1. Parses the Messages Guide chapters into a table containing message number
 # + message text.
 # 2. Parses the bin/SqlciErrors.txt file into a table containing message number
 # + message text. Merges that into the table from item 1.
 # 3. Parses the various *.h files with error message enums. Merges that
 # information into the table from item 1.
 # 4. Compares the message texts in items 1 and 2. The comparison has to be
 # smart; parameters appear as angle-bracketed items in the Messages Guide but
 # use $0~String0 notation in the SqlciErrors.txt text. Each message is marked
 # as matching or mismatching. If one or the other text is missing, the
 # message is marked as mismatching.
 # 5. Searches the sql *.cpp, *.h and *.y files for references to the error
 # message number and/or enum, and when found, maintains a list for each,
 # merged into the table from item 1. We only do this for mismatched messages
 # as it takes a lot of time.
 # 6. Searches the regression tests EXPECTED* files for examples of each
 # message number, and when found, maintains a list for each, attached to the
 # table constructed in 1. We only do this for mismatched messages as it takes
 # a lot of time.
 # 7. Lists error messages that mismatch between Messages Guide and
 # SqlciErrors.txt.
 # 8. Lists error messages that are missing from the Messages Guide but present
 # in SqlciErrors.txt. Mentions whether these are actually used anywhere (in
 # *.cpp files or regression tests).
 # 9. Lists error messages that are present in the Messages Guide or
 # SqlciErrors.txt but apparently are no longer used (that is, don't appear in
 # *.cpp files or regression tests).
 #
 # The table resulting from items 1 through 6 has the following shape:
 # a. Message number (key)
 # b. Messages Guide text (if any)
 # c. SqlciErrors.txt message text (if any)
 # d. Enum (if any)
 # e. Enum *.h file name (if any)
 # f. List of *.cpp files with references to message number
 # g. List of *.cpp files with references to enum name
 # h. List of regression tests with messages number examples
 # i. Boolean indicating whether SqlciErrors.txt text matches Messages Guide
 #
 # Item 1 is implemented by a function that parses a single *.adoc file
 # producing table entries. It is driven by a loop over the set of *.adoc file
 # names. The function updates the table directly.
 # Item 2 is implemented by a function that parses a *.txt file, driven by a
 # caller supplying the bin/SqlciErrors.txt name. The function updates the table
 # directly.
 # Item 3 is implemented by a function that parses a *.h file looking for error
 # message enums. It is driven by a loop over a hard-coded set of known *.h
 # files, perhaps with the name of the enum passed in. The function updates the
 # table directly.
 # Item 4 is implemented by a function that crawls the table, comparing the
 # retrieved texts. The function updates the table directly.
 # Item 5 is implemented by a function that does a grep of the *.cpp, *.h and
 # *.y files, then processes the results. The function updates the table
 # directly.
 # Item 6 is implemented by a function that does a grep of the EXPECTED* files,
 # then processes the results. The function updates the table directly.
 # Items 7, 8, 9 are implemented by a function that reads the table and reports
 # results.
 #
 # We model the table as a Python object and implement these functions as
 # methods on that object.

 class MessagesTable:
     """Table of SQL messages, keyed by message number.

     self.dict is a dictionary of dictionaries. The key of the top level
     dictionary is the message number as a decimal string (for example
     '1002'). Each value is essentially a relation, modeled by a Python
     dictionary whose keys are attribute names. The attributes set by the
     methods of this class are:

     'messageGuideText'      message text found in the Messages Guide
     'errorMessageFileText'  message text found in the messages file
     'enumSymbol'            enum symbol defined for the message number
     'enumFile'              name of the file defining that enum symbol
     'listOfCodeReferences'  set of source files referencing the message
     'listOfTestReferences'  list of expected files showing the message
     'textsMatch'            True if the two message texts agree

     The print calls use the single-argument parenthesized form, which
     behaves identically as a Python 2 print statement and as a Python 3
     function call.
     """

     def __init__(self):
         """Create an empty table."""
         self.dict = {}

     def mergeEntry(self,key,values):
         """Merge the attributes in values into the table entry for key.

         If key is not yet in the table, values itself becomes the entry.
         Otherwise attributes already present are overwritten by those in
         values and other attributes are left alone.
         """
         if key in self.dict:
             self.dict[key].update(values)
         else:
             self.dict[key] = values

     def parseAdoc(self,adocFileName):
         """Read message numbers and texts from one Messages Guide chapter.

         Message entries look like this:

             [[SQL-1002]]
             == SQL 1002

             ```
             Catalog <catalog> does not exist.
             ```

             Where <catalog> is the ANSI name of the target catalog.

         Only the message number and the text between the backtick lines
         are of interest. Text spanning several lines is joined with single
         spaces. Each message found is merged into the table under the
         attribute 'messageGuideText'.

         If the file cannot be opened or read, a message is printed and the
         method returns normally.
         """
         # A tiny state machine decides what to look for next:
         #   0  looking for a '== SQL <number>' heading
         #   1  heading seen, looking for the opening backticks
         #   2  opening backticks seen, next line starts the text
         #   3  capturing text until the closing backticks
         try:
             # 'with' closes the file even if reading fails part way through
             with open(adocFileName) as f:
                 state = 0
                 messageNumberStr = None
                 messageText = None
                 for line in f:
                     # drop the line terminator (also a stray carriage
                     # return, so CRLF files compare correctly)
                     line = line.rstrip('\r\n')
                     if line.startswith('== SQL '):
                         words = line.split()
                         if len(words) == 3:
                             try:
                                 messageNumber = int(words[2])
                             except ValueError:
                                 messageNumber = 0
                             if messageNumber > 0:
                                 messageNumberStr = words[2]
                                 state = 1
                     elif state == 1:
                         if line == '```':
                             state = 2
                     elif state == 2:
                         messageText = line
                         state = 3
                     elif state == 3:
                         if line == '```':
                             state = 0
                             self.mergeEntry(messageNumberStr,
                                             { 'messageGuideText': messageText })
                         else:
                             messageText = ' '.join([messageText,line])

         except IOError as detail:
             print("Could not open " + adocFileName)
             print(detail)


     def parseMessagesFile(self,messagesFileName):
         """Read message numbers and texts from the messages file.

         A line is used only if it has at least seven whitespace-separated
         fields and the first field is a positive integer. The message text
         is fields seven onward, re-joined with single spaces. Texts that
         start with '--' or '***' are treated as unused and skipped. Each
         remaining message is merged into the table under the attribute
         'errorMessageFileText'.

         If the file cannot be opened or read, a message is printed and the
         method returns normally.
         """
         try:
             with open(messagesFileName) as f:
                 for line in f:
                     words = line.split()
                     if len(words) < 7:
                         continue
                     try:
                         messageNumber = int(words[0])
                     except ValueError:
                         messageNumber = 0
                     if messageNumber <= 0:
                         continue
                     # filter out "unused" messages
                     if words[6].startswith('--') or words[6].startswith('***'):
                         continue
                     self.mergeEntry(words[0],
                                     { 'errorMessageFileText': ' '.join(words[6:]) })

         except IOError as detail:
             print("Could not open " + messagesFileName)
             print(detail)


     def Cscreener(self,line):
         """Return line with any C or C++ comments removed.

         A complete C comment is replaced by a single space so that the
         tokens on either side are not glued together. A C++ comment, or a
         C comment that is not closed on this line, removes the rest of the
         line including its line terminator.

         This is not precise: C strings are not recognized, and no state is
         kept between calls, so the later lines of a multi-line C comment
         are returned unchanged.
         """
         # commentState values:
         #   0  ordinary text
         #   1  seen '/', might start a comment
         #   2  in C comment, look for '*'
         #   3  in C comment after '*', look for '/' ending the comment
         #   4  in C++ comment, ignore rest of line
         commentState = 0
         pieces = []
         for c in line:
             if commentState == 0:
                 if c == '/':
                     commentState = 1
                 else:
                     pieces.append(c)
             elif commentState == 1:
                 if c == '/':
                     commentState = 4
                 elif c == '*':
                     commentState = 2
                 else:
                     pieces.append('/' + c)  # false alarm
                     commentState = 0
             elif commentState == 2:
                 if c == '*':
                     commentState = 3
             elif commentState == 3:
                 if c == '/':
                     pieces.append(' ')  # so we don't glue two tokens together
                     commentState = 0
                 elif c != '*':
                     commentState = 2
         # a '/' that was the very last character did not start a comment
         if commentState == 1:
             pieces.append('/')
         return ''.join(pieces)


     def Ctokenize(self,line,screenOutComments):
         """Break a line of text into a list of C-like tokens.

         A token is either a maximal run of alphanumeric characters and
         underscores, or a single other non-whitespace character.
         Whitespace only separates tokens. If screenOutComments is true,
         comments are first removed with Cscreener. Not precise, just good
         enough for our purposes.
         """
         if screenOutComments:
             line = self.Cscreener(line)  # remove C, C++ comments
         result = []
         currentToken = ''
         for c in line:
             if c.isalnum() or c == '_':
                 currentToken = currentToken + c
                 continue
             if len(currentToken) > 0:
                 result.append(currentToken)
                 currentToken = ''
             if not c.isspace():
                 result.append(c)
         # The text may end in the middle of a token, for example when a
         # C++ comment directly follows a number and Cscreener has removed
         # the comment together with the line terminator.
         if len(currentToken) > 0:
             result.append(currentToken)
         return result


     def parseEnumFile(self,enumFileName,enumName):
         """Read the symbols and values of one enum from a C/C++ file.

         The format we expect is like this:

             enum <enumname> { <symbol> = <value>,
                               <symbol> = <value>,
                               ...
                               <symbol> = <value> } ;

         This can flow freely across lines and may contain comments, so the
         file is tokenized and parsed with a state machine. For each symbol
         given an explicit decimal value greater than zero, the attributes
         'enumSymbol' and 'enumFile' are merged into the table under that
         value. A minus sign before the digits is skipped, so the key is
         the digits without the sign. Symbols without an explicit decimal
         value are not recorded. Reading stops at the closing brace of the
         enum.

         If the file cannot be opened or read, a message is printed and the
         method returns normally.
         """
         # state values:
         #   0  looking for the keyword 'enum'
         #   1  'enum' seen, next token must be enumName
         #   2  enumName seen, next token must be '{'
         #   3  inside the braces, looking for a symbol
         #   4  symbol seen, looking for '='
         #   5  '=' seen, looking for the value
         #   6  value seen, looking for what follows it
         state = 0
         symbol = None
         try:
             with open(enumFileName) as f:
                 for line in f:
                     for token in self.Ctokenize(line,True):  # screen out comments
                         if state == 0:
                             if token == 'enum':
                                 state = 1
                         elif state == 1:
                             if token == enumName:
                                 state = 2
                             else:
                                 state = 0
                         elif state == 2:
                             if token == '{':
                                 state = 3
                             else:
                                 state = 0
                         elif state == 3:
                             if len(token) > 1:
                                 state = 4
                                 symbol = token
                             elif token == '}':
                                 return  # end of the enum, ignore rest of file
                         elif state == 4:
                             if token == '=':
                                 state = 5
                             elif token == '}':
                                 return  # end of the enum, ignore rest of file
                             else:
                                 state = 3
                         elif state == 5:
                             if token.isdigit():
                                 if int(token) > 0:  # ignore enums for 0
                                     self.mergeEntry(token,
                                         { 'enumSymbol': symbol, 'enumFile': enumFileName })
                                 state = 6
                             elif token != '-':  # skip unary minus sign before digits
                                 state = 3
                         elif state == 6:
                             if token == '}':
                                 return  # end of the enum, ignore rest of file
                             state = 3  # probably a comma

         except IOError as detail:
             print("Could not open " + enumFileName)
             print(detail)


     def _grepLines(self,options,fileNames):
         """Run grep with the given options over fileNames.

         options and fileNames are lists of strings and are passed to grep
         as separate arguments without going through a shell, so nothing in
         them is subject to shell expansion. Returns the lines grep wrote
         to standard output, without line terminators. grep's exit status
         is ignored. Raises OSError if grep cannot be started.
         """
         p1 = subprocess.Popen(['grep'] + options + fileNames,
                               stdout=subprocess.PIPE)
         output = p1.communicate()[0]  # also waits for grep to finish
         if not isinstance(output,str):
             # Python 3 returns bytes; source files may not be valid UTF-8
             output = output.decode('utf-8','replace')
         # split on newline only, so that other control characters such as
         # form feeds inside a matched line do not break it apart
         return [ text for text in output.split('\n') if len(text) > 0 ]


     def analyzeCodeReferences(self,directory):
         """Find source files that reference each mismatched message.

         Searches the *.cpp, *.h and *.y files in the immediate
         subdirectories of directory. For every table entry whose
         'textsMatch' attribute is not true, grep selects the lines that
         contain the message number or, if there is one, the enum symbol.
         A file counts as a reference only if one of those also appears as
         a complete token on the line. The set of matching file names,
         relative to directory, is merged into the entry under the
         attribute 'listOfCodeReferences'.

         Nothing is done if there are no files to search.
         """
         import glob  # local import; glob is not imported at file level

         fileNames = []
         for extension in ('cpp','h','y'):
             fileNames.extend(glob.glob(directory + '/*/*.' + extension))
         if len(fileNames) == 0:
             return  # with no file arguments grep would read standard input
         fileNames.sort()

         for key in self.dict:
             tableEntry = self.dict[key]
             if tableEntry.get('textsMatch',False):
                 continue
             wanted = [ key ]
             if 'enumSymbol' in tableEntry:
                 wanted.append(tableEntry['enumSymbol'])
             # -F with one -e per string selects lines containing either
             # string; the token test below weeds out partial matches
             options = [ '-H', '-F' ]
             for text in wanted:
                 options.extend([ '-e', text ])

             valueSet = set()
             for line in self._grepLines(options,fileNames):
                 fileName, colon, text = line.partition(':')
                 if len(colon) == 0:
                     continue  # not of the form <file name>:<text>
                 tokens1 = self.Ctokenize(text,False)  # don't screen out comments
                 if any(token in wanted for token in tokens1):
                     # remove directory part of the name
                     valueSet.add(fileName[len(directory):].strip('/'))

             self.mergeEntry(key,{ 'listOfCodeReferences': valueSet })


     def analyzeTestReferences(self,directory):
         """Find expected files that show each mismatched message.

         Searches the EXPECTED* files in the immediate subdirectories of
         directory. For every table entry whose 'textsMatch' attribute is
         not true, the files containing the text ERROR[<number>] or
         WARNING[<number>] are listed, relative to directory, and the list
         (possibly empty) is merged into the entry under the attribute
         'listOfTestReferences'.
         """
         import glob  # local import; glob is not imported at file level

         fileNames = sorted(glob.glob(directory + '/*/EXPECTED*'))
         for key in self.dict:
             if self.dict[key].get('textsMatch',False):
                 continue
             valueList = []
             if len(fileNames) > 0:  # with no files grep would read standard input
                 options = [ '-l', '-F',
                             '-e', 'ERROR[' + key + ']',
                             '-e', 'WARNING[' + key + ']' ]
                 for fileName in self._grepLines(options,fileNames):
                     # remove directory part of the name
                     valueList.append(fileName[len(directory):].strip('/'))

             self.mergeEntry(key,{ 'listOfTestReferences': valueList })


     def removeAngleBracketTerms(self,line):
         """Return line with each "<stuff>" replaced by '.elided.'.

         An empty pair "<>" is kept as it is. A '<' that is never closed by
         a '>' is kept together with the text after it. Trailing white
         space is removed from the result.
         """
         # state values:
         #   0  ordinary text
         #   1  just seen '<'
         #   2  inside "<stuff", looking for '>'
         result = ''
         throwAway = ''
         state = 0
         for c in line:
             if state == 0:
                 if c == '<':
                     state = 1
                     throwAway = c
                 else:
                     result = result + c
             elif state == 1:
                 if c == '>':
                     result = result + throwAway + c
                     state = 0
                 else:
                     state = 2
                     throwAway = throwAway + c
             else:
                 if c == '>':
                     state = 0
                     result = result + '.elided.'
                 else:
                     throwAway = throwAway + c

         # if we reached the end of the line after a '<', put the
         # throwaway text back in
         if state != 0:
             result = result + throwAway
         return result.rstrip() # ignore trailing spaces

     def removeDollarTerms(self,line):
         """Return line with each parameter term replaced by '.elided.'.

         A parameter term has the form $0~Datatype0: a dollar sign, one
         digit, a tilde, one or more letters and optionally one more digit
         (Datatype might be String, Int, TableName etc.). Text that only
         starts like a parameter term is kept. Trailing white space is
         removed from the result.
         """
         # state values:
         #   0  ordinary text
         #   1  seen '$'
         #   2  seen '$' and a digit
         #   3  seen '$', a digit and '~'
         #   4  in the letters of the data type
         result = ''
         throwAway = ''
         state = 0
         for c in line:
             if state == 0:
                 if c == '$':
                     state = 1
                     throwAway = c
                 else:
                     result = result + c
             elif state == 4:
                 if c.isalpha():
                     throwAway = throwAway + c
                 else:
                     # we reached the end of the dollar text; a digit is
                     # part of it, anything else is ordinary text
                     state = 0
                     result = result + '.elided.'
                     if not c.isdigit():
                         result = result + c
             else:
                 if state == 1:
                     expected = c.isdigit()
                 elif state == 2:
                     expected = (c == '~')
                 else:
                     expected = c.isalpha()
                 if expected:
                     state = state + 1
                     throwAway = throwAway + c
                 else:
                     # not a parameter term after all
                     result = result + throwAway + c
                     throwAway = ''
                     state = 0

         if state == 4:
             # the line ended inside the data type name, which completes
             # the parameter term
             result = result + '.elided.'
         elif state > 0:
             # the line ended in an incomplete term; put the throwaway
             # text back in
             result = result + throwAway
         return result.rstrip() # ignore trailing spaces


     def compareText(self):
         """Set the 'textsMatch' attribute of every table entry.

         The attribute is True if the entry has both a Messages Guide text
         and a messages file text and they are equal, either literally or
         after angle-bracket terms in the former and dollar terms in the
         latter have been elided. Otherwise it is False.
         """
         for key in self.dict:
             attributes = self.dict[key]
             comparison = False
             if 'messageGuideText' in attributes and 'errorMessageFileText' in attributes:
                 guideText = attributes['messageGuideText']
                 fileText = attributes['errorMessageFileText']
                 if guideText == fileText:
                     comparison = True
                 elif self.removeAngleBracketTerms(guideText) == self.removeDollarTerms(fileText):
                     comparison = True
             self.mergeEntry(key,{ 'textsMatch': comparison })


     # Iterating through a Python dictionary gets keys out in hash order
     # which isn't useful to humans. This helper function gets the keys,
     # and places them in a list in numeric order
     def sortedNumericKeys(self,dictionary):
         """Return the keys of dictionary as a sorted list of integers."""
         return sorted(int(key) for key in dictionary)


     def reportResults(self,withCodeRefs):
         """Print a report of the mismatched messages and a summary line.

         Messages are reported in numeric order. For each entry whose
         'textsMatch' attribute is not true, the report shows the texts
         that exist, the enum symbol if any, the code references (only if
         withCodeRefs is true) and the test references.
         """
         matchedCount = 0
         mismatchedCount = 0
         # sort the key strings themselves by numeric value, so the lookup
         # works even for a key that int() and str() would not round-trip
         for key in sorted(self.dict,key=int):
             value = self.dict[key]
             if value.get('textsMatch',False):
                 matchedCount = matchedCount + 1
                 continue

             mismatchedCount = mismatchedCount + 1
             hasGuideText = 'messageGuideText' in value
             hasFileText = 'errorMessageFileText' in value
             if hasGuideText and hasFileText:
                 print("Message " + key + " differs between code and Messages Guide:")
                 print("SqlciErrors.txt:    " + value['errorMessageFileText'])
                 print("Message Guide text: " + value['messageGuideText'])
             elif hasGuideText:
                 print("Message " + key + " appears in the Messages Guide but not the SqlciErrors.txt file:")
                 print("Message Guide text: " + value['messageGuideText'])
             elif hasFileText:
                 print("Message " + key + " appears in the SqlciErrors.txt file but not the Messages Guide:")
                 print("SqlciErrors.txt:    " + value['errorMessageFileText'])
             else:
                 # must exist only in an enum
                 print("Message " + key + " does not appear in either the SqlciErrors.txt file nor the Messages Guide.")

             if 'enumSymbol' in value:
                 print("Enum symbol: " + value['enumSymbol'] + " (file " + value['enumFile'] + ")")

             # the reference attributes are absent if the analysis was not run
             codeReferences = value.get('listOfCodeReferences',set())
             testReferences = value.get('listOfTestReferences',[])

             if withCodeRefs:
                 text = "This message "
                 if 'enumSymbol' in value:
                     text = text + "(or its enum symbol) "
                 if len(codeReferences) == 0:
                     text = text + "does not seem to be referenced in C++ code."
                 else:
                     text = text + "has possible references in " + str(len(codeReferences)) + " files:"
                 print(text)

                 # the references are a set; sort them for a stable report
                 for codeReference in sorted(codeReferences):
                     print("  " + codeReference)

             if len(testReferences) == 0:
                 print("This message does not appear to be in any regress expected file.")
             else:
                 print("This message possibly appears in the following " + str(len(testReferences)) + " regress expected files:")

             for testReference in testReferences:
                 print("  " + testReference)

             print(" ")

         print("Summary: There are " + str(matchedCount) + " matching messages and " + str(mismatchedCount) + " mismatching messages.")


 # beginning of main


 # process command line arguments

 parser = argparse.ArgumentParser(
     description='This script checks the consistency of the Messages Guide and the code.')
 parser.add_argument("--codeRefs", help='Looks for code references to mismatched messages; this option is quite slow and can take up to 10 seconds per message. Today there are about 2800 mismatched messages so count on six or seven hours.', action="store_true")

 args = parser.parse_args()  # exits and prints help if args are incorrect

 exitCode = 0

 messagesTable = MessagesTable()


 def printProgress(text):
     # print a blank line, then the current date and time followed by text
     # (the single-argument print form works as a Python 2 statement and as
     # a Python 3 function call)
     print("")
     print(datetime.datetime.now().ctime() + ": " + text)


 # check that $TRAF_HOME is set
 mySQroot = os.getenv('TRAF_HOME')
 if not mySQroot:
     print("$TRAF_HOME is not defined. Exiting.")
     # sys.exit rather than exit: the latter is only provided by the site
     # module and is meant for interactive use
     sys.exit(1)

 # parse the Messages Guide files (all *.adoc files below the chapters
 # directory, visited in sorted order so that runs are repeatable)
 printProgress("reading Messages Guide")
 messagesGuideChaptersDirectory = mySQroot + '/../../docs/messages_guide/src/asciidoc/_chapters'
 for subdir, dirs, files in os.walk(messagesGuideChaptersDirectory):
     dirs.sort()
     for chapterName in sorted(files):
         if chapterName.endswith(".adoc"):
             messagesTable.parseAdoc(os.path.join(subdir,chapterName))

 # parse the SqlciErrors.txt file
 printProgress("reading SqlciErrors.txt")
 eTextFileName = mySQroot + '/../sql/bin/SqlciErrors.txt'
 messagesTable.parseMessagesFile(eTextFileName)

 # parse the enum files; each entry is a header file name (relative to the
 # sql directory) and the name of the error enum defined in it
 printProgress("reading enum files")
 enumFileList = ( [ ['ustat/hs_const.h','USTAT_ERROR_CODES'],
     ['arkcmp/CmpErrors.h','ArkcmpErrorCode'],
     ['sqlcomp/CmpDDLCatErrorCodes.h','CatErrorCode'],
     ['optimizer/opt_error.h','OptimizerSQLErrorCode'],
     ['optimizer/UdrErrors.h','UDRErrors'],
     ['exp/ExpErrorEnums.h','ExeErrorCode'],
     ['sort/SortError.h','SortErrorEnum'],
     ['udrserv/udrdefs.h','UdrErrorEnum'] ] )
 for enumFile, enumName in enumFileList:
     messagesTable.parseEnumFile(mySQroot + '/../sql/' + enumFile,enumName)

 # compare Messages Guide and code text
 printProgress("comparing Messages Guide and SqlciErrors.txt text")
 messagesTable.compareText()

 # analyze code references
 if args.codeRefs:
     printProgress("looking for code references for mismatched messages (this may take a while)")
     sqlCodeDirectory = mySQroot + '/../sql'
     messagesTable.analyzeCodeReferences(sqlCodeDirectory)

 # analyze test references
 printProgress("looking for test examples for mismatched messages (this may take a while)")
 regressDirectory = mySQroot + '/../sql/regress'
 messagesTable.analyzeTestReferences(regressDirectory)


 # report results
 printProgress("generating report")
 print("")
 messagesTable.reportResults(args.codeRefs)

 printProgress("done")

 sys.exit(exitCode)
	# @@@ START COPYRIGHT @@@
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	# @@@ END COPYRIGHT @@@
	#
	# This script checks the consistency of the Messages Guide
	# against the code. It looks for messages whose text differs
	# between the Messages Guide and code. It looks for messages
	# that are no longer in use. It looks for messages that are
	# in use but not in the Messages Guide. It produces a report
	# of these.
	#
	# The report includes detailed information for each message
	# that differs between the Messages Guide and code. It lists
	# the enum used for the message if there is one. It lists
	# the code modules that may have a reference to the message
	# (it uses grep to determine this, so there may be false
	# positives or more rarely false negatives). It lists the
	# regression test expected files that contain examples of the
	# error message.
	#
	import os
	import sys
	import subprocess
	import sets
	import datetime
	import argparse # requires Python 2.7


	# The script does the following:
	#
	# 1. Parses the Messages Guide chapters into a table containing message number
	# + message text.
	# 2. Parses the bin/SqlciErrors.txt file into a table containing message number
	# + message text. Merges that into the table from item 1.
	# 3. Parses the various *.h files with error message enums. Merges that
	# information into the table from item 1.
	# 4. Compares the message texts in items 1 and 2. The comparison has to be
	# smart; parameters appear as angle-bracketed items in the Messages Guide but
	# contain $0-String0 notation in the SqlciErrors.txt text. When there are
	# mismatches, this is noted. If one or the other text is missing, this notation
	# is marked "false".
	# 5. Searches the sql *.cpp files for references to the error message number
	# and/or enum, and when found, maintains a list for each, merged into the
	# table from item 1. We only do this for mismatched messages as it takes a
	# lot of time.
	# 6. Searches the regression tests LOG* files for examples of each message
	# number, and when found, maintains a list for each, attached to the table
	# constructed in 1. We only do this for mismatched messages as it takes a
	# lot of time.
	# 7. Lists error messages that mismatch between Messages Guide and
	# SqlciErrors.txt.
	# 8. Lists error messages that are missing from the Messages Guide but present
	# in SqlciErrors.txt. Mentions whether these are actually used anywhere (in
	# *.cpp files or regression tests).
	# 9. Lists error messages that are present in the Messages Guide or
	# SqlciErrors.txt but apparently are no longer used (that is, don't appear in
	# *.cpp files or regression tests).
	#
	# The table resulting from items 1 through 6 has the following shape:
	# a. Message number (key)
	# b. Messages Guide text (if any)
	# c. SqlciErrors.txt message text (if any)
	# d. Enum (if any)
	# e. Enum *.h file name (if any)
	# f. List of *.cpp files with references to message number
	# g. List of *.cpp files with references to enum name
	# h. List of regression tests with messages number examples
	# i. Boolean indicating whether SqlciErrors.txt text matches Messages Guide
	#
	# Item 1 is implemented by a function that parses a single *.adoc file
	# producing table entries. It is driven by a loop over the set of *.adoc file
	# names. The function updates the table directly.
	# Item 2 is implemented by a function that parses a *.txt file, driven by a
	# caller supplying the bin/SqlciErrors.txt name. The function updates the table
	# directly.
	# Item 3 is implemented by a function that parses a *.h file looking for error
	# message enums. It is driven by a loop over a hard-coded set of known *.h
	# files, perhaps with the name of the enum passed in. The function updates the
	# table directly.
	# Item 5 is implemented by a function that does a grep of the *.cpp files, then
	# processes the results. The function updates the table directly.
	# Item 6 is implemented by a function that does a grep of the EXPECTED* files,
	# then processes the results. The function updates the table directly.
	# Item 4 is implemented by a function that crawls the table, comparing the
	# retrieved texts. The function updates the table directly.
	# Items 7, 8, 9 are implemented by a function that reads the table and reports
	# results.
	#
	# We model the table as a Python object and implement these functions as
	# methods on that object.

	class MessagesTable:
	    """Table of SQL messages gathered from the Messages Guide and the code.

	    self.dict is a dictionary of dictionaries. The key of the top level
	    dictionary is the message number as a decimal string without leading
	    zeros. Each value is essentially a relation, modeled by a Python
	    dictionary whose keys are attribute names:

	        'messageGuideText'            text found in the Messages Guide
	        'errorMessageFileText'        text found in the messages file
	        'enumSymbol'                  enum symbol for the message number
	        'enumFile'                    header file that defines enumSymbol
	        'listOfCodeReferences'        set of code files with references
	        'listOfEnumSymbolReferences'  (reserved; not filled in by this class)
	        'listOfTestReferences'        list of expected files with examples
	        'textsMatch'                  True if guide and file texts agree
	    """

	    def __init__(self):
	        self.dict = {}

	    def mergeEntry(self,key,values):
	        """Add the attributes in values to the entry for key, creating the
	        entry if it does not exist yet. Existing attributes with the same
	        name are overwritten."""
	        if key in self.dict:
	            self.dict[key].update(values)
	        else:
	            self.dict[key] = values

	    def parseAdoc(self,adocFileName):
	        """Parse one Messages Guide chapter (*.adoc), recording the text of
	        each message under 'messageGuideText'. If the file cannot be read,
	        a diagnostic is printed and the table is left as it was."""
	        #
	        # Message entries look like this:
	        #
	        # [[SQL-1002]]
	        # == SQL 1002
	        #
	        # ```
	        # Catalog <catalog> does not exist.
	        # ```
	        #
	        # Where <catalog> is the ANSI name of the target catalog.
	        #
	        # Cause: The catalog does not exist.
	        #
	        # Effect: The operation fails.
	        #
	        # Recovery: Enter a valid catalog name and resubmit.
	        #
	        # The stuff we are interested in is the message number and
	        # the message text. We use a tiny state machine to figure
	        # out what lines to look for.
	        try:
	            with open(adocFileName) as f:
	                state = 0
	                messageNumberStr = None
	                messageText = None
	                for line in f:
	                    # get rid of the line terminator (including a DOS '\r')
	                    line = line.rstrip('\r\n')
	                    if line.startswith('== SQL '):
	                        words = line.split()
	                        if len(words) == 3:
	                            try:
	                                messageNumber = int(words[2])
	                            except ValueError:
	                                messageNumber = 0
	                            if messageNumber > 0:
	                                # normalize so that str(int(key)) == key
	                                # holds for every key in the table
	                                messageNumberStr = str(messageNumber)
	                                state = 1  # look for first backticks
	                    elif state == 1:
	                        if line == '```':
	                            state = 2  # start capturing text
	                    elif state == 2:
	                        if line == '```':
	                            state = 0  # empty text block; nothing to record
	                        else:
	                            messageText = line
	                            state = 3  # continue capturing until backticks
	                    elif state == 3:
	                        if line == '```':
	                            state = 0  # look for next heading
	                            values = { 'messageGuideText': messageText }
	                            self.mergeEntry(messageNumberStr,values)
	                        else:
	                            messageText = ' '.join([messageText,line])

	        except IOError as detail:
	            print("Could not open " + adocFileName)
	            print(detail)

	    def parseMessagesFile(self,messagesFileName):
	        """Parse the messages file, recording the text of each message under
	        'errorMessageFileText'. A line is used when it has at least seven
	        whitespace-separated words and the first one is a positive number;
	        the text is everything from the seventh word on. Texts starting
	        with '--' or '***' mark unused messages and are skipped."""
	        try:
	            with open(messagesFileName) as f:
	                for line in f:
	                    words = line.split()
	                    if len(words) < 7:
	                        continue
	                    try:
	                        messageNumber = int(words[0])
	                    except ValueError:
	                        continue
	                    if messageNumber <= 0:
	                        continue
	                    # filter out "unused" messages
	                    if words[6].startswith(('--','***')):
	                        continue
	                    values = { 'errorMessageFileText': ' '.join(words[6:]) }
	                    self.mergeEntry(str(messageNumber),values)

	        except IOError as detail:
	            print("Could not open " + messagesFileName)
	            print(detail)

	    def Cscreener(self,line):
	        """Return line with any C or C++ comments removed.

	        The input may be a single line or a text containing several lines:
	        a C++ comment ends at the next newline (which is kept) and a C
	        comment may span newlines. Each C comment is replaced by one space
	        so two tokens are not glued together. Not precise; we don't check
	        for C strings for example."""
	        commentState = 0
	        pieces = []
	        for c in line:
	            if commentState == 0:
	                if c == '/':
	                    commentState = 1  # seen '/'
	                else:
	                    pieces.append(c)
	            elif commentState == 1:
	                if c == '/':
	                    commentState = 4  # in C++ comment, ignore rest of line
	                elif c == '*':
	                    commentState = 2  # in C comment, look for '*'
	                else:
	                    pieces.append('/' + c)  # false alarm
	                    commentState = 0
	            elif commentState == 2:
	                if c == '*':
	                    commentState = 3  # look for '/' ending the comment
	            elif commentState == 3:
	                if c == '/':
	                    pieces.append(' ')  # so we don't glue two tokens together
	                    commentState = 0
	                elif c != '*':
	                    commentState = 2
	            else:  # commentState == 4
	                if c == '\n':
	                    pieces.append(c)
	                    commentState = 0
	        if commentState == 1:
	            pieces.append('/')  # a lone '/' at the very end is not a comment
	        return ''.join(pieces)

	    def Ctokenize(self,line,screenOutComments):
	        """Break a text into a list of C-like tokens (not precise, just
	        good enough for our purposes). Runs of letters, digits and
	        underscores form one token; every other non-space character is a
	        token by itself. If screenOutComments is true, C and C++ comments
	        are removed first."""
	        if screenOutComments:
	            line = self.Cscreener(line)  # remove C, C++ comments
	        result = []
	        currentToken = ''
	        for c in line:
	            if c.isalnum() or c == '_':
	                currentToken = currentToken + c
	                continue
	            if len(currentToken) > 0:
	                result.append(currentToken)
	                currentToken = ''
	            if not c.isspace():
	                result.append(c)
	        # don't lose a token that runs up to the end of the text
	        if len(currentToken) > 0:
	            result.append(currentToken)
	        return result

	    def parseEnumFile(self,enumFileName,enumName):
	        """Parse a C/C++ header looking for the enum named enumName, and
	        record 'enumSymbol' and 'enumFile' for each positive value found.
	        If the file cannot be read, a diagnostic is printed and the table
	        is left as it was."""
	        #
	        # We are looking for a particular enum. The format we expect
	        # is like this:
	        #
	        # enum <enumname> { <symbol> = <value>,
	        # <symbol> = <value>,
	        # ...
	        # <symbol> = <value> } ;
	        #
	        # Of course this can freely flow across lines and there may be
	        # C or C++ comments to navigate past. So, we essentially have
	        # to tokenize and use a state machine to parse. The file is
	        # tokenized as a whole so that comments spanning several lines
	        # are screened out correctly.
	        try:
	            with open(enumFileName) as f:
	                text = f.read()
	        except IOError as detail:
	            print("Could not open " + enumFileName)
	            print(detail)
	            return

	        state = 0
	        symbol = None
	        for token in self.Ctokenize(text,True):  # screen out comments
	            if state == 0:
	                if token == 'enum':
	                    state = 1
	            elif state == 1:
	                if token == enumName:
	                    state = 2
	                else:
	                    state = 0
	            elif state == 2:
	                if token == '{':
	                    state = 3
	                else:
	                    state = 0
	            elif state == 3:
	                if len(token) > 1:
	                    state = 4
	                    symbol = token
	                elif token == '}':
	                    break  # end of our enum; ignore rest of file
	            elif state == 4:
	                if token == '=':
	                    state = 5
	                elif token == '}':
	                    break  # end of our enum; ignore rest of file
	                else:
	                    state = 3
	            elif state == 5:
	                if token.isdigit():
	                    values = { 'enumSymbol': symbol, 'enumFile': enumFileName }
	                    if int(token) > 0:  # ignore enums for 0
	                        self.mergeEntry(str(int(token)),values)
	                    state = 6
	                elif token == '-':
	                    state = 5  # skip unary minus sign before digits
	                else:
	                    state = 3
	            elif state == 6:
	                if token == '}':
	                    break  # end of our enum; ignore rest of file
	                else:  # probably a comma
	                    state = 3

	    def _grepLines(self,grepArguments):
	        """Run grep with the given argument list and return the lines it
	        wrote to standard output (without line terminators)."""
	        p1 = subprocess.Popen(['grep'] + grepArguments, stdout=subprocess.PIPE, close_fds=True)
	        output = p1.communicate()[0]  # also waits for (reaps) the child
	        if not isinstance(output, str):
	            output = output.decode('utf-8', 'replace')
	        # split on '\n' only; source lines may contain form feeds and such
	        return [line for line in output.split('\n') if len(line) > 0]

	    def analyzeCodeReferences(self,directory):
	        """For each mismatched message, record under 'listOfCodeReferences'
	        the set of *.cpp, *.h and *.y files one level below directory that
	        contain the message number or its enum symbol as a token."""
	        import glob  # only needed here and in analyzeTestReferences

	        # NOTE(review): the file patterns are assumed to be one directory
	        # level below 'directory' (<directory>/*/*.cpp and so on) - confirm.
	        fileNames = []
	        for suffix in ('cpp', 'h', 'y'):
	            fileNames.extend(sorted(glob.glob(directory + '/*/*.' + suffix)))

	        for key in self.dict:
	            tableEntry = self.dict[key]
	            if tableEntry.get('textsMatch', False):
	                continue
	            enumSymbol = tableEntry.get('enumSymbol')
	            valueSet = set()
	            if len(fileNames) > 0:  # with no files grep would read stdin
	                # fixed-string search for either the number or the symbol;
	                # the token test below weeds out partial matches
	                grepArguments = ['-H', '-F', '-e', key]
	                if enumSymbol:
	                    grepArguments.extend(['-e', enumSymbol])
	                for line in self._grepLines(grepArguments + fileNames):
	                    fileName, separator, text = line.partition(':')
	                    if not separator:
	                        continue  # not of the form <file name>:<text>
	                    tokens1 = self.Ctokenize(text,False)  # don't screen out comments
	                    if key in tokens1 or (enumSymbol and enumSymbol in tokens1):
	                        # remove directory part of the name
	                        valueSet.add(fileName[len(directory):].strip('/'))

	            entry = { 'listOfCodeReferences': valueSet }
	            self.mergeEntry(key,entry)

	    def analyzeTestReferences(self,directory):
	        """For each mismatched message, record under 'listOfTestReferences'
	        the list of regression test expected files below directory that
	        contain ERROR[<number>] or WARNING[<number>]."""
	        import glob  # only needed here and in analyzeCodeReferences

	        # NOTE(review): the expected files are assumed to be named
	        # <directory>/*/EXPECTED* - confirm.
	        fileNames = sorted(glob.glob(directory + '/*/EXPECTED*'))

	        for key in self.dict:
	            if self.dict[key].get('textsMatch', False):
	                continue
	            valueList = []
	            if len(fileNames) > 0:  # with no files grep would read stdin
	                grepArguments = ['-l', '-F',
	                                 '-e', 'ERROR[' + key + ']',
	                                 '-e', 'WARNING[' + key + ']']
	                for fileName in self._grepLines(grepArguments + fileNames):
	                    # remove directory part of the name
	                    valueList.append(fileName[len(directory):].strip('/'))

	            entry = { 'listOfTestReferences': valueList }
	            self.mergeEntry(key,entry)

	    def removeAngleBracketTerms(self,line):
	        """Return line with each term of the form "<stuff>" replaced by
	        '.elided.' and trailing white space removed. An empty "<>" and a
	        '<' that is never closed are left as they are."""
	        import re
	        return re.sub(r'<[^>]+>', '.elided.', line).rstrip()

	    def removeDollarTerms(self,line):
	        """Return line with each term of the form $0~Datatype0 (where
	        Datatype might be String, Int, TableName etc. and the trailing
	        digit is optional) replaced by '.elided.' and trailing white space
	        removed."""
	        import re
	        return re.sub(r'\$[0-9]~[A-Za-z]+[0-9]?', '.elided.', line).rstrip()

	    def compareText(self):
	        """Set 'textsMatch' on every entry. The texts match when both are
	        present and are equal, either verbatim or after the parameter
	        terms (<...> in the Messages Guide, $n~Type in the messages file)
	        have been elided."""
	        for key in self.dict:
	            attributes = self.dict[key]
	            comparison = False
	            if 'messageGuideText' in attributes and 'errorMessageFileText' in attributes:
	                guideText = attributes['messageGuideText']
	                fileText = attributes['errorMessageFileText']
	                if guideText == fileText:
	                    comparison = True
	                elif self.removeAngleBracketTerms(guideText) == self.removeDollarTerms(fileText):
	                    comparison = True
	            self.mergeEntry(key,{ 'textsMatch': comparison })

	    # Iterating through a Python dictionary gets keys out in hash order
	    # which isn't useful to humans. This helper function gets the keys,
	    # and places them in a list in numeric order
	    def sortedNumericKeys(self,dictionary):
	        """Return the keys of dictionary, converted to int, in ascending
	        order."""
	        return sorted(int(key) for key in dictionary)

	    def reportResults(self,withCodeRefs):
	        """Print a report of every message whose texts do not match,
	        followed by a summary. Code references are reported only if
	        withCodeRefs is true."""
	        matchedCount = 0
	        mismatchedCount = 0
	        for keyN in self.sortedNumericKeys(self.dict):
	            key = str(keyN)
	            value = self.dict[key]
	            if value.get('textsMatch', False):
	                matchedCount = matchedCount + 1
	                continue

	            mismatchedCount = mismatchedCount + 1
	            if 'messageGuideText' in value:
	                if 'errorMessageFileText' in value:
	                    print("Message " + key + " differs between code and Messages Guide:")
	                    print("SqlciErrors.txt: " + value['errorMessageFileText'])
	                else:
	                    print("Message " + key + " appears in the Messages Guide but not the SqlciErrors.txt file:")
	                # show the guide text in both cases so that a difference
	                # can be seen side by side
	                print("Message Guide text: " + value['messageGuideText'])
	            else:
	                if 'errorMessageFileText' in value:
	                    print("Message " + key + " appears in the SqlciErrors.txt file but not the Messages Guide:")
	                    print("SqlciErrors.txt: " + value['errorMessageFileText'])
	                else:
	                    # must exist only in an enum
	                    print("Message " + key + " does not appear in either the SqlciErrors.txt file nor the Messages Guide.")

	            if 'enumSymbol' in value:
	                print("Enum symbol: " + value['enumSymbol'] + " (file " + value['enumFile'] + ")")

	            # the references are absent if the analysis was not run
	            codeReferences = value.get('listOfCodeReferences', set())
	            testReferences = value.get('listOfTestReferences', [])

	            if withCodeRefs:
	                text = "This message "
	                if 'enumSymbol' in value:
	                    text = text + "(or its enum symbol) "
	                if len(codeReferences) == 0:
	                    text = text + "does not seem to be referenced in C++ code."
	                else:
	                    text = text + "has possible references in " + str(len(codeReferences)) + " files:"
	                print(text)

	                for codeReference in sorted(codeReferences):
	                    print(" " + codeReference)

	            if len(testReferences) == 0:
	                print("This message does not appear to be in any regress expected file.")
	            else:
	                print("This message possibly appears in the following " + str(len(testReferences)) + " regress expected files:")

	                for testReference in testReferences:
	                    print(" " + testReference)

	            print(" ")

	        print("Summary: There are " + str(matchedCount) + " matching messages and " + str(mismatchedCount) + " mismatching messages.")





	# beginning of main


	def main():
	    """Check the consistency of the Messages Guide against the code and
	    print a report. Exits the process with status 1 if $TRAF_HOME is not
	    set, otherwise with status 0."""

	    def announce(text):
	        # print a blank line followed by a time-stamped progress message
	        print("")
	        print(datetime.datetime.ctime(datetime.datetime.now()) + ": " + text)

	    # process command line arguments

	    parser = argparse.ArgumentParser(
	        description='This script checks the consistency of the Messages Guide and the code.')
	    parser.add_argument("--codeRefs", help='Looks for code references to mismatched messages; this option is quite slow and can take up to 10 seconds per message. Today there are about 2800 mismatched messages so count on six or seven hours.', action="store_true")

	    args = parser.parse_args()  # exits and prints help if args are incorrect

	    exitCode = 0

	    messagesTable = MessagesTable()

	    # check that $TRAF_HOME is set
	    mySQroot = os.getenv('TRAF_HOME')
	    if not mySQroot:
	        print("$TRAF_HOME is not defined. Exiting.")
	        sys.exit(1)

	    # parse the Messages Guide files
	    announce("reading Messages Guide")
	    messagesGuideChaptersDirectory = mySQroot + '/../../docs/messages_guide/src/asciidoc/_chapters'
	    for subdir, dirs, files in os.walk(messagesGuideChaptersDirectory):
	        for chapterName in files:
	            filepath = subdir + os.sep + chapterName
	            if filepath.endswith(".adoc"):
	                messagesTable.parseAdoc(filepath)

	    # parse the SqlciErrors.txt file
	    announce("reading SqlciErrors.txt")
	    eTextFileName = mySQroot + '/../sql/bin/SqlciErrors.txt'
	    messagesTable.parseMessagesFile(eTextFileName)

	    # parse the enum files
	    announce("reading enum files")
	    enumFileList = ( [ ['ustat/hs_const.h','USTAT_ERROR_CODES'],
	                       ['arkcmp/CmpErrors.h','ArkcmpErrorCode'],
	                       ['sqlcomp/CmpDDLCatErrorCodes.h','CatErrorCode'],
	                       ['optimizer/opt_error.h','OptimizerSQLErrorCode'],
	                       ['optimizer/UdrErrors.h','UDRErrors'],
	                       ['exp/ExpErrorEnums.h','ExeErrorCode'],
	                       ['sort/SortError.h','SortErrorEnum'],
	                       ['udrserv/udrdefs.h','UdrErrorEnum'] ] )
	    for entry in enumFileList:
	        fileName = mySQroot + '/../sql/' + entry[0]
	        messagesTable.parseEnumFile(fileName,entry[1])

	    # compare Messages Guide and code text
	    announce("comparing Messages Guide and SqlciError.txt text")
	    messagesTable.compareText()

	    # analyze code references
	    if args.codeRefs:
	        announce("looking for code references for mismatched messages (this may take a while)")
	        sqlCodeDirectory = mySQroot + '/../sql'
	        messagesTable.analyzeCodeReferences(sqlCodeDirectory)

	    # analyze test references
	    # NOTE(review): assumed to run whether or not --codeRefs is given,
	    # since the report always lists test references - confirm.
	    announce("looking for test examples for mismatched messages (this may take a while)")
	    regressDirectory = mySQroot + '/../sql/regress'
	    messagesTable.analyzeTestReferences(regressDirectory)

	    # report results
	    announce("generating report")
	    print("")
	    messagesTable.reportResults(args.codeRefs)

	    announce("done")

	    sys.exit(exitCode)


	if __name__ == "__main__":
	    main()