blob: f660b6264fee7d2829237b107877a9ae906c7734 [file] [log] [blame]
# @@@ START COPYRIGHT @@@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# @@@ END COPYRIGHT @@@
#
# This script checks the consistency of the Messages Guide
# against the code. It looks for messages whose text differs
# between the Messages Guide and code. It looks for messages
# that are no longer in use. It looks for messages that are
# in use but not in the Messages Guide. It produces a report
# of these.
#
# The report includes detailed information for each message
# that differs between the Messages Guide and code. It lists
# the enum used for the message if there is one. It lists
# the code modules that may have a reference to the message
# (it uses grep to determine this, so there may be false
# positives or more rarely false negatives). It lists the
# regression test expected files that contain examples of the
# error message.
#
import os
import sys
import subprocess
import sets
import datetime
import argparse # requires Python 2.7
# The script does the following:
#
# 1. Parses the Messages Guide chapters into a table containing message number
# + message text.
# 2. Parses the bin/SqlciErrors.txt file into a table containing message number
# + message text. Merges that into the table from item 1.
# 3. Parses the various *.h files with error message enums. Merges that
# information into the table from item 1.
# 4. Compares the message texts from items 1 and 2. The comparison has to be
# smart; parameters appear as angle-bracketed items in the Messages Guide but
# use $0~String0 notation in the SqlciErrors.txt text. The result of the
# comparison is recorded for each message. If the texts differ, or if one or
# the other text is missing, the result is recorded as false.
# 5. Searches the sql *.cpp, *.h and *.y files for references to the error
# message number and/or enum, and when found, maintains a list for each, merged
# into the table from item 1. We only do this for mismatched messages as it
# takes a lot of time.
# 6. Searches the regression test EXPECTED* files for examples of each message
# number, and when found, maintains a list for each, attached to the table
# constructed in 1. We only do this for mismatched messages as it takes a
# lot of time.
# 7. Lists error messages that mismatch between Messages Guide and
# SqlciErrors.txt.
# 8. Lists error messages that are missing from the Messages Guide but present
# in SqlciErrors.txt. Mentions whether these are actually used anywhere (in
# *.cpp files or regression tests).
# 9. Lists error messages that are present in the Messages Guide or
# SqlciErrors.txt but apparently are no longer used (that is, don't appear in
# *.cpp files or regression tests).
#
# The table resulting from items 1 through 6 has the following shape:
# a. Message number (key)
# b. Messages Guide text (if any)
# c. SqlciErrors.txt message text (if any)
# d. Enum (if any)
# e. Enum *.h file name (if any)
# f. List of *.cpp files with references to message number
# g. List of *.cpp files with references to enum name
# h. List of regression tests with messages number examples
# i. Boolean indicating whether SqlciErrors.txt text matches Messages Guide
#
# Item 1 is implemented by a function that parses a single *.adoc file
# producing table entries. It is driven by a loop over the set of *.adoc file
# names. The function updates the table directly.
# Item 2 is implemented by a function that parses a *.txt file, driven by a
# caller supplying the bin/SqlciErrors.txt name. The function updates the table
# directly.
# Item 3 is implemented by a function that parses a *.h file looking for error
# message enums. It is driven by a loop over a hard-coded set of known *.h
# files, perhaps with the name of the enum passed in. The function updates the
# table directly.
# Item 5 is implemented by a function that does a grep of the *.cpp, *.h and
# *.y files, then processes the results. The function updates the table directly.
# Item 6 is implemented by a function that does a grep of the EXPECTED* files,
# then processes the results. The function updates the table directly.
# Item 4 is implemented by a function that crawls the table, comparing the
# retrieved texts. The function updates the table directly.
# Items 7, 8, 9 are implemented by a function that reads the table and reports
# results.
#
# We model the table as a Python object and implement these functions as
# methods on that object.
class MessagesTable(object):
    """Table of SQL messages gathered from the Messages Guide and the code.

    The structure of self.dict is a dictionary of dictionaries.
    The key to the top level dictionary is the message number (kept as a
    decimal string without leading zeros). The value in each top level
    dictionary is essentially a relation, modeled by a Python dictionary.
    The key in this value is the attribute name, as follows:

      'messageGuideText'
      'errorMessageFileText'
      'enumSymbol'
      'enumFile'
      'listOfCodeReferences'
      'listOfEnumSymbolReferences'
      'listOfTestReferences'
      'textsMatch'

    Print calls use the single-argument parenthesised form so that the
    class runs unchanged under both Python 2.7 and Python 3.
    """

    def __init__(self):
        self.dict = {}

    def mergeEntry(self, key, values):
        """Merge the attribute dictionary `values` into the entry for `key`.

        A new entry is created when `key` is not yet in the table. The
        attributes are copied, so the caller's dictionary is never aliased
        by the table.
        """
        self.dict.setdefault(key, {}).update(values)

    def parseAdoc(self, adocFileName):
        """Parse one Messages Guide *.adoc file into the table.

        Message entries look like this:

            [[SQL-1002]]
            == SQL 1002

            ```
            Catalog <catalog> does not exist.
            ```

            Where <catalog> is the ANSI name of the target catalog.

            *Cause:* The catalog does not exist.

        The stuff we are interested in is the message number and the
        message text (the lines between the backtick fences, joined with
        single spaces). A tiny state machine decides what to look for.
        If the file cannot be read, a message is printed and the table is
        left as it was.
        """
        LOOK_FOR_HEADING, LOOK_FOR_FENCE, IN_TEXT = range(3)
        state = LOOK_FOR_HEADING
        messageNumberStr = None
        textLines = []
        try:
            with open(adocFileName) as f:
                for line in f:
                    # get rid of the line terminator (also copes with CRLF)
                    line = line.rstrip('\r\n')
                    if line.startswith('== SQL '):
                        # a heading is recognized in any state
                        words = line.split()
                        if len(words) == 3:
                            try:
                                messageNumber = int(words[2])
                            except ValueError:
                                messageNumber = 0
                            if messageNumber > 0:
                                # normalize so '01002' and '1002' share a key
                                messageNumberStr = str(messageNumber)
                                state = LOOK_FOR_FENCE
                    elif state == LOOK_FOR_FENCE:
                        if line == '```':
                            textLines = []
                            state = IN_TEXT
                    elif state == IN_TEXT:
                        if line == '```':
                            # an empty fenced block carries no message text
                            if textLines:
                                values = {'messageGuideText': ' '.join(textLines)}
                                self.mergeEntry(messageNumberStr, values)
                            state = LOOK_FOR_HEADING
                        else:
                            textLines.append(line)
        except IOError as detail:
            print("Could not open " + adocFileName)
            print(detail)

    def parseMessagesFile(self, messagesFileName):
        """Parse the error message text file (SqlciErrors.txt) into the table.

        A line is used when it has at least seven whitespace-separated
        words and the first word is a positive integer; the message text is
        everything from the seventh word on. Lines whose text starts with
        '--' or '***' are treated as unused messages and skipped. If the
        file cannot be read, a message is printed.
        """
        try:
            with open(messagesFileName) as f:
                for line in f:
                    words = line.split()
                    if len(words) < 7:
                        continue
                    try:
                        messageNumber = int(words[0])
                    except ValueError:
                        continue
                    if messageNumber <= 0:
                        continue
                    # filter out "unused" messages
                    if words[6].startswith(('--', '***')):
                        continue
                    values = {'errorMessageFileText': ' '.join(words[6:])}
                    self.mergeEntry(str(messageNumber), values)
        except IOError as detail:
            print("Could not open " + messagesFileName)
            print(detail)

    def Cscreener(self, line):
        """Return `line` with any C or C++ comments removed.

        Not precise: C strings are not recognized, and the scan is per
        line, so a C comment spanning several lines is only removed on the
        line where it starts. A completed C comment is replaced by a single
        space so that the tokens around it are not glued together.
        """
        CODE, SEEN_SLASH, IN_C_COMMENT, SEEN_STAR = range(4)
        state = CODE
        kept = []
        for c in line:
            if state == CODE:
                if c == '/':
                    state = SEEN_SLASH
                else:
                    kept.append(c)
            elif state == SEEN_SLASH:
                if c == '/':
                    # C++ comment: ignore the rest of the line
                    state = CODE
                    break
                elif c == '*':
                    state = IN_C_COMMENT
                else:
                    kept.append('/' + c)  # false alarm
                    state = CODE
            elif state == IN_C_COMMENT:
                if c == '*':
                    state = SEEN_STAR
            elif state == SEEN_STAR:
                if c == '/':
                    kept.append(' ')
                    state = CODE
                elif c != '*':
                    state = IN_C_COMMENT
        if state == SEEN_SLASH:
            # the line ended on a lone '/', which is code, not a comment
            kept.append('/')
        return ''.join(kept)

    def Ctokenize(self, line, screenOutComments):
        """Break a line of text into a list of C-like tokens.

        Runs of alphanumerics and underscores form one token, every other
        non-blank character is a token by itself, and white space only
        separates tokens (not precise, just good enough for our purposes).
        When `screenOutComments` is true, C and C++ comments are removed
        first.
        """
        if screenOutComments:
            line = self.Cscreener(line)  # remove C, C++ comments
        result = []
        currentToken = ''
        for c in line:
            if c.isalnum() or c == '_':
                currentToken = currentToken + c
                continue
            if currentToken:
                result.append(currentToken)
                currentToken = ''
            if not c.isspace():
                result.append(c)
        # flush the last token when the line has no trailing delimiter
        if currentToken:
            result.append(currentToken)
        return result

    def parseEnumFile(self, enumFileName, enumName):
        """Parse the enum `enumName` from a C/C++ header into the table.

        We are looking for a particular enum. The format we expect is:

            enum <enumname> { <symbol> = <value>,
                              <symbol> = <value>,
                              ...
                              <symbol> = <value> } ;

        Of course this can freely flow across lines and there may be C or
        C++ comments to navigate past, so we tokenize and use a state
        machine to parse. Each symbol with a positive decimal value gets
        'enumSymbol' and 'enumFile' attributes under that value. Parsing
        stops at the closing brace of the enum. If the file cannot be
        read, a message is printed.
        """
        (SEEK_ENUM, SEEK_NAME, SEEK_BRACE, SEEK_SYMBOL,
         SEEK_EQUALS, SEEK_VALUE, SEEK_SEPARATOR) = range(7)
        state = SEEK_ENUM
        symbol = None
        try:
            with open(enumFileName) as f:
                for line in f:
                    for token in self.Ctokenize(line, True):
                        if state == SEEK_ENUM:
                            if token == 'enum':
                                state = SEEK_NAME
                        elif state == SEEK_NAME:
                            state = SEEK_BRACE if token == enumName else SEEK_ENUM
                        elif state == SEEK_BRACE:
                            state = SEEK_SYMBOL if token == '{' else SEEK_ENUM
                        elif state == SEEK_SYMBOL:
                            if len(token) > 1:
                                # punctuation tokens are single characters
                                symbol = token
                                state = SEEK_EQUALS
                            elif token == '}':
                                return  # ignore rest of file
                        elif state == SEEK_EQUALS:
                            if token == '=':
                                state = SEEK_VALUE
                            elif token == '}':
                                return  # ignore rest of file
                            else:
                                state = SEEK_SYMBOL
                        elif state == SEEK_VALUE:
                            if token.isdigit():
                                if int(token) > 0:  # ignore enums for 0
                                    values = {'enumSymbol': symbol,
                                              'enumFile': enumFileName}
                                    self.mergeEntry(str(int(token)), values)
                                state = SEEK_SEPARATOR
                            elif token != '-':
                                # anything but a unary minus before the digits
                                state = SEEK_SYMBOL
                        elif state == SEEK_SEPARATOR:
                            if token == '}':
                                return  # ignore rest of file
                            state = SEEK_SYMBOL  # probably a comma
        except IOError as detail:
            print("Could not open " + enumFileName)
            print(detail)

    def analyzeCodeReferences(self, directory):
        """Record which source files may reference each mismatched message.

        For every message whose texts do not match, greps the *.cpp, *.h
        and *.y files one level below `directory` for the message number
        and, if known, its enum symbol. A file counts as a reference only
        when the number or symbol appears as a whole token on a matching
        line. The resulting set of file names (relative to `directory`) is
        stored as 'listOfCodeReferences'.
        """
        fileString = directory + "/*/*.cpp " + directory + "/*/*.h " + directory + "/*/*.y"
        for key in list(self.dict):
            tableEntry = self.dict[key]
            if tableEntry.get('textsMatch', False):
                continue
            enumSymbol = tableEntry.get('enumSymbol')
            # one -e per alternative; "[a|b]" would be a character class
            shellCmd = 'grep -H -e ' + key
            if enumSymbol is not None:
                shellCmd = shellCmd + ' -e ' + enumSymbol
            shellCmd = shellCmd + ' ' + fileString
            # universal_newlines gives text lines on Python 2 and 3 alike
            p1 = subprocess.Popen(shellCmd, shell=True, stdout=subprocess.PIPE,
                                  close_fds=True, universal_newlines=True)
            references = set()
            for line in p1.stdout:
                # grep -H output has the form <file name>:<matching line>
                fileName, separator, text = line.partition(':')
                if not separator:
                    continue
                tokens = self.Ctokenize(text, False)  # don't screen out comments
                if key in tokens or (enumSymbol is not None and enumSymbol in tokens):
                    # remove directory part of the name
                    references.add(fileName[len(directory):].strip('/'))
            p1.stdout.close()
            p1.wait()
            self.mergeEntry(key, {'listOfCodeReferences': references})

    def analyzeTestReferences(self, directory):
        """Record which expected files show each mismatched message.

        For every message whose texts do not match, greps the EXPECTED*
        files one level below `directory` for ERROR[n] or WARNING[n] and
        stores the list of matching file names (relative to `directory`)
        as 'listOfTestReferences'.
        """
        fileString = directory + "/*/EXPECTED*"
        for key in list(self.dict):
            if self.dict[key].get('textsMatch', False):
                continue
            # -E with a group gives a real alternation
            shellCmd = (r'grep -l -E "(ERROR|WARNING)\[' + key + r'\]" '
                        + fileString)
            p1 = subprocess.Popen(shellCmd, shell=True, stdout=subprocess.PIPE,
                                  close_fds=True, universal_newlines=True)
            valueList = []
            for fileName in p1.stdout:
                # remove directory part of the name and trailing '\n'
                valueList.append(fileName[len(directory):].rstrip('\n').strip('/'))
            p1.stdout.close()
            p1.wait()
            self.mergeEntry(key, {'listOfTestReferences': valueList})

    def removeAngleBracketTerms(self, line):
        """Replace any text of the form "<stuff>" in `line` with '.elided.'.

        An empty pair "<>" is kept as is, and a '<' that is never closed
        is left in the text unchanged. Trailing white space is removed.
        """
        OUTSIDE, AFTER_OPEN, INSIDE = range(3)
        state = OUTSIDE
        pieces = []
        pending = ''  # text held back since an unclosed '<'
        for c in line:
            if state == OUTSIDE:
                if c == '<':
                    pending = c
                    state = AFTER_OPEN
                else:
                    pieces.append(c)
            elif state == AFTER_OPEN:
                if c == '>':
                    pieces.append(pending + c)  # "<>" is not a parameter
                    state = OUTSIDE
                else:
                    pending = pending + c
                    state = INSIDE
            else:
                if c == '>':
                    pieces.append('.elided.')
                    state = OUTSIDE
                else:
                    pending = pending + c
        # if we reached the end of the line after a '<', put the
        # held-back text back in
        if state != OUTSIDE:
            pieces.append(pending)
        return ''.join(pieces).rstrip()  # ignore trailing spaces

    def removeDollarTerms(self, line):
        """Replace text of the form $0~Datatype0 in `line` with '.elided.'.

        Datatype might be String, Int, TableName etc. The term is a '$',
        one digit, a '~', one or more letters and an optional trailing
        digit. Text that starts like a term but does not complete one is
        left unchanged. Trailing white space is removed.
        """
        TEXT, SEEN_DOLLAR, SEEN_DIGIT, SEEN_TILDE, IN_NAME = range(5)
        state = TEXT
        pieces = []
        pending = ''  # text held back while a possible term is scanned
        for c in line:
            if state == TEXT:
                if c == '$':
                    pending = c
                    state = SEEN_DOLLAR
                else:
                    pieces.append(c)
            elif state == IN_NAME:
                if c.isalpha():
                    pending = pending + c
                else:
                    # we reached the end of the dollar text; a digit is
                    # part of the term, anything else is ordinary text
                    pieces.append('.elided.')
                    if not c.isdigit():
                        pieces.append(c)
                    pending = ''
                    state = TEXT
            else:
                if state == SEEN_DOLLAR:
                    accepted = c.isdigit()
                elif state == SEEN_DIGIT:
                    accepted = (c == '~')
                else:
                    accepted = c.isalpha()
                if accepted:
                    pending = pending + c
                    state = state + 1
                else:
                    # not a dollar term after all; keep the text
                    pieces.append(pending + c)
                    pending = ''
                    state = TEXT
        if state == IN_NAME:
            # the line ended inside a complete term that has no digit suffix
            pieces.append('.elided.')
        elif state != TEXT:
            # the line ended inside an incomplete term; put the text back in
            pieces.append(pending)
        return ''.join(pieces).rstrip()  # ignore trailing spaces

    def compareText(self):
        """Set 'textsMatch' for every entry in the table.

        The texts match when both the Messages Guide text and the error
        message file text are present and are equal, either literally or
        after their parameter notations have been replaced by a common
        marker. A missing text always counts as a mismatch.
        """
        for key in list(self.dict):
            attributes = self.dict[key]
            guideText = attributes.get('messageGuideText')
            codeText = attributes.get('errorMessageFileText')
            comparison = False
            if guideText is not None and codeText is not None:
                if guideText == codeText:
                    comparison = True
                else:
                    comparison = (self.removeAngleBracketTerms(guideText) ==
                                  self.removeDollarTerms(codeText))
            self.mergeEntry(key, {'textsMatch': comparison})

    # Iterating through a Python dictionary gets keys out in hash order
    # which isn't useful to humans. This helper function gets the keys,
    # and places them in a list in numeric order
    def sortedNumericKeys(self, dictionary):
        """Return the keys of `dictionary` as integers in ascending order."""
        return sorted(int(key) for key in dictionary)

    def reportResults(self, withCodeRefs):
        """Print the report of mismatched messages followed by a summary.

        For each message whose texts do not match, prints what differs,
        the enum symbol if any, the code references (only when
        `withCodeRefs` is true) and the regression test references.
        Messages are reported in numeric order.
        """
        matchedCount = 0
        mismatchedCount = 0
        # sort the original key strings numerically so no key is lost in
        # an int/str round trip
        for key in sorted(self.dict, key=int):
            value = self.dict[key]
            if value.get('textsMatch', False):
                matchedCount = matchedCount + 1
                continue
            mismatchedCount = mismatchedCount + 1
            inGuide = 'messageGuideText' in value
            inCode = 'errorMessageFileText' in value
            if inGuide and inCode:
                print("Message " + key + " differs between code and Messages Guide:")
                print("SqlciErrors.txt: " + value['errorMessageFileText'])
                print("Message Guide text: " + value['messageGuideText'])
            elif inGuide:
                print("Message " + key + " appears in the Messages Guide but not the SqlciErrors.txt file:")
                print("Message Guide text: " + value['messageGuideText'])
            elif inCode:
                print("Message " + key + " appears in the SqlciErrors.txt file but not the Messages Guide:")
                print("SqlciErrors.txt: " + value['errorMessageFileText'])
            else:
                # must exist only in an enum
                print("Message " + key + " does not appear in either the SqlciErrors.txt file nor the Messages Guide.")
            if 'enumSymbol' in value:
                print("Enum symbol: " + value['enumSymbol'] + " (file " + value['enumFile'] + ")")
            # these defaults shouldn't be necessary but do make the
            # following code safe
            codeReferences = value.setdefault('listOfCodeReferences', set())
            testReferences = value.setdefault('listOfTestReferences', [])
            if withCodeRefs:
                text = "This message "
                if 'enumSymbol' in value:
                    text = text + "(or its enum symbol) "
                if len(codeReferences) == 0:
                    text = text + "does not seem to be referenced in C++ code."
                else:
                    text = text + "has possible references in " + str(len(codeReferences)) + " files:"
                print(text)
                # sorted so that the report is the same from run to run
                for codeReference in sorted(codeReferences):
                    print(" " + codeReference)
            if len(testReferences) == 0:
                print("This message does not appear to be in any regress expected file.")
            else:
                print("This message possibly appears in the following " + str(len(testReferences)) + " regress expected files:")
                for testReference in testReferences:
                    print(" " + testReference)
            print(" ")
        print("Summary: There are " + str(matchedCount) + " matching messages and " + str(mismatchedCount) + " mismatching messages.")
# beginning of main
def _announce(text):
    # Print a blank line followed by a time-stamped progress message.
    print("")
    print(datetime.datetime.now().ctime() + ": " + text)


def main():
    # Drive the whole check: read the Messages Guide, SqlciErrors.txt and
    # the enum header files, compare the texts, look for references to the
    # mismatched messages and print the report. All input locations are
    # derived from $TRAF_HOME. Exits the process when done.
    #
    # Print calls use the single-argument parenthesised form so that the
    # script runs under both Python 2.7 and Python 3.

    # process command line arguments
    parser = argparse.ArgumentParser(
        description='This script checks the consistency of the Messages Guide and the code.')
    parser.add_argument("--codeRefs", help='Looks for code references to mismatched messages; this option is quite slow and can take up to 10 seconds per message. Today there are about 2800 mismatched messages so count on six or seven hours.', action="store_true")
    args = parser.parse_args()  # exits and prints help if args are incorrect

    exitCode = 0
    messagesTable = MessagesTable()

    # check that $TRAF_HOME is set
    mySQroot = os.getenv('TRAF_HOME')
    if not mySQroot:
        print("$TRAF_HOME is not defined. Exiting.")
        # sys.exit rather than exit: the latter is only provided by site
        sys.exit(1)

    # parse the Messages Guide files
    _announce("reading Messages Guide")
    messagesGuideChaptersDirectory = mySQroot + '/../../docs/messages_guide/src/asciidoc/_chapters'
    for subdir, dirs, files in os.walk(messagesGuideChaptersDirectory):
        for chapterName in files:
            filepath = subdir + os.sep + chapterName
            if filepath.endswith(".adoc"):
                messagesTable.parseAdoc(filepath)

    # parse the SqlciErrors.txt file
    _announce("reading SqlciErrors.txt")
    eTextFileName = mySQroot + '/../sql/bin/SqlciErrors.txt'
    messagesTable.parseMessagesFile(eTextFileName)

    # parse the enum files (header file relative to the sql directory,
    # followed by the name of the enum to look for in it)
    _announce("reading enum files")
    enumFileList = [['ustat/hs_const.h', 'USTAT_ERROR_CODES'],
                    ['arkcmp/CmpErrors.h', 'ArkcmpErrorCode'],
                    ['sqlcomp/CmpDDLCatErrorCodes.h', 'CatErrorCode'],
                    ['optimizer/opt_error.h', 'OptimizerSQLErrorCode'],
                    ['optimizer/UdrErrors.h', 'UDRErrors'],
                    ['exp/ExpErrorEnums.h', 'ExeErrorCode'],
                    ['sort/SortError.h', 'SortErrorEnum'],
                    ['udrserv/udrdefs.h', 'UdrErrorEnum']]
    for headerName, enumName in enumFileList:
        fileName = mySQroot + '/../sql/' + headerName
        messagesTable.parseEnumFile(fileName, enumName)

    # compare Messages Guide and code text
    _announce("comparing Messages Guide and SqlciError.txt text")
    messagesTable.compareText()

    # analyze code references
    if args.codeRefs:
        _announce("looking for code references for mismatched messages (this may take a while)")
        sqlCodeDirectory = mySQroot + '/../sql'
        messagesTable.analyzeCodeReferences(sqlCodeDirectory)

    # analyze test references
    _announce("looking for test examples for mismatched messages (this may take a while)")
    regressDirectory = mySQroot + '/../sql/regress'
    messagesTable.analyzeTestReferences(regressDirectory)

    # report results
    _announce("generating report")
    print("")
    messagesTable.reportResults(args.codeRefs)
    _announce("done")
    sys.exit(exitCode)


if __name__ == "__main__":
    main()