| #!/usr/bin/env python |
| # -*- coding: utf-8 -*- |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| # |
| # gplogfilter [options] [inputfile]... |
| # |
| # |
| |
| # import Python standard library modules |
| import csv |
| import gzip |
| import locale |
| import os |
| import os.path |
| import re |
| import sys |
| |
| from optparse import Option, OptionGroup, OptionParser, OptionValueError, SUPPRESS_USAGE |
| |
| # import GPDB modules |
| try: |
| from gppylib.gpparseopts import * |
| from gppylib.datetimeutils import str_to_datetime, str_to_duration, DatetimeValueError |
| from gppylib.logfilter import * |
| from gppylib.gpcoverage import GpCoverage |
| from gppylib.commands import gp |
| except ImportError, e: |
| sys.exit('ERROR: Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e)) |
| |
# Severity labels that identify a "trouble" log entry for the --trouble
# option.  These values are from src/backend/po/*.po; the translated labels
# are written as escaped bytes and decoded with the codec named next to them.
TROUBLE_VALUES = [
    # EN
    'ERROR',
    'FATAL',
    'PANIC',
    # AF
    b'\x46\x4F\x55\x54'.decode('iso-8859-1'),                  # "ERROR"
    b'\x46\x41\x54\x41\x41\x4C'.decode('iso-8859-1'),          # "FATAL"
    b'\x57\x41\x4E\x48\x4F\x4F\x50'.decode('iso-8859-1'),      # "PANIC"
    # DE
    b'\x46\x45\x48\x4C\x45\x52'.decode('ISO-8859-1'),          # "ERROR"
    b'\x46\x41\x54\x41\x4C'.decode('ISO-8859-1'),              # "FATAL"
    b'\x50\x41\x4E\x49\x43'.decode('ISO-8859-1'),              # "PANIC"
    # FR
    b'\x45\x52\x52\x45\x55\x52'.decode('ISO-8859-15'),         # "ERROR"
    b'\x46\x41\x54\x41\x4C'.decode('ISO-8859-15'),             # "FATAL"
    b'\x50\x41\x4E\x49\x43'.decode('ISO-8859-15'),             # "PANIC"
    # KO
    u'오류',
    u'치명적오류',
    u'손상',
    # NB
    b'\x46\x45\x49\x4C'.decode('ISO-8859-1'),                  # "ERROR"
    b'\x50\x41\x4E\x49\x4B\x4B'.decode('ISO-8859-1'),          # "PANIC"
    # PT_BR
    b'\x45\x52\x52\x4F'.decode('ISO-8859-1'),                  # "ERROR"
    b'\x46\x41\x54\x41\x4C'.decode('ISO-8859-1'),              # "FATAL"
    b'\x50\xC2\x4E\x49\x43\x4F'.decode('ISO-8859-1'),          # "PANIC"
    # SK
    b'\x43\x48\x59\x42\x41'.decode('ISO-8859-2'),              # "ERROR"
    b'\x46\x41\x54\xC1\x4C\x4E\x45'.decode('ISO-8859-2'),      # "FATAL"
    b'\x50\x41\x4E\x49\x4B\x41'.decode('ISO-8859-2'),          # "PANIC"
    # SL
    b'\x4E\x41\x50\x41\x4B\x41'.decode('ISO-8859-2'),          # "ERROR"
    b'\x55\x53\x4F\x44\x4E\x41\x20\x4E\x41\x50\x41\x4B\x41'.decode('ISO-8859-2'),  # "FATAL"
    b'\x50\x41\x4E\x49\x4B\x41'.decode('ISO-8859-2'),          # "PANIC"
    # SV
    b'\x46\x45\x4C'.decode('ISO-8859-1'),                      # "ERROR"
    b'\x46\x41\x54\x41\x4C\x54'.decode('ISO-8859-1'),          # "FATAL"
    b'\x50\x41\x4E\x49\x4B'.decode('ISO-8859-1'),              # "PANIC"
    # TR
    u'HATA',
    u'ÖLÜMCÜL',
    u'KRİTİK',
    # ZH_CN
    b'\xB4\xED\xCE\xF3'.decode('gb2312'),                      # "ERROR"
    b'\xD6\xC2\xC3\xFC\xB4\xED\xCE\xF3'.decode('gb2312'),      # "FATAL"
    b'\xB1\xC8\xD6\xC2\xC3\xFC\xB4\xED\xCE\xF3\xBB\xB9\xB9\xFD\xB7\xD6\xB5\xC4\xB4\xED\xCE\xF3'.decode('gb2312'),  # "PANIC"
    # ZH_TW
    u'錯誤',
    u'嚴重錯誤',
]
| |
# One-paragraph summary of the tool.  parseargs() collapses its whitespace
# and passes it to the option parser as the description.
description = ("""
Reads HAWQ log file(s), extracts log entries which meet
all the criteria you specify, and writes them to output
file(s) or standard output.
""")

# Extended help paragraphs, one list element per paragraph.  parseargs()
# assigns this list to the parser's helpStr attribute.
_help = ["""
If no input file is specified, the value of the MASTER_DATA_DIRECTORY
is used. If an input file is a
directory, the suffix ".log" is appended to obtain the input file name.
To read from standard input, specify input file "-".
""","""
To write the extracted log entries to a destination other than stdout,
use the --out option. If the destination is a directory, then for each
input file, a correspondingly named output file is written there;
otherwise the results from all input files are written to the specified
--out file or standard output.
""","""
In the input file, each log entry starts with a timestamp
"yyyy-mm-dd hh:mm:ss[.fraction]" at the beginning of a line.
Any following lines are considered to belong to the same entry,
up to the next line having a different timestamp.
""","""
--begin and --end timestamp values can be specified as either
"yyyy-mm-dd[ hh:mm[:ss]]" or "yyyymmdd[ hhmm[ss]]". Between date
and time, either a space or the letter "T" is required. When only
the date is given, 00:00:00 is used for the time.
Specify --duration as "[hours][:minutes[:seconds]]";
it is unused if both --begin and --end are given.
""","""
The ending date and time can be given by --end, or computed as
--begin plus --duration. The beginning date and time can be
given by --begin, or computed as --end minus --duration, or
computed as the current date and time minus --duration.
""","""
Log entries are skipped unless they fulfill all of the conditions
you specify. For example, if you specify two occurrences of the
--find option, log entries must contain both of the strings.
""","""
If the destination specified by --out is a directory, the output file is
given the same name as the input file (excluding '.gz' suffix if any).
When writing compressed output, '.gz' is suffixed to the output file name.
If you specify an output file name ending in '.gz', the output is
compressed (-z9) by default.
""","""
Example:
gplogfilter -t -d2
# view trouble messages timestamped within the past two hours
"""]
| |
| |
def parseargs():
    """Define gplogfilter's command-line options and parse the command line.

    The parser is an OptParser using OptChecker as its option class (both
    star-imported from gppylib at module level).  The option types
    'datetime', 'duration', 'literal' and 'regex', and the actions
    'MatchRegex', 'NoMatchRegex', 'MatchColumns', 'optionalSecondArg' and
    'briefhelp', are not optparse built-ins; presumably those classes
    provide them.

    Defaults set here: verbose=True, filters=[] and slice=(None, None).

    Returns:
        The (options, args) pair produced by parser.parse_args().
    """
    # Create our OptionParser object
    parser = OptParser(option_class=OptChecker,
                       description=' '.join(description.split()),
                       version='%prog version $Revision$')
    parser.helpStr = _help
    parser.set_usage('%prog [--help] [options] [inputfile]...')
    # The stock -h option is dropped so that -h/-?/--help can be registered
    # together in the 'Message options' group below.
    parser.remove_option('-h')

    # --- Timestamp range ---
    optgrp = OptionGroup(parser, '\n Timestamp range',
                         'Use any two of these options to impose lower and '
                         'upper bounds on timestamps; or use any one option to '
                         'bound the timestamp range on just one side. ')
    optgrp.add_option('-b', '--begin', type='datetime', metavar='datetime',
                      # closing bracket of the optional time part restored so
                      # the syntax agrees with the extended help text
                      help='beginning date and time: "yyyy-mm-dd[ hh:mm[:ss]]"')
    optgrp.add_option('-e', '--end', type='datetime', metavar='datetime',
                      help='ending date and time')
    optgrp.add_option('-d', '--duration', type='duration', metavar='[h][:m[:s]]',
                      help='duration from beginning to end')
    optgrp.add_option('--prunefiles', action='store_true', default=False,
                      help="Discard files based on filename of the form hawq-%Y-%m-%d_%H%M%S.csv")
    parser.add_option_group(optgrp)

    # --- Pattern and string matching ---
    # Every matching option appends to the shared 'filters' destination.
    optgrp = OptionGroup(parser, 'Pattern and string matching',
                         'Log entries can be chosen depending on whether '
                         'they contain a match for a pattern (regular '
                         'expression) or string. Matching of alphabetic '
                         'characters is case-sensitive unless preceded '
                         'by --case=ignore. These options can be used as many '
                         'times as needed to apply multiple restrictions. '
                         'Regular expression syntax is documented at '
                         'http://docs.python.org/lib/re-syntax.html')
    optgrp.add_option('-f', '--find', type='literal', metavar='string',
                      dest='filters', action='MatchRegex',
                      help='select log entries containing string')
    optgrp.add_option('-F', '--nofind', type='literal', metavar='string',
                      dest='filters', action='NoMatchRegex',
                      help='reject log entries containing string')
    optgrp.add_option('-m', '--match', type='regex', metavar='regex',
                      dest='filters', action='MatchRegex',
                      help='select log entries where a match for the regex is found')
    optgrp.add_option('-M', '--nomatch', type='regex', metavar='regex',
                      dest='filters', action='NoMatchRegex',
                      help='reject log entries where a match for the regex is found')
    optgrp.add_option('-t', '--trouble', action='store_true',
                      help='select log entries having ERROR:, FATAL:, or PANIC: '
                           'in the first line')
    optgrp.add_option('-C', '--columns', type='str', metavar='string',
                      dest='filters', action='MatchColumns',
                      help='select specific log file columns, provided as a '
                           'comma delimiter string counting from 1')
    parser.add_option_group(optgrp)

    # --- Mode flags ---
    optgrp = OptionGroup(parser, 'Mode flags',
                         'These flags affect the behavior of other options '
                         'specified to their right. They can be used '
                         'as many times as needed.')
    optgrp.add_option('-c', '--case', type='choice', choices=['i', 'ignore', 'r', 'respect'],
                      metavar='i[gnore]|r[espect]',
                      action='callback',
                      callback=OptChecker.regexSetCaseSensitivity,
                      help=('ignore or respect the distinction between '
                            'upper and lower case letters in pattern and '
                            'string matching options after this'))
    parser.add_option_group(optgrp)

    # --- Final selection ---
    optgrp = OptionGroup(parser, 'Final selection',
                         'Limit the output to a subsequence of the '
                         'qualifying log entries from each input file. '
                         'Use at most one of these options.')
    optgrp.add_option('-n', '--tail', type='int', metavar='N',
                      help=('select the last N qualifying log entries'))
    optgrp.add_option('-s', '--slice', type='int', metavar='I [J]',
                      action='optionalSecondArg',
                      help=('select qualifying log entries I <= i < J '
                            '(0 is first; <0 is relative to the end)'))
    parser.add_option_group(optgrp)

    # --- Input options ---
    optgrp = OptionGroup(parser, 'Input options')
    optgrp.add_option('-u', '--unzip', action='store_true',
                      help='read gzip-compressed input; assumed when inputfile suffix is ".gz"')
    parser.add_option_group(optgrp)

    # --- Output options ---
    optgrp = OptionGroup(parser, 'Output options')
    optgrp.add_option('-o', '--out', type='string', metavar='outputfile',
                      help='write output to specified file or directory (instead of stdout)')
    optgrp.add_option('-z', '--zip', type='choice', choices=list('0123456789'), metavar='0..9',
                      help=('compression level (gzip): 0 = no compression; '
                            '9 = maximum compression'))
    optgrp.add_option('-a', '--append', action='store_true',
                      help="when output file already exists, append to it; don't overwrite")
    parser.add_option_group(optgrp)

    # --- Message options ---
    optgrp = OptionGroup(parser, 'Message options')
    optgrp.add_option('-q', '--quiet', dest='verbose', action='store_false',
                      help='suppress status messages')
    optgrp.add_option('-h', '-?', '--help', action='help',
                      help='show this help message and exit')
    optgrp.add_option('--usage', action="briefhelp")
    parser.add_option_group(optgrp)

    parser.set_defaults(verbose=True, filters=[], slice=(None, None))

    # Parse the command line arguments
    (options, args) = parser.parse_args()

    return options, args
| |
| |
| #------------------------------------------------------------------------- |
| |
def openInputFile(ifn, options):
    """Open one input source and wrap it according to its name.

    Args:
        ifn: input file name, or '-' to read standard input.
        options: parsed options; only options.unzip is read here.

    Returns:
        A 4-tuple (fileIn, filesToClose, ifn, zname):
          fileIn       -- the object to read entries from: sys.stdin or the
                          opened file, wrapped in a GzipFile when
                          decompressing, and in csv.reader + CsvFlatten when
                          zname ends in '.csv'.
          filesToClose -- the file objects opened here, with the GzipFile
                          wrapper (if any) ahead of the underlying file.
          ifn          -- 'stdin', or the absolute path actually opened.
          zname        -- 'stdin', or the base name of ifn without a '.gz'
                          suffix and with '.log' added when it has no
                          extension.
    """
    closeLater = []
    gunzip = options.unzip

    if ifn != '-':
        ifn = os.path.abspath(ifn)

        # A data directory name stands for its log file: <directory>.log
        # (hawq convention).
        if os.path.isdir(ifn):
            ifn = ifn + '.log'

        zname = os.path.basename(ifn)

        # A '.gz' suffix forces decompression and is not part of zname.
        if ifn.endswith('.gz'):
            gunzip = True
            zname = zname[:-3]

        if not os.path.splitext(zname)[1]:
            zname = zname + '.log'

        # Compressed input is read as raw bytes; plain input is opened with
        # universal newlines.
        mode = 'rb' if gunzip else 'rU'
        stream = open(ifn, mode)
        closeLater.append(stream)
    else:
        # Standard input: nothing is opened, so nothing needs closing.
        ifn = zname = 'stdin'
        stream = sys.stdin

    # Set up input decompression
    if gunzip:
        stream = gzip.GzipFile(zname, 'rb', fileobj=stream)
        closeLater.insert(0, stream)

    # CSV logs are parsed by the csv module and handed to CsvFlatten.
    if zname.endswith('.csv'):
        stream = CsvFlatten(csv.reader(stream, delimiter=',', quotechar='"'))

    return stream, closeLater, ifn, zname
| |
| |
def openOutputFile(ifn, zname, options):
    """Open the output destination, optionally gzip-compressed.

    Args:
        ifn: input file name; not used by this function.
        zname: input base name; used to name the output file when --out is
            a directory, and as the file name given to the GzipFile wrapper.
        options: parsed options; reads append, zip, out and verbose.

    Returns:
        A pair (fileOut, filesToClose): the object to write to (sys.stdout
        or the opened file, wrapped in a GzipFile when compressing) and the
        file objects opened here, with the GzipFile wrapper (if any) first.

    Raises:
        IOError: the output location is a directory, or its parent
            directory is missing or not writable.
    """
    closeLater = []

    # Compression is on for any --zip level above '0'.
    zipout = options.zip and options.zip > '0'

    # Append or overwrite; always binary, both for compressed output and so
    # as not to disturb the original line ending control characters.
    omode = ('a' if options.append else 'w') + 'b'

    if options.out is not None:
        # let error messages show full path in case something goes wrong
        ofn = os.path.abspath(options.out)

        # A directory destination gets a file named after the input.
        if os.path.isdir(ofn):
            ofn = os.path.join(ofn, zname + ".out")

        # Compressed output carries a .gz suffix.
        if zipout and not ofn.endswith('.gz'):
            ofn += '.gz'

        # The final name must not itself be a directory.
        if os.path.isdir(ofn):
            raise IOError('cannot write output file because there is a '
                          'directory at the output location: %s' % ofn)

        # The containing directory must exist and be writable.
        odn = os.path.dirname(ofn)
        if not os.access(odn, os.W_OK):
            raise IOError('output directory not found or not writable: %s' % odn)

        fileOut = open(ofn, omode)
        closeLater.append(fileOut)

        if options.verbose and options.append:
            print >>sys.stderr, ' append to ', ofn
        elif options.verbose:
            print >>sys.stderr, ' output to ', ofn
    else:
        # No --out: write to standard output, which is never closed here.
        fileOut = sys.stdout

    # Set up output compression
    if zipout:
        fileOut = gzip.GzipFile(zname, omode,
                                compresslevel=int(options.zip),
                                fileobj=fileOut)
        closeLater.insert(0, fileOut)

    return fileOut, closeLater
| |
| |
| #------------------------------- Mainline -------------------------------- |
| |
# Start coverage collection for this run; it is stopped and reported when
# the mainline finishes.
coverage = GpCoverage()
coverage.start()

# Adopt the locale named by the environment (LANG).  If that locale cannot
# be set, carry on with the current one.
try:
    locale.setlocale(locale.LC_ALL, '')
except Exception:
    pass

# Read the command line.
options, args = parseargs()

# Work out the timestamp bounds from --begin/--end/--duration, then discard
# the sub-second part of whichever bounds are set.
begin, end = spiffInterval(options.begin, options.end, options.duration)
begin = begin.replace(microsecond=0) if begin else begin
end = end.replace(microsecond=0) if end else end

# --trouble goes ahead of every other pattern filter.  Its pattern is the
# alternation of all severity labels, each followed by ': '.
if options.trouble:
    options.filters.insert(
        0,
        filterize(MatchInFirstLine, ': |'.join(TROUBLE_VALUES) + u': '))

# Choose the final selection.  --tail, when given, overrides --slice: a
# positive N keeps the last N entries, anything else keeps none.
if options.tail is not None:
    if options.tail > 0:
        sliceBegin, sliceEnd = -options.tail, None
    else:
        sliceBegin, sliceEnd = 0, 0
else:
    sliceBegin, sliceEnd = options.slice

# An --out name ending in .gz means maximum compression unless -z was given.
if options.out and options.zip is None:
    if options.out.endswith('.gz'):
        options.zip = '9'
| |
# Main processing: choose the input files, run each through the filtering
# pipeline and write the selected entries to the output.  An IOError is
# reported on stderr and ends the program with exit status 2; coverage is
# stopped and reported in every case.

# Imported here so the file-name timestamp parsing below does not depend on
# a star import happening to provide the datetime class.
from datetime import datetime

try:
    # If no inputfile arg, use MASTER_DATA_DIRECTORY variable as default
    if not args:
        s = gp.get_masterdatadir()
        if s:
            # we only support log rotation in pg_log dir.
            logdir = os.path.join(s, 'pg_log')
            if os.path.exists(logdir):
                # os.listdir() returns names in arbitrary order; sort them so
                # the files are processed in a reproducible order.
                for logfile in sorted(os.listdir(logdir)):
                    args.append(os.path.join(logdir, logfile))
            else:
                raise IOError('Specify input file or "-" for standard input')
        else:
            raise IOError('specify input file or "-" for standard input')

    # In MS Windows, apply shell wildcard expansion to input filename list
    if sys.platform == 'win32':
        import glob
        newargs = []
        for a in args:
            if a == '-':
                # '-' means standard input, not a pattern; glob would drop
                # it because no such file exists.
                newargs.append(a)
            else:
                newargs.extend(sorted(glob.glob(a)))
        args = newargs

    # Two separate lists: one shared list would mix input and output files.
    inputFilesToClose = []
    outputFilesToClose = []
    fileOut = None
    try:
        # Output to a directory?
        outputFilePerInputFile = False
        if options.out:
            options.out = os.path.abspath(options.out)
            if os.path.isdir(options.out):
                outputFilePerInputFile = True

        if options.verbose:
            msg = ('requested timestamp range from %s to %s'
                   % (begin or 'beginning of data', end or 'end of data'))
            print >>sys.stderr, msg

        # Loop over input files
        for ifn in args:
            # Open next input file
            fileIn, inputFilesToClose, ifn, zname = openInputFile(ifn, options)

            # If we can skip the whole file, do so.  Files named
            # hawq-<timestamp>.csv are checked against the requested range:
            # a file whose name is later than the end of the range is
            # closed and skipped.  A file whose name is earlier than the
            # beginning is still read -- presumably the name records when
            # the file was started, so later entries in it may fall inside
            # the range.  Names that do not parse as a timestamp are always
            # processed.
            if zname.startswith('hawq') and zname.endswith('.csv'):
                filedate = None
                # try format YYYY-MM-DD_HHMMSS, then YYYY-MM-DD
                for stampFormat in ('%Y-%m-%d_%H%M%S', '%Y-%m-%d'):
                    try:
                        filedate = datetime.strptime(zname[5:-4], stampFormat)
                        break
                    except ValueError:
                        # not this format; try the next one
                        pass

                if filedate is not None and end and filedate > end:
                    print >>sys.stderr,"SKIP file: %s" % zname
                    for f in inputFilesToClose:
                        f.close()
                    inputFilesToClose = []
                    continue

            # Announce each input file *before* its output file if --out is dir
            if options.verbose and outputFilePerInputFile:
                print >>sys.stderr, '---------- ', ifn, '---------- '

            # Open the output file (once per input file if --out is a directory)
            if fileOut is None:
                fileOut, outputFilesToClose = openOutputFile(ifn, zname, options)

            # Announce input files *after* single output file
            if options.verbose and not outputFilePerInputFile:
                print >>sys.stderr, '---------- ', ifn, '---------- '

            # Construct the filtering pipeline
            filteredInput = FilterLogEntries(fileIn,
                                             verbose=options.verbose,
                                             beginstamp=begin,
                                             endstamp=end,
                                             filters=options.filters,
                                             ibegin=sliceBegin,
                                             jend=sliceEnd)

            # Write filtered lines to output file.  Don't append \n to
            # each line, because the original line ends are still there.
            for line in filteredInput:
                print >>fileOut, line,

            # Close input and output files
            for f in inputFilesToClose:
                f.close()
            inputFilesToClose = []

            if outputFilePerInputFile:
                fileOut = None
                for f in outputFilesToClose:
                    f.close()
                outputFilesToClose = []

    finally:
        # Close whatever is still open if the loop ended early.
        for f in outputFilesToClose:
            f.close()
        for f in inputFilesToClose:
            f.close()
except IOError as msg:
    execname = os.path.basename(sys.argv[0])
    print >>sys.stderr, '%s: (IOError) "%s"' % (execname, msg)
    sys.exit(2)
finally:
    coverage.stop()
    coverage.generate_report()