#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# gplogfilter [options] [inputfile]...
#
#
# import Python standard library modules
import csv
import gzip
import locale
import os
import os.path
import re
import sys
from datetime import datetime  # for datetime.strptime() on rotated log file names
from optparse import Option, OptionGroup, OptionParser, OptionValueError, SUPPRESS_USAGE
# import GPDB modules
try:
    from gppylib.gpparseopts import *
    from gppylib.datetimeutils import str_to_datetime, str_to_duration, DatetimeValueError
    from gppylib.logfilter import *
    from gppylib.gpcoverage import GpCoverage
    from gppylib.commands import gp
except ImportError, e:
    sys.exit('ERROR: Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e))
# These values are from src/backend/po/*.po
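# Each bytearray(...).decode(...) below spells out one localized severity word
# byte-by-byte (e.g. 'FOUT', 'FEHLER', 'ERREUR'), so words from legacy
# single-byte encodings need not appear literally in this source file.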
TROUBLE_VALUES = [
    'ERROR', 'FATAL', 'PANIC', # EN
    bytearray(b'\x46\x4F\x55\x54').decode('ISO-8859-1'), # AF "ERROR"
    bytearray(b'\x46\x41\x54\x41\x41\x4C').decode('ISO-8859-1'), # AF "FATAL"
    bytearray(b'\x57\x41\x4E\x48\x4F\x4F\x50').decode('ISO-8859-1'), # AF "PANIC"
    bytearray(b'\x46\x45\x48\x4C\x45\x52').decode('ISO-8859-1'), # DE "ERROR"
    bytearray(b'\x46\x41\x54\x41\x4C').decode('ISO-8859-1'), # DE "FATAL"
    bytearray(b'\x50\x41\x4E\x49\x43').decode('ISO-8859-1'), # DE "PANIC"
    bytearray(b'\x45\x52\x52\x45\x55\x52').decode('ISO-8859-15'), # FR "ERROR"
    bytearray(b'\x46\x41\x54\x41\x4C').decode('ISO-8859-15'), # FR "FATAL"
    bytearray(b'\x50\x41\x4E\x49\x43').decode('ISO-8859-15'), # FR "PANIC"
    u'오류', u'치명적오류', u'손상', # KO
    bytearray(b'\x46\x45\x49\x4C').decode('ISO-8859-1'), # NB "ERROR"
    bytearray(b'\x50\x41\x4E\x49\x4B\x4B').decode('ISO-8859-1'), # NB "PANIC"
    bytearray(b'\x45\x52\x52\x4F').decode('ISO-8859-1'), # PT_BR "ERROR"
    bytearray(b'\x46\x41\x54\x41\x4C').decode('ISO-8859-1'), # PT_BR "FATAL"
    bytearray(b'\x50\xC2\x4E\x49\x43\x4F').decode('ISO-8859-1'), # PT_BR "PANIC"
    bytearray(b'\x43\x48\x59\x42\x41').decode('ISO-8859-2'), # SK "ERROR"
    bytearray(b'\x46\x41\x54\xC1\x4C\x4E\x45').decode('ISO-8859-2'), # SK "FATAL"
    bytearray(b'\x50\x41\x4E\x49\x4B\x41').decode('ISO-8859-2'), # SK "PANIC"
    bytearray(b'\x4E\x41\x50\x41\x4B\x41').decode('ISO-8859-2'), # SL "ERROR"
    bytearray(b'\x55\x53\x4F\x44\x4E\x41\x20\x4E\x41\x50\x41\x4B\x41').decode('ISO-8859-2'), # SL "FATAL"
    bytearray(b'\x50\x41\x4E\x49\x4B\x41').decode('ISO-8859-2'), # SL "PANIC"
    bytearray(b'\x46\x45\x4C').decode('ISO-8859-1'), # SV "ERROR"
    bytearray(b'\x46\x41\x54\x41\x4C\x54').decode('ISO-8859-1'), # SV "FATAL"
    bytearray(b'\x50\x41\x4E\x49\x4B').decode('ISO-8859-1'), # SV "PANIC"
    u'HATA', u'ÖLÜMCÜL', u'KRİTİK', # TR
    bytearray(b'\xB4\xED\xCE\xF3').decode('gb2312'), # ZH_CN "ERROR"
    bytearray(b'\xD6\xC2\xC3\xFC\xB4\xED\xCE\xF3').decode('gb2312'), # ZH_CN "FATAL"
    bytearray(b'\xB1\xC8\xD6\xC2\xC3\xFC\xB4\xED\xCE\xF3\xBB\xB9\xB9\xFD\xB7\xD6\xB5\xC4\xB4\xED\xCE\xF3').decode('gb2312'), # ZH_CN "PANIC"
    u'錯誤', u'嚴重錯誤' # ZH_TW
]
description = ("""
Reads HAWQ log file(s), extracts log entries which meet
all the criteria you specify, and writes them to output
file(s) or standard output.
""")
_help = ["""
If no input file is specified, the log files under
MASTER_DATA_DIRECTORY/pg_log are used. If an input file is a
directory, the suffix ".log" is appended to obtain the input file name.
To read from standard input, specify input file "-".
""","""
To write the extracted log entries to a destination other than stdout,
use the --out option. If the destination is a directory, then for each
input file, a correspondingly named output file is written there;
otherwise the results from all input files are written to the specified
--out file or standard output.
""","""
In the input file, each log entry starts with a timestamp
"yyyy-mm-dd hh:mm:ss[.fraction]" at the beginning of a line.
Any following lines are considered to belong to the same entry,
up to the next line having a different timestamp.
""","""
--begin and --end timestamp values can be specified as either
"yyyy-mm-dd[ hh:mm[:ss]]" or "yyyymmdd[ hhmm[ss]]". Between date
and time, either a space or the letter "T" is required. When only
the date is given, 00:00:00 is used for the time.
Specify --duration as "[hours][:minutes[:seconds]]";
it is unused if both --begin and --end are given.
""","""
The ending date and time can be given by --end, or computed as
--begin plus --duration. The beginning date and time can be
given by --begin, or computed as --end minus --duration, or
computed as the current date and time minus --duration.
""","""
Log entries are skipped unless they fulfill all of the conditions
you specify. For example, if you specify two occurrences of the
--find option, log entries must contain both of the strings.
""","""
If the destination specified by --out is a directory, the output file is
given the same name as the input file (excluding '.gz' suffix if any).
When writing compressed output, '.gz' is suffixed to the output file name.
If you specify an output file name ending in '.gz', the output is
compressed (-z9) by default.
""","""
Example:
gplogfilter -t -d2
# view trouble messages timestamped within the past two hours
"""]
def parseargs():
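    """Build the gplogfilter OptionParser, parse the command line, and
    return the (options, args) pair from parser.parse_args()."""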
    # Create our OptionParser object
    parser = OptParser(option_class=OptChecker,
                       description=' '.join(description.split()),
                       version='%prog version $Revision$')
    parser.helpStr = _help
    parser.set_usage('%prog [--help] [options] [inputfile]...')
    parser.remove_option('-h')

    # Define the options
    optgrp = OptionGroup(parser, '\n Timestamp range',
                         'Use any two of these options to impose lower and '
                         'upper bounds on timestamps; or use any one option to '
                         'bound the timestamp range on just one side. ')
    optgrp.add_option('-b', '--begin', type='datetime', metavar='datetime',
                      help='beginning date and time: "yyyy-mm-dd[ hh:mm[:ss]]"')
    optgrp.add_option('-e', '--end', type='datetime', metavar='datetime',
                      help='ending date and time')
    optgrp.add_option('-d', '--duration', type='duration', metavar='[h][:m[:s]]',
                      help='duration from beginning to end')
    optgrp.add_option('--prunefiles', action='store_true', default=False,
                      help="discard files based on a filename of the form hawq-%Y-%m-%d_%H%M%S.csv")
    parser.add_option_group(optgrp)
    optgrp = OptionGroup(parser, 'Pattern and string matching',
                         'Log entries can be chosen depending on whether '
                         'they contain a match for a pattern (regular '
                         'expression) or string. Matching of alphabetic '
                         'characters is case-sensitive unless preceded '
                         'by --case=ignore. These options can be used as many '
                         'times as needed to apply multiple restrictions. '
                         'Regular expression syntax is documented at '
                         'http://docs.python.org/lib/re-syntax.html')
    optgrp.add_option('-f', '--find', type='literal', metavar='string',
                      dest='filters', action='MatchRegex',
                      help='select log entries containing string')
    optgrp.add_option('-F', '--nofind', type='literal', metavar='string',
                      dest='filters', action='NoMatchRegex',
                      help='reject log entries containing string')
    optgrp.add_option('-m', '--match', type='regex', metavar='regex',
                      dest='filters', action='MatchRegex',
                      help='select log entries where a match for the regex is found')
    optgrp.add_option('-M', '--nomatch', type='regex', metavar='regex',
                      dest='filters', action='NoMatchRegex',
                      help='reject log entries where a match for the regex is found')
    optgrp.add_option('-t', '--trouble', action='store_true',
                      help='select log entries having ERROR:, FATAL:, or PANIC: '
                           'in the first line')
    optgrp.add_option('-C', '--columns', type='str', metavar='string',
                      dest='filters', action='MatchColumns',
                      help='select specific log file columns, given as a '
                           'comma-delimited string of column numbers counting from 1')
    parser.add_option_group(optgrp)
    optgrp = OptionGroup(parser, 'Mode flags',
                         'These flags affect the behavior of other options '
                         'specified to their right. They can be used '
                         'as many times as needed.')
    optgrp.add_option('-c', '--case', type='choice', choices=['i', 'ignore', 'r', 'respect'],
                      metavar='i[gnore]|r[espect]',
                      action='callback',
                      callback=OptChecker.regexSetCaseSensitivity,
                      help=('ignore or respect the distinction between '
                            'upper and lower case letters in pattern and '
                            'string matching options after this'))
    parser.add_option_group(optgrp)

    optgrp = OptionGroup(parser, 'Final selection',
                         'Limit the output to a subsequence of the '
                         'qualifying log entries from each input file. '
                         'Use at most one of these options.')
    optgrp.add_option('-n', '--tail', type='int', metavar='N',
                      help='select the last N qualifying log entries')
    optgrp.add_option('-s', '--slice', type='int', metavar='I [J]',
                      action='optionalSecondArg',
                      help='select qualifying log entries I <= i < J '
                           '(0 is first; <0 is relative to the end)')
    parser.add_option_group(optgrp)
    optgrp = OptionGroup(parser, 'Input options')
    optgrp.add_option('-u', '--unzip', action='store_true',
                      help='read gzip-compressed input; assumed when inputfile suffix is ".gz"')
    parser.add_option_group(optgrp)

    optgrp = OptionGroup(parser, 'Output options')
    optgrp.add_option('-o', '--out', type='string', metavar='outputfile',
                      help='write output to specified file or directory (instead of stdout)')
    optgrp.add_option('-z', '--zip', type='choice', choices=list('0123456789'), metavar='0..9',
                      help=('compression level (gzip): 0 = no compression; '
                            '9 = maximum compression'))
    optgrp.add_option('-a', '--append', action='store_true',
                      help="when output file already exists, append to it; don't overwrite")
    parser.add_option_group(optgrp)

    optgrp = OptionGroup(parser, 'Message options')
    optgrp.add_option('-q', '--quiet', dest='verbose', action='store_false',
                      help='suppress status messages')
    optgrp.add_option('-h', '-?', '--help', action='help',
                      help='show this help message and exit')
    optgrp.add_option('--usage', action="briefhelp")
    parser.add_option_group(optgrp)

    parser.set_defaults(verbose=True, filters=[], slice=(None, None))

    # Parse the command line arguments
    (options, args) = parser.parse_args()
    return options, args
#-------------------------------------------------------------------------
def openInputFile(ifn, options):
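    """Open the input file ifn ('-' means stdin), transparently layering
    gzip decompression and CSV flattening as needed.  Returns a tuple
    (fileIn, filesToClose, ifn, zname), where zname is the input's base
    name with any '.gz' suffix stripped and '.log' defaulted."""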
    filesToClose = []
    unzip = options.unzip

    # Open input file, unless reading from stdin
    if ifn == '-':
        ifn = zname = 'stdin'
        fileIn = sys.stdin
    else:
        ifn = os.path.abspath(ifn)
        # In case a master or segment instance's data directory name
        # was given, append '.log' to get the instance's log file name
        # (hawq convention).
        if os.path.isdir(ifn):
            ifn += '.log'
        zname = os.path.split(ifn)[1]
        if ifn.endswith('.gz'):
            unzip = True
            zname = zname[0:-3]
        if os.path.splitext(zname)[1] == '':
            zname += '.log'
        fileIn = open(ifn, (unzip and 'rb') or 'rU')
        filesToClose.append(fileIn)

    # Set up input decompression
    if unzip:
        fileIn = gzip.GzipFile(zname, 'rb', fileobj=fileIn)
        filesToClose.insert(0, fileIn)

    if zname.endswith('.csv'):
        fileIn = csv.reader(fileIn, delimiter=',', quotechar='"')
        fileIn = CsvFlatten(fileIn)
    return fileIn, filesToClose, ifn, zname
def openOutputFile(ifn, zname, options):
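    """Open the destination for filtered output (stdout unless --out was
    given), honoring --append and gzip compression and appending '.out'
    or '.gz' suffixes where appropriate.  Returns (fileOut, filesToClose)."""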
    filesToClose = []

    # append or overwrite?
    if options.append:
        omode = 'a'
    else:
        omode = 'w'

    # Open the output file in binary mode so as not to disturb the
    # original line ending control characters; a compressed output
    # file must be opened in binary mode in any case.
    zipout = options.zip and options.zip > '0'
    omode += 'b'

    # Open output file, unless writing to stdout
    if options.out is None:
        fileOut = sys.stdout
    else:
        # let error messages show full path in case something goes wrong
        ofn = os.path.abspath(options.out)

        # if ofn refers to a directory, append name of input file
        if os.path.isdir(ofn):
            ofn = os.path.join(ofn, zname + ".out")

        # append .gz suffix if compressing
        if zipout and not ofn.endswith('.gz'):
            ofn += '.gz'

        # error if ofn is a directory now
        if os.path.isdir(ofn):
            raise IOError('cannot write output file because there is a '
                          'directory at the output location: %s' % ofn)

        # make sure we can write to the output directory
        odn = os.path.split(ofn)[0]
        if not os.access(odn, os.W_OK):
            raise IOError('output directory not found or not writable: %s' % odn)

        # open the file
        fileOut = open(ofn, omode)
        filesToClose.append(fileOut)
        if options.verbose:
            if options.append:
                print >>sys.stderr, ' append to ', ofn
            else:
                print >>sys.stderr, ' output to ', ofn

    # Set up output compression
    if zipout:
        fileOut = gzip.GzipFile(zname, omode,
                                compresslevel=int(options.zip),
                                fileobj=fileOut)
        filesToClose.insert(0, fileOut)
    return fileOut, filesToClose
#------------------------------- Mainline --------------------------------
coverage = GpCoverage()
coverage.start()
# Use default locale specified by LANG environment variable
try:
    locale.setlocale(locale.LC_ALL, '')
except Exception:
    pass
# Parse the command line arguments
options, args = parseargs()
# Determine timestamp range
begin, end = spiffInterval(options.begin, options.end, options.duration)
if begin:
    begin = begin.replace(microsecond=0)
if end:
    end = end.replace(microsecond=0)
# Insert trouble message filter ahead of other pattern matching filters
if options.trouble:
    options.filters.insert(0, filterize(MatchInFirstLine,
                                        (': |'.join(TROUBLE_VALUES)) + u': '))
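# The filter built above matches when the first line of an entry contains any
# localized severity word followed by ': ', i.e. a pattern of the form
# u'ERROR: |FATAL: |PANIC: |FOUT: |...'.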
# Limit output to last N entries if requested. Let --tail override --slice.
if options.tail is None:
    sliceBegin, sliceEnd = options.slice
elif options.tail > 0:
    sliceBegin, sliceEnd = -options.tail, None
else:
    sliceBegin, sliceEnd = 0, 0
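# e.g. --tail 5 yields the slice (-5, None), i.e. the last five qualifying
# entries, analogous to seq[-5:] on a Python list.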
# Output suffix .gz implies maximum compression unless overridden by -z
if (options.zip is None and
        options.out and
        options.out.endswith('.gz')):
    options.zip = '9'
try:
    # If no inputfile arg, use MASTER_DATA_DIRECTORY variable as default
    if len(args) == 0:
        s = gp.get_masterdatadir()
        if s:
            # we only support log rotation in the pg_log dir.
            if os.path.exists(s + "/pg_log"):
                for logfile in os.listdir(s + "/pg_log"):
                    args.append(s + "/pg_log/" + logfile)
            else:
                raise IOError('specify input file or "-" for standard input')
        else:
            raise IOError('specify input file or "-" for standard input')

    # In MS Windows, apply shell wildcard expansion to input filename list
    if sys.platform == 'win32':
        import glob
        newargs = []
        for a in args:
            names = glob.glob(a)
            names.sort()
            newargs.extend(names)
        args = newargs
    # Two distinct lists; a single aliased list would make the cleanup in
    # the finally clause close each file twice.
    inputFilesToClose, outputFilesToClose = [], []
    fileOut = None
    try:
        # Output to a directory?
        outputFilePerInputFile = False
        if options.out:
            options.out = os.path.abspath(options.out)
            if os.path.isdir(options.out):
                outputFilePerInputFile = True

        if options.verbose:
            msg = ('requested timestamp range from %s to %s'
                   % (begin or 'beginning of data', end or 'end of data'))
            print >>sys.stderr, msg

        # Loop over input files
        for ifn in args:
"""
Open each file in the logs directory. Check to see if the file name
looks anything like a log file name with a time stamp that we
recognize. If true, and the user specified a time range, skip the
file if it is outside the range. That is, close the file and any
associated temporary files.
All other files with names that do not look like time stamps are
processed. That is, their log information is extracted, and if
the user specified a time range, only those entries that are
within that range are kept.
"""
            # Open next input file
            fileIn, inputFilesToClose, ifn, zname = openInputFile(ifn, options)

            # if --prunefiles was given and we can skip the whole file, do so
            if options.prunefiles and zname.startswith('hawq') and zname.endswith('.csv'):
                goodFormat = True
                try:
                    # try format YYYY-MM-DD_HHMMSS
                    filedate = datetime.strptime(zname[5:-4], '%Y-%m-%d_%H%M%S')
                except ValueError:
                    try:
                        # try format YYYY-MM-DD
                        filedate = datetime.strptime(zname[5:-4], '%Y-%m-%d')
                    except ValueError:
                        # the format isn't anything I understand
                        goodFormat = False
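                # e.g. zname 'hawq-2013-05-23_140000.csv' parses via the first
                # format to datetime(2013, 5, 23, 14, 0, 0); a hypothetical
                # 'hawq-foo.csv' fails both formats, leaving goodFormat False.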
                # Skip the file when its filename timestamp falls outside
                # the requested range
                if goodFormat and ((begin and filedate < begin) or
                                   (end and filedate > end)):
                    print >>sys.stderr, "SKIP file: %s" % zname
                    for f in inputFilesToClose:
                        f.close()
                    inputFilesToClose = []
                    continue
            # Announce each input file *before* its output file if --out is dir
            if options.verbose and outputFilePerInputFile:
                print >>sys.stderr, '---------- ', ifn, '---------- '

            # Open the output file (once per input file if --out is a directory)
            if fileOut is None:
                fileOut, outputFilesToClose = openOutputFile(ifn, zname, options)

            # Announce input files *after* single output file
            if options.verbose and not outputFilePerInputFile:
                print >>sys.stderr, '---------- ', ifn, '---------- '

            # Construct the filtering pipeline
            filteredInput = FilterLogEntries(fileIn,
                                             verbose=options.verbose,
                                             beginstamp=begin,
                                             endstamp=end,
                                             filters=options.filters,
                                             ibegin=sliceBegin,
                                             jend=sliceEnd)

            # Write filtered lines to output file.  Don't append \n to
            # each line, because the original line ends are still there.
            for line in filteredInput:
                print >>fileOut, line,

            # Close input and output files
            for file in inputFilesToClose:
                file.close()
            inputFilesToClose = []
            if outputFilePerInputFile:
                fileOut = None
                for file in outputFilesToClose:
                    file.close()
                outputFilesToClose = []
    finally:
        for file in outputFilesToClose:
            file.close()
        for file in inputFilesToClose:
            file.close()
except IOError, msg:
    execname = os.path.basename(sys.argv[0])
    print >>sys.stderr, '%s: (IOError) "%s"' % (execname, msg)
    sys.exit(2)
finally:
    coverage.stop()
    coverage.generate_report()