blob: 9298cba7b06a0ce5cd5f71828dd37d850568011f [file] [log] [blame]
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# $Id: $
# $Change: $
# $DateTime: $
# $Author: $
"""
Toolkit for filtering input from GPDB server logs
Module contents:
---- High level filters
FilterLogEntries() - the one-stop filter shop
---- Identity filters which observe a stream without altering it
Count() - count items fetched from a stream
TimestampSpy() - count lines and note timestamp range
---- Grouping and ungrouping filters
GroupByTimestamp() - group lines with same timestamp
Ungroup() - decompose groups into lines
EnumerateUngroup() - number the groups and decompose into lines
---- Pattern matching filters
MatchRegex() - select lines or groups in which a regular expr is matched
NoMatchRegex() - select lines or groups in which a regular expr has no match
MatchInFirstLine() - select groups in which regex has a match in first line
NoMatchInFirstLine() - select groups in which regex doesn't match in first line
---- Slicing filters
Slice() - select items in Pythonesque slice of stream[begin:end]
FirstNItems() - select the first n items of a stream
LastNItems() - select the last n items of a finite stream
SkipNItems() - select all but the first n items of a stream
SkipLastNItems() - select all but the last n items of a finite stream
IntersectionOfHeadAndTail() - select items within first m and last n
---- Miscellaneous filters
NotNull() - drop items which are equivalent to False
---- Utilities for setting up filter parameters
filterize() - wrap a filter for use in FilterLogEntries filter list
spiffInterval() - get begin/end datetime given any subset of begin/end/duration
"""
from datetime import date, datetime
import re
import sys
import time
# Compiled once at import time and shared by the filters in this module,
# which apply it with .match() so it is anchored at the start of a line.
timestampPattern = re.compile(r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d(\.\d*)?')
# This pattern matches the date and time stamp at the beginning of a line
# in a GPDB log file. The timestamp format is: YYYY-MM-DD HH:MM:SS[.frac]
# The fractional part is optional; group(0) of a match is the full stamp.
# A timezone specifier may follow the timestamp, but we ignore that.
def FilterLogEntries(iterable,
                     msgfile=sys.stderr,
                     verbose=False,
                     beginstamp=None,
                     endstamp=None,
                     include=None,
                     exclude=None,
                     filters=None,
                     ibegin=0,
                     jend=None):
    """
    Generator to consume the lines of a GPDB log file from iterable,
    yield the lines which satisfy the given criteria, and skip the rest.

    iterable should be a sequence of strings, an already-open input file,
    or some object which supports iteration and yields strings.

    verbose, if True, causes status messages to be written to msgfile,
    which should be an already-open output file.  The messages are written
    only after the caller has consumed the generator to the end.

    For our purposes, a log entry consists of a line which starts with a
    timestamp in YYYY-MM-DD HH:MM:SS[.fraction] format, followed by zero
    or more lines having the same timestamp or no timestamp.

    beginstamp should be a datetime.datetime or datetime.date object, or
    None.  Log entries are skipped if their timestamp is less than the
    specified date and time.  Fractional seconds and timezones are ignored.
    If a date object is given, it is converted to a datetime with time
    00:00:00.

    endstamp is like beginstamp, except that it causes log entries to be
    skipped if their timestamp is greater than or equal to the specified
    date and time.

    include should be a regular expression object returned by the
    re.compile() method; or a string specifying a regular expression
    according to the rules of the re package in the Python standard
    library; or a list of such objects and/or strings; or None.  A log
    entry is skipped if there is an include regex which - in every line
    of the entry - fails to match.

    exclude is like include, except that it causes a log entry to be
    skipped if there is an exclude regex which matches in some line of
    the entry.

    filters is a sequence of callables, or None for no extra filters.
    Each callable will be called just once, with one argument: an input
    stream, which will be an iterator yielding groups.  (Here a 'group'
    is a sequence of strings: the lines of a log entry.)  The callable
    should return an iterator yielding filtered groups.  The filters are
    applied in the order given, downstream of the
    begin/end/include/exclude filters.  For example, this filter selects
    log entries with 'DEBUGn:' in the first line...
        lambda iterable: MatchInFirstLine(iterable, r'DEBUG\d:')
    The filterize() function, defined later in this module, is useful for
    building the list of filters.

    ibegin and jend should be integers or None.  They can be specified
    like the bounds of a Python slice, to select a subrange of the log
    entries which satisfy all the preceding criteria.  Values >= 0 are
    counted from the beginning of the stream; values < 0 are counted from
    the end of the stream.  0 is before the first qualifying log entry;
    1 is after the first and before the second; -1 is before the last.
    Entries coming before the ibegin point or after the jend point are
    skipped.  For example, jend=3 to select only the first 3 qualifying
    log entries; or ibegin=-3 to extract the last 3 entries.  A jend equal
    to the platform's maximum integer is treated like None.

    Regular expression syntax is at
    http://docs.python.org/lib/re-syntax.html

    At the beginning of a log file before the first timestamped line
    there could be some lines with no timestamp.  If beginstamp or
    endstamp is not None, any such lines are skipped.  Otherwise they are
    grouped together and treated as one log entry.
    """
    # 'basestring' exists only on Python 2; fall back to the Python 3 types.
    try:
        string_types = basestring
    except NameError:
        string_types = (str, bytes)

    if filters is None:
        filters = ()

    iterable = iter(iterable)
    spyIn = countIn = spyMid = spyMatch = spyOut = countOut = None

    # sys.maxint is gone in Python 3; sys.maxsize is its replacement.
    if jend is not None and jend == getattr(sys, 'maxint', sys.maxsize):
        jend = None

    # Collect unfiltered input statistics
    if verbose:
        iterable = spyIn = TimestampSpy(iterable)

    # Build filter pipeline
    if include or exclude or filters or ibegin or (jend is not None):
        # We want patterns to be tested entry-by-entry rather than
        # line-by-line, so group together the lines of each entry.
        iterable = GroupByTimestamp(iterable)

        # Count the unfiltered log entries
        if verbose:
            iterable = countIn = Count(iterable)

        # Select log entries such that beginstamp <= timestamp < endstamp
        if beginstamp or endstamp:
            iterable = TimestampInBounds(iterable, beginstamp, endstamp)
            if verbose:
                iterable = spyMid = TimestampSpy(iterable)

        # Keep only log entries matching every include pattern.
        if (isinstance(include, string_types) or   # one string
                hasattr(include, 'search')):       # or compiled regex
            include = [include]
        if include:
            for regex in include:
                iterable = MatchRegex(iterable, regex)

        # Drop log entries matching any exclude pattern.
        if (isinstance(exclude, string_types) or   # one string
                hasattr(exclude, 'search')):       # or compiled regex
            exclude = [exclude]
        if exclude:
            for regex in exclude:
                iterable = NoMatchRegex(iterable, regex)

        # Append caller's filters to the pipeline.
        for func in filters:
            iterable = func(iterable)

        # Collect match/filter statistics, but only if some pattern or
        # caller filter was actually added after the previous spy/counter.
        if verbose and iterable is not (spyMid or countIn):
            iterable = spyMatch = TimestampSpy(iterable)

        # After all other filtering, extract slice of qualifying entries.
        if ibegin or jend is not None:
            iterable = Slice(iterable, ibegin, jend)

        # Count final output log entries
        if verbose:
            iterable = countOut = Count(iterable)

        # Break the groups back down into lines for output.
        iterable = Ungroup(iterable)

        # Collect final statistics
        if verbose:
            iterable = spyOut = TimestampSpy(iterable)

    elif beginstamp or endstamp:
        # Only time filtering was requested: work line-by-line.
        iterable = TimestampInBounds(iterable, beginstamp, endstamp)
        if verbose:
            iterable = spyOut = spyMid = TimestampSpy(iterable)

    else:
        # Caller didn't request any filtering.
        spyOut = spyIn

    # Pull filtered lines out of the pipeline and yield them to caller
    for line in iterable:
        yield line

    # Display statistics if requested
    if verbose:
        _report_filter_statistics(msgfile, spyIn, countIn, spyMid,
                                  spyMatch, spyOut, countOut)


def _report_filter_statistics(msgfile, spyIn, countIn, spyMid, spyMatch,
                              spyOut, countOut):
    """Write the verbose per-stage summary of FilterLogEntries to msgfile."""
    def emit(msg):
        # Same output as the Python 2 statement 'print >>msgfile, msg'.
        msgfile.write(msg + '\n')

    def stage(fmt, spy, entries):
        # Build one summary line; 'entries' is None to omit the entry count.
        msg = fmt % spy.lines
        if entries is not None:
            msg += ', %7d log entries' % entries
        srange = spy.str_range()
        if srange:
            msg += '; timestamps from %s to %s' % srange
        return msg, srange

    # Did we even try to read any input?
    if spyIn.items == 0 and spyOut.items == 0 and not spyIn.eod:
        emit('%7d lines processed; an unsatisfiable condition '
             'was specified' % 0)
        return

    # Unfiltered input statistics
    entries = None
    if countIn:
        entries = countIn.count()
    msg, srange = stage(' in: %7d lines', spyIn, entries)
    if not srange:
        msg += '; no timestamps found'
    if not spyIn.eod:
        msg += '; stopped before end of input'
    emit(msg)

    # Entries where begin <= timestamp < end
    if spyMid:
        emit(stage(' time ok: %7d lines', spyMid, spyMid.groups or None)[0])

    # After applying include/exclude/filters
    if spyMatch:
        emit(stage(' match: %7d lines', spyMatch,
                   spyMatch.groups or None)[0])

    # Final output statistics
    entries = None
    if countOut:
        entries = countOut.count()
    emit(stage(' out: %7d lines', spyOut, entries)[0])
#---------------------------- CSV flattening -----------------------------
class CsvFlatten(object):
    """
    Iterator used to flatten a CSV parsed log line into something that
    looks like the old format: the fields joined with '|' and terminated
    by a newline.

    CsvFlatten(iterable) -> iterator

    iterable -- a sequence, iterator, or other object which supports
        iteration, yielding sequences of strings (one per log row).
        Each row must have at least 17 fields, otherwise IndexError is
        raised when that row is fetched.
    """
    def __init__(self, iterable):
        self.source = iter(iterable)

    def __iter__(self):
        return self

    def next(self):
        # Work on a copy so the caller's row is not modified.
        fields = list(next(self.source))
        # we need to make a minor format change to the log level field
        # (index 16) so that our single regex will match both formats.
        fields[16] = fields[16] + ": "
        return '|'.join(fields) + "\n"

    # Python 3 spelling of the iterator protocol.
    __next__ = next
#------------------------------- Spying --------------------------------
class Count(object):
    """
    Iterator to pass through a stream of items while counting them.

    Count(iterable) -> iterator

    iterable -- a sequence, iterator, file, or other object which
        supports iteration, yielding items of any type.

    count() returns the number of items fetched through this iterator
    so far.
    """
    def __init__(self, iterable):
        self.source = iter(iterable)
        self.n = 0

    def __iter__(self):
        return self

    def next(self):
        # Let StopIteration from the source propagate; only items which
        # were actually delivered are counted.
        item = next(self.source)
        self.n += 1
        return item

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def count(self):
        return self.n
class TimestampSpy(object):
    """
    Iterator to pass through a stream of GPDB log entries while noting the
    lowest and highest timestamps, the number of lines and groups, etc.

    TimestampSpy(iterable) -> iterator

    iterable -- a sequence, iterator, file, or other object which
        supports iteration.  Each item returned by its next() method
        must be either a string (a line of GPDB log data) or a group
        (a sequence of strings where the timestamp, if any, is at the
        beginning of the first string).

    Attributes updated as items pass through:
        items  -- number of items fetched (lines or groups)
        lines  -- number of lines seen (group members are counted)
        groups -- number of groups seen (0 for ungrouped input)
        eod    -- True once the source has been exhausted
    """
    # 'basestring' exists only on Python 2; fall back to Python 3 types.
    try:
        _string_types = basestring
    except NameError:
        _string_types = (str, bytes)

    def __init__(self, iterable):
        self.source = iter(iterable)
        self.minstamp = '\xff'      # sorts after any digit-led timestamp
        self.maxstamp = ''          # sorts before any timestamp
        self.items = 0
        self.lines = 0
        self.groups = 0
        self.eod = False

    def __iter__(self):
        return self

    def next(self):
        try:
            item = next(self.source)
        except StopIteration:
            self.eod = True
            raise
        self.items += 1
        if isinstance(item, self._string_types):    # ungrouped input
            s = item                    # item is a string
            self.lines += 1
        elif len(item) > 0:             # grouped input
            s = item[0]                 # item is a sequence of strings
            self.lines += len(item)
            self.groups += 1
        else:                           # item is an empty sequence
            s = ''
            self.groups += 1
        # Cheap string comparison first; the regex is only tried when the
        # line could extend the current range.
        if self.minstamp > s:
            m = timestampPattern.match(s)
            if m:
                self.minstamp = m.group(0)
        if self.maxstamp < s:
            m = timestampPattern.match(s)
            if m:
                self.maxstamp = m.group(0)
        return item

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def str_range(self):
        """Return (lowest, highest) timestamp strings, or None if no
        timestamp has been seen."""
        if self.maxstamp == '':
            return None
        return (self.minstamp, self.maxstamp)

    def datetime_range(self):
        """Return (lowest, highest) timestamps as datetime objects, or
        None if no timestamp has been seen.  Fractional seconds are
        dropped."""
        if self.maxstamp == '':
            return None
        fmt = '%Y-%m-%d %H:%M:%S'
        # 'YYYY-MM-DD HH:MM:SS' is 19 characters; anything after that is
        # the optional fraction, which strptime would reject.
        minstruct = time.strptime(self.minstamp[:19], fmt)[:6]
        maxstruct = time.strptime(self.maxstamp[:19], fmt)[:6]
        return (datetime(*minstruct), datetime(*maxstruct))
#------------------------------- Grouping --------------------------------
def GroupByTimestamp(iterable, skipnull=True):
    """
    Generator to consume lines of a GPDB log and group them into lists.
    A new group is started whenever the timestamp changes.

    The first group yielded is a list of all lines found before the first
    timestamped line; it is omitted when empty unless skipnull is False.
    Subsequently, each group is a list of one or more lines, in which the
    first line starts with a timestamp, and the rest have the same
    timestamp or no timestamp.

    The inverse of GroupByTimestamp is Ungroup.

    GroupByTimestamp(iterable, skipnull) -> iterator

    iterable -- a sequence, iterator, file, or other object which
        supports iteration.  Each item it yields must be a string.
    skipnull -- True to omit the first group if it is empty.

    Example:
        # Print log entries from stdin having 'ERROR:' in the first line.
        import sys
        for lines in GroupByTimestamp(sys.stdin):
            if 'ERROR:' in lines[0]:
                for line in lines:
                    sys.stdout.write(line)
    """
    source = iter(iterable)

    # Build list of lines preceding the first timestamped line, and yield
    # it as the first group (could be empty).
    lines = []
    tsmatch = None
    for s in source:
        tsmatch = timestampPattern.match(s)
        if tsmatch:
            break
        lines.append(s)
    if lines or not skipnull:
        yield lines

    # Here tsmatch is a match object exactly when 's' holds a timestamped
    # line that has been read but not yet placed in a group; it is None
    # once the input is exhausted.
    while tsmatch:
        # Start a new group.  Save timestamp from its first line.
        lines = [s]
        timestamp = tsmatch.group(0)
        tsmatch = None
        # Any more lines with same (or no) timestamp?  Add them to the
        # list.  The same iterator is resumed where the last loop stopped.
        for s in source:
            if not s.startswith(timestamp):
                tsmatch = timestampPattern.match(s)
                if tsmatch:             # line has a different timestamp
                    break
            lines.append(s)
        # Current group is finished... output it
        yield lines
def Ungroup(iterable):
    """
    Generator that flattens a stream of groups by exactly one level.

    Every element of every group is yielded in order.  A 'group' is any
    sequence, iterator, or other iterable object; a group with no
    elements contributes nothing to the output.  Elements which are
    themselves groups are yielded as they are, not flattened further.

    Ungroup(iterable) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration, each of whose items is itself iterable.

    Example:
        # Yields 1, 2, 3, 4, 5, 6, 7, the list [8, [], 9], and 10
        Ungroup([[1, 2], [3, 4, 5], [], (6, 7), [[8, [], 9], 10]])
    """
    flattened = (member for cluster in iterable for member in cluster)
    for member in flattened:
        yield member
def EnumerateUngroup(iterable):
    """
    Generator that flattens a stream of groups by one level while tagging
    each element with the ordinal of the group it came from.

    Yields pairs (i, e) where i is the zero-based position of a group in
    the input and e is an element of that group.  A 'group' is any
    sequence, iterator, or other iterable object.  An empty group yields
    nothing but still uses up its number.

    EnumerateUngroup(iterable) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration, each of whose items is itself iterable.
    """
    for ordinal, cluster in enumerate(iterable):
        for member in cluster:
            yield ordinal, member
#-------------------------------------------------------------------------
def TimestampInBounds(iterable, begin, end):
    """
    Generator to extract GPDB log entries in a datetime interval:
        begin <= timestamp < end

    TimestampInBounds(iterable, begin, end) -> iterator

    iterable -- a sequence, iterator, file, or other object which
        supports iteration.  Each item it yields must be either a string
        (a line of GPDB log data) or a group (a sequence of strings where
        the timestamp, if any, is at the beginning of the first string).
        The type of the first item decides how the whole stream is
        treated.
    begin -- a date or datetime giving the lower bound of the interval;
        or None for no lower bound.
    end -- a date or datetime giving the upper bound of the interval;
        or None for no upper bound.

    Example:
        # Print log entries from stdin dated 2008-05-01.
        from datetime import date
        import sys
        for s in TimestampInBounds(sys.stdin, date(2008,5,1), date(2008,5,2)):
            sys.stdout.write(s)

    At the beginning of a log file there may be some lines with no
    timestamp, preceding the first timestamped entry.  Any such lines are
    skipped.  Bounds are compared as text against the start of the line,
    so a line only counts as an entry if it really starts with a
    timestamp.
    """
    # 'basestring' exists only on Python 2; fall back to Python 3 types.
    try:
        string_types = basestring
    except NameError:
        string_types = (str, bytes)

    def as_text(bound, unbounded):
        # datetime objects have an 'hour' attribute; plain dates do not.
        if bound is None:
            return unbounded
        if hasattr(bound, 'hour'):
            return bound.strftime('%Y-%m-%d %H:%M:%S')
        return bound.strftime('%Y-%m-%d')

    begin = as_text(begin, '0000-00-00')
    end = as_text(end, '9999-99-99')

    # Quit immediately if there cannot be timestamps within the interval.
    if begin >= end:
        return

    # Fetch first item from input stream.  An empty stream ends the
    # generator normally; StopIteration must not escape a generator body.
    source = iter(iterable)
    try:
        first = next(source)
    except StopIteration:
        return

    def replay():
        # The first item again, followed by the rest of the stream.
        yield first
        for later in source:
            yield later

    # If first item is a string, assume input consists of individual
    # lines.  Yield lines which start with a timestamp within the given
    # bounds, plus any following lines which do not have timestamps.
    if isinstance(first, string_types):
        withinbounds = False
        for item in replay():
            # Only a timestamped line may open or close the window; an
            # untimestamped line inherits the state of its entry.
            if timestampPattern.match(item):
                withinbounds = begin <= item < end
            if withinbounds:
                yield item
        return

    # Else assume input consists of groups (i.e. sequences) of lines.
    # Yield groups in which the first line starts with a timestamp within
    # the given bounds.  Skip groups which are empty or have no timestamp.
    for item in replay():
        if (len(item) > 0 and
                begin <= item[0] < end and
                timestampPattern.match(item[0])):
            yield item
#--------------------------- Pattern Matching ----------------------------
def MatchRegex(iterable, regex):
    """
    Generator to filter a stream, selecting items in which there is a
    match for a regular expression.

    MatchRegex(iterable, regex) -> iterator

    iterable -- a sequence, iterator, file, or other object which
        supports iteration.  Each item it yields must be either a string
        or a group.  Here the term 'group' means a sequence of strings
        or an iterable yielding strings.
    regex -- a regular expression string, or a regular expression
        object returned by re.compile().  The filter excludes items
        in which there is no match for the regex.

    A string item is kept if the regex matches anywhere in it; a group is
    kept if the regex matches in at least one of its strings.

    Example:
        # Print lines from stdin which contain the string 'ERROR:'
        import re, sys
        for s in MatchRegex(sys.stdin, re.compile('ERROR:')):
            sys.stdout.write(s)
    """
    # 'basestring' exists only on Python 2; fall back to Python 3 types.
    try:
        string_types = basestring
    except NameError:
        string_types = (str, bytes)

    # Anything without a search() method is taken to be pattern text.
    if not hasattr(regex, 'search'):
        regex = re.compile(regex)

    for item in iterable:
        if isinstance(item, string_types):      # item is a string
            if regex.search(item):
                yield item
        else:                                   # item is a group of strings
            for s in item:
                if regex.search(s):
                    yield item
                    break
def NoMatchRegex(iterable, regex):
    """
    Generator to filter a stream, selecting items in which there is no
    match for the regular expression.

    NoMatchRegex(iterable, regex) -> iterator

    iterable -- a sequence, iterator, file, or other object which
        supports iteration.  Each item it yields must be either a string
        or a group.  Here the term 'group' means a sequence of strings
        or an iterable yielding strings.
    regex -- a regular expression string, or a regular expression
        object returned by re.compile().  The filter excludes items
        in which there is a match for the regex.

    A string item is kept if the regex matches nowhere in it; a group is
    kept if the regex matches in none of its strings, so an empty group
    is always kept.

    Example:
        # Print log entries from stdin which don't contain 'HINT'
        import sys
        for s in Ungroup(NoMatchRegex(GroupByTimestamp(sys.stdin), 'HINT')):
            sys.stdout.write(s)
    """
    # 'basestring' exists only on Python 2; fall back to Python 3 types.
    try:
        string_types = basestring
    except NameError:
        string_types = (str, bytes)

    # Anything without a search() method is taken to be pattern text.
    if not hasattr(regex, 'search'):
        regex = re.compile(regex)

    for item in iterable:
        if isinstance(item, string_types):      # item is a string
            if not regex.search(item):
                yield item
        else:                                   # item is a group of strings
            for s in item:
                if regex.search(s):
                    break
            else:
                # Loop finished without finding a match in any string.
                yield item
def MatchInFirstLine(iterable, regex):
    """
    Generator to filter a stream of groups.  Yields those groups whose
    first line contains a match for the given regex.  Empty groups are
    skipped.

    MatchInFirstLine(iterable, regex) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration.  Each item it yields must be a group.  Here the term
        'group' means a sequence of strings.
    regex -- either a string specifying a regular expression, or a
        regular expression object returned by re.compile()

    Example:
        # Print log entries whose first line contains 'ERROR:'
        for s in Ungroup(MatchInFirstLine(GroupByTimestamp(sys.stdin),
                                          'ERROR:')):
            sys.stdout.write(s)
    """
    # Anything without a search() method is taken to be pattern text.
    if not hasattr(regex, 'search'):
        regex = re.compile(regex)
    for group in iterable:
        if len(group) > 0 and regex.search(group[0]):
            yield group
def NoMatchInFirstLine(iterable, regex):
    """
    Generator to filter a stream of groups.  Skips those groups whose
    first line contains a match for the given regex; yields all other
    groups, including empty ones.

    NoMatchInFirstLine(iterable, regex) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration.  Each item it yields must be a group.  Here the term
        'group' means a sequence of strings.
    regex -- either a string specifying a regular expression, or a
        regular expression object returned by re.compile()

    Example:
        # Print log entries whose first line does not contain 'DEBUGn:'
        for s in Ungroup(NoMatchInFirstLine(GroupByTimestamp(sys.stdin),
                                            r'DEBUG\d:')):
            sys.stdout.write(s)
    """
    # Anything without a search() method is taken to be pattern text.
    if not hasattr(regex, 'search'):
        regex = re.compile(regex)
    for group in iterable:
        if len(group) == 0 or regex.search(group[0]) is None:
            yield group
def MatchColumns(iterable, cols):
    """
    Generator to project each line of each group onto selected
    '|'-separated columns.

    MatchColumns(iterable, cols) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration.  Each item it yields must be a group, i.e. a sequence
        of strings; a bare string would be walked character by character.
    cols -- the 1-based column numbers to keep: either a comma-separated
        string such as '1,3' or an iterable of values accepted by int().

    For every input group one list is yielded (possibly empty).  It holds,
    for each line with at least one selected column, the selected fields
    in their original order joined with '|' and followed by a newline.

    NOTE(review): the input line's own trailing newline stays attached to
    its last field, so selecting the last column produces a doubled
    newline.  Left unchanged here; confirm whether callers rely on it.
    """
    if hasattr(cols, 'split'):          # comma-separated string
        cols = cols.split(',')
    # A set, not map(): the selection is tested once per field of every
    # line, and on Python 3 a map object would be used up by those tests.
    wanted = frozenset(int(c) for c in cols)

    for item in iterable:
        ret = []
        for s in item:
            out = [field
                   for number, field in enumerate(s.split('|'), 1)
                   if number in wanted]
            if out:
                ret.append('|'.join(out) + "\n")
        yield ret
#-------------------------------- Slicing --------------------------------
def Slice(iterable, begin=0, end=None):
    """
    Return an iterable over the items of iterable[begin:end], like a
    Python slice.

    Slice(iterable, begin, end) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration, yielding items of any type.  If begin or end is
        negative, the iteration must be finite.
    begin -- integer or None.  If >=0, number of initial items to skip.
        If <0, skip items preceding this number of final items.
    end -- integer or None.  If >=0, maximum number of items to be
        fetched from the head of the input iterable.  If <0, number
        of items to be excluded at the end of the input stream.
        The platform's maximum integer is treated like None.
    """
    if begin is None:
        begin = 0
    # sys.maxint is gone in Python 3; sys.maxsize is its replacement.
    if end is not None and end == getattr(sys, 'maxint', sys.maxsize):
        end = None

    if begin >= 0:
        iterable = SkipNItems(iterable, begin)
        if end is None:
            pass
        elif end >= 0:
            # 'end' counts from the start of the original stream, of
            # which 'begin' items have already been skipped.
            iterable = FirstNItems(iterable, end - begin)
        else:
            iterable = SkipLastNItems(iterable, -end)
    elif end is None:
        iterable = LastNItems(iterable, -begin)
    elif end < 0:
        iterable = LastNItems(iterable, -begin, -end)
    else:
        iterable = IntersectionOfHeadAndTail(iterable, end, -begin)
    return iterable
def FirstNItems(iterable, n):
    """
    Return an iterable over the first n items of a stream.

    FirstNItems(iterable, n) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration, yielding items of any type.
    n -- an integer or None.  None returns the input unchanged; a value
        <= 0 returns an empty list.

    No more than n items are fetched from the input.  If the input has
    fewer than n items, all of them are yielded.

    Example:
        # Read and print 5 lines from standard input
        for s in FirstNItems(sys.stdin, 5):
            sys.stdout.write(s)
    """
    def FNI(iterable, n):
        # n > 0 is guaranteed by the caller below.  Stop right after the
        # n-th item so that the (n+1)-th is never fetched; a short input
        # simply ends the loop instead of leaking StopIteration.
        for item in iterable:
            yield item
            n -= 1
            if n <= 0:
                break

    if n is None:
        pass
    elif n <= 0:
        iterable = []
    else:
        iterable = FNI(iterable, n)
    return iterable
def LastNItems(iterable, n, dropLastN=0):
    """
    Return an iterable over the final n items of a finite stream.  Of
    those final items, the last dropLastN items are omitted if
    dropLastN > 0.

    The number of items yielded is max(0, min(n, N) - dropLastN) where
    N is the number of items yielded by the input iterable.

    LastNItems(iterable, n, dropLastN) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration, yielding items of any type.
    n -- integer or None.  None means no limit on the number of final
        items kept; the input is then returned unchanged unless
        dropLastN > 0.
    dropLastN -- integer

    Example:
        # Read a file and print its last 5 lines
        f = open(filename1, 'r')
        for line in LastNItems(f, 5):
            sys.stdout.write(line)
        # Read the last 5 lines of a file.  Print them, excluding the
        # last 3.
        f = open(filename2, 'r')
        for line in LastNItems(f, 5, 3):
            sys.stdout.write(line)
    """
    from collections import deque

    def LNI(iterable, n, dropLastN):
        # A bounded deque discards the oldest item in O(1) as each new
        # one arrives; maxlen None keeps everything.
        items = deque(iterable, n)
        for _ in range(min(dropLastN, len(items))):
            items.pop()
        # Hand items out from the left, releasing each as it is yielded.
        while items:
            yield items.popleft()

    if n is None:
        if dropLastN > 0:
            iterable = LNI(iterable, None, dropLastN)
    elif n <= 0 or n <= dropLastN:
        iterable = []
    else:
        iterable = LNI(iterable, n, dropLastN)
    return iterable
def SkipNItems(iterable, n):
    """
    Return an iterable over all but the first n items of a stream.

    SkipNItems(iterable, n) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration, yielding items of any type.
    n -- an integer or None.  None or a value <= 0 returns the input
        unchanged.

    If the input has n items or fewer, nothing is yielded.

    Example:
        # Read and print lines 5 and 6 of a file
        f = open(filename, 'r')
        for line in FirstNItems(SkipNItems(f, 4), 2):
            sys.stdout.write(line)
    """
    def SNI(iterable, n):
        # A plain for loop ends cleanly when the input runs out, whether
        # that happens while skipping or while yielding.
        for item in iterable:
            if n > 0:
                n -= 1
            else:
                yield item

    if n and n > 0:
        iterable = SNI(iterable, n)
    return iterable
def SkipLastNItems(iterable, n):
    """
    Return an iterable over all but the final n items of a finite stream.

    SkipLastNItems(iterable, n) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration, yielding items of any type.
    n -- an integer or None.  None or a value <= 0 returns the input
        unchanged.

    At most n items are buffered at a time: an item is released as soon
    as n later items have been read, so the input need not fit in memory.
    """
    from collections import deque

    def SLNI(iterable, n):
        held = deque()
        for item in iterable:
            held.append(item)
            # Once more than n items are held, the oldest one cannot be
            # among the final n, so it is safe to emit.
            if len(held) > n:
                yield held.popleft()

    if n and n > 0:
        iterable = SLNI(iterable, n)
    return iterable
def IntersectionOfHeadAndTail(iterable, nhead, ntail):
    """
    Return an iterable over the items of a finite stream that lie both
    among its first 'nhead' items and among its last 'ntail' items.

    If nhead or ntail is <= 0 the result is an empty list.  Reading of
    the input stops early once no overlap is possible any more.
    """
    def overlap(stream, head_size, tail_size):
        kept = []
        for position, element in enumerate(stream):
            # Collect only while still inside the head of the stream.
            if position < head_size:
                kept.append(element)
            # Every item read beyond tail_size pushes the oldest kept
            # item out of the tail window.
            if position >= tail_size:
                kept[:1] = []
                if len(kept) == 0:
                    break
        for element in kept:
            yield element

    if nhead > 0 and ntail > 0:
        return overlap(iterable, nhead, ntail)
    return []
#------------------------ Miscellaneous Filters ------------------------
def NotNull(iterable):
    """
    Return an iterator over the items of a stream which are true in a
    boolean context, dropping the rest (empty groups, empty strings,
    None, zero, ...).

    NotNull(iterable) -> iterator

    iterable -- a sequence, iterator, or some object which supports
        iteration, yielding items of any type.
    """
    def keep_truthy(source):
        for candidate in source:
            if candidate:
                yield candidate

    # iter() is applied right away so that a non-iterable argument is
    # rejected at call time rather than on first use.
    return keep_truthy(iter(iterable))
#-------------------------- Utility Functions --------------------------
def filterize(Filter, *args, **kwargs):
    """
    Return a function of one argument (an input stream) which, when
    called, will return the result of applying the given Filter function
    to that argument together with any additional args and kwargs.

    If no additional args or kwargs are given, Filter itself is returned.

    This is useful for building a 'filters' list to be passed to
    FilterLogEntries.

    Example:
        # Print log entries containing the string 'Detail:'
        import sys
        filters = []
        filters.append(filterize(MatchRegex, 'Detail:'))
        for line in FilterLogEntries(sys.stdin, filters=filters):
            sys.stdout.write(line)
    """
    if not (args or kwargs):
        return Filter

    def bound(stream):
        # The stream always goes first, ahead of the captured arguments.
        return Filter(stream, *args, **kwargs)
    return bound
def spiffInterval(begin=None, end=None, duration=None):
    """
    Work out a datetime interval from any combination of begin, end and
    duration.

    begin, end -- datetime.datetime or datetime.date instances, or None.
        A date (datetime.date) is promoted to a datetime
        (datetime.datetime) at 00:00:00 of that day.
    duration -- a datetime.timedelta instance, or None.  It is used only
        to fill in an endpoint that is None, and is ignored when both
        begin and end are specified.  When neither endpoint is given,
        the interval begins at the current date and time minus the
        duration and stays unbounded at the high end.

    Returns a pair (begin, end) in which each element is either an
    instance of the datetime.datetime class or None.
    """
    def promote(point):
        # Only datetimes carry an 'hour' attribute; plain dates do not.
        if point and not hasattr(point, 'hour'):
            return datetime(point.year, point.month, point.day)
        return point

    begin = promote(begin)
    end = promote(end)

    # Nothing to derive: no duration, or both endpoints already known.
    if duration is None or (begin is not None and end is not None):
        return begin, end
    if begin:
        return begin, begin + duration
    if end:
        return end - duration, end
    # Neither endpoint was given.
    return datetime.now() - duration, end