| #!/usr/bin/env python |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # $Id: $ |
| # $Change: $ |
| # $DateTime: $ |
| # $Author: $ |
| """ |
| Toolkit for filtering input from GPDB server logs |
| |
| Module contents: |
| |
| ---- High level filters |
| FilterLogEntries() - the one-stop filter shop |
| |
| ---- Identity filters which observe a stream without altering it |
| Count() - count items fetched from a stream |
| TimestampSpy() - count lines and note timestamp range |
| |
| ---- Grouping and ungrouping filters |
| GroupByTimestamp() - group lines with same timestamp |
| Ungroup() - decompose groups into lines |
| EnumerateUngroup() - number the groups and decompose into lines |
| |
| ---- Pattern matching filters |
| MatchRegex() - select lines or groups in which a regular expr is matched |
| NoMatchRegex() - select lines or groups in which a regular expr has no match |
| MatchInFirstLine() - select groups in which regex has a match in first line |
| NoMatchInFirstLine() - select groups in which regex doesn't match in first line |
| |
| ---- Slicing filters |
| Slice() - select items in Pythonesque slice of stream[begin:end] |
| FirstNItems() - select the first n items of a stream |
| LastNItems() - select the last n items of a finite stream |
| SkipNItems() - select all but the first n items of a stream |
| SkipLastNItems() - select all but the last n items of a finite stream |
| IntersectionOfHeadAndTail() - select items within first m and last n |
| |
| ---- Miscellaneous filters |
| NotNull() - drop items which are equivalent to False |
| |
| ---- Utilities for setting up filter parameters |
| filterize() - wrap a filter for use in FilterLogEntries filter list |
| spiffInterval() - get begin/end datetime given any subset of begin/end/duration |
| """ |
| |
| from datetime import date, datetime |
| import re |
| import sys |
| import time |
| |
# Matches the date and time stamp at the beginning of a line in a GPDB log
# file.  The timestamp format is: YYYY-MM-DD HH:MM:SS[.frac]
# The pattern itself is not anchored; the code in this module applies it
# with .match(), which only matches at the start of the string.
# A timezone specifier may follow the timestamp, but we ignore that.
timestampPattern = re.compile(r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d(\.\d*)?')
| |
| |
def FilterLogEntries(iterable,
                     msgfile=sys.stderr,
                     verbose=False,
                     beginstamp=None,
                     endstamp=None,
                     include=None,
                     exclude=None,
                     filters=(),
                     ibegin=0,
                     jend=None):
    """
    Generator to consume the lines of a GPDB log file from iterable,
    yield the lines which satisfy the given criteria, and skip the rest.

    iterable should be a sequence of strings, an already-open input file,
    or some object which supports iteration and yields strings.

    verbose, if True, causes status messages to be written to msgfile,
    which should be an already-open output file.  The messages are written
    only after the filtered stream has been read to its end.

    For our purposes, a log entry consists of a line which starts with a
    timestamp in YYYY-MM-DD HH:MM:SS[.fraction] format, followed by zero
    or more lines having the same timestamp or no timestamp.

    beginstamp should be a datetime.datetime or datetime.date object, or
    None.  Log entries are skipped if their timestamp is less than the
    specified date and time.  Fractional seconds and timezones are ignored.
    If a date object is given, it is treated as that date at 00:00:00.

    endstamp is like beginstamp, except that it causes log entries to be
    skipped if their timestamp is greater than or equal to the specified
    date and time.

    include should be a regular expression object returned by re.compile();
    or a string specifying a regular expression according to the rules of
    the re package in the Python standard library; or a list of such
    objects and/or strings; or None.  A log entry is skipped if there is an
    include regex which - in every line of the entry - fails to match.

    exclude is like include, except that it causes a log entry to be
    skipped if there is an exclude regex which matches in some line of the
    entry.

    filters is a sequence of callables.  Each callable will be called just
    once, with one argument: an input stream, which will be an iterator
    yielding groups.  (Here a 'group' is a sequence of strings: the lines
    of a log entry.)  The callable should return an iterator yielding
    filtered groups.  The filters are applied in the order given,
    downstream of the begin/end/include/exclude filters.  For example,
    this filter selects log entries with 'DEBUGn:' in the first line...
        lambda iterable: MatchInFirstLine(iterable, r'DEBUG\d:')
    The filterize() function, defined later in this module, is useful for
    building the list of filters.

    ibegin and jend should be integers or None.  They can be specified like
    the bounds of a Python slice, to select a subrange of the log entries
    which satisfy all the preceding criteria.  Values >= 0 are counted from
    the beginning of the stream; values < 0 are counted from the end of the
    stream.  0 is before the first qualifying log entry; 1 is after the
    first and before the second; -1 is before the last.  Entries coming
    before the ibegin point or after the jend point are skipped.  For
    example, jend=3 to select only the first 3 qualifying log entries; or
    ibegin=-3 to extract the last 3 entries.  A jend equal to the
    platform's largest int (sys.maxint, or sys.maxsize where maxint does
    not exist) is treated like None.

    Regular expression syntax is at http://docs.python.org/lib/re-syntax.html

    At the beginning of a log file before the first timestamped line there
    could be some lines with no timestamp.  If beginstamp or endstamp
    is not None, any such lines are skipped.  Otherwise they are grouped
    together and treated as one log entry.
    """
    # str/unicode on Python 2; str/bytes on Python 3.
    string_types = (str, bytes, type(u''))

    def report(text):
        # Emit one status line to the caller's message file.
        msgfile.write(text + '\n')

    iterable = iter(iterable)
    spyIn = countIn = spyMid = spyMatch = countOut = spyOut = None

    # The largest platform int is accepted as a synonym for "no upper bound".
    if jend is not None and jend == getattr(sys, 'maxint', sys.maxsize):
        jend = None

    # Collect unfiltered input statistics
    if verbose:
        iterable = spyIn = TimestampSpy(iterable)

    # Build filter pipeline
    if include or exclude or filters or ibegin or (jend is not None):
        # We want patterns to be tested entry-by-entry rather than
        # line-by-line, so group together the lines of each entry.
        iterable = GroupByTimestamp(iterable)

        # Count the unfiltered log entries
        if verbose:
            iterable = countIn = Count(iterable)

        # Select log entries such that beginstamp <= timestamp < endstamp
        if beginstamp or endstamp:
            iterable = TimestampInBounds(iterable, beginstamp, endstamp)
            if verbose:
                iterable = spyMid = TimestampSpy(iterable)

        # Keep only log entries in which every include regex has a match.
        if isinstance(include, string_types) or hasattr(include, 'search'):
            include = [include]             # one string or compiled regex
        for regex in include or ():
            iterable = MatchRegex(iterable, regex)

        # Drop log entries in which some exclude regex has a match.
        if isinstance(exclude, string_types) or hasattr(exclude, 'search'):
            exclude = [exclude]             # one string or compiled regex
        for regex in exclude or ():
            iterable = NoMatchRegex(iterable, regex)

        # Append caller's filters to the pipeline.
        for func in filters:
            iterable = func(iterable)

        # Collect match/filter statistics, but only if some pattern or
        # caller-supplied filter was actually added after the last spy.
        if verbose and iterable is not (spyMid or countIn):
            iterable = spyMatch = TimestampSpy(iterable)

        # After all other filtering, extract slice of qualifying log entries.
        if ibegin or jend is not None:
            iterable = Slice(iterable, ibegin, jend)

        # Count final output log entries
        if verbose:
            iterable = countOut = Count(iterable)

        # Break the groups back down into lines for output.
        iterable = Ungroup(iterable)

        # Collect final statistics
        if verbose:
            iterable = spyOut = TimestampSpy(iterable)

    elif beginstamp or endstamp:
        # Select log entries such that beginstamp <= timestamp < endstamp.
        # No grouping is needed, so the lines are filtered individually.
        iterable = TimestampInBounds(iterable, beginstamp, endstamp)

        # Collect final statistics
        if verbose:
            iterable = spyOut = spyMid = TimestampSpy(iterable)

    else:
        # Caller didn't request any filtering.
        spyOut = spyIn

    # Pull filtered lines out of the pipeline and yield them to caller
    for line in iterable:
        yield line

    # Display statistics if requested
    if not verbose:
        return

    # Did we even try to read any input?
    if spyIn.items == 0 and spyOut.items == 0 and not spyIn.eod:
        report('%7d lines processed; an unsatisfiable condition '
               'was specified' % 0)
        return

    # Unfiltered input statistics
    srange = spyIn.str_range()
    msg = ' in: %7d lines' % spyIn.lines
    if countIn:
        msg += ', %7d log entries' % countIn.count()
    if srange:
        msg += '; timestamps from %s to %s' % srange
    else:
        msg += '; no timestamps found'
    if not spyIn.eod:
        msg += '; stopped before end of input'
    report(msg)

    # Entries where begin <= timestamp < end
    if spyMid:
        srange = spyMid.str_range()
        msg = ' time ok: %7d lines' % spyMid.lines
        if spyMid.groups:
            msg += ', %7d log entries' % spyMid.groups
        if srange:
            msg += '; timestamps from %s to %s' % srange
        report(msg)

    # After applying include/exclude/filters
    if spyMatch:
        srange = spyMatch.str_range()
        msg = ' match: %7d lines' % spyMatch.lines
        if spyMatch.groups:
            msg += ', %7d log entries' % spyMatch.groups
        if srange:
            msg += '; timestamps from %s to %s' % srange
        report(msg)

    # Final output statistics
    srange = spyOut.str_range()
    msg = ' out: %7d lines' % spyOut.lines
    if countOut:
        msg += ', %7d log entries' % countOut.count()
    if srange:
        msg += '; timestamps from %s to %s' % srange
    report(msg)
| |
| |
| |
#--------------------------- CSV Flattening ----------------------------
class CsvFlatten(object):
    """
    Iterator used to flatten CSV-parsed log rows into '|'-delimited lines
    which look like the old log format.

    CsvFlatten(iterable) -> iterator
        iterable -- a sequence, iterator, or other object which supports
            iteration.  Each item returned must be a sequence of strings
            (one parsed CSV row) having at least 17 fields; a shorter row
            raises IndexError.

    Each row is returned as a single string: its fields joined with '|'
    and terminated by a newline.  The incoming row is not modified.
    """

    def __init__(self, iterable):
        self.source = iter(iterable)

    def __iter__(self):
        return self

    def next(self):
        # Work on a copy so the caller's row is left untouched and rows
        # delivered as tuples are accepted too.
        fields = list(next(self.source))
        # We need to make a minor format change to the log level field so
        # that our single regex will match both formats.
        fields[16] = fields[16] + ": "
        return '|'.join(fields) + "\n"

    __next__ = next     # Python 3 spelling of the iterator protocol
| |
| |
| #------------------------------- Spying -------------------------------- |
| |
class Count(object):
    """
    Iterator to pass through a stream of items while counting them.

    Count(iterable) -> iterator
        iterable -- a sequence, iterator, file, or other object which
            supports iteration, yielding items of any type.

    count() returns the number of items fetched so far.
    """
    def __init__(self, iterable):
        self.source = iter(iterable)
        self.n = 0              # number of items passed through so far

    def __iter__(self):
        return self

    def next(self):
        # StopIteration from the source propagates and is not counted.
        item = next(self.source)
        self.n += 1
        return item

    __next__ = next     # Python 3 spelling of the iterator protocol

    def count(self):
        return self.n
| |
| |
class TimestampSpy(object):
    """
    Iterator to pass through a stream of GPDB log entries while noting the
    lowest and highest timestamps, the number of lines and groups, etc.

    TimestampSpy(iterable) -> iterator
        iterable -- a sequence, iterator, file, or other object which
            supports iteration.  Each item returned by its next() method
            must be either a string (a line of GPDB log data) or a group
            (a sequence of strings where the timestamp, if any, is at the
            beginning of the first string).

    Attributes updated as items pass through:
        items  -- number of items fetched (lines or groups)
        lines  -- number of lines seen (summed over groups)
        groups -- number of groups seen (stays 0 for ungrouped input)
        eod    -- True once the source has been read to its end
    """
    def __init__(self, iterable):
        self.source = iter(iterable)
        self.minstamp = '\xff'  # sentinel: sorts after any timestamp
        self.maxstamp = ''      # sentinel: sorts before any timestamp
        self.items = 0
        self.lines = 0
        self.groups = 0
        self.eod = False

    def __iter__(self):
        return self

    def next(self):
        try:
            item = next(self.source)
        except StopIteration:
            self.eod = True
            raise
        self.items += 1

        # str/unicode on Python 2; str/bytes on Python 3.
        if isinstance(item, (str, bytes, type(u''))):   # ungrouped input
            s = item                        # item is a string
            self.lines += 1
        elif len(item) > 0:                 # grouped input
            s = item[0]                     # item is a sequence of strings
            self.lines += len(item)
            self.groups += 1
        else:                               # item is an empty sequence
            s = ''
            self.groups += 1

        # Only run the regex when the line could extend the known range.
        lower = self.minstamp > s
        higher = self.maxstamp < s
        if lower or higher:
            m = timestampPattern.match(s)
            if m:
                if lower:
                    self.minstamp = m.group(0)
                if higher:
                    self.maxstamp = m.group(0)
        return item

    __next__ = next     # Python 3 spelling of the iterator protocol

    def str_range(self):
        """
        Return (lowest, highest) timestamp strings as they appeared in the
        input, or None if no timestamp has been seen.
        """
        if self.maxstamp == '':
            return None
        return (self.minstamp, self.maxstamp)

    def datetime_range(self):
        """
        Return (lowest, highest) timestamps as datetime objects, or None if
        no timestamp has been seen.  Fractional seconds are dropped.
        """
        if self.maxstamp == '':
            return None
        # The stored stamps may carry a '.fraction' suffix which strptime
        # would reject; 'YYYY-MM-DD HH:MM:SS' is always the first 19 chars.
        minstruct = time.strptime(self.minstamp[:19], '%Y-%m-%d %H:%M:%S')[:6]
        maxstruct = time.strptime(self.maxstamp[:19], '%Y-%m-%d %H:%M:%S')[:6]
        return (datetime(*minstruct), datetime(*maxstruct))
| |
| |
| #------------------------------- Grouping -------------------------------- |
| |
def GroupByTimestamp(iterable, skipnull=True):
    """
    Generator to consume lines of a GPDB log and group them into lists.
    A new group is started whenever the timestamp changes.

    The first group yielded is the list of all lines found before the
    first timestamped line; if that list is empty it is yielded only when
    skipnull is False.  Subsequently, each group is a list of one or more
    lines, in which the first line starts with a timestamp, and the rest
    have the same timestamp or no timestamp.

    The inverse of GroupByTimestamp is Ungroup.

    GroupByTimestamp(iterable, skipnull) -> iterator
        iterable -- a sequence, iterator, file, or other object which
            supports iteration.  Each item returned must be a string.
        skipnull -- True to omit the first group if it is empty.

    Example:
        # Print log entries from stdin having 'ERROR:' in the first line.
        import sys
        for lines in GroupByTimestamp(sys.stdin):
            if 'ERROR:' in lines[0]:
                for line in lines:
                    sys.stdout.write(line)
    """
    lines = []          # the group being accumulated
    timestamp = None    # its timestamp; None while in the leading group

    for s in iterable:
        # A line repeating the current timestamp belongs to the current
        # group; skip the regex for it.
        if timestamp is None or not s.startswith(timestamp):
            tsmatch = timestampPattern.match(s)
            if tsmatch:
                # Line has a different timestamp: the current group is
                # finished.  Only the leading group may be suppressed.
                if timestamp is not None or lines or not skipnull:
                    yield lines
                lines = [s]
                timestamp = tsmatch.group(0)
                continue
        lines.append(s)

    # End of data: output the group in progress.
    if timestamp is not None or lines or not skipnull:
        yield lines
| |
| |
def Ungroup(iterable):
    """
    Generator which flattens a stream of groups by exactly one level,
    yielding each group's elements in order.  A 'group' is any sequence,
    iterator, or other iterable object.  Empty groups contribute nothing.

    Ungroup(iterable) -> iterator
        iterable -- a sequence, iterator, or some object which supports
            iteration.  Each item it yields must itself be iterable.

    Example:
        # Yields the numbers 1 to 7, then the list [8, [], 9], then 10
        Ungroup([[1, 2], [3, 4, 5], [], (6, 7), [[8, [], 9], 10]])
    """
    for members in iterable:
        for member in members:
            yield member
| |
| |
def EnumerateUngroup(iterable):
    """
    Generator which flattens a stream of groups by one level and tags each
    element with the position of its group: yields pairs (i, e) where i is
    the zero-based group number and e is an element of the i'th group.
    A 'group' is any sequence, iterator, or other iterable object.
    Empty groups yield nothing but still consume a group number.

    EnumerateUngroup(iterable) -> iterator
        iterable -- a sequence, iterator, or some object which supports
            iteration.  Each item it yields must itself be iterable.
    """
    for index, members in enumerate(iterable):
        for member in members:
            yield index, member
| |
| |
| #------------------------------------------------------------------------- |
| |
def TimestampInBounds(iterable, begin, end):
    """
    Generator to extract GPDB log entries in a datetime interval:
            begin <= timestamp < end

    TimestampInBounds(iterable, begin, end) -> iterator
        iterable -- a sequence, iterator, file, or other object which
            supports iteration.  Each item returned must be either a
            string (a line of GPDB log data) or a group (a sequence of
            strings where the timestamp, if any, is at the beginning of
            the first string).  The first item decides which of the two
            forms is assumed for the whole stream.
        begin -- a date or datetime giving the lower bound of the interval;
            or None for no lower bound.
        end -- a date or datetime giving the upper bound of the interval;
            or None for no upper bound.

    Example:
        # Print log entries from stdin dated 2008-05-01.
        from datetime import date
        import sys
        for s in TimestampInBounds(sys.stdin, date(2008,5,1), date(2008,5,2)):
            sys.stdout.write(s)

    At the beginning of a log file there may be some lines with no
    timestamp, preceding the first timestamped entry.  Any such lines are
    skipped.
    """
    def boundText(value, unbounded):
        # Render a bound so that it compares correctly, as a string,
        # against the leading 'YYYY-MM-DD HH:MM:SS' text of a log line.
        if value is None:
            return unbounded
        if hasattr(value, 'hour'):                      # datetime
            return value.strftime('%Y-%m-%d %H:%M:%S')
        return value.strftime('%Y-%m-%d')               # date

    begin = boundText(begin, '0000-00-00')
    end = boundText(end, '9999-99-99')

    # Quit immediately if there cannot be timestamps within the interval.
    if begin >= end:
        return

    # Fetch first item from input stream.  An empty stream simply ends the
    # generator; StopIteration must not escape from a generator body.
    source = iter(iterable)
    try:
        first = next(source)
    except StopIteration:
        return

    def stream():
        # The input again, with the already-fetched first item put back.
        yield first
        for following in source:
            yield following

    # If first item is a string, assume input consists of individual lines.
    # Yield lines which start with a timestamp within the given bounds, plus
    # any following lines which do not have timestamps.
    if isinstance(first, (str, bytes, type(u''))):
        withinbounds = False
        for line in stream():
            # Only a timestamped line can open or close the window; an
            # untimestamped line just inherits the current state, even if
            # its text happens to sort between the bounds.
            if timestampPattern.match(line):
                withinbounds = begin <= line < end
            if withinbounds:
                yield line
        return

    # Else assume input consists of groups (i.e. sequences) of lines.
    # Yield groups in which the first line starts with a timestamp within
    # the given bounds.  Skip groups which are empty or have no timestamp.
    for group in stream():
        if (len(group) > 0 and
                begin <= group[0] < end and
                timestampPattern.match(group[0])):
            yield group
| |
| |
| #--------------------------- Pattern Matching ---------------------------- |
| |
def MatchRegex(iterable, regex):
    """
    Generator to filter a stream, selecting items in which there is a match
    for a regular expression.

    MatchRegex(iterable, regex) -> iterator
        iterable -- a sequence, iterator, file, or other object which
            supports iteration.  Each item returned must be either a
            string or a group.  Here the term 'group' means a sequence of
            strings or an iterable yielding strings.
        regex -- a regular expression string, or a regular expression
            object returned by re.compile().  The filter excludes items
            in which there is no match for the regex.  A group is selected
            if the regex matches in at least one of its strings.

    Example:
        # Print lines from stdin which contain the string 'ERROR:'
        import re, sys
        for s in MatchRegex(sys.stdin, re.compile('ERROR:')):
            sys.stdout.write(s)
    """
    # str/unicode on Python 2; str/bytes on Python 3.
    string_types = (str, bytes, type(u''))
    if isinstance(regex, string_types):
        regex = re.compile(regex)

    for item in iterable:
        if isinstance(item, string_types):          # item is a string
            if regex.search(item):
                yield item
        elif any(regex.search(s) for s in item):    # item is a group
            yield item
| |
| |
def NoMatchRegex(iterable, regex):
    """
    Generator to filter a stream, selecting items in which there is no
    match for the regular expression.

    NoMatchRegex(iterable, regex) -> iterator
        iterable -- a sequence, iterator, file, or other object which
            supports iteration.  Each item returned must be either a
            string or a group.  Here the term 'group' means a sequence of
            strings or an iterable yielding strings.
        regex -- a regular expression string, or a regular expression
            object returned by re.compile().  The filter excludes items
            in which there is a match for the regex.  A group is excluded
            if the regex matches in any one of its strings; an empty group
            is always selected.

    Example:
        # Print log entries from stdin which don't contain the string 'HINT'
        import sys
        for s in Ungroup(NoMatchRegex(GroupByTimestamp(sys.stdin), 'HINT')):
            sys.stdout.write(s)
    """
    # str/unicode on Python 2; str/bytes on Python 3.
    string_types = (str, bytes, type(u''))
    if isinstance(regex, string_types):
        regex = re.compile(regex)

    for item in iterable:
        if isinstance(item, string_types):              # item is a string
            if not regex.search(item):
                yield item
        elif not any(regex.search(s) for s in item):    # item is a group
            yield item
| |
| |
def MatchInFirstLine(iterable, regex):
    """
    Generator to filter a stream of groups.  Yields those groups whose
    first line contains a match for the given regex.  Empty groups are
    skipped.

    MatchInFirstLine(iterable, regex) -> iterator
        iterable -- a sequence, iterator, or some object which supports
            iteration.  Each item returned must be a group.  Here the
            term 'group' means a sequence of strings.
        regex -- either a string specifying a regular expression, or a
            regular expression object returned by re.compile()

    Example:
        # Print log entries whose first line contains the string 'ERROR:'
        import sys
        groups = GroupByTimestamp(sys.stdin)
        for s in Ungroup(MatchInFirstLine(groups, 'ERROR:')):
            sys.stdout.write(s)
    """
    # str/unicode on Python 2; str/bytes on Python 3.
    if isinstance(regex, (str, bytes, type(u''))):
        regex = re.compile(regex)
    for group in iterable:
        if len(group) > 0 and regex.search(group[0]):
            yield group
| |
| |
def NoMatchInFirstLine(iterable, regex):
    """
    Generator to filter a stream of groups.  Skips those groups whose
    first line contains a match for the given regex; yields all other
    groups, including empty ones.

    NoMatchInFirstLine(iterable, regex) -> iterator
        iterable -- a sequence, iterator, or some object which supports
            iteration.  Each item returned must be a group.  Here the
            term 'group' means a sequence of strings.
        regex -- either a string specifying a regular expression, or a
            regular expression object returned by re.compile()

    Example:
        # Print log entries whose first line does not contain 'DEBUGn:'
        import sys
        groups = GroupByTimestamp(sys.stdin)
        for s in Ungroup(NoMatchInFirstLine(groups, r'DEBUG\d:')):
            sys.stdout.write(s)
    """
    # str/unicode on Python 2; str/bytes on Python 3.
    if isinstance(regex, (str, bytes, type(u''))):
        regex = re.compile(regex)
    for group in iterable:
        if len(group) == 0 or regex.search(group[0]) is None:
            yield group
| |
def MatchColumns(iterable, cols):
    """
    Generator to project a stream of groups onto selected columns of
    their '|'-delimited lines.

    MatchColumns(iterable, cols) -> iterator
        iterable -- a sequence, iterator, or some object which supports
            iteration.  Each item returned must be a group, i.e. a
            sequence of strings, each string being one '|'-delimited line.
        cols -- the columns to keep, numbered from 1: either a string of
            comma-separated numbers such as '1,3,17', or an iterable of
            numbers.

    For every input group one output group (a list) is yielded.  Each line
    is reduced to its selected columns, in their original order, joined by
    '|' and terminated by a newline.  Lines in which none of the selected
    columns exist are dropped, so an output group may be empty.
    """
    if isinstance(cols, (str, type(u''))):
        cols = cols.split(',')
    # A set gives constant-time membership tests and, unlike a lazy map(),
    # can be consulted again for every line.
    wanted = frozenset(int(c) for c in cols)

    for item in iterable:
        ret = []
        for s in item:
            # Strip the line terminator first so that selecting the last
            # column does not produce a doubled newline.
            fields = s.rstrip('\n').split('|')
            out = [c for n, c in enumerate(fields, 1) if n in wanted]
            if out:
                ret.append('|'.join(out) + "\n")
        yield ret
| |
| #-------------------------------- Slicing -------------------------------- |
| |
def Slice(iterable, begin=0, end=None):
    """
    Return an iterable over the items of iterable[begin:end], like a
    Python slice.

    Slice(iterable, begin, end) -> iterator
        iterable -- a sequence, iterator, or some object which supports
            iteration, yielding items of any type.  If begin or end is
            negative, the iteration must be finite.
        begin -- integer or None.  If >=0, number of initial items to skip.
            If <0, skip items preceding this number of final items.
            None is treated as 0.
        end -- integer or None.  If >=0, maximum number of items to be
            fetched from the head of the input iterable.  If <0, number
            of items to be excluded at the end of the input stream.
            None, or the platform's largest int (sys.maxint, or
            sys.maxsize where maxint does not exist), means no limit.
    """
    if begin is None:
        begin = 0
    # sys.maxint exists only on Python 2; fall back to sys.maxsize.
    if end is not None and end == getattr(sys, 'maxint', sys.maxsize):
        end = None

    if begin >= 0:
        # Both positions, if given, are counted from the head where possible.
        iterable = SkipNItems(iterable, begin)
        if end is None:
            return iterable
        if end >= 0:
            return FirstNItems(iterable, end - begin)
        return SkipLastNItems(iterable, -end)

    # begin is counted from the tail of the stream.
    if end is None:
        return LastNItems(iterable, -begin)
    if end < 0:
        return LastNItems(iterable, -begin, -end)
    return IntersectionOfHeadAndTail(iterable, end, -begin)
| |
| |
def FirstNItems(iterable, n):
    """
    Return an iterable over the first n items of a stream.

    FirstNItems(iterable, n) -> iterable
        iterable -- a sequence, iterator, or some object which supports
            iteration, yielding items of any type.
        n -- an integer or None.  None returns the input unchanged;
            n <= 0 returns an empty list.

    At most n items are fetched from the input; the stream ends early,
    without error, if the input has fewer than n items.

    Example:
        # Read and print 5 lines from standard input
        for s in FirstNItems(sys.stdin, 5):
            sys.stdout.write(s)
    """
    def FNI(iterable, n):
        source = iter(iterable)
        remaining = n
        while remaining > 0:
            # A short input just ends the stream.  StopIteration must be
            # caught here: escaping from a generator body it would become
            # a RuntimeError on Python 3.7+ (PEP 479).
            try:
                item = next(source)
            except StopIteration:
                return
            yield item
            remaining -= 1

    if n is None:
        return iterable
    if n <= 0:
        return []
    return FNI(iterable, n)
| |
| |
def LastNItems(iterable, n, dropLastN=0):
    """
    Return an iterable over the final n items of a finite stream.  Of
    those final items, the last dropLastN items are omitted if
    dropLastN > 0.

    The number of items yielded is max(0, min(n, N) - dropLastN) where
    N is the number of items yielded by the input iterable.

    LastNItems(iterable, n, dropLastN) -> iterable
        iterable -- a sequence, iterator, or some object which supports
            iteration, yielding items of any type.
        n -- integer or None.  None returns the input unchanged (and
            dropLastN is then not applied).
        dropLastN -- integer

    Example:
        # Read a file and print its last 5 lines
        f = open(filename1, 'r')
        for line in LastNItems(f, 5):
            sys.stdout.write(line)

        # Read the last 5 lines of a file.  Print them, excluding the last 3.
        f = open(filename2, 'r')
        for line in LastNItems(f, 5, 3):
            sys.stdout.write(line)
    """
    from collections import deque

    def LNI(iterable, n, dropLastN):
        # A bounded deque keeps only the most recent n items, discarding
        # the oldest in constant time as each new item arrives.
        items = list(deque(iterable, maxlen=n))
        if dropLastN > 0:
            del items[-dropLastN:]
        for item in items:
            yield item

    if n is None:
        return iterable
    if n <= 0 or n <= dropLastN:
        return []
    return LNI(iterable, n, dropLastN)
| |
| |
def SkipNItems(iterable, n):
    """
    Return an iterable over all but the first n items of a stream.

    SkipNItems(iterable, n) -> iterable
        iterable -- a sequence, iterator, or some object which supports
            iteration, yielding items of any type.
        n -- an integer or None.  If n is None or n <= 0 the input is
            returned unchanged.

    Example:
        # Read and print lines 5 and 6 of a file
        f = open(filename, 'r')
        for line in FirstNItems(SkipNItems(f, 4), 2):
            sys.stdout.write(line)
    """
    def SNI(iterable, n):
        # A for-loop ends cleanly when the input runs out, whether that
        # happens while skipping or while yielding.
        remaining = n
        for item in iterable:
            if remaining > 0:
                remaining -= 1
            else:
                yield item

    if n and n > 0:
        iterable = SNI(iterable, n)
    return iterable
| |
| |
def SkipLastNItems(iterable, n):
    """
    Return an iterable over all but the final n items of a finite stream.

    SkipLastNItems(iterable, n) -> iterable
        iterable -- a sequence, iterator, or some object which supports
            iteration, yielding items of any type.
        n -- an integer or None.  If n is None or n <= 0 the input is
            returned unchanged.

    Only the most recent n items are buffered: an item is released as
    soon as n later items have been read, which proves it is not among
    the final n.
    """
    from collections import deque

    def SLNI(iterable, n):
        held = deque()
        for item in iterable:
            held.append(item)
            if len(held) > n:
                yield held.popleft()

    if n and n > 0:
        iterable = SLNI(iterable, n)
    return iterable
| |
| |
def IntersectionOfHeadAndTail(iterable, nhead, ntail):
    """
    Return an iterable over those items of a finite stream which belong
    to both the first 'nhead' items and the last 'ntail' items of the
    stream.

    IntersectionOfHeadAndTail(iterable, nhead, ntail) -> iterable
        iterable -- a sequence, iterator, or some object which supports
            iteration, yielding items of any type.
        nhead -- integer; size of the leading window.
        ntail -- integer; size of the trailing window.

    An empty list is returned if nhead <= 0 or ntail <= 0.
    """
    from collections import deque

    def IHT(iterable, nhead, ntail):
        kept = deque()      # head items which are still within the tail
        for index, item in enumerate(iterable):
            if index < nhead:
                kept.append(item)
            if index >= ntail:
                # One more item has been read, so the tail window has
                # moved forward and the oldest kept item drops out of it.
                if kept:
                    kept.popleft()
                if not kept:
                    # Every head item has left the tail window; nothing
                    # further can qualify, so stop reading.
                    break
        for item in kept:
            yield item

    if nhead <= 0 or ntail <= 0:
        return []
    return IHT(iterable, nhead, ntail)
| |
| |
| #------------------------ Miscellaneous Filters ------------------------ |
| |
def NotNull(iterable):
    """
    Pass through only those items of a stream which are true in a boolean
    context, dropping the rest (empty groups, empty strings, None, 0...).

    NotNull(iterable) -> iterator
        iterable -- a sequence, iterator, or some object which supports
            iteration, yielding items of any type.
    """
    survivors = (candidate for candidate in iterable if candidate)
    return survivors
| |
| |
| #-------------------------- Utility Functions -------------------------- |
| |
def filterize(Filter, *args, **kwargs):
    """
    Return a function of one argument (an input stream) which, when called,
    will return the result of applying the given Filter function to that
    argument together with any additional args and kwargs.  If no extra
    args or kwargs are given, Filter itself is returned.

    This is useful for building a 'filters' list to be passed to
    FilterLogEntries.

    Example:
        # Print log entries containing the string 'Detail:'
        import sys
        filters = []
        filters.append(filterize(MatchRegex, 'Detail:'))
        for line in FilterLogEntries(sys.stdin, filters=filters):
            sys.stdout.write(line)
    """
    if not (args or kwargs):
        return Filter

    def applyFilter(stream):
        # The stream is always the first positional argument.
        return Filter(stream, *args, **kwargs)
    return applyFilter
| |
| |
def spiffInterval(begin=None, end=None, duration=None):
    """
    Work out a datetime interval from whichever of begin, end and duration
    the caller supplied.

    begin and end should be datetime.datetime or datetime.date instances,
    or None.  A plain date is promoted to a datetime at 00:00:00.

    duration should be a datetime.timedelta, or None.  It is consulted
    only when at least one endpoint is missing: a missing end becomes
    begin + duration, a missing begin becomes end - duration.  When both
    endpoints are given, duration is ignored.

    Returns a pair (begin, end) in which each element is either a
    datetime.datetime instance or None.
    """
    # Promote dates (which have no 'hour' attribute) to datetimes.
    if begin and not hasattr(begin, 'hour'):
        begin = datetime(begin.year, begin.month, begin.day)
    if end and not hasattr(end, 'hour'):
        end = datetime(end.year, end.month, end.day)

    # Nothing to derive: no duration, or both endpoints already known.
    if duration is None or (begin is not None and end is not None):
        return begin, end

    if begin:
        return begin, begin + duration
    if end:
        return end - duration, end

    # Neither endpoint was given.  Start the interval at the current date
    # and time minus the duration, and leave the high end unbounded.
    return datetime.now() - duration, end
| |