tools/mboxo_patch.py - incubator-ponymail - Git at Google

 # -*- coding: utf-8 -*-
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """
 Byte stream reader to process mboxo style mailbox files.
 These are not currently handled by the Python email package.

 It replaces any occurrence of b'\n>From ' with b'\nFrom '

 The class handles matching across read boundaries.

 To use:

 from mboxo_patch import MboxoFactory
 ...
 messages = mailbox.mbox(filename, MboxoFactory)

 N.B.
 To simplify the code, the MboxoReader class changes the
 size parameter to 7 if (and only if): 0 < size < 7
 The return byte buffer can thus be larger than expected.
 However this is only a theoretical possibility
 as the mailbox code uses a size of 8192 (or None)

 """
 import mailbox

 # Byte sequence that marks an mboxo-escaped "From " line, and its length.
 FROM_MANGLED = b'\n>From '
 FROM_MANGLED_LEN = len(FROM_MANGLED)
 # Replacement written in place of each FROM_MANGLED occurrence.
 FROM_UNMANGLED = b'\nFrom '
 # FROM_MANGLED (7 bytes) may straddle two consecutive reads of the stream.
 # The break can fall anywhere between its leading b'\n' and its trailing
 # b' ', so a buffer that ends with any proper, non-empty prefix of
 # FROM_MANGLED may be the start of a match; that tail is kept for the
 # next read.
 # All such prefixes, longest first:
 FROMS = tuple(FROM_MANGLED[:-cut] for cut in range(1, FROM_MANGLED_LEN))

 class MboxoReader(mailbox._PartialFile): # pylint: disable=W0212
     """File reader that rewrites FROM_MANGLED to FROM_UNMANGLED while reading.

     Overrides ``_read`` so that the data handed back to the caller has every
     FROM_MANGLED sequence replaced, including sequences that straddle two
     size-limited reads.
     """

     def __init__(self, f, start=None, stop=None):
         """Wrap the same underlying file and byte range as *f*.

         Args:
             f: object providing ``_file``, ``_start`` and ``_stop``
                 attributes (presumably the mailbox._PartialFile that the
                 mailbox passes to a message factory -- confirm with caller).
             start: ignored; the range start is always taken from ``f._start``.
             stop: ignored; the range end is always taken from ``f._stop``.
         """
         # number of trailing bytes withheld from the caller by the last read
         self.remain = 0
         super().__init__(f._file, start=f._start, stop=f._stop) # pylint: disable=W0212

     # Override the read method to provide mboxo filtering
     def _read(self, size, read_method):
         """Read via *read_method* and un-mangle the result.

         Args:
             size: requested byte count. None, 0 or a negative value is
                 passed through unchanged and nothing is withheld. A positive
                 value below FROM_MANGLED_LEN is raised to FROM_MANGLED_LEN,
                 so up to FROM_MANGLED_LEN bytes may be returned.
             read_method: callable taking a byte count and returning bytes;
                 it is forwarded to the parent class's ``_read``.

         Returns:
             bytes with FROM_MANGLED replaced by FROM_UNMANGLED. For a
             size-limited read, a trailing partial match is withheld and
             re-read by the next call. b'' is returned only when the parent
             read itself yields no data.

         NOTE(review): with a positive size and a line-oriented read_method
         the returned chunks need not line up with line ends, because
         trailing newlines are withheld and merged into the next call.
         """
         # Step back over whatever was withheld last time so it is read again.
         if self.remain != 0:
             super().seek(whence=1, offset=-self.remain)
             self.remain = 0

         limited_read = size is not None and size > 0
         if not limited_read:
             # Nothing is left for a later read to complete a match with,
             # so there is no need to withhold anything.
             whole = super()._read(size, read_method)
             return whole.replace(FROM_MANGLED, FROM_UNMANGLED)

         # ensure we get enough to match successfully when refilling
         if size < FROM_MANGLED_LEN:
             size = FROM_MANGLED_LEN
         buff = super()._read(size, read_method)

         # A buffer that is nothing but a prefix of FROM_MANGLED (for example
         # a lone b'\n' from a short or final read) cannot be split into a
         # part to return and a part to withhold. Returning b'' here would
         # look like EOF to the caller, so top the buffer up instead.
         while buff in FROMS:
             more = super()._read(size - len(buff), read_method)
             if not more:
                 # Really at the end: the partial match can never complete.
                 return buff
             buff += more

         # is there a potential cross-boundary match?
         if buff.endswith(FROMS):
             # The last LF starts the partial match. It cannot be at index 0,
             # because that case was handled by the loop above, so the slice
             # below always leaves at least one byte to return.
             self.remain = len(buff) - buff.rindex(b'\n')
             buff = buff[:-self.remain]
         return buff.replace(FROM_MANGLED, FROM_UNMANGLED)

 class MboxoFactory(mailbox.mboxMessage):
     """mboxMessage subclass that reads its source through MboxoReader.

     Intended to be passed as the factory argument of mailbox.mbox, as shown
     in the module docstring.
     """

     def __init__(self, message=None):
         """Build the message from *message*, filtered through MboxoReader.

         Args:
             message: object accepted by MboxoReader (it must provide
                 ``_file``, ``_start`` and ``_stop``), or None. None is
                 handed to mboxMessage unchanged instead of being wrapped,
                 since it has no file attributes to wrap.
         """
         if message is not None:
             message = MboxoReader(message)
         super().__init__(message=message)
	# -- coding: utf-8 --
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	Byte stream reader to process mboxo style mailbox files.
	These are not currently handled by the Python email package.

	It replaces any occurrence of b'\n>From ' with b'\nFrom '

	The class handles matching across read boundaries.

	To use:

	from mboxo_patch import MboxoFactory
	...
	messages = mailbox.mbox(filename, MboxoFactory)

	N.B.
	To simplify the code, the MboxoReader class changes the
	size parameter to 7 if (and only if): 0 < size < 7
	The return byte buffer can thus be larger than expected.
	However this is only a theoretical possibility
	as the mailbox code uses a size of 8192 (or None)

	"""
	import mailbox

	# Byte sequence that marks an mboxo-escaped "From " line, and its length.
	FROM_MANGLED = b'\n>From '
	FROM_MANGLED_LEN = len(FROM_MANGLED)
	# Replacement written in place of each FROM_MANGLED occurrence.
	FROM_UNMANGLED = b'\nFrom '
	# FROM_MANGLED (7 bytes) may straddle two consecutive reads of the stream.
	# The break can fall anywhere between its leading b'\n' and its trailing
	# b' ', so a buffer that ends with any proper, non-empty prefix of
	# FROM_MANGLED may be the start of a match; that tail is kept for the
	# next read.
	# All such prefixes, longest first:
	FROMS = tuple(FROM_MANGLED[:-cut] for cut in range(1, FROM_MANGLED_LEN))

	class MboxoReader(mailbox._PartialFile): # pylint: disable=W0212
	    """File reader that rewrites FROM_MANGLED to FROM_UNMANGLED while reading.

	    Overrides ``_read`` so that the data handed back to the caller has every
	    FROM_MANGLED sequence replaced, including sequences that straddle two
	    size-limited reads.
	    """

	    def __init__(self, f, start=None, stop=None):
	        """Wrap the same underlying file and byte range as *f*.

	        Args:
	            f: object providing ``_file``, ``_start`` and ``_stop``
	                attributes (presumably the mailbox._PartialFile that the
	                mailbox passes to a message factory -- confirm with caller).
	            start: ignored; the range start is always taken from ``f._start``.
	            stop: ignored; the range end is always taken from ``f._stop``.
	        """
	        # number of trailing bytes withheld from the caller by the last read
	        self.remain = 0
	        super().__init__(f._file, start=f._start, stop=f._stop) # pylint: disable=W0212

	    # Override the read method to provide mboxo filtering
	    def _read(self, size, read_method):
	        """Read via *read_method* and un-mangle the result.

	        Args:
	            size: requested byte count. None, 0 or a negative value is
	                passed through unchanged and nothing is withheld. A positive
	                value below FROM_MANGLED_LEN is raised to FROM_MANGLED_LEN,
	                so up to FROM_MANGLED_LEN bytes may be returned.
	            read_method: callable taking a byte count and returning bytes;
	                it is forwarded to the parent class's ``_read``.

	        Returns:
	            bytes with FROM_MANGLED replaced by FROM_UNMANGLED. For a
	            size-limited read, a trailing partial match is withheld and
	            re-read by the next call. b'' is returned only when the parent
	            read itself yields no data.

	        NOTE(review): with a positive size and a line-oriented read_method
	        the returned chunks need not line up with line ends, because
	        trailing newlines are withheld and merged into the next call.
	        """
	        # Step back over whatever was withheld last time so it is read again.
	        if self.remain != 0:
	            super().seek(whence=1, offset=-self.remain)
	            self.remain = 0

	        limited_read = size is not None and size > 0
	        if not limited_read:
	            # Nothing is left for a later read to complete a match with,
	            # so there is no need to withhold anything.
	            whole = super()._read(size, read_method)
	            return whole.replace(FROM_MANGLED, FROM_UNMANGLED)

	        # ensure we get enough to match successfully when refilling
	        if size < FROM_MANGLED_LEN:
	            size = FROM_MANGLED_LEN
	        buff = super()._read(size, read_method)

	        # A buffer that is nothing but a prefix of FROM_MANGLED (for example
	        # a lone b'\n' from a short or final read) cannot be split into a
	        # part to return and a part to withhold. Returning b'' here would
	        # look like EOF to the caller, so top the buffer up instead.
	        while buff in FROMS:
	            more = super()._read(size - len(buff), read_method)
	            if not more:
	                # Really at the end: the partial match can never complete.
	                return buff
	            buff += more

	        # is there a potential cross-boundary match?
	        if buff.endswith(FROMS):
	            # The last LF starts the partial match. It cannot be at index 0,
	            # because that case was handled by the loop above, so the slice
	            # below always leaves at least one byte to return.
	            self.remain = len(buff) - buff.rindex(b'\n')
	            buff = buff[:-self.remain]
	        return buff.replace(FROM_MANGLED, FROM_UNMANGLED)

	class MboxoFactory(mailbox.mboxMessage):
	    """mboxMessage subclass that reads its source through MboxoReader.

	    Intended to be passed as the factory argument of mailbox.mbox, as shown
	    in the module docstring.
	    """

	    def __init__(self, message=None):
	        """Build the message from *message*, filtered through MboxoReader.

	        Args:
	            message: object accepted by MboxoReader (it must provide
	                ``_file``, ``_start`` and ``_stop``), or None. None is
	                handed to mboxMessage unchanged instead of being wrapped,
	                since it has no file attributes to wrap.
	        """
	        if message is not None:
	            message = MboxoReader(message)
	        super().__init__(message=message)