tools/generators.py - incubator-ponymail - Git at Google

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """
 This file contains the various ID generators for Pony Mail's archivers.
 """

 import hashlib
 import email.utils
 import time
 import re

 # Full generator: uses the entire email (including server-dependent data)
 # This is the recommended generator for single-node setups.
 def full(msg, _body, lid, _attachments):
     """
     Full generator: hash the complete serialized email.

     Every byte of the message takes part in the digest (including
     server-dependent data), so the id is almost certainly unique.
     The flip side is that different copies of the same mail are likely to
     carry different headers and therefore get different ids.

     WARNING: the archiver by default adds an archived-at header with the current time.
     That header is part of the hashed bytes, so a message reloaded from source
     gets a different Permalink.

     Parameters:
     msg - the parsed message
     _body - the parsed text content (not used)
     lid - list id
     _attachments - list of attachments (not used)

     Returns: "<hash>@<lid>" where hash is sha224 of message bytes
     """
     raw_message = msg.as_bytes()
     digest = hashlib.sha224(raw_message).hexdigest()
     return "%s@%s" % (digest, lid)

 # Medium: Standard 0.9 generator - Not recommended for future installations.
 # See 'full' or 'cluster' generators instead.
 def medium(msg, body, lid, _attachments):
     """
     Standard 0.9 generator - Not recommended for future installations.
     (does not generate sufficiently unique ids)
     Also the lid is included in the hash; this causes problems if the listname needs to be changed.

     N.B. The id is not guaranteed stable - i.e. it may change if the message is reparsed.
     The id depends on the parsed body, which depends on the exact method used to parse the mail.
     For example, are invalid characters ignored or replaced; is html parsing used?

     The following message fields are concatenated to form the hash input:
     - body: if bytes as is else encoded ascii, ignoring invalid characters; if the body is null an Exception is thrown
     - lid
     - Date header if it exists and parses OK; failing that
     - archived-at header if it exists and parses OK; failing that
     - current time.
     The resulting date is converted to YYYY/MM/DD HH:MM:SS (using UTC)

     Parameters:
     msg - the parsed message (used to get the date)
     body - the parsed text content (may be null)
     lid - list id
     _attachments - list of attachments (not used)

     Returns: "<hash>@<lid>" where hash is sha224 of the message items noted above
     """
     # Use text body
     xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
     # Use List ID
     xbody += bytes(lid, encoding='ascii')

     # Use the first of Date / archived-at that is present AND parses.
     # mdate is initialised up front so that a parser failure can never leave
     # it unbound, and each header is tried independently so that a broken
     # archived-at still falls through to the current time as documented.
     mdate = None
     for header in ('date', 'archived-at'):
         value = msg.get(header)
         if not value:
             continue
         try:
             mdate = email.utils.parsedate_tz(value)
         except Exception:
             # Best effort: a value the parser chokes on is treated exactly
             # like a missing header.
             mdate = None
         if mdate:
             break

     # In keeping with preserving the past, we have kept this fallback.
     # For all intents and purposes, this is not a proper way of maintaining
     # a consistent ID in case of missing dates. It is recommended to use
     # another generator
     if not mdate:
         # Standard 9-tuple plus a fake TZ offset (10th element)
         mdate = time.gmtime() + (0, )

     mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email.utils.mktime_tz(mdate)))
     xbody += bytes(mdatestring, encoding='ascii')
     return "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)

 # cluster: Use data that is guaranteed to be the same across cluster setups
 # This is the recommended generator for cluster setups.
 # Unlike 'medium', this only makes use of the Date: header and not the archived-at,
 # as the archived-at may change from node to node (and will change if not in the raw mbox file)
 # Also the lid is not included in the hash, so the hash does not change if the lid is overridden
 #
 def cluster(msg, body, lid, attachments):
     """
     Use data that is guaranteed to be the same across cluster setups
     For mails with a valid Message-ID this is likely to be unique
     In other cases it is better than the medium generator as it uses several extra fields

     N.B. The id is not guaranteed stable - i.e. it may change if the message is reparsed.
     The id depends on the parsed body, which depends on the exact method used to parse the mail.
     For example, are invalid characters ignored or replaced; is html parsing used?
     The output also depends on attachment hashes, so any changes to attachment parsing
     can also change the output. For example, the code now handles inline attachments.

     The following message fields are concatenated to form the hash input:
     - body as is if bytes else encoded ascii, ignoring invalid characters; if the body is null it is treated as an empty string
       (currently trailing whitespace is dropped)
     - Message-ID (if present)
     - Date header converted to YYYY/MM/DD HH:MM:SS (UTC)
       or "(null)" if the date does not exist or cannot be converted
     - sender, encoded as ascii (if the field exists)
     - subject, encoded as ascii (if the field exists)
     - the hashes of any attachments

     Note: the lid is not included in the hash.

     Parameters:
     msg - the parsed message
     body - the parsed text content
     lid - list id
     attachments - list of attachments (uses the hashes)

     Returns: "r<hash>@<lid>" where hash is sha224 of the message items noted above

     Raises: UnicodeEncodeError if the Message-Id, sender, subject or an
     attachment hash is a string containing non-ASCII characters.
     """
     # Use text body
     if not body: # Make sure body is not None, which will fail.
         body = ""
     xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')

     # Crop out any trailing whitespace in body.
     # bytes.rstrip() without arguments removes ASCII whitespace
     # (space, \t, \n, \r, \x0b, \x0c), the same set a bytes-pattern \s+$
     # substitution removes, but in linear time and without relying on an
     # invalid escape sequence in a non-raw literal.
     xbody = xbody.rstrip()

     # Use Message-Id (or '' if missing)
     xbody += bytes(msg.get('Message-Id', ''), encoding='ascii')

     # Use Date header. Don't use archived-at, as the archiver sets this if not present.
     mdatestring = "(null)" # Default to null, ONLY changed if replicable across imports
     try:
         mdate = email.utils.parsedate_tz(msg.get('date'))
         if mdate:
             mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email.utils.mktime_tz(mdate)))
     except Exception:
         # Deliberate best effort: any date that cannot be parsed or
         # converted is hashed as "(null)".
         pass
     xbody += bytes(mdatestring, encoding='ascii')

     # Use sender
     sender = msg.get('from', None)
     if sender:
         xbody += bytes(sender, encoding='ascii')

     # Use subject
     subject = msg.get('subject', None)
     if subject:
         xbody += bytes(subject, encoding='ascii')

     # Use attachment hashes if present (joined once, in list order)
     if attachments:
         xbody += b"".join(bytes(a['hash'], encoding='ascii') for a in attachments)

     # generate the hash and combine with the lid to form the id
     return "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)


 # Old school way of making IDs
 def legacy(msg, body, lid, _attachments):
     """
     Original generator - DO NOT USE
     (does not generate unique ids)

     The hash input is created from
     - body: if bytes as is else encoded ascii, ignoring invalid characters; if the body is null an Exception is thrown

     The uid_mdate for the id is the Date converted to UTC epoch else 0

     Parameters:
     msg - the parsed message (used to get the date)
     body - the parsed text content (may be null)
     lid - list id
     _attachments - list of attachments (not used)

     Returns: "<hash>@<uid_mdate>@<lid>" where hash is sha224 of the message items noted above
     """
     uid_mdate = 0 # Default if no date found
     try:
         mdate = email.utils.parsedate_tz(msg.get('date'))
         if mdate:
             uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
     except Exception:
         # Deliberate best effort: a Date that cannot be parsed or converted
         # leaves uid_mdate at 0. Unlike a bare except, this lets
         # KeyboardInterrupt and SystemExit propagate.
         pass

     xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
     return "%s@%s@%s" % (hashlib.sha224(xbody).hexdigest(), uid_mdate, lid)

 # Lookup table: generator name -> function that builds the message id.
 __GENERATORS = {'full': full, 'medium': medium, 'cluster': cluster, 'legacy': legacy}

 def generator(name):
     """
     Look up an ID generator function by name.

     Parameters:
     name - the generator name, looked up in __GENERATORS

     Returns: the function registered under name. If the lookup fails,
     a warning is printed and the 'legacy' generator is returned instead.
     """
     try:
         return __GENERATORS[name]
     except (KeyError, TypeError):
         # KeyError: no generator registered under this name.
         # TypeError: name is unhashable and so cannot be a key at all.
         # Both keep the historical fallback; anything else propagates.
         print("WARN: generator %s not found, defaulting to 'legacy'" % name)
         return legacy

 def generate(name, msg, body, lid, attachments):
     """
     Build a message id with the generator selected by name.

     Parameters:
     name - the generator name, resolved through generator()
     msg, body, lid, attachments - passed unchanged to the selected generator

     Returns: whatever the selected generator returns
     """
     make_id = generator(name)
     return make_id(msg, body, lid, attachments)

 def generator_names():
     """Return a new list holding the keys of __GENERATORS."""
     return [name for name in __GENERATORS]
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	This file contains the various ID generators for Pony Mail's archivers.
	"""

	import hashlib
	import email.utils
	import time
	import re

	# Full generator: uses the entire email (including server-dependent data)
	# This is the recommended generator for single-node setups.
	def full(msg, _body, lid, _attachments):
	    """
	    Full generator: uses the entire email
	    (including server-dependent data)
	    The id is almost certainly unique,
	    but different copies of the message are likely to have different headers, thus ids

	    WARNING: the archiver by default adds an archived-at header with the current time.
	    This is included in the hash, so messages will get different Permalinks if reloaded from source

	    Parameters:
	    msg - the parsed message
	    _body - the parsed text content (not used)
	    lid - list id
	    _attachments - list of attachments (not used)

	    Returns: "<hash>@<lid>" where hash is sha224 of message bytes
	    """
	    mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid)
	    return mid

	# Medium: Standard 0.9 generator - Not recommended for future installations.
	# See 'full' or 'cluster' generators instead.
	def medium(msg, body, lid, _attachments):
	    """
	    Standard 0.9 generator - Not recommended for future installations.
	    (does not generate sufficiently unique ids)
	    Also the lid is included in the hash; this causes problems if the listname needs to be changed.

	    N.B. The id is not guaranteed stable - i.e. it may change if the message is reparsed.
	    The id depends on the parsed body, which depends on the exact method used to parse the mail.
	    For example, are invalid characters ignored or replaced; is html parsing used?

	    The following message fields are concatenated to form the hash input:
	    - body: if bytes as is else encoded ascii, ignoring invalid characters; if the body is null an Exception is thrown
	    - lid
	    - Date header if it exists and parses OK; failing that
	    - archived-at header if it exists and parses OK; failing that
	    - current time.
	    The resulting date is converted to YYYY/MM/DD HH:MM:SS (using UTC)

	    Parameters:
	    msg - the parsed message (used to get the date)
	    body - the parsed text content (may be null)
	    lid - list id
	    _attachments - list of attachments (not used)

	    Returns: "<hash>@<lid>" where hash is sha224 of the message items noted above
	    """
	    # Use text body
	    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
	    # Use List ID
	    xbody += bytes(lid, encoding='ascii')

	    # Use the first of Date / archived-at that is present AND parses.
	    # mdate is initialised up front so that a parser failure can never leave
	    # it unbound, and each header is tried independently so that a broken
	    # archived-at still falls through to the current time as documented.
	    mdate = None
	    for header in ('date', 'archived-at'):
	        value = msg.get(header)
	        if not value:
	            continue
	        try:
	            mdate = email.utils.parsedate_tz(value)
	        except Exception:
	            # Best effort: a value the parser chokes on is treated exactly
	            # like a missing header.
	            mdate = None
	        if mdate:
	            break

	    # In keeping with preserving the past, we have kept this fallback.
	    # For all intents and purposes, this is not a proper way of maintaining
	    # a consistent ID in case of missing dates. It is recommended to use
	    # another generator
	    if not mdate:
	        # Standard 9-tuple plus a fake TZ offset (10th element)
	        mdate = time.gmtime() + (0, )

	    mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email.utils.mktime_tz(mdate)))
	    xbody += bytes(mdatestring, encoding='ascii')
	    return "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)

	# cluster: Use data that is guaranteed to be the same across cluster setups
	# This is the recommended generator for cluster setups.
	# Unlike 'medium', this only makes use of the Date: header and not the archived-at,
	# as the archived-at may change from node to node (and will change if not in the raw mbox file)
	# Also the lid is not included in the hash, so the hash does not change if the lid is overridden
	#
	def cluster(msg, body, lid, attachments):
	    """
	    Use data that is guaranteed to be the same across cluster setups
	    For mails with a valid Message-ID this is likely to be unique
	    In other cases it is better than the medium generator as it uses several extra fields

	    N.B. The id is not guaranteed stable - i.e. it may change if the message is reparsed.
	    The id depends on the parsed body, which depends on the exact method used to parse the mail.
	    For example, are invalid characters ignored or replaced; is html parsing used?
	    The output also depends on attachment hashes, so any changes to attachment parsing
	    can also change the output. For example, the code now handles inline attachments.

	    The following message fields are concatenated to form the hash input:
	    - body as is if bytes else encoded ascii, ignoring invalid characters; if the body is null it is treated as an empty string
	      (currently trailing whitespace is dropped)
	    - Message-ID (if present)
	    - Date header converted to YYYY/MM/DD HH:MM:SS (UTC)
	      or "(null)" if the date does not exist or cannot be converted
	    - sender, encoded as ascii (if the field exists)
	    - subject, encoded as ascii (if the field exists)
	    - the hashes of any attachments

	    Note: the lid is not included in the hash.

	    Parameters:
	    msg - the parsed message
	    body - the parsed text content
	    lid - list id
	    attachments - list of attachments (uses the hashes)

	    Returns: "r<hash>@<lid>" where hash is sha224 of the message items noted above

	    Raises: UnicodeEncodeError if the Message-Id, sender, subject or an
	    attachment hash is a string containing non-ASCII characters.
	    """
	    # Use text body
	    if not body: # Make sure body is not None, which will fail.
	        body = ""
	    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')

	    # Crop out any trailing whitespace in body.
	    # bytes.rstrip() without arguments removes ASCII whitespace
	    # (space, \t, \n, \r, \x0b, \x0c), the same set a bytes-pattern \s+$
	    # substitution removes, but in linear time and without relying on an
	    # invalid escape sequence in a non-raw literal.
	    xbody = xbody.rstrip()

	    # Use Message-Id (or '' if missing)
	    xbody += bytes(msg.get('Message-Id', ''), encoding='ascii')

	    # Use Date header. Don't use archived-at, as the archiver sets this if not present.
	    mdatestring = "(null)" # Default to null, ONLY changed if replicable across imports
	    try:
	        mdate = email.utils.parsedate_tz(msg.get('date'))
	        if mdate:
	            mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email.utils.mktime_tz(mdate)))
	    except Exception:
	        # Deliberate best effort: any date that cannot be parsed or
	        # converted is hashed as "(null)".
	        pass
	    xbody += bytes(mdatestring, encoding='ascii')

	    # Use sender
	    sender = msg.get('from', None)
	    if sender:
	        xbody += bytes(sender, encoding='ascii')

	    # Use subject
	    subject = msg.get('subject', None)
	    if subject:
	        xbody += bytes(subject, encoding='ascii')

	    # Use attachment hashes if present (joined once, in list order)
	    if attachments:
	        xbody += b"".join(bytes(a['hash'], encoding='ascii') for a in attachments)

	    # generate the hash and combine with the lid to form the id
	    return "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)


	# Old school way of making IDs
	def legacy(msg, body, lid, _attachments):
	    """
	    Original generator - DO NOT USE
	    (does not generate unique ids)

	    The hash input is created from
	    - body: if bytes as is else encoded ascii, ignoring invalid characters; if the body is null an Exception is thrown

	    The uid_mdate for the id is the Date converted to UTC epoch else 0

	    Parameters:
	    msg - the parsed message (used to get the date)
	    body - the parsed text content (may be null)
	    lid - list id
	    _attachments - list of attachments (not used)

	    Returns: "<hash>@<uid_mdate>@<lid>" where hash is sha224 of the message items noted above
	    """
	    uid_mdate = 0 # Default if no date found
	    try:
	        mdate = email.utils.parsedate_tz(msg.get('date'))
	        if mdate:
	            uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid
	    except Exception:
	        # Deliberate best effort: a Date that cannot be parsed or converted
	        # leaves uid_mdate at 0. Unlike a bare except, this lets
	        # KeyboardInterrupt and SystemExit propagate.
	        pass

	    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
	    return "%s@%s@%s" % (hashlib.sha224(xbody).hexdigest(), uid_mdate, lid)

	# Lookup table: generator name -> function that builds the message id.
	__GENERATORS = {'full': full, 'medium': medium, 'cluster': cluster, 'legacy': legacy}

	def generator(name):
	    """
	    Look up an ID generator function by name.

	    Parameters:
	    name - the generator name, looked up in __GENERATORS

	    Returns: the function registered under name. If the lookup fails,
	    a warning is printed and the 'legacy' generator is returned instead.
	    """
	    try:
	        return __GENERATORS[name]
	    except (KeyError, TypeError):
	        # KeyError: no generator registered under this name.
	        # TypeError: name is unhashable and so cannot be a key at all.
	        # Both keep the historical fallback; anything else propagates.
	        print("WARN: generator %s not found, defaulting to 'legacy'" % name)
	        return legacy

	def generate(name, msg, body, lid, attachments):
	    """
	    Build a message id with the generator selected by name.

	    Parameters:
	    name - the generator name, resolved through generator()
	    msg, body, lid, attachments - passed unchanged to the selected generator

	    Returns: whatever the selected generator returns
	    """
	    return generator(name)(msg, body, lid, attachments)

	def generator_names():
	    """Return a new list holding the keys of __GENERATORS."""
	    return list(__GENERATORS)