blob: 5de6f25f981fc8e8a364743a680224fe98d9092c [file] [log] [blame]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file contains the various ID generators for Pony Mail's archivers.
"""
import hashlib
import email.utils
import time
import re
# Full generator: uses the entire email (including server-dependent data)
# This is the recommended generator for single-node setups.
def full(msg, _body, lid, _attachments):
    """
    Full generator: derive the id from the complete serialized email
    (including server-dependent data).

    Since every header is part of the hash input, the id is almost
    certainly unique, but different copies of the same message are likely
    to carry different headers and will therefore get different ids.

    WARNING: the archiver by default adds an archived-at header with the
    current time. That header is hashed too, so a message gets a different
    Permalink if it is reloaded from source.

    Parameters:
    msg - the parsed message (its as_bytes() output is hashed)
    _body - the parsed text content (not used)
    lid - list id
    _attachments - list of attachments (not used)

    Returns: "<hash>@<lid>" where hash is the sha224 hex digest of the
    message bytes
    """
    digest = hashlib.sha224(msg.as_bytes()).hexdigest()
    return "%s@%s" % (digest, lid)
# Medium: Standard 0.9 generator - Not recommended for future installations.
# See 'full' or 'cluster' generators instead.
def medium(msg, body, lid, _attachments):
    """
    Standard 0.9 generator - Not recommended for future installations.
    (does not generate sufficiently unique ids)

    Also the lid is included in the hash; this causes problems if the
    listname needs to be changed.

    N.B. The id is not guaranteed stable - i.e. it may change if the
    message is reparsed. The id depends on the parsed body, which depends
    on the exact method used to parse the mail. For example, are invalid
    characters ignored or replaced; is html parsing used?

    The following message fields are concatenated to form the hash input:
    - body: if bytes as is, else encoded as ascii ignoring invalid
      characters; if the body is None an exception is raised
    - lid (must be ascii-encodable)
    - Date header if it exists and parses OK; failing that
    - archived-at header if it exists and parses OK; failing that
    - current time.
    The resulting date is converted to YYYY/MM/DD HH:MM:SS (using UTC)

    Parameters:
    msg - the parsed message (used to get the date)
    body - the parsed text content (str or bytes)
    lid - list id
    _attachments - list of attachments (not used)

    Returns: "<hash>@<lid>" where hash is sha224 of the message items
    noted above
    """
    # Use text body
    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
    # Use List ID
    xbody += bytes(lid, encoding='ascii')

    # Use the Date header, then archived-at. mdate is initialised up front
    # so that a header value the parser chokes on (it raises for non-string
    # values) can no longer leave the name unbound.
    mdate = None
    for header in ('date', 'archived-at'):
        value = msg.get(header)
        if not value:
            continue
        try:
            mdate = email.utils.parsedate_tz(value)
        except Exception:
            mdate = None
        if mdate:
            break

    # In keeping with preserving the past, we have kept this fallback.
    # For all intents and purposes, this is not a proper way of maintaining
    # a consistent ID in case of missing dates. It is recommended to use
    # another generator.
    if not mdate:
        # Standard 9-tuple plus a faked TZ offset of 0 (10th element)
        mdate = time.gmtime() + (0, )

    utc_seconds = email.utils.mktime_tz(mdate)
    mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(utc_seconds))
    xbody += bytes(mdatestring, encoding='ascii')
    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
    return mid
# cluster: Use data that is guaranteed to be the same across cluster setups
# This is the recommended generator for cluster setups.
# Unlike 'medium', this only makes use of the Date: header and not the archived-at,
# as the archived-at may change from node to node (and will change if not in the raw mbox file)
# Also the lid is not included in the hash, so the hash does not change if the lid is overridden
#
def cluster(msg, body, lid, attachments):
    """
    Use data that is guaranteed to be the same across cluster setups.

    For mails with a valid Message-ID this is likely to be unique.
    In other cases it is better than the medium generator as it uses
    several extra fields.

    N.B. The id is not guaranteed stable - i.e. it may change if the
    message is reparsed. The id depends on the parsed body, which depends
    on the exact method used to parse the mail. For example, are invalid
    characters ignored or replaced; is html parsing used?
    The output also depends on attachment hashes, so any changes to
    attachment parsing can also change the output.

    The following message fields are concatenated to form the hash input:
    - body as is if bytes, else encoded as ascii ignoring invalid
      characters; an empty or None body is treated as an empty string
      (trailing whitespace is dropped)
    - Message-Id (or '' if missing), encoded as ascii
    - Date header converted to YYYY/MM/DD HH:MM:SS (UTC)
      or "(null)" if the date does not exist or cannot be converted
    - sender, encoded as ascii (if the field exists)
    - subject, encoded as ascii (if the field exists)
    - the hashes of any attachments, in list order
    Note: the lid is not included in the hash.

    Parameters:
    msg - the parsed message
    body - the parsed text content (str, bytes or None)
    lid - list id
    attachments - list of attachments (each entry's 'hash' value is used)

    Returns: "r<hash>@<lid>" where hash is sha224 of the message items
    noted above
    """
    # Use text body
    if not body:  # Make sure body is not None, which would fail below
        body = ""
    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
    # Crop out any trailing whitespace in body
    parts = [re.sub(rb"\s+$", b"", xbody)]

    # Use Message-Id (or '' if missing)
    parts.append(bytes(msg.get('Message-Id', ''), encoding='ascii'))

    # Use Date header. Don't use archived-at, as the archiver sets this if
    # not present. Stays "(null)" unless the date is replicable across imports.
    mdatestring = "(null)"
    try:
        mdate = email.utils.parsedate_tz(msg.get('date'))
        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email.utils.mktime_tz(mdate)))
    except Exception:
        # Deliberate best effort: a missing or unconvertible date keeps the
        # "(null)" placeholder instead of aborting id generation.
        pass
    parts.append(bytes(mdatestring, encoding='ascii'))

    # Use sender, then subject (each only if present and non-empty)
    for header in ('from', 'subject'):
        value = msg.get(header, None)
        if value:
            parts.append(bytes(value, encoding='ascii'))

    # Use attachment hashes if present
    if attachments:
        parts.extend(bytes(a['hash'], encoding='ascii') for a in attachments)

    # generate the hash and combine with the lid to form the id
    mid = "r%s@%s" % (hashlib.sha224(b"".join(parts)).hexdigest(), lid)
    return mid
# Old school way of making IDs
def legacy(msg, body, lid, _attachments):
    """
    Original generator - DO NOT USE
    (does not generate unique ids)

    The hash input is created from
    - body: if bytes as is, else encoded as ascii ignoring invalid
      characters; if the body is None an exception is raised

    The uid_mdate for the id is the Date header converted to a UTC epoch
    value, or 0 if the header is missing or cannot be converted.

    Parameters:
    msg - the parsed message (used to get the date)
    body - the parsed text content (str or bytes)
    lid - list id
    _attachments - list of attachments (not used)

    Returns: "<hash>@<uid_mdate>@<lid>" where hash is sha224 of the body
    as described above
    """
    uid_mdate = 0  # Default if no date found
    try:
        mdate = email.utils.parsedate_tz(msg.get('date'))
        uid_mdate = email.utils.mktime_tz(mdate)  # Only set if Date header is valid
    except Exception:
        # Deliberate best effort: an absent or invalid Date keeps the 0 default.
        pass
    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
    mid = "%s@%s@%s" % (hashlib.sha224(xbody).hexdigest(), uid_mdate, lid)
    return mid
# Registry of the available ID generators.
# Every generator is registered under its own function name.
__GENERATORS = {
    func.__name__: func
    for func in (full, medium, cluster, legacy)
}
def generator(name):
    """
    Look up an ID generator function by name.

    Parameters:
    name - the generator name, i.e. a key of the generator registry

    Returns: the generator function registered under that name. If the
    name is unknown (or cannot be used as a dictionary key at all), a
    warning is printed and the 'legacy' generator is returned instead.
    """
    try:
        return __GENERATORS[name]
    except (KeyError, TypeError):
        # KeyError: unknown name; TypeError: unhashable name.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        print("WARN: generator %s not found, defaulting to 'legacy'" % name)
        return legacy
def generate(name, msg, body, lid, attachments):
    """
    Generate an id for a message using the generator selected by name.

    Parameters:
    name - the generator name, resolved via generator()
    msg - the parsed message
    body - the parsed text content
    lid - list id
    attachments - list of attachments

    Returns: whatever the selected generator returns for these arguments
    """
    make_id = generator(name)
    return make_id(msg, body, lid, attachments)
def generator_names():
    """
    Returns: a new list holding the names of all registered generators,
    in registration order
    """
    return [*__GENERATORS]