| #!/usr/bin/env python3 |
| # -*- coding: utf-8 -*- |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """ |
| This file contains the various ID generators for Pony Mail's archivers. |
| """ |
| |
| import hashlib |
| import email.utils |
| import time |
| import re |
| |
| # Full generator: uses the entire email (including server-dependent data) |
| # This is the recommended generator for single-node setups. |
def full(msg, _body, lid, _attachments):
    """
    Full generator: hash the complete serialized email
    (server-dependent headers included).

    The id is almost certainly unique, but different copies of the same
    message are likely to carry different headers and therefore different ids.

    WARNING: the archiver by default adds an archived-at header with the current time.
    That header is part of the hashed bytes, so a message gets a different
    Permalink if it is reloaded from source.

    Parameters:
    msg - the parsed message; must provide as_bytes()
    _body - the parsed text content (not used)
    lid - list id
    _attachments - list of attachments (not used)

    Returns: "<hash>@<lid>" where hash is the sha224 hex digest of the message bytes
    """
    raw_message = msg.as_bytes()
    digest = hashlib.sha224(raw_message).hexdigest()
    return "%s@%s" % (digest, lid)
| |
| # Medium: Standard 0.9 generator - Not recommended for future installations. |
| # See 'full' or 'cluster' generators instead. |
def medium(msg, body, lid, _attachments):
    """
    Standard 0.9 generator - Not recommended for future installations.
    (does not generate sufficiently unique ids)
    Also the lid is included in the hash; this causes problems if the listname needs to be changed.

    N.B. The id is not guaranteed stable - i.e. it may change if the message is reparsed.
    The id depends on the parsed body, which depends on the exact method used to parse the mail.
    For example, are invalid characters ignored or replaced; is html parsing used?

    The following message fields are concatenated to form the hash input:
    - body: if bytes as is else encoded ascii, ignoring invalid characters
    - lid, encoded as ascii
    - Date header if it exists and parses OK; failing that
    - archived-at header if it exists and parses OK; failing that
    - current time.
    The resulting date is converted to YYYY/MM/DD HH:MM:SS (using UTC)

    Parameters:
    msg - the parsed message (used to get the date); only msg.get() is called
    body - the parsed text content (str or bytes; must not be None)
    lid - list id
    _attachments - list of attachments (not used)

    Returns: "<hash>@<lid>" where hash is sha224 of the message items noted above

    Raises:
    AttributeError - if body is None (or any other type without encode())
    UnicodeEncodeError - if lid contains non-ASCII characters
    """

    # Use text body
    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
    # Use List ID
    xbody += bytes(lid, encoding='ascii')

    # Use Date header.
    # mdate must be initialised up front: if the parser raises (e.g. the header
    # value is not a plain string) the fall-back logic below still needs a value.
    mdate = None
    try:
        mdate = email.utils.parsedate_tz(msg.get('date'))
    except Exception:
        # Unparseable Date header: treat it as missing
        pass

    # In keeping with preserving the past, we have kept this next section(s).
    # For all intents and purposes, this is not a proper way of maintaining
    # a consistent ID in case of missing dates. It is recommended to use
    # another generator
    if not mdate and msg.get('archived-at'):
        try:
            mdate = email.utils.parsedate_tz(msg.get('archived-at'))
        except Exception:
            # Unparseable archived-at header: fall through to the current time
            mdate = None
    if not mdate:
        # Standard 9-tuple plus a fake zero TZ offset as the 10th element
        mdate = time.gmtime() + (0, )

    mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email.utils.mktime_tz(mdate)))
    xbody += bytes(mdatestring, encoding='ascii')
    mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
    return mid
| |
| # cluster: Use data that is guaranteed to be the same across cluster setups |
| # This is the recommended generator for cluster setups. |
| # Unlike 'medium', this only makes use of the Date: header and not the archived-at, |
| # as the archived-at may change from node to node (and will change if not in the raw mbox file) |
| # Also the lid is not included in the hash, so the hash does not change if the lid is overridden |
| # |
def cluster(msg, body, lid, attachments):
    """
    Use data that is guaranteed to be the same across cluster setups
    For mails with a valid Message-ID this is likely to be unique
    In other cases it is better than the medium generator as it uses several extra fields

    N.B. The id is not guaranteed stable - i.e. it may change if the message is reparsed.
    The id depends on the parsed body, which depends on the exact method used to parse the mail.
    For example, are invalid characters ignored or replaced; is html parsing used?
    The output also depends on attachment hashes, so any changes to attachment parsing
    can also change the output. For example, the code now handles inline attachments.

    The following message fields are concatenated to form the hash input:
    - body as is if bytes else encoded ascii, ignoring invalid characters;
      if the body is None or empty it is treated as an empty string
      (trailing ASCII whitespace is dropped)
    - Message-ID (if present)
    - Date header converted to YYYY/MM/DD HH:MM:SS (UTC)
      or "(null)" if the date does not exist or cannot be converted
    - sender, encoded as ascii (if the field exists)
    - subject, encoded as ascii (if the field exists)
    - the hashes of any attachments

    Note: the lid is not included in the hash.

    Parameters:
    msg - the parsed message; only msg.get() is called
    body - the parsed text content (str, bytes or None)
    lid - list id
    attachments - list of attachments (uses the 'hash' entry of each one); may be None or empty

    Returns: "r<hash>@<lid>" where hash is sha224 of the message items noted above

    Raises:
    UnicodeEncodeError - if the Message-Id, sender, subject or an attachment hash
                         contains non-ASCII characters (they are encoded strictly)
    """
    # Use text body. None (or any empty body) is hashed as the empty string.
    if not body:
        body = ""
    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')

    # Crop out any trailing whitespace in body.
    # bytes.rstrip() removes exactly the ASCII whitespace set that the former
    # bytes regex \s+$ matched, without the regex's backtracking cost.
    xbody = xbody.rstrip()

    # Use Message-Id (or '' if missing)
    xbody += bytes(msg.get('Message-Id', ''), encoding='ascii')

    # Use Date header. Don't use archived-at, as the archiver sets this if not present.
    mdatestring = "(null)"  # Default to null, ONLY changed if replicable across imports
    try:
        mdate = email.utils.parsedate_tz(msg.get('date'))
        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email.utils.mktime_tz(mdate)))
    except Exception:
        # Missing or unconvertible date: keep the "(null)" placeholder
        pass
    xbody += bytes(mdatestring, encoding='ascii')

    # Use sender
    sender = msg.get('from', None)
    if sender:
        xbody += bytes(sender, encoding='ascii')

    # Use subject
    subject = msg.get('subject', None)
    if subject:
        xbody += bytes(subject, encoding='ascii')

    # Use attachment hashes if present (in list order)
    if attachments:
        xbody += b"".join(bytes(a['hash'], encoding='ascii') for a in attachments)

    # generate the hash and combine with the lid to form the id
    mid = "r%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid)
    return mid
| |
| |
| # Old school way of making IDs |
def legacy(msg, body, lid, _attachments):
    """
    Original generator - DO NOT USE
    (does not generate unique ids)

    The hash input is created from
    - body: if bytes as is else encoded ascii, ignoring invalid characters

    The uid_mdate for the id is the Date header converted to UTC epoch seconds,
    else 0 if the header is missing or cannot be converted

    Parameters:
    msg - the parsed message (used to get the date); only msg.get() is called
    body - the parsed text content (str or bytes; must not be None)
    lid - list id
    _attachments - list of attachments (not used)

    Returns: "<hash>@<uid_mdate>@<lid>" where hash is sha224 of the message items noted above

    Raises:
    AttributeError - if body is None (or any other type without encode())
    """
    uid_mdate = 0  # Default if no date found
    try:
        mdate = email.utils.parsedate_tz(msg.get('date'))
        uid_mdate = email.utils.mktime_tz(mdate)  # Only set if Date header is valid
    except Exception:
        # Missing or invalid Date header: keep the default of 0.
        # (KeyboardInterrupt / SystemExit are deliberately not swallowed.)
        pass
    xbody = body if isinstance(body, bytes) else body.encode('ascii', 'ignore')
    mid = "%s@%s@%s" % (hashlib.sha224(xbody).hexdigest(), uid_mdate, lid)
    return mid
| |
# Registry of the available ID generators, keyed by the name used to select them
__GENERATORS = dict(
    full=full,
    medium=medium,
    cluster=cluster,
    legacy=legacy,
)
| |
def generator(name):
    """
    Look up an ID generator function by name.

    Parameters:
    name - the generator name, i.e. one of the keys of __GENERATORS

    Returns: the matching generator function. If the name is not registered
    (or cannot be used as a dictionary key) a warning is printed to stdout
    and the 'legacy' generator is returned instead.
    """
    try:
        return __GENERATORS[name]
    except (KeyError, TypeError):
        # KeyError: unknown name; TypeError: unhashable name (e.g. a list).
        # The name is wrapped in a 1-tuple so that a tuple-valued name cannot
        # break the %-formatting of the warning itself.
        print("WARN: generator %s not found, defaulting to 'legacy'" % (name,))
        return legacy
| |
def generate(name, msg, body, lid, attachments):
    """
    Generate a message id using the generator selected by name.

    Parameters:
    name - the generator name, resolved through generator()
    msg - the parsed message
    body - the parsed text content
    lid - list id
    attachments - list of attachments

    Returns: whatever id string the selected generator produces
    """
    make_id = generator(name)
    return make_id(msg, body, lid, attachments)
| |
def generator_names():
    """
    Returns: a new list holding the names of all registered generators,
    in registration order
    """
    return list(__GENERATORS.keys())