blob: 5df7595e3d488a0287b92abef3082f83d6b8af95 [file] [log] [blame]
#!/usr/bin/env python3
"""
Simple tool for collating multiple mbox files into a single one, sorted by message ID.
If the message-ID is missing, use the Date or Subject and prefix the sort key to appear last.
Can optionally sort by ezmlm number.
This should be less likely to have missing numbers or duplicate entries.
However duplicates can occur in archive files if:
- the sequence number was reset at any point
- multiple mailing lists were merged
- messages were somehow duplicated before archival
Used for multi-import tests where you wish to check that multiple sources give the same ID
Emails with duplicate sort keys are logged and dropped: only the last email seen for each key is kept.
"""
import argparse
import mailbox
import re
import sys
# ezmlm embeds the list sequence number in the envelope sender, e.g.
# "dev-return-1234-archive=example.org@example.org".
EZMLM_ID_RE = re.compile(r"return-(\d+)-")


def _sort_key(message, use_ezmlm):
    """Work out the sort key for a single message.

    Args:
        message: the parsed message, as returned by ``mailbox.mbox.get()``.
        use_ezmlm: if true, key on the ezmlm number found in the From_ line;
            otherwise key on Message-ID, falling back to Date or Subject.

    Returns:
        A ``(sortkey, lacks_msgid)`` tuple. ``sortkey`` is ``None`` when no
        usable key could be derived and the message must be skipped.
        ``lacks_msgid`` is true when Message-ID mode found no Message-ID.
    """
    if use_ezmlm:
        from_ = message.get_from()
        found = EZMLM_ID_RE.search(from_)
        if found is None:
            print("Failed to find ezmlm id in %s" % from_)
            return None, False
        # Compare as numbers: as strings, "10" would sort before "9".
        return int(found.group(1)), False

    msgid = message.get('message-id')
    if msgid:
        # A header value can be an email.header.Header object (not str) when
        # the raw header contains non-ASCII bytes, so coerce before strip().
        return str(msgid).strip(), False

    print("No message id, sorting by date or subject: ", message.get_from())
    altid = message.get('date') or message.get('subject')
    if not altid:
        # Nothing left to sort on; skip, as is done for a missing ezmlm id.
        print("No date or subject either, skipping: ", message.get_from())
        return None, True
    return "~" + str(altid).strip(), True  # try to ensure it sorts last


def collate(msgfiles, use_ezmlm=False):
    """Read every message from *msgfiles* into a dict indexed by sort key.

    Args:
        msgfiles: paths of existing mbox files to read.
        use_ezmlm: passed through to the sort key selection (see ``_sort_key``).

    Returns:
        ``(allmessages, crlf, noid, skipped)`` where ``allmessages`` maps sort
        key to the raw message bytes (including the From_ line), ``crlf`` is
        True/False according to the line ending of the first stored message
        (``None`` if nothing was stored), ``noid`` counts messages without a
        Message-ID and ``skipped`` counts messages that were dropped.
    """
    allmessages = {}
    noid = 0
    skipped = 0
    crlf = None  # assume that all emails have the same EOL
    for msgfile in msgfiles:
        messages = mailbox.mbox(msgfile, None, create=False)
        try:
            for key in messages.iterkeys():
                sortkey, lacks_msgid = _sort_key(messages.get(key), use_ezmlm)
                if lacks_msgid:
                    noid += 1
                if sortkey is None:
                    skipped += 1
                    continue

                # store the data (raw bytes, From_ line included)
                with messages.get_file(key, True) as msg_file:
                    message_raw = b''
                    if crlf is None:
                        # The first line of the first stored message decides
                        # which EOL is used between messages in the output.
                        message_raw = msg_file.readline()
                        crlf = message_raw.endswith(b'\r\n')
                    message_raw += msg_file.read()

                if sortkey in allmessages:
                    print("Duplicate sort key: %s" % sortkey)
                    skipped += 1
                # On a duplicate the later email replaces the earlier one.
                allmessages[sortkey] = message_raw
        finally:
            messages.close()
    return allmessages, crlf, noid, skipped


def write_mbox(outmbox, allmessages, crlf):
    """Write the collated messages to *outmbox* in sort-key order.

    Each message is followed by one empty line, using CRLF if *crlf* is true
    and LF otherwise. Returns the number of messages written.
    """
    eol = b'\r\n' if crlf else b'\n'
    nw = 0
    with open(outmbox, "wb") as f:
        for key in sorted(allmessages):
            f.write(allmessages[key])
            f.write(eol)
            nw += 1
    return nw


def main(argv=None):
    """Command-line entry point.

    Usage: ``[--ezmlm] OUTPUT_MBOX [INPUT_MBOX ...]``. *argv* defaults to
    ``sys.argv[1:]`` (argparse behaviour) and exists so the tool can be
    driven programmatically.
    """
    parser = argparse.ArgumentParser(description='Command line options.')
    parser.add_argument('--ezmlm', dest='ezmlm', action='store_true',
                        help="Use ezmlm numbering for sorting")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    args = parser.parse_args(argv)
    if not args.args:
        # Report a usage error instead of failing with a bare IndexError.
        parser.error("expected: OUTPUT_MBOX [INPUT_MBOX ...]")

    outmbox = args.args[0]
    msgfiles = args.args[1:]  # multiple input files allowed

    allmessages, crlf, noid, skipped = collate(msgfiles, args.ezmlm)
    nw = write_mbox(outmbox, allmessages, crlf)

    print("Wrote %u emails to %s with CRLF %s (%u without message-id) WARN: %u skipped" % (nw, outmbox, crlf, noid, skipped))


if __name__ == "__main__":
    main()