#!/usr/bin/python
"""
The purpose of this script is to find attachments to email messages that
are sent to secretary@apache.org and commit them into svn:documents/received.
This task is made more difficult by the fact that email often uses payloads
for reasons other than attachments, from time to time we get spam, some
people routinely pgp sign all of their emails, and others use pgp signatures
to sign forms.
Deciding what to commit is therefore, necessarily, a bit of heuristics. When
in doubt, the intent here is to err on the side of committing more than is
necessary rather than to miss an email.
Examples of heuristics:
* Images less than 10K bytes tend to be decorations for HTML formatted
spam emails, and are not likely to be scanned forms.
* text/plain emails that contain a PGP signature and the ASF fax number
are likely to be signed forms.
"""
import email
import gzip
import mailbox
import rfc822
import mimetypes
import os
from datetime import datetime
from email.header import decode_header
from glob import glob
import re
from subprocess import Popen, PIPE
from threading import Thread
import commands
import getpass
try:
from hashlib import md5
except ImportError:
from md5 import new as md5
# attachment types which generally are not saved.
skip = ['multipart/alternative', 'multipart/related', 'multipart/mixed',
'message/delivery-status', 'text/plain', 'text/html']
# attachment file names which always are saved, even if they come in
# with one of the 'skip' mime types.
forms = ['pgp.txt', 'icla.txt', 'icla.txt.asc', 'icla.pdf', 'icla.pdf.asc', 'membership-application.txt']
# mime types for pgp signatures
sigs = ['application/pkcs7-signature', 'application/pgp-signature']
# convert header from whatever encoding it is in to utf-8. Handle
# mislabelled encodings.
def decode(header, field=0):
    """Return one decoded part of a mail header as a UTF-8 byte string.

    header -- the raw header value: either a unicode object or a byte
              string that may contain RFC 2047 encoded words.
    field  -- index of the part to return when decode_header() splits the
              header into several parts.  An out-of-range index raises
              IndexError.  Ignored for unicode input.

    A part whose declared charset is unknown, or whose bytes are not valid
    in that charset, is reinterpreted as ISO-8859-1, which accepts any
    byte sequence.
    """
    # type(u'') is `unicode` on Python 2
    if isinstance(header, type(u'')):
        return header.encode('utf-8')

    raw, charset = decode_header(header)[field]
    try:
        # Parts without a declared charset are tried as UTF-8 first, so
        # raw UTF-8 bytes are not double-encoded by the fallback below.
        return raw.decode(charset or 'utf-8').encode('utf-8')
    except (LookupError, TypeError, ValueError):
        # LookupError: unknown codec name; ValueError covers UnicodeError
        # (mislabelled content); TypeError: malformed codec name.
        return raw.decode('iso-8859-1').encode('utf-8')
# convert non-ascii characters into rough equivalents for the purpose
# of determining a file name to store in SVN.
# Ordered (pattern, replacement) rules used by asciize().  The patterns are
# written as UTF-8 byte sequences.  Order matters: the digraph rules must run
# before the single-letter rules that cover the same characters.
_ASCII_FOLDS = (
    # digraphs.  May be culturally sensitive
    (r"\xc3\x9f", 'ss'),
    (r"\xc3\xa4|a\xcc\x88", 'ae'),
    (r"\xc3\xa5|a\xcc\x8a", 'aa'),
    (r"\xc3\xa6", 'ae'),
    (r"\xc3\xb1|n\xcc\x83", 'ny'),
    (r"\xc3\xb6|o\xcc\x88", 'oe'),
    (r"\xc3\xbc|u\xcc\x88", 'ue'),
    # latin 1
    (r"\xc3[\xa0-\xa5]", 'a'),
    (r"\xc3\xa7", 'c'),
    (r"\xc3[\xa8-\xab]", 'e'),
    (r"\xc3[\xac-\xaf]", 'i'),
    (r"\xc3[\xb2-\xb6]|\xc3\xb8", 'o'),
    (r"\xc3[\xb9-\xbc]", 'u'),
    (r"\xc3[\xbd\xbf]", 'y'),
    # Latin Extended-A
    (r"\xc4[\x80-\x85]", 'a'),
    (r"\xc4[\x86-\x8d]", 'c'),
    (r"\xc4[\x8e-\x91]", 'd'),
    (r"\xc4[\x92-\x9b]", 'e'),
    (r"\xc4[\x9c-\xa3]", 'g'),
    (r"\xc4[\xa4-\xa7]", 'h'),
    (r"\xc4[\xa8-\xb1]", 'i'),
    (r"\xc4[\xb2-\xb3]", 'ij'),
    (r"\xc4[\xb4-\xb5]", 'j'),
    (r"\xc4[\xb6-\xb8]", 'k'),
    (r"\xc4[\xb9-\xff]|\xc5[\x80-\x82]", 'l'),
    (r"\xc5[\x83-\x8b]", 'n'),
    (r"\xc5[\x8c-\x91]", 'o'),
    (r"\xc5[\x92-\x93]", 'oe'),
    (r"\xc5[\x94-\x99]", 'r'),
    # S variants end at U+0161 (C5 A1); U+0162 (C5 A2) is already a T
    (r"\xc5[\x9a-\xa1]", 's'),
    (r"\xc5[\xa2-\xa7]", 't'),
    (r"\xc5[\xa8-\xb3]", 'u'),
    (r"\xc5[\xb4-\xb5]", 'w'),
    (r"\xc5[\xb6-\xb8]", 'y'),
    (r"\xc5[\xb9-\xbe]", 'z'),
    # denormalized diacritics
    (r"\xcc[\x80-\xff]|\xcd[\x80-\xaf]", ''),
)


def asciize(name):
    """Reduce *name* to characters that are safe in a file name.

    Accented Latin letters (given as UTF-8 byte sequences) are first folded
    to rough ASCII equivalents.  Every remaining run of characters other
    than '.', letters, digits and '_' is then collapsed into a single '-'.
    """
    # the folding rules can only apply when a non-ASCII character is present
    if re.search(r"[^\x00-\x7F]", name):
        for pattern, replacement in _ASCII_FOLDS:
            name = re.sub(pattern, replacement, name)
    return re.sub(r"[^.\w]+", '-', name)
# add svn at sign if necessary
def svn(command, file):
    """Run ``svn <command> <file>`` through the shell; return its status.

    command -- the svn subcommand plus any options, already formatted as
               shell text by the caller.
    file    -- the target path.  It is shell-quoted here because the
               callers in this file build it from e-mail attachment names.

    Returns whatever os.system() returns, so 0 means success.
    """
    try:
        from shlex import quote    # Python 3.3+
    except ImportError:
        from pipes import quote    # Python 2

    # svn treats the last '@' in a target as the start of a peg revision;
    # a trailing '@' makes it take any earlier '@' literally.
    target = file
    if '@' in target:
        target = target + '@'
    return os.system('svn ' + command + ' ' + quote(target))
# spam assassin client
def analyze(msg):
    """Pipe *msg* through ``spamc`` and return the message it emits.

    The returned message is parsed from spamc's output.  It carries the
    original subject (spamc mangles encoded strings) and an extra boolean
    attribute ``spam`` that is true when the X-Spam-Status header starts
    with 'Yes'.
    """
    from email.generator import Generator

    spamc = Popen('spamc', shell=True, stdin=PIPE, stdout=PIPE)

    def feed(pipe, message):
        # Best effort: a failure while writing is ignored, and the pipe is
        # closed regardless so that spamc sees end-of-input.
        try:
            Generator(pipe).flatten(message)
        except Exception:
            pass
        pipe.close()

    # Write on a separate thread while this thread reads the output --
    # presumably so that neither side stalls on a full pipe.
    writer = Thread(target=feed, args=(spamc.stdin, msg))
    writer.start()

    subject = msg['subject']
    result = email.message_from_file(spamc.stdout)
    if subject is not None:
        # Assigning a header appends rather than replaces, and lookups
        # return the first match, so spamc's copy has to be removed first.
        del result['subject']
        result['subject'] = subject
    result.spam = str(result['X-Spam-Status']).startswith('Yes')

    writer.join()
    spamc.wait()
    spamc.stdout.close()
    return result
# main logic for this script: process attachments for a single message
def detach(msg):
    """Save the eligible attachments of one multipart message into svn.

    msg -- a multipart email message.

    Messages without a Message-Id, messages already recorded under
    'tally/', and messages from a known spammer are ignored.  Eligible
    attachments are written below 'received/', added to svn and annotated
    with svn properties.  A summary of the message is written to
    'tally/<md5 of the Message-Id>' and used as the commit message.
    Always returns None.
    """
    try:
        from shlex import quote    # Python 3.3+
    except ImportError:
        from pipes import quote    # Python 2

    # quick exit if we have seen this entry before
    message_id = msg['message-id']
    if not message_id: return
    digest = md5(message_id).hexdigest()
    tally = os.path.join('tally', digest)
    if os.path.exists(tally): return

    # known spammers (a missing From header simply does not match)
    if '<r_ieftin@yahoo.ro>' in (msg['from'] or ''):
        return

    # collect eligible attachments
    attachments = []
    for payload in msg.get_payload():
        # progress into multipart/mixed
        if payload.get_content_type() == 'multipart/mixed':
            parts = payload.get_payload()
        else:
            parts = [payload]

        # iterate over (possibly nested) attachments
        for part in parts:
            ctype = part.get_content_type()
            if ctype in skip:
                # normally skipped types are kept only when they carry a
                # well-known form name and look like a form or a signature
                if part.get_filename() not in forms: continue
                content = part.get_payload(decode=True) or ''
                if 'License Agreement' not in content and \
                   '-----BEGIN PGP SIGNATURE-----' not in content:
                    continue
            if ctype == 'image/gif':
                # small GIFs tend to be decorations, not scanned forms
                if len(part.get_payload(decode=True) or '') < 10240: continue
            attachments.append(part)

    if not attachments: return

    if os.system('svn update received') != 0:
        return

    ## COMMENTED OUT - AS SPAMC IS NOT INSTALLED HERE
    #
    # if 'eFax message from' not in decode(msg['subject']):
    #     msg = analyze(msg)
    #     if msg.spam:
    #         attachments = []

    # determine output file name prefix
    prefix = ''
    if len(attachments) > 1:
        # several attachments go into a directory named after the sender
        prefix = rfc822.parseaddr(decode(msg['from']).decode('utf-8'))[1]
        received = os.path.join('received', prefix)
        if (not re.match(r'^[.@\w]+$', prefix)) or os.path.exists(received):
            # unusable or already taken: name the directory after the date.
            # Only the first six fields of parsedate() are meaningful.
            stamp = datetime(*email.utils.parsedate(msg['date'])[:6])
            prefix = stamp.isoformat().replace(':', '_').replace('-', '_')
            received = os.path.join('received', prefix)
        if not os.path.exists(received): os.mkdir(received)
        svn('add', received)
        prefix += os.sep
    elif len(attachments) == 1:
        name = asciize(decode(attachments[0].get_filename()))
        if not name: return
        if attachments[0].get_content_type() in sigs: return
        if len(name) < 16:
            # short names are not distinctive: prepend the sender
            prefix = decode(msg['from'])
            if prefix.startswith('"eFax"'):
                prefix = 'eFax'
            else:
                prefix = asciize(prefix)
                # NOTE(review): asciize() has already replaced '<' and '@'
                # with '-', so these two steps look like no-ops -- confirm
                # the intended order before changing the naming scheme.
                if prefix.find('<') >= 0: prefix = prefix.split('<')[1]
                prefix = prefix.split('@')[0]
            prefix = prefix + '-'

    # determine commit message
    summary = "\n".join([
        'Subject: ' + decode(msg['subject']),
        'From: ' + decode(msg['from']),
        'Date: ' + str(msg['date']),
        'Message-Id: ' + str(msg['message-id']),
        'X-Spam-Status: ' + str(msg['X-Spam-Status']),
    ])

    count = 0
    path = None

    # decode payloads and add them to svn
    for attachment in attachments:
        mime = attachment.get_content_type()
        if mime == 'application/octet-stream':
            mime = mimetypes.guess_type(decode(attachment.get_filename()))[0]

        name = asciize(decode(attachment.get_filename()))
        # NOTE(review): a missing filename decodes to 'None' on Python 2,
        # so this test may have been meant to be case-insensitive.
        if name == 'none':
            # the raw parameter comes straight from the mail, so sanitise it
            params = dict(attachment.get_params() or [])
            name = asciize(str(params.get('name')))

        content = attachment.get_payload(decode=True)
        if content:
            path = os.path.join('received', (prefix + name).strip('-'))
            if os.path.isdir(path): path = os.path.join(path, 'unnamed')
            fh = open(path, 'wb')    # attachments are binary data
            try:
                fh.write(content)
            finally:
                fh.close()
            svn('add', path)
            if mime: svn('propset svn:mime-type ' + quote(mime), path)
            count = count + 1

    # with several files the properties and the commit apply to the directory
    if count > 1: path = os.path.join('received', prefix.strip('-'))

    # Record who sent the mail as svn properties.  This is best effort: a
    # header that cannot be parsed must not prevent the commit below.
    try:
        name = decode(msg['from'], 0)
        try:
            addr = rfc822.parseaddr(decode(msg['from'], 1))[1]
        except Exception:
            # the header was not split into a name part and an address part
            name, addr = rfc822.parseaddr(name)

        if name != 'eFax' and path:
            props = {
                'email:id': msg['message-id'],
                'email:subject': re.sub(r'\n\s*', ' ', decode(msg['subject']))
            }
            if name: props['email:name'] = name
            if addr: props['email:addr'] = addr
            if msg['cc']:
                props['email:cc'] = re.sub(r'\s+', ' ', decode(msg['cc']))
            for (key, value) in props.items():
                # header text is untrusted, so it is shell-quoted; as a
                # result the property holds the value itself, not its repr()
                svn('propset ' + key + ' ' + quote(value), path)
    except Exception:
        pass

    fh = open(tally, 'w')
    try:
        fh.write(summary + "\n")
    finally:
        fh.close()

    if count > 0 and getpass.getuser() != 'www-data':
        # a failed commit is not retried here; the files stay added in the
        # working copy for a later commit
        svn('commit --file ' + tally, path)
if __name__ == "__main__":
    import sys

    # Locate the mail archive: the production layout if present, otherwise
    # a file called 'mailbox' in the current directory.
    if os.path.exists('/home/apmail/private-arch/officers-secretary'):
        archive = '/home/apmail/private-arch/officers-secretary/20*'
        os.chdir('/home/apmail/secretary-mail')
        # the mtime of the 'latest' marker records how far the last run got
        previous = os.stat('latest').st_mtime
    elif os.path.exists('mailbox'):
        archive = 'mailbox'
        # one second before the file's own mtime, so it is always processed
        previous = int(os.stat(archive).st_mtime) - 1
    else:
        sys.stderr.write("can't find mailbox. Exiting.\n")
        sys.exit(1)

    latest = previous
    last_processed = None

    # process updated mbox files
    for path in glob(archive):
        # Stat once, before reading: mail appended while this file is being
        # processed then makes the file look updated again on the next run.
        modified = os.stat(path).st_mtime
        if int(previous) >= int(modified): continue

        # open gzipped/raw file
        if path.endswith('.gz'):
            fh = gzip.open(path)
        else:
            fh = open(path)

        try:
            # process each multipart message in the mailbox
            for msg in iter(mailbox.UnixMailbox(fh, email.message_from_file)):
                last_processed = msg['Date']
                if msg.is_multipart():
                    detach(msg)
                    continue

                # A plain message that mentions the fax number and has a PGP
                # signature line is treated as a signed form: wrap it so
                # that detach() sees it as an attachment named 'pgp.txt'.
                body = msg.get_payload()
                if '919-573-9199' not in body: continue
                if '-----BEGIN PGP SIGNATURE-----' not in body.split("\n"):
                    continue
                msg.add_header('Content-Disposition', 'attachment',
                               filename='pgp.txt')
                wrapper = email.message.Message()
                wrapper.attach(msg)
                for header in msg.keys(): wrapper[header] = msg[header]
                detach(wrapper)
        finally:
            fh.close()

        # keep track of the latest
        if latest < modified:
            latest = modified

    # record where we are so that the next run can pick up where we left off
    if latest > previous:
        os.utime('latest', (latest, latest))

    # check for any incomplete removals
    if commands.getoutput('svn status received') != '':
        os.system("svn st received | grep '!' | cut -c 8- | xargs -r svn revert --")

    # check for any incomplete commits
    if commands.getoutput('svn status received') != '':
        if getpass.getuser() != 'www-data':
            os.system('svn commit -m "queued documents" received')

    # update web page with last processed information
    if last_processed and os.path.exists('../public_html/secmail.txt'):
        fh = open('../public_html/secmail.txt', 'w')
        try:
            fh.write("Latest email processed was sent: %s" % last_processed)
        finally:
            fh.close()