src/plugins/scanners/pipermail.py - kibble-scanners - Git at Google

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import mailbox
 import email.errors
 import email.utils
 import email.header
 import time
 import re
 import os
 import sys
 import hashlib
 import datetime
 import plugins.utils.urlmisc

 title = "Scanner for GNU Mailman Pipermail"
 version = "0.1.0"

 def accepts(source):
     """ Tell whether this source should be handled by the Pipermail scanner.

     A source is accepted when its 'type' is "pipermail", or when its 'type'
     is 'mail' and its 'sourceURL' is an http(s) URL with an /archives/ or
     /pipermail/ path segment followed by a list name.  Returns True or False.
     """
     kind = source['type']
     if kind == "pipermail":
         return True
     if kind != 'mail':
         return False
     looks_like_archive = re.match(
         r"(https?://.+/(archives|pipermail)/.+?)/?$", source['sourceURL']
     )
     return looks_like_archive is not None


 def _parse_sender(from_header):
     """ Split a From header into a (name, address) pair.

     The ``user at host (Real Name)`` form is tried first, then the
     ``Real Name <user@host>`` form.  If neither pattern matches, the header
     text is returned unchanged for both name and address.
     """
     m = re.match(r"(.+?) at (.+?) \((.*)\)$", from_header, flags=re.UNICODE)
     if m:
         return m.group(3).strip(), m.group(1) + "@" + m.group(2)
     m = re.match(r"(.+)\s*<(.+)>", from_header, flags=re.UNICODE)
     if m:
         return m.group(1).replace('"', "").strip(), m.group(2)
     return from_header, from_header


 def _scan_mbox(KibbleBit, source, mailFile, pd, dhash, knowns):
     """ Index one monthly mbox file: email docs, people, top topics and stats.

     KibbleBit -- scanner handle; pprint/exists/append/index are called on it.
     source    -- source dict; 'organisation', 'sourceURL', 'sourceID' are read.
     mailFile  -- path of the unpacked mbox file to parse.
     pd        -- struct_time for the first day of the month being scanned.
     dhash     -- id under which the month's 'mailstats' document is indexed.
     knowns    -- cache shared across months: sender address -> True once a
                  person document is known to exist (or has been appended).

     Exceptions raised while parsing or indexing propagate to the caller; the
     mbox handle is closed either way.
     """
     rawtopics = {}   # normalised subject -> number of emails
     posters = {}     # sender address -> {'name', 'email'}
     senders = {}     # message-id -> sender address, used to resolve replies
     emails = 0

     messages = mailbox.mbox(mailFile)
     try:
         for message in messages:
             emails += 1
             # Message lookups return None for absent headers, so this also
             # covers messages that lack the header entirely.
             if not message['subject'] or not message['from']:
                 continue

             # Header values holding raw non-ASCII bytes can come back as
             # email.header.Header objects, which re cannot match against.
             subject = str(message['subject'])
             name, sender = _parse_sender(str(message['from']))

             irt = message.get('in-reply-to', None)
             if not irt and message.get('references'):
                 irt = message.get('references').split("\n")[0].strip()
             replyto = None
             if irt and irt in senders:
                 replyto = senders[irt]
                 KibbleBit.pprint("This is a reply to %s" % replyto)

             # Strip one leading "Re:"-style prefix, then embedded line breaks.
             raw_subject = re.sub(r"^[a-zA-Z]+\s*:\s*", "", subject, count=10)
             raw_subject = re.sub(r"[\r\n\t]+", "", raw_subject, count=10)
             rawtopics[raw_subject] = rawtopics.get(raw_subject, 0) + 1

             if sender not in posters:
                 posters[sender] = {
                     'name': name,
                     'email': sender
                 }
             senders[message.get('message-id', "??")] = sender

             # Look each sender up once per scan; append a person document
             # only when the backend does not already hold one.
             if sender not in knowns:
                 sid = hashlib.sha1(("%s%s" % (source['organisation'], sender)).encode('ascii', errors='replace')).hexdigest()
                 knowns[sender] = KibbleBit.exists('person', sid)
                 if not knowns[sender]:
                     KibbleBit.append('person', {
                         'name': name,
                         'email': sender,
                         'organisation': source['organisation'],
                         'id': sid
                     })
                     knowns[sender] = True

             # parsedate_tz() returns None for a missing or unparseable Date.
             # Such a message still counts towards the month's statistics but
             # gets no email document, instead of aborting the whole month.
             mdate = email.utils.parsedate_tz(str(message['date'] or ""))
             if mdate is None:
                 continue
             try:
                 ts = email.utils.mktime_tz(mdate)
                 mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(ts))
             except (ValueError, OverflowError, OSError):
                 continue

             KibbleBit.append('email', {
                 'organisation': source['organisation'],
                 'sourceURL': source['sourceURL'],
                 'sourceID': source['sourceID'],
                 'date': mdatestring,
                 'sender': sender,
                 'replyto': replyto,
                 'subject': subject,
                 'address': sender,
                 'ts': ts,
                 'id': message['message-id']
             })

         md = time.strftime("%Y/%m/%d %H:%M:%S", pd)
         # Rank topics by number of emails, busiest first; ties keep the order
         # in which the subjects were first seen.
         ranked = sorted(rawtopics.items(), key=lambda item: item[1], reverse=True)
         for key, val in ranked[:10]:
             KibbleBit.pprint("Found top 10: %s (%s emails)" % (key, val))
             shash = hashlib.sha224(key.encode('ascii', errors='replace')).hexdigest()
             # one unique id per month per mail thread
             mlhash = hashlib.sha224((("%s%s%s%s") % (key, source['sourceURL'], source['organisation'], md)).encode('ascii', errors='replace')).hexdigest()
             KibbleBit.index('mailtop', mlhash, {
                 'organisation': source['organisation'],
                 'sourceURL': source['sourceURL'],
                 'sourceID': source['sourceID'],
                 'date': md,
                 'emails': val,
                 'shash': shash,
                 'subject': key,
                 # NOTE(review): time.mktime() reads pd as local time, whereas
                 # the per-email 'ts' values are UTC-based -- confirm intended.
                 'ts': time.mktime(pd),
                 'id': mlhash
             })

         KibbleBit.index('mailstats', dhash, {
             'organisation': source['organisation'],
             'sourceURL': source['sourceURL'],
             'sourceID': source['sourceID'],
             'date': md,
             'authors': len(posters),
             'emails': emails,
             'topics': len(rawtopics)
         })
     finally:
         messages.close()


 def scan(KibbleBit, source):
     """ Scan a Pipermail archive month by month and index what is found.

     Starting at the current UTC month and walking backwards, each monthly
     ``<sourceURL>/<year>-<Month>.txt.gz`` archive is fetched through
     plugins.utils.urlmisc.unzip() and parsed.  The first two months visited
     are always (re)parsed; older months only when no 'mailstats' document
     exists for them yet.  Scanning stops after more than 12 consecutive
     months that are missing or cannot be parsed, or once the year drops
     below 1970.

     Progress and the final outcome are stored in source['steps']['mail'] and
     pushed with KibbleBit.updateSource().  A sourceURL that does not match the
     Pipermail/archives URL pattern is recorded as a failed step and nothing
     is scanned.
     """
     url = source['sourceURL']
     if not re.match(r"(https?://.+/(archives|pipermail)/.+?)/?$", url):
         KibbleBit.pprint("Invalid Pipermail URL detected: %s" % url, True)
         source['steps']['mail'] = {
             'time': time.time(),
             'status': 'Invalid or malformed URL detected!',
             'running': False,
             'good': False
         }
         KibbleBit.updateSource(source)
         return

     KibbleBit.pprint("Scanning Pipermail source %s" % url)
     source['steps']['mail'] = {
         'time': time.time(),
         'status': 'Downloading Pipermail statistics',
         'running': True,
         'good': True
     }
     KibbleBit.updateSource(source)

     now = time.gmtime(time.time())
     year = now[0]
     month = now[1]
     firstYear = 1970
     months = 0    # number of monthly archives we have attempted to fetch
     skipped = 0   # consecutive months that were missing or unparseable
     knowns = {}   # sender address -> person document known to exist

     # Indexed by month number 1..12; slot 0 is never used.
     monthNames = ['December', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

     # While we have older archives, continue to parse
     while firstYear <= year:
         gzurl = "%s/%04u-%s.txt.gz" % (url, year, monthNames[month])
         pd = datetime.date(year, month, 1).timetuple()
         dhash = hashlib.sha224((("%s %s") % (source['organisation'], gzurl)).encode('ascii', errors='replace')).hexdigest()
         found = KibbleBit.exists('mailstats', dhash)
         if months <= 1 or not found:  # Always parse this month's stats and the previous month :)
             months += 1
             mailFile = plugins.utils.urlmisc.unzip(gzurl)
             if mailFile:
                 try:
                     _scan_mbox(KibbleBit, source, mailFile, pd, dhash, knowns)
                     # Only a fully parsed month ends a run of skips.
                     skipped = 0
                 except Exception as err:
                     KibbleBit.pprint("Couldn't parse %s, skipping: %s" % (gzurl, err))
                     skipped += 1
                 finally:
                     # Remove the unpacked archive whether or not parsing worked.
                     try:
                         os.unlink(mailFile)
                     except OSError as err:
                         KibbleBit.pprint("Couldn't remove %s: %s" % (mailFile, err))
             else:
                 KibbleBit.pprint("Couldn't find %s, skipping." % gzurl)
                 skipped += 1
             if skipped > 12:
                 KibbleBit.pprint("12 skips in a row, breaking off (no more data?)")
                 break
         month -= 1
         if month <= 0:
             month += 12
             year -= 1

     source['steps']['mail'] = {
         'time': time.time(),
         'status': 'Mail archives successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())),
         'running': False,
         'good': True
     }
     KibbleBit.updateSource(source)
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import mailbox
	import email.errors
	import email.utils
	import email.header
	import time
	import re
	import os
	import sys
	import hashlib
	import datetime
	import plugins.utils.urlmisc

	title = "Scanner for GNU Mailman Pipermail"
	version = "0.1.0"

	def accepts(source):
	    """ Whether or not we think this is pipermail.

	    Returns True when source['type'] is "pipermail", or when it is 'mail'
	    and source['sourceURL'] is an http(s) URL containing an /archives/ or
	    /pipermail/ path segment followed by a list name; False otherwise.
	    """
	    if source['type'] == "pipermail":
	        return True
	    if source['type'] == 'mail':
	        url = source['sourceURL']
	        # The alternation must be a bare "|": an escaped "\|" would only
	        # match a literal pipe character in the URL.
	        pipermail = re.match(r"(https?://.+/(archives|pipermail)/.+?)/?$", url)
	        if pipermail:
	            return True
	    return False


	def _parse_sender(from_header):
	    """ Split a From header into a (name, address) pair.

	    The ``user at host (Real Name)`` form is tried first, then the
	    ``Real Name <user@host>`` form.  If neither pattern matches, the header
	    text is returned unchanged for both name and address.
	    """
	    m = re.match(r"(.+?) at (.+?) \((.*)\)$", from_header, flags=re.UNICODE)
	    if m:
	        return m.group(3).strip(), m.group(1) + "@" + m.group(2)
	    m = re.match(r"(.+)\s*<(.+)>", from_header, flags=re.UNICODE)
	    if m:
	        return m.group(1).replace('"', "").strip(), m.group(2)
	    return from_header, from_header


	def _scan_mbox(KibbleBit, source, mailFile, pd, dhash, knowns):
	    """ Index one monthly mbox file: email docs, people, top topics and stats.

	    KibbleBit -- scanner handle; pprint/exists/append/index are called on it.
	    source    -- source dict; 'organisation', 'sourceURL', 'sourceID' are read.
	    mailFile  -- path of the unpacked mbox file to parse.
	    pd        -- struct_time for the first day of the month being scanned.
	    dhash     -- id under which the month's 'mailstats' document is indexed.
	    knowns    -- cache shared across months: sender address -> True once a
	                 person document is known to exist (or has been appended).

	    Exceptions raised while parsing or indexing propagate to the caller; the
	    mbox handle is closed either way.
	    """
	    rawtopics = {}   # normalised subject -> number of emails
	    posters = {}     # sender address -> {'name', 'email'}
	    senders = {}     # message-id -> sender address, used to resolve replies
	    emails = 0

	    messages = mailbox.mbox(mailFile)
	    try:
	        for message in messages:
	            emails += 1
	            # Message lookups return None for absent headers, so this also
	            # covers messages that lack the header entirely.
	            if not message['subject'] or not message['from']:
	                continue

	            # Header values holding raw non-ASCII bytes can come back as
	            # email.header.Header objects, which re cannot match against.
	            subject = str(message['subject'])
	            name, sender = _parse_sender(str(message['from']))

	            irt = message.get('in-reply-to', None)
	            if not irt and message.get('references'):
	                irt = message.get('references').split("\n")[0].strip()
	            replyto = None
	            if irt and irt in senders:
	                replyto = senders[irt]
	                KibbleBit.pprint("This is a reply to %s" % replyto)

	            # Strip one leading "Re:"-style prefix (whitespace around the
	            # colon is optional), then embedded line breaks.
	            raw_subject = re.sub(r"^[a-zA-Z]+\s*:\s*", "", subject, count=10)
	            raw_subject = re.sub(r"[\r\n\t]+", "", raw_subject, count=10)
	            rawtopics[raw_subject] = rawtopics.get(raw_subject, 0) + 1

	            if sender not in posters:
	                posters[sender] = {
	                    'name': name,
	                    'email': sender
	                }
	            senders[message.get('message-id', "??")] = sender

	            # Look each sender up once per scan; append a person document
	            # only when the backend does not already hold one.
	            if sender not in knowns:
	                sid = hashlib.sha1(("%s%s" % (source['organisation'], sender)).encode('ascii', errors='replace')).hexdigest()
	                knowns[sender] = KibbleBit.exists('person', sid)
	                if not knowns[sender]:
	                    KibbleBit.append('person', {
	                        'name': name,
	                        'email': sender,
	                        'organisation': source['organisation'],
	                        'id': sid
	                    })
	                    knowns[sender] = True

	            # parsedate_tz() returns None for a missing or unparseable Date.
	            # Such a message still counts towards the month's statistics but
	            # gets no email document, instead of aborting the whole month.
	            mdate = email.utils.parsedate_tz(str(message['date'] or ""))
	            if mdate is None:
	                continue
	            try:
	                ts = email.utils.mktime_tz(mdate)
	                mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(ts))
	            except (ValueError, OverflowError, OSError):
	                continue

	            KibbleBit.append('email', {
	                'organisation': source['organisation'],
	                'sourceURL': source['sourceURL'],
	                'sourceID': source['sourceID'],
	                'date': mdatestring,
	                'sender': sender,
	                'replyto': replyto,
	                'subject': subject,
	                'address': sender,
	                'ts': ts,
	                'id': message['message-id']
	            })

	        md = time.strftime("%Y/%m/%d %H:%M:%S", pd)
	        # Rank topics by number of emails, busiest first; ties keep the order
	        # in which the subjects were first seen.
	        ranked = sorted(rawtopics.items(), key=lambda item: item[1], reverse=True)
	        for key, val in ranked[:10]:
	            KibbleBit.pprint("Found top 10: %s (%s emails)" % (key, val))
	            shash = hashlib.sha224(key.encode('ascii', errors='replace')).hexdigest()
	            # one unique id per month per mail thread
	            mlhash = hashlib.sha224((("%s%s%s%s") % (key, source['sourceURL'], source['organisation'], md)).encode('ascii', errors='replace')).hexdigest()
	            KibbleBit.index('mailtop', mlhash, {
	                'organisation': source['organisation'],
	                'sourceURL': source['sourceURL'],
	                'sourceID': source['sourceID'],
	                'date': md,
	                'emails': val,
	                'shash': shash,
	                'subject': key,
	                # NOTE(review): time.mktime() reads pd as local time, whereas
	                # the per-email 'ts' values are UTC-based -- confirm intended.
	                'ts': time.mktime(pd),
	                'id': mlhash
	            })

	        KibbleBit.index('mailstats', dhash, {
	            'organisation': source['organisation'],
	            'sourceURL': source['sourceURL'],
	            'sourceID': source['sourceID'],
	            'date': md,
	            'authors': len(posters),
	            'emails': emails,
	            'topics': len(rawtopics)
	        })
	    finally:
	        messages.close()


	def scan(KibbleBit, source):
	    """ Scan a Pipermail archive month by month and index what is found.

	    Starting at the current UTC month and walking backwards, each monthly
	    ``<sourceURL>/<year>-<Month>.txt.gz`` archive is fetched through
	    plugins.utils.urlmisc.unzip() and parsed.  The first two months visited
	    are always (re)parsed; older months only when no 'mailstats' document
	    exists for them yet.  Scanning stops after more than 12 consecutive
	    months that are missing or cannot be parsed, or once the year drops
	    below 1970.

	    Progress and the final outcome are stored in source['steps']['mail'] and
	    pushed with KibbleBit.updateSource().  A sourceURL that does not match the
	    Pipermail/archives URL pattern is recorded as a failed step and nothing
	    is scanned.
	    """
	    url = source['sourceURL']
	    # The alternation must be a bare "|": an escaped "\|" would only match a
	    # literal pipe character in the URL.
	    if not re.match(r"(https?://.+/(archives|pipermail)/.+?)/?$", url):
	        KibbleBit.pprint("Invalid Pipermail URL detected: %s" % url, True)
	        source['steps']['mail'] = {
	            'time': time.time(),
	            'status': 'Invalid or malformed URL detected!',
	            'running': False,
	            'good': False
	        }
	        KibbleBit.updateSource(source)
	        return

	    KibbleBit.pprint("Scanning Pipermail source %s" % url)
	    source['steps']['mail'] = {
	        'time': time.time(),
	        'status': 'Downloading Pipermail statistics',
	        'running': True,
	        'good': True
	    }
	    KibbleBit.updateSource(source)

	    now = time.gmtime(time.time())
	    year = now[0]
	    month = now[1]
	    firstYear = 1970
	    months = 0    # number of monthly archives we have attempted to fetch
	    skipped = 0   # consecutive months that were missing or unparseable
	    knowns = {}   # sender address -> person document known to exist

	    # Indexed by month number 1..12; slot 0 is never used.
	    monthNames = ['December', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

	    # While we have older archives, continue to parse
	    while firstYear <= year:
	        gzurl = "%s/%04u-%s.txt.gz" % (url, year, monthNames[month])
	        pd = datetime.date(year, month, 1).timetuple()
	        dhash = hashlib.sha224((("%s %s") % (source['organisation'], gzurl)).encode('ascii', errors='replace')).hexdigest()
	        found = KibbleBit.exists('mailstats', dhash)
	        if months <= 1 or not found:  # Always parse this month's stats and the previous month :)
	            months += 1
	            mailFile = plugins.utils.urlmisc.unzip(gzurl)
	            if mailFile:
	                try:
	                    _scan_mbox(KibbleBit, source, mailFile, pd, dhash, knowns)
	                    # Only a fully parsed month ends a run of skips.
	                    skipped = 0
	                except Exception as err:
	                    KibbleBit.pprint("Couldn't parse %s, skipping: %s" % (gzurl, err))
	                    skipped += 1
	                finally:
	                    # Remove the unpacked archive whether or not parsing worked.
	                    try:
	                        os.unlink(mailFile)
	                    except OSError as err:
	                        KibbleBit.pprint("Couldn't remove %s: %s" % (mailFile, err))
	            else:
	                KibbleBit.pprint("Couldn't find %s, skipping." % gzurl)
	                skipped += 1
	            if skipped > 12:
	                KibbleBit.pprint("12 skips in a row, breaking off (no more data?)")
	                break
	        month -= 1
	        if month <= 0:
	            month += 12
	            year -= 1

	    source['steps']['mail'] = {
	        'time': time.time(),
	        'status': 'Mail archives successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())),
	        'running': False,
	        'good': True
	    }
	    KibbleBit.updateSource(source)