| #!/usr/bin/env python3 |
| # -*- coding: utf-8 -*- |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """ |
| This is a Kibble scanner plugin for Apache Pony Mail sources. |
| """ |
| import requests |
| import json |
| import time |
| import re |
| import hashlib |
| import sys |
| import datetime |
| import plugins.utils.jsonapi |
| |
# Human-readable description of this scanner plugin
title = "Scanner plugin for Apache Pony Mail"
# Version string of this plugin
version = "0.1.0"
| |
def accepts(source):
    """ Decide whether the given source can be handled by this plugin """
    kind = source['type']

    # A source explicitly typed after this plugin is always accepted
    if kind == 'ponymail':
        return True

    # Generic 'mail' sources are accepted only if the URL has the list.html?list@domain shape
    if kind == 'mail':
        return re.match(r"(https?://.+)/list\.html\?(.+)@(.+)", source['sourceURL']) is not None

    # Anything else is not ours
    return False
| |
def countSubs(struct, kids = 0):
    """ Returns kids plus the total number of replies nested below struct """
    # A node without (or with empty) 'children' contributes nothing further
    if 'children' not in struct or len(struct['children']) == 0:
        return kids
    # Each direct reply counts once, plus everything nested beneath it
    return kids + sum(1 + countSubs(reply) for reply in struct['children'])
| |
def repliedTo(emails, struct):
    """
    Builds a map of reply id -> id of the email it replies to.

    emails is accepted for interface compatibility but is not used.
    struct is a list of thread nodes; each node has a 'tid' and may have a
    'children' list of nodes of the same shape.

    Returns a dict mapping every nested node's 'tid' to its direct parent's
    'tid'. Top-level nodes have no parent and do not appear as keys.
    """
    parents = {}
    # Explicit stack instead of recursion: every node at every depth is visited
    # once, so no generation of replies is skipped and deep threads cannot
    # exhaust the recursion limit.
    pending = list(struct)
    while pending:
        node = pending.pop()
        # Leaf nodes may lack 'children' (or carry an empty list)
        for child in node.get('children') or ():
            parents[child['tid']] = node['tid']
            pending.append(child)
    return parents
| |
def getSender(email):
    """ Returns the part of the 'from' header inside <...>, or the whole header if it has no such part """
    header = email['from']
    parsed = re.match(r"(.+)\s*<(.+)>", header, flags=re.UNICODE)
    return parsed.group(2) if parsed else header
| |
def _previousMonth(year, month):
    """ Returns the (year, month) pair of the calendar month preceding the given one """
    if month <= 1:
        return year - 1, 12
    return year, month - 1


def scan(KibbleBit, source):
    """
    Scans a Pony Mail list archive, one month at a time from the current
    month backwards, and stores the results through KibbleBit.

    For each month that is fetched, this indexes:
      - one 'mailtop' document per top thread (at most 10, ranked by reply count)
      - one 'person' document per poster (appended with upsert)
      - one 'email' document per email (appended with upsert)
      - one 'mailstats' document with the month's author/email/topic counts

    Months whose 'mailstats' document already exists are skipped, except that
    the first two months processed are always fetched again.

    source must carry 'sourceURL', 'sourceID', 'organisation' and 'steps';
    an authorized cookie is read from source['creds']['cookie'].
    Progress and failures are reported via source['steps']['mail'].
    Returns None.
    """
    # Validate URL first
    url = re.match(r"(https?://.+)/list\.html\?(.+)@(.+)", source['sourceURL'])
    if not url:
        KibbleBit.pprint("Malformed or invalid Pony Mail URL passed to scanner: %s" % source['sourceURL'])
        source['steps']['mail'] = {
            'time': time.time(),
            'status': 'Could not parse Pony Mail URL!',
            'running': False,
            'good': False
        }
        KibbleBit.updateSource(source)
        return

    # Pony Mail requires a UI cookie in order to work. Make sure we have one!
    cookie = None
    if 'creds' in source and source['creds']:
        cookie = source['creds'].get('cookie', None)
    if not cookie:
        KibbleBit.pprint("Pony Mail instance at %s requires an authorized cookie, none found! Bailing." % source['sourceURL'])
        source['steps']['mail'] = {
            'time': time.time(),
            'status': 'No authorized cookie found in source object.',
            'running': False,
            'good': False
        }
        KibbleBit.updateSource(source)
        return

    # Notify scanner and DB that this is valid and we've begun parsing
    KibbleBit.pprint("%s is a valid Pony Mail address, parsing" % source['sourceURL'])
    source['steps']['mail'] = {
        'time': time.time(),
        'status': 'Downloading Pony Mail statistics',
        'running': True,
        'good': True
    }
    KibbleBit.updateSource(source)

    # Get base URL, list and domain to parse
    baseURL, listName, domain = url.groups()

    # Start at the current (UTC) month and walk backwards
    now = time.gmtime(time.time())
    year = now.tm_year
    month = now.tm_mon
    # Lower bound for the walk; replaced by the server-reported value once known
    firstYear = 1970
    # Number of months fetched so far
    months = 0

    # Hash for keeping records of who we know
    knowns = {}

    # While we have older archives, continue to parse
    while firstYear <= year:
        statsurl = "%s/api/stats.lua?list=%s&domain=%s&d=%s" % (baseURL, listName, domain, "%04u-%02u" % (year, month))
        dhash = hashlib.sha224(("%s %s" % (source['organisation'], statsurl)).encode('ascii', errors='replace')).hexdigest()
        alreadyIndexed = KibbleBit.exists('mailstats', dhash)

        # Always parse the most recent months; older ones only if not indexed yet
        if months > 1 and alreadyIndexed:
            year, month = _previousMonth(year, month)
            continue

        months += 1
        KibbleBit.pprint("Parsing %04u-%02u" % (year, month))
        KibbleBit.pprint(statsurl)
        pd = datetime.date(year, month, 1).timetuple()
        try:
            js = plugins.utils.jsonapi.get(statsurl, cookie = cookie)
        except Exception:
            # Best effort: a failing month must not stop the scan of older months
            KibbleBit.pprint("Server error, skipping this month")
            year, month = _previousMonth(year, month)
            continue

        if 'firstYear' not in js:
            KibbleBit.pprint("JSON was missing fields, aborting!")
            break
        firstYear = js['firstYear']

        replyList = repliedTo(js['emails'], js['thread_struct'])
        topics = js['no_threads']

        # Index the month's emails by id once, instead of rescanning the list
        # for every thread and every reply.
        # NOTE(review): assumes email ids are unique within a month; on
        # duplicates the first occurrence is used - confirm.
        emailsByID = {}
        for eml in js['emails']:
            emailsByID.setdefault(eml['id'], eml)

        # Collect every thread that has replies and a non-empty subject
        top10 = []
        for thread in js['thread_struct']:
            count = countSubs(thread, 0)
            root = emailsByID.get(thread['tid'])
            subject = root['subject'] if root is not None else ""
            if subject and count > 0:
                subject = re.sub(r"^((re|fwd|aw|fw):\s*)+", "", subject, flags=re.IGNORECASE)
                subject = re.sub(r"[\r\n\t]+", "", subject, count=20)
                emlid = hashlib.sha1(subject.encode('ascii', errors='replace')).hexdigest()
                top10.append([emlid, subject, count])

        # Month timestamp, shared by the top-thread and statistics documents
        md = time.strftime("%Y/%m/%d %H:%M:%S", pd)

        # Ascending sort then reverse, so ties keep the same order as before
        ranked = sorted(top10, key=lambda x: x[2])
        for top in ranked[::-1][:10]:
            KibbleBit.pprint("Found top 10: %s (%s emails)" % (top[1], top[2]))
            # one unique id per month per mail thread
            mlhash = hashlib.sha224(("%s%s%s%s" % (top[0], source['sourceURL'], source['organisation'], md)).encode('ascii', errors='replace')).hexdigest()
            jst = {
                'organisation': source['organisation'],
                'sourceURL': source['sourceURL'],
                'sourceID': source['sourceID'],
                'date': md,
                'emails': top[2],
                'shash': top[0],
                'subject': top[1],
                # NOTE(review): time.mktime interprets pd as local time, while
                # the other timestamps here are UTC-based - confirm intent.
                'ts': time.mktime(pd),
                'id': mlhash
            }
            KibbleBit.index('mailtop', mlhash, jst)

        # Distinct sender addresses seen this month
        posters = set()
        for email in js['emails']:
            sender = email['from']
            name = sender
            m = re.match(r"(.+)\s*<(.+)>", email['from'], flags=re.UNICODE)
            if m:
                name = m.group(1).replace('"', "").strip()
                sender = m.group(2)
            posters.add(sender)

            sid = hashlib.sha1(("%s%s" % (source['organisation'], sender)).encode('ascii', errors='replace')).hexdigest()
            if sender not in knowns and KibbleBit.exists('person', sid):
                knowns[sender] = True
            # Store unknown people, and refresh known ones when we have a real name
            if sender not in knowns or name != sender:
                KibbleBit.append('person',
                    {
                        'upsert': True,
                        'name': name,
                        'email': sender,
                        'organisation': source['organisation'],
                        'id': sid
                    })
                knowns[sender] = True

            # Resolve who this email replied to, if its parent is in this month
            replyTo = None
            if email['id'] in replyList:
                parent = emailsByID.get(replyList[email['id']])
                if parent is not None:
                    replyTo = getSender(parent)
                    KibbleBit.pprint("Email was reply to %s" % replyTo)
            jse = {
                'organisation': source['organisation'],
                'sourceURL': source['sourceURL'],
                'sourceID': source['sourceID'],
                'date': time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email['epoch'])),
                'sender': sender,
                'address': sender,
                'subject': email['subject'],
                'replyto': replyTo,
                'ts': email['epoch'],
                'id': email['id'],
                'upsert': True
            }
            KibbleBit.append('email', jse)

        jso = {
            'organisation': source['organisation'],
            'sourceURL': source['sourceURL'],
            'sourceID': source['sourceID'],
            'date': md,
            'authors': len(posters),
            'emails': len(js['emails']),
            'topics': topics
        }
        KibbleBit.index('mailstats', dhash, jso)

        year, month = _previousMonth(year, month)

    source['steps']['mail'] = {
        'time': time.time(),
        'status': 'Mail archives successfully scanned at ' + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(time.time())),
        'running': False,
        'good': True
    }
    KibbleBit.updateSource(source)