# blob: 55dbc2a6ed70609f1062008e67c6cdc7df766998 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import datetime
import hashlib
import re
import time
from kibble.scanners.utils import jsonapi
"""
This is a Kibble scanner plugin for Apache Pony Mail sources.
"""
# Plugin metadata: display title and version string of this scanner.
# NOTE(review): how the plugin loader consumes these is not visible in this file.
title = "Scanner plugin for Apache Pony Mail"
version = "0.1.0"
def accepts(source):
    """
    Decide whether this plugin can handle the given source.

    Returns True when the source type is "ponymail", or when it is "mail"
    and its sourceURL has the Pony Mail list.html?list@domain shape.
    Returns False for everything else.
    """
    source_type = source["type"]
    # An explicit plugin-name match needs no further inspection
    if source_type == "ponymail":
        return True
    # Generic mail sources qualify only if the URL looks like a Pony Mail list
    if source_type == "mail":
        list_url = re.match(
            r"(https?://.+)/list\.html\?(.+)@(.+)", source["sourceURL"]
        )
        return list_url is not None
    return False
def count_subs(struct, kids=0):
    """
    Count every reply nested below a thread node.

    `struct` is a mapping that may carry a "children" list of nodes with
    the same layout. `kids` is a starting count that is added to the
    total. Returns the starting count plus all direct and indirect
    children.
    """
    replies = struct["children"] if "children" in struct else ()
    # Each reply contributes itself plus everything nested beneath it
    return kids + sum(1 + count_subs(reply) for reply in replies)
def replied_to(emails, struct):
    """
    Build a map from each reply's thread id to the id of its direct parent.

    `emails` is accepted for interface compatibility and is not used.
    `struct` is a list of thread nodes; each node is a dict with a "tid"
    and an optional "children" list of nodes with the same layout.

    Returns a dict {child tid: parent tid} covering every nesting level.
    Top-level nodes have no parent and therefore do not appear as keys.
    """
    parents = {}
    for node in struct:
        # Leaf nodes may omit "children" entirely or carry an empty list
        children = node.get("children") or []
        for child in children:
            parents[child["tid"]] = node["tid"]
        # Recurse over the children themselves (as parents), so that
        # grandchildren get linked to them as well.
        parents.update(replied_to(emails, children))
    return parents
def get_sender(email):
    """
    Extract the sender address from an email's "from" field.

    For a value shaped like 'Name <address>' the part between the angle
    brackets is returned; any other value is returned unchanged.
    """
    raw_from = email["from"]
    parsed = re.match(r"(.+)\s*<(.+)>", raw_from, flags=re.UNICODE)
    return parsed.group(2) if parsed else raw_from
def _previous_month(year, month):
    """ Return the (year, month) pair preceding the given calendar month """
    if month <= 1:
        return year - 1, 12
    return year, month - 1


def _set_mail_step(kibble_bit, source, status, running, good):
    """ Store the state of the mail scan step on the source and persist it """
    source["steps"]["mail"] = {
        "time": time.time(),
        "status": status,
        "running": running,
        "good": good,
    }
    kibble_bit.update_source(source)


def scan(kibble_bit, source):
    """
    Scan a Pony Mail list archive month by month, newest month first.

    For every month that is parsed this indexes:
      * "mailtop"   - up to ten threads with the most replies,
      * "person"    - one upserted document per sender,
      * "email"     - one upserted document per email,
      * "mailstats" - author/email/topic counts for the month.

    The first two months visited are always parsed; older months are
    skipped when their "mailstats" document already exists. Scanning stops
    once the year drops below the "firstYear" reported by the server, or
    when a response lacks that field.

    Parameters:
      kibble_bit: object providing pprint(), update_source(), exists(),
                  index() and append().
      source:     dict with "sourceURL", "organisation", "sourceID" and
                  "steps", plus "creds" holding a "cookie".

    Returns None. Progress and the final outcome are written to
    source["steps"]["mail"].
    """
    # Validate URL first
    url = re.match(r"(https?://.+)/list\.html\?(.+)@(.+)", source["sourceURL"])
    if not url:
        kibble_bit.pprint(
            "Malformed or invalid Pony Mail URL passed to scanner: %s"
            % source["sourceURL"]
        )
        _set_mail_step(
            kibble_bit,
            source,
            "Could not parse Pony Mail URL!",
            running=False,
            good=False,
        )
        return

    # Pony Mail requires a UI cookie in order to work. Make sure we have one!
    cookie = None
    if "creds" in source and source["creds"]:
        cookie = source["creds"].get("cookie", None)
    if not cookie:
        kibble_bit.pprint(
            "Pony Mail instance at %s requires an authorized cookie, none found! Bailing."
            % source["sourceURL"]
        )
        _set_mail_step(
            kibble_bit,
            source,
            "No authorized cookie found in source object.",
            running=False,
            good=False,
        )
        return

    # Notify scanner and DB that this is valid and we've begun parsing
    kibble_bit.pprint("%s is a valid Pony Mail address, parsing" % source["sourceURL"])
    _set_mail_step(
        kibble_bit,
        source,
        "Downloading Pony Mail statistics",
        running=True,
        good=True,
    )

    # Base URL, list name and domain to parse
    base_url, list_name, domain = url.groups()

    # Start at the current (UTC) month and walk backwards
    now = time.gmtime()
    year, month = now.tm_year, now.tm_mon
    first_year = 1970  # lower bound until the server reports its own
    months = 0  # number of months actually parsed so far

    # Senders already known to have a "person" document
    knowns = {}

    # While we have older archives, continue to parse
    while first_year <= year:
        statsurl = "%s/api/stats.lua?list=%s&domain=%s&d=%s" % (
            base_url,
            list_name,
            domain,
            "%04u-%02u" % (year, month),
        )
        dhash = hashlib.sha224(
            ("%s %s" % (source["organisation"], statsurl)).encode(
                "ascii", errors="replace"
            )
        ).hexdigest()
        found = kibble_bit.exists("mailstats", dhash)

        # The two most recent months are always re-parsed, as they may
        # still have changed since the previous scan.
        if months <= 1 or not found:
            months += 1
            kibble_bit.pprint("Parsing %04u-%02u" % (year, month))
            kibble_bit.pprint(statsurl)
            pd = datetime.date(year, month, 1).timetuple()
            try:
                js = jsonapi.get(statsurl, cookie=cookie)
            except Exception as err:
                # Best effort: a failing month must not stop the whole scan
                kibble_bit.pprint(f"Server error: {err}, skipping this month")
                year, month = _previous_month(year, month)
                continue
            if "firstYear" not in js:
                # NOTE(review): the step is still reported as successful
                # after this abort - confirm that this is intended.
                kibble_bit.pprint("JSON was missing fields, aborting!")
                break
            first_year = js["firstYear"]

            reply_list = replied_to(js["emails"], js["thread_struct"])
            topics = js["no_threads"]
            posters = {}
            emails = len(js["emails"])

            # Index the month's emails by id once, instead of rescanning
            # the whole list for every lookup (assumes ids are unique).
            emails_by_id = {item["id"]: item for item in js["emails"]}

            # Collect every thread that has a subject and at least one reply
            top10 = []
            for thread in js["thread_struct"]:
                count = count_subs(thread, 0)
                root = emails_by_id.get(thread["tid"])
                subject = root["subject"] if root is not None else ""
                if len(subject) > 0 and count > 0:
                    subject = re.sub(
                        r"^((re|fwd|aw|fw):\s*)+", "", subject, flags=re.IGNORECASE
                    )
                    subject = re.sub(r"[\r\n\t]+", "", subject, count=20)
                    emlid = hashlib.sha1(
                        subject.encode("ascii", errors="replace")
                    ).hexdigest()
                    top10.append([emlid, subject, count])

            # Ten threads with the most replies, largest first. The sorted
            # list is reversed (rather than sorted descending) so that ties
            # keep the order they had before.
            md = time.strftime("%Y/%m/%d %H:%M:%S", pd)
            ranked = sorted(top10, key=lambda entry: entry[2])
            for top in reversed(ranked[-10:]):
                kibble_bit.pprint("Found top 10: %s (%s emails)" % (top[1], top[2]))
                mlhash = hashlib.sha224(
                    (
                        "%s%s%s%s"
                        % (top[0], source["sourceURL"], source["organisation"], md)
                    ).encode("ascii", errors="replace")
                ).hexdigest()  # one unique id per month per mail thread
                jst = {
                    "organisation": source["organisation"],
                    "sourceURL": source["sourceURL"],
                    "sourceID": source["sourceID"],
                    "date": md,
                    "emails": top[2],
                    "shash": top[0],
                    "subject": top[1],
                    # NOTE(review): mktime() interprets pd as local time,
                    # while other timestamps here are UTC - confirm.
                    "ts": time.mktime(pd),
                    "id": mlhash,
                }
                kibble_bit.index("mailtop", mlhash, jst)

            for email in js["emails"]:
                # Split 'Name <address>' into its parts where possible
                sender = email["from"]
                name = sender
                m = re.match(r"(.+)\s*<(.+)>", email["from"], flags=re.UNICODE)
                if m:
                    name = m.group(1).replace('"', "").strip()
                    sender = m.group(2)
                if sender not in posters:
                    posters[sender] = {"name": name, "email": sender}

                sid = hashlib.sha1(
                    ("%s%s" % (source["organisation"], sender)).encode(
                        "ascii", errors="replace"
                    )
                ).hexdigest()
                if sender not in knowns and kibble_bit.exists("person", sid):
                    knowns[sender] = True
                # Upsert unknown senders, and any sender for whom a real
                # name (different from the address) is available.
                if sender not in knowns or name != sender:
                    kibble_bit.append(
                        "person",
                        {
                            "upsert": True,
                            "name": name,
                            "email": sender,
                            "organisation": source["organisation"],
                            "id": sid,
                        },
                    )
                    knowns[sender] = True

                # Resolve the address of the email this one replied to
                reply_to = None
                if email["id"] in reply_list:
                    parent = emails_by_id.get(reply_list[email["id"]])
                    if parent is not None:
                        reply_to = get_sender(parent)
                        kibble_bit.pprint("Email was reply to %s" % reply_to)

                jse = {
                    "organisation": source["organisation"],
                    "sourceURL": source["sourceURL"],
                    "sourceID": source["sourceID"],
                    "date": time.strftime(
                        "%Y/%m/%d %H:%M:%S", time.gmtime(email["epoch"])
                    ),
                    "sender": sender,
                    "address": sender,
                    "subject": email["subject"],
                    "replyto": reply_to,
                    "ts": email["epoch"],
                    "id": email["id"],
                    "upsert": True,
                }
                kibble_bit.append("email", jse)

            jso = {
                "organisation": source["organisation"],
                "sourceURL": source["sourceURL"],
                "sourceID": source["sourceID"],
                "date": time.strftime("%Y/%m/%d %H:%M:%S", pd),
                "authors": len(posters),
                "emails": emails,
                "topics": topics,
            }
            kibble_bit.index("mailstats", dhash, jso)

        year, month = _previous_month(year, month)

    _set_mail_step(
        kibble_bit,
        source,
        "Mail archives successfully scanned at "
        + time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime()),
        running=False,
        good=True,
    )