Better trimming of unnecessary text elements
We don't want to be analysing:
- quotes
- "On $date, so-and-so wrote:" attribution lines and "Sent from my ..." footers
- URLs, email addresses
diff --git a/src/plugins/utils/kpe.py b/src/plugins/utils/kpe.py
index 7985d3e..0390100 100644
--- a/src/plugins/utils/kpe.py
+++ b/src/plugins/utils/kpe.py
@@ -42,6 +42,22 @@
import json
import uuid
+def trimBody(body):
+ """ Strip reply attributions, "sent from my ..." lines, quoted text, URLs and email addresses from an email body and return what is left """
+ # Cut away "On $date, jane doe wrote:" attributions, "Sent from my ..." lines and runs of "> "-prefixed quotes (case-insensitive; each must follow a line break)
+ body = re.sub(r"((?:\r?\n)((on .+ wrote:[\r\n]+)|(sent from my .+)|(>+[ \t]+[^\r\n]*\r?\n[^\n]*\n*)+)+)+", "", body, flags = re.I | re.M)
+
+ # Drop any remaining lines that begin with ">" (quotes the regex above did not catch)
+ lines = body.split("\n")
+ body = "\n".join([x for x in lines if not x.startswith(">")])
+
+ # Remove URLs of the form scheme://... (lowercase schemes only, since no re.I flag is passed here)
+ body = re.sub(r"[a-z]+://\S+", "", body)
+
+ # Remove email addresses: first "<...> user@host" pairs, then any remaining non-whitespace token with an "@" inside
+ body = re.sub(r"(<[^>]+>\s*\S+@\S+)", "", body)
+ body = re.sub(r"(\S+@\S+)", "", body)
+ return body
def azureKPE(KibbleBit, bodies):
""" KPE using Azure Text Analysis API """
@@ -62,7 +78,7 @@
for body in bodies:
# Crop out quotes
lines = body.split("\n")
- body = "\n".join([x for x in lines if not x.startswith(">")])
+ body = trimBody(body)
doc = {
"language": "en",
"id": str(a),
@@ -113,9 +129,8 @@
a = 0
KPEs = []
for body in bodies:
- # Crop out quotes
- lines = body.split("\n")
- body = "\n".join([x for x in lines if not x.startswith(">")])
+ body = trimBody(body)
+
doc = {
"id": str(a),
"body": body