| #!/usr/bin/env python3 |
| # -*- coding: utf-8 -*- |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """ |
| This is an experimental key phrase extraction plugin that uses Azure or |
| picoAPI to analyze the key elements of an email on a list. It requires an |
| account with a text analysis service provider and a corresponding API |
| section in config.yaml, such as: |
| |
| # picoAPI example: |
| picoapi: |
| key: abcdef1234567890 |
| |
| # Azure example: |
| azure: |
| apikey: abcdef1234567890 |
| location: westeurope |
| |
| Currently only Pony Mail is supported; more to come. |
| """ |
| |
| import time |
| import datetime |
| import re |
| import json |
| import hashlib |
| import requests |
| import json |
| import uuid |
| |
def trimBody(body):
    """
    Reduce an email body to its original prose.

    Strips reply attribution lines ("On <date>, <name> wrote:"), mobile
    signatures ("Sent from my ..."), quoted text, hyperlinks and email
    addresses, and returns what is left.
    """
    # Reply attributions, "sent from my ..." footers and quoted runs.
    reply_noise = r"(((?:\r?\n|^)((on .+ wrote:[\r\n]+)|(sent from my .+)|(>+[ \t]*[^\r\n]*\r?\n[^\n]*\n*)+)+)+)"
    stripped = re.sub(reply_noise, "", body, flags=re.I | re.M)

    # Drop any remaining line that is still a quote.
    kept = []
    for line in stripped.split("\n"):
        if line.startswith(">"):
            continue
        kept.append(line)
    stripped = "\n".join(kept)

    # In order: hyperlinks, "<name> address" pairs, then bare addresses.
    for pattern in (r"[a-z]+://\S+", r"(<[^>]+>\s*\S+@\S+)", r"(\S+@\S+)"):
        stripped = re.sub(pattern, "", stripped)
    return stripped
| |
def azureKPE(KibbleBit, bodies):
    """
    Extract key phrases from email bodies using the Azure Text Analytics API.

    Every body is cleaned with trimBody() and the whole batch is posted in a
    single request to the keyPhrases endpoint of the region named in
    config['azure']['location'].

    :param KibbleBit: scanner object exposing a `config` mapping and a
                      `pprint(message)` method.
    :param bodies: iterable of email body strings.
    :return: None when no 'azure' section is configured; False when the
             service reply carries a 'statusCode' field (treated as possible
             rate limiting); otherwise a list with one entry per body, in
             input order: up to five key phrases, or the `{}` placeholder
             for bodies the service returned nothing usable for.
    """
    if 'azure' not in KibbleBit.config:
        return None

    headers = {
        'Content-Type': 'application/json',
        'Ocp-Apim-Subscription-Key': KibbleBit.config['azure']['apikey']
    }

    # Document ids are the positions of the bodies, so that results can be
    # mapped back onto the input by index.
    documents = [
        {
            "language": "en",
            "id": str(idx),
            "text": trimBody(body)
        }
        for idx, body in enumerate(bodies)
    ]
    if not documents:
        # Nothing to analyze: skip the API call entirely, so that an error
        # reply for an empty batch is not mistaken for rate limiting.
        return []

    KPEs = [{} for _ in documents]  # placeholder for each doc, to be replaced

    try:
        rv = requests.post(
            "https://%s.api.cognitive.microsoft.com/text/analytics/v2.0/keyPhrases" % KibbleBit.config['azure']['location'],
            headers=headers,
            data=json.dumps({"documents": documents})
        )
        jsout = rv.json()
    except Exception as err:
        # Best effort: a failed request or an unparsable reply must not
        # abort the scan, but Ctrl-C / SystemExit are no longer swallowed.
        KibbleBit.pprint("Key phrase extraction request failed: %s" % err)
        jsout = {}
    if not isinstance(jsout, dict):
        jsout = {}  # unexpected JSON shape; handled as a failed analysis

    results = jsout.get('documents')
    if results:
        for doc in results:
            try:
                # Replace KPEs[X] with the actual phrases, 5 first ones.
                KPEs[int(doc['id'])] = doc['keyPhrases'][:5]
            except (KeyError, ValueError, IndexError, TypeError):
                # Malformed entry: keep the placeholder for that document
                # rather than losing the whole batch.
                KibbleBit.pprint("Skipping malformed key phrase result.")
    else:
        KibbleBit.pprint("Failed to analyze email body.")
        print(jsout)
        # Depending on price tier, Azure will return a 429 if you go too fast.
        # If we see a statusCode return, let's just stop for now.
        # Later scans can pick up the slack.
        if 'statusCode' in jsout:
            KibbleBit.pprint("Possible rate limiting in place, stopping for now.")
            return False
    return KPEs
| |
def picoKPE(KibbleBit, bodies):
    """
    Extract key phrases from email bodies using the picoAPI text service.

    Every body is cleaned with trimBody() and the whole batch is posted in a
    single request to the picoAPI keyphrase endpoint.

    :param KibbleBit: scanner object exposing a `config` mapping and a
                      `pprint(message)` method.
    :param bodies: iterable of email body strings.
    :return: None when no 'picoapi' section is configured; False when the
             service reply carries a 'code' field (invalid key or rate
             limit); otherwise a list with one entry per body, in input
             order: up to five key phrases scoring above the minimum
             weight, or the `{}` placeholder for bodies the service
             returned nothing usable for.
    """
    if 'picoapi' not in KibbleBit.config:
        return None

    # Unlike Azure, picoAPI attaches a weighting score to every phrase, so
    # only phrases above this level are kept, at most MAX_PHRASES per text.
    MINIMUM_WEIGHT = 0.02
    MAX_PHRASES = 5

    headers = {
        'Content-Type': 'application/json',
        'PicoAPI-Key': KibbleBit.config['picoapi']['key']
    }

    # Text ids are the positions of the bodies, so that results can be
    # mapped back onto the input by index.
    texts = [
        {
            "id": str(idx),
            "body": trimBody(body)
        }
        for idx, body in enumerate(bodies)
    ]
    if not texts:
        # Nothing to analyze: skip the API call entirely, so that an error
        # reply for an empty batch is not mistaken for rate limiting.
        return []

    KPEs = [{} for _ in texts]  # placeholder for each doc, to be replaced

    try:
        rv = requests.post(
            "https://v1.picoapi.com/api/text/keyphrase",
            headers=headers,
            data=json.dumps({"texts": texts})
        )
        jsout = rv.json()
    except Exception as err:
        # Best effort: a failed request or an unparsable reply must not
        # abort the scan, but Ctrl-C / SystemExit are no longer swallowed.
        KibbleBit.pprint("Key phrase extraction request failed: %s" % err)
        jsout = {}
    if not isinstance(jsout, dict):
        jsout = {}  # unexpected JSON shape; handled as a failed analysis

    results = jsout.get('results')
    if results:
        for doc in results:
            try:
                # Grab up to 5 key phrases per text, in the order returned.
                phrases = [
                    element['phrase']
                    for element in doc['keyphrases']
                    if element['score'] > MINIMUM_WEIGHT
                ][:MAX_PHRASES]
                KPEs[int(doc['id'])] = phrases  # Replace KPEs[X] with the actual phrases
            except (KeyError, ValueError, IndexError, TypeError):
                # Malformed entry: keep the placeholder for that text rather
                # than losing the whole batch.
                KibbleBit.pprint("Skipping malformed key phrase result.")
    else:
        KibbleBit.pprint("Failed to analyze email body.")
        print(jsout)
        # 403 returned on invalid key, 429 on rate exceeded.
        # If we see a code return, let's just stop for now.
        # Later scans can pick up the slack.
        if 'code' in jsout:
            KibbleBit.pprint("Possible rate limiting in place, stopping for now.")
            return False
    return KPEs
| |