Add in spamfilter operations

commit: b9d3352ceae569b9cfc08124d3ccb797d12b2b64 [log] [tgz]
author: Daniel Gruno <humbedooh@apache.org> Thu Feb 25 09:41:57 2021 +0100
committer: GitHub <noreply@github.com> Thu Feb 25 09:41:57 2021 +0100
tree: 463211e513c2f62c5b41fb6b5c29d276ce78bef9
parent: d4166b93d16b85e94f37a3b4a28442df5b558d43 [diff]
diff --git a/aardvark.py b/aardvark.py
index 6cbb21d..97a180c 100644
--- a/aardvark.py
+++ b/aardvark.py

@@ -25,6 +25,7 @@
 import asfpy.syslog
 import typing
 import multidict
+import spamfilter
 
 # Shadow print with our syslog wrapper
 print = asfpy.syslog.Printer(stdout=True, identity="aardvark")
@@ -35,6 +36,7 @@
 DEFAULT_BACKEND = "http://localhost:8080"
 DEFAULT_IPHEADER = "x-forwarded-for"
 DEFAULT_DEBUG = False
+DEFAULT_SPAM_NAIVE_THRESHOLD = 60
 MINIMUM_SCAN_LENGTH = 16  # We don't normally scan form data elements with fewer than 16 chars
 
 
@@ -67,6 +69,7 @@
         self.multispam_required = set()  # Multi-Match required matches
         self.multispam_auxiliary = set()  # Auxiliary Multi-Match strings
         self.offenders = set()  # List of already known offenders (block right out!)
+        self.naive_threshold = DEFAULT_SPAM_NAIVE_THRESHOLD
 
         # If config file, load that into the vars
         if config_file:
@@ -75,6 +78,7 @@
             self.proxy_url = self.config.get("proxy_url", self.proxy_url)
             self.port = int(self.config.get("port", self.port))
             self.ipheader = self.config.get("ipheader", self.ipheader)
+            self.naive_threshold = self.config.get("naive_spam_threshold", self.naive_threshold)
             for pm in self.config.get("postmatches", []):
                 r = re.compile(bytes(pm, encoding="utf-8"), flags=re.IGNORECASE)
                 self.postmatches.add(r)
@@ -90,6 +94,8 @@
                 for req in multimatch.get("auxiliary", []):
                     r = re.compile(bytes(req, encoding="utf-8"), flags=re.IGNORECASE)
                     self.multispam_auxiliary.add(r)
+        print("Loading Bayesian spam filter...")
+        self.spamfilter = spamfilter.BayesScanner()
 
     def scan_simple(self, request_url: str, post_data: bytes = None):
         """Scans post data for spam"""
@@ -125,7 +131,9 @@
             if v and isinstance(v, str) and len(v) >= MINIMUM_SCAN_LENGTH:
                 b = bytes(v, encoding="utf-8")
                 bad_items.extend(self.scan_simple(f"formdata::{k}", b))
-
+                res = self.spamfilter.scan_text(v)
+                if res >= self.naive_threshold:
+                    bad_items.append(f"Form element {k} has spam score of {res}, crosses threshold of {self.naive_threshold}!")
         return bad_items
 
     async def proxy(self, request: aiohttp.web.Request):
commit	b9d3352ceae569b9cfc08124d3ccb797d12b2b64	[log] [tgz]
author	Daniel Gruno <humbedooh@apache.org>	Thu Feb 25 09:41:57 2021 +0100
committer	GitHub <noreply@github.com>	Thu Feb 25 09:41:57 2021 +0100
tree	463211e513c2f62c5b41fb6b5c29d276ce78bef9
parent	d4166b93d16b85e94f37a3b4a28442df5b558d43 [diff]