Move args to Archive constructor

commit: c414692692b4fcebf4317e7b6e7e8334b9aeaef2 [log] [tgz]
author: Sebb <sebb@apache.org> Sat Aug 22 23:34:04 2020 +0100
committer: Sebb <sebb@apache.org> Sat Aug 22 23:34:04 2020 +0100
tree: bcbc1b67c2035e547d39f7828a8e464806e2be55
parent: d7a96069b3d6e30517e54b25a31e44408f196ded [diff]
diff --git a/tools/archiver.py b/tools/archiver.py
index 64f7844..8cc2243 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py

@@ -84,8 +84,6 @@
 if config.has_option('elasticsearch', 'user'):
     auth = (config.get('elasticsearch','user'), config.get('elasticsearch','password'))
 
-archiver_generator = config.get("archiver", "generator", fallback="medium")
-
 def encode_base64(buff):
     """ Convert bytes to base64 as text string (no newlines) """
     return base64.standard_b64encode(buff).decode('ascii', 'ignore')
@@ -179,10 +177,16 @@
             **kwargs
         )
 
-    def __init__(self, generator=archiver_generator, parse_html=False, dump_dir=None):
-        """ Just initialize ES. """
+    def __init__(self, generator=None, parse_html=False, ignore_body=None, dump_dir=None, verbose=False):
+
         self.html = parse_html
-        self.generator = generator
+        # Fall back to medium generator if nothing is set.
+        self.generator = generator or config.get("archiver", "generator", fallback="medium")
+        self.cropout = config.get("debug", "cropout", fallback=None)
+        self.verbose = verbose
+        self.ignore_body = ignore_body
+        self.dump_dir = dump_dir
+
         if parse_html:
             import html2text
             self.html2text = html2text.html2text
@@ -192,11 +196,10 @@
         self.consistency = config.get('elasticsearch', 'write', fallback='quorum')
         if ES_MAJOR == 2:
             pass
-        elif ES_MAJOR in [5,6,7]:
+        elif ES_MAJOR in [5,6]:
             self.wait_for_active_shards = config.get('elasticsearch', 'wait', fallback=1)
         else:
             raise Exception("Unexpected elasticsearch version ", elasticsearch.VERSION)
-        self.cropout = config.get("debug", "cropout", fallback=None)
         uri = config.get("elasticsearch", "uri", fallback="")
         dbs = [
             {
@@ -221,7 +224,8 @@
             }
             )
         # If we have a dump dir, we can risk failing the connection.
-        if dump_dir:
+        # NOTE: this does not contact the database, so is unlikely to fail
+        if self.dump_dir:
             try:
                 self.es = elasticsearch.Elasticsearch(dbs,
                     max_retries=5,
@@ -245,12 +249,12 @@
                 contents[part_meta['hash']] = part_file
         return attachments, contents
 
-    def msgbody(self, msg, verbose=False, ignore_body=None):
+    def msgbody(self, msg):
         body = None
         firstHTML = None
         for part in msg.walk():
             # can be called from importer
-            if verbose:
+            if self.verbose:
                 print("Content-Type: %s" % part.get_content_type())
             """
                 Find the first body part and the first HTML part
@@ -267,7 +271,7 @@
                 print(err)
 
         # this requires a GPL lib, user will have to install it themselves
-        if firstHTML and (not body or len(body) <= 1 or (ignore_body and str(body).find(str(ignore_body)) != -1)):
+        if firstHTML and (not body or len(body) <= 1 or (self.ignore_body and str(body).find(str(self.ignore_body)) != -1)):
             body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
 
         # See issue#463
@@ -284,10 +288,9 @@
         return body
 
     # N.B. this is also called by import-mbox.py
-    def compute_updates(self, args, lid, private, msg):
+    def compute_updates(self, lid, private, msg):
         """Determine what needs to be sent to the archiver.
 
-        :param args: Command line arguments for the archiver
         :param lid: The list id
         :param private: Whether privately archived email or not (bool)
         :param msg: The message object
@@ -339,7 +342,7 @@
         # mdate calculations are all done, prepare the index entry
         epoch = email.utils.mktime_tz(mdate)
         mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
-        body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
+        body = self.msgbody(msg)
         try:
             if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
                 # N.B. the convertToWrapped call always fails, because body is a string instead of bytes
@@ -406,7 +409,7 @@
     def archive_message(self, args, mlist, msg, raw_message):
         """Send the message to the archiver.
 
-        :param args: Command line args (verbose, ibody)
+        :param args: Command line args (dry, dump)
         :param mlist: The IMailingList object.
         :param msg: The message object.
         :param raw_message: Raw message bytes
@@ -424,7 +427,7 @@
         elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
             private = True
 
-        ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, private, msg)
+        ojson, contents, msg_metadata, irt = self.compute_updates(lid, private, msg)
         if not ojson:
             _id = msg.get('message-id') or msg.get('Subject') or msg.get("Date")
             raise Exception("Could not parse message %s for %s" % (_id,lid))
@@ -623,7 +626,7 @@
         # Also eliminates: 'Undecodable raw error response from server:' warning message
         logging.getLogger("elasticsearch").setLevel(logging.ERROR)
 
-    archie = Archiver(generator=args.generator or archiver_generator, parse_html=args.html2text, dump_dir=args.dump)
+    archie = Archiver(generator=args.generator, parse_html=args.html2text, ignore_body=args.ibody, verbose=args.verbose, dump_dir=args.dump)
     # use binary input so parser can use appropriate charset
     input_stream = sys.stdin.buffer
 

diff --git a/tools/import-mbox.py b/tools/import-mbox.py
index 007257f..b2246d1 100755
--- a/tools/import-mbox.py
+++ b/tools/import-mbox.py

@@ -64,7 +64,6 @@
 interactive = False
 extension = ".mbox"
 piperWeirdness = False
-parseHTML = False
 resendTo = None
 timeout = 600
 fromFilter = None
@@ -122,11 +121,10 @@
         mboxfile = ""
         filename = ""
 
-        if args.generator:
-            archie = archiver.Archiver(generator=args.generator, parse_html=parseHTML)            
-        else:
-            archie = archiver.Archiver(parse_html=parseHTML)
-
+        archie = archiver.Archiver(generator=args.generator, parse_html=args.html2text,
+                                   ignore_body=args.ibody[0] if args.ibody else None,
+                                   verbose=args.verbose)            
+ 
         while len(lists) > 0:
             self.printid("%u elements left to slurp" % len(lists))
 
@@ -224,7 +222,7 @@
                     bad += 1
                     continue
 
-                json, contents, _msgdata, _irt = archie.compute_updates(args, list_override, private, message)
+                json, contents, _msgdata, _irt = archie.compute_updates(list_override, private, message)
 
                 # Not sure this can ever happen
                 if json and not (json['list'] and json['list_raw']):
@@ -411,10 +409,6 @@
     dedup = args.dedup
 if args.ext:
     extension = args.ext[0]
-if args.html2text:
-    parseHTML = True
-if args.ibody:
-    archiver.iBody = args.ibody[0]
 if args.fromfilter:
     fromFilter = args.fromfilter[0]
 if args.nomboxo:
commit	c414692692b4fcebf4317e7b6e7e8334b9aeaef2	[log] [tgz]
author	Sebb <sebb@apache.org>	Sat Aug 22 23:34:04 2020 +0100
committer	Sebb <sebb@apache.org>	Sat Aug 22 23:34:04 2020 +0100
tree	bcbc1b67c2035e547d39f7828a8e464806e2be55
parent	d7a96069b3d6e30517e54b25a31e44408f196ded [diff]