Move args to Archiver constructor
diff --git a/tools/archiver.py b/tools/archiver.py
index 64f7844..8cc2243 100755
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -84,8 +84,6 @@
if config.has_option('elasticsearch', 'user'):
auth = (config.get('elasticsearch','user'), config.get('elasticsearch','password'))
-archiver_generator = config.get("archiver", "generator", fallback="medium")
-
def encode_base64(buff):
""" Convert bytes to base64 as text string (no newlines) """
return base64.standard_b64encode(buff).decode('ascii', 'ignore')
@@ -179,10 +177,16 @@
**kwargs
)
- def __init__(self, generator=archiver_generator, parse_html=False, dump_dir=None):
- """ Just initialize ES. """
+ def __init__(self, generator=None, parse_html=False, ignore_body=None, dump_dir=None, verbose=False):
+
self.html = parse_html
- self.generator = generator
+ # Fall back to medium generator if nothing is set.
+ self.generator = generator or config.get("archiver", "generator", fallback="medium")
+ self.cropout = config.get("debug", "cropout", fallback=None)
+ self.verbose = verbose
+ self.ignore_body = ignore_body
+ self.dump_dir = dump_dir
+
if parse_html:
import html2text
self.html2text = html2text.html2text
@@ -192,11 +196,10 @@
self.consistency = config.get('elasticsearch', 'write', fallback='quorum')
if ES_MAJOR == 2:
pass
- elif ES_MAJOR in [5,6,7]:
+ elif ES_MAJOR in [5,6]:
self.wait_for_active_shards = config.get('elasticsearch', 'wait', fallback=1)
else:
raise Exception("Unexpected elasticsearch version ", elasticsearch.VERSION)
- self.cropout = config.get("debug", "cropout", fallback=None)
uri = config.get("elasticsearch", "uri", fallback="")
dbs = [
{
@@ -221,7 +224,8 @@
}
)
# If we have a dump dir, we can risk failing the connection.
- if dump_dir:
+ # NOTE: this does not contact the database, so is unlikely to fail
+ if self.dump_dir:
try:
self.es = elasticsearch.Elasticsearch(dbs,
max_retries=5,
@@ -245,12 +249,12 @@
contents[part_meta['hash']] = part_file
return attachments, contents
- def msgbody(self, msg, verbose=False, ignore_body=None):
+ def msgbody(self, msg):
body = None
firstHTML = None
for part in msg.walk():
# can be called from importer
- if verbose:
+ if self.verbose:
print("Content-Type: %s" % part.get_content_type())
"""
Find the first body part and the first HTML part
@@ -267,7 +271,7 @@
print(err)
# this requires a GPL lib, user will have to install it themselves
- if firstHTML and (not body or len(body) <= 1 or (ignore_body and str(body).find(str(ignore_body)) != -1)):
+ if firstHTML and (not body or len(body) <= 1 or (self.ignore_body and str(body).find(str(self.ignore_body)) != -1)):
body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
# See issue#463
@@ -284,10 +288,9 @@
return body
# N.B. this is also called by import-mbox.py
- def compute_updates(self, args, lid, private, msg):
+ def compute_updates(self, lid, private, msg):
"""Determine what needs to be sent to the archiver.
- :param args: Command line arguments for the archiver
:param lid: The list id
:param private: Whether privately archived email or not (bool)
:param msg: The message object
@@ -339,7 +342,7 @@
# mdate calculations are all done, prepare the index entry
epoch = email.utils.mktime_tz(mdate)
mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(epoch))
- body = self.msgbody(msg, verbose=args.verbose, ignore_body=args.ibody)
+ body = self.msgbody(msg)
try:
if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1:
# N.B. the convertToWrapped call always fails, because body is a string instead of bytes
@@ -406,7 +409,7 @@
def archive_message(self, args, mlist, msg, raw_message):
"""Send the message to the archiver.
- :param args: Command line args (verbose, ibody)
+ :param args: Command line args (dry, dump)
:param mlist: The IMailingList object.
:param msg: The message object.
:param raw_message: Raw message bytes
@@ -424,7 +427,7 @@
elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public:
private = True
- ojson, contents, msg_metadata, irt = self.compute_updates(args, lid, private, msg)
+ ojson, contents, msg_metadata, irt = self.compute_updates(lid, private, msg)
if not ojson:
_id = msg.get('message-id') or msg.get('Subject') or msg.get("Date")
raise Exception("Could not parse message %s for %s" % (_id,lid))
@@ -623,7 +626,7 @@
# Also eliminates: 'Undecodable raw error response from server:' warning message
logging.getLogger("elasticsearch").setLevel(logging.ERROR)
- archie = Archiver(generator=args.generator or archiver_generator, parse_html=args.html2text, dump_dir=args.dump)
+ archie = Archiver(generator=args.generator, parse_html=args.html2text, ignore_body=args.ibody, verbose=args.verbose, dump_dir=args.dump)
# use binary input so parser can use appropriate charset
input_stream = sys.stdin.buffer
diff --git a/tools/import-mbox.py b/tools/import-mbox.py
index 007257f..b2246d1 100755
--- a/tools/import-mbox.py
+++ b/tools/import-mbox.py
@@ -64,7 +64,6 @@
interactive = False
extension = ".mbox"
piperWeirdness = False
-parseHTML = False
resendTo = None
timeout = 600
fromFilter = None
@@ -122,11 +121,10 @@
mboxfile = ""
filename = ""
- if args.generator:
- archie = archiver.Archiver(generator=args.generator, parse_html=parseHTML)
- else:
- archie = archiver.Archiver(parse_html=parseHTML)
-
+ archie = archiver.Archiver(generator=args.generator, parse_html=args.html2text,
+ ignore_body=args.ibody[0] if args.ibody else None,
+ verbose=args.verbose)
+
while len(lists) > 0:
self.printid("%u elements left to slurp" % len(lists))
@@ -224,7 +222,7 @@
bad += 1
continue
- json, contents, _msgdata, _irt = archie.compute_updates(args, list_override, private, message)
+ json, contents, _msgdata, _irt = archie.compute_updates(list_override, private, message)
# Not sure this can ever happen
if json and not (json['list'] and json['list_raw']):
@@ -411,10 +409,6 @@
dedup = args.dedup
if args.ext:
extension = args.ext[0]
-if args.html2text:
- parseHTML = True
-if args.ibody:
- archiver.iBody = args.ibody[0]
if args.fromfilter:
fromFilter = args.fromfilter[0]
if args.nomboxo: