#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Modify lists and messages
This utility can be used to:
- rename a list
- make a list private
- make a list public
- update the description for a list
- delete emails from a list (does not delete mbox_source entries)
- obfuscate some fields (from, subject, body) in an mbox entry (does not obfuscate the raw source document)
"""
import sys
import time
import argparse
from elastic import Elastic
class options:
    """Command-line options for a list/message edit run.

    Parses the arguments, checks that the requested combination makes
    sense (printing a message plus the usage text and exiting with
    status -1 when it does not) and exposes the result as attributes:

    - sourceLID:    list ID to edit with '@' replaced by '.', wrapped in
                    <> unless --notag was given; None when --mid is used
    - mid:          Message-ID of a single email to edit, or None
    - targetLID:    new list ID from --rename, '@' replaced by '.' and
                    always wrapped in <>
    - desc:         new list description
    - obfuscate:    text to replace in body/subject/from
    - makePrivate, makePublic: requested privacy change
    - deleteEmails: delete the matching emails
    - wildcard:     --source may contain wildcards
    - notag, debug, dryrun: remaining flags
    - privacyChange, otherChange, anyChange: truthy when the respective
                    kind of modification was requested
    """

    def __init__(self, argv=None):
        """Parse and validate the options.

        argv: optional list of argument strings. When None (the default)
              argparse falls back to sys.argv[1:], which is what the
              command-line script relies on.

        Raises SystemExit on invalid or contradictory options.
        """
        parser = argparse.ArgumentParser(description='Command line options.')
        # Cannot have both source and mid as input
        source_group = parser.add_mutually_exclusive_group()
        source_group.add_argument('--source', dest='source', type=str,
                                  help='Source list to edit')
        source_group.add_argument('--mid', dest='mid', type=str,
                                  help='Source Message-ID to edit')
        parser.add_argument('--rename', dest='target', type=str,
                            help='(optional) new list ID')
        parser.add_argument('--desc', dest='desc', type=str,
                            help='(optional) new list description')
        parser.add_argument('--obfuscate', dest='obfuscate', type=str,
                            help='Things to obfuscate in body, if any')
        # private and public are mutually exclusive
        privacy_group = parser.add_mutually_exclusive_group()
        privacy_group.add_argument('--private', dest='private', action='store_true',
                                   help='Make all emails in list private')
        privacy_group.add_argument('--public', dest='public', action='store_true',
                                   help='Make all emails in list public')
        parser.add_argument('--delete', dest='delete', action='store_true',
                            help='Delete emails from this list')
        parser.add_argument('--wildcard', dest='glob', action='store_true',
                            help='Allow wildcards in --source')
        parser.add_argument('--debug', dest='debug', action='store_true',
                            help='Debug output - very noisy!')
        parser.add_argument('--notag', dest='notag', action='store_true',
                            help='List IDs do not have <> in them')
        parser.add_argument('--test', dest='test', action='store_true',
                            help='Only test for occurrences, do not run the chosen action (dry run)')

        args = parser.parse_args(argv)

        self.sourceLID = args.source
        self.targetLID = args.target
        self.desc = args.desc
        self.makePrivate = args.private
        self.makePublic = args.public
        self.deleteEmails = args.delete
        self.wildcard = args.glob
        self.debug = args.debug
        self.notag = args.notag
        self.mid = args.mid
        self.obfuscate = args.obfuscate
        self.dryrun = args.test

        self.privacyChange = self.makePrivate or self.makePublic
        self.otherChange = self.targetLID or self.desc or self.obfuscate
        self.anyChange = self.privacyChange or self.otherChange

        def fail(message):
            # Shared exit path: explain the problem, show usage, stop.
            print(message)
            parser.print_help()
            sys.exit(-1)

        if not self.sourceLID and not self.mid:
            fail("No source list ID specified!")
        if not (self.anyChange or self.deleteEmails):
            fail("Nothing to do! No target list ID or action specified")
        if self.desc and not self.sourceLID:
            fail("No source list ID specified for description!")
        if self.anyChange and self.deleteEmails:
            fail("Cannot both change and delete emails in the same run")

        # TODO does it make sense to allow --rename with --mid?
        # i.e. rename the list for a single mid?

        # Normalise list IDs: '@' becomes '.', surrounding <> are dropped
        # and then re-added (for the source only when --notag is not set).
        if self.sourceLID:
            template = "%s" if self.notag else "<%s>"
            self.sourceLID = template % self.sourceLID.replace("@", ".").strip("<>")
        if self.targetLID:
            self.targetLID = "<%s>" % self.targetLID.replace("@", ".").strip("<>")
def process_hits(page, args, dbname):
    """Turn one page of scroll-search results into proposed bulk actions.

    page:   a search response dict; hits are read from page['hits']['hits']
    args:   parsed options; reads obfuscate, targetLID, makePrivate,
            makePublic, deleteEmails and dryrun
    dbname: index name placed in each action's '_index'

    Returns a list with one entry per hit. Each entry is a bulk action
    dict ('delete' when args.deleteEmails is set, otherwise 'update'),
    or an empty dict in dry-run mode so hits can still be counted
    without ever producing a runnable action.
    """
    if 'hits' not in page or 'hits' not in page['hits']:
        return []

    op_type = 'delete' if args.deleteEmails else 'update'
    changes = []
    for hit in page['hits']['hits']:
        if args.dryrun:
            # Empty action for counting if dryrun, so we never accidentally run it.
            changes.append({})
            continue

        body = {}
        if args.obfuscate:
            # Only touch fields that are present and textual: a hit without
            # '_source', or with a missing/null field, must not abort the run.
            source = hit.get('_source') or {}
            for field in ('body', 'subject', 'from'):
                value = source.get(field)
                if isinstance(value, str):
                    body[field] = value.replace(args.obfuscate, "...")
        if args.targetLID:
            body['list_raw'] = args.targetLID
            body['list'] = args.targetLID
        if args.makePrivate:
            body['private'] = True
        if args.makePublic:
            body['private'] = False

        changes.append({
            '_op_type': op_type,
            '_index': dbname,
            '_type': 'mbox',
            '_id': hit['_id'],
            'doc': body
        })
    return changes
def main():
    """Run the list edit described by the command-line options.

    Steps:
    1. Set up the database connection and parse the options.
    2. Print a summary of the requested actions.
    3. If a description was given, index a 'mailinglists' document for
       the list (skipped in dry-run mode).
    4. If documents are to be changed or deleted, find them via a scroll
       search, collect the proposed actions and submit them in batches
       of 500 (nothing is submitted in dry-run mode).
    """
    # Get config and set up the default database and its name.
    es = Elastic()
    dbname = es.getdbname()

    args = options()

    print("Beginning list edit:")
    if args.sourceLID:
        print(" - List ID: %s" % args.sourceLID)
    else:
        print(" - MID: %s" % args.mid)
    if args.targetLID:
        print(" - Target ID: %s" % args.targetLID)
    if args.makePublic:
        print(" - Action: Mark all emails public")
    if args.makePrivate:
        print(" - Action: Mark all emails private")
    if args.deleteEmails:
        print(" - Action: Delete emails (sources will be kept!)")
    if args.obfuscate:
        print(" - Action: Obfuscate parts of email containing: %s" % args.obfuscate)
    if args.desc:
        print(" - Action: add description: %s" % args.desc)
        # The description update only makes sense when --desc was given,
        # so it lives inside this branch.
        if args.dryrun:
            print("DRY RUN - NO CHANGES WILL BE MADE")
        else:
            # Describe the list under its new ID when it is also being renamed.
            LID = args.targetLID if args.targetLID else args.sourceLID
            es.index(
                doc_type="mailinglists",
                id=LID,
                body={
                    'list': LID,
                    'name': LID,
                    'description': args.desc
                }
            )
            print("All done, updated description.")

    if args.targetLID or args.makePrivate or args.makePublic or args.deleteEmails or args.mid or args.obfuscate:
        if args.dryrun:
            print("DRY RUN - NO CHANGES WILL BE MADE")
        print("Updating docs...")
        then = time.time()

        # Select either the single message (by mid) or the whole list.
        if args.mid:
            terms = {
                'term': {
                    'mid': args.mid
                }
            }
        else:
            terms = {
                'wildcard' if args.wildcard else 'term': {
                    'list_raw': args.sourceLID
                }
            }

        query = {
            # The stored fields are only needed when they are to be rewritten.
            '_source': ['body', 'subject', 'from'] if args.obfuscate else False,
            'query': {
                'bool': {
                    'must': [
                        terms
                    ]
                }
            }
        }

        # Collect every proposed action first, then apply them.
        proposed_changes = []
        for page in es.scan_and_scroll(body=query):
            proposed_changes.extend(process_hits(page, args, dbname))

        count = len(proposed_changes)
        processed = 0
        batch_size = 500
        # Handle proposed changes in batches of 500; the last batch holds
        # whatever stragglers remain. Slicing avoids repeated pop(0) calls.
        for start in range(0, count, batch_size):
            batch = proposed_changes[start:start + batch_size]
            if not args.dryrun:
                es.bulk(batch)
            processed += len(batch)
            print("Processed %u documents..." % processed)

        print("All done, processed %u docs in %u seconds" % (count, time.time() - then))
# Run the edit only when executed as a script, not when imported.
if __name__ == "__main__":
    main()