| #!/usr/bin/env python3 |
| # -*- coding: utf-8 -*- |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """ Modify lists and messages |
| |
| This utility can be used to: |
| - rename a list |
| - make a list private |
| - make a list public |
| - update the description for a list |
| - delete mails from a list (does not delete mbox_source entries) |
| - obfuscate some fields (from, subject, body) in an mbox entry (does not obfuscate the raw source document) |
| |
| """ |
| |
| import sys |
| import time |
| import argparse |
| |
| from elastic import Elastic |
| |
| class options: |
| def __init__(self): |
| parser = argparse.ArgumentParser(description='Command line options.') |
| # Cannot have both source and mid as input |
| source_group = parser.add_mutually_exclusive_group() |
| source_group.add_argument('--source', dest='source', type=str, |
| help='Source list to edit') |
| source_group.add_argument('--mid', dest='mid', type=str, |
| help='Source Message-ID to edit') |
| parser.add_argument('--rename', dest='target', type=str, |
| help='(optional) new list ID') |
| parser.add_argument('--desc', dest='desc', type=str, |
| help='(optional) new list description') |
| parser.add_argument('--obfuscate', dest='obfuscate', type=str, |
| help='Things to obfuscate in body, if any') |
| # private and public are mutually exclusive |
| privacy_group = parser.add_mutually_exclusive_group() |
| privacy_group.add_argument('--private', dest='private', action='store_true', |
| help='Make all emails in list private') |
| privacy_group.add_argument('--public', dest='public', action='store_true', |
| help='Make all emails in list public') |
| parser.add_argument('--delete', dest='delete', action='store_true', |
| help='Delete emails from this list') |
| parser.add_argument('--wildcard', dest='glob', action='store_true', |
| help='Allow wildcards in --source') |
| parser.add_argument('--debug', dest='debug', action='store_true', |
| help='Debug output - very noisy!') |
| parser.add_argument('--notag', dest='notag', action='store_true', |
| help='List IDs do not have <> in them') |
| parser.add_argument('--test', dest='test', action='store_true', |
| help='Only test for occurrences, do not run the chosen action (dry run)') |
| |
| args = parser.parse_args() |
| |
| self.sourceLID = args.source |
| self.targetLID = args.target |
| self.desc = args.desc |
| self.makePrivate = args.private |
| self.makePublic = args.public |
| self.deleteEmails = args.delete |
| self.wildcard = args.glob |
| self.debug = args.debug |
| self.notag = args.notag |
| self.mid = args.mid |
| self.obfuscate = args.obfuscate |
| self.dryrun = args.test |
| |
| self.privacyChange = self.makePrivate or self.makePublic |
| self.otherChange = self.targetLID or self.desc or self.obfuscate |
| self.anyChange = self.privacyChange or self.otherChange |
| |
| if not self.sourceLID and not self.mid: |
| print("No source list ID specified!") |
| parser.print_help() |
| sys.exit(-1) |
| if not (self.anyChange or self.deleteEmails): |
| print("Nothing to do! No target list ID or action specified") |
| parser.print_help() |
| sys.exit(-1) |
| if self.desc and not self.sourceLID: |
| print("No source list ID specified for description!") |
| parser.print_help() |
| sys.exit(-1) |
| if self.anyChange and self.deleteEmails: |
| print("Cannot both change and delete emails in the same run") |
| parser.print_help() |
| sys.exit(-1) |
| |
| # TODO does it make sense to allow --rename with --mid? |
| # i.e. rename the list for a single mid? |
| |
| if self.sourceLID: |
| self.sourceLID = ("%s" if self.notag else "<%s>") % self.sourceLID.replace("@", ".").strip("<>") |
| if self.targetLID: |
| self.targetLID = "<%s>" % self.targetLID.replace("@", ".").strip("<>") |
| |
| |
| def process_hits(page, args, dbname): |
| """ Processes each hit in a scroll search and proposes changes |
| in the array returned """ |
| changes = [] |
| if 'hits' in page and 'hits' in page['hits']: |
| for hit in page['hits']['hits']: |
| doc = hit['_id'] |
| body = {} |
| if args.obfuscate: |
| body['body'] = hit['_source']['body'].replace(args.obfuscate, "...") |
| body['subject'] = hit['_source']['subject'].replace(args.obfuscate, "...") |
| body['from'] = hit['_source']['from'].replace(args.obfuscate, "...") |
| if args.targetLID: |
| body['list_raw'] = args.targetLID |
| body['list'] = args.targetLID |
| if args.makePrivate: |
| body['private'] = True |
| if args.makePublic: |
| body['private'] = False |
| if not args.dryrun: |
| changes.append({ |
| '_op_type': 'delete' if args.deleteEmails else 'update', |
| '_index': dbname, |
| '_type': 'mbox', |
| '_id': doc, |
| 'doc': body |
| }) |
| else: |
| changes.append({}) # Empty action for counting if dryrun, so we never accidentally run it. |
| return changes |
| |
| def main(): |
| es = Elastic() |
| dbname = es.getdbname() |
| # get config and set up default databas |
| es = Elastic() |
| # default database name |
| dbname = es.getdbname() |
| |
| args = options() |
| |
| print("Beginning list edit:") |
| if args.sourceLID: |
| print(" - List ID: %s" % args.sourceLID) |
| else: |
| print(" - MID: %s" % args.mid) |
| if args.targetLID: |
| print(" - Target ID: %s" % args.targetLID) |
| if args.makePublic: |
| print(" - Action: Mark all emails public") |
| if args.makePrivate: |
| print(" - Action: Mark all emails private") |
| if args.deleteEmails: |
| print(" - Action: Delete emails (sources will be kept!)") |
| if args.obfuscate: |
| print(" - Action: Obfuscate parts of email containing: %s" % args.obfuscate) |
| |
| if args.desc: |
| print(" - Action: add description: %s" % args.desc) |
| if args.dryrun: |
| print("DRY RUN - NO CHANGES WILL BE MADE") |
| else: |
| LID = args.sourceLID |
| if args.targetLID: |
| LID = args.targetLID |
| es.index( |
| doc_type="mailinglists", |
| id=LID, |
| body = { |
| 'list': LID, |
| 'name': LID, |
| 'description':args.desc |
| } |
| ) |
| print("All done, updated description.") |
| |
| if args.targetLID or args.makePrivate or args.makePublic or args.deleteEmails or args.mid or args.obfuscate: |
| if args.dryrun: |
| print("DRY RUN - NO CHANGES WILL BE MADE") |
| print("Updating docs...") |
| then = time.time() |
| terms = { |
| 'wildcard' if args.wildcard else 'term': { |
| 'list_raw': args.sourceLID |
| } |
| } |
| if args.mid: |
| terms = { |
| 'term': { |
| 'mid': args.mid |
| } |
| } |
| query = { |
| '_source': ['body', 'subject', 'from'] if args.obfuscate else False, |
| 'query': { |
| 'bool': { |
| 'must': [ |
| terms |
| ] |
| } |
| } |
| } |
| proposed_changes = [] |
| for page in es.scan_and_scroll(body = query): |
| prop = process_hits(page, args, dbname) |
| if prop: |
| proposed_changes.extend(prop) |
| |
| tmp = [] |
| count = len(proposed_changes) |
| processed = 0 |
| # Handle proposed changes in batches of 500 |
| while len(proposed_changes) > 0: |
| tmp.append(proposed_changes.pop(0)) |
| if len(tmp) >= 500: |
| if not args.dryrun: |
| es.bulk(tmp) |
| processed += len(tmp) |
| tmp = [] |
| print("Processed %u documents..." % processed) |
| # Any stragglers remaining gets processed here |
| if len(tmp) > 0: |
| if not args.dryrun: |
| es.bulk(tmp) |
| processed += len(tmp) |
| print("Processed %u documents..." % processed) |
| |
| print("All done, processed %u docs in %u seconds" % (count, time.time() - then)) |
| |
| if __name__ == '__main__': |
| main() |