tools/edit-list.py - incubator-ponymail - Git at Google

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """ Modify lists and messages

 This utility can be used to:
 - rename a list
 - make a list private
 - make a list public
 - update the description for a list
 - delete mails from a list (does not delete mbox_source entries)
 - obfuscate some fields (from, subject, body) in an mbox entry (does not obfuscate the raw source document)

 """

 import sys
 import time
 import argparse
 import json

 from elastic import Elastic

 class options:
     def __init__(self):
         parser = argparse.ArgumentParser(description='Command line options.')
         # Cannot have both source and mid as input
         source_group = parser.add_mutually_exclusive_group()
         source_group.add_argument('--source', dest='source', type=str,
                            help='Source list to edit')
         source_group.add_argument('--mid', dest='mid', type=str,
                            help='Source Message-ID to edit')
         parser.add_argument('--rename', dest='target', type=str,
                            help='(optional) new list ID')
         parser.add_argument('--desc', dest='desc', type=str,
                            help='(optional) new list description')
         parser.add_argument('--obfuscate', dest='obfuscate', type=str,
                            help='Things to obfuscate in body, if any')
         # private and public are mutually exclusive
         privacy_group = parser.add_mutually_exclusive_group()
         privacy_group.add_argument('--private', dest='private', action='store_true',
                            help='Make all emails in list private')
         privacy_group.add_argument('--public', dest='public', action='store_true',
                            help='Make all emails in list public')
         parser.add_argument('--delete', dest='delete', action='store_true',
                            help='Delete emails from this list')
         parser.add_argument('--wildcard', dest='glob', action='store_true',
                            help='Allow wildcards in --source')
         parser.add_argument('--debug', dest='debug', action='store_true',
                            help='Debug output - very noisy!')
         parser.add_argument('--notag', dest='notag', action='store_true',
                            help='List IDs do not have <> in them')
         parser.add_argument('--test', dest='test', action='store_true',
                            help='Only test for occurrences, do not run the chosen action (dry run)')

         args = parser.parse_args()

         self.sourceLID = args.source
         self.targetLID = args.target
         self.desc = args.desc
         self.makePrivate = args.private
         self.makePublic = args.public
         self.deleteEmails = args.delete
         self.wildcard = args.glob
         self.debug = args.debug
         self.notag = args.notag
         self.mid = args.mid
         self.obfuscate = args.obfuscate
         self.dryrun = args.test

         self.privacyChange = self.makePrivate or self.makePublic
         self.otherChange = self.targetLID or self.desc or self.obfuscate
         self.anyChange = self.privacyChange or self.otherChange

         if not self.sourceLID and not self.mid:
             print("No source list ID specified!")
             parser.print_help()
             sys.exit(-1)
         if not (self.anyChange or self.deleteEmails):
             print("Nothing to do! No target list ID or action specified")
             parser.print_help()
             sys.exit(-1)
         if self.desc and not self.sourceLID:
             print("No source list ID specified for description!")
             parser.print_help()
             sys.exit(-1)
         if self.anyChange and self.deleteEmails:
             print("Cannot both change and delete emails in the same run")
             parser.print_help()
             sys.exit(-1)

         # TODO does it make sense to allow --rename with --mid?
         # i.e. rename the list for a single mid?

         if self.sourceLID:
             self.sourceLID = ("%s" if self.notag else "<%s>")  % self.sourceLID.replace("@", ".").strip("<>")
         if self.targetLID:
             self.targetLID = "<%s>" % self.targetLID.replace("@", ".").strip("<>")


 def process_hits(page, args, dbname):
     """ Processes each hit in a scroll search and proposes changes
         in the array returned """
     changes = []
     if 'hits' in page and 'hits' in page['hits']:
         for hit in page['hits']['hits']:
             doc = hit['_id']
             body = {}
             if args.obfuscate:
                 body['body'] = hit['_source']['body'].replace(args.obfuscate, "...")
                 body['subject'] = hit['_source']['subject'].replace(args.obfuscate, "...")
                 body['from'] = hit['_source']['from'].replace(args.obfuscate, "...")
             if args.targetLID:
                 body['list_raw'] = args.targetLID
                 body['list'] = args.targetLID
             if args.makePrivate:
                 body['private'] = True
             if args.makePublic:
                 body['private'] = False
             changes.append({
                 '_op_type': 'delete' if args.deleteEmails else 'update',
                 '_index': dbname,
                 '_type': 'mbox',
                 '_id': doc,
                 'doc': body
                 })
     return changes

 def main():
     es = Elastic()
     dbname = es.getdbname()
     # get config and set up default databas
     es = Elastic()
     # default database name
     dbname = es.getdbname()

     args = options()

     print("Beginning list edit:")
     if args.sourceLID:
         print("  - List ID: %s" % args.sourceLID)
     else:
         print("  - MID: %s" % args.mid)
     if args.targetLID:
         print("  - Target ID: %s" % args.targetLID)
     if args.makePublic:
         print("  - Action: Mark all emails public")
     if args.makePrivate:
         print("  - Action: Mark all emails private")
     if args.deleteEmails:
         print("  - Action: Delete emails (sources will be kept!)")
     if args.obfuscate:
         print("  - Action: Obfuscate parts of email containing: %s" % args.obfuscate)

     if args.desc:
         print("  - Action: add description: %s" % args.desc)
         if args.dryrun:
             print("DRY RUN - NO CHANGES WILL BE MADE")
         else:
             LID = args.sourceLID
             if args.targetLID:
                 LID = args.targetLID
             es.index(
                 doc_type="mailinglists",
                 id=LID,
                 body = {
                     'list': LID,
                     'name': LID,
                     'description': desc
                 }
             )
             print("All done, updated description.")

     if args.targetLID or args.makePrivate or args.makePublic or args.deleteEmails or args.mid or args.obfuscate:
         if args.dryrun:
             print("DRY RUN - NO CHANGES WILL BE MADE")
         print("Updating docs...")
         then = time.time()
         terms = {
             'wildcard' if args.wildcard else 'term': {
                 'list_raw': args.sourceLID
             }
         }
         if args.mid:
             terms = {
                 'term': {
                     'mid': args.mid
                 }
             }
         query = {
             '_source': ['body', 'subject', 'from'] if args.obfuscate else False,
             'query': {
                 'bool': {
                     'must': [
                         terms
                     ]
                 }
             }
         }
         proposed_changes = []
         for page in es.scan_and_scroll(body = query):
             prop = process_hits(page, args, dbname)
             if prop:
                 proposed_changes.extend(prop)

         tmp = []
         count = len(proposed_changes)
         processed = 0
         # Handle proposed changes in batches of 500
         while len(proposed_changes) > 0:
             tmp.append(proposed_changes.pop(0))
             if len(tmp) == 500 and not args.dryrun:
                 es.bulk(tmp)
                 tmp = []
                 processed += 500
                 print("Processed %u documents..." % processed)
         # Any stragglers remaining gets processed here
         if len(tmp) > 0 and not args.dryrun:
             es.bulk(tmp)
             processed += len(tmp)
             print("Processed %u documents..." % processed)

         print("All done, processed %u docs in %u seconds" % (count, time.time() - then))

 if __name__ == '__main__':
     main()
	#!/usr/bin/env python3
	# -- coding: utf-8 --
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	""" Modify lists and messages

	This utility can be used to:
	- rename a list
	- make a list private
	- make a list public
	- update the description for a list
	- delete mails from a list (does not delete mbox_source entries)
	- obfuscate some fields (from, subject, body) in an mbox entry (does not obfuscate the raw source document)

	"""

	import sys
	import time
	import argparse
	import json

	from elastic import Elastic

	class options:
	def __init__(self):
	parser = argparse.ArgumentParser(description='Command line options.')
	# Cannot have both source and mid as input
	source_group = parser.add_mutually_exclusive_group()
	source_group.add_argument('--source', dest='source', type=str,
	help='Source list to edit')
	source_group.add_argument('--mid', dest='mid', type=str,
	help='Source Message-ID to edit')
	parser.add_argument('--rename', dest='target', type=str,
	help='(optional) new list ID')
	parser.add_argument('--desc', dest='desc', type=str,
	help='(optional) new list description')
	parser.add_argument('--obfuscate', dest='obfuscate', type=str,
	help='Things to obfuscate in body, if any')
	# private and public are mutually exclusive
	privacy_group = parser.add_mutually_exclusive_group()
	privacy_group.add_argument('--private', dest='private', action='store_true',
	help='Make all emails in list private')
	privacy_group.add_argument('--public', dest='public', action='store_true',
	help='Make all emails in list public')
	parser.add_argument('--delete', dest='delete', action='store_true',
	help='Delete emails from this list')
	parser.add_argument('--wildcard', dest='glob', action='store_true',
	help='Allow wildcards in --source')
	parser.add_argument('--debug', dest='debug', action='store_true',
	help='Debug output - very noisy!')
	parser.add_argument('--notag', dest='notag', action='store_true',
	help='List IDs do not have <> in them')
	parser.add_argument('--test', dest='test', action='store_true',
	help='Only test for occurrences, do not run the chosen action (dry run)')

	args = parser.parse_args()

	self.sourceLID = args.source
	self.targetLID = args.target
	self.desc = args.desc
	self.makePrivate = args.private
	self.makePublic = args.public
	self.deleteEmails = args.delete
	self.wildcard = args.glob
	self.debug = args.debug
	self.notag = args.notag
	self.mid = args.mid
	self.obfuscate = args.obfuscate
	self.dryrun = args.test

	self.privacyChange = self.makePrivate or self.makePublic
	self.otherChange = self.targetLID or self.desc or self.obfuscate
	self.anyChange = self.privacyChange or self.otherChange

	if not self.sourceLID and not self.mid:
	print("No source list ID specified!")
	parser.print_help()
	sys.exit(-1)
	if not (self.anyChange or self.deleteEmails):
	print("Nothing to do! No target list ID or action specified")
	parser.print_help()
	sys.exit(-1)
	if self.desc and not self.sourceLID:
	print("No source list ID specified for description!")
	parser.print_help()
	sys.exit(-1)
	if self.anyChange and self.deleteEmails:
	print("Cannot both change and delete emails in the same run")
	parser.print_help()
	sys.exit(-1)

	# TODO does it make sense to allow --rename with --mid?
	# i.e. rename the list for a single mid?

	if self.sourceLID:
	self.sourceLID = ("%s" if self.notag else "<%s>") % self.sourceLID.replace("@", ".").strip("<>")
	if self.targetLID:
	self.targetLID = "<%s>" % self.targetLID.replace("@", ".").strip("<>")


	def process_hits(page, args, dbname):
	""" Processes each hit in a scroll search and proposes changes
	in the array returned """
	changes = []
	if 'hits' in page and 'hits' in page['hits']:
	for hit in page['hits']['hits']:
	doc = hit['_id']
	body = {}
	if args.obfuscate:
	body['body'] = hit['_source']['body'].replace(args.obfuscate, "...")
	body['subject'] = hit['_source']['subject'].replace(args.obfuscate, "...")
	body['from'] = hit['_source']['from'].replace(args.obfuscate, "...")
	if args.targetLID:
	body['list_raw'] = args.targetLID
	body['list'] = args.targetLID
	if args.makePrivate:
	body['private'] = True
	if args.makePublic:
	body['private'] = False
	changes.append({
	'_op_type': 'delete' if args.deleteEmails else 'update',
	'_index': dbname,
	'_type': 'mbox',
	'_id': doc,
	'doc': body
	})
	return changes

	def main():
	es = Elastic()
	dbname = es.getdbname()
	# get config and set up default databas
	es = Elastic()
	# default database name
	dbname = es.getdbname()

	args = options()

	print("Beginning list edit:")
	if args.sourceLID:
	print(" - List ID: %s" % args.sourceLID)
	else:
	print(" - MID: %s" % args.mid)
	if args.targetLID:
	print(" - Target ID: %s" % args.targetLID)
	if args.makePublic:
	print(" - Action: Mark all emails public")
	if args.makePrivate:
	print(" - Action: Mark all emails private")
	if args.deleteEmails:
	print(" - Action: Delete emails (sources will be kept!)")
	if args.obfuscate:
	print(" - Action: Obfuscate parts of email containing: %s" % args.obfuscate)

	if args.desc:
	print(" - Action: add description: %s" % args.desc)
	if args.dryrun:
	print("DRY RUN - NO CHANGES WILL BE MADE")
	else:
	LID = args.sourceLID
	if args.targetLID:
	LID = args.targetLID
	es.index(
	doc_type="mailinglists",
	id=LID,
	body = {
	'list': LID,
	'name': LID,
	'description': desc
	}
	)
	print("All done, updated description.")

	if args.targetLID or args.makePrivate or args.makePublic or args.deleteEmails or args.mid or args.obfuscate:
	if args.dryrun:
	print("DRY RUN - NO CHANGES WILL BE MADE")
	print("Updating docs...")
	then = time.time()
	terms = {
	'wildcard' if args.wildcard else 'term': {
	'list_raw': args.sourceLID
	}
	}
	if args.mid:
	terms = {
	'term': {
	'mid': args.mid
	}
	}
	query = {
	'_source': ['body', 'subject', 'from'] if args.obfuscate else False,
	'query': {
	'bool': {
	'must': [
	terms
	]
	}
	}
	}
	proposed_changes = []
	for page in es.scan_and_scroll(body = query):
	prop = process_hits(page, args, dbname)
	if prop:
	proposed_changes.extend(prop)

	tmp = []
	count = len(proposed_changes)
	processed = 0
	# Handle proposed changes in batches of 500
	while len(proposed_changes) > 0:
	tmp.append(proposed_changes.pop(0))
	if len(tmp) == 500 and not args.dryrun:
	es.bulk(tmp)
	tmp = []
	processed += 500
	print("Processed %u documents..." % processed)
	# Any stragglers remaining gets processed here
	if len(tmp) > 0 and not args.dryrun:
	es.bulk(tmp)
	processed += len(tmp)
	print("Processed %u documents..." % processed)

	print("All done, processed %u docs in %u seconds" % (count, time.time() - then))

	if __name__ == '__main__':
	main()