# src/main/python/apache/aurora/admin/maintenance.py - aurora - Git at Google

 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 from twitter.common import app, log

 from apache.aurora.client.base import GROUPING_OPTION, get_grouping_or_die, requires
 from apache.aurora.common.clusters import CLUSTERS

 from .admin_util import (
     DEFAULT_SLA_DURATION_OPTION,
     DEFAULT_SLA_PERCENTAGE_OPTION,
     FILENAME_OPTION,
     FORCE_DRAIN_TIMEOUT_OPTION,
     HOSTS_OPTION,
     OVERRIDE_SLA_DURATION_OPTION,
     OVERRIDE_SLA_PERCENTAGE_OPTION,
     OVERRIDE_SLA_REASON_OPTION,
     POST_DRAIN_SCRIPT_OPTION,
     UNSAFE_SLA_HOSTS_FILE_OPTION,
     parse_and_validate_sla_drain_default,
     parse_and_validate_sla_overrides,
     parse_hostnames,
     parse_script
 )
 from .host_maintenance import HostMaintenance


 # TODO(maxim): merge with admin.py commands.
 @app.command
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
 @requires.exactly('cluster')
 def host_deactivate(cluster):
   """usage: host_deactivate {--filename=filename | --hosts=hosts}
                             cluster

   Puts hosts into maintenance mode.

   The list of hosts is marked for maintenance, and will be de-prioritized
   from consideration for scheduling.  Note, they are not removed from
   consideration, and may still schedule tasks if resources are very scarce.
   Usually you would mark a larger set of machines for drain, and then do
   them in batches within the larger set, to help drained tasks not land on
   future hosts that will be drained shortly in subsequent batches.
   """
   opts = app.get_options()

   # The maintenance client is built before the host list is resolved, so a
   # failure in either step surfaces in the same order as a single chained call.
   maintenance = HostMaintenance(
       cluster=CLUSTERS[cluster],
       verbosity=opts.verbosity,
       bypass_leader_redirect=opts.bypass_leader_redirect)

   # Hosts come from either --filename or --hosts; parse_hostnames receives both.
   hostnames = parse_hostnames(opts.filename, opts.hosts)
   maintenance.start_maintenance(hostnames)


 @app.command
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
 @requires.exactly('cluster')
 def host_activate(cluster):
   """usage: host_activate {--filename=filename | --hosts=hosts}
                           cluster

   Removes maintenance mode from hosts.

   The list of hosts is marked as not in a drained state anymore. This will
   allow normal scheduling to resume on the given list of hosts.
   """
   opts = app.get_options()

   # Construct the client first, then resolve the hosts, then end maintenance.
   maintenance = HostMaintenance(
       cluster=CLUSTERS[cluster],
       verbosity=opts.verbosity,
       bypass_leader_redirect=opts.bypass_leader_redirect)

   hostnames = parse_hostnames(opts.filename, opts.hosts)
   maintenance.end_maintenance(hostnames)


 @app.command
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
 @app.command_option(POST_DRAIN_SCRIPT_OPTION)
 @app.command_option(GROUPING_OPTION)
 @app.command_option(OVERRIDE_SLA_PERCENTAGE_OPTION)
 @app.command_option(OVERRIDE_SLA_DURATION_OPTION)
 @app.command_option(OVERRIDE_SLA_REASON_OPTION)
 @app.command_option(UNSAFE_SLA_HOSTS_FILE_OPTION)
 @requires.exactly('cluster')
 def host_drain(cluster):
   """usage: host_drain {--filename=filename | --hosts=hosts}
                        [--post_drain_script=path]
                        [--grouping=function]
                        [--override_percentage=percentage]
                        [--override_duration=duration]
                        [--override_reason=reason]
                        [--unsafe_hosts_file=unsafe_hosts_filename]
                        cluster

   Asks the scheduler to start maintenance on the list of provided hosts (see host_deactivate
   for more details) and drains any active tasks on them.

   The list of hosts is drained and marked in a drained state.  This will kill
   off any tasks currently running on these hosts, as well as prevent future
   tasks from scheduling on these hosts while they are drained.

   The hosts are left in maintenance mode upon completion. Use host_activate to
   return hosts back to service and allow scheduling tasks on them.
   """
   opts = app.get_options()
   hostnames = parse_hostnames(opts.filename, opts.hosts)

   # Invoked purely as a check: the return value is discarded and the raw
   # option value is what gets forwarded to perform_maintenance below.
   get_grouping_or_die(opts.grouping)

   sla_percentage, sla_duration = parse_and_validate_sla_overrides(opts, hostnames)
   on_drained = parse_script(opts.post_drain_script)

   maintenance = HostMaintenance(
       cluster=CLUSTERS[cluster],
       verbosity=opts.verbosity,
       bypass_leader_redirect=opts.bypass_leader_redirect)

   maintenance.perform_maintenance(
       hostnames,
       grouping_function=opts.grouping,
       percentage=sla_percentage,
       duration=sla_duration,
       output_file=opts.unsafe_hosts_filename,
       callback=on_drained)


 @app.command
 @app.command_option(FORCE_DRAIN_TIMEOUT_OPTION)
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
 @app.command_option(DEFAULT_SLA_PERCENTAGE_OPTION)
 @app.command_option(DEFAULT_SLA_DURATION_OPTION)
 @requires.exactly('cluster')
 def sla_host_drain(cluster):
   """usage: sla_host_drain {--filename=filename | --hosts=hosts}
                            [--default_percentage=percentage]
                            [--default_duration=duration]
                            [--force_drain_timeout=timeout]
                            cluster

   Asks the scheduler to drain the list of provided hosts in an SLA-aware manner.

   The list of hosts is drained and marked in a drained state.  This will kill
   off any tasks currently running on these hosts, as well as prevent future
   tasks from scheduling on these hosts while they are drained.

   The hosts are left in maintenance mode upon completion. Use host_activate to
   return hosts back to service and allow scheduling tasks on them.

   If tasks are unable to be drained after the specified timeout interval they will
   be forcefully drained even if it breaks SLA.
   """
   opts = app.get_options()
   hostnames = parse_hostnames(opts.filename, opts.hosts)

   # The helper hands back a 3-tuple: (percentage, duration, timeout).
   sla_percentage, sla_duration, drain_timeout = parse_and_validate_sla_drain_default(opts)

   maintenance = HostMaintenance(
       cluster=CLUSTERS[cluster],
       verbosity=opts.verbosity,
       bypass_leader_redirect=opts.bypass_leader_redirect)

   maintenance.perform_sla_maintenance(
       hostnames,
       percentage=sla_percentage,
       duration=sla_duration,
       timeout=drain_timeout)


 @app.command
 @app.command_option(FILENAME_OPTION)
 @app.command_option(HOSTS_OPTION)
 @requires.exactly('cluster')
 def host_status(cluster):
   """usage: host_status {--filename=filename | --hosts=hosts}
                         cluster

   Print the drain status of each supplied host.
   """
   opts = app.get_options()
   hostnames = parse_hostnames(opts.filename, opts.hosts)

   maintenance = HostMaintenance(
       cluster=CLUSTERS[cluster],
       verbosity=opts.verbosity,
       bypass_leader_redirect=opts.bypass_leader_redirect)

   # Each result is interpolated into a two-placeholder message, so it is
   # expected to be a 2-tuple (presumably host, state -- confirm in check_status).
   for host_and_state in maintenance.check_status(hostnames):
     log.info("%s is in state: %s" % host_and_state)
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	from twitter.common import app, log

	from apache.aurora.client.base import GROUPING_OPTION, get_grouping_or_die, requires
	from apache.aurora.common.clusters import CLUSTERS

	from .admin_util import (
	DEFAULT_SLA_DURATION_OPTION,
	DEFAULT_SLA_PERCENTAGE_OPTION,
	FILENAME_OPTION,
	FORCE_DRAIN_TIMEOUT_OPTION,
	HOSTS_OPTION,
	OVERRIDE_SLA_DURATION_OPTION,
	OVERRIDE_SLA_PERCENTAGE_OPTION,
	OVERRIDE_SLA_REASON_OPTION,
	POST_DRAIN_SCRIPT_OPTION,
	UNSAFE_SLA_HOSTS_FILE_OPTION,
	parse_and_validate_sla_drain_default,
	parse_and_validate_sla_overrides,
	parse_hostnames,
	parse_script
	)
	from .host_maintenance import HostMaintenance


	# TODO(maxim): merge with admin.py commands.
	@app.command
	@app.command_option(FILENAME_OPTION)
	@app.command_option(HOSTS_OPTION)
	@requires.exactly('cluster')
	def host_deactivate(cluster):
	  """usage: host_deactivate {--filename=filename | --hosts=hosts}
	                            cluster

	  Puts hosts into maintenance mode.

	  The list of hosts is marked for maintenance, and will be de-prioritized
	  from consideration for scheduling. Note, they are not removed from
	  consideration, and may still schedule tasks if resources are very scarce.
	  Usually you would mark a larger set of machines for drain, and then do
	  them in batches within the larger set, to help drained tasks not land on
	  future hosts that will be drained shortly in subsequent batches.
	  """
	  # `cluster` is the positional argument enforced by @requires.exactly and is
	  # used as the key into CLUSTERS.
	  options = app.get_options()
	  HostMaintenance(
	      cluster=CLUSTERS[cluster],
	      verbosity=options.verbosity,
	      bypass_leader_redirect=options.bypass_leader_redirect).start_maintenance(
	          # Hosts are taken from --filename or --hosts.
	          parse_hostnames(options.filename, options.hosts))


	@app.command
	@app.command_option(FILENAME_OPTION)
	@app.command_option(HOSTS_OPTION)
	@requires.exactly('cluster')
	def host_activate(cluster):
	  """usage: host_activate {--filename=filename | --hosts=hosts}
	                          cluster

	  Removes maintenance mode from hosts.

	  The list of hosts is marked as not in a drained state anymore. This will
	  allow normal scheduling to resume on the given list of hosts.
	  """
	  # `cluster` is the positional argument enforced by @requires.exactly and is
	  # used as the key into CLUSTERS.
	  options = app.get_options()
	  HostMaintenance(
	      cluster=CLUSTERS[cluster],
	      verbosity=options.verbosity,
	      bypass_leader_redirect=options.bypass_leader_redirect).end_maintenance(
	          # Hosts are taken from --filename or --hosts.
	          parse_hostnames(options.filename, options.hosts))


	@app.command
	@app.command_option(FILENAME_OPTION)
	@app.command_option(HOSTS_OPTION)
	@app.command_option(POST_DRAIN_SCRIPT_OPTION)
	@app.command_option(GROUPING_OPTION)
	@app.command_option(OVERRIDE_SLA_PERCENTAGE_OPTION)
	@app.command_option(OVERRIDE_SLA_DURATION_OPTION)
	@app.command_option(OVERRIDE_SLA_REASON_OPTION)
	@app.command_option(UNSAFE_SLA_HOSTS_FILE_OPTION)
	@requires.exactly('cluster')
	def host_drain(cluster):
	  """usage: host_drain {--filename=filename | --hosts=hosts}
	                       [--post_drain_script=path]
	                       [--grouping=function]
	                       [--override_percentage=percentage]
	                       [--override_duration=duration]
	                       [--override_reason=reason]
	                       [--unsafe_hosts_file=unsafe_hosts_filename]
	                       cluster

	  Asks the scheduler to start maintenance on the list of provided hosts (see host_deactivate
	  for more details) and drains any active tasks on them.

	  The list of hosts is drained and marked in a drained state. This will kill
	  off any tasks currently running on these hosts, as well as prevent future
	  tasks from scheduling on these hosts while they are drained.

	  The hosts are left in maintenance mode upon completion. Use host_activate to
	  return hosts back to service and allow scheduling tasks on them.
	  """
	  options = app.get_options()
	  drainable_hosts = parse_hostnames(options.filename, options.hosts)

	  # Called for its checking side effect only; the result is not used and the
	  # raw option value is passed on as grouping_function.
	  get_grouping_or_die(options.grouping)

	  # The SLA override helper receives the full options object plus the hosts
	  # being drained and returns a (percentage, duration) pair.
	  override_percentage, override_duration = parse_and_validate_sla_overrides(
	      options,
	      drainable_hosts)

	  post_drain_callback = parse_script(options.post_drain_script)

	  HostMaintenance(
	      cluster=CLUSTERS[cluster],
	      verbosity=options.verbosity,
	      bypass_leader_redirect=options.bypass_leader_redirect).perform_maintenance(
	          drainable_hosts,
	          grouping_function=options.grouping,
	          percentage=override_percentage,
	          duration=override_duration,
	          output_file=options.unsafe_hosts_filename,
	          callback=post_drain_callback)


	@app.command
	@app.command_option(FORCE_DRAIN_TIMEOUT_OPTION)
	@app.command_option(FILENAME_OPTION)
	@app.command_option(HOSTS_OPTION)
	@app.command_option(DEFAULT_SLA_PERCENTAGE_OPTION)
	@app.command_option(DEFAULT_SLA_DURATION_OPTION)
	@requires.exactly('cluster')
	def sla_host_drain(cluster):
	  """usage: sla_host_drain {--filename=filename | --hosts=hosts}
	                           [--default_percentage=percentage]
	                           [--default_duration=duration]
	                           [--force_drain_timeout=timeout]
	                           cluster

	  Asks the scheduler to drain the list of provided hosts in an SLA-aware manner.

	  The list of hosts is drained and marked in a drained state. This will kill
	  off any tasks currently running on these hosts, as well as prevent future
	  tasks from scheduling on these hosts while they are drained.

	  The hosts are left in maintenance mode upon completion. Use host_activate to
	  return hosts back to service and allow scheduling tasks on them.

	  If tasks are unable to be drained after the specified timeout interval they will
	  be forcefully drained even if it breaks SLA.
	  """
	  options = app.get_options()
	  drainable_hosts = parse_hostnames(options.filename, options.hosts)

	  # The defaults helper returns a (percentage, duration, timeout) triple
	  # derived from the parsed options.
	  percentage, duration, timeout = parse_and_validate_sla_drain_default(options)

	  HostMaintenance(
	      cluster=CLUSTERS[cluster],
	      verbosity=options.verbosity,
	      bypass_leader_redirect=options.bypass_leader_redirect).perform_sla_maintenance(
	          drainable_hosts,
	          percentage=percentage,
	          duration=duration,
	          timeout=timeout)


	@app.command
	@app.command_option(FILENAME_OPTION)
	@app.command_option(HOSTS_OPTION)
	@requires.exactly('cluster')
	def host_status(cluster):
	  """usage: host_status {--filename=filename | --hosts=hosts}
	                        cluster

	  Print the drain status of each supplied host.
	  """
	  options = app.get_options()
	  checkable_hosts = parse_hostnames(options.filename, options.hosts)
	  statuses = HostMaintenance(
	      cluster=CLUSTERS[cluster],
	      verbosity=options.verbosity,
	      bypass_leader_redirect=options.bypass_leader_redirect).check_status(checkable_hosts)

	  # Each entry fills both %s placeholders, so it is expected to be a 2-tuple
	  # (presumably host and state -- confirm against check_status).
	  for pair in statuses:
	    log.info("%s is in state: %s" % pair)