# blob: 964876b41456df7cde0bc6d94a48e7e0494e6fd7 [file] [log] [blame]
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import getpass
import logging
import optparse
import os
import subprocess
from uuid import uuid1
from twitter.common.quantity import Amount, Time
from twitter.common.quantity.parse_simple import parse_time
from apache.aurora.client.api import AuroraClientAPI
from apache.aurora.client.base import AURORA_ADMIN_USER_AGENT_NAME, die
from apache.aurora.common.cluster import Cluster
from apache.aurora.common.clusters import CLUSTERS
"""Admin client utility functions shared between admin and maintenance modules."""
# NOTE: the string above comes after the imports, so Python treats it as a plain expression
# statement rather than as the module's __doc__.

# TODO(maxim): Switch to CLI ConfigurationPlugin within AURORA-486.

# Name of the logger used by log_admin_message(); the same value is attached to every
# record it emits as the 'logger_name' extra field.
LOGGER_NAME = 'aurora_admin'
logger = logging.getLogger(LOGGER_NAME)

# Generated once when the module is imported; log_admin_message() attaches it to every
# record as the 'clientid' extra field.
CLIENT_ID = uuid1()

# Default SLA limits
# Used by parse_and_validate_sla_overrides() when no override percentage is supplied and
# interpolated into the --override_percentage help text.
SLA_UPTIME_PERCENTAGE_LIMIT = 95
# Used by parse_and_validate_sla_overrides() when no override duration is supplied and
# interpolated into the --override_duration help text.
SLA_UPTIME_DURATION_LIMIT = Amount(30, Time.MINUTES)
def log_admin_message(sev, msg, *args, **kwargs):
    """Logs message using the module-defined logger.

    Every record is tagged, through the logging ``extra`` mechanism, with the client id,
    the current user name and the logger name. These three keys overwrite same-named keys
    of a caller-supplied ``extra`` mapping; any other caller-supplied keys are kept.

    :param sev: message severity
    :type sev: The numeric level of the logging event (one of DEBUG, INFO etc.)
    :param msg: message to log
    :type msg: string
    :param args: positional arguments forwarded unchanged to ``logger.log``
    :param kwargs: keyword arguments forwarded to ``logger.log``; an optional ``extra``
                   mapping is copied and extended, never modified in place
    """
    # Work on a copy so the caller's dict is not mutated as a side effect of logging;
    # `or {}` also tolerates an explicit extra=None.
    extra = dict(kwargs.get('extra') or {})
    extra.update(
        clientid=CLIENT_ID,
        user=getpass.getuser(),
        logger_name=LOGGER_NAME)
    kwargs['extra'] = extra
    logger.log(sev, msg, *args, **kwargs)
# Host selection: hosts are given either through a file or as an inline list.
FILENAME_OPTION = optparse.make_option(
    '--filename',
    help='Name of the file with hostnames',
    default=None,
    dest='filename')

HOSTS_OPTION = optparse.make_option(
    '--hosts',
    help='Comma separated list of hosts',
    default=None,
    dest='hosts')

# Optional script path; see parse_script() for how it is turned into a per-host callback.
POST_DRAIN_SCRIPT_OPTION = optparse.make_option(
    '--post_drain_script',
    help='Path to a script to run for each host if needed.',
    default=None,
    dest='post_drain_script')
# SLA override options. parse_and_validate_sla_overrides() requires that either all three
# are supplied or none of them.
OVERRIDE_SLA_PERCENTAGE_OPTION = optparse.make_option(
    '--override_percentage',
    help=(
        'Percentage of tasks required to be up all the time within the duration. '
        'Default value:%s. DO NOT override default value unless absolutely necessary! '
        'See sla_probe_hosts and sla_list_safe_domain commands '
        'for more details on SLA.' % SLA_UPTIME_PERCENTAGE_LIMIT),
    default=None,
    dest='percentage')

OVERRIDE_SLA_DURATION_OPTION = optparse.make_option(
    '--override_duration',
    help=(
        'Time interval (now - value) for the percentage of up tasks. Format: XdYhZmWs. '
        'Default value:%s. DO NOT override default value unless absolutely necessary! '
        'See sla_probe_hosts and sla_list_safe_domain commands '
        'for more details on SLA.' % SLA_UPTIME_DURATION_LIMIT),
    default=None,
    dest='duration')

OVERRIDE_SLA_REASON_OPTION = optparse.make_option(
    '--override_reason',
    help=(
        'Reason for overriding default SLA values. Provide details including the '
        'maintenance ticket number.'),
    default=None,
    dest='reason')
# Fallback SLA values and the forced-drain timeout; all three are consumed by
# parse_and_validate_sla_drain_default().
DEFAULT_SLA_PERCENTAGE_OPTION = optparse.make_option(
    '--default_percentage',
    help=(
        'Percentage of tasks required to be up all the time within the duration. '
        'This percentage will be used for SLA calculation if tasks do not '
        'have a configured SlaPolicy.'),
    default=95,
    dest='default_percentage')

DEFAULT_SLA_DURATION_OPTION = optparse.make_option(
    '--default_duration',
    help=(
        'Time interval (now - value) for the percentage of up tasks. Format: XdYhZmWs. '
        'This duration will be used for SLA calculation if tasks do not '
        'have a configured SlaPolicy.'),
    default='30m',
    dest='default_duration')

FORCE_DRAIN_TIMEOUT_OPTION = optparse.make_option(
    '--force_drain_timeout',
    help=(
        'Time interval (now - value) after which tasks will be forcefully drained. '
        'Format: XdYhZmWs.'),
    default='7d',
    dest='timeout')

# Destination file for the names of hosts that failed the SLA check.
UNSAFE_SLA_HOSTS_FILE_OPTION = optparse.make_option(
    '--unsafe_hosts_file',
    help='Output file to write host names that did not pass SLA check.',
    default=None,
    dest='unsafe_hosts_filename')
def parse_sla_percentage(percentage):
    """Parses percentage value for an SLA check.

    Calls die() when the value cannot be converted to a float or when the converted value
    is outside of (0, 100]. NaN and infinities are rejected as well.

    :param percentage: string percentage to parse
    :type percentage: string
    :rtype: float
    """
    try:
        val = float(percentage)
    except (TypeError, ValueError):
        # Report unparsable input through die() like any other invalid percentage
        # instead of leaking a raw conversion error.
        val = None
    # Expressed as a positive range test on purpose: NaN fails every comparison, so the
    # negated form `val <= 0 or val > 100` would let it through.
    if val is None or not 0 < val <= 100:
        die('Invalid percentage %s. Must be within (0, 100].' % (percentage,))
    return val
def _parse_hostname_list(hostname_list):
    """Splits a comma-separated string into a list of stripped, non-empty host names.

    Calls die() when no host name is left after blank entries are dropped.
    """
    # str.split() never returns an empty list, so blank entries (e.g. from "a,,b" or a
    # trailing comma) must be filtered out for the emptiness check below to mean anything.
    stripped = (hostname.strip() for hostname in hostname_list.split(','))
    hostnames = [hostname for hostname in stripped if hostname]
    if not hostnames:
        die('No valid hosts found.')
    return hostnames


def _parse_hostname_file(filename):
    """Reads host names from a file, one per line, skipping blank lines.

    Calls die() when the file contains no host names.
    """
    with open(filename, 'r') as hosts:
        stripped = (line.strip() for line in hosts)
        # Blank lines would otherwise be returned as empty host names.
        hostnames = [hostname for hostname in stripped if hostname]
    if not hostnames:
        die('No valid hosts found in %s.' % filename)
    return hostnames


def parse_hostnames_optional(list_option, file_option):
    """Parses host names from a comma-separated list or a filename.

    Does not require either of the arguments (returns None list if no option provided).
    Calls die() when both arguments are provided or when the provided one yields no hosts.

    :param list_option: command option with comma-separated list of host names
    :type list_option: app.option
    :param file_option: command option with filename (one host per line)
    :type file_option: app.option
    :rtype: list of host names or None.
    """
    if list_option and file_option:
        die('Cannot specify both filename and list for the same option.')
    if file_option:
        return _parse_hostname_file(file_option)
    if list_option:
        return _parse_hostname_list(list_option)
    return None


def parse_hostnames(filename, hostnames):
    """Parses host names from a comma-separated list or a filename.

    Fails if neither filename nor hostnames provided, if both are provided, or if the
    provided source yields no host names.

    :param filename: filename with host names (one per line)
    :type filename: string
    :param hostnames: comma-separated list of host names
    :type hostnames: string
    :rtype: list of host names
    """
    # Exactly one of the two sources must be given.
    if bool(filename) == bool(hostnames):
        die('Please specify either --filename or --hosts')
    if filename:
        parsed = _parse_hostname_file(filename)
    else:
        parsed = _parse_hostname_list(hostnames)
    if not parsed:
        die('No valid hosts found.')
    return parsed
def parse_script(filename):
    """Wraps the script stored in the given file into a per-host subprocess callback.

    The returned callback takes a host name, launches the script (resolved to an absolute
    path when this function is called) with the host as its only argument and waits for it
    to finish. The script's exit status is not inspected.

    :param filename: name of the script file
    :type filename: string
    :rtype: function, or None when no filename is given
    """
    if not filename:
        return None

    if not os.path.exists(filename):
        die("No such file: %s" % filename)
    script_path = os.path.abspath(filename)

    def run_for_host(host):
        process = subprocess.Popen([script_path, host])
        process.wait()

    return run_for_host
def parse_and_validate_sla_overrides(options, hostnames):
    """Parses and validates host SLA override 3-tuple (percentage, duration, reason).

    Either all of percentage, duration and reason must be set on options or none of them;
    a partial set is rejected via die(). When an override is in effect, an admin message
    about overriding default SLA values is logged at WARNING level.

    :param options: command line options
    :type options: list of app.option
    :param hostnames: host names override is issued to
    :type hostnames: list of string
    :rtype: a tuple of: override percentage (float) and override duration (Amount); the
            module defaults are returned for values that were not overridden
    """
    provided = (bool(options.percentage), bool(options.duration), bool(options.reason))
    # Some but not all of the override options were given.
    if any(provided) and not all(provided):
        die('All --override_* options are required when attempting to override '
            'default SLA values.')

    percentage = parse_sla_percentage(options.percentage) if options.percentage else None
    duration = parse_time(options.duration) if options.duration else None
    if options.reason:
        log_admin_message(
            logging.WARNING,
            'Default SLA values (percentage: %s, duration: %s) are overridden for the following '
            'hosts: %s. New percentage: %s, duration: %s, override reason: %s' % (
                SLA_UPTIME_PERCENTAGE_LIMIT,
                SLA_UPTIME_DURATION_LIMIT,
                hostnames,
                percentage,
                duration,
                options.reason))
    return percentage or SLA_UPTIME_PERCENTAGE_LIMIT, duration or SLA_UPTIME_DURATION_LIMIT
def parse_and_validate_sla_drain_default(options):
    """Parses and validates the default SLA 3-tuple (percentage, duration, timeout).

    Reads default_percentage, default_duration and timeout from options. The percentage is
    validated by parse_sla_percentage(); duration and timeout are parsed with parse_time()
    and then converted with as_(Time.SECONDS).

    :param options: command line options
    :type options: list of app.option
    :rtype: a tuple of: default percentage (float), default duration and timeout, the
            latter two expressed in seconds
    """
    return (
        parse_sla_percentage(options.default_percentage),
        parse_time(options.default_duration).as_(Time.SECONDS),
        parse_time(options.timeout).as_(Time.SECONDS))
def print_results(results):
    """Writes every formatted SLA result to standard output, one print() call per entry.

    :param results: formatted SLA results
    :type results: list of string
    """
    for entry in results:
        print(entry)
def format_sla_results(host_groups, unsafe_only=False):
    """Formats SLA check result output.

    Hosts are visited in sorted order within each group and job details in sorted order
    within each host. Every job detail becomes one tab-separated row: host, job path,
    predicted percentage (two decimals), safe flag and safe_in_secs ('n/a' when it is
    None). The rows of one host are joined with newlines into a single output string;
    hosts without any row are left out.

    :param host_groups: SLA check result groups (grouped by external grouping criteria,
                        e.g. by_host)
    :type host_groups: list of (defaultdict(list))
    :param unsafe_only: If True, includes only SLA-"unsafe" hosts from the results
    :type unsafe_only: bool
    :rtype: a tuple of: list of output strings, set of hostnames included in output.
    """
    output = []
    reported_hosts = set()
    for group in host_groups:
        for host, job_details in sorted(group.items()):
            rows = []
            for detail in sorted(job_details):
                # In unsafe-only mode, details flagged as safe are not reported.
                if unsafe_only and detail.safe:
                    continue
                rows.append('%s\t%s\t%.2f\t%s\t%s' % (
                    host,
                    detail.job.to_path(),
                    detail.predicted_percentage,
                    detail.safe,
                    'n/a' if detail.safe_in_secs is None else detail.safe_in_secs))
            if rows:
                output.append('\n'.join(rows))
                reported_hosts.add(host)
    return output, reported_hosts
def make_admin_client(cluster, verbose=False, bypass_leader_redirect=False):
    """Creates an API client with the specified options for use in admin commands.

    A Cluster object is used as is; any other value is looked up in CLUSTERS, and die() is
    called when it is not found there.

    :param cluster: The cluster to connect with.
    :type cluster: Either a string cluster name or a Cluster object.
    :param verbose: Should the client emit verbose output.
    :type verbose: bool
    :param bypass_leader_redirect: Should the client bypass the scheduler's leader redirect filter.
    :type bypass_leader_redirect: bool
    :rtype: an AuroraClientAPI instance.
    """
    if isinstance(cluster, Cluster):
        target = cluster
    else:
        if cluster not in CLUSTERS:
            die('Unknown cluster: %s. Known clusters: %s' % (cluster, ", ".join(CLUSTERS.keys())))
        target = CLUSTERS[cluster]

    return AuroraClientAPI(
        target,
        AURORA_ADMIN_USER_AGENT_NAME,
        verbose=verbose,
        bypass_leader_redirect=bypass_leader_redirect)