| #! /usr/bin/env bash |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| |
# Resolve the absolute directory that contains this script (bin_dir) and that
# directory's parent (at_home), then source the `build` file that sits next to
# this script.
bin_dir=$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)
at_home=$(
  cd "$(dirname "$bin_dir")" && pwd
)
. "${bin_dir}/build"
| |
# Writes the command-line help text for this script to stdout.
function print_usage() {
  # One argument per output line; the leading empty string yields the blank
  # first line of the help text.
  printf '%s\n' \
    '' \
    'Usage: agitator <command>' \
    '' \
    'Possible commands:' \
    'start Starts agitator' \
    'stop Stop agitator'
}
| |
# Starts an app-specific agitator. It loops forever: sleep, kill the app on a
# random selection of its hosts over ssh, sleep again, then restart the app on
# exactly the hosts that were killed.
# usage: start_app_agitator app_name kill_sleep_time restart_sleep_time min_kill max_kill start_cmd kill_cmd
#   app_name            name handed to get_app_hosts to look up the host list
#   kill_sleep_time     minutes to sleep before each kill round
#   restart_sleep_time  minutes to sleep between the kill and the restart
#   min_kill, max_kill  inclusive bounds on the number of hosts killed per round
#                       (not consulted when the app has exactly one host)
#   start_cmd           command run on each killed host (via ssh) to restart the app
#   kill_cmd            command run on each selected host (via ssh) to kill the app
# Requires get_app_hosts to print a whitespace-separated host list for app_name.
# Exits with status 1 on invalid arguments or an empty host list; otherwise it
# never returns, so callers are expected to run it as a background job.
function start_app_agitator() {
  local app_name=$1
  local kill_sleep_time=$2
  local restart_sleep_time=$3
  local min_kill=$4
  local max_kill=$5
  local start_cmd=$6
  local kill_cmd=$7
  local -a hosts_array=()
  read -r -a hosts_array < <(get_app_hosts "$app_name")
  local num_hosts=${#hosts_array[@]}
  local node_to_kill node num_to_kill idx swap_idx swap_tmp arg_name
  local -a nodes_to_kill_array=() pool=()
  local T
  local ENV_VARS="ACCUMULO_HOME=$ACCUMULO_HOME ZOOKEEPER_HOME=$ZOOKEEPER_HOME HADOOP_HOME=$HADOOP_HOME JAVA_HOME=$JAVA_HOME"

  # An empty or non-numeric value would silently evaluate to 0 in the
  # arithmetic below (e.g. "sleep 0" in a tight kill loop), so reject it early.
  for arg_name in kill_sleep_time restart_sleep_time min_kill max_kill; do
    if [[ ! ${!arg_name} =~ ^[0-9]+$ ]]; then
      echo "ERROR: $arg_name must be a non-negative integer but was '${!arg_name}'"
      exit 1
    fi
  done

  if ((num_hosts == 0)); then
    echo "ERROR: No hosts were found in env for $app_name"
    exit 1
  fi
  if ((max_kill > num_hosts)); then
    echo "ERROR: Max kill $max_kill is greater than number of hosts $num_hosts"
    exit 1
  fi
  if ((max_kill < min_kill)); then
    echo "ERROR: Max kill $max_kill is less than min kill $min_kill"
    exit 1
  fi

  T="$(date +'%Y%m%d %H:%M:%S')"
  echo "$T Starting $app_name agitation. Kill every $kill_sleep_time minutes, restart every $restart_sleep_time minutes."
  echo "$T Will randomly kill between $min_kill and $max_kill of the following: ${hosts_array[*]}"
  while true; do
    echo "$T Sleeping for $kill_sleep_time minutes"
    sleep $((kill_sleep_time * 60))

    T="$(date +'%Y%m%d %H:%M:%S')"
    if ((num_hosts == 1)); then
      node_to_kill=${hosts_array[0]}
      echo "$T Killing $app_name at $node_to_kill"
      ssh "$node_to_kill" "$kill_cmd"
    else
      # Uniform pick in the inclusive range [min_kill, max_kill].
      num_to_kill=$((min_kill + RANDOM % (max_kill - min_kill + 1)))
      # Partial Fisher-Yates shuffle of a copy of the host list: after the
      # loop the first num_to_kill slots hold distinct, randomly chosen hosts.
      # This always terminates and rebuilds the selection from scratch on
      # every round, so nothing is carried over from the previous round.
      pool=("${hosts_array[@]}")
      for ((idx = 0; idx < num_to_kill; idx++)); do
        swap_idx=$((idx + RANDOM % (num_hosts - idx)))
        swap_tmp=${pool[idx]}
        pool[idx]=${pool[swap_idx]}
        pool[swap_idx]=$swap_tmp
      done
      nodes_to_kill_array=("${pool[@]:0:num_to_kill}")
      echo "$T Killing ${#nodes_to_kill_array[@]} $app_name nodes"
      for node in "${nodes_to_kill_array[@]}"; do
        echo "$T Killing $app_name at $node"
        ssh "$node" "$kill_cmd"
      done
    fi

    T="$(date +'%Y%m%d %H:%M:%S')"
    echo "$T Sleeping for $restart_sleep_time minutes."
    sleep $((restart_sleep_time * 60))

    T="$(date +'%Y%m%d %H:%M:%S')"
    if ((num_hosts == 1)); then
      echo "$T Restarting $app_name at $node_to_kill"
      ssh "$node_to_kill" "bash -c '${ENV_VARS} $start_cmd'"
    else
      for node in "${nodes_to_kill_array[@]}"; do
        echo "$T Restarting $app_name node at ${node}"
        ssh "$node" "bash -c '${ENV_VARS} $start_cmd'"
      done
    fi
  done
}
| |
# Launches the manager, tserver and datanode agitators as background jobs and,
# when AGTR_HDFS is "true", the HDFS agitator perl script as well.
# Each app agitator logs to ${at_home}/logs/<timestamp>_<hostname>_<app>-agitator.log.
# Globals read:    at_home, HADOOP_HOME, ACCUMULO_HOME, AGITATOR_USER, AGTR_* settings
# Globals written: PSSH (exported), AGITATOR_USER (defaults to the output of whoami)
# Exits with status 1 when neither pssh nor parallel-ssh is installed.
function start_agitator() {
  local log_base manager_log tserver_log datanode_log agitator_log
  local datanode_kill_cmd datanode_start_cmd
  local manager_kill_cmd manager_start_cmd
  local tserver_kill_cmd tserver_start_cmd

  ## check that pssh is installed, falling back to parallel-ssh if needed
  ## make sure to export it, so it can be seen inside the agitator perl script
  if hash pssh 2>/dev/null; then
    export PSSH=pssh
  elif hash parallel-ssh 2>/dev/null; then
    export PSSH=parallel-ssh
  else
    echo >&2 "The agitator requires pssh/parallel-ssh to be installed. Aborting."
    exit 1
  fi
  ## read configuration into env variables
  read_cluster_conf

  mkdir -p "${at_home}/logs"
  log_base="${at_home}/logs/$(date +%Y%m%d%H%M%S)_$(hostname)"
  manager_log="${log_base}_manager-agitator.log"
  tserver_log="${log_base}_tserver-agitator.log"
  datanode_log="${log_base}_datanode-agitator.log"
  # The bracketed first character in each pattern keeps pkill -f from matching
  # the command line that carries the pattern itself.
  datanode_kill_cmd="pkill -9 -f '[p]roc_datanode'"
  datanode_start_cmd="$HADOOP_HOME/bin/hdfs --daemon start datanode"
  manager_kill_cmd="pkill -f '[ ]org[.]apache[.]accumulo[.]start[.]Main manager'"
  manager_start_cmd="$ACCUMULO_HOME/bin/accumulo-service manager start"
  tserver_kill_cmd="pkill -f '[ ]org[.]apache[.]accumulo[.]start[.]Main tserver'"
  tserver_start_cmd="$ACCUMULO_HOME/bin/accumulo-service tserver start"

  [[ -n $AGITATOR_USER ]] || AGITATOR_USER=$(whoami)

  # NOTE(review): "sudo -i -u USER" without a command opens a login shell and,
  # once that shell exits, the lines below still run as the invoking user.
  # Confirm whether the agitators were meant to be launched under sudo instead.
  if [[ $AGITATOR_USER != "$AGTR_ACCUMULO_USER" ]]; then
    sudo -i -u "$AGTR_ACCUMULO_USER"
  fi
  echo "Starting manager and tserver agitation as $(whoami)"
  start_app_agitator manager "$AGTR_MANAGER_KILL_SLEEP_TIME" "$AGTR_MANAGER_RESTART_SLEEP_TIME" 1 1 "$manager_start_cmd" "$manager_kill_cmd" >"$manager_log" 2>&1 &
  start_app_agitator tserver "$AGTR_TSERVER_KILL_SLEEP_TIME" "$AGTR_TSERVER_RESTART_SLEEP_TIME" "$AGTR_TSERVER_MIN_KILL" "$AGTR_TSERVER_MAX_KILL" "$tserver_start_cmd" "$tserver_kill_cmd" >"$tserver_log" 2>&1 &

  # NOTE(review): same caveat as the sudo call above.
  if [[ $AGITATOR_USER != "$AGTR_HDFS_USER" ]]; then
    sudo -i -u "$AGTR_HDFS_USER"
  fi
  echo "Running datanode agitator as $(whoami)"
  start_app_agitator datanode "$AGTR_DATANODE_KILL_SLEEP_TIME" "$AGTR_DATANODE_RESTART_SLEEP_TIME" "$AGTR_DATANODE_MIN_KILL" "$AGTR_DATANODE_MAX_KILL" "$datanode_start_cmd" "$datanode_kill_cmd" >"${datanode_log}" 2>&1 &

  # Compare the setting as a string; never execute a configuration value as a
  # command.
  if [[ ${AGTR_HDFS:-false} == true ]]; then
    agitator_log=${log_base}_hdfs-agitator
    sudo -u "$AGTR_HDFS_SUPERUSER" nohup "${at_home}/libexec/hdfs-agitator.pl" --sleep "${AGTR_HDFS_SLEEP_TIME}" --hdfs-cmd "${AGTR_HDFS_COMMAND}" --superuser "${AGTR_HDFS_SUPERUSER}" >"${agitator_log}.out" 2>"${agitator_log}.err" &
  fi
}
| |
# Stops running agitators by signalling every process group that contains a
# process whose command line matches "agitator start".
# Globals written: AGITATOR_USER (defaults to the output of whoami)
# Returns 1 when no matching process (or no process group for it) is found;
# otherwise returns the exit status of kill.
function stop_agitator() {
  [[ -n $AGITATOR_USER ]] || AGITATOR_USER=$(whoami)
  echo "Stopping all processes in the same process group as 'agitator' as user $AGITATOR_USER"
  ## get process ids of all agitator processes (return 1 if none found)
  local -a agitator_pids=()
  # pgrep prints one pid per line; -d '' makes read consume the whole output
  # instead of stopping after the first line.
  read -r -d '' -a agitator_pids < <(pgrep -f "agitator start")
  ((${#agitator_pids[@]} > 0)) || return 1
  ## get the group process ids of all agitator processes
  local pid_list
  # ps -p takes a single comma-separated pid list
  pid_list=$(
    IFS=,
    echo "${agitator_pids[*]}"
  )
  local -a group_pids=()
  # several agitators usually share one group, so drop duplicate group ids
  read -r -d '' -a group_pids < <(ps -o pgid= -p "$pid_list" | sort -un)
  ((${#group_pids[@]} > 0)) || return 1
  ## kill all processes in the process groups (should include agitators and their sleep processes)
  # a leading "-" on each id tells kill to signal the whole process group
  kill -- "${group_pids[@]/#/-}"
}
| |
# Reports that ${conf}/cluster.yaml could not be parsed and terminates the
# script with status 1. Reads the global `conf`.
function parse_fail() {
  printf '%s\n' "Failed to parse ${conf}/cluster.yaml"
  exit 1
}
| |
# Loads the cluster configuration: runs Accumulo's ClusterConfigParser on
# ${ACCUMULO_HOME}/conf/cluster.yaml, sources the output it writes to a temp
# file, and exits with status 1 unless both MANAGER_HOSTS and TSERVER_HOSTS
# are non-empty afterwards.
# Globals written: conf, CONFIG_FILE, plus whatever the sourced output defines.
# Installs an EXIT trap that removes the temp file.
function read_cluster_conf() {
  conf="${ACCUMULO_HOME}/conf"
  echo "Reading cluster config from ${conf}/cluster.yaml"
  trap 'rm -f "$CONFIG_FILE"' EXIT
  if ! CONFIG_FILE=$(mktemp); then
    exit 1
  fi
  if ! accumulo org.apache.accumulo.core.conf.cluster.ClusterConfigParser "${conf}"/cluster.yaml >"$CONFIG_FILE"; then
    parse_fail
  fi
  source "$CONFIG_FILE"
  rm -f "$CONFIG_FILE"

  # Each entry is <variable name>:<label used in the error message>; managers
  # are checked before tservers.
  local required var_name label
  for required in MANAGER_HOSTS:managers TSERVER_HOSTS:tservers; do
    var_name=${required%%:*}
    label=${required#*:}
    if [[ -z ${!var_name} ]]; then
      echo "ERROR: ${label} not found in ${conf}/cluster.yaml"
      exit 1
    fi
  done
}
| |
# Prints (without a trailing newline) the space-separated host list for the app
# named in $1: MANAGER_HOSTS for "manager", TSERVER_HOSTS for "tserver" and
# "datanode". Prints nothing and returns 1 for any other name.
function get_app_hosts() {
  local requested_app=$1
  if [[ $requested_app == manager ]]; then
    echo -n "$MANAGER_HOSTS"
  elif [[ $requested_app == tserver || $requested_app == datanode ]]; then
    echo -n "$TSERVER_HOSTS"
  else
    return 1
  fi
}
| |
# Entry point: dispatch on the first command-line argument. Anything other
# than "start" or "stop" prints an error plus the usage text and exits 1.
case "$1" in
  start)
    start_agitator
    ;;
  stop)
    stop_agitator
    ;;
  *)
    # Report the argument that was actually inspected ($1, not $2).
    echo "ERROR: unknown command - $1"
    print_usage
    exit 1
    ;;
esac