blob: 9fb13a25320bb5006af2f36c2ecf50f8ba5f1335 [file] [log] [blame]
#! /usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
LOGHOST=$1
shift
process=$1
SOURCE="${BASH_SOURCE[0]}"
while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
bin="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
SOURCE="$(readlink "$SOURCE")"
[[ $SOURCE != /* ]] && SOURCE="$bin/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
done
bin="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
# Stop: Resolve Script Directory
. "${bin}"/config.sh
CLEAN_EXIT="Clean Exit"
UNEXPECTED_EXCEPTION="Unexpected exception"
OOM_EXCEPTION="Out of memory exception"
ZKLOCK_LOST="ZKLock lost"
UNKNOWN_ERROR="Unknown error"
ERRFILE=${ACCUMULO_LOG_DIR}/${process}_${LOGHOST}.err
OUTFILE=${ACCUMULO_LOG_DIR}/${process}_${LOGHOST}.out
DEBUGLOG=${ACCUMULO_LOG_DIR}/${process}_$(hostname).debug.log
export COMMAND="${bin}/accumulo \"\$@\""
logger -s "starting process $process at $(date)"
stopRunning=""
while [ -z "$stopRunning" ];
do
eval $COMMAND 2> $ERRFILE
exit=$?
unset cause
if [ "$exit" -eq 0 ]; then
potentialStopRunning=$CLEAN_EXIT
elif [ "$exit" -eq 1 ]; then
potentialStopRunning=$UNEXPECTED_EXCEPTION
elif [ "$exit" -eq 130 ]; then
stopRunning="Control C detected, exiting"
elif [ "$exit" -eq 143 ]; then
stopRunning="Process terminated, exiting"
elif [ "$exit" -eq 137 ]; then
potentialStopRunning="Process killed, exiting"
fi
if [ -z "$stopRunning" ]; then
stopRunning=$potentialStopRunning;
if [ $exit -eq 1 ]; then
source="exit code"
cause=$UNEXPECTED_EXCEPTION
elif tail -n50 $OUTFILE | grep "java.lang.OutOfMemoryError:" > /dev/null; then
source="logs"
cause=$OOM_EXCEPTION
elif [ "$process" = "tserver" ]; then
if tail -n50 $DEBUGLOG | grep "ERROR: Lost tablet server lock (reason =" > /dev/null ; then
source="logs"
cause=$ZKLOCK_LOST
fi
elif [ "$process" = "master" ]; then
if tail -n50 $DEBUGLOG | grep "ERROR: Master lock in zookeeper lost (reason =" > /dev/null ; then
source="logs"
cause=$ZKLOCK_LOST
fi
elif [ "$process" = "gc" ]; then
if tail -n50 $DEBUGLOG | grep "FATAL: GC lock in zookeeper lost (reason =" > /dev/null ; then
source="logs"
cause=$ZKLOCK_LOST
fi
elif [ "$process" = "monitor" ]; then
if tail -n50 $DEBUGLOG | grep "ERROR: Monitor lock in zookeeper lost (reason =" > /dev/null ; then
source="logs"
cause=$ZKLOCK_LOST
fi
elif [ $exit -ne 0 ]; then
source="exit code"
cause=$UNKNOWN_ERROR
fi
case $cause in
#Unknown exit code
"$UNKNOWN_ERROR")
#window doesn't matter when retries = 0
RETRIES=0
;;
"$UNEXPECTED_EXCEPTION")
WINDOW=$UNEXPECTED_TIMESPAN
RETRIES=$UNEXPECTED_RETRIES
;;
"$OOM_EXCEPTION")
WINDOW=$OOM_TIMESPAN
RETRIES=$OOM_RETRIES
;;
"$ZLOCK_LOST")
WINDOW=$ZKLOCK_TIMESPAN
RETRIES=$ZKLOCK_RETRIES
;;
esac
if [ -n "$cause" ]; then
stopRunning=""
declare -i attempts
attempts="`jobs | grep "reason$cause" | wc -l`+1"
if [ "$RETRIES" -le $attempts ]; then
stopRunning="$process encountered $cause in $source with exit code $exit- quitting ($attempts/$RETRIES in $WINDOW seconds)"
# kill all sleeps now
for list in `jobs | cut -b 2-2`; do kill %$list; done
else
logger -s "$process encountered $cause in $source with exit code $exit- retrying ($attempts/$RETRIES in $WINDOW seconds)"
eval "(sleep $WINDOW ; echo "reason$cause" >> /dev/null) &"
fi
fi
fi
done
logger -s $stopRunning