blob: 1c069e3884047e338cc6f8463ef5cb1e5622d5d6 [file] [log] [blame]
#!/usr/bin/env bash
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
#
# This file contains a watchdog tool to monitor a task and potentially kill it after
# not producing any output for $MAX_NO_OUTPUT seconds.
#
# Seconds the watched process may stay silent before the timeout callback is
# run and the process is killed. Can be overridden through the environment.
: "${MAX_NO_OUTPUT:=900}"
# Seconds the watchdog sleeps between two consecutive checks. Can be overridden
# through the environment.
: "${SLEEP_TIME:=20}"
# Internal fields:
#   CMD_OUT  - file receiving a copy of the watched command's output
#   CMD_PID  - file holding the PID of the watched command
#   CMD_EXIT - file holding the exit code of the watched command
CMD_OUT='/tmp/watchdog.out'
CMD_PID='/tmp/watchdog.pid'
CMD_EXIT='/tmp/watchdog.exit'
# =============================================
# Utility functions
# =============================================
# Prints the largest of the given integer arguments, followed by a newline.
# Arguments: $1.. - integers to compare
# Outputs:   the maximum, formatted with printf '%d'
max_of() {
  local largest="$1"
  local candidate idx

  # Walk the remaining positional parameters by index; ${!idx} is the idx-th
  # positional parameter.
  for (( idx = 2; idx <= $#; idx++ )); do
    candidate="${!idx}"
    if (( largest < candidate )); then
      largest="$candidate"
    fi
  done

  printf '%d\n' "$largest"
}
# Returns the highest modification time out of $CMD_OUT (which is the command output file)
# and any file(s) matched by $WATCHDOG_ADDITIONAL_MONITORING_FILES (additional files to
# monitor, e.g. the "mvn-*.log" logging files created by Flink's tests)
mod_time () {
  # Prints the most recent modification time (as reported by `stat -c %Y`)
  # among $CMD_OUT and every file that $WATCHDOG_ADDITIONAL_MONITORING_FILES
  # expands to.
  # Globals: CMD_OUT (read), WATCHDOG_ADDITIONAL_MONITORING_FILES (read)
  # Outputs: a single integer timestamp on stdout
  local cmd_out_mod_time additional_files_mod_times

  # Quoted so that an output path containing whitespace is passed as one operand.
  cmd_out_mod_time=$(stat -c "%Y" "$CMD_OUT")

  # Deliberately unquoted: the variable may hold several paths and/or glob
  # patterns. stat errors (no files configured, no match) are discarded, in
  # which case only $CMD_OUT counts.
  # shellcheck disable=SC2086
  additional_files_mod_times=$(stat -c "%Y" $WATCHDOG_ADDITIONAL_MONITORING_FILES 2> /dev/null)

  # Deliberately unquoted: every timestamp must become its own argument.
  # shellcheck disable=SC2086
  max_of $cmd_out_mod_time $additional_files_mod_times
}
# Prints the current time as reported by `date +%s` (seconds since the epoch).
the_time() {
  printf '%s\n' "$(date +%s)"
}
# watchdog process
# Polls every $SLEEP_TIME seconds and compares the current time with the latest
# modification time reported by mod_time. Once the difference reaches
# $MAX_NO_OUTPUT seconds it runs $CALLBACK_ON_TIMEOUT, signals the process whose
# PID is stored in $CMD_PID (and its children) and exits with status 1.
# Globals: CMD_OUT, CMD_PID, SLEEP_TIME, MAX_NO_OUTPUT, CALLBACK_ON_TIMEOUT (read)
# Note:    never returns normally; it is meant to be started in the background.
watchdog () {
  local time_diff pid

  # Make sure the output file exists so that its modification time can be read.
  touch "$CMD_OUT"

  while true; do
    sleep "$SLEEP_TIME"
    time_diff=$(( $(the_time) - $(mod_time) ))

    if [ "$time_diff" -ge "$MAX_NO_OUTPUT" ]; then
      echo "=============================================================================="
      echo "Process produced no output for ${MAX_NO_OUTPUT} seconds."
      echo "=============================================================================="

      # run timeout callback
      # Deliberately unquoted so the callback may carry arguments; an empty
      # value expands to nothing and is a no-op.
      # shellcheck disable=SC2086
      $CALLBACK_ON_TIMEOUT

      # Read the PID file once. If it is missing or empty there is nothing to
      # signal; running pkill/kill without an operand would only print errors.
      pid=""
      if [ -r "$CMD_PID" ]; then
        pid=$(<"$CMD_PID")
      fi

      if [ -n "$pid" ]; then
        echo "Killing process with pid=${pid} and all descendants"
        # NOTE(review): pkill -P matches on parent PID, so this signals the
        # direct children only; deeper descendants are not signalled here.
        pkill -P "$pid" # kill descendants
        kill "$pid" # kill process itself
      else
        echo "No PID found in ${CMD_PID}; nothing to kill." >&2
      fi

      exit 1
    fi
  done
}
# =============================================
# main function
# =============================================
# entrypoint
# Runs a command under the supervision of a background watchdog.
# Globals:   CMD_OUT, CMD_PID, CMD_EXIT (read); WD_PID, EXIT_CODE (written)
# Arguments: $1 - command line to run (word-split by the shell when executed)
#            $2 - optional command the watchdog runs when the timeout is hit
# Outputs:   status messages and the command's stdout (also copied to $CMD_OUT)
# Returns:   the exit code of the command; 1 if no exit code was recorded
function run_with_watchdog() {
  local cmd="$1"
  # Declared local here; the watchdog function, started in the background from
  # this function, still sees it because it inherits the caller's variables.
  local CALLBACK_ON_TIMEOUT="${2:-}"

  watchdog &
  WD_PID=$!
  echo "STARTED watchdog (${WD_PID})."

  echo "RUNNING '${cmd}'."

  # Run $cmd and pipe its output to $CMD_OUT for the watchdog. The PID is written to
  # $CMD_PID (fd 3) so the watchdog can kill $cmd if it stops producing output, and the
  # exit code is written to $CMD_EXIT (fd 4). This is important for the CI build
  # life-cycle (success/failure). The redirect targets are quoted so that paths
  # containing whitespace do not cause an "ambiguous redirect".
  # $cmd is deliberately unquoted so it is split into command and arguments.
  # shellcheck disable=SC2086
  ( $cmd & PID=$! ; echo "$PID" >&3 ; wait "$PID" ; echo $? >&4 ) 3>"$CMD_PID" 4>"$CMD_EXIT" | tee "$CMD_OUT"

  EXIT_CODE=$(<"$CMD_EXIT")

  echo "Process exited with EXIT CODE: ${EXIT_CODE}."

  # Make sure to kill the watchdog in any case after $cmd has completed
  echo "Trying to KILL watchdog (${WD_PID})."
  ( kill "$WD_PID" 2>&1 ) > /dev/null

  rm "$CMD_PID"
  rm "$CMD_EXIT"

  # With an empty EXIT_CODE a bare `return` would report the status of the
  # preceding rm (usually 0); treat a missing exit code as a failure instead.
  return "${EXIT_CODE:-1}"
}