#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Command-line handling.
#
# Arguments (all five are required):
#   $1 meta-count     instance ids 1..meta-count are candidates when a meta
#                     server is chosen
#   $2 replica-count  instance ids 1..replica-count are candidates when a
#                     replica server is chosen
#   $3 app-name       table queried with "app <app-name> --detailed"
#   $4 kill-type      meta | replica | all
#   $5 sleep-time     upper bound (seconds) of the random delay before a stop
if [[ $# -ne 5 ]]; then
  echo "USAGE: $0 <meta-count> <replica-count> <app-name> <kill-type> <sleep-time>"
  echo " kill-type: meta | replica | all"
  exit 1
fi

META_COUNT=$1
REPLICA_COUNT=$2
APP_NAME=$3
KILL_TYPE=$4
SLEEP_TIME=$5

case "$KILL_TYPE" in
  meta | replica | all) ;;
  *)
    echo "ERROR: invalid kill-type, should be: meta | replica | all"
    exit 1
    ;;
esac

# The three numeric arguments are later used as modulo operands
# (RANDOM % N + 1). A non-number or zero would raise "division by 0" on every
# round instead of failing once, so reject them here.
UINT_PATTERN='^[0-9]+$'
if ! [[ "$META_COUNT" =~ $UINT_PATTERN &&
  "$REPLICA_COUNT" =~ $UINT_PATTERN &&
  "$SLEEP_TIME" =~ $UINT_PATTERN ]]; then
  echo "ERROR: meta-count, replica-count and sleep-time should be non-negative integers"
  exit 1
fi
unset UINT_PATTERN

# Force base 10 so a value such as "08" is not parsed as invalid octal.
META_COUNT=$((10#$META_COUNT))
REPLICA_COUNT=$((10#$REPLICA_COUNT))
SLEEP_TIME=$((10#$SLEEP_TIME))

# A count only has to be positive when that server type can actually be picked.
if [[ "$KILL_TYPE" != "replica" ]] && ((META_COUNT < 1)); then
  echo "ERROR: invalid meta-count, should be greater than 0 for kill-type $KILL_TYPE"
  exit 1
fi
if [[ "$KILL_TYPE" != "meta" ]] && ((REPLICA_COUNT < 1)); then
  echo "ERROR: invalid replica-count, should be greater than 0 for kill-type $KILL_TYPE"
  exit 1
fi
if ((SLEEP_TIME < 1)); then
  echo "ERROR: invalid sleep-time, should be greater than 0"
  exit 1
fi
#######################################
# Print a final cluster summary, kill the pegasus_kill_test process(es),
# stop the onebox cluster and terminate this script.
# Globals:   SHELL_INPUT (read) - file with the shell commands to replay
# Arguments: none
# Outputs:   summary lines (those starting with a digit or with
#            "primary_meta_server") framed by separator lines, on stdout
# Returns:   never returns; always exits the script with status 1
#######################################
function stop_kill_test() {
  echo "---------------------------"
  ./run.sh shell <"$SHELL_INPUT" | grep '^[0-9]\|^primary_meta_server'
  echo "---------------------------"
  # -r: do not run a bare "kill" (which only prints a usage error) when no
  # matching process is left, e.g. when we got here because it already died.
  # NOTE(review): the pattern matches any command line containing
  # "pegasus_kill_test" - confirm this cannot include this script itself.
  ps -ef | grep pegasus_kill_test | grep -v grep | awk '{print $2}' | xargs -r kill
  ./run.sh stop_onebox
  exit 1
}
# Resolve the directory holding this script and switch to its parent, so that
# the relative paths used below (./run.sh, onebox/...) resolve from there.
# Paths are quoted and every cd is checked: continuing in the wrong directory
# would make all later ./run.sh invocations fail or hit the wrong tree.
pwd="$(cd "$(dirname "$0")" && pwd)" || {
  echo "ERROR: cannot resolve the directory of $0"
  exit 1
}
shell_dir="$(cd "$pwd/.." && pwd)" || {
  echo "ERROR: cannot resolve the parent directory of $pwd"
  exit 1
}
cd "$shell_dir" || {
  echo "ERROR: cannot change directory to $shell_dir"
  exit 1
}

# Tunables.
LEASE_TIMEOUT=20       # seconds: bound of the random pre-restart delay and
                       # interval between post-restart checks
UNHEALTHY_TIMEOUT=1200 # max one-second polls to wait for a healthy cluster
KILL_META_PROB=10      # with kill-type "all", meta is picked 1 time in N

# Scratch files: commands fed to "./run.sh shell" and its captured output.
SHELL_INPUT=.kill_test.shell.input
SHELL_OUTPUT=.kill_test.shell.output
{
  echo "cluster_info"
  echo "nodes"
  echo "app $APP_NAME --detailed"
} >"$SHELL_INPUT"
# Main kill-test loop. Each round:
#   1. stop the test if the pegasus_kill_test process is gone or a core file
#      appeared under onebox/;
#   2. poll the cluster once per second until it reports healthy (give up after
#      more than UNHEALTHY_TIMEOUT polls);
#   3. pick a meta or replica instance at random, stop it after a random delay
#      and wait until it is no longer listed;
#   4. restart it after another random delay and, every LEASE_TIMEOUT seconds,
#      check that it is listed again - retrying the start when its newest log
#      reports "Address already in use", aborting otherwise.
# stop_kill_test exits the script, so code after a call to it is not reached.
WAIT_SECONDS=0
while true; do
  if ! ps -ef | grep "pegasus_kill_test" | grep -v grep &>/dev/null; then
    echo "[$(date)] pegasus_kill_test process is not exist, stop kill test"
    stop_kill_test
  fi

  # One find pass covers both "core" and "core.<suffix>"; the first hit is
  # enough, so stop searching right after printing it.
  if [[ -n "$(find onebox \( -name 'core' -o -name 'core.*' \) -print -quit)" ]]; then
    echo "[$(date)] coredump generated, stop kill test"
    stop_kill_test
  fi

  # Health snapshot. The third column of the partition rows has the form
  # "<n>/<m>"; rows whose leading number does not contain "3" count as
  # unhealthy (presumably a 3-replica table - confirm before changing).
  ./run.sh shell <"$SHELL_INPUT" &>"$SHELL_OUTPUT"
  UNALIVE_COUNT=$(grep -c UNALIVE "$SHELL_OUTPUT")
  PARTITION_COUNT=$(awk '{print $3}' "$SHELL_OUTPUT" | grep '/' | grep -c '^[0-9]')
  UNHEALTHY_COUNT=$(awk '{print $3}' "$SHELL_OUTPUT" | grep '/' |
    grep -o '^[0-9]*' | grep -cv '3')

  if ((UNALIVE_COUNT > 0 || PARTITION_COUNT == 0 || UNHEALTHY_COUNT > 0)); then
    if ((WAIT_SECONDS > UNHEALTHY_TIMEOUT)); then
      echo "[$(date)] not healthy for $WAIT_SECONDS seconds, stop kill test"
      stop_kill_test
    fi
    WAIT_SECONDS=$((WAIT_SECONDS + 1))
    sleep 1
    continue
  fi

  echo "[$(date)] healthy now, waited for $WAIT_SECONDS seconds"
  echo "---------------------------"
  grep '^[0-9]\|^primary_meta_server' "$SHELL_OUTPUT"
  echo "---------------------------"
  echo

  echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>"
  # Choose the server type to kill in this round.
  case "$KILL_TYPE" in
    meta | replica)
      TYPE=$KILL_TYPE
      ;;
    *)
      # kill-type "all": meta is picked once in KILL_META_PROB draws.
      if ((RANDOM % KILL_META_PROB == 0)); then
        TYPE=meta
      else
        TYPE=replica
      fi
      ;;
  esac

  # Choose the instance id (1-based) and the matching run.sh option.
  if [[ "$TYPE" == "meta" ]]; then
    OPT="-m"
    TASK_ID=$((RANDOM % META_COUNT + 1))
  else
    OPT="-r"
    TASK_ID=$((RANDOM % REPLICA_COUNT + 1))
  fi

  # The delay is deliberately NOT stored in SECONDS: bash keeps incrementing
  # that special variable every second, so the value slept could differ from
  # the value just logged.
  SLEEP_SECS=$((RANDOM % SLEEP_TIME + 1))
  echo "[$(date)] sleep for $SLEEP_SECS seconds before stopping $TYPE #$TASK_ID"
  sleep "$SLEEP_SECS"

  echo "[$(date)] stop $TYPE #$TASK_ID"
  ./run.sh stop_onebox_instance "$OPT" "$TASK_ID"
  WAIT_STOP=0
  # Wait until the instance disappears from the onebox listing; start
  # reporting once it has taken more than 10 seconds.
  while ./run.sh list_onebox | grep "\<$TYPE@$TASK_ID\>" &>/dev/null; do
    if ((WAIT_STOP > 10)); then
      echo "[$(date)] wait stopping of $TYPE #$TASK_ID for $WAIT_STOP seconds"
    fi
    sleep 1
    WAIT_STOP=$((WAIT_STOP + 1))
  done

  SLEEP_SECS=$((RANDOM % LEASE_TIMEOUT + 1))
  echo "[$(date)] sleep for $SLEEP_SECS seconds before starting $TYPE #$TASK_ID"
  sleep "$SLEEP_SECS"

  echo "[$(date)] start $TYPE #$TASK_ID"
  ./run.sh start_onebox_instance "$OPT" "$TASK_ID"
  WAIT_SECONDS=0
  while true; do
    echo "[$(date)] sleep for $LEASE_TIMEOUT seconds before checking cluster healthy"
    sleep "$LEASE_TIMEOUT"
    WAIT_SECONDS=$((WAIT_SECONDS + LEASE_TIMEOUT))
    if ./run.sh list_onebox | grep "\<$TYPE@$TASK_ID\>" &>/dev/null; then
      break
    fi

    # The instance is not listed: look at its most recently modified log file.
    # The file is addressed by path, without changing directory, so a missing
    # log directory can never make us inspect an unrelated file or leave the
    # script in the wrong working directory.
    LOG_DIR="onebox/${TYPE}${TASK_ID}/data/log"
    LATEST_LOG=$(ls --sort=time -r -- "$LOG_DIR" | tail -n 1)
    if [[ -n "$LATEST_LOG" ]] &&
      grep "Address already in use" "$LOG_DIR/$LATEST_LOG"; then
      # Show who holds the port, then retry the start.
      # NOTE(review): the same base port is used for meta and replica
      # instances - confirm it matches the onebox port layout.
      PORT=$((34800 + TASK_ID))
      netstat -nap 2>/dev/null | grep '^tcp' | grep "\<$PORT\>"
      echo "[$(date)] start $TYPE #$TASK_ID again"
      ./run.sh start_onebox_instance "$OPT" "$TASK_ID"
      WAIT_SECONDS=0
    else
      echo "[$(date)] start $TYPE #$TASK_ID failed, stop kill test"
      stop_kill_test
    fi
  done
done