#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# test that we can kill/restart mock participant remotely
# Test identifier: exported to child processes and passed to cm_driver.py via -n.
TEST_NAME=helix_random_kill_remote
export TEST_NAME
source setup_env.inc

# users/machines/dirs info for each test machine.
# The three arrays are parallel: index k of each one describes the same machine.
# Earlier four-machine configuration, kept for reference:
#USER_TAB=( "zzhang" "zzhang" "zzhang" "zzhang" )
#MACHINE_TAB=( "eat1-app26.corp" "eat1-app27.corp" "eat1-app28.corp" "eat1-app29.corp" )
#SCRIPT_DIR_TAB=( "/export/home/zzhang/workspace/helix/helix-core/src/main/scripts/integration-test/script" "/export/home/zzhang/workspace/helix/helix-core/src/main/scripts/integration-test/script" "/export/home/zzhang/workspace/helix/helix-core/src/main/scripts/integration-test/script" "/export/home/zzhang/workspace/helix/helix-core/src/main/scripts/integration-test/script" )
USER_TAB=(
  "zzhang"
  "zzhang"
)
MACHINE_TAB=(
  "eat1-app205.corp"
  "eat1-app80.corp"
)
SCRIPT_DIR_TAB=(
  "/export/home/zzhang/workspace/helix/helix-core/src/main/scripts/integration-test/script"
  "/export/home/zzhang/workspace/helix/helix-core/src/main/scripts/integration-test/script"
)

# constants
machine_nb=${#MACHINE_TAB[@]}  # number of test machines
controller_idx=1               # index into the *_TAB arrays of the machine that runs the cluster manager
mocks_per_node=3               # mock processes started on each machine
# colorful echo
red='\e[00;31m'
green='\e[00;32m'

#######################################
# Print a message, optionally wrapped in a terminal colour sequence.
# Arguments:
#   $1 - message text; nothing is printed when it is empty
#   $2 - optional colour sequence (e.g. $red or $green) written with
#        backslash escapes such as \e, which are expanded on output
# Outputs:
#   The message and a newline on stdout. With a colour, the message is
#   preceded by the colour sequence and followed by the reset sequence \e[00m.
#######################################
cecho() {
  # Keep the working variables local so calls do not clobber the caller's scope.
  local message="${1:-}"
  local color="${2:-}"

  if [[ -z "$message" ]]; then
    return 0
  fi

  if [[ -z "$color" ]]; then
    # printf prints the text verbatim, even when it looks like an echo option.
    printf '%s\n' "$message"
  else
    # %b expands escapes in the colour sequences only; %s leaves the message
    # untouched, so backslashes in the message are printed as-is.
    printf '%b%s%b\n' "$color" "$message" '\e[00m'
  fi
}
# ZooKeeper connection string, passed as -zkSvr to the commands that follow.
# It currently points at a fixed host. The commented-out lines are an earlier
# setup that started a ZooKeeper ensemble on the test machines themselves.
# zookeeper_server_ports="localhost:2188"
#zookeeper_address=${MACHINE_TAB[0]}:2181,${MACHINE_TAB[1]}:2181,${MACHINE_TAB[2]}:2181
zookeeper_address="eat1-app78.corp:12913"
# default datadir integration_test/var/work/zookeeper/data/1
# start the zookeeper cluster
# for i in `seq 0 2`; do
# ssh ${USER_TAB[$i]}@${MACHINE_TAB[$i]} "${SCRIPT_DIR_TAB[$i]}/cm_driver.py -n ${TEST_NAME} -c zookeeper -o start --zookeeper_reset --zookeeper_server_ports=\"$zookeeper_address\" --zookeeper_server_ids=$i --cmdline_props=\"tickTime=2000;initLimit=5;syncLimit=2\""
# done
# Run one clm_console command through cm_driver.py.
#   $1 - arguments appended after "-zkSvr <zookeeper_address>" inside --cmdline_args
clm_console() {
  $SCRIPT_DIR/cm_driver.py -c clm_console --cmdline_args="-zkSvr ${zookeeper_address} $1"
}

# create cluster
clm_console "-addCluster test-cluster"

# enable healthCheck
clm_console "--setConfig CLUSTER=test-cluster healthChange.enabled=true"

# add alerts
# Each entry is "<BizProfile stat name> <constant>". The loop below formats
# every entry into one EXP(...)CMP(GREATER)CON(...) string, in this order.
alert_specs=(
  "MeanMysqlLatency 2.132700625"
  "MinLuceneLatency 1.765905"
  "MaxServerLatency 167.714205"
  "MeanLuceneLatency 16.107599458333335"
  "MeanLucenePoolLatency 8.120545333333335"
  "MinServerLatency 0.425275"
  "IndexStoreMismatchCount 5"
  "ErrorCount 5"
  "MeanMysqlPoolLatency 1.0704102916666665"
  "MinLucenePoolLatency 0.008185"
  "MinMysqlLatency 0.709695"
  "MaxMysqlPoolLatency 8.606975"
  "MinMysqlPoolLatency 0.091885"
  "MaxLucenePoolLatency 65.930565"
  "MaxMysqlLatency 9.369825"
  "TimeStamp 1332895048145"
  "MeanConcurrencyLevel 1.5"
  "QueryStartCount 5"
  "MeanServerLatency 39.5451535"
  "MaxLuceneLatency 111.78795"
)
for alert_spec in "${alert_specs[@]}"; do
  alert_stat=${alert_spec%% *}       # text before the space
  alert_constant=${alert_spec##* }   # text after the space
  clm_console "--addAlert test-cluster \"EXP(decay(1.0)(*.MockRestQueryStats@DBName=BizProfile.${alert_stat}))CMP(GREATER)CON(${alert_constant})\""
done
# add resource
$SCRIPT_DIR/cm_driver.py -c clm_console \
  --cmdline_args="-zkSvr ${zookeeper_address} -addResource test-cluster test-db 120 MasterSlave"

# add nodes
# One node per (machine, mock) pair; mock i of every machine uses port start_port + i.
start_port=8900
for j in $(seq 0 $((machine_nb - 1))); do
  for i in $(seq 1 $mocks_per_node); do
    port=$((start_port + i))
    $SCRIPT_DIR/cm_driver.py -c clm_console \
      --cmdline_args="-zkSvr ${zookeeper_address} -addNode test-cluster ${MACHINE_TAB[$j]}:${port}"
  done
done

# rebalance
$SCRIPT_DIR/cm_driver.py -c clm_console \
  --cmdline_args="-zkSvr ${zookeeper_address} -rebalance test-cluster test-db 3"
# Launch cluster manager
# The remote command line is assembled piece by piece and handed to ssh as a
# single string; the escaped quotes are kept for the remote side.
# -Djava.rmi.server.hostname=${MACHINE_TAB[$controller_idx]}
controller_jvm_args="-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.local.only=false -Dcom.sun.management.jmxremote.port=27960 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false"
controller_cmd="${SCRIPT_DIR_TAB[$controller_idx]}/cm_driver.py -n ${TEST_NAME} -c cluster-manager -o start"
controller_cmd+=" --jvm_args=\"${controller_jvm_args}\""
controller_cmd+=" -l \"integration-test/config/log4j-info.properties\""
controller_cmd+=" --cmdline_args=\"-zkSvr ${zookeeper_address} -cluster test-cluster\""
ssh ${USER_TAB[$controller_idx]}@${MACHINE_TAB[$controller_idx]} "$controller_cmd"

# Launch mock health report process
# Mock i on machine j gets component id i and port start_port + i.
# for j in {0..1}; do
for j in $(seq 0 $((machine_nb - 1))); do
  for i in $(seq 1 $mocks_per_node); do
    port=$((start_port + i))
    mock_cmd="${SCRIPT_DIR_TAB[$j]}/cm_driver.py -n ${TEST_NAME} -c mock-health-report-process -o start"
    mock_cmd+=" -l \"integration-test/config/log4j-info.properties\""
    mock_cmd+=" --save_process_id --component_id=$i"
    mock_cmd+=" --cmdline_args=\"-zkSvr ${zookeeper_address} -cluster test-cluster -host ${MACHINE_TAB[$j]} -port ${port}\""
    ssh ${USER_TAB[$j]}@${MACHINE_TAB[$j]} "$mock_cmd"
  done
done
# sleep 3
# verify cluster state
# The verifier writes to a timestamped log file; afterwards the log lines that
# contain "Successful" or "fail" are printed (always in red).
verifier_output="${VIEW_ROOT}/${LOG_DIR_FROM_ROOT}/verifier_$(date +%y%m%d_%H%M%S).log"
$SCRIPT_DIR/cm_driver.py -n ${TEST_NAME} -c cluster-state-verifier -o start \
  --logfile=$verifier_output \
  -l "integration-test/config/log4j-info.properties" \
  --cmdline_args="-zkSvr ${zookeeper_address} -cluster test-cluster -timeout 1200000"
echo "verifier_output=$verifier_output"
verifier_result=$(grep 'Successful\|fail' $verifier_output)
cecho "$verifier_result" $red
# kill m random and restart (the restart half is currently disabled, see below)
# : <<'END'
# Mock processes are numbered 1..n across all machines. Process k runs on
# machine (k-1)/mocks_per_node with component id (k-1)%mocks_per_node + 1,
# which matches the component ids and ports used when they were launched.
n=$(( ${#MACHINE_TAB[@]} * mocks_per_node ))
m=1
# do kill m random and restart for r rounds
for r in {0..0}; do
  # Pick m distinct process numbers out of 1..n.
  to_kill=$(shuf --input-range="1-${n}" --head-count="${m}")
  # Deliberately unquoted: split the newline-separated numbers into words.
  for k in ${to_kill}; do
    j=$(( (k - 1) / mocks_per_node ))
    i=$(( (k - 1) % mocks_per_node + 1 ))
    port=$(( start_port + i ))
    cecho "kill ${MACHINE_TAB[$j]}:$port" "$red"
    ssh "${USER_TAB[$j]}@${MACHINE_TAB[$j]}" "${SCRIPT_DIR_TAB[$j]}/cm_driver.py -n ${TEST_NAME} -c mock-health-report-process -o stop --component_id=$i"
    # sleep 3
  done
  # sleep 10
  # verify cluster state after kill
  verifier_output="${VIEW_ROOT}/${LOG_DIR_FROM_ROOT}/verifier_$(date +"%y%m%d_%H%M%S").log"
  "$SCRIPT_DIR/cm_driver.py" -c cluster-state-verifier -o start --logfile="$verifier_output" -l "integration-test/config/log4j-info.properties" --cmdline_args="-zkSvr ${zookeeper_address} -cluster test-cluster -timeout 120000"
  echo "verifier_output=$verifier_output"
  verifier_result=$(grep 'Successful\|fail' "$verifier_output")
  cecho "$verifier_result" "$red"
  sleep 10
  # The restart phase is disabled: the here-document below is fed to the no-op
  # ':' command, so none of its text is executed. Remove the ':' line and the
  # closing END line to enable it again.
  : <<'END'
  for k in ${to_kill}; do
    j=$(( (k - 1) / mocks_per_node ))
    i=$(( (k - 1) % mocks_per_node + 1 ))
    port=$(( start_port + i ))
    cecho "restart ${MACHINE_TAB[$j]}:$port" "$green"
    ssh "${USER_TAB[$j]}@${MACHINE_TAB[$j]}" "${SCRIPT_DIR_TAB[$j]}/cm_driver.py -n ${TEST_NAME} -c mock-health-report-process -o start -l \"integration-test/config/log4j-info.properties\" --save_process_id --component_id=$i --cmdline_args=\"-zkSvr ${zookeeper_address} -cluster test-cluster -host ${MACHINE_TAB[$j]} -port ${port}\""
    # sleep 1
  done
  # sleep 3
  #verify cluster state after restart
  verifier_output="${VIEW_ROOT}/${LOG_DIR_FROM_ROOT}/verifier_$(date +"%y%m%d_%H%M%S").log"
  "$SCRIPT_DIR/cm_driver.py" -c cluster-state-verifier -o start --logfile="$verifier_output" -l "integration-test/config/log4j-info.properties" --cmdline_args="-zkSvr ${zookeeper_address} -cluster test-cluster -timeout 120000"
  echo "verifier_output=$verifier_output"
  verifier_result=$(grep 'Successful\|fail' "$verifier_output")
  cecho "$verifier_result" "$red"
  sleep 10
END
done
# END
# clean up
# Stop the cluster manager first, then every mock process on every machine,
# and finally report the result of the most recent verifier run.
cecho "clean up..." $green
# sleep 600
controller_stop_cmd="${SCRIPT_DIR_TAB[$controller_idx]}/cm_driver.py -n ${TEST_NAME} -c cluster-manager -o stop"
ssh ${USER_TAB[$controller_idx]}@${MACHINE_TAB[$controller_idx]} "$controller_stop_cmd"

#for j in {0..1}; do
for j in $(seq 0 $((machine_nb - 1))); do
  for i in $(seq 1 $mocks_per_node); do
    mock_stop_cmd="${SCRIPT_DIR_TAB[$j]}/cm_driver.py -n ${TEST_NAME} -c mock-health-report-process -o stop --component_id=$i"
    ssh ${USER_TAB[$j]}@${MACHINE_TAB[$j]} "$mock_stop_cmd"
  done
done
# for i in {0..2}; do
# ssh ${USER_TAB[$i]}@${MACHINE_TAB[$i]} "${SCRIPT_DIR_TAB[$i]}/cm_driver.py -n ${TEST_NAME} -c zookeeper -o stop"
# done

echo "== GREP SUCCEED =="
grep Successful $verifier_output

# report_pass_fail.inc is expected to provide all_stat, the script's exit status.
source report_pass_fail.inc
exit $all_stat