#!/usr/bin/env bash
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

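# End-to-end test for Flink's HA setup: runs the general purpose DataStream job on a
# standalone session cluster and repeatedly kills JobManagers and TaskManagers to
# verify that the job recovers and keeps completing checkpoints.
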
source "$(dirname "$0")"/common.sh

TEST_PROGRAM_JAR=${END_TO_END_DIR}/flink-datastream-allround-test/target/DataStreamAllroundTestProgram.jar

JM_WATCHDOG_PID=0
TM_WATCHDOG_PID=0

# flag indicating whether we have already cleaned up after a test
CLEARED=0

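# Stops the JM and TM watchdog processes (at most once per test run); called at the
# end of a test and from the INT/EXIT traps below.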
function stop_cluster_and_watchdog() {
    if [ ${CLEARED} -eq 0 ]; then

        if ! [ ${JM_WATCHDOG_PID} -eq 0 ]; then
            echo "Killing JM watchdog @ ${JM_WATCHDOG_PID}"
            kill ${JM_WATCHDOG_PID} 2> /dev/null
            wait ${JM_WATCHDOG_PID} 2> /dev/null
        fi

        if ! [ ${TM_WATCHDOG_PID} -eq 0 ]; then
            echo "Killing TM watchdog @ ${TM_WATCHDOG_PID}"
            kill ${TM_WATCHDOG_PID} 2> /dev/null
            wait ${TM_WATCHDOG_PID} 2> /dev/null
        fi

        CLEARED=1
    fi
}

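# verify_logs <expected number of JM failures>
# Inspects the logs under $FLINK_DIR/log and fails the test if the job printed alerts,
# if a stand-by JM did not take over, or if a JM did not complete any checkpoints.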
function verify_logs() {
    local OUTPUT=$FLINK_DIR/log/*.out
    local JM_FAILURES=$1
    local EXIT_CODE=0

    # verify that we have no alerts
    if ! [ `cat ${OUTPUT} | wc -l` -eq 0 ]; then
        echo "FAILURE: Alerts found in the output of the general purpose DataStream job."
        EXIT_CODE=1
    fi

    # check that all JMs apart from the first one recovered the submitted job graph
    if ! [ `grep -r --include '*standalonesession*.log' 'Recovered SubmittedJobGraph' "${FLINK_DIR}/log/" | cut -d ":" -f 1 | uniq | wc -l` -eq ${JM_FAILURES} ]; then
        echo "FAILURE: A JM did not take over."
        EXIT_CODE=1
    fi

    # check that every JM (the initial one plus each one that took over) logged completed checkpoints
    if ! [ `grep -r --include '*standalonesession*.log' 'Completed checkpoint' "${FLINK_DIR}/log/" | cut -d ":" -f 1 | uniq | wc -l` -eq $((JM_FAILURES + 1)) ]; then
        echo "FAILURE: A JM did not execute the job."
        EXIT_CODE=1
    fi

    if [[ $EXIT_CODE != 0 ]]; then
        echo "One or more tests FAILED."
        exit $EXIT_CODE
    fi
}

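# jm_watchdog <expected number of JMs> <port passed to jobmanager.sh>
# Polls every 5 seconds and starts new JobManagers until the expected number of
# StandaloneSessionClusterEntrypoint processes is running again.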
function jm_watchdog() {
    local EXPECTED_JMS=$1
    local IP_PORT=$2

    while true; do
        local RUNNING_JMS=`jps | grep 'StandaloneSessionClusterEntrypoint' | wc -l`;
        local MISSING_JMS=$((EXPECTED_JMS-RUNNING_JMS))
        for (( c=0; c<MISSING_JMS; c++ )); do
            "$FLINK_DIR"/bin/jobmanager.sh start "localhost" ${IP_PORT}
        done
        sleep 5;
    done
}

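# Sends SIGKILL to the first JobManager process reported by jps; the JM watchdog
# above will eventually start a replacement.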
function kill_jm() {
    local JM_PIDS=`jps | grep 'StandaloneSessionClusterEntrypoint' | cut -d " " -f 1`
    local JM_PIDS=(${JM_PIDS[@]})
    local PID=${JM_PIDS[0]}
    kill -9 ${PID}

    echo "Killed JM @ ${PID}"
}

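# tm_watchdog <job id> <expected number of TMs>
# Whenever a new checkpoint has completed, starts a replacement TaskManager and kills
# an existing one, so the job repeatedly recovers from TM failures while the number
# of TaskManagers stays stable.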
function tm_watchdog() {
    local JOB_ID=$1
    local EXPECTED_TMS=$2

    # the number of already seen successful checkpoints
    local SUCCESSFUL_CHCKP=0

    while true; do

        # check how many checkpoints have completed so far and only kill a TM
        # if new checkpoints have completed since the previous kill

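        # note: the following line extracts the completed checkpoint count from the JSON
        # returned by the REST API with cut/sed; this simple parsing assumes the field
        # order of the /jobs/<job id>/checkpoints response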
        local CHECKPOINTS=`curl -s "http://localhost:8081/jobs/${JOB_ID}/checkpoints" | cut -d ":" -f 6 | sed 's/,.*//'`

        if ! [[ ${CHECKPOINTS} =~ ^[0-9]+$ ]]; then

            # no usable checkpoint count yet; this may be the case during leader election.
            # in this case we retry later with a shorter interval
            sleep 5; continue

        elif [ "${CHECKPOINTS}" -ne "${SUCCESSFUL_CHCKP}" ]; then

            # we check for a change rather than an increase (>) because when the JM
            # goes down, the restarted job starts again from 0 completed checkpoints

            local RUNNING_TMS=`jps | grep 'TaskManager' | wc -l`
            local TM_PIDS=`jps | grep 'TaskManager' | cut -d " " -f 1`

            local MISSING_TMS=$((EXPECTED_TMS-RUNNING_TMS))
            if [ ${MISSING_TMS} -eq 0 ]; then
                # start a new TM only if we have exactly the expected number
                "$FLINK_DIR"/bin/taskmanager.sh start > /dev/null
            fi

            # kill an existing one
            local TM_PIDS=(${TM_PIDS[@]})
            local PID=${TM_PIDS[0]}
            kill -9 ${PID}

            echo "Killed TM @ ${PID}"

            SUCCESSFUL_CHCKP=${CHECKPOINTS}
        fi

        sleep 11;
    done
}

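# run_ha_test <parallelism> <state backend> <async snapshots> <incremental snapshots>
# Starts an HA cluster, submits the general purpose DataStream job, lets the watchdogs
# repeatedly kill and replace JMs and TMs, and finally verifies the logs.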
function run_ha_test() {
    local PARALLELISM=$1
    local BACKEND=$2
    local ASYNC=$3
    local INCREM=$4

    local JM_KILLS=3
    local CHECKPOINT_DIR="${TEST_DATA_DIR}/checkpoints/"

    CLEARED=0

    # start the cluster in HA mode
    start_ha_cluster

    echo "Running in HA mode: parallelism=${PARALLELISM}, backend=${BACKEND}, asyncSnapshots=${ASYNC}, and incremSnapshots=${INCREM}."

    # submit a job in detached mode and let it run
    local JOB_ID=$($FLINK_DIR/bin/flink run -d -p ${PARALLELISM} \
        $TEST_PROGRAM_JAR \
        --environment.parallelism ${PARALLELISM} \
        --test.semantics exactly-once \
        --test.simulate_failure true \
        --test.simulate_failure.num_records 200 \
        --test.simulate_failure.num_checkpoints 1 \
        --test.simulate_failure.max_failures 20 \
        --state_backend ${BACKEND} \
        --state_backend.checkpoint_directory "file://${CHECKPOINT_DIR}" \
        --state_backend.file.async ${ASYNC} \
        --state_backend.rocks.incremental ${INCREM} \
        --sequence_generator_source.sleep_time 15 \
        --sequence_generator_source.sleep_after_elements 1 \
        | grep "Job has been submitted with JobID" | sed 's/.* //g')

    wait_job_running ${JOB_ID}

    # start the watchdog that keeps the number of JMs stable
    jm_watchdog 1 "8081" &
    JM_WATCHDOG_PID=$!
    echo "Running JM watchdog @ ${JM_WATCHDOG_PID}"

    sleep 5

    # start the watchdog that keeps the number of TMs stable
    tm_watchdog ${JOB_ID} 1 &
    TM_WATCHDOG_PID=$!
    echo "Running TM watchdog @ ${TM_WATCHDOG_PID}"

    # let the job run for a while to take some checkpoints
    sleep 20

    for (( c=0; c<${JM_KILLS}; c++ )); do
        # kill the JM and wait for the watchdog to
        # create a new one which will take over
        kill_jm
        sleep 60
    done

    verify_logs ${JM_KILLS}

    # stop the watchdogs
    stop_cluster_and_watchdog
}

trap stop_cluster_and_watchdog INT
trap stop_cluster_and_watchdog EXIT

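# Optional positional arguments (defaults below): state backend type, async snapshots
# for the file backend, and incremental snapshots for RocksDB. Example invocation
# (assuming "rocks" is a backend identifier understood by the test program):
#   <this script> rocks true true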
STATE_BACKEND_TYPE=${1:-file}
STATE_BACKEND_FILE_ASYNC=${2:-true}
STATE_BACKEND_ROCKS_INCREMENTAL=${3:-false}

run_ha_test 4 ${STATE_BACKEND_TYPE} ${STATE_BACKEND_FILE_ASYNC} ${STATE_BACKEND_ROCKS_INCREMENTAL}