GEODE-7554: Add retry mechanism for failed tests (#4461)

* GEODE-7554: Add retry mechanism for failed tests

There are still some flaky benchmarks. Retry tests that have failed, up
to 5 times, to determine if the failure is legitimate or just a flaky
test. If the test fails 5 times in a row, we know that it is a
legitimate failure.

With each run of run_against_baseline (in the apache/geode-benchmarks
repo), the class names of all failed tests will be written to a file.
The run_benchmarks script called by CI will read the failed tests from
the file and run only those tests. Once the return code is 0, or once
we've tried 5 times, exit.

* build failedFile path dynamically

Signed-off-by: Helena A. Bales <hbales@pivotal.io>

* add print of iteration number for CI debugging of failures

Signed-off-by: Robert Houghton <rhoughton@pivotal.io>

* triggering CI again. Previous status never updated

Signed-off-by: Helena A. Bales <hbales@pivotal.io>

* refactor if statement for readability

* restarting CI

* remove hardcoded cluster tag

(cherry picked from commit 278c2470a5cac2c332d13914b935f0618b820a91)
diff --git a/ci/scripts/run_benchmarks.sh b/ci/scripts/run_benchmarks.sh
index 92f072d..ae18968 100755
--- a/ci/scripts/run_benchmarks.sh
+++ b/ci/scripts/run_benchmarks.sh
@@ -41,15 +41,51 @@
 GEODE_SHA=$(git rev-parse --verify HEAD)
 popd
 
+input= "$(pwd)/results/failedTests"
+
 pushd geode-benchmarks/infrastructure/scripts/aws/
 ./launch_cluster.sh -t ${CLUSTER_TAG} -c ${CLUSTER_COUNT} --ci
 
-if [ -z "${BASELINE_VERSION}" ]; then
-  ./run_against_baseline.sh -t ${CLUSTER_TAG} -b ${GEODE_SHA} -B ${BASELINE_BRANCH} -e ${BENCHMARKS_BRANCH} -o ${RESULTS_DIR} -m "'source':'geode-ci','benchmark_branch':'${BENCHMARK_BRANCH}','baseline_branch':'${BASELINE_BRANCH}','geode_branch':'${GEODE_SHA}'" --ci -- ${FLAGS} ${TEST_OPTIONS}
-else
-  ./run_against_baseline.sh -t ${CLUSTER_TAG} -b ${GEODE_SHA} -V ${BASELINE_VERSION} -e ${BENCHMARKS_BRANCH} -o ${RESULTS_DIR} -m "'source':'geode-ci','benchmark_branch':'${BENCHMARK_BRANCH}','baseline_version':'${BASELINE_VERSION}','geode_branch':'${GEODE_SHA}'" --ci -- ${FLAGS} ${TEST_OPTIONS}
-fi
+# test retry loop - Check if any tests have failed. If so, overwrite the TEST_OPTIONS with only the
+# failed tests. Test failures only result in an exit code of 1 when on the last iteration of loop.
+for i in {1..5}
+do
+  echo "This is ITERATION ${i} of benchmarking against baseline."
 
+  if [[ -f ${input} ]]; then
+    unset TEST_OPTIONS
+    TEST_OPTIONS=""
+    while IFS= read -r line; do
+      test=" --tests $line"
+      TEST_OPTIONS=${TEST_OPTIONS}${test}
+    done < ${input}
 
+    rm ${input}
+  fi
+
+  if [[ ${i} != 5 ]]; then
+    set +e
+  fi
+
+  if [ -z "${BASELINE_VERSION}" ]; then
+    BASELINE_OPTION="-B ${BASELINE_BRANCH}"
+    METADATA_BASELINE="'benchmark_branch':'${BASELINE_BRANCH}'"
+  else
+    BASELINE_OPTION="-V ${BASELINE_VERSION}"
+    METADATA_BASELINE="'benchmark_version':'${BASELINE_VERSION}'"
+  fi
+
+  ./run_on_cluster.sh -t ${CLUSTER_TAG} -- pkill -9 java
+  ./run_on_cluster.sh -t ${CLUSTER_TAG} -- rm /home/geode/locator10334view.dat;
+  ./run_against_baseline.sh -t ${CLUSTER_TAG} -b ${GEODE_SHA} ${BASELINE_OPTION} -e ${BENCHMARKS_BRANCH} -o ${RESULTS_DIR} -m "'source':'geode-ci',${METADATA_BASELINE},'baseline_branch':'${BASELINE_BRANCH}','geode_branch':'${GEODE_SHA}'" --ci -- ${FLAGS} ${TEST_OPTIONS}
+
+  if [[ $? -eq 0 ]]; then
+    break;
+  fi
+
+  if [[ i != 5 ]]; then
+    set -e
+  fi
+done
 
 popd