METRON-1991 Bro plugin docker scripts should exit nonzero when bro and kafka counts differ (JonZeolla via ottobackwards) closes apache/metron-bro-plugin-kafka#29

commit: 2f1edcbec745d110ff9489dbf03348e428ea4c76 [log] [tgz]
author: JonZeolla <zeolla@gmail.com> Thu Feb 28 12:29:55 2019 -0500
committer: otto <otto@apache.org> Thu Feb 28 12:29:55 2019 -0500
tree: 5a7fab02863ddaa9e7b1c3f605cdd6f872b17b3a
parent: ddfba5cd37b50984ed2223095a12cc6f9899f34c [diff]
diff --git a/docker/README.md b/docker/README.md
index 8e4d3fa..3bae93b 100644
--- a/docker/README.md
+++ b/docker/README.md

@@ -61,6 +61,7 @@
 #### Scripts executed on the host to setup and interact with the docker containers
 
 ```bash
+├── analyze_results.sh
 ├── build_container.sh
 ├── cleanup_docker.sh
 ├── create_docker_network.sh
@@ -83,6 +84,11 @@
 └── stop_container.sh
 ```
 
+- `analyze_results.sh`: Analyzes the `results.csv` files for any issues
+  ###### Parameters
+  ```bash
+  --test-directory               [REQUIRED] The directory for the tests
+  ```
 - `build_container.sh`: Runs docker build in the passed directory, and names the results
   ###### Parameters
   ```bash
@@ -191,12 +197,12 @@
   ```bash
   --data-path                    [REQUIRED] The pcap data path
   ```
-- `print_results.sh` : Prints the `results.csv` for all the pcaps processed in the given directory to console
+- `print_results.sh`: Prints the `results.csv` for all the pcaps processed in the given directory to console
   ###### Parameters
   ```bash
   --test-directory               [REQUIRED] The directory for the tests
   ```
-- `split_kafka_output_by_log.sh` : For a pcap result directory, will create a LOG.kafka.log for each LOG.log's entry in the kafka-output.log
+- `split_kafka_output_by_log.sh`: For a pcap result directory, will create a LOG.kafka.log for each LOG.log's entry in the kafka-output.log
   ###### Parameters
   ```bash
   --log-directory                [REQUIRED] The directory with the logs

diff --git a/docker/run_end_to_end.sh b/docker/run_end_to_end.sh
index 6baf679..ae06715 100755
--- a/docker/run_end_to_end.sh
+++ b/docker/run_end_to_end.sh

@@ -182,22 +182,34 @@
   echo "OFFSET------------------> ${OFFSET}"
 
   bash "${SCRIPT_DIR}"/docker_execute_process_data_file.sh --pcap-file-name="${BASE_FILE_NAME}" --output-directory-name="${DOCKER_DIRECTORY_NAME}"
-
   rc=$?; if [[ ${rc} != 0 ]]; then
     echo "ERROR> FAILED TO PROCESS ${file} DATA.  CHECK LOGS, please run the finish_end_to_end.sh when you are done."
     exit ${rc}
   fi
+
   KAFKA_OUTPUT_FILE="${TEST_OUTPUT_PATH}/${DOCKER_DIRECTORY_NAME}/kafka-output.log"
   bash "${SCRIPT_DIR}"/docker_run_consume_bro_kafka.sh --offset=$OFFSET | "${ROOT_DIR}"/remove_timeout_message.sh | tee "${KAFKA_OUTPUT_FILE}"
-
   rc=$?; if [[ ${rc} != 0 ]]; then
     echo "ERROR> FAILED TO PROCESS ${DATA_PATH} DATA.  CHECK LOGS"
   fi
 
   "${SCRIPT_DIR}"/split_kakfa_output_by_log.sh --log-directory="${TEST_OUTPUT_PATH}/${DOCKER_DIRECTORY_NAME}"
+  rc=$?; if [[ ${rc} != 0 ]]; then
+    echo "ERROR> ISSUE ENCOUNTERED WHEN SPLITTING KAFKA OUTPUT LOGS"
+  fi
 done
 
 "${SCRIPT_DIR}"/print_results.sh --test-directory="${TEST_OUTPUT_PATH}"
+rc=$?; if [[ ${rc} != 0 ]]; then
+  echo "ERROR> ISSUE ENCOUNTERED WHEN PRINTING RESULTS"
+  exit ${rc}
+fi
+
+"${SCRIPT_DIR}"/analyze_results.sh --test-directory="${TEST_OUTPUT_PATH}"
+rc=$?; if [[ ${rc} != 0 ]]; then
+  echo "ERROR> ISSUE ENCOUNTERED WHEN ANALYZING RESULTS"
+  exit ${rc}
+fi
 
 echo ""
 echo "Run complete"

diff --git a/docker/scripts/analyze_results.sh b/docker/scripts/analyze_results.sh
new file mode 100755
index 0000000..790ec18
--- /dev/null
+++ b/docker/scripts/analyze_results.sh

@@ -0,0 +1,207 @@
+#!/usr/bin/env bash
+
+#
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+shopt -s nocasematch
+#set -u # nounset disabled
+set -e # errexit
+set -E # errtrap
+set -o pipefail
+
+#
+# Analyzes the results.csv files to identify issues
+#
+
+function help {
+  # Emit the usage text, one line per printf argument; the " " entries
+  # reproduce the single-space spacer lines around the banner.
+  printf '%s\n' \
+    " " \
+    "usage: ${0}" \
+    "    --test-directory           [REQUIRED] The directory for the tests" \
+    "    -h/--help                  Usage information." \
+    " " \
+    " "
+}
+
+function _echo() {
+  # Prints "<level>> <message>" in the colour held by the variable txt<level>
+  # (txtDEFAULT when no level is given; no colour if that variable is unset).
+  #   $1 - level name; ERROR is written to stderr, everything else to stdout
+  #   $2 - message, printed verbatim
+  local color="txt${1:-DEFAULT}"
+  local line
+  # Only the colour codes go through %b; the level and message use %s so
+  # backslash sequences in caller-supplied text are not expanded.
+  printf -v line '%b%s> %s%b' "${!color}" "${1}" "${2}" "${txtDEFAULT}"
+  case "${1}" in
+    ERROR)
+      printf '%s\n' "${line}" >&2
+      ;;
+    *)
+      printf '%s\n' "${line}"
+      ;;
+  esac
+}
+
+SCRIPT_NAME=$(basename -- "$0") # shown in the start-up banner
+TEST_DIRECTORY= # filled from --test-directory; must end up non-empty
+declare -A LOGS_WITH_UNEQUAL_RESULTS # pcap folder -> space separated log names with differing counts
+declare -a LOG_NAMES # first csv column of the results file currently being processed
+declare -A OVERALL_LOG_CARDINALITY # log name -> times it was listed across all results files
+declare -A LOG_ISSUE_COUNT # log name -> times it was recorded as unequal (see print_log_comparison_insights)
+declare -r txtDEFAULT='\033[0m' # ANSI reset sequence
+# shellcheck disable=SC2034
+declare -r txtERROR='\033[0;31m' # ANSI red; read indirectly by _echo as txt<LEVEL>
+# shellcheck disable=SC2034
+declare -r txtWARN='\033[0;33m' # ANSI yellow; read indirectly by _echo as txt<LEVEL>
+
+# Handle command line options.  The loop iterates over a snapshot of "$@", so
+# no shifting is needed to advance through the arguments.
+for i in "$@"; do
+  case "${i}" in
+  #
+  # TEST_DIRECTORY
+  #
+  #   --test-directory
+  #
+    --test-directory=*)
+      TEST_DIRECTORY="${i#*=}"
+    ;;
+
+  #
+  # -h/--help
+  #
+    -h | --help)
+      help
+      exit 0
+    ;;
+
+  #
+  # Unknown option
+  #
+    *)
+      # Report the argument exactly as given; stripping up to the first "="
+      # would show only the value of an unrecognised --name=value option.
+      # Processing continues with the remaining arguments afterwards.
+      _echo ERROR "unknown option: ${i}"
+      help
+    ;;
+  esac
+done
+
+# The directory is mandatory.  The message names the option itself because
+# TEST_DIRECTORY is empty on this path and would expand to nothing.
+if [[ -z "$TEST_DIRECTORY" ]]; then
+  echo "--test-directory must be passed"
+  exit 1
+fi
+
+echo "Running ${SCRIPT_NAME} with"
+echo "TEST_DIRECTORY = $TEST_DIRECTORY"
+echo "==================================================="
+
+## Main functions
+function count_occurrences_of_each_log_file
+{
+  # Count the number of occurrences of each log name: adds one to
+  # OVERALL_LOG_CARDINALITY[name] for every entry of the global LOG_NAMES
+  # array.
+  local log_name
+  for log_name in "${LOG_NAMES[@]}"; do
+    # An empty name (e.g. from a blank csv line) is not a valid associative
+    # array subscript, so skip it instead of aborting the run
+    [[ -n "${log_name}" ]] || continue
+    # Plain assignment rather than (( ++array[key] )): arithmetic evaluation
+    # would expand the subscript a second time
+    OVERALL_LOG_CARDINALITY["${log_name}"]=$(( ${OVERALL_LOG_CARDINALITY[${log_name}]:-0} + 1 ))
+  done
+}
+
+function check_for_unequal_log_counts
+{
+  # Records, per pcap folder, every log whose second and third csv columns
+  # differ in the results file passed as $1.
+  RESULTS_FILE="${1}"
+
+  # The pcap folder is the last path component of the directory holding the
+  # results file, resolved by changing into it inside a subshell
+  PCAP_FOLDER="$( cd "$( dirname "${RESULTS_FILE}" )" >/dev/null 2>&1 && echo "${PWD##*/}")"
+
+  for LOG_NAME in "${LOG_NAMES[@]}"; do
+    # awk prints the log name for each of its rows with mismatched counts
+    UNEQUAL_LOG=$(awk -F\, -v log_name="${LOG_NAME}" '$1 == log_name && $2 != $3 {print $1}' "${RESULTS_FILE}")
+
+    # Nothing to record when the counts agree
+    [[ -n "${UNEQUAL_LOG}" ]] || continue
+
+    # Bash has no multidimensional arrays, so each folder maps to a space
+    # separated list; the separator is only emitted once an entry exists
+    LOGS_WITH_UNEQUAL_RESULTS["${PCAP_FOLDER}"]="${LOGS_WITH_UNEQUAL_RESULTS[${PCAP_FOLDER}]:+${LOGS_WITH_UNEQUAL_RESULTS[${PCAP_FOLDER}]} }${UNEQUAL_LOG}"
+  done
+}
+
+function print_unequal_results
+{
+  # Renders a two column table (pcap folder, log name) of every imbalance
+  # recorded in LOGS_WITH_UNEQUAL_RESULTS; column(1) aligns the csv rows
+  local pcap_folder log_name
+  {
+    printf '%s\n' "PCAP FOLDER,LOG NAME"
+
+    for pcap_folder in "${!LOGS_WITH_UNEQUAL_RESULTS[@]}"; do
+      # Each value is a space separated list standing in for a nested array,
+      # so it is deliberately left unquoted to split into one row per log
+      for log_name in ${LOGS_WITH_UNEQUAL_RESULTS[${pcap_folder}]}; do
+        printf '%s,%s\n' "${pcap_folder}" "${log_name}"
+      done
+    done
+  } | column -t -s ','
+}
+
+function print_log_comparison_insights
+{
+  # Warns about every log whose bro and kafka counts differed in all of the
+  # results files that list it.
+  #
+  # Reads LOGS_WITH_UNEQUAL_RESULTS and OVERALL_LOG_CARDINALITY.  The log to
+  # issue count mapping is rebuilt in a function-local LOG_ISSUE_COUNT on
+  # every call.
+  local -A LOG_ISSUE_COUNT=()
+  local -a unequal_logs=()
+  local log_name
+
+  # Split the space separated per-folder lists into single log names.  read -a
+  # splits on whitespace without glob expansion; with -d '' it consumes the
+  # whole input and returns non-zero at end of input, hence the "|| true".
+  read -r -d '' -a unequal_logs <<< "${LOGS_WITH_UNEQUAL_RESULTS[*]}" || true
+
+  # Tally how many times each log name was recorded as unequal
+  for log_name in "${unequal_logs[@]}"; do
+    LOG_ISSUE_COUNT["${log_name}"]=$(( ${LOG_ISSUE_COUNT[${log_name}]:-0} + 1 ))
+  done
+
+  # Compare each log type's instances of inequality to the total number of
+  # instances of each log.  If they are equal, this indicates that there may be
+  # a log-type related issue.
+  #
+  # For example, if count_occurrences_of_each_log_file identified that there
+  # were 10 instances of http logs across all of the `results.csv` files,
+  # ${OVERALL_LOG_CARDINALITY[http]} should equal 10. If check_for_unequal_log_counts
+  # independently found 10 instances where the http bro and kafka log counts
+  # from the `results.csv` files were not equal, ${LOG_ISSUE_COUNT[http]}
+  # would also be 10, causing us to warn the user of that insight.
+  for KEY in "${!LOG_ISSUE_COUNT[@]}"; do
+    if [[ "${LOG_ISSUE_COUNT[${KEY}]}" == "${OVERALL_LOG_CARDINALITY[${KEY}]}" ]]; then
+      _echo WARN "None of the ${KEY} log counts were the same between bro and kafka.  This may indicate an issue specific to that log."
+    fi
+  done
+}
+
+## Main
+# Move over to the test directory
+cd "${TEST_DIRECTORY}" || exit 1
+# Analyze each results file for issues.  The search starts at "." because the
+# cd above already resolved TEST_DIRECTORY (a relative path would not resolve
+# a second time), and the paths are read NUL-delimited so directory names
+# containing whitespace stay intact.
+while IFS= read -r -d '' file; do
+  # Capture the first column (the log names) of the provided file's contents in
+  # the array LOG_NAMES, excluding the header
+  mapfile -s 1 -t LOG_NAMES < <(awk -F\, '{print $1}' "${file}")
+
+  count_occurrences_of_each_log_file
+  check_for_unequal_log_counts "${file}"
+done < <(find . -name "results.csv" -print0)
+
+# Any recorded imbalance fails the run with a non-zero exit status
+if [[ "${#LOGS_WITH_UNEQUAL_RESULTS[@]}" -gt 0 ]]; then
+  _echo ERROR "UNEQUALITY FOUND IN BRO AND KAFKA LOG COUNTS"
+  echo ""
+
+  print_unequal_results
+  print_log_comparison_insights
+
+  exit 1
+fi
+

diff --git a/docker/scripts/build_container.sh b/docker/scripts/build_container.sh
index d4e5dca..40810db 100755
--- a/docker/scripts/build_container.sh
+++ b/docker/scripts/build_container.sh

@@ -22,6 +22,7 @@
 set -e # errexit
 set -E # errtrap
 set -o pipefail
+
 #
 # Runs docker build in a provided directory, with a provided name
 #
@@ -36,6 +37,7 @@
   echo " "
 }
 
+SCRIPT_NAME=$(basename -- "$0")
 CONTAINER_DIRECTORY=
 CONTAINER_NAME=
 
@@ -92,7 +94,7 @@
   exit 1
 fi
 
-echo "Running with "
+echo "Running ${SCRIPT_NAME} with"
 echo "CONTAINER_DIRECTORY = $CONTAINER_DIRECTORY"
 echo "CONTAINER_NAME = $CONTAINER_NAME"
 echo "==================================================="

diff --git a/docker/scripts/print_results.sh b/docker/scripts/print_results.sh
index ecc67ca..6e107c7 100755
--- a/docker/scripts/print_results.sh
+++ b/docker/scripts/print_results.sh

@@ -31,11 +31,12 @@
   echo " "
   echo "usage: ${0}"
   echo "    --test-directory           [REQUIRED] The directory for the tests"
-  echo "    -h/--help                   Usage information."
+  echo "    -h/--help                  Usage information."
   echo " "
   echo " "
 }
 
+SCRIPT_NAME=$(basename -- "$0")
 TEST_DIRECTORY=
 
 # Handle command line options
@@ -77,7 +78,7 @@
 fi
 
 
-echo "Running with "
+echo "Running ${SCRIPT_NAME} with"
 echo "TEST_DIRECTORY = $TEST_DIRECTORY"
 echo "==================================================="
 

diff --git a/docker/scripts/split_kakfa_output_by_log.sh b/docker/scripts/split_kakfa_output_by_log.sh
index 74d55e3..61e53e4 100755
--- a/docker/scripts/split_kakfa_output_by_log.sh
+++ b/docker/scripts/split_kakfa_output_by_log.sh

@@ -37,6 +37,7 @@
   echo " "
 }
 
+SCRIPT_NAME=$(basename -- "$0")
 LOG_DIRECTORY=
 
 # Handle command line options
@@ -77,7 +78,7 @@
   exit 1
 fi
 
-echo "Running with "
+echo "Running ${SCRIPT_NAME} with"
 echo "$LOG_DIRECTORY = $LOG_DIRECTORY"
 echo "==================================================="
commit	2f1edcbec745d110ff9489dbf03348e428ea4c76	[log] [tgz]
author	JonZeolla <zeolla@gmail.com>	Thu Feb 28 12:29:55 2019 -0500
committer	otto <otto@apache.org>	Thu Feb 28 12:29:55 2019 -0500
tree	5a7fab02863ddaa9e7b1c3f605cdd6f872b17b3a
parent	ddfba5cd37b50984ed2223095a12cc6f9899f34c [diff]