testdata/cluster/admin - impala - Git at Google

 #!/bin/bash

 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # This will create/control/destroy a local hdfs/yarn/kms/kudu cluster.
 #
 # All roles run on 127.0.0.1, just like the standard mini cluster included with hadoop.
 # The difference is with this cluster, each role runs in its own process and has its own
 # configs. For each node, the layout of the configs, logs, start/stop scripts, etc, is
 # kept as close as possible to a real cluster. For example, the first node will live in
 # the dir "cdh<version>/node-1" and its logs would be at "cdh<version>/node-1/var/log".
 # TODO: Run each node on its own IP address, e.g. 127.0.0.1, 127.0.0.2, and so on.

 # Abort on any failing command, on use of an unset variable, and on a failure in any
 # stage of a pipeline.
 set -euo pipefail
 # NOTE(review): presumably this installs the ERR trap mentioned next to the command
 # dispatch at the bottom of this script -- confirm in report_build_error.sh.
 . $IMPALA_HOME/bin/report_build_error.sh
 setup_report_build_error

 # Default both switches to empty when they are not already set, so that later reads do
 # not trip "set -u".
 : ${IMPALA_KERBERIZE=}
 : ${INCLUDE_YARN=}

 # For Hive 3, we require Yarn for Tez support.
 # The value of USE_CDP_HIVE is executed as a command, so it is expected to be "true" or
 # "false".
 if "$USE_CDP_HIVE"; then
   INCLUDE_YARN=1
 fi

 # Command line flags: -v traces commands, -k requests a kerberized cluster and -y
 # includes YARN. Any other flag prints the usage line and exits with status 1.
 while getopts vky OPT; do
   case $OPT in
     v) set -x;;
     k) export IMPALA_KERBERIZE=1;;
     y) export INCLUDE_YARN=1;;
     ?) echo "Usage: $0 [-v (verbose) -k (kerberize) -y (yarn)] ACTION (see source...)"; exit 1;;
   esac
 done
 # Drop the parsed flags so that $1 is the ACTION.
 shift $(($OPTIND-1))

 # Everything below is resolved relative to the directory containing this script.
 DIR=$(dirname $0)
 NODES_DIR="$DIR/cdh$CDH_MAJOR_VERSION"
 # Three nodes by default; five when erasure coding is enabled on HDFS.
 NODE_COUNT=3
 if [[ "$TARGET_FILESYSTEM" == "hdfs" && "$ERASURE_CODING" = true ]]; then
   NODE_COUNT=5
 fi
 NODE_PREFIX=node-
 COMMON_NODE_TEMPLATE="$DIR/node_templates/common"
 NODE_TEMPLATE="$DIR/node_templates/cdh$CDH_MAJOR_VERSION"
 # File name suffixes that mark a file as a template to be rendered by create_cluster.
 TEMPLATE_SUFFIX=".tmpl"
 PY_TEMPLATE_SUFFIX=".xml.py"

 # Each process should be marked with this so a "pkill -f" can be done to nuke everything.
 export KILL_CLUSTER_MARKER=IBelongToTheMiniCluster

 # SUPPORTED_SERVICES lists the services in start order; stop_cluster walks it in reverse.
 if [[ "$TARGET_FILESYSTEM" == "hdfs" ]]; then
   # The check above indicates that the regular mini-cluster is in use.
   SUPPORTED_SERVICES=(hdfs kms)
   if [ -n "${INCLUDE_YARN}" ]; then
     SUPPORTED_SERVICES+=(yarn)
   fi
 else
   # Either a remote distributed file system or a local non-distributed file system is
   # in use. Currently the only service that is expected to work is Kudu, though in theory
   # the other services could work after the proper configuration changes.
   SUPPORTED_SERVICES=()
 fi
 # The value of KUDU_IS_SUPPORTED is executed as a command, so it is expected to be "true"
 # or "false".
 if $KUDU_IS_SUPPORTED; then
   SUPPORTED_SERVICES+=(kudu)
 fi

 # All DataNodes and NodeManagers need a unique but fixed address. The IP is fixed at
 # 127.0.0.1, so the only difference is the port. The address must be fixed because it is
 # used as an identifier. If a node were restarted with a different address it would be
 # considered a new node. The values below are arbitrary and may conflict with existing
 # services. Fixed ports were preferred to dynamically chosen free ports for consistency.
 # create_cluster hands out one port per node from each range by post-incrementing these.
 DATANODE_FREE_PORT_START=31000
 DATANODE_FREE_HTTP_PORT_START=31010
 DATANODE_FREE_IPC_PORT_START=31020
 DATANODE_FREE_HTTPS_PORT_START=31030
 NODEMANAGER_FREE_PORT_START=31100
 NODEMANAGER_FREE_LOCALIZER_PORT_START=31120
 NODEMANAGER_FREE_WEBUI_PORT_START=31140
 KUDU_TS_RPC_FREE_PORT_START=31200
 KUDU_TS_WEBUI_FREE_PORT_START=31300

 # Used to populate config templates later. Any changes made here have no effect on an
 # existing cluster.
 # The start function also looks these up by name (<SERVICE>_WEBUI_PORT) for its banner.
 export HDFS_WEBUI_PORT=5070   # changed from 50070 so it is not ephemeral
 export YARN_WEBUI_PORT=8088   # same as default
 export KMS_WEBUI_PORT=9600    # changed to make room for non-ephemeral HBase ports
                               # (HADOOP-12811)
 export KUDU_WEBUI_PORT=8051   # same as default

 # Empty dirs that should be included in the templates. Since git ignores empty dirs it is
 # easier to maintain them here.
 # The brace expansions are flattened into one space-separated string, which is later
 # iterated unquoted.
 EMPTY_NODE_DIRS=$(echo data/dfs/{dn,nn} var/{run,lib/hadoop-hdfs,log} \
     var/{log,run}/kudu/{master,ts} var/lib/kudu/{master,ts}/{wal,data})

 EASY_ACCESS_LOG_DIR="$IMPALA_CLUSTER_LOGS_DIR"
 MINIKDC_INIT=${IMPALA_HOME}/testdata/bin/minikdc.sh

 # Probe whether this find understands -executable; fall back to a permission test if not.
 # The fallback holds two words, so users of this variable expand it unquoted on purpose.
 FIND_EXECUTABLE_FILTER="-executable"
 if ! find /dev/null "${FIND_EXECUTABLE_FILTER}" 2> /dev/null; then
   # OSX and RHEL5, among others, don't have -executable
   FIND_EXECUTABLE_FILTER="-perm +0111"
 fi

 #
 # Various initialization routines for a Kerberized cluster setup.  We
 # do this if either
 #    1) IMPALA_KERBERIZE is set
 #    2) The config files exist and use kerberos
 # If either are true, then we set IMPALA_KERBERIZE ourselves, ensure the
 # minikdc is started, source the appropriate env vars from it, and get
 # ourselves a fresh TGT (Ticket-Granting-Ticket).
 #
 function kerberize_setup {
   # Prepares the environment for a Kerberized cluster. Returns right away when neither
   # IMPALA_KERBERIZE is set nor the existing configs are kerberized. Otherwise checks
   # for kinit and the SASL GSSAPI libraries, starts the minikdc, sources its environment
   # and obtains a TGT. Exits the script with status 1 if a prerequisite is missing or
   # kinit fails.
   if is_kerberized; then
     export IMPALA_KERBERIZE=1
   fi

   # No kerberos?  We're done.
   if [ -z "${IMPALA_KERBERIZE}" ]; then
     return
   fi

   # We must have the 'kinit' binary installed so that the minikdc can
   # grant the user a ticket-granting-ticket.
   if ! type -f kinit > /dev/null 2>&1; then
     echo "Unable to locate 'kinit' on this machine, it must be installed"
     echo "in order for you to actually communicate with the kerberized"
     echo "components of the cluster."
     echo "Try this (on ubuntu):"
     echo " --> $ sudo apt-get install krb5-user"
     echo
     exit 1
   fi

   # While we're checking things, make sure the SASL GSSAPI libraries
   # are installed.  These aren't required until we go to start up
   # the impala daemons, but we might as well fail early...
   IMPALA_SASL_PATH="/usr/lib/sasl2 /usr/lib64/sasl2 /usr/local/lib/sasl2"
   IMPALA_SASL_PATH="${IMPALA_SASL_PATH} /usr/lib/x86_64-linux-gnu/sasl2"
   SASL_GSSAPI_FOUND=0
   local sasl_dir
   # IMPALA_SASL_PATH is a space-separated list, so it is split on purpose here.
   for sasl_dir in ${IMPALA_SASL_PATH}; do
     if ls "${sasl_dir}"/libgssapi* > /dev/null 2>&1; then
       SASL_GSSAPI_FOUND=1
       break
     fi
   done
   if [ "${SASL_GSSAPI_FOUND}" -eq 0 ]; then
     echo "Unable to locate the SASL GSSAPI libraries in the following path:"
     echo "${IMPALA_SASL_PATH}"
     echo "This is required for Kerberos communication using SASL, which"
     echo "the Impala daemons depend on.  To install (on Ubuntu), try:"
     echo " --> $ sudo apt-get install libsasl2-modules-gssapi-mit"
     echo
     exit 1
   fi

   # Starting it has no effect if it's already started...
   "${MINIKDC_INIT}" start

   # Source the appropriate minikdc environment variables
   . "${MINIKDC_ENV}"

   # Give ourselves a fresh TGT.  Debate here whether that should be
   # ${USER} or 'impala'.  Each has issues; see comments in buildall.sh.
   # Stick with ${USER} for now.
   # kinit is tested directly: under "set -e" a separate "$?" check would never be
   # reached, so the message below would never be printed.
   if ! kinit -k -t "${KRB5_KTNAME}" "${MINIKDC_PRINC_USER}"; then
     echo "kinit failure; aborting"
     exit 1
   fi
 }

 # Return success if the cluster is kerberized
 function is_kerberized {
   # Returns 0 only when the client core-site.xml of the first node exists and mentions
   # "Kerberos settings" (matched case-insensitively); returns 1 in every other case.
   HCONFSC="$(get_hadoop_client_conf_dir)/core-site.xml"
   [ -f "${HCONFSC}" ] || return 1
   # If the config exists and has kerberos things in it, treat as kerberized
   grep -qi "Kerberos settings" "${HCONFSC}" || return 1
   return 0
 }

 function cluster_exists {
   # The presence of the first node's directory stands in for the whole cluster.
   local first_node="$NODES_DIR/${NODE_PREFIX}1"
   [[ -e "$first_node" ]] || return 1
 }

 function create_cluster {
   # Creates the directory tree, init scripts and rendered configs for NODE_COUNT nodes
   # under NODES_DIR. Existing client configs of the first node are removed first. When
   # IMPALA_KERBERIZE is non-empty the Kerberos prerequisites are set up; otherwise the
   # minikdc is stopped. Exits with status 1 if a rendered template still contains an
   # unresolved ALL_CAPS placeholder.
   mkdir -p "$NODES_DIR"

   # Used to populate config templates later
   GROUP=$(id -gn)
   export GROUP

   # Blow away existing config files (so we don't pick up kerberos settings)
   rm -f "$(get_hadoop_client_conf_dir)"/*

   if [ -n "${IMPALA_KERBERIZE}" ]; then
     kerberize_setup
     echo "Creating Kerberized cluster."
   else
     # Stop the minikdc in case it was running
     . "${MINIKDC_ENV}"
     "${MINIKDC_INIT}" stop
   fi

   echo "Hostname for internal communication: ${INTERNAL_LISTEN_HOST}" \
        "and for external communication: ${EXTERNAL_LISTEN_HOST}"

   # Template paths are collected NUL-delimited into arrays before they are processed,
   # so paths containing whitespace survive intact and the tree is not modified while
   # find is still walking it.
   local TEMPLATE_PATHS PY_TEMPLATE_PATHS

   # For consistency, the first node will host all the master roles.
   for ((NODE_IDX=NODE_COUNT; NODE_IDX >= 1; NODE_IDX--)); do
     NODE=${NODE_PREFIX}$NODE_IDX
     NODE_DIR=$(get_node_dir "$NODE")

     echo "Creating $NODE at $NODE_DIR"

     mkdir -p "$NODE_DIR"
     cp -a "$COMMON_NODE_TEMPLATE"/* "$NODE_DIR"
     if [[ -e "$NODE_TEMPLATE" ]]; then
       cp -a "$NODE_TEMPLATE"/* "$NODE_DIR"
     fi
     if [[ $NODE_IDX -gt 1 ]]; then
       # Remove master role scripts from slave nodes
       rm -f "$NODE_DIR/etc/init.d/"{hdfs-namenode,yarn-resourcemanager} \
             "$NODE_DIR/etc/init.d/"{kms,kudu-master}
       # Only run one YARN nodemanager (more memory-efficient to scale up a
       # single NM than run several)
       rm -f "$NODE_DIR/etc/init.d/yarn-nodemanager"
     fi
     # EMPTY_NODE_DIRS is a space-separated list, so it is split on purpose here.
     for EMPTY_NODE_DIR in $EMPTY_NODE_DIRS; do
       mkdir -p "$NODE_DIR/$EMPTY_NODE_DIR"
     done

     # Add some easy access links closer to IMPALA_HOME
     EASY_ACCESS_LOG_LINK="$EASY_ACCESS_LOG_DIR/cdh$CDH_MAJOR_VERSION-$NODE"
     if [[ ! -e "$EASY_ACCESS_LOG_LINK" ]]; then
       mkdir -p "$EASY_ACCESS_LOG_DIR"
       # The link is created under the name "log" and then renamed; the rename also
       # replaces a dangling link left behind by an earlier, deleted cluster.
       ln -s "$NODE_DIR/var/log" "$EASY_ACCESS_LOG_DIR"
       mv "$IMPALA_CLUSTER_LOGS_DIR/log" "$EASY_ACCESS_LOG_LINK"
     fi

     # Template population
     DATANODE_PORT=$((DATANODE_FREE_PORT_START++))
     DATANODE_HTTP_PORT=$((DATANODE_FREE_HTTP_PORT_START++))
     DATANODE_IPC_PORT=$((DATANODE_FREE_IPC_PORT_START++))
     DATANODE_HTTPS_PORT=$((DATANODE_FREE_HTTPS_PORT_START++))
     NODEMANAGER_PORT=$((NODEMANAGER_FREE_PORT_START++))
     NODEMANAGER_LOCALIZER_PORT=$((NODEMANAGER_FREE_LOCALIZER_PORT_START++))
     NODEMANAGER_WEBUI_PORT=$((NODEMANAGER_FREE_WEBUI_PORT_START++))
     KUDU_TS_RPC_PORT=$((KUDU_TS_RPC_FREE_PORT_START++))
     KUDU_TS_WEBUI_PORT=$((KUDU_TS_WEBUI_FREE_PORT_START++))
     echo "$NODE will use ports DATANODE_PORT=$DATANODE_PORT," \
         "DATANODE_HTTP_PORT=$DATANODE_HTTP_PORT," \
         "DATANODE_IPC_PORT=$DATANODE_IPC_PORT," \
         "DATANODE_HTTPS_PORT=$DATANODE_HTTPS_PORT," \
         "NODEMANAGER_PORT=$NODEMANAGER_PORT," \
         "NODEMANAGER_LOCALIZER_PORT=$NODEMANAGER_LOCALIZER_PORT," \
         "NODEMANAGER_WEBUI_PORT=$NODEMANAGER_WEBUI_PORT," \
         "KUDU_TS_RPC_PORT=$KUDU_TS_RPC_PORT," \
         "and KUDU_TS_WEBUI_PORT=$KUDU_TS_WEBUI_PORT"

     # The templates are rendered from the environment, hence the exports.
     export NODE NODE_DIR
     export DATANODE_PORT DATANODE_HTTP_PORT DATANODE_IPC_PORT DATANODE_HTTPS_PORT
     export NODEMANAGER_PORT NODEMANAGER_LOCALIZER_PORT NODEMANAGER_WEBUI_PORT
     export KUDU_TS_RPC_PORT KUDU_TS_WEBUI_PORT

     TEMPLATE_PATHS=()
     while IFS= read -r -d '' TEMPLATE_PATH; do
       TEMPLATE_PATHS+=("$TEMPLATE_PATH")
     done < <(find "$NODE_DIR" -name "*$TEMPLATE_SUFFIX" -print0)
     # The "+" form keeps an empty array from tripping "set -u" on older bash versions.
     for TEMPLATE_PATH in ${TEMPLATE_PATHS[@]+"${TEMPLATE_PATHS[@]}"}; do
       ACTUAL_PATH="${TEMPLATE_PATH%"$TEMPLATE_SUFFIX"}"

       # Search for strings like ${FOO}, if FOO is defined in the environment then replace
       # "${FOO}" with the environment value.
       perl -wpl -e 's/\$\{([^}]+)\}/defined $ENV{$1} ? $ENV{$1} : $&/eg' \
           "$TEMPLATE_PATH" > "$ACTUAL_PATH.1"

       # Chop out everything between the BEGIN/END Kerberos comments if
       # not kerberized
       if [ -z "${IMPALA_KERBERIZE}" ]; then
         sed '/<!-- BEGIN Kerberos/,/END Kerberos settings -->/d' \
             "$ACTUAL_PATH.1" > "$ACTUAL_PATH"
       else
         cp "$ACTUAL_PATH.1" "$ACTUAL_PATH"
       fi

       # Assumes that environment variables will be ALL CAPS...
       # grep is left noisy on purpose so the offending lines are shown.
       if grep '\${[A-Z_]*}' "$ACTUAL_PATH"; then
         echo "Found undefined variables in $ACTUAL_PATH, aborting"
         exit 1
       fi

       if [[ -x "$TEMPLATE_PATH" ]]; then
         chmod u+x "$ACTUAL_PATH"
       fi
       rm "$TEMPLATE_PATH" "$ACTUAL_PATH.1"
     done

     # Substitute python-templated XML files.
     # TODO(todd): move over all the XML templates to be Python-based.
     PY_TEMPLATE_PATHS=()
     while IFS= read -r -d '' TEMPLATE_PATH; do
       PY_TEMPLATE_PATHS+=("$TEMPLATE_PATH")
     done < <(find "$NODE_DIR" -name "*$PY_TEMPLATE_SUFFIX" -print0)
     for TEMPLATE_PATH in ${PY_TEMPLATE_PATHS[@]+"${PY_TEMPLATE_PATHS[@]}"}; do
       ACTUAL_PATH="${TEMPLATE_PATH%"$PY_TEMPLATE_SUFFIX"}".xml
       "$IMPALA_HOME/bin/generate_xml_config.py" "$TEMPLATE_PATH" "$ACTUAL_PATH"
       rm "$TEMPLATE_PATH"
     done
   done
 }

 function start_cluster {
   # Starts every service in SUPPORTED_SERVICES in order, then waits ten seconds and
   # reports the cluster status. Returns 1 when the cluster has not been created or when
   # ntp-wait fails ahead of Kudu; exits with status 1 when a kerberized start is
   # requested against configs that are not kerberized.
   cluster_exists || {
     echo "The cluster must be created first"
     return 1
   }

   if [ -n "${IMPALA_KERBERIZE}" ] && ! is_kerberized; then
     echo "Kerberized start requested, but the config files aren't set up"
     echo "for kerberos.  Destroy the cluster and rebuild it:"
     echo "  --> $ ./testdata/cluster/admin delete_cluster"
     echo "  --> $ IMPALA_KERBERIZE=1 ./testdata/cluster/admin create_cluster"
     exit 1
   fi

   kerberize_setup
   if is_kerberized; then
     echo "Starting Kerberized cluster."
   fi

   # Nothing to start means nothing to verify either.
   if [ ${#SUPPORTED_SERVICES[@]} -eq 0 ]; then
     return 0
   fi

   for SERVICE in ${SUPPORTED_SERVICES[@]-}; do
     # Before Kudu is started we wait for ntp to sync, provided that:
     #
     # 1. An internet connection is available (checked by pinging $NTP_CANARY)
     # 2. ntp-wait is installed (checked by testing that it is executable)
     if [[ "${SERVICE}" == "kudu" ]]; then
       NTP_CANARY=pool.ntp.org
       if ping -c 1 -w 5 "${NTP_CANARY}" >/dev/null 2>/dev/null; then
         # ntp-wait is in /usr/sbin on both CentOS and Debian-based systems, but regular
         # users don't always have /usr/sbin in $PATH. Fall back to the absolute path.
         ntp_wait=$(command -v ntp-wait || echo /usr/sbin/ntp-wait)
         if [[ -x $ntp_wait ]]; then
           if ! $ntp_wait -v; then
             echo "ntp-wait failed; cannot start kudu"
             return 1
           fi
         else
           echo "WARNING: ntp-wait not installed; ntp sync recommended for Kudu"
         fi
       else
         echo "WARNING: cannot reach ${NTP_CANARY}; ntp sync recommended for Kudu"
       fi
     fi
     start $SERVICE
   done

   # Give the services a moment, then check that the cluster is still alive...
   sleep 10
   check_cluster_status
   return $?
 }

 function start {
   # Starts all init scripts of one service. $1 is the service name. Prints the web UI
   # address taken from <SERVICE>_WEBUI_PORT and, for hdfs, opens up the root directory
   # afterwards. Exits with status 1 unless exactly one argument is given.
   case $# in
     1) ;;
     *)
       echo "start must be called with a single argument -- the service to start." 1>&2
       exit 1;;
   esac
   local SERVICE=$1

   # The port variable is looked up by name: upper-cased service plus a fixed suffix.
   local upper_service
   upper_service=$(awk '{print toupper($0)}' <<< "$SERVICE")
   WEBUI_PORT_VAR="${upper_service}_WEBUI_PORT"
   echo "Starting ${SERVICE} (Web UI - http://localhost:${!WEBUI_PORT_VAR})"
   exec_init_script "$SERVICE*" start
   case "${SERVICE}" in
     hdfs) chmod_hdfs_root;;
   esac
 }

 function chmod_hdfs_root {
   # When created, the cluster only contains the root directory with
   # 0755 permissions and owned by hdfs.  In a kerberized
   # environment, this means that no one but hdfs can create
   # anything!  Chmod / to 01777 so that other users can create
   # things.  Insecure, but easy.  While we're here, make other sane
   # and permissive initial directories.
   # Does nothing (and returns 0) when the cluster is not kerberized.
   if is_kerberized; then
     local PREVIOUS_PRINCIPAL hdfs_dir
     # Remember who we are so the original ticket can be restored at the end.
     PREVIOUS_PRINCIPAL=$(klist | grep ^Default | awk '{print $3}')
     # Become hdfs:
     kinit -k -t "${KRB5_KTNAME}" "${MINIKDC_PRINC_HDFS}"
     # Do the chmod, etc
     hadoop fs -chmod 1777 /
     if ! hadoop fs -ls /tmp > /dev/null 2>&1; then
       for hdfs_dir in /tmp /home /user; do
         # -p keeps this from failing when only some of the directories already exist.
         hadoop fs -mkdir -p "${hdfs_dir}"
         hadoop fs -chmod 1777 "${hdfs_dir}"
       done
     fi
     # Become ourselves again.
     kinit -k -t "${KRB5_KTNAME}" "${PREVIOUS_PRINCIPAL}"
   fi
 }

 function exec_init_script {
   # Runs a command against every matching init script of every node, in parallel.
   #   $1     - glob matched against the script file names under etc/init.d
   #   $2...  - command and arguments; "start" only runs on scripts whose status check
   #            fails, "stop" only on scripts whose status check succeeds, anything else
   #            is passed through unconditionally.
   # Returns 1 if any of the background invocations failed, 0 otherwise (including when
   # no script matched).
   local SCRIPT_NAME="$1"
   shift
   local CMD="$1"

   # Collect the scripts NUL-delimited first, so paths with whitespace stay intact.
   # FIND_EXECUTABLE_FILTER may hold two words and is split on purpose.
   local SCRIPTS=() SCRIPT RUNNING
   while IFS= read -r -d '' SCRIPT; do
     SCRIPTS+=("$SCRIPT")
   done < <(find "$NODES_DIR" -path "*/$NODE_PREFIX*/etc/init.d/$SCRIPT_NAME" \
       $FIND_EXECUTABLE_FILTER -type f -print0)

   local PIDS=()
   # The "+" form keeps an empty array from tripping "set -u" on older bash versions.
   for SCRIPT in ${SCRIPTS[@]+"${SCRIPTS[@]}"}; do
     if "$SCRIPT" status &>/dev/null; then
       RUNNING=true
     else
       RUNNING=false
     fi
     case "$CMD" in
       start) if ! $RUNNING; then "$SCRIPT" start; fi;;
       stop) if $RUNNING; then "$SCRIPT" stop; fi;;
       *) "$SCRIPT" "$@";;
     esac &
     PIDS+=($!)
   done

   # Always wait for all the pids, otherwise the output gets messed up. If this function
   # were to return while the background processes are still running, they may print
   # output that makes it appear as though this process is still running.
   local RET=0 PID
   for PID in ${PIDS[@]+"${PIDS[@]}"}; do
     if ! wait "$PID"; then
       RET=1
     fi
   done
   return $RET
 }

 function check_cluster_status {
   # Asks every init script of every supported service on every node for its status.
   # Prints a one-line verdict when all roles or no roles are running, otherwise one
   # line per role that is down. Returns 0 only when nothing is down.
   cluster_exists || {
     echo "The cluster does not exist"
     return 1
   }

   ROLE_COUNT=0
   NOT_RUNNING=()
   for NODE_DIR in "$NODES_DIR/$NODE_PREFIX"*; do
     for SERVICE in ${SUPPORTED_SERVICES[@]-}; do
       for SCRIPT in $(find "$NODE_DIR" -path "*/etc/init.d/$SERVICE*" \
           $FIND_EXECUTABLE_FILTER -type f); do
         (( ROLE_COUNT += 1 ))
         "$SCRIPT" status &>/dev/null && continue
         NOT_RUNNING+=("\n$(basename $SCRIPT) is not running on $(basename $NODE_DIR)")
       done
     done
   done

   local down_count=${#NOT_RUNNING[@]}
   if [[ "$down_count" == "0" ]]; then
     echo "The cluster is running"
     return
   fi
   if [[ "$down_count" == "$ROLE_COUNT" ]]; then
     echo "The cluster is not running"
     return 1
   fi
   # Partially up: list exactly which roles are down (entries carry their own newline).
   echo -e "${NOT_RUNNING[@]}"
   return 1
 }

 function stop_cluster {
   # Stops all supported services of an existing cluster and then kills whatever is
   # still carrying the cluster marker. Always succeeds as far as pkill is concerned.
   if cluster_exists; then
     # Stop services in reverse order to give them a chance to shutdown cleanly
     SERVICE_IDX=$(( ${#SUPPORTED_SERVICES[@]} - 1 ))
     while (( SERVICE_IDX >= 0 )); do
       stop "${SUPPORTED_SERVICES[$SERVICE_IDX]}"
       SERVICE_IDX=$(( SERVICE_IDX - 1 ))
     done
   fi
   # Kill everything anyways just in case a git clean -xdf was done
   if ! pkill -u $USER -f $KILL_CLUSTER_MARKER; then
     # Nothing left to kill is not an error.
     true
   fi
 }

 function stop {
   # Stops all running init scripts of one service. $1 is the service name. Exits with
   # status 1 unless exactly one argument is given.
   case $# in
     1) ;;
     *)
       echo "stop must be called with a single argument -- the service to stop." 1>&2
       exit 1;;
   esac
   local SERVICE=$1

   echo "Stopping ${SERVICE}"
   exec_init_script "$SERVICE*" stop
 }

 function restart {
   # Stops and then starts one service. $1 is the service name. Exits with status 1
   # unless exactly one argument is given.
   case $# in
     1) ;;
     *)
       echo "restart must be called with a single argument -- the service to restart." 1>&2
       exit 1;;
   esac
   local SERVICE=$1
   stop $SERVICE
   start $SERVICE
 }

 function delete_data {
   # Empties the namenode and datanode directories of every node, removes the KMS
   # keystore and clears the Kudu data. The directories themselves are kept.
   local node_glob="$NODES_DIR/$NODE_PREFIX"
   rm -rf "$node_glob"*/data/dfs/nn/* "$node_glob"*/data/dfs/dn/*
   rm -f "$node_glob"*/data/kms.keystore
   delete_kudu_data
 }

 function delete_kudu_data {
   # Empties the Kudu master and tablet server directories of every node while keeping
   # the directories themselves.
   local kudu_glob="$NODES_DIR/$NODE_PREFIX"
   rm -rf "$kudu_glob"*/var/lib/kudu/master/* "$kudu_glob"*/var/lib/kudu/ts/*
 }

 function delete_cluster {
   # Kills every process carrying the cluster marker, then removes all node directories.
   if ! pkill -u $USER -f $KILL_CLUSTER_MARKER; then
     # Nothing to kill is fine; carry on with the removal.
     true
   fi
   rm -rf "$NODES_DIR"
 }

 function get_node_dir {
   # Prints the canonical absolute path of node $1 below NODES_DIR. IS_OSX is run as a
   # command and selects greadlink over readlink.
   local resolver=readlink
   if $IS_OSX; then
     resolver=greadlink
   fi
   "$resolver" -f "$NODES_DIR/$1"
 }

 function get_hadoop_client_conf_dir {
   # Prints the Hadoop client configuration directory of the first node.
   local first_node="${NODES_DIR}/${NODE_PREFIX}1"
   echo "${first_node}/etc/hadoop/conf"
 }

 # The first remaining argument names the function to run; everything after it is passed
 # through as that function's arguments.
 COMMAND=$1
 shift
 case $COMMAND in
   check_cluster_status | cluster_exists | is_kerberized)
     # Use an "if" to avoid triggering the ERR trap.
     # For these three a non-zero return is an answer to a query, so it is turned into a
     # plain exit status of 1.
     if ! $COMMAND "$@"; then
       exit 1
     fi;;
   *) $COMMAND "$@";;
 esac
	#!/bin/bash

	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	# This will create/control/destroy a local hdfs/yarn/kms/kudu cluster.
	#
	# All roles run on 127.0.0.1, just like the standard mini cluster included with hadoop.
	# The difference is with this cluster, each role runs in its own process and has its own
	# configs. For each node, the layout of the configs, logs, start/stop scripts, etc, is
	# kept as close as possible to a real cluster. For example, the first node will live in
	# the dir "cdh<version>/node-1" and its logs would be at "cdh<version>/node-1/var/log".
	# TODO: Run each node on its own IP address, e.g. 127.0.0.1, 127.0.0.2, and so on.

	# Abort on any failing command, on use of an unset variable, and on a failure in any
	# stage of a pipeline.
	set -euo pipefail
	# NOTE(review): presumably this installs an ERR trap for build error reporting --
	# confirm in report_build_error.sh.
	. $IMPALA_HOME/bin/report_build_error.sh
	setup_report_build_error

	# Default both switches to empty when they are not already set, so that later reads do
	# not trip "set -u".
	: ${IMPALA_KERBERIZE=}
	: ${INCLUDE_YARN=}

	# For Hive 3, we require Yarn for Tez support.
	# The value of USE_CDP_HIVE is executed as a command, so it is expected to be "true" or
	# "false".
	if "$USE_CDP_HIVE"; then
	INCLUDE_YARN=1
	fi

	# Command line flags: -v traces commands, -k requests a kerberized cluster and -y
	# includes YARN. Any other flag prints the usage line and exits with status 1.
	while getopts vky OPT; do
	case $OPT in
	v) set -x;;
	k) export IMPALA_KERBERIZE=1;;
	y) export INCLUDE_YARN=1;;
	?) echo "Usage: $0 [-v (verbose) -k (kerberize) -y (yarn)] ACTION (see source...)"; exit 1;;
	esac
	done
	# Drop the parsed flags so that $1 is the ACTION.
	shift $(($OPTIND-1))

	# Everything below is resolved relative to the directory containing this script.
	DIR=$(dirname $0)
	NODES_DIR="$DIR/cdh$CDH_MAJOR_VERSION"
	# Three nodes by default; five when erasure coding is enabled on HDFS.
	NODE_COUNT=3
	if [[ "$TARGET_FILESYSTEM" == "hdfs" && "$ERASURE_CODING" = true ]]; then
	NODE_COUNT=5
	fi
	NODE_PREFIX=node-
	COMMON_NODE_TEMPLATE="$DIR/node_templates/common"
	NODE_TEMPLATE="$DIR/node_templates/cdh$CDH_MAJOR_VERSION"
	# File name suffixes that mark a file as a template to be rendered by create_cluster.
	TEMPLATE_SUFFIX=".tmpl"
	PY_TEMPLATE_SUFFIX=".xml.py"

	# Each process should be marked with this so a "pkill -f" can be done to nuke everything.
	export KILL_CLUSTER_MARKER=IBelongToTheMiniCluster

	# SUPPORTED_SERVICES lists the services in the order in which start_cluster starts them.
	if [[ "$TARGET_FILESYSTEM" == "hdfs" ]]; then
	# The check above indicates that the regular mini-cluster is in use.
	SUPPORTED_SERVICES=(hdfs kms)
	if [ -n "${INCLUDE_YARN}" ]; then
	SUPPORTED_SERVICES+=(yarn)
	fi
	else
	# Either a remote distributed file system or a local non-distributed file system is
	# in use. Currently the only service that is expected to work is Kudu, though in theory
	# the other services could work after the proper configuration changes.
	SUPPORTED_SERVICES=()
	fi
	# The value of KUDU_IS_SUPPORTED is executed as a command, so it is expected to be "true"
	# or "false".
	if $KUDU_IS_SUPPORTED; then
	SUPPORTED_SERVICES+=(kudu)
	fi

	# All DataNodes and NodeManagers need a unique but fixed address. The IP is fixed at
	# 127.0.0.1, so the only difference is the port. The address must be fixed because it is
	# used as an identifier. If a node were restarted with a different address it would be
	# considered a new node. The values below are arbitrary and may conflict with existing
	# services. Fixed ports were preferred to dynamically chosen free ports for consistency.
	# create_cluster hands out one port per node from each range by post-incrementing these.
	DATANODE_FREE_PORT_START=31000
	DATANODE_FREE_HTTP_PORT_START=31010
	DATANODE_FREE_IPC_PORT_START=31020
	DATANODE_FREE_HTTPS_PORT_START=31030
	NODEMANAGER_FREE_PORT_START=31100
	NODEMANAGER_FREE_LOCALIZER_PORT_START=31120
	NODEMANAGER_FREE_WEBUI_PORT_START=31140
	KUDU_TS_RPC_FREE_PORT_START=31200
	KUDU_TS_WEBUI_FREE_PORT_START=31300

	# Used to populate config templates later. Any changes made here have no effect on an
	# existing cluster.
	# The start function also looks these up by name (<SERVICE>_WEBUI_PORT) for its banner.
	export HDFS_WEBUI_PORT=5070 # changed from 50070 so it is not ephemeral
	export YARN_WEBUI_PORT=8088 # same as default
	export KMS_WEBUI_PORT=9600 # changed to make room for non-ephemeral HBase ports
	# (HADOOP-12811)
	export KUDU_WEBUI_PORT=8051 # same as default

	# Empty dirs that should be included in the templates. Since git ignores empty dirs it is
	# easier to maintain them here.
	# The brace expansions are flattened into one space-separated string, which is later
	# iterated unquoted.
	EMPTY_NODE_DIRS=$(echo data/dfs/{dn,nn} var/{run,lib/hadoop-hdfs,log} \
	var/{log,run}/kudu/{master,ts} var/lib/kudu/{master,ts}/{wal,data})

	EASY_ACCESS_LOG_DIR="$IMPALA_CLUSTER_LOGS_DIR"
	MINIKDC_INIT=${IMPALA_HOME}/testdata/bin/minikdc.sh

	# Probe whether this find understands -executable; fall back to a permission test if not.
	# The fallback holds two words, so users of this variable expand it unquoted on purpose.
	FIND_EXECUTABLE_FILTER="-executable"
	if ! find /dev/null "${FIND_EXECUTABLE_FILTER}" 2> /dev/null; then
	# OSX and RHEL5, among others, don't have -executable
	FIND_EXECUTABLE_FILTER="-perm +0111"
	fi

	#
	# Various initialization routines for a Kerberized cluster setup. We
	# do this if either
	# 1) IMPALA_KERBERIZE is set
	# 2) The config files exist and use kerberos
	# If either are true, then we set IMPALA_KERBERIZE ourselves, ensure the
	# minikdc is started, source the appropriate env vars from it, and get
	# ourselves a fresh TGT (Ticket-Granting-Ticket).
	#
	function kerberize_setup {
	  # Prepares the environment for a Kerberized cluster. Returns right away when neither
	  # IMPALA_KERBERIZE is set nor the existing configs are kerberized. Otherwise checks
	  # for kinit and the SASL GSSAPI libraries, starts the minikdc, sources its environment
	  # and obtains a TGT. Exits the script with status 1 if a prerequisite is missing or
	  # kinit fails.
	  if is_kerberized; then
	    export IMPALA_KERBERIZE=1
	  fi

	  # No kerberos? We're done.
	  if [ -z "${IMPALA_KERBERIZE}" ]; then
	    return
	  fi

	  # We must have the 'kinit' binary installed so that the minikdc can
	  # grant the user a ticket-granting-ticket.
	  if ! type -f kinit > /dev/null 2>&1; then
	    echo "Unable to locate 'kinit' on this machine, it must be installed"
	    echo "in order for you to actually communicate with the kerberized"
	    echo "components of the cluster."
	    echo "Try this (on ubuntu):"
	    echo " --> $ sudo apt-get install krb5-user"
	    echo
	    exit 1
	  fi

	  # While we're checking things, make sure the SASL GSSAPI libraries
	  # are installed. These aren't required until we go to start up
	  # the impala daemons, but we might as well fail early...
	  IMPALA_SASL_PATH="/usr/lib/sasl2 /usr/lib64/sasl2 /usr/local/lib/sasl2"
	  IMPALA_SASL_PATH="${IMPALA_SASL_PATH} /usr/lib/x86_64-linux-gnu/sasl2"
	  SASL_GSSAPI_FOUND=0
	  local sasl_dir
	  # IMPALA_SASL_PATH is a space-separated list, so it is split on purpose here.
	  for sasl_dir in ${IMPALA_SASL_PATH}; do
	    if ls "${sasl_dir}"/libgssapi* > /dev/null 2>&1; then
	      SASL_GSSAPI_FOUND=1
	      break
	    fi
	  done
	  if [ "${SASL_GSSAPI_FOUND}" -eq 0 ]; then
	    echo "Unable to locate the SASL GSSAPI libraries in the following path:"
	    echo "${IMPALA_SASL_PATH}"
	    echo "This is required for Kerberos communication using SASL, which"
	    echo "the Impala daemons depend on. To install (on Ubuntu), try:"
	    echo " --> $ sudo apt-get install libsasl2-modules-gssapi-mit"
	    echo
	    exit 1
	  fi

	  # Starting it has no effect if it's already started...
	  "${MINIKDC_INIT}" start

	  # Source the appropriate minikdc environment variables
	  . "${MINIKDC_ENV}"

	  # Give ourselves a fresh TGT. Debate here whether that should be
	  # ${USER} or 'impala'. Each has issues; see comments in buildall.sh.
	  # Stick with ${USER} for now.
	  # kinit is tested directly: under "set -e" a separate "$?" check would never be
	  # reached, so the message below would never be printed.
	  if ! kinit -k -t "${KRB5_KTNAME}" "${MINIKDC_PRINC_USER}"; then
	    echo "kinit failure; aborting"
	    exit 1
	  fi
	}

	# Return success if the cluster is kerberized
	function is_kerberized {
	  # Returns 0 only when the client core-site.xml of the first node exists and mentions
	  # "Kerberos settings" (matched case-insensitively); returns 1 in every other case.
	  HCONFSC="$(get_hadoop_client_conf_dir)/core-site.xml"
	  [ -f "${HCONFSC}" ] || return 1
	  # If the config exists and has kerberos things in it, treat as kerberized
	  grep -qi "Kerberos settings" "${HCONFSC}" || return 1
	  return 0
	}

	function cluster_exists {
	  # The presence of the first node's directory stands in for the whole cluster.
	  local first_node="$NODES_DIR/${NODE_PREFIX}1"
	  [[ -e "$first_node" ]] || return 1
	}

	function create_cluster {
	  # Creates the directory tree, init scripts and rendered configs for NODE_COUNT nodes
	  # under NODES_DIR. Existing client configs of the first node are removed first. When
	  # IMPALA_KERBERIZE is non-empty the Kerberos prerequisites are set up; otherwise the
	  # minikdc is stopped. Exits with status 1 if a rendered template still contains an
	  # unresolved ALL_CAPS placeholder.
	  mkdir -p "$NODES_DIR"

	  # Used to populate config templates later
	  GROUP=$(id -gn)
	  export GROUP

	  # Blow away existing config files (so we don't pick up kerberos settings)
	  rm -f "$(get_hadoop_client_conf_dir)"/*

	  if [ -n "${IMPALA_KERBERIZE}" ]; then
	    kerberize_setup
	    echo "Creating Kerberized cluster."
	  else
	    # Stop the minikdc in case it was running
	    . "${MINIKDC_ENV}"
	    "${MINIKDC_INIT}" stop
	  fi

	  echo "Hostname for internal communication: ${INTERNAL_LISTEN_HOST}" \
	       "and for external communication: ${EXTERNAL_LISTEN_HOST}"

	  # Template paths are collected NUL-delimited into arrays before they are processed,
	  # so paths containing whitespace survive intact and the tree is not modified while
	  # find is still walking it.
	  local TEMPLATE_PATHS PY_TEMPLATE_PATHS

	  # For consistency, the first node will host all the master roles.
	  for ((NODE_IDX=NODE_COUNT; NODE_IDX >= 1; NODE_IDX--)); do
	    NODE=${NODE_PREFIX}$NODE_IDX
	    NODE_DIR=$(get_node_dir "$NODE")

	    echo "Creating $NODE at $NODE_DIR"

	    mkdir -p "$NODE_DIR"
	    cp -a "$COMMON_NODE_TEMPLATE"/* "$NODE_DIR"
	    if [[ -e "$NODE_TEMPLATE" ]]; then
	      cp -a "$NODE_TEMPLATE"/* "$NODE_DIR"
	    fi
	    if [[ $NODE_IDX -gt 1 ]]; then
	      # Remove master role scripts from slave nodes
	      rm -f "$NODE_DIR/etc/init.d/"{hdfs-namenode,yarn-resourcemanager} \
	            "$NODE_DIR/etc/init.d/"{kms,kudu-master}
	      # Only run one YARN nodemanager (more memory-efficient to scale up a
	      # single NM than run several)
	      rm -f "$NODE_DIR/etc/init.d/yarn-nodemanager"
	    fi
	    # EMPTY_NODE_DIRS is a space-separated list, so it is split on purpose here.
	    for EMPTY_NODE_DIR in $EMPTY_NODE_DIRS; do
	      mkdir -p "$NODE_DIR/$EMPTY_NODE_DIR"
	    done

	    # Add some easy access links closer to IMPALA_HOME
	    EASY_ACCESS_LOG_LINK="$EASY_ACCESS_LOG_DIR/cdh$CDH_MAJOR_VERSION-$NODE"
	    if [[ ! -e "$EASY_ACCESS_LOG_LINK" ]]; then
	      mkdir -p "$EASY_ACCESS_LOG_DIR"
	      # The link is created under the name "log" and then renamed; the rename also
	      # replaces a dangling link left behind by an earlier, deleted cluster.
	      ln -s "$NODE_DIR/var/log" "$EASY_ACCESS_LOG_DIR"
	      mv "$IMPALA_CLUSTER_LOGS_DIR/log" "$EASY_ACCESS_LOG_LINK"
	    fi

	    # Template population
	    DATANODE_PORT=$((DATANODE_FREE_PORT_START++))
	    DATANODE_HTTP_PORT=$((DATANODE_FREE_HTTP_PORT_START++))
	    DATANODE_IPC_PORT=$((DATANODE_FREE_IPC_PORT_START++))
	    DATANODE_HTTPS_PORT=$((DATANODE_FREE_HTTPS_PORT_START++))
	    NODEMANAGER_PORT=$((NODEMANAGER_FREE_PORT_START++))
	    NODEMANAGER_LOCALIZER_PORT=$((NODEMANAGER_FREE_LOCALIZER_PORT_START++))
	    NODEMANAGER_WEBUI_PORT=$((NODEMANAGER_FREE_WEBUI_PORT_START++))
	    KUDU_TS_RPC_PORT=$((KUDU_TS_RPC_FREE_PORT_START++))
	    KUDU_TS_WEBUI_PORT=$((KUDU_TS_WEBUI_FREE_PORT_START++))
	    echo "$NODE will use ports DATANODE_PORT=$DATANODE_PORT," \
	        "DATANODE_HTTP_PORT=$DATANODE_HTTP_PORT," \
	        "DATANODE_IPC_PORT=$DATANODE_IPC_PORT," \
	        "DATANODE_HTTPS_PORT=$DATANODE_HTTPS_PORT," \
	        "NODEMANAGER_PORT=$NODEMANAGER_PORT," \
	        "NODEMANAGER_LOCALIZER_PORT=$NODEMANAGER_LOCALIZER_PORT," \
	        "NODEMANAGER_WEBUI_PORT=$NODEMANAGER_WEBUI_PORT," \
	        "KUDU_TS_RPC_PORT=$KUDU_TS_RPC_PORT," \
	        "and KUDU_TS_WEBUI_PORT=$KUDU_TS_WEBUI_PORT"

	    # The templates are rendered from the environment, hence the exports.
	    export NODE NODE_DIR
	    export DATANODE_PORT DATANODE_HTTP_PORT DATANODE_IPC_PORT DATANODE_HTTPS_PORT
	    export NODEMANAGER_PORT NODEMANAGER_LOCALIZER_PORT NODEMANAGER_WEBUI_PORT
	    export KUDU_TS_RPC_PORT KUDU_TS_WEBUI_PORT

	    TEMPLATE_PATHS=()
	    while IFS= read -r -d '' TEMPLATE_PATH; do
	      TEMPLATE_PATHS+=("$TEMPLATE_PATH")
	    done < <(find "$NODE_DIR" -name "*$TEMPLATE_SUFFIX" -print0)
	    # The "+" form keeps an empty array from tripping "set -u" on older bash versions.
	    for TEMPLATE_PATH in ${TEMPLATE_PATHS[@]+"${TEMPLATE_PATHS[@]}"}; do
	      ACTUAL_PATH="${TEMPLATE_PATH%"$TEMPLATE_SUFFIX"}"

	      # Search for strings like ${FOO}, if FOO is defined in the environment then replace
	      # "${FOO}" with the environment value.
	      perl -wpl -e 's/\$\{([^}]+)\}/defined $ENV{$1} ? $ENV{$1} : $&/eg' \
	          "$TEMPLATE_PATH" > "$ACTUAL_PATH.1"

	      # Chop out everything between the BEGIN/END Kerberos comments if
	      # not kerberized
	      if [ -z "${IMPALA_KERBERIZE}" ]; then
	        sed '/<!-- BEGIN Kerberos/,/END Kerberos settings -->/d' \
	            "$ACTUAL_PATH.1" > "$ACTUAL_PATH"
	      else
	        cp "$ACTUAL_PATH.1" "$ACTUAL_PATH"
	      fi

	      # Assumes that environment variables will be ALL CAPS...
	      # grep is left noisy on purpose so the offending lines are shown.
	      if grep '\${[A-Z_]*}' "$ACTUAL_PATH"; then
	        echo "Found undefined variables in $ACTUAL_PATH, aborting"
	        exit 1
	      fi

	      if [[ -x "$TEMPLATE_PATH" ]]; then
	        chmod u+x "$ACTUAL_PATH"
	      fi
	      rm "$TEMPLATE_PATH" "$ACTUAL_PATH.1"
	    done

	    # Substitute python-templated XML files.
	    # TODO(todd): move over all the XML templates to be Python-based.
	    PY_TEMPLATE_PATHS=()
	    while IFS= read -r -d '' TEMPLATE_PATH; do
	      PY_TEMPLATE_PATHS+=("$TEMPLATE_PATH")
	    done < <(find "$NODE_DIR" -name "*$PY_TEMPLATE_SUFFIX" -print0)
	    for TEMPLATE_PATH in ${PY_TEMPLATE_PATHS[@]+"${PY_TEMPLATE_PATHS[@]}"}; do
	      ACTUAL_PATH="${TEMPLATE_PATH%"$PY_TEMPLATE_SUFFIX"}".xml
	      "$IMPALA_HOME/bin/generate_xml_config.py" "$TEMPLATE_PATH" "$ACTUAL_PATH"
	      rm "$TEMPLATE_PATH"
	    done
	  done
	}

	function start_cluster {
	  # Starts every service in SUPPORTED_SERVICES in order, then waits ten seconds and
	  # reports the cluster status. Returns 1 when the cluster has not been created or when
	  # ntp-wait fails ahead of Kudu; exits with status 1 when a kerberized start is
	  # requested against configs that are not kerberized.
	  if ! cluster_exists; then
	    echo "The cluster must be created first"
	    return 1
	  fi

	  if [ -n "${IMPALA_KERBERIZE}" ] && ! is_kerberized; then
	    echo "Kerberized start requested, but the config files aren't set up"
	    echo "for kerberos. Destroy the cluster and rebuild it:"
	    echo " --> $ ./testdata/cluster/admin delete_cluster"
	    echo " --> $ IMPALA_KERBERIZE=1 ./testdata/cluster/admin create_cluster"
	    exit 1
	  fi

	  kerberize_setup
	  if is_kerberized; then
	    echo "Starting Kerberized cluster."
	  fi

	  if [ ${#SUPPORTED_SERVICES[@]} -gt 0 ]; then
	    for SERVICE in ${SUPPORTED_SERVICES[@]-}; do
	      # If all of the following are true, we wait for ntp to sync before proceeding:
	      #
	      # 1. Kudu is being started
	      # 2. An internet connection is available (checked by pinging $NTP_CANARY)
	      # 3. ntp-wait is installed (checked by testing that it is executable)
	      if [[ "${SERVICE}" == "kudu" ]]; then
	        NTP_CANARY=pool.ntp.org
	        if ! ping -c 1 -w 5 "${NTP_CANARY}" >/dev/null 2>/dev/null; then
	          echo "WARNING: cannot reach ${NTP_CANARY}; ntp sync recommended for Kudu"
	        else
	          # ntp-wait is in /usr/sbin on both CentOS and Debian-based systems, but regular
	          # users don't always have /usr/sbin in $PATH. Fall back to the absolute path.
	          # The "||" must be a real shell operator here, not an escaped literal.
	          ntp_wait=$(command -v ntp-wait || echo /usr/sbin/ntp-wait)
	          if [[ ! -x $ntp_wait ]]; then
	            echo "WARNING: ntp-wait not installed; ntp sync recommended for Kudu"
	          elif ! "$ntp_wait" -v; then
	            echo "ntp-wait failed; cannot start kudu"
	            return 1
	          fi
	        fi
	      fi
	      start "$SERVICE"
	    done

	    # If there any services to start, check that the cluster is still alive...
	    sleep 10
	    check_cluster_status
	  fi
	  return $?
	}

	function start {
	  # Starts all init scripts whose name begins with the given service name.
	  #
	  # $1 - the service to start (e.g. "hdfs"). The web UI port that is printed is read
	  #      from the variable named <SERVICE in upper case>_WEBUI_PORT.
	  # Exits 1 when not called with exactly one argument.
	  if [[ $# -ne 1 ]]; then
	    echo start must be called with a single argument -- the service to start. 1>&2
	    exit 1
	  fi
	  local SERVICE=$1

	  # Upper-case the service name through awk; the result names the port variable that
	  # is dereferenced indirectly below.
	  WEBUI_PORT_VAR="$(echo "$SERVICE" | awk '{print toupper($0)}')_WEBUI_PORT"
	  echo "Starting ${SERVICE} (Web UI - http://localhost:${!WEBUI_PORT_VAR})"
	  exec_init_script "$SERVICE*" start
	  if [[ "${SERVICE}" == "hdfs" ]]; then
	    chmod_hdfs_root
	  fi
	}

	function chmod_hdfs_root {
	  # When created, the cluster only contains the root directory with
	  # 0755 permissions and owned by hdfs. In a kerberized
	  # environment, this means that no one but hdfs can create
	  # anything! Chmod / to 01777 so that other users can create
	  # things. Insecure, but easy. While we're here, make other sane
	  # and permissive initial directories.
	  #
	  # Does nothing when the cluster is not kerberized.
	  if is_kerberized; then
	    # Remember who we are: the third field of klist's "Default principal: <name>" line.
	    PREVIOUS_PRINCIPAL=$(klist | grep ^Default | awk '{print $3}')
	    # Become hdfs:
	    kinit -k -t "${KRB5_KTNAME}" "${MINIKDC_PRINC_HDFS}"
	    # Do the chmod, etc
	    hadoop fs -chmod 1777 /
	    # /tmp doubles as the marker for "initial directories already created".
	    if ! hadoop fs -ls /tmp > /dev/null 2>&1; then
	      for i in /tmp /home /user; do
	        hadoop fs -mkdir "$i"
	        hadoop fs -chmod 1777 "$i"
	      done
	    fi
	    # Become ourselves again.
	    kinit -k -t "${KRB5_KTNAME}" "${PREVIOUS_PRINCIPAL}"
	  fi
	}

	function exec_init_script {
	  # Runs a command against every matching init script of every node, in parallel.
	  #
	  # $1 - name (may be a glob such as "hdfs*") of the init scripts to run; matched
	  #      against the files under <node dir>/etc/init.d of each node in $NODES_DIR.
	  # $2 - the command. "start" is only sent to scripts whose "status" reports failure,
	  #      "stop" only to scripts whose "status" reports success; any other command is
	  #      forwarded to the script together with all remaining arguments.
	  # Returns 1 if any of the per-script background jobs failed, 0 otherwise.
	  local SCRIPT_NAME="$1"
	  shift
	  local CMD="$1"

	  local PIDS=()
	  # -path matches against the whole path that find prints, so the pattern needs a
	  # leading "*" for everything above the node dir and a "*" for the node number.
	  # $FIND_EXECUTABLE_FILTER is deliberately unquoted: it may hold several find args.
	  for SCRIPT in $(find "$NODES_DIR" -path "*/$NODE_PREFIX*/etc/init.d/$SCRIPT_NAME" \
	      $FIND_EXECUTABLE_FILTER -type f); do
	    if "$SCRIPT" status &>/dev/null; then
	      RUNNING=true
	    else
	      RUNNING=false
	    fi
	    case "$CMD" in
	      start) if ! $RUNNING; then "$SCRIPT" start; fi;;
	      stop) if $RUNNING; then "$SCRIPT" stop; fi;;
	      *) "$SCRIPT" "$@";;
	    esac &
	    PIDS+=($!)
	  done

	  # Always wait for all the pids, otherwise the output gets messed up. If this function
	  # were to return while the background processes are still running, they may print
	  # output that makes it appear as though this process is still running.
	  local RET=0
	  # The ${arr[@]+...} form keeps an empty PIDS array from tripping "set -u" on
	  # bash versions that treat an empty array expansion as an unbound variable.
	  for PID in ${PIDS[@]+"${PIDS[@]}"}; do
	    if ! wait "$PID"; then
	      RET=1
	    fi
	  done
	  return $RET
	}

	function check_cluster_status {
	  # Reports whether the roles of every supported service are running on every node.
	  #
	  # Prints "The cluster is running" and returns 0 when no role reports a failing
	  # status; prints "The cluster is not running" and returns 1 when all of them do;
	  # otherwise prints one line per role that is down and returns 1. Also returns 1
	  # when the cluster does not exist.
	  if ! cluster_exists; then
	    echo "The cluster does not exist"
	    return 1
	  fi

	  ROLE_COUNT=0
	  NOT_RUNNING=()
	  for NODE_DIR in "$NODES_DIR/$NODE_PREFIX"*; do
	    for SERVICE in ${SUPPORTED_SERVICES[@]-}; do
	      # -path matches against the whole path, hence the leading "*"; the trailing "*"
	      # picks up every role script of the service. $FIND_EXECUTABLE_FILTER is
	      # deliberately unquoted: it may hold several find arguments.
	      for SCRIPT in $(find "$NODE_DIR" -path "*/etc/init.d/$SERVICE*" \
	          $FIND_EXECUTABLE_FILTER -type f); do
	        ROLE_COUNT=$((ROLE_COUNT + 1))
	        if ! "$SCRIPT" status &>/dev/null; then
	          NOT_RUNNING+=("\n$(basename "$SCRIPT") is not running on $(basename "$NODE_DIR")")
	        fi
	      done
	    done
	  done

	  case ${#NOT_RUNNING[@]} in
	    0) echo "The cluster is running"; return;;
	    $ROLE_COUNT) echo "The cluster is not running"; return 1;;
	    # The entries carry a literal "\n" prefix that echo -e turns into line breaks.
	    *) echo -e "${NOT_RUNNING[@]}"; return 1;;
	  esac
	}

	function stop_cluster {
	  # Stops every service in SUPPORTED_SERVICES and then kills any process of $USER
	  # whose command line matches $KILL_CLUSTER_MARKER.
	  if cluster_exists; then
	    # Stop services in reverse order to give them a chance to shutdown cleanly
	    for ((SERVICE_IDX=${#SUPPORTED_SERVICES[@]} - 1; SERVICE_IDX >= 0; SERVICE_IDX--)); do
	      stop "${SUPPORTED_SERVICES[$SERVICE_IDX]}"
	    done
	  fi
	  # Kill everything anyways just in case a git clean -xdf was done. pkill returns
	  # non-zero when nothing matched, which is fine here, hence the "|| true".
	  pkill -u "$USER" -f "$KILL_CLUSTER_MARKER" || true
	}

	function stop {
	  # Stops all init scripts whose name begins with the given service name.
	  #
	  # $1 - the service to stop. Exits 1 unless exactly one argument is given.
	  if (( $# != 1 )); then
	    echo "stop must be called with a single argument -- the service to stop." >&2
	    exit 1
	  fi
	  local svc="$1"

	  echo "Stopping $svc"
	  exec_init_script "${svc}*" stop
	}

	function restart {
	  # Stops and then starts a single service.
	  #
	  # $1 - the service to restart. Exits 1 unless exactly one argument is given.
	  if [[ $# -ne 1 ]]; then
	    echo restart must be called with a single argument -- the service to restart. 1>&2
	    exit 1
	  fi
	  local SERVICE=$1
	  # Quoted so the name reaches stop/start as exactly one word, without word
	  # splitting or pathname expansion against the current directory.
	  stop "$SERVICE"
	  start "$SERVICE"
	}

	function delete_data {
	  # Delete namenode, datanode and KMS data while preserving directory structure.
	  # The "*" after the node prefix covers every node dir; the trailing "/*" removes the
	  # contents of nn/ and dn/ but leaves the directories themselves in place.
	  # ${NODES_DIR:?} aborts instead of expanding to a path rooted at "/" should
	  # NODES_DIR ever be empty.
	  rm -rf "${NODES_DIR:?}/$NODE_PREFIX"*/data/dfs/{nn,dn}/*
	  rm -f "${NODES_DIR:?}/$NODE_PREFIX"*/data/kms.keystore
	  delete_kudu_data
	}

	function delete_kudu_data {
	  # Delete the Kudu master and tablet server data of every node. The trailing "/*"
	  # empties the master/ and ts/ directories but keeps the directories themselves.
	  # ${NODES_DIR:?} aborts instead of expanding to a path rooted at "/" should
	  # NODES_DIR ever be empty.
	  rm -rf "${NODES_DIR:?}/$NODE_PREFIX"*/var/lib/kudu/{master,ts}/*
	}

	function delete_cluster {
	  # Kills any process of $USER whose command line matches $KILL_CLUSTER_MARKER, then
	  # removes the whole $NODES_DIR tree.
	  # pkill returns non-zero when nothing matched; that must not abort the delete.
	  pkill -u "$USER" -f "$KILL_CLUSTER_MARKER" || true
	  # ${NODES_DIR:?} aborts rather than running "rm -rf" on an empty path.
	  rm -rf "${NODES_DIR:?}"
	}

	function get_node_dir {
	  # Prints the canonical path of the node dir named by $1 (relative to $NODES_DIR).
	  # greadlink is used instead of readlink when $IS_OSX evaluates to true.
	  local resolver=readlink
	  if $IS_OSX; then
	    resolver=greadlink
	  fi
	  "$resolver" -f "$NODES_DIR/$1"
	}

	function get_hadoop_client_conf_dir {
	  # Prints the hadoop conf dir of the first node (the node dir with suffix "1").
	  local first_node="${NODES_DIR}/${NODE_PREFIX}1"
	  echo "${first_node}/etc/hadoop/conf"
	}

	# Dispatch: the first positional argument names the function to run and the
	# remaining arguments are passed through to it.
	COMMAND=$1
	shift
	case $COMMAND in
	  # These functions report their answer through their return status, so a non-zero
	  # status is an expected outcome rather than an error.
	  check_cluster_status | cluster_exists | is_kerberized)
	    # Use an "if" to avoid triggering the ERR trap.
	    if ! $COMMAND "$@"; then
	      exit 1
	    fi;;
	  *) $COMMAND "$@";;
	esac