docker/entrypoint.sh - impala - Git at Google

 #!/bin/bash
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 # Entrypoint code for test-with-docker.py containers. test-with-docker.py
 # will create Docker containers with this script as the entrypoint,
 # with a variety of arguments. See test-with-docker.py for a more
 # general overview.
 #
 # This assumes that the following are already mounted inside
 # the container:
 #   /etc/localtime                      -> /mnt/localtime
 #     Helps timestamps be in the time zone of the host
 #   $IMPALA_HOME [a git repo of Impala] -> /repo
 #     Used to check out Impala afresh
 #   $IMPALA_HOME/logs/docker/<n1>/<n2> -> /logs
 #     Used to save logs out to host.
 #     <n1> represents the --name passed into
 #     test-with-docker for the test run. <n2>
 #     indicates which specific container is being run.
 #   ~/.ccache [configurable]            -> /ccache
 #     Used to speed up builds.
 #
 # Usage:
 #   entrypoint.sh build <uid>
 #   entrypoint.sh test_suite <suite>
 #      where <suite> is one of: BE_TEST JDBC_TEST CLUSTER_TEST
 #                               EE_TEST_SERIAL EE_TEST_PARALLEL

 # Controls the PostgreSQL server. $1 is the action to perform; callers in this
 # file pass "start" and "stop".
 #
 # The centos:7 Docker image doesn't allow systemctl to start postgresql, so on
 # Red Hat style images that ship systemctl we call pg_ctl directly as the
 # postgres user (output is appended to /var/lib/pgsql/pg.log). Everywhere else
 # the "service" wrapper is used.
 function _pg_ctl() {
   local action=$1
   if [ -f /etc/redhat-release ] && which systemctl; then
     sudo -u postgres PGDATA=/var/lib/pgsql/data bash -c "pg_ctl $action -w --timeout=120 >> /var/lib/pgsql/pg.log 2>&1"
     return
   fi
   sudo service postgresql $action
 }

 # Bootstraps a base container image. Installs sudo, git and Python, creates the
 # passwordless-sudo "impdev" account, then re-invokes this script as impdev to
 # run build_impdev.
 #
 # Arguments:
 #   $1 - uid for the impdev user; must not already be taken in the container.
 function build() {
   # Self-test mode: burn CPU for a few seconds (so that CPU metrics show usage
   # and the timeline code gets exercised) and do nothing else.
   if [[ $TEST_TEST_WITH_DOCKER ]]; then
     echo sleeping busily for 4 seconds
     bash -c 'while [[ $SECONDS -lt 4 ]]; do :; done'
     return
   fi

   # Keep timestamps coherent with the host.
   configure_timezone

   # We must be root; with errexit enabled (see main) a failed test aborts.
   [ "$(id -u)" = 0 ]

   # Refuse a uid that the base image already uses, and list the taken ones.
   if id $1 2> /dev/null; then
     echo "User with id $1 already exists. Please run this as a user id missing from " \
       "the base Ubuntu container."
     echo
     echo "Container users:"
     paste <(cut -d : -f3 /etc/passwd) <(cut -d : -f1 /etc/passwd) | sort -n
     exit 1
   fi

   # Basic tooling; apt-based images additionally get lsb-release.
   if which apt-get > /dev/null; then
     apt-get update
     apt-get install -y sudo git lsb-release python
   else
     yum -y install sudo git python
   fi

   if ! id impdev; then
     # adduser takes different flags on Ubuntu (apt-get present) and CentOS.
     local adduser_args=(--uid $1 impdev)
     if which apt-get; then
       adduser_args=(--disabled-password --gecos "" "${adduser_args[@]}")
     fi
     adduser "${adduser_args[@]}"
     echo "impdev ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
   fi

   ulimit -a
   su impdev -c "$0 build_impdev"
 }

 # Loads Impala's environment into the current shell: exports IMPALA_HOME and
 # sources bin/impala-config.sh from inside the checkout, then returns to the
 # directory the caller was in.
 function impala_environment() {
   export IMPALA_HOME=/home/impdev/Impala
   pushd "${IMPALA_HOME}"
   . bin/impala-config.sh
   popd
 }

 # Per-container preparation that has to happen on every start: creates the
 # test log directories and rewrites /etc/hosts so that the container's
 # hostname resolves to 127.0.0.1.
 function boot_container() {
   pushd /home/impdev/Impala

   # buildall.sh normally creates these.
   mkdir -p \
     logs/be_tests \
     logs/fe_tests/coverage \
     logs/ee_tests \
     logs/custom_cluster_tests

   # Drop the /etc/hosts entry for the unique docker hostname and map that
   # name to 127.0.0.1 instead. Otherwise HttpFS returns Location: redirects
   # to the hostname while the relevant datanode isn't listening on the
   # wildcard address. bootstrap_system.sh does the same, but Docker generates
   # a fresh /etc/hosts for every container, so it is repeated here.
   #
   # /etc/hosts is a bind mount, so "sed -i" can't be used; build a copy and
   # cp it over the original.
   {
     sed -e /$(hostname)/d /etc/hosts
     echo "127.0.0.1 $(hostname -s) $(hostname)"
   } > /tmp/hosts
   sudo cp /tmp/hosts /etc/hosts

   # Log the result for debugging.
   echo Hostname: $(hostname)
   echo Hosts file:
   cat /etc/hosts

   popd
 }

 # Starts what the tests depend on: PostgreSQL (for the metastore), sshd (for
 # HBase) and then the minicluster via testdata/bin/run-all.sh. When
 # $KUDU_IS_SUPPORTED is "true", Kudu's WAL directories are first re-created so
 # that they live in the container's newest filesystem layer.
 function start_minicluster {
   # Progress banners run in a subshell with stderr discarded so that set -x
   # does not echo them a second time.
   (echo ">>> Starting PostgreSQL and SSH") 2> /dev/null
   pushd /home/impdev/Impala

   # Required for metastore
   _pg_ctl start

   # Required for starting HBase
   if [ ! -f /etc/redhat-release ]; then
     sudo service ssh start
   elif which systemctl; then
     # centos7 doesn't support systemd running inside of docker to start daemons
     sudo /usr/sbin/sshd
   else
     sudo service sshd start
   fi

   (echo ">>> Copying Kudu Data") 2> /dev/null
   # Docker filesystems (aufs and overlayfs) don't support rename(2) on
   # directories, which Kudu requires. Making a fresh copy of the WAL data
   # works, presumably because only one layer is then involved. See
   # https://issues.apache.org/jira/browse/KUDU-1419.
   set -x
   if [ "true" = $KUDU_IS_SUPPORTED ]; then
     pushd /home/impdev/Impala/testdata
     for wal_dir in cluster/cdh*/node-*/var/lib/kudu/*/wal; do
       echo $wal_dir
       # This mv takes time, as it's actually copying into the latest layer.
       mv $wal_dir $wal_dir-orig
       mkdir $wal_dir
       mv $wal_dir-orig/* $wal_dir
       rmdir $wal_dir-orig
     done
     popd
   fi

   # Poll until postgresql really accepts connections; Hive Metastore fails to
   # start without it.
   for attempt in {1..120}; do
     echo connecting to postgresql attempt $attempt
     sudo -u postgres psql -c "select 1" && break
     sleep 2
   done
   # Final probe, outside any conditional, so a dead server is not ignored.
   sudo -u postgres psql -c "select 1"

   (echo ">>> Starting mini cluster") 2> /dev/null
   testdata/bin/run-all.sh

   popd
 }

 # Second build stage, run as impdev: checks out Impala (and Impala-lzo),
 # runs bin/bootstrap_system.sh, builds Impala with test data, then shrinks
 # and tidies the result before the container is saved.
 function build_impdev() {
   # We must be impdev by now.
   [ "$(id -un)" = impdev ]

   # Raise "Max processes" to the hard limit (4th column of the limits table);
   # the default on CentOS 6 can be 1024, which isn't enough for minicluster.
   ulimit -u $(awk '/Max processes/ { print $4 }' /proc/self/limits)
   ulimit -a

   # Use the host's ccache.
   ln -s /ccache /home/impdev/.ccache

   # Rather than a full "git clone" of the host's checkout, fetch a single
   # revision without tags; the resulting checkout is considerably lighter.
   mkdir /home/impdev/Impala
   pushd /home/impdev/Impala
   git init
   git fetch /git_common_dir --no-tags "$GIT_HEAD_REV"
   git checkout -b test-with-docker FETCH_HEAD

   # Same treatment for impala-lzo.
   mkdir /home/impdev/Impala-lzo
   pushd /home/impdev/Impala-lzo
   git init
   git fetch $IMPALA_LZO_REPO --no-tags "$IMPALA_LZO_REF"
   git checkout -b test-with-docker FETCH_HEAD
   popd

   # logs/ points at the /logs mount: logs are the most important thing to
   # look at after a run, so they are kept on the host.
   ln -sf /logs logs

   bin/bootstrap_system.sh
   impala_environment

   # Build Impala and load test data.
   #   -notests: backend tests are sizable and can be built when they are run.
   #   -noclean: don't delete this invocation's own log (it lives in logs/);
   #             this is a first build anyway.
   # IMPALA-6494 prevents the use of the smaller shared-library builds.
   ./buildall.sh -noclean -format -testdata -notests

   # The one exception to "-notests": test_insert_parquet.py, used by all the
   # end-to-end shards, depends on this binary. Building it once here is faster
   # than building it at the startup of every E2E container.
   make -j$(nproc) --load-average=$(nproc) parquet-reader

   # Record memory usage while everything is still running.
   memory_usage

   # Shut down things cleanly.
   testdata/bin/kill-all.sh

   # "Compress" HDFS data by de-duplicating blocks: with three datanodes the
   # loaded data is 3x larger than needed. Hardlinking identical block files
   # saves ~20GB. This absolutely relies on an HDFS implementation detail.
   echo "Hardlinking duplicate HDFS block data."
   set +x
   for primary in $(find testdata/cluster/*/node-1/data/dfs/dn/current/ -name 'blk_*[0-9]'); do
     for node in 2 3; do
       replica=${primary/node-1/node-$node}
       [ -f $replica ] || continue
       rm $replica
       ln $primary $replica
     done
   done
   set -x

   # A clean PostgreSQL shutdown speeds up its start in new containers.
   _pg_ctl stop

   # Object files are not needed any more (~1.6GB).
   find be -name '*.o' -execdir rm '{}' +

   # Remove dangling symlinks (typically "cluster/cdh*-node-*"); they may point
   # at things that only existed inside a container and can confuse Jenkins.
   find /logs -xtype l -execdir rm '{}' ';'

   popd
 }

 # Writes a report of the top 20 RSS consumers (plus "other" and "total"
 # lines), in megabytes. Common culprits are Java processes without Xmx set.
 # Since most things don't reclaim memory, this is a decent proxy for peak
 # memory usage by long-lived processes.
 #
 # Arguments:
 #   $1 - (optional) file to write the report to; defaults to
 #        /logs/memory_usage.txt. Both stdout and stderr end up in the file.
 function memory_usage() {
   local report_file="${1:-/logs/memory_usage.txt}"
   (
   echo "Top 20 memory consumers (RSS in MBs)"
   # Normalise each "rss args" row to "rss<TAB>args" so that awk can split on
   # the tab and keep the full command line (spaces included) in $2.
   sudo ps -axho rss,args | \
     sed -e 's/^ *//' | \
     sed -e 's, ,\t,' | \
     sort -nr | \
     awk -F'\t' '
     # Rows 1..20 are listed individually; the remainder is summed as "other".
     FNR <= 20 { print $1/1024.0, $2; total += $1/1024.0 }
     FNR > 20 { other += $1/1024.0; total += $1/1024.0 }
     END {
       if (other) { print other, "-- other --" };
       print total, "-- total --"
     }'
   ) > "${report_file}" 2>&1
 }

 # Runs the suite named by the first argument. Tightly coupled with Impala's
 # run-all-tests and with the suite names used by test-with-docker.py.
 #
 # Arguments:
 #   $1 - suite name. NOOP, NOOP_FAIL and NOOP_SLEEP_FOREVER are self-test
 #        suites; BE_TEST* and RAT_CHECK run without the minicluster; anything
 #        else starts the minicluster and runs bin/run-all-tests.sh.
 # Returns:
 #   0 if the suite passed, non-zero otherwise.
 function test_suite() {
   cd /home/impdev/Impala

   # These test suites are for testing.
   if [[ $1 == NOOP ]]; then
     # Sleep busily for 10 seconds.
     bash -c 'while [[ $SECONDS -lt 10 ]]; do :; done'
     return 0
   fi
   if [[ $1 == NOOP_FAIL ]]; then
     return 1
   fi
   if [[ $1 == NOOP_SLEEP_FOREVER ]]; then
     # Handy to test timeouts.
     while true; do sleep 60; done
   fi

   # Assert that we're running as impdev
   [ "$(id -un)" = impdev ]

   # Assert that /home/impdev/Impala/logs is a symlink to /logs.
   [ "$(readlink /home/impdev/Impala/logs)" = /logs ]

   boot_container
   impala_environment

   if [[ ${REBUILD_ASAN:-false} = true ]]; then
     # Note: we're not redoing data loading.
     SKIP_TOOLCHAIN_BOOTSTRAP=true ./buildall.sh -noclean -notests -asan
   fi

   # BE tests don't require the minicluster, so we can run them directly.
   if [[ $1 = BE_TEST* ]]; then
     make -j$(nproc) --load-average=$(nproc) be-test be-benchmarks
     if ! bin/run-backend-tests.sh; then
       echo "Tests $1 failed!"
       return 1
     else
       echo "Tests $1 succeeded!"
       return 0
     fi
   fi

   if [[ $1 == RAT_CHECK ]]; then
     # Runs Apache RAT (a license checker)
     git archive --prefix=rat/ -o rat-impala.zip HEAD
     wget --quiet https://archive.apache.org/dist/creadur/apache-rat-0.12/apache-rat-0.12-bin.tar.gz
     tar xzf apache-rat-0.12-bin.tar.gz
     java -jar apache-rat-0.12/apache-rat-0.12.jar -x rat-impala.zip > logs/rat.xml
     bin/check-rat-report.py bin/rat_exclude_files.txt logs/rat.xml
     return $?
   fi

   # Start the minicluster
   start_minicluster

   # By default, the JVM will use 1/4 of your OS memory for its heap size. For a
   # long-running test, this will delay GC inside of impalad's leading to
   # unnecessarily large process RSS footprints. To combat this, we
   # set a small initial heap size, and then cap it at a more reasonable
   # size. The small initial heap sizes help for daemons that do little
   # in the way of JVM work (e.g., the 2nd and 3rd impalad's).
   # Note that "test_insert_large_string" fails at 2g and 3g, so the suite that
   # includes it (EE_TEST_PARALLEL) gets additional memory.

   # Note that we avoid using TEST_START_CLUSTER_ARGS="--jvm-args=..."
   # because it gets flattened along the way if we need to provide
   # more than one Java argument. We use JAVA_TOOL_OPTIONS instead.
   JVM_HEAP_MAX_GB=2
   if [[ $1 = EE_TEST_PARALLEL ]]; then
     JVM_HEAP_MAX_GB=4
   elif [[ $1 = EE_TEST_PARALLEL_EXHAUSTIVE ]]; then
     JVM_HEAP_MAX_GB=8
   fi
   # Must be exported: a plain assignment is not inherited by the
   # bin/run-all-tests.sh child process (or the JVMs started beneath it).
   export JAVA_TOOL_OPTIONS="-Xms512M -Xmx${JVM_HEAP_MAX_GB}G"

   # Similarly, bin/start-impala-cluster typically configures the memlimit
   # to be 80% of the machine memory, divided by the number of daemons.
   # If multiple containers are to be run simultaneously, this is scaled
   # down in test-with-docker.py (and further configurable with --impalad-mem-limit-bytes)
   # and passed in via $IMPALAD_MEM_LIMIT_BYTES to the container. There is a
   # relationship between the number of parallel tests that can be run by py.test and this
   # limit.
   export TEST_START_CLUSTER_ARGS="--impalad_args=--mem_limit=$IMPALAD_MEM_LIMIT_BYTES"

   export MAX_PYTEST_FAILURES=0

   # Assert that these are all set (to either "true" or "false" as strings).
   # This is how run-all.sh chooses between them.
   [[ $FE_TEST && $BE_TEST && $EE_TEST && $JDBC_TEST && $CLUSTER_TEST ]]

   local ret=0

   # Run tests.
   (echo ">>> $1: Starting run-all-test") 2> /dev/null
   if ! time -p bash -x bin/run-all-tests.sh; then
     ret=1
     echo "Tests $1 failed!"
   else
     echo "Tests $1 succeeded!"
   fi

   # Save memory usage after tests have run but before shutting down the cluster.
   memory_usage || true

   # Oddly, I've observed bash fail to exit (and wind down the container),
   # leading to test-with-docker.py hitting a timeout. Killing the minicluster
   # daemons fixes this.
   testdata/bin/kill-all.sh || true
   return $ret
 }

 # Gives the container the host's timezone so that log files are legible.
 # /etc/localtime is finicky (see localtime(5)): mounting the host's file or
 # symlinking to it doesn't always work. Instead $LOCALTIME_LINK_TARGET is
 # expected to be a path in /usr/share/zoneinfo; /etc/localtime is linked to
 # it. If the path does not exist, a note is printed to stderr and nothing is
 # changed.
 function configure_timezone() {
   if [ ! -e "${LOCALTIME_LINK_TARGET}" ]; then
     echo '$LOCALTIME_LINK_TARGET not configured.' 1>&2
     return
   fi

   ln -sf "${LOCALTIME_LINK_TARGET}" /etc/localtime
   # Only Debian-based distros have this file; it holds the zone name, i.e. the
   # part of the path after "zoneinfo/".
   if [ -f /etc/timezone ]; then
     echo "${LOCALTIME_LINK_TARGET}" | sed -e 's,.*zoneinfo/,,' > /etc/timezone
   fi
 }

 # Exposes an interactive shell, with the container booted with a minicluster
 # and an Impala cluster. On success this function does not return: it ends by
 # exec'ing bash.
 function shell() {
   echo "Starting minicluster and Impala."
   # Logs is typically a symlink; remove it if so.
   rm logs || true
   mkdir -p logs
   boot_container
   impala_environment
   # Kudu requires --privileged for the Docker container; see
   # https://issues.apache.org/jira/browse/KUDU-2000. Because
   # our goal here is convenience for new developers, we
   # skip kudu if "ntptime" doesn't work, which is a good
   # proxy for Kudu won't start.
   local kudu_msg=""
   if ! ntptime > /dev/null; then
     export KUDU_IS_SUPPORTED=false
     kudu_msg="Kudu is not started."
   fi
   start_minicluster
   bin/start-impala-cluster.py
   # The banner is printed with printf rather than a here-document with a
   # quoted delimiter, so that the Kudu note is actually substituted.
   printf '%s\n' \
     "" \
     "==========================================================" \
     "Welcome to the Impala development environment." \
     "" \
     "The \"minicluster\" is running; i.e., HDFS, HBase, Hive," \
     "etc. are running. ${kudu_msg}" \
     "" \
     "To get started, perhaps run:" \
     "  impala-shell.sh -q 'select count(*) from tpcds.web_page'" \
     "==========================================================" \
     ""
   exec bash
 }

 # Entry point: runs the command named by the first argument (one of the
 # functions in this file, e.g. build, build_impdev, test_suite, shell),
 # passing it the remaining arguments, and exits with that command's status.
 # Enables errexit for everything that follows.
 function main() {
   set -e

   # Without a command there is nothing to run; say so instead of letting
   # "shift" fail silently under errexit.
   if [[ $# -eq 0 ]]; then
     echo "Usage: $0 build <uid> | test_suite <suite> | shell" 1>&2
     exit 1
   fi

   # Run given command
   CMD="$1"
   shift

   # Treat shell specially to avoid the extra logging and | cat below.
   if [[ $CMD = "shell" ]]; then
     shell
     # shell should have exec'd, so if we get here, it's a failure.
     exit 1
   fi

   echo ">>> ${CMD} $* (begin)"
   # Dump environment, for debugging (AWS credentials are filtered out).
   env | grep -vE "AWS_(SECRET_)?ACCESS_KEY"
   ulimit -a
   set -x
   # The "| cat" here avoids "set -e"/errexit from exiting the
   # script right away, so that the end marker below is still printed; the
   # command's own status is recovered from PIPESTATUS.
   "${CMD}" "$@" | cat
   ret=${PIPESTATUS[0]}
   set +x
   echo ">>> ${CMD} $* ($ret) (end)"
   exit $ret
 }

 # Invoke main() only when this file is executed directly. When it is sourced,
 # BASH_SOURCE[0] differs from $0 and only the function definitions are loaded.
 [[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
	#!/bin/bash
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	# Entrypoint code for test-with-docker.py containers. test-with-docker.py
	# will create Docker containers with this script as the entrypoint,
	# with a variety of arguments. See test-with-docker.py for a more
	# general overview.
	#
	# This assumes that the following are already mounted inside
	# the container:
	#   /etc/localtime                      -> /mnt/localtime
	#     Helps timestamps be in the time zone of the host
	#   $IMPALA_HOME [a git repo of Impala] -> /repo
	#     Used to check out Impala afresh
	#   $IMPALA_HOME/logs/docker/<n1>/<n2> -> /logs
	#     Used to save logs out to host.
	#     <n1> represents the --name passed into
	#     test-with-docker for the test run. <n2>
	#     indicates which specific container is being run.
	#   ~/.ccache [configurable]            -> /ccache
	#     Used to speed up builds.
	#
	# Usage:
	#   entrypoint.sh build <uid>
	#   entrypoint.sh test_suite <suite>
	#      where <suite> is one of: BE_TEST JDBC_TEST CLUSTER_TEST
	#                               EE_TEST_SERIAL EE_TEST_PARALLEL

	# Starts or stops PostgreSQL; $1 is the action ("start" or "stop" in this
	# file's callers).
	# The centos:7 Docker image doesn't allow systemctl to start postgresql, so
	# there pg_ctl is run explicitly as the postgres user, logging to
	# /var/lib/pgsql/pg.log. All other images go through "service".
	function _pg_ctl() {
	  if [ ! -f /etc/redhat-release ] || ! which systemctl; then
	    sudo service postgresql $1
	    return
	  fi
	  sudo -u postgres PGDATA=/var/lib/pgsql/data bash -c "pg_ctl $1 -w --timeout=120 >> /var/lib/pgsql/pg.log 2>&1"
	}

	# Bootstraps the container by creating a user and adding basic tools like
	# Python and git, then re-invokes this script as that user to run
	# build_impdev.
	#
	# Arguments:
	#   $1 - uid for the impdev user to be created; must not already exist.
	function build() {
	  # Handy for testing.
	  if [[ $TEST_TEST_WITH_DOCKER ]]; then
	    # We sleep busily so that CPU metrics will show usage, to
	    # better exercise the timeline code.
	    echo sleeping busily for 4 seconds
	    bash -c 'while [[ $SECONDS -lt 4 ]]; do :; done'
	    return
	  fi

	  # Configure timezone, so any timestamps that appear are coherent with the host.
	  configure_timezone

	  # Assert we're superuser.
	  [ "$(id -u)" = 0 ]
	  if id $1 2> /dev/null; then
	    echo "User with id $1 already exists. Please run this as a user id missing from " \
	      "the base Ubuntu container."
	    echo
	    echo "Container users:"
	    # "uid<TAB>name" for every account, sorted numerically by uid.
	    paste <(cut -d : -f3 /etc/passwd) <(cut -d : -f1 /etc/passwd) | sort -n
	    exit 1
	  fi
	  if which apt-get > /dev/null; then
	    apt-get update
	    apt-get install -y sudo git lsb-release python
	  else
	    yum -y install sudo git python
	  fi

	  if ! id impdev; then
	    # Adduser is slightly different on CentOS and Ubuntu
	    if which apt-get; then
	      adduser --disabled-password --gecos "" --uid $1 impdev
	    else
	      adduser --uid $1 impdev
	    fi
	    echo "impdev ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
	  fi

	  ulimit -a
	  su impdev -c "$0 build_impdev"
	}

	# Sets up the Impala environment in the current shell: exports IMPALA_HOME
	# and sources bin/impala-config.sh from within the checkout, leaving the
	# caller's working directory as it was.
	function impala_environment() {
	  export IMPALA_HOME=/home/impdev/Impala
	  pushd "${IMPALA_HOME}"
	  . bin/impala-config.sh
	  popd
	}

	# Prepares a freshly created container: makes the test log directories and
	# rewrites /etc/hosts so the container's hostname maps to 127.0.0.1.
	function boot_container() {
	  pushd /home/impdev/Impala

	  # Make log directories. This is typically done in buildall.sh.
	  mkdir -p \
	    logs/be_tests \
	    logs/fe_tests/coverage \
	    logs/ee_tests \
	    logs/custom_cluster_tests

	  # Replace the /etc/hosts entry for the unique docker hostname with one
	  # pointing at 127.0.0.1. Otherwise, HttpFS returns Location: redirects to
	  # said hostname, but the relevant datanode isn't listening on the
	  # wildcard address. bootstrap_system.sh does this as well, but Docker
	  # creates a new /etc/hosts every time a container is created.
	  #
	  # "sed -i" doesn't work here because /etc/hosts is a bind mount, so the
	  # new contents are assembled in /tmp/hosts and copied over.
	  {
	    sed -e /$(hostname)/d /etc/hosts
	    echo "127.0.0.1 $(hostname -s) $(hostname)"
	  } > /tmp/hosts
	  sudo cp /tmp/hosts /etc/hosts

	  echo Hostname: $(hostname)
	  echo Hosts file:
	  cat /etc/hosts

	  popd
	}

	# Starts PostgreSQL and sshd, moves Kudu's WAL directories into the
	# container's newest filesystem layer (when $KUDU_IS_SUPPORTED is "true"),
	# waits for PostgreSQL to accept connections, and then starts the
	# minicluster with testdata/bin/run-all.sh.
	function start_minicluster {
	  # The subshell here avoids the verbose output from set -x.
	  (echo ">>> Starting PostgreSQL and SSH") 2> /dev/null
	  pushd /home/impdev/Impala

	  # Required for metastore
	  _pg_ctl start

	  # Required for starting HBase
	  if [ -f /etc/redhat-release ]; then
	    if which systemctl; then
	      # centos7 doesn't support systemd running inside of docker to start daemons
	      sudo /usr/sbin/sshd
	    else
	      sudo service sshd start
	    fi
	  else
	    sudo service ssh start
	  fi

	  (echo ">>> Copying Kudu Data") 2> /dev/null
	  # Move around Kudu's WALs to avoid issue with Docker filesystems (aufs and
	  # overlayfs) that don't support os.rename(2) on directories, which Kudu
	  # requires. We make a fresh copy of the data, in which case rename(2) works
	  # presumably because there's only one layer involved. See
	  # https://issues.apache.org/jira/browse/KUDU-1419.
	  set -x
	  # Quoted with an empty default so that an unset variable simply means
	  # "not supported" instead of producing a malformed test expression.
	  if [ "true" = "${KUDU_IS_SUPPORTED:-}" ]; then
	    pushd /home/impdev/Impala/testdata
	    # One WAL directory per Kudu daemon, on every node of every cluster.
	    for x in cluster/cdh*/node-*/var/lib/kudu/*/wal; do
	      echo $x
	      # This mv takes time, as it's actually copying into the latest layer.
	      mv $x $x-orig
	      mkdir $x
	      mv $x-orig/* $x
	      rmdir $x-orig
	    done
	    popd
	  fi

	  # Wait for postgresql to really start; if it doesn't, Hive Metastore will fail to start.
	  for i in {1..120}; do
	    echo connecting to postgresql attempt $i
	    if sudo -u postgres psql -c "select 1"; then
	      break
	    else
	      sleep 2
	    fi
	  done
	  # Final check outside the loop, so a server that never came up is an error.
	  sudo -u postgres psql -c "select 1"

	  (echo ">>> Starting mini cluster") 2> /dev/null
	  testdata/bin/run-all.sh

	  popd
	}

	# Runs bootstrap_system.sh and then builds Impala. Must run as impdev.
	# Afterwards the result is trimmed (HDFS blocks hardlinked, object files
	# and dangling log symlinks removed) and services are shut down cleanly.
	function build_impdev() {
	  # Assert we're impdev now.
	  [ "$(id -un)" = impdev ]

	  # Bump "Max processes" ulimit to the hard limit; default
	  # on CentOS 6 can be 1024, which isn't enough for minicluster.
	  ulimit -u $(cat /proc/self/limits | grep 'Max processes' | awk '{ print $4 }')
	  ulimit -a

	  # Link in ccache from host.
	  ln -s /ccache /home/impdev/.ccache

	  # Instead of doing a full "git clone" of /repo, which is the host's checkout,
	  # we only fetch one branch, without tags. This keeps the checkout
	  # considerably lighter.
	  mkdir /home/impdev/Impala
	  pushd /home/impdev/Impala
	  git init
	  git fetch /git_common_dir --no-tags "$GIT_HEAD_REV"
	  git checkout -b test-with-docker FETCH_HEAD

	  # Checkout impala-lzo too
	  mkdir /home/impdev/Impala-lzo
	  pushd /home/impdev/Impala-lzo
	  git init
	  git fetch $IMPALA_LZO_REPO --no-tags "$IMPALA_LZO_REF"
	  git checkout -b test-with-docker FETCH_HEAD
	  popd

	  # Link in logs. Logs are on the host since that's the most important thing to
	  # look at after the tests are run.
	  ln -sf /logs logs

	  bin/bootstrap_system.sh
	  impala_environment

	  # Builds Impala and loads test data.
	  # Note that IMPALA-6494 prevents us from using shared library builds,
	  # which are smaller and thereby speed things up. We use "-notests"
	  # to avoid building backend tests, which are sizable, and
	  # can be built when executing those tests. We use "-noclean" to
	  # avoid deleting the log for this invocation which is in logs/,
	  # and, this is a first build anyway.
	  ./buildall.sh -noclean -format -testdata -notests

	  # We make one exception to "-notests":
	  # test_insert_parquet.py, which is used in all the end-to-end test
	  # shards, depends on this binary. We build it here once,
	  # instead of building it during the startup of each container running
	  # a subset of E2E tests. Building it here is also a lot faster.
	  make -j$(nproc) --load-average=$(nproc) parquet-reader

	  # Dump current memory usage to logs, before shutting things down.
	  memory_usage

	  # Shut down things cleanly.
	  testdata/bin/kill-all.sh

	  # "Compress" HDFS data by de-duplicating blocks. As a result of
	  # having three datanodes, our data load is 3x larger than it needs
	  # to be. To alleviate this (to the tune of ~20GB savings), we
	  # use hardlinks to link together the identical blocks. This is absolutely
	  # taking advantage of an implementation detail of HDFS.
	  echo "Hardlinking duplicate HDFS block data."
	  set +x
	  # Block files are named blk_<digits>; the trailing [0-9] keeps other
	  # blk_* files (names not ending in a digit) out of the match.
	  for x in $(find testdata/cluster/*/node-1/data/dfs/dn/current/ -name 'blk_*[0-9]'); do
	    for n in 2 3; do
	      xn=${x/node-1/node-$n}
	      if [ -f $xn ]; then
	        rm $xn
	        ln $x $xn
	      fi
	    done
	  done
	  set -x

	  # Shutting down PostgreSQL nicely speeds up its start time for new containers.
	  _pg_ctl stop

	  # Clean up things we don't need to reduce image size
	  find be -name '*.o' -execdir rm '{}' + # ~1.6GB

	  # Clean up dangling symlinks. These (typically "cluster/cdh*-node-*")
	  # may point to something inside a container that no longer exists
	  # and can confuse Jenkins.
	  find /logs -xtype l -execdir rm '{}' ';'

	  popd
	}

	# Writes the top 20 RSS consumers (plus "other" and "total" lines), in
	# megabytes, to /logs/memory_usage.txt. Common culprits are Java processes
	# without Xmx set. Since most things don't reclaim memory, this is a decent
	# proxy for peak memory usage by long-lived processes.
	function memory_usage() {
	  (
	  echo "Top 20 memory consumers (RSS in MBs)"
	  # Turn each "rss args" row into "rss<TAB>args" so that awk can split on
	  # the tab and keep the whole command line in $2.
	  sudo ps -axho rss,args | \
	    sed -e 's/^ *//' | \
	    sed -e 's, ,\t,' | \
	    sort -nr | \
	    awk -F'\t' '
	    # Rows 1..20 are listed individually; the rest is summed as "other".
	    FNR <= 20 { print $1/1024.0, $2; total += $1/1024.0 }
	    FNR > 20 { other += $1/1024.0; total += $1/1024.0 }
	    END {
	      if (other) { print other, "-- other --" };
	      print total, "-- total --"
	    }'
	  ) >& /logs/memory_usage.txt
	}

	# Runs a suite passed in as the first argument. Tightly
	# coupled with Impala's run-all-tests and the suite names
	# from test-with-docker.py.
	#
	# Arguments:
	#   $1 - suite name. NOOP, NOOP_FAIL and NOOP_SLEEP_FOREVER are self-test
	#        suites; BE_TEST* and RAT_CHECK run without the minicluster; any
	#        other suite starts the minicluster and runs bin/run-all-tests.sh.
	# Returns:
	#   0 if the suite passed, non-zero otherwise.
	function test_suite() {
	  cd /home/impdev/Impala

	  # These test suites are for testing.
	  if [[ $1 == NOOP ]]; then
	    # Sleep busily for 10 seconds.
	    bash -c 'while [[ $SECONDS -lt 10 ]]; do :; done'
	    return 0
	  fi
	  if [[ $1 == NOOP_FAIL ]]; then
	    return 1
	  fi
	  if [[ $1 == NOOP_SLEEP_FOREVER ]]; then
	    # Handy to test timeouts.
	    while true; do sleep 60; done
	  fi

	  # Assert that we're running as impdev
	  [ "$(id -un)" = impdev ]

	  # Assert that /home/impdev/Impala/logs is a symlink to /logs.
	  [ "$(readlink /home/impdev/Impala/logs)" = /logs ]

	  boot_container
	  impala_environment

	  if [[ ${REBUILD_ASAN:-false} = true ]]; then
	    # Note: we're not redoing data loading.
	    SKIP_TOOLCHAIN_BOOTSTRAP=true ./buildall.sh -noclean -notests -asan
	  fi

	  # BE tests don't require the minicluster, so we can run them directly.
	  if [[ $1 = BE_TEST* ]]; then
	    make -j$(nproc) --load-average=$(nproc) be-test be-benchmarks
	    if ! bin/run-backend-tests.sh; then
	      echo "Tests $1 failed!"
	      return 1
	    else
	      echo "Tests $1 succeeded!"
	      return 0
	    fi
	  fi

	  if [[ $1 == RAT_CHECK ]]; then
	    # Runs Apache RAT (a license checker)
	    git archive --prefix=rat/ -o rat-impala.zip HEAD
	    wget --quiet https://archive.apache.org/dist/creadur/apache-rat-0.12/apache-rat-0.12-bin.tar.gz
	    tar xzf apache-rat-0.12-bin.tar.gz
	    java -jar apache-rat-0.12/apache-rat-0.12.jar -x rat-impala.zip > logs/rat.xml
	    bin/check-rat-report.py bin/rat_exclude_files.txt logs/rat.xml
	    return $?
	  fi

	  # Start the minicluster
	  start_minicluster

	  # By default, the JVM will use 1/4 of your OS memory for its heap size. For a
	  # long-running test, this will delay GC inside of impalad's leading to
	  # unnecessarily large process RSS footprints. To combat this, we
	  # set a small initial heap size, and then cap it at a more reasonable
	  # size. The small initial heap sizes help for daemons that do little
	  # in the way of JVM work (e.g., the 2nd and 3rd impalad's).
	  # Note that "test_insert_large_string" fails at 2g and 3g, so the suite that
	  # includes it (EE_TEST_PARALLEL) gets additional memory.

	  # Note that we avoid using TEST_START_CLUSTER_ARGS="--jvm-args=..."
	  # because it gets flattened along the way if we need to provide
	  # more than one Java argument. We use JAVA_TOOL_OPTIONS instead.
	  JVM_HEAP_MAX_GB=2
	  if [[ $1 = EE_TEST_PARALLEL ]]; then
	    JVM_HEAP_MAX_GB=4
	  elif [[ $1 = EE_TEST_PARALLEL_EXHAUSTIVE ]]; then
	    JVM_HEAP_MAX_GB=8
	  fi
	  # Exported so that bin/run-all-tests.sh (a child process) and the JVMs
	  # started beneath it actually inherit the setting.
	  export JAVA_TOOL_OPTIONS="-Xms512M -Xmx${JVM_HEAP_MAX_GB}G"

	  # Similarly, bin/start-impala-cluster typically configures the memlimit
	  # to be 80% of the machine memory, divided by the number of daemons.
	  # If multiple containers are to be run simultaneously, this is scaled
	  # down in test-with-docker.py (and further configurable with --impalad-mem-limit-bytes)
	  # and passed in via $IMPALAD_MEM_LIMIT_BYTES to the container. There is a
	  # relationship between the number of parallel tests that can be run by py.test and this
	  # limit.
	  export TEST_START_CLUSTER_ARGS="--impalad_args=--mem_limit=$IMPALAD_MEM_LIMIT_BYTES"

	  export MAX_PYTEST_FAILURES=0

	  # Assert that these are all set (to either "true" or "false" as strings).
	  # This is how run-all.sh chooses between them.
	  [[ $FE_TEST && $BE_TEST && $EE_TEST && $JDBC_TEST && $CLUSTER_TEST ]]

	  local ret=0

	  # Run tests.
	  (echo ">>> $1: Starting run-all-test") 2> /dev/null
	  if ! time -p bash -x bin/run-all-tests.sh; then
	    ret=1
	    echo "Tests $1 failed!"
	  else
	    echo "Tests $1 succeeded!"
	  fi

	  # Save memory usage after tests have run but before shutting down the cluster.
	  # Best effort: a failure here must not change the suite's result.
	  memory_usage || true

	  # Oddly, I've observed bash fail to exit (and wind down the container),
	  # leading to test-with-docker.py hitting a timeout. Killing the minicluster
	  # daemons fixes this.
	  testdata/bin/kill-all.sh || true
	  return $ret
	}

	# It's convenient (for log files to be legible) for the container
	# to have the host timezone. However, /etc/localtime is finicky
	# (see localtime(5)) and mounting it to the host /etc/localtime or
	# symlinking it there doesn't always work. Instead, we expect
	# $LOCALTIME_LINK_TARGET to be set to a path in /usr/share/zoneinfo.
	# If that path does not exist, a note is printed to stderr and nothing
	# is changed.
	function configure_timezone() {
	  if [ -e "${LOCALTIME_LINK_TARGET}" ]; then
	    ln -sf "${LOCALTIME_LINK_TARGET}" /etc/localtime
	    # Only Debian-based distros have this file. It holds the zone name,
	    # i.e. whatever follows "zoneinfo/" in the path.
	    if [ -f /etc/timezone ]; then
	      echo "${LOCALTIME_LINK_TARGET}" | sed -e 's,.*zoneinfo/,,' > /etc/timezone
	    fi
	  else
	    echo '$LOCALTIME_LINK_TARGET not configured.' 1>&2
	  fi
	}

	# Exposes a shell, with the container booted with a minicluster and an
	# Impala cluster. On success this function does not return: its last step
	# exec's bash.
	function shell() {
	  echo "Starting minicluster and Impala."
	  # Logs is typically a symlink; remove it if so. Failure is fine (there
	  # may be nothing to remove).
	  rm logs || true
	  mkdir -p logs
	  boot_container
	  impala_environment
	  # Kudu requires --privileged for the Docker container; see
	  # https://issues.apache.org/jira/browse/KUDU-2000. Because
	  # our goal here is convenience for new developers, we
	  # skip kudu if "ntptime" doesn't work, which is a good
	  # proxy for Kudu won't start.
	  local kudu_msg=""
	  if ! ntptime > /dev/null; then
	    export KUDU_IS_SUPPORTED=false
	    kudu_msg="Kudu is not started."
	  fi
	  start_minicluster
	  bin/start-impala-cluster.py
	  # Printed with printf so that the Kudu note is substituted and the
	  # banner does not depend on how this file is indented.
	  printf '%s\n' \
	    "" \
	    "==========================================================" \
	    "Welcome to the Impala development environment." \
	    "" \
	    "The \"minicluster\" is running; i.e., HDFS, HBase, Hive," \
	    "etc. are running. ${kudu_msg}" \
	    "" \
	    "To get started, perhaps run:" \
	    "  impala-shell.sh -q 'select count(*) from tpcds.web_page'" \
	    "==========================================================" \
	    ""
	  exec bash
	}

	# Entry point: runs the command named by the first argument (a function
	# in this file such as build, build_impdev, test_suite or shell) with the
	# remaining arguments, and exits with that command's status. Enables
	# errexit for everything that follows.
	function main() {
	  set -e

	  # Nothing to run without a command; report it rather than letting
	  # "shift" fail silently under errexit.
	  if [[ $# -eq 0 ]]; then
	    echo "Usage: $0 build <uid> | test_suite <suite> | shell" 1>&2
	    exit 1
	  fi

	  # Run given command
	  CMD="$1"
	  shift

	  # Treat shell specially to avoid the extra logging and | cat below.
	  if [[ $CMD = "shell" ]]; then
	    shell
	    # shell should have exec'd, so if we get here, it's a failure.
	    exit 1
	  fi

	  echo ">>> ${CMD} $* (begin)"
	  # Dump environment, for debugging (AWS credentials are filtered out).
	  env | grep -vE "AWS_(SECRET_)?ACCESS_KEY"
	  ulimit -a
	  set -x
	  # The "| cat" here avoids "set -e"/errexit from exiting the
	  # script right away, so the end marker below is still printed; the
	  # command's own status is taken from PIPESTATUS.
	  "${CMD}" "$@" | cat
	  ret=${PIPESTATUS[0]}
	  set +x
	  echo ">>> ${CMD} $* ($ret) (end)"
	  exit $ret
	}

	# Call main() only when this file is executed as a script; when sourced,
	# BASH_SOURCE[0] and $0 differ and only the functions are defined.
	[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"