blob: f1f584d0124ad3eaeba5b38b54555b70d3f37665 [file]
#!/bin/bash
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
################################################################################
#
# This script follows the pattern described in the docker best practices here:
# https://docs.docker.com/develop/develop-images/dockerfile_best-practices/#entrypoint
################################################################################
# Usage:
#   <entrypoint> load_tpcds            Generate TPC-DS data (if not already
#                                      present) and load it through Impala.
#   <entrypoint> impala-shell [ARGS]   Run impala-shell against the impalad,
#                                      forwarding any extra arguments.
#   <entrypoint> CMD [ARGS]            Execute CMD with ARGS as given.
set -euo pipefail
set -x

# Hostname of the impalad that impala-shell connects to.
readonly IMPALAD_HOST="docker_impalad-1_1"

# impala-shell invocation used by the data loader. Kept as an array so each
# argument stays a separate word without relying on word-splitting.
readonly -a IMPALA_SHELL_CMD=(impala-shell --protocol=hs2 -i "${IMPALAD_HOST}")

#######################################
# Prints a message to stderr and exits with status 1.
# Arguments: $@ - message to print
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Downloads the TPC-DS tools, runs dsdgen in the current directory and moves
# each generated .dat file into its own subdirectory of the raw data
# directory. Creates the "generated" marker file once everything is in place.
# Arguments: $1 - destination directory for the raw TPC-DS tables
#######################################
generate_tpcds_data() {
  local raw_dir=$1
  local toolchain_base="https://native-toolchain.s3.amazonaws.com/build/7-f2ddef91e9/"
  local version="2.1.0"
  local tarball="tpc-ds-${version}-gcc-4.9.2-ec2-package-ubuntu-18-04.tar.gz"
  local url="${toolchain_base}tpc-ds/${version}-gcc-4.9.2/${tarball}"
  local scale_factor=1
  local dat_file table_dir

  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # error page as the tarball and deferring the failure to tar.
  curl --fail "${url}" --output tpcds.tar.gz
  tar xzf tpcds.tar.gz

  # Generate the data. This creates one .dat file for each table.
  "./tpc-ds-${version}/bin/dsdgen" -force -verbose -scale "${scale_factor}"

  # Move the tables into the warehouse, one per subdirectory.
  for dat_file in *.dat; do
    # An unmatched glob stays as the literal pattern; treat that as an error
    # rather than creating a directory named after the pattern.
    [[ -e "${dat_file}" ]] || die "dsdgen did not produce any .dat files."
    table_dir="${raw_dir}/${dat_file%.dat}"
    rm -rf -- "${table_dir}"
    mkdir -p -- "${table_dir}"
    mv -- "${dat_file}" "${table_dir}"
  done
  touch -- "${raw_dir}/generated"
}

#######################################
# Polls Impala with a trivial query until it answers.
# Returns: 0 once a query succeeds, 1 if all 300 attempts fail.
#######################################
wait_for_impala() {
  local attempt
  for ((attempt = 1; attempt <= 300; attempt++)); do
    if "${IMPALA_SHELL_CMD[@]}" -q 'select version()'; then
      return 0
    fi
    echo "Waiting for impala to come up"
    sleep 0.5
  done
  return 1
}

#######################################
# Generates the TPC-DS data set if needed and loads it via impala-shell.
#######################################
load_tpcds() {
  # The base directory for Hive external tables, in a mounted volume.
  local warehouse_external_dir=/user/hive/warehouse/external
  local raw_dir="${warehouse_external_dir}/tpcds_raw"

  echo "Loading TPC-DS data"

  # Use a marker file to avoid regenerating the data if already present in
  # the warehouse. dsdgen is a serial process and somewhat slow. The tools are
  # only needed for generation, so the download is skipped as well.
  if [[ ! -e "${raw_dir}/generated" ]]; then
    generate_tpcds_data "${raw_dir}"
  fi

  # Wait until Impala comes up (it started in parallel with the data loader).
  if ! wait_for_impala; then
    die "Timed out waiting for impala to come up."
  fi
  "${IMPALA_SHELL_CMD[@]}" -f /opt/impala/sql/load_tpcds_parquet.sql

  # Load data into Kudu if the Kudu master is up.
  if ping -c1 kudu-master-1; then
    "${IMPALA_SHELL_CMD[@]}" -f /opt/impala/sql/load_tpcds_kudu.sql
  fi
}

if [[ $# -eq 0 ]]; then
  die "Must provide at least one argument."
fi

case "$1" in
  load_tpcds)
    load_tpcds
    ;;
  impala-shell)
    shift
    # Execute impala-shell with any extra arguments provided.
    exec impala-shell --protocol=hs2 --history_file=/tmp/impalahistory \
      -i "${IMPALAD_HOST}" "$@"
    ;;
  *)
    # Execute the provided input as a command.
    exec "$@"
    ;;
esac