blob: bcac551e883f1fc070e1bac467c44f6c26116428 [file] [log] [blame]
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
##############################################################
# This script is used to generate TPC-H data set
##############################################################
# Abort on the first failing command and on a failure anywhere in a pipeline.
set -eo pipefail

# Resolve the absolute directory that contains this script so the paths below
# do not depend on the caller's working directory.
#   - `--` keeps a script path that starts with '-' from being parsed as an
#     option by dirname/cd.
#   - `>/dev/null` discards the directory name that cd prints when it resolves
#     the target through CDPATH; otherwise it would be captured together with
#     the output of pwd.
#   - `&&` makes the substitution (and, via `set -e`, the script) fail when the
#     directory cannot be entered, instead of silently returning the caller's
#     current directory.
ROOT=$(dirname -- "$0")
ROOT=$(cd -- "${ROOT}" >/dev/null && pwd)

CURDIR="${ROOT}"
# Directory in which the dbgen binary is looked up and executed.
TPCH_DBGEN_DIR="${CURDIR}/TPC-H_Tools_v3.0.0/dbgen/"
# Directory the generated *.tbl* files are moved into.
TPCH_DATA_DIR="${CURDIR}/tpch-data/"
#######################################
# Print the command-line help text and terminate the script.
# Arguments: none
# Outputs:   help text on stdout (framed by one empty line before and after)
# Returns:   never returns; exits the shell with status 1
#######################################
usage() {
  local -r self=$0
  # One argument per output line; the empty first and last arguments produce
  # the blank lines surrounding the help text.
  printf '%s\n' \
    '' \
    "Usage: ${self} <options>" \
    'Optional options:' \
    '-s scale factor, default is 100' \
    '-c parallelism to generate data of (lineitem, orders, partsupp) table, default is 10' \
    'Eg.' \
    "${self} generate data using default value." \
    "${self} -s 10 generate data with scale factor 10." \
    "${self} -s 10 -c 5 generate data with scale factor 10. And using 5 threads to generate data concurrently." \
    ''
  exit 1
}
# Parse the command line with getopt(1). Recognised short options:
#   -h          show the help text
#   -s <value>  scale factor
#   -c <value>  parallelism
# getopt normalises the arguments and terminates the option list with `--`.
OPTS=$(getopt \
  -n "$0" \
  -o 'hs:c:' \
  -- "$@")
# getopt emits a shell-quoted string; `eval set --` loads it back into the
# positional parameters.
eval set -- "${OPTS}"

# Defaults, used when the corresponding option is not given. Running the
# script without any argument is valid and uses these values.
SCALE_FACTOR=100
PARALLEL=10
HELP=0

while true; do
  case "$1" in
  -h)
    HELP=1
    shift
    ;;
  -s)
    SCALE_FACTOR=$2
    shift 2
    ;;
  -c)
    PARALLEL=$2
    shift 2
    ;;
  --)
    # End of options; anything after it is left in the positional parameters.
    shift
    break
    ;;
  *)
    echo "Internal error"
    exit 1
    ;;
  esac
done

if [[ ${HELP} -eq 1 ]]; then
  # usage prints the help text and terminates the script itself.
  usage
fi

# The parallelism drives `seq 1 N` loops further down. Anything that is not a
# positive integer would make those loops run zero times and silently produce
# no data for the chunked tables, so reject it here.
if ! [[ "${PARALLEL}" =~ ^0*[1-9][0-9]*$ ]]; then
  echo "Invalid parallelism '${PARALLEL}': -c requires a positive integer" >&2
  exit 1
fi
echo "Scale Factor: ${SCALE_FACTOR}"
echo "Parallelism: ${PARALLEL}"

# check if dbgen exists
if [[ ! -f ${TPCH_DBGEN_DIR}/dbgen ]]; then
  echo "${TPCH_DBGEN_DIR}/dbgen does not exist. Run build-tpch-dbgen.sh first to build it first."
  exit 1
fi

# Never write into an existing output directory; the user has to remove it.
if [[ -d ${TPCH_DATA_DIR}/ ]]; then
  echo "${TPCH_DATA_DIR} exists. Remove it before generating data"
  exit 1
fi
mkdir "${TPCH_DATA_DIR}"/

#######################################
# Generate one table with a single dbgen process.
# Globals:   TPCH_DBGEN_DIR, SCALE_FACTOR (read)
# Arguments: $1 - table name shown in the progress message
#            $2 - table selector handed to `dbgen -T`
# Outputs:   progress message on stdout
# Returns:   exit status of dbgen
#######################################
gen_table() {
  local name=$1
  local selector=$2
  echo "Begin to generate data for table: ${name}"
  "${TPCH_DBGEN_DIR}"/dbgen -f -s "${SCALE_FACTOR}" -T "${selector}"
}

#######################################
# Generate one table in PARALLEL chunks, one background dbgen process per
# chunk, and wait for all of them.
# Globals:   TPCH_DBGEN_DIR, SCALE_FACTOR, PARALLEL (read)
# Arguments: $1 - table name shown in the progress message
#            $2 - table selector handed to `dbgen -T`
# Outputs:   progress message on stdout, failure details on stderr
# Returns:   0 if every chunk succeeded; 1 if any chunk failed or no chunk
#            could be started
#######################################
gen_table_parallel() {
  local name=$1
  local selector=$2
  local -a pids=()
  local i pid
  local failed=0

  echo "Begin to generate data for table: ${name}"
  for i in $(seq 1 "${PARALLEL}"); do
    "${TPCH_DBGEN_DIR}"/dbgen -f -s "${SCALE_FACTOR}" -T "${selector}" -C "${PARALLEL}" -S "${i}" &
    pids+=("$!")
  done

  # A failing seq (e.g. non-numeric PARALLEL) or a value below 1 yields an
  # empty loop; treat that as an error instead of reporting success.
  if [[ ${#pids[@]} -eq 0 ]]; then
    echo "No dbgen process was started for table ${name}; check the parallelism value '${PARALLEL}'" >&2
    return 1
  fi

  # A bare `wait` always returns 0, which would hide failed chunks. Waiting on
  # each PID yields that job's exit status. All jobs are reaped before
  # returning so that no dbgen keeps writing after the script has aborted.
  for pid in "${pids[@]}"; do
    if ! wait "${pid}"; then
      echo "dbgen (pid ${pid}) failed while generating table ${name}" >&2
      failed=1
    fi
  done
  return "${failed}"
}

# gen data
# dbgen is executed from inside its own directory; the *.tbl* files are
# collected from there once generation is complete.
cd "${TPCH_DBGEN_DIR}"
gen_table region r
gen_table nation n
gen_table supplier s
gen_table part P
gen_table customer c
# With `set -e` active, a non-zero return from these calls aborts the script
# before any partial data is moved.
gen_table_parallel partsupp S
gen_table_parallel orders O
gen_table_parallel lineitem L
cd -

# move data to ${TPCH_DATA_DIR}
mv "${TPCH_DBGEN_DIR}"/*.tbl* "${TPCH_DATA_DIR}"/

# check data
du -sh "${TPCH_DATA_DIR}"/*.tbl*