blob: 17da726dd08ece71a3de3f312668b6607359d57a [file] [log] [blame]
#!/bin/sh
# @@@ START COPYRIGHT @@@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# @@@ END COPYRIGHT @@@
#
##############################################################################
# This script installs a TPC-DS environment, to be used for Trafodion regression
# tests.
# invoke it with -h or --help to get help
# Before calling this script, the following environment variables need to be set up,
# or they need to be passed in through command line parameters:
# MY_LOCAL_SW_DIST (optional): Location for tar/zip files for this install
# MY_TPCDS_DATA_DIR : Directory where to unpack the TPC-DS data generator
# MY_LOG_FILE : Log file for error/tracing information
# MY_HDFS_CMD : Command to be used for HDFS shell (e.g. "hdfs" or "swhdfs")
# MY_HIVE_CMD : Command to be used for Hive (e.g. "hive" or "swhive")
TPCDS_MIRROR_URL=http://www.tpc.org/tpcds/dsgen/dsgen-download-files.asp?download_key=NaN
TPCDS_ZIP=tpcds_kit.zip
function usage {
cat <<EOF
Usage:
$0 [ options... ]
Options:
--unpackDir=<dir1> Directory where to unpack the zip files
--dataDir=<dir2> Directory where to store generated data files
--logFile=<file2> Log file for diagnostic messages
--hdfsCmd=<cmd1> HDFS command
--hiveCmd=<cmd2> Hive command
--force Force re-install
--check Check for show stoppers only and exit
-h | --help Display this help information
Set environment variable MY_LOCAL_SW_DIST to the directory of a local
copy of the TPC-DS zip file, if possible.
EOF
}
FORCE=""
CHECKONLY=""
while [ $# -gt 0 ];
do
case "$1" in
--unpackDir=*)
MY_TPCDS_UNPACK_DIR=${1#--unpackDir=}
;;
--dataDir=*)
MY_TPCDS_DATA_DIR=${1#--dataDir=}
;;
--logFile=*)
MY_LOG_FILE=${1#--logFile=}
;;
--hdfsCmd=*)
MY_HDFS_CMD=${1#--hdfsCmd=}
;;
--hiveCmd=*)
MY_HIVE_CMD=${1#--hiveCmd=}
;;
--force)
FORCE="true"
;;
--check)
CHECKONLY="true"
;;
-h)
usage
exit 1
;;
--help)
usage
exit 1
;;
**)
usage
echo "Unexpected argument: $1"
exit 100
;;
esac
shift
done
#####################################################
# Check the last thing done by this script
# (or most recently added item)
if [[ "$FORCE" != "true" ]]
then
$MY_HIVE_CMD -e 'describe store_orc;' >/dev/null 2>&1
if (( $? == 0 ))
then
echo "Hive table store_orc found. Skipping TPC-DS set-up."
exit 0
fi
fi
#####################################################
if [ ! -f $MY_LOCAL_SW_DIST/${TPCDS_ZIP} ]; then
# Right now there is no URL to download this tool automatically
# Please download the tpcds_kit.zip file from"
# "http://www.tpc.org/tpcds/dsgen-download-request.asp"
# "and place it into a directory pointed to by the MY_LOCAL_SW_DIST"
# "environment variable. Then retry installing. Sorry, this is due"
# "to the TPC wanting your email address."
echo "The testware tpcds_kit.zip does not exist and will not be installed"
echo "This testware is needed to run developer HIVE regression tests"
exit 99
fi
if [ -n "$CHECKONLY" ]; then
echo "Requisite files exist, script should succeed if called."
exit 0
fi
echo "Downloading TPC-DS setup tools..."
if [ -z "$MY_TPCDS_UNPACK_DIR" ]; then
MY_TPCDS_UNPACK_DIR=.
cd -P .
else
mkdir $MY_TPCDS_UNPACK_DIR
cd $MY_TPCDS_UNPACK_DIR
fi
if [ -f $MY_LOCAL_SW_DIST/${TPCDS_ZIP} ]; then
cp $MY_LOCAL_SW_DIST/${TPCDS_ZIP} .
else
# Right now there is no URL to download this tool automatically
# curl --output ${TPCDS_ZIP} ${TPCDS_MIRROR_URL}
exit 99
fi
echo "Unzipping TPC-DS kit ${TPCDS_ZIP} into ${PWD} and making the tools..."
unzip ${TPCDS_ZIP} >/dev/null
if [ -d tools ]; then
# older version of tpcds_kit
cd tools
else
# more recent version
cd TPCDSVersion*/tools
fi
echo "Making objects for TPC-DS database generator..." | tee -a ${MY_LOG_FILE}
make clean >>${MY_LOG_FILE} 2>&1
make >>${MY_LOG_FILE} 2>&1
echo "Generating the data..." | tee -a ${MY_LOG_FILE}
mkdir $MY_TPCDS_DATA_DIR
SCALE=1
FORCE=Y
export PATH=.:$PATH
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table store_sales >>${MY_LOG_FILE} 2>&1
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table date_dim >>${MY_LOG_FILE} 2>&1
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table time_dim >>${MY_LOG_FILE} 2>&1
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table item >>${MY_LOG_FILE} 2>&1
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table customer >>${MY_LOG_FILE} 2>&1
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table customer_demographics >>${MY_LOG_FILE} 2>&1
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table household_demographics >>${MY_LOG_FILE} 2>&1
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table customer_address >>${MY_LOG_FILE} 2>&1
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table store >>${MY_LOG_FILE} 2>&1
./dsdgen -force $FORCE -dir $MY_TPCDS_DATA_DIR -scale $SCALE -table promotion >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/date_dim >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/time_dim >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/item >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/customer >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/customer_demographics >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/household_demographics >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/customer_address >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/store >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/promotion >>${MY_LOG_FILE} 2>&1
$MY_HDFS_CMD dfs -mkdir -p /hive/tpcds/store_sales >>${MY_LOG_FILE} 2>&1
cd $MY_TPCDS_DATA_DIR
which iconv >>${MY_LOG_FILE} 2>&1
if (( $? != 0 ))
then
echo "iconv utility not available. The data will be in ISO-8859-1 format."
else
echo "Converting the data into UTF-8 format ..."
for t in date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion store_sales
do
iconv -f ISO-8859-1 -t UTF-8 -o ${t}.utf8.dat ${t}.dat >>${MY_LOG_FILE} 2>&1
mv ${t}.utf8.dat ${t}.dat
done
fi
echo "Copying generated data to HDFS..."
for t in date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion store_sales
do
$MY_HDFS_CMD dfs -put ${t}.dat /hive/tpcds/${t} >>${MY_LOG_FILE} 2>&1
done
$MY_HDFS_CMD dfs -ls -R /hive/tpcds/*/*.dat >>${MY_LOG_FILE} 2>&1
echo "Creating tables in Hive..." | tee -a ${MY_LOG_FILE}
$MY_HIVE_CMD <<EOF >>${MY_LOG_FILE} 2>&1
-- store_sales star only
create external table store_sales
(
ss_sold_date_sk int,
ss_sold_time_sk int,
ss_item_sk int,
ss_customer_sk int,
ss_cdemo_sk int,
ss_hdemo_sk int,
ss_addr_sk int,
ss_store_sk int,
ss_promo_sk int,
ss_ticket_number int,
ss_quantity int,
ss_wholesale_cost float,
ss_list_price float,
ss_sales_price float,
ss_ext_discount_amt float,
ss_ext_sales_price float,
ss_ext_wholesale_cost float,
ss_ext_list_price float,
ss_ext_tax float,
ss_coupon_amt float,
ss_net_paid float,
ss_net_paid_inc_tax float,
ss_net_profit float
)
row format delimited fields terminated by '|'
location '/hive/tpcds/store_sales';
create external table customer_demographics
(
cd_demo_sk int,
cd_gender string,
cd_marital_status string,
cd_education_status string,
cd_purchase_estimate int,
cd_credit_rating string,
cd_dep_count int,
cd_dep_employed_count int,
cd_dep_college_count int
)
row format delimited fields terminated by '|'
location '/hive/tpcds/customer_demographics';
create external table date_dim
(
d_date_sk int,
d_date_id string,
d_date timestamp,
d_month_seq int,
d_week_seq int,
d_quarter_seq int,
d_year int,
d_dow int,
d_moy int,
d_dom int,
d_qoy int,
d_fy_year int,
d_fy_quarter_seq int,
d_fy_week_seq int,
d_day_name string,
d_quarter_name string,
d_holiday string,
d_weekend string,
d_following_holiday string,
d_first_dom int,
d_last_dom int,
d_same_day_ly int,
d_same_day_lq int,
d_current_day string,
d_current_week string,
d_current_month string,
d_current_quarter string,
d_current_year string
)
row format delimited fields terminated by '|'
location '/hive/tpcds/date_dim';
create external table time_dim
(
t_time_sk int,
t_time_id string,
t_time int,
t_hour int,
t_minute int,
t_second int,
t_am_pm string,
t_shift string,
t_sub_shift string,
t_meal_time string
)
row format delimited fields terminated by '|'
location '/hive/tpcds/time_dim';
create external table item
(
i_item_sk int,
i_item_id string,
i_rec_start_date timestamp,
i_rec_end_date timestamp,
i_item_desc string,
i_current_price float,
i_wholesale_cost float,
i_brand_id int,
i_brand string,
i_class_id int,
i_class string,
i_category_id int,
i_category string,
i_manufact_id int,
i_manufact string,
i_size string,
i_formulation string,
i_color string,
i_units string,
i_container string,
i_manager_id int,
i_product_name string
)
row format delimited fields terminated by '|'
location '/hive/tpcds/item';
create external table store
(
s_store_sk int,
s_store_id string,
s_rec_start_date timestamp,
s_rec_end_date timestamp,
s_closed_date_sk int,
s_store_name string,
s_number_employees int,
s_floor_space int,
s_hours string,
s_manager string,
s_market_id int,
s_geography_class string,
s_market_desc string,
s_market_manager string,
s_division_id int,
s_division_name string,
s_company_id int,
s_company_name string,
s_street_number string,
s_street_name string,
s_street_type string,
s_suite_number string,
s_city string,
s_county string,
s_state string,
s_zip string,
s_country string,
s_gmt_offset float,
s_tax_precentage float
)
row format delimited fields terminated by '|'
location '/hive/tpcds/store';
create external table customer
(
c_customer_sk int,
c_customer_id string,
c_current_cdemo_sk int,
c_current_hdemo_sk int,
c_current_addr_sk int,
c_first_shipto_date_sk int,
c_first_sales_date_sk int,
c_salutation string,
c_first_name string,
c_last_name string,
c_preferred_cust_flag string,
c_birth_day int,
c_birth_month int,
c_birth_year int,
c_birth_country string,
c_login string,
c_email_address string,
c_last_review_date string
)
row format delimited fields terminated by '|'
location '/hive/tpcds/customer';
create external table promotion
(
p_promo_sk int,
p_promo_id string,
p_start_date_sk int,
p_end_date_sk int,
p_item_sk int,
p_cost float,
p_response_target int,
p_promo_name string,
p_channel_dmail string,
p_channel_email string,
p_channel_catalog string,
p_channel_tv string,
p_channel_radio string,
p_channel_press string,
p_channel_event string,
p_channel_demo string,
p_channel_details string,
p_purpose string,
p_discount_active string
)
row format delimited fields terminated by '|'
location '/hive/tpcds/promotion';
create external table household_demographics
(
hd_demo_sk int,
hd_income_band_sk int,
hd_buy_potential string,
hd_dep_count int,
hd_vehicle_count int
)
row format delimited fields terminated by '|'
location '/hive/tpcds/household_demographics';
create external table customer_address
(
ca_address_sk int,
ca_address_id string,
ca_street_number string,
ca_street_name string,
ca_street_type string,
ca_suite_number string,
ca_city string,
ca_county string,
ca_state string,
ca_zip string,
ca_country string,
ca_gmt_offset float,
ca_location_type string
)
row format delimited fields terminated by '|'
location '/hive/tpcds/customer_address';
create table store_orc stored as orc as select * from store;
quit;
EOF