blob: e8e72577ba102875221c093a6eead2a5163e631b [file]
#!/bin/bash
# Abort on the first failing command (-e) and treat a pipeline as failed when
# any of its stages fails (pipefail).
set -eo pipefail
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Absolute directory of this script; every path below is resolved against it,
# so the script does not depend on the caller's working directory.
CUR_DIR="$(
    cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd
)"

# Helper functions used below to decide which bootstrap groups apply.
source "${CUR_DIR}/bootstrap/bootstrap-groups.sh"

# Normalized form of the HIVE_BOOTSTRAP_GROUPS setting (empty string if unset).
BOOTSTRAP_GROUPS="$(bootstrap_normalize_groups "${HIVE_BOOTSTRAP_GROUPS:-}")"

# Remote locations built from the s3BucketName / s3Endpoint variables.
HIVE_AUXLIB_URL_PREFIX="https://${s3BucketName}.${s3Endpoint}/regression/docker/hive3"
PIPELINE_DATA_URL_PREFIX="https://${s3BucketName}.${s3Endpoint}/regression/datalake/pipeline_data"

printf '%s\n' "Prepare hive data with bootstrap groups: ${BOOTSTRAP_GROUPS}"
# Collect every *.tar.gz under data/ that bootstrap_archive_selected accepts
# for the active bootstrap groups, then unpack each archive into the directory
# that contains it.
extract_archives=()
while IFS= read -r -d '' archive_path; do
    # CUR_DIR is quoted inside the expansion so it is removed as a literal
    # prefix; unquoted, characters such as [ ] * ? in the path would be
    # interpreted as a glob pattern.
    relative_archive_path="${archive_path#"${CUR_DIR}"/}"
    if bootstrap_archive_selected "${BOOTSTRAP_GROUPS}" "${relative_archive_path}"; then
        extract_archives+=("${archive_path}")
    fi
done < <(find "${CUR_DIR}/data" -type f -name "*.tar.gz" -print0)

if (( ${#extract_archives[@]} > 0 )); then
    # One bash process per archive (-n1); the archive path arrives as $0.
    # LOAD_PARALLEL controls how many extractions run concurrently and falls
    # back to 1 when unset or empty, because xargs rejects an empty -P value.
    printf '%s\0' "${extract_archives[@]}" | xargs -0 -n1 -P"${LOAD_PARALLEL:-1}" bash -c '
        f="$0"
        echo "Extracting hive data $f"
        dir=$(dirname "$f")
        tar -xzf "$f" -C "$dir"
    '
fi
#######################################
# Download and unpack one remote data archive, but only when its target
# directory is missing or empty and the item is selected for the active
# bootstrap groups.
# Globals:
#   CUR_DIR                  (read) base directory for all relative paths
#   BOOTSTRAP_GROUPS         (read) passed to bootstrap_item_selected
#   PIPELINE_DATA_URL_PREFIX (read) base URL of the remote archives
# Arguments:
#   $1 - target directory, relative to CUR_DIR, that the archive populates
#   $2 - directory, relative to CUR_DIR, in which the archive is downloaded
#        and extracted
#   $3 - archive location relative to PIPELINE_DATA_URL_PREFIX
#   $4 - local file name of the downloaded archive; curl -O names the file
#        after the last URL path component, so it should match that of $3
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 when the item is skipped, already present, or freshly extracted.
#######################################
download_archive_if_missing() {
    local relative_dir="$1"
    local workdir="$2"
    local remote_path="$3"
    local archive_name="$4"
    local target_dir="${CUR_DIR}/${relative_dir}"

    if ! bootstrap_item_selected "${BOOTSTRAP_GROUPS}" "download_dir" "${relative_dir}"; then
        return 0
    fi

    # Nothing to do when the directory already holds at least one entry.
    if [[ -d "${target_dir}" ]] && [[ -n "$(find "${target_dir}" -mindepth 1 -print -quit 2>/dev/null)" ]]; then
        echo "${target_dir} exists and is non-empty, continue !"
        return 0
    fi

    echo "${target_dir} is missing or empty"
    rm -rf -- "${target_dir}"
    pushd "${CUR_DIR}/${workdir}" >/dev/null
    # --fail turns HTTP errors (404, 403, ...) into a non-zero exit status so
    # an error page is never saved and then handed to tar as the archive.
    curl --fail -O "${PIPELINE_DATA_URL_PREFIX}/${remote_path}"
    tar -xzf "${archive_name}"
    rm -rf -- "${archive_name}"
    popd >/dev/null
}
# Remote archives to fetch when their target directory is absent or empty.
# Each entry has four '|'-separated fields, in the order expected by
# download_archive_if_missing:
#   target dir | working dir | remote path | archive file name
download_specs=(
    "tpch1.db|.|tpch1.db.tar.gz|tpch1.db.tar.gz"
    "tvf_data|.|tvf_data.tar.gz|tvf_data.tar.gz"
    "data/multi_catalog/test_complex_types/data|data/multi_catalog/test_complex_types|multi_catalog/test_complex_types/data.tar.gz|data.tar.gz"
    "data/multi_catalog/test_compress_partitioned/data|data/multi_catalog/test_compress_partitioned|multi_catalog/test_compress_partitioned/data.tar.gz|data.tar.gz"
    "data/multi_catalog/test_wide_table/data|data/multi_catalog/test_wide_table|multi_catalog/test_wide_table/data.tar.gz|data.tar.gz"
    "data/tvf/test_hdfs_tvf_compression/test_data|data/tvf/test_hdfs_tvf_compression|test_hdfs_tvf_compression/test_data.tar.gz|test_data.tar.gz"
    "data/tvf/test_tvf/tvf|data/tvf/test_tvf|test_tvf/data.tar.gz|data.tar.gz"
    "data/multi_catalog/logs1_parquet/data|data/multi_catalog/logs1_parquet|multi_catalog/logs1_parquet/data.tar.gz|data.tar.gz"
)

# Walk the table in order, split each entry on '|' and hand the four fields
# to the download helper.
spec_fields=()
for spec_index in "${!download_specs[@]}"; do
    IFS='|' read -r -a spec_fields <<<"${download_specs[spec_index]}"
    download_archive_if_missing \
        "${spec_fields[0]}" \
        "${spec_fields[1]}" \
        "${spec_fields[2]}" \
        "${spec_fields[3]}"
done
# Download the auxiliary jars into auxlib/, reusing any jar that is already
# present there.
jars=(
    jdom-1.1.jar
    aliyun-java-sdk-core-3.4.0.jar
    aliyun-java-sdk-ecs-4.2.0.jar
    aliyun-java-sdk-ram-3.0.0.jar
    aliyun-java-sdk-sts-3.0.0.jar
    jindo-core-6.3.4.jar
    jindo-core-linux-el7-aarch64-6.3.4.jar
    jindo-sdk-6.3.4.jar
    aws-java-sdk-bundle-1.11.375.jar
    hadoop-huaweicloud-3.1.1-hw-54.5.jar
    hadoop-cos-3.1.0-8.3.22.jar
    cos_api-bundle-5.6.244.4.jar
    hadoop-aws-3.2.1.jar
    paimon-hive-connector-3.1-1.3-SNAPSHOT.jar
    gcs-connector-hadoop3-2.2.24-shaded.jar
)

# Quoted so a script location containing spaces or glob characters still works.
cd "${CUR_DIR}/auxlib"
for jar in "${jars[@]}"; do
    if [[ -f "${CUR_DIR}/auxlib/${jar}" ]]; then
        echo "Reuse cached hive aux jar ${jar}"
        continue
    fi

    # Download under a temporary name and rename only after curl succeeds.
    # Together with --fail (non-zero exit on HTTP errors) this ensures that an
    # error page or an interrupted transfer is never left behind under the
    # jar's real name, where the cache check above would reuse it forever.
    rm -f -- "${jar}.part"
    curl --fail -o "${jar}.part" "${HIVE_AUXLIB_URL_PREFIX}/${jar}" || {
        download_status=$?
        rm -f -- "${jar}.part"
        echo "Failed to download hive aux jar ${jar}" >&2
        exit "${download_status}"
    }
    mv -- "${jar}.part" "${jar}"
done