| #!/usr/bin/env bash |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| ############################################################## |
| # This script is used to load clickbench data into Doris |
| ############################################################## |
| |
# Abort on the first failing command or on any failing pipeline stage.
set -eo pipefail

# Absolute path of the directory that contains this script.
# CDPATH is cleared for the `cd` so it can neither redirect the lookup nor make
# `cd` print a path into the captured output; `&&` ensures a failed `cd` aborts
# the script instead of silently yielding the caller's working directory.
ROOT=$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)

CURDIR=${ROOT}
# Directory that holds (or will receive) the hits_split* data files.
# The trailing slash is kept from the original definition.
DATA_DIR=$CURDIR/
# Example of pointing DATA_DIR at another location:
# DATA_DIR=/mnt/disk1/stephen/data/clickbench
| |
# Print the help text for this script on stdout, then terminate the shell
# with exit status 1.
usage() {
  local -a help_lines=(
    ""
    "This script is used to load ClickBench data,"
    "will use mysql client to connect Doris server which is specified in conf/doris-cluster.conf file."
    "Usage: $0"
    ""
  )
  printf '%s\n' "${help_lines[@]}"
  exit 1
}
| |
# Parse the command line with getopt. The only recognised option is -h.
# "$0" is quoted so a script path containing whitespace is passed to -n as a
# single word, and the short-option string is given once (a second -o would
# simply override the first).
OPTS=$(getopt \
  -n "$0" \
  -o 'h' \
  -- "$@")
# Re-install getopt's normalised, shell-quoted output as the positional
# parameters.
eval set -- "$OPTS"

HELP=0
while true; do
  case "$1" in
  -h)
    HELP=1
    shift
    ;;
  --)
    # End-of-options marker emitted by getopt.
    shift
    break
    ;;
  *)
    # getopt output should never contain anything else.
    echo "Internal error" >&2
    exit 1
    ;;
  esac
done

if [[ ${HELP} -eq 1 ]]; then
  usage
  # Defensive: stop here even if usage ever returns instead of exiting.
  exit
fi
| |
#######################################
# Abort the script when a required external tool cannot be run.
# Arguments:
#   $1 - probe command line (e.g. "curl --version"); split on whitespace
#   $2 - tool name used in the error message
# Outputs:
#   Whatever the probe itself prints; an error message on stderr when the
#   probe fails.
# Returns:
#   0 when the probe succeeds; otherwise exits the shell with status 1.
#######################################
check_prerequest() {
  local CMD=$1
  local NAME=$2
  local -a probe
  # Split the probe string into words explicitly so it is not subject to
  # pathname expansion the way an unquoted $CMD would be.
  read -r -a probe <<<"$CMD"
  if ! "${probe[@]}"; then
    # Name the tool that is actually missing rather than always blaming cURL.
    echo "$NAME is missing. This script depends on $NAME to load data to Doris." >&2
    exit 1
  fi
}
| |
# Make sure every external tool this script invokes can be run.
check_prerequest "mysql --version" "mysql"
check_prerequest "curl --version" "curl"
check_prerequest "wget --version" "wget"

# Load the cluster connection settings. The rest of the script reads FE_HOST,
# FE_HTTP_PORT, FE_QUERY_PORT, BE_HOST, BE_WEBSERVER_PORT, USER, PASSWORD and
# DB -- presumably all defined by this file (it is not visible from here).
# The path is quoted so an install directory containing spaces still works.
# shellcheck source=/dev/null
source "$CURDIR/conf/doris-cluster.conf"

echo "FE_HOST: $FE_HOST"
echo "FE_HTTP_PORT: $FE_HTTP_PORT"
echo "USER: $USER"
# Do not leak the secret into terminals or logs: show a mask when a password
# is configured and nothing when it is empty.
echo "PASSWORD: ${PASSWORD:+********}"
echo "DB: $DB"
| |
#######################################
# Advisory check of two Doris settings that affect large stream loads.
# Prints a recommendation when the FE's stream_load_default_timeout_second is
# below 3600 or the BE's streaming_load_max_mb is below 16000. Because the
# check is advisory only, a value that cannot be read produces a warning on
# stderr and is skipped instead of aborting the script.
# Globals (read):
#   FE_HOST, FE_QUERY_PORT, USER, PASSWORD, BE_HOST, BE_WEBSERVER_PORT
# Outputs:
#   "advise: ..." lines on stdout; "warn: ..." lines on stderr.
# Returns:
#   0
#######################################
function check_doirs_conf() {
  local fe_timeout be_max_mb

  # Query the FE through the mysql client. The password (when one is
  # configured) is handed over via MYSQL_PWD inside the substitution's
  # subshell so it neither appears on the command line nor leaks into the
  # caller's environment. awk consumes the whole stream (no early exit) so
  # mysql is never killed by SIGPIPE, which pipefail would report as failure.
  fe_timeout=$(
    if [[ -n "${PASSWORD:-}" ]]; then
      export MYSQL_PWD="$PASSWORD"
    fi
    mysql -h"$FE_HOST" -P"$FE_QUERY_PORT" -u"$USER" -e 'admin show frontend config' |
      awk '/stream_load_default_timeout_second/ && !seen {print $2; seen = 1}'
  ) || fe_timeout=""

  if [[ "$fe_timeout" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    if ((10#$fe_timeout < 3600)); then
      echo "advise: revise your Doris FE's conf to set 'stream_load_default_timeout_second=3600' or above"
    fi
  else
    echo "warn: unable to read 'stream_load_default_timeout_second' from FE ${FE_HOST}:${FE_QUERY_PORT}; skipping this check" >&2
  fi

  # Read the BE setting from its /varz page; curl's own diagnostics are
  # discarded on purpose because failure is reported by the warning below.
  be_max_mb=$(
    curl "${BE_HOST}:${BE_WEBSERVER_PORT}/varz" 2>/dev/null |
      awk -F'=' '/streaming_load_max_mb/ && !seen {gsub(/[[:space:]]/, "", $2); print $2; seen = 1}'
  ) || be_max_mb=""

  if [[ "$be_max_mb" =~ ^[0-9]+$ ]]; then
    if ((10#$be_max_mb < 16000)); then
      echo "advise: revise your Doris BE's conf to set 'streaming_load_max_mb=16000' or above and 'flush_thread_num_per_store=5' to speed up load."
    fi
  else
    echo "warn: unable to read 'streaming_load_max_mb' from BE ${BE_HOST}:${BE_WEBSERVER_PORT}; skipping this check" >&2
  fi
}
| |
#######################################
# Download any missing ClickBench split files, then stream-load all ten of
# them into the Doris table "hits".
# Globals (read):
#   DATA_DIR, USER, PASSWORD, FE_HOST, FE_HTTP_PORT, DB
# Outputs:
#   Progress messages on stdout; wget/curl output; errors on stderr.
# Returns:
#   0 on success, 1 when a download or a stream-load request fails.
#######################################
function load() {
  local base_url="https://doris-test-data.oss-cn-hongkong.aliyuncs.com/ClickBench"
  # Alternative endpoint kept from the original script:
  # https://doris-test-data.oss-cn-hongkong-internal.aliyuncs.com/ClickBench
  local columns="columns:WatchID,JavaEnable,Title,GoodEvent,EventTime,EventDate,CounterID,ClientIP,RegionID,UserID,CounterClass,OS,UserAgent,URL,Referer,IsRefresh,RefererCategoryID,RefererRegionID,URLCategoryID,URLRegionID,ResolutionWidth,ResolutionHeight,ResolutionDepth,FlashMajor,FlashMinor,FlashMinor2,NetMajor,NetMinor,UserAgentMajor,UserAgentMinor,CookieEnable,JavascriptEnable,IsMobile,MobilePhone,MobilePhoneModel,Params,IPNetworkID,TraficSourceID,SearchEngineID,SearchPhrase,AdvEngineID,IsArtifical,WindowClientWidth,WindowClientHeight,ClientTimeZone,ClientEventTime,SilverlightVersion1,SilverlightVersion2,SilverlightVersion3,SilverlightVersion4,PageCharset,CodeVersion,IsLink,IsDownload,IsNotBounce,FUniqID,OriginalURL,HID,IsOldCounter,IsEvent,IsParameter,DontCountHits,WithHash,HitColor,LocalEventTime,Age,Sex,Income,Interests,Robotness,RemoteIP,WindowName,OpenerName,HistoryLength,BrowserLanguage,BrowserCountry,SocialNetwork,SocialAction,HTTPError,SendTiming,DNSTiming,ConnectTiming,ResponseStartTiming,ResponseEndTiming,FetchTiming,SocialSourceNetworkID,SocialSourcePage,ParamPrice,ParamOrderID,ParamCurrency,ParamCurrencyID,OpenstatServiceName,OpenstatCampaignID,OpenstatAdID,OpenstatSourceID,UTMSource,UTMMedium,UTMCampaign,UTMContent,UTMTerm,FromTag,HasGCLID,RefererHash,URLHash,CLID"
  local i pid failed=0
  local -a pids=()

  echo "(1/2) prepare clickbench data file"
  for i in {0..9}; do
    # NOTE: a file that already exists is assumed to be complete; it is not
    # re-validated or resumed here.
    if [[ ! -f "$DATA_DIR/hits_split${i}" ]]; then
      echo "will download hits_split${i} to $DATA_DIR"
      # Downloads run in parallel; --directory-prefix writes straight into
      # DATA_DIR so the function never changes the caller's working directory.
      wget --continue --directory-prefix="$DATA_DIR" "${base_url}/hits_split${i}" &
      pids+=("$!")
    fi
  done

  echo "wait for download task done..."
  # Wait on each job individually: a bare `wait` discards the jobs' exit
  # statuses, so a failed download would go unnoticed.
  for pid in "${pids[@]}"; do
    wait "$pid" || failed=1
  done
  if ((failed)); then
    echo "error: at least one hits_split download failed; aborting load" >&2
    return 1
  fi

  echo "(2/2) load clickbench data file $DATA_DIR/hits_split[0-9] into Doris"
  for i in {0..9}; do
    printf '\nstart loading hits_split%s\n' "$i"
    # Credentials and URL are quoted so special characters in the password or
    # database name are passed through as single arguments.
    curl --location-trusted \
      -u "$USER:$PASSWORD" \
      -T "$DATA_DIR/hits_split${i}" \
      -H "$columns" \
      "http://$FE_HOST:$FE_HTTP_PORT/api/$DB/hits/_stream_load" || {
      echo "error: stream load request for hits_split${i} failed" >&2
      return 1
    }
  done
}
| |
# Entry point: run the advisory configuration check, perform the load, and
# report the elapsed wall-clock time in whole seconds.
printf '%s\n' "start..."
begin_ts=$(date +%s)
check_doirs_conf
load
finish_ts=$(date +%s)
elapsed=$((finish_ts - begin_ts))
printf '%s\n' "load cost time: ${elapsed} seconds"