#!/bin/bash -xe
########################################################################
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Run and compare benchmarks.
#
# Allows for running comparisons either locally or as part of a
# Jenkins job which integrates with a historical stats DB.
# Run this script with -help for usage information.
########################################################################
# Fail the job if any part fails, even when piping through 'tee', etc.
set -o pipefail
################################################################
# Constants
################################################################
MODE_JENKINS="jenkins"
MODE_LOCAL="local"
# In Jenkins mode, get last 10 weeks of stats stored in the database.
STATS_DAYS_TO_FETCH=70
LOCAL_STATS_BASE="local-stats"
NUM_MT_TABLET_TESTS=5
MT_TABLET_TEST=mt-tablet-test
RPC_BENCH_TEST=RpcBenchBenchmark
CBTREE_TEST=ConcurrentBTreeScan
BLOOM_TEST=BloomfileBenchmark
MT_BLOOM_TEST=MultithreadedBloomfileBenchmark
WIRE_PROTOCOL_TEST=WireProtocolBenchmark
COMPACT_MERGE_BENCH=CompactBenchMerge
WITH_OVERLAP=Overlap
NO_OVERLAP=NoOverlap
MEMROWSET_BENCH=MemRowSetBenchmark
TS_INSERT_LATENCY=TabletServerInsertLatency
TS_8THREAD_BENCH=TabletServer8Threads
INSERT=Insert
SCAN_NONE_COMMITTED=ScanNoneCommitted
SCAN_ALL_COMMITTED=ScanAllCommitted
FS_SCANINSERT_MRS=FullStackScanInsertMRSOnly
FS_SCANINSERT_DISK=FullStackScanInsertWithDisk
DENSE_NODE_ITEST=DenseNodeItest
BLOCKING_QUEUE_SYMMETRIC_TEST=BlockingQueueSymmetric
BLOCKING_QUEUE_NON_SYMMETRIC_TEST=BlockingQueueNonSymmetric
GET_TABLE_SCHEMA_RPC=GetTableSchemaTestRpc
GET_TABLE_SCHEMA_DIRECT_CALL=GetTableSchemaTestDirectCall
GET_TABLE_LOCATIONS_RPC=GetTableLocationsTestRpc
GET_TABLE_LOCATIONS_DIRECT_CALL=GetTableLocationsTestDirectCall
SAME_TABLET_CONCURRENT_WRITES=SameTabletConcurrentWrites
CLIENT_METACACHE_PERF=ClientMetacachePerf
CLIENT_METACACHE_PERF_SYNTHETIC=ClientMetacachePerfSynthetic
LOG_DIR_NAME=build/latest/bench-logs
OUT_DIR_NAME=build/latest/bench-out
HTML_FILE="benchmarks.html"
# Most tests will run this many times.
NUM_SAMPLES=${NUM_SAMPLES:-10}
# Whether to run the benchmarks under 'perf stat', recording and plotting
# the statistics it collects. Of the 'perf' suite's sub-commands,
# 'perf stat' has the least overhead (e.g., see
# http://www.brendangregg.com/perf.html).
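# Set RUN_PERF_STAT=0 to disable the 'perf stat' collection.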
RUN_PERF_STAT=${RUN_PERF_STAT:-1}
if [ x"$RUN_PERF_STAT" = x"yes" ] || [ "$RUN_PERF_STAT" -ne 0 ] 2>/dev/null; then
PERF_STAT="perf stat"
else
PERF_STAT=""
fi
################################################################
# Global variables
################################################################
BENCHMARK_MODE=$MODE_JENKINS # we default to "jenkins mode"
BASE_DIR=$(pwd)
LOGDIR=""
OUTDIR=""
################################################################
# Functions
################################################################
usage_and_die() {
set +x
echo "Usage: $0 [-local [git-hash-1 [git-hash-2 ...]]]"
echo " When -local is specified, the performance of one or more git hashes is plotted."
echo " Otherwise, the script is run in 'Jenkins' mode and expects the"
echo " usual Jenkins environment variables to be defined, such as"
echo " BUILD_NUMBER and JOB_NAME."
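echo " Examples:"
echo "   $0 -local              # benchmark the current working tree"
echo "   $0 -local HEAD~1 HEAD  # compare the previous commit against HEAD"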
exit 1
}
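# Returns 0 (success) if benchmarks are to be run under 'perf stat'.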
perf_stat_enabled() {
if [ -z "$PERF_STAT" ]; then
return 1
fi
return 0
}
# Ensure the necessary performance tools are present.
check_tools() {
if perf_stat_enabled; then
# Check for 'perf'; assume the 'stat' sub-command is available whenever 'perf' is.
if ! which perf > /dev/null; then
return 1
fi
fi
return 0
}
ensure_cpu_scaling() {
$BASE_DIR/src/kudu/scripts/ensure_cpu_scaling.sh "$@"
}
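# Record a single measurement.
# Args: build identifier, test name, iteration number, measured value.
# In Jenkins mode the result goes to the historical MySQL stats DB; in
# local mode it is appended to a per-test TSV file under $OUTDIR.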
record_result() {
local BUILD_IDENTIFIER=$1
local TEST_NAME=$2
local ITER=$3
local VALUE=$4
if [ $BENCHMARK_MODE = $MODE_JENKINS ]; then
python write-jobs-stats-to-mysql.py $JOB_NAME $BUILD_IDENTIFIER $TEST_NAME $ITER $VALUE
else
local STATS_FILE="$OUTDIR/$LOCAL_STATS_BASE-$TEST_NAME.tsv"
# Append a tab-separated record: test name, value, build identifier.
printf '%s\t%s\t%s\n' "${TEST_NAME}" "${VALUE}" "${BUILD_IDENTIFIER}" >> "$STATS_FILE"
fi
}
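# Fetch historical stats for a test name, which may contain MySQL-style
# '%' wildcards. Emits tab-separated rows with a header line.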
load_stats() {
local TEST_NAME="$1"
if [ "$BENCHMARK_MODE" = "$MODE_JENKINS" ]; then
python get-job-stats-from-mysql.py $TEST_NAME $STATS_DAYS_TO_FETCH
else
# Convert MySQL wildcards to shell wildcards.
local TEST_NAME=${TEST_NAME//%/*}
local STATS_FILES=$(ls $OUTDIR/$LOCAL_STATS_BASE-$TEST_NAME.tsv)
# Emit a tab-separated header matching the stats-DB output format.
printf 'workload\truntime\tbuild_number\n'
for f in $STATS_FILES; do
cat $f
done
fi
}
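# Plot a stats TSV file to a PNG via jobs_runtime.R.
# Args: input TSV file, test name, y-axis label.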
write_img_plot() {
local input_file="$1"
local test_name="$2"
local y_axis_label="$3"
# Rscript fails when the input contains only the header row, so skip it.
if [ $(wc -l < "$input_file") -gt 1 ]; then
Rscript jobs_runtime.R "$input_file" "$test_name" "$y_axis_label"
fi
}
write_mttablet_img_plots() {
local INPUT_FILE=$1
local TEST_NAME=$2
xvfb-run Rscript mt-tablet-test-graph.R $INPUT_FILE $TEST_NAME
}
build_kudu() {
# PATH=<toolchain_stuff>:$PATH
export TOOLCHAIN=/mnt/toolchain/toolchain.sh
if [ -f "$TOOLCHAIN" ]; then
source $TOOLCHAIN
fi
# Build thirdparty
$BASE_DIR/build-support/enable_devtoolset.sh thirdparty/build-if-necessary.sh
# PATH=<thirdparty_stuff>:<toolchain_stuff>:$PATH
THIRDPARTY_BIN=$BASE_DIR/thirdparty/installed/common/bin
export PPROF_PATH=$THIRDPARTY_BIN/pprof
BUILD_TYPE=release
# Build Kudu
mkdir -p build/$BUILD_TYPE
pushd build/$BUILD_TYPE
rm -rf CMakeCache.txt CMakeFiles/
$BASE_DIR/build-support/enable_devtoolset.sh $THIRDPARTY_BIN/cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ../..
# clean up before we run
rm -Rf /tmp/kudutpch1-$UID
mkdir -p /tmp/kudutpch1-$UID
NUM_PROCS=$(grep -c ^processor /proc/cpuinfo)
make -j${NUM_PROCS} 2>&1 | tee build.log
popd
}
run_benchmarks() {
# Create output directories if needed.
mkdir -p "$LOGDIR"
mkdir -p "$OUTDIR"
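# Benchmark scenarios in the test binaries are gated on slow-test mode.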
export KUDU_ALLOW_SLOW_TESTS=true
# run all of the variations of mt-tablet-test
$PERF_STAT ./build/latest/bin/mt-tablet-test \
--gtest_filter=\*DoTestAllAtOnce\* \
--num_counter_threads=0 \
--tablet_test_flush_threshold_mb=32 \
--num_slowreader_threads=0 \
--flusher_backoff=1.0 \
--flusher_initial_frequency_ms=1000 \
--inserts_per_thread=1000000 \
&> $LOGDIR/${MT_TABLET_TEST}.log
# run rpc-bench test $NUM_SAMPLES times. 10 seconds per run
for i in $(seq 1 $NUM_SAMPLES); do
$PERF_STAT ./build/latest/bin/rpc-bench \
--gtest_filter=*BenchmarkCalls \
&> $LOGDIR/$RPC_BENCH_TEST$i.log
done
# run cbtree-test $NUM_SAMPLES times. 20 seconds per run
for i in $(seq 1 $NUM_SAMPLES); do
$PERF_STAT ./build/latest/bin/cbtree-test \
--gtest_filter=TestCBTree.TestScanPerformance \
&> $LOGDIR/${CBTREE_TEST}$i.log
done
# run bloomfile-test $NUM_SAMPLES times. ~3.3 seconds per run
for i in $(seq 1 $NUM_SAMPLES); do
$PERF_STAT ./build/latest/bin/bloomfile-test \
--gtest_filter=*Benchmark \
--benchmark_queries=10000000 \
--bloom_size_bytes=32768 \
--n_keys=100000 \
&> $LOGDIR/$BLOOM_TEST$i.log
done
# run mt-bloomfile-test $NUM_SAMPLES times. 20-30 seconds per run.
# We use 150M keys so the total bloom space is >150MB, and then
# set the cache capacity to be only 100M to ensure that the cache
# churns a lot.
for i in $(seq 1 $NUM_SAMPLES); do
$PERF_STAT ./build/latest/bin/mt-bloomfile-test \
--benchmark_queries=2000000 --bloom_size_bytes=4096 \
--n_keys=150000000 --block_cache_capacity_mb=100 \
&> $LOGDIR/$MT_BLOOM_TEST$i.log
done
# run wire_protocol-test $NUM_SAMPLES times.
#
# We run the non-null 10-column benchmark, selecting all rows.
for i in $(seq 1 $NUM_SAMPLES); do
$PERF_STAT ./build/latest/bin/wire_protocol-test \
--gtest_filter=*Benchmark/10_int64_non_null_sel_100pct \
&> $LOGDIR/$WIRE_PROTOCOL_TEST$i.log
done
# run compaction-test $NUM_SAMPLES times, 6 seconds each
for i in $(seq 1 $NUM_SAMPLES); do
$PERF_STAT ./build/latest/bin/compaction-test \
--gtest_filter=TestCompaction.BenchmarkMerge* \
&> $LOGDIR/${COMPACT_MERGE_BENCH}$i.log
done
# run memrowset benchmark $NUM_SAMPLES times, ~10 seconds per run
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/memrowset-test \
--roundtrip_num_rows=10000000 \
--gtest_filter=\*InsertCount\* \
&> $LOGDIR/${MEMROWSET_BENCH}$i.log
done
# Run single-threaded TS insert latency benchmark, 5-6 seconds per run
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/tablet_server-test \
--gtest_filter=*MicroBench* \
--single_threaded_insert_latency_bench_warmup_rows=1000 \
--single_threaded_insert_latency_bench_insert_rows=10000 \
&> $LOGDIR/${TS_INSERT_LATENCY}$i.log
done
# Run multi-threaded TS insert benchmark
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/tablet_server-stress-test \
--num_inserts_per_thread=30000 -runtime_secs=0 \
&> $LOGDIR/${TS_8THREAD_BENCH}$i.log
done
# Run full stack scan/insert test using MRS only, ~26s each
for i in $(seq 1 $NUM_SAMPLES) ; do
# TODO(aserbin): fix --perf_stat_scan and enable stats collection
./build/latest/bin/full_stack-insert-scan-test \
--gtest_filter=FullStackInsertScanTest.MRSOnlyStressTest \
--concurrent_inserts=50 \
--inserts_per_client=200000 \
--rows_per_batch=10000 \
&> $LOGDIR/${FS_SCANINSERT_MRS}$i.log
done
# Run full stack scan/insert test with disk, ~50s each
for i in $(seq 1 $NUM_SAMPLES) ; do
# TODO(aserbin): fix --perf_stat_scan and enable stats collection
./build/latest/bin/full_stack-insert-scan-test \
--gtest_filter=FullStackInsertScanTest.WithDiskStressTest \
--concurrent_inserts=50 \
--inserts_per_client=200000 \
--rows_per_batch=10000 \
&> $LOGDIR/${FS_SCANINSERT_DISK}$i.log
done
# Run dense storage node test, ~150s each.
#
# Needs to run as root for measure_startup_drop_caches.
# TODO(aserbin): add performance counter collection once it's possible to run
# external minicluster's components under 'perf stat'
for i in $(seq 1 $NUM_SAMPLES) ; do
sudo ./build/latest/bin/dense_node-itest \
--num_seconds=60 \
--num_tablets=1000 \
--measure_startup_sync \
--measure_startup_drop_caches \
--measure_startup_wait_for_bootstrap \
&> $LOGDIR/${DENSE_NODE_ITEST}$i.log
done
# Run BlockingQueue concurrency test with 3 writers and 3 readers,
# (i.e. symmetric in number of readers/writers), no non-blocking writers.
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/blocking_queue-test \
--gtest_filter=BlockingQueueMultiThreadPerfTest.RequestRates \
--num_blocking_writers=3 \
--num_blocking_readers=3 \
--num_non_blocking_writers=0 \
&> $LOGDIR/${BLOCKING_QUEUE_SYMMETRIC_TEST}$i.log
done
# Run BlockingQueue concurrency test with 3 writers
# (2 blocking, 1 non-blocking) and 1 reader.
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/blocking_queue-test \
--gtest_filter=BlockingQueueMultiThreadPerfTest.RequestRates \
--num_blocking_writers=2 \
--num_blocking_readers=1 \
--num_non_blocking_writers=1 \
&> $LOGDIR/${BLOCKING_QUEUE_NON_SYMMETRIC_TEST}$i.log
done
# Run ConcurrentGetTableSchemaTest.Rpc with and without authz.
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/master-test \
--gtest_filter='*/ConcurrentGetTableSchemaTest.Rpc/*' \
&> $LOGDIR/${GET_TABLE_SCHEMA_RPC}$i.log
done
# Run ConcurrentGetTableSchemaTest.DirectMethodCall with and without authz.
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/master-test \
--gtest_filter='*/ConcurrentGetTableSchemaTest.DirectMethodCall/*' \
&> $LOGDIR/${GET_TABLE_SCHEMA_DIRECT_CALL}$i.log
done
# Run GetTableLocationsBenchmark, with and without cache.
for capacity_mb in 0 32 ; do
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/table_locations-itest \
--gtest_filter=TableLocationsTest.GetTableLocationsBenchmark \
--rpc_num_service_threads=8 \
--benchmark_num_threads=12 \
--table_locations_cache_capacity_mb=$capacity_mb \
&> $LOGDIR/${GET_TABLE_LOCATIONS_RPC}_${capacity_mb}$i.log
done
done
# Run GetTableLocationsBenchmarkFunctionCall, with and without cache.
for capacity_mb in 0 32 ; do
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/table_locations-itest \
--gtest_filter=TableLocationsTest.GetTableLocationsBenchmarkFunctionCall \
--benchmark_num_threads=12 \
--table_locations_cache_capacity_mb=$capacity_mb \
&> $LOGDIR/${GET_TABLE_LOCATIONS_DIRECT_CALL}_${capacity_mb}$i.log
done
done
# Run SameTabletConcurrentWritesTest.InsertsOnly with 16 inserter threads.
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/same_tablet_concurrent_writes-itest \
--gtest_filter='SameTabletConcurrentWritesTest.InsertsOnly' \
--num_inserter_threads=16 \
&> $LOGDIR/${SAME_TABLET_CONCURRENT_WRITES}$i.log
done
# Run MetaCacheLookupStressTest.Perf scenario.
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/client-stress-test \
--gtest_filter='MetaCacheLookupStressTest.Perf' \
&> $LOGDIR/${CLIENT_METACACHE_PERF}$i.log
done
# Run MetaCacheLookupStressTest.PerfSynthetic scenario.
for i in $(seq 1 $NUM_SAMPLES) ; do
$PERF_STAT ./build/latest/bin/client-stress-test \
--gtest_filter='MetaCacheLookupStressTest.PerfSynthetic' \
&> $LOGDIR/${CLIENT_METACACHE_PERF_SYNTHETIC}$i.log
done
}
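# Extract selected counters from the 'perf stat' output captured in a
# benchmark log. The matched fragments typically look like (illustrative;
# exact wording varies across perf versions):
#   #    1.23  insn per cycle
#   #    7.89  CPUs utilized
#   #    1.87% of all branches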
parse_and_record_perf_stats() {
if ! perf_stat_enabled; then
return
fi
local build_id=$1
local test_id=$2
local iter=$3
local log=$4
local ins_per_cycle=$(grep -o '# .* ins.* per cycle' $log | awk '{print $2}')
record_result $build_id perf_stat_ins_per_cycle_${test_id} $iter $ins_per_cycle
local cpu_util=$(grep -o '# .* CPUs utilized' $log | awk '{print $2}')
record_result $build_id perf_stat_cpu_util_${test_id} $iter $cpu_util
local branch_misses=$(grep -o '# .* of all branches' $log | awk '{print $2}' | sed 's/%//g')
record_result $build_id perf_stat_branch_misses_${test_id} $iter $branch_misses
}
parse_and_record_all_results() {
local BUILD_IDENTIFIER="$1"
if [ -z "$BUILD_IDENTIFIER" ]; then
echo "ERROR: BUILD_IDENTIFIER not defined"
exit 1
fi
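# The Python/R/shell helpers referenced below live in src/kudu/scripts.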
pushd src
pushd kudu
pushd scripts
# parse the number of ms out of "[ OK ] MultiThreadedTabletTest/5.DoTestAllAtOnce (14966 ms)"
local MT_TABLET_TEST_TIMINGS="${MT_TABLET_TEST}-timings"
grep OK $LOGDIR/${MT_TABLET_TEST}.log | cut -d "(" -f2 | cut -d ")" -f1 | cut -d " " -f1 \
> $LOGDIR/${MT_TABLET_TEST_TIMINGS}.txt
# The tests are numbered 0 to NUM_MT_TABLET_TESTS, but file lines start at
# one, so add 1 to get the line number. The timing is in milliseconds, so
# divide it by 1000 to get seconds as a float, then record the result.
for i in $(seq 0 $NUM_MT_TABLET_TESTS); do
linenumber=$(( i + 1 ))
timing=`sed -n "${linenumber}p" $LOGDIR/${MT_TABLET_TEST_TIMINGS}.txt`
record_result $BUILD_IDENTIFIER MultiThreadedTabletTest_$i 1 `echo $timing / 1000 | bc -l`
done
parse_and_record_perf_stats $BUILD_IDENTIFIER $MT_TABLET_TEST 1 $LOGDIR/${MT_TABLET_TEST}.log
# parse out the real time from: "Time spent Insert 10000000 keys: real 16.438s user 16.164s sys 0.229s"
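# (The parse_real_out.sh helper extracts the seconds figure following "real".)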
for i in $(seq 1 $NUM_SAMPLES); do
local id=$CBTREE_TEST
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
real=`grep "Time spent Insert" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}Insert $i $real
real=`grep "not frozen" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}NotFrozen $i $real
real=`grep "(frozen" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}Frozen $i $real
done
# parse out the real time from "Time spent with overlap: real 0.557s user 0.546s sys 0.010s"
for i in $(seq 1 $NUM_SAMPLES); do
local id=$COMPACT_MERGE_BENCH
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
real=`grep "with overlap" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}${WITH_OVERLAP} $i $real
real=`grep "without overlap" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}${NO_OVERLAP} $i $real
done
# parse out time from MRS benchmarks
for i in $(seq 1 $NUM_SAMPLES); do
local id=$MEMROWSET_BENCH
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
real=`grep "Time spent Inserting" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}${INSERT} $i $real
real=`grep "Time spent Scanning rows where none" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}${SCAN_NONE_COMMITTED} $i $real
real=`grep "Time spent Scanning rows where all" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}${SCAN_ALL_COMMITTED} $i $real
done
# Parse out the real time from: "Time spent Running 10000000 queries: real 3.281s user 3.273s sys 0.000s"
for i in $(seq 1 $NUM_SAMPLES); do
local id=$BLOOM_TEST
local log=$LOGDIR/${id}${i}.log
local real=`grep "Time spent Running" $log | ./parse_real_out.sh`
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
record_result $BUILD_IDENTIFIER $id $i $real
done
# Parse out the real time from: "Time spent Running 2000000 queries: real 28.193s user 26.903s sys 1.032s"
# Many threads output their value; we keep only the last.
for i in $(seq 1 $NUM_SAMPLES); do
local id=$MT_BLOOM_TEST
local log=$LOGDIR/${id}${i}.log
local real=`grep "Time spent Running" $log | tail -n1 | ./parse_real_out.sh`
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
record_result $BUILD_IDENTIFIER $id $i $real
done
# Parse out row-wise rate of cycles/cell from:
# "Converting 10_int64_non_null to PB (method row-wise) row select rate 1: 30.196263 cycles/cell"
# Parse out the columnar rate of cycles/cell from:
# "Converting 10_int64_non_null to PB (method columnar) row select rate 1: 1.313369 cycles/cell"
for i in $(seq 1 $NUM_SAMPLES); do
local id=$WIRE_PROTOCOL_TEST
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
local real_rowwise=`grep "Converting.*to PB (method row-wise)" $log | sed 's|^.*: \([[:digit:].]*\) cycles/cell$|\1|'`
record_result $BUILD_IDENTIFIER $id $i $real_rowwise
local real_colwise=`grep "Converting.*to PB (method columnar)" $log | sed 's|^.*: \([[:digit:].]*\) cycles/cell$|\1|'`
record_result $BUILD_IDENTIFIER ${id}_columnar $i $real_colwise
done
# parse the rate out of: "I1009 15:00:30.023576 27043 rpc-bench.cc:108] Reqs/sec: 84404.4"
for i in $(seq 1 $NUM_SAMPLES); do
local id=$RPC_BENCH_TEST
local log=$LOGDIR/${id}${i}.log
local rate=`grep Reqs $log | cut -d ":" -f 5 | tr -d ' '`
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
record_result $BUILD_IDENTIFIER $id $i $rate
done
# parse latency numbers from single-threaded tserver benchmark
for i in $(seq 1 $NUM_SAMPLES); do
local id=$TS_INSERT_LATENCY
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
for metric in min mean percentile_95 percentile_99 percentile_99_9 ; do
val=$(grep "\"$metric\": " $log | awk '{print $2}' | sed -e 's/,//')
record_result $BUILD_IDENTIFIER ${id}_${metric} $i $val
done
done
# parse latency and throughput numbers from multi-threaded tserver benchmark
for i in $(seq 1 $NUM_SAMPLES); do
local id=$TS_8THREAD_BENCH
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
for metric in min mean percentile_95 percentile_99 percentile_99_9 ; do
val=$(grep "\"$metric\": " $log | awk '{print $2}' | sed -e 's/,//')
record_result $BUILD_IDENTIFIER ${id}_${metric}_latency $i $val
done
rate=$(grep -o 'Throughput.*' $log | awk '{print $2}')
record_result $BUILD_IDENTIFIER ${id}_throughput $i $rate
rate=$(grep -o 'CPU efficiency.*' $log | awk '{print $3}')
record_result $BUILD_IDENTIFIER ${id}_cpu_efficiency $i $rate
done
# parse scan timings for scans and inserts with MRS only
for i in $(seq 1 $NUM_SAMPLES); do
local id=$FS_SCANINSERT_MRS
local log=$LOGDIR/${id}${i}.log
insert=`grep "Time spent concurrent inserts" $log | ./parse_real_out.sh`
scan_full=`grep "Time spent full schema scan" $log | ./parse_real_out.sh`
scan_str=`grep "Time spent String projection" $log | ./parse_real_out.sh`
scan_int32=`grep "Time spent Int32 projection" $log | ./parse_real_out.sh`
scan_int64=`grep "Time spent Int64 projection" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}_insert $i $insert
record_result $BUILD_IDENTIFIER ${id}_scan_full $i $scan_full
record_result $BUILD_IDENTIFIER ${id}_scan_str $i $scan_str
record_result $BUILD_IDENTIFIER ${id}_scan_int32 $i $scan_int32
record_result $BUILD_IDENTIFIER ${id}_scan_int64 $i $scan_int64
done
# parse scan timings for scans and inserts with disk
for i in $(seq 1 $NUM_SAMPLES); do
local id=$FS_SCANINSERT_DISK
local log=$LOGDIR/${id}${i}.log
insert=`grep "Time spent concurrent inserts" $log | ./parse_real_out.sh`
scan_full=`grep "Time spent full schema scan" $log | ./parse_real_out.sh`
scan_str=`grep "Time spent String projection" $log | ./parse_real_out.sh`
scan_int32=`grep "Time spent Int32 projection" $log | ./parse_real_out.sh`
scan_int64=`grep "Time spent Int64 projection" $log | ./parse_real_out.sh`
record_result $BUILD_IDENTIFIER ${id}_insert $i $insert
record_result $BUILD_IDENTIFIER ${id}_scan_full $i $scan_full
record_result $BUILD_IDENTIFIER ${id}_scan_str $i $scan_str
record_result $BUILD_IDENTIFIER ${id}_scan_int32 $i $scan_int32
record_result $BUILD_IDENTIFIER ${id}_scan_int64 $i $scan_int64
done
# Parse timings and thread count
for i in $(seq 1 $NUM_SAMPLES); do
local id=$DENSE_NODE_ITEST
local log=$LOGDIR/${id}${i}.log
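# The metrics appear in glog-formatted lines, e.g. (illustrative):
#   I1009 15:00:30.000000 27043 dense_node-itest.cc:123] threads_running: 123
# so field 5 of a ':'-split line holds the value.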
num_threads=`grep threads_running $log | cut -d ":" -f 5 | tr -d ' '`
num_blocks=`grep log_block_manager_blocks_under_management $log | cut -d ":" -f 5 | tr -d ' '`
num_bytes=`grep log_block_manager_bytes_under_management $log | cut -d ":" -f 5 | tr -d ' '`
num_containers=`grep log_block_manager_containers $log | cut -d ":" -f 5 | tr -d ' '`
num_full_containers=`grep log_block_manager_full_containers $log | cut -d ":" -f 5 | tr -d ' '`
# For some inexplicable reason, these timing measurements are sometimes
# not emitted to the log file. Having spent a great deal of time trying to
# figure out why, let's just skip missing measurements.
#
# This may cause resulting graphs to look a little sharp, but that's better
# than failing the entire script.
set +e
time_restarting_tserver=`grep "Time spent restarting tserver" $log | ./parse_real_out.sh`
time_bootstrapping_tablets=`grep "Time spent bootstrapping tablets" $log | ./parse_real_out.sh`
set -e
record_result $BUILD_IDENTIFIER ${id}_num_threads $i $num_threads
record_result $BUILD_IDENTIFIER ${id}_num_blocks $i $num_blocks
record_result $BUILD_IDENTIFIER ${id}_num_bytes $i $num_bytes
record_result $BUILD_IDENTIFIER ${id}_num_containers $i $num_containers
record_result $BUILD_IDENTIFIER ${id}_num_full_containers $i $num_full_containers
if [ -n "$time_restarting_tserver" ]; then
record_result $BUILD_IDENTIFIER ${id}_time_restarting_tserver $i $time_restarting_tserver
fi
if [ -n "$time_bootstrapping_tablets" ]; then
record_result $BUILD_IDENTIFIER ${id}_time_bootstrapping_tablets $i $time_bootstrapping_tablets
fi
done
# Parse out total call rate and record the results for BlockingQueue test.
for id in $BLOCKING_QUEUE_SYMMETRIC_TEST $BLOCKING_QUEUE_NON_SYMMETRIC_TEST; do
for i in $(seq 1 $NUM_SAMPLES); do
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
local rate=$(grep -o 'total rate: .* calls/sec' $log | awk '{print $3}')
record_result $BUILD_IDENTIFIER ${id}_total_call_rate $i $rate
done
done
# Parse out request rate and record the results for ConcurrentGetTableSchemaTest test.
for id in $GET_TABLE_SCHEMA_RPC $GET_TABLE_SCHEMA_DIRECT_CALL; do
for i in $(seq 1 $NUM_SAMPLES); do
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
rate=$(grep -o 'GetTableSchema .*: .* req/sec (authz disabled)' $log | awk '{print $3}')
record_result $BUILD_IDENTIFIER ${id}_no_authz_req_rate $i $rate
rate=$(grep -o 'GetTableSchema .*: .* req/sec (authz enabled)' $log | awk '{print $3}')
record_result $BUILD_IDENTIFIER ${id}_authz_req_rate $i $rate
done
done
# Parse out request rate and record the results for GetTableLocationsBenchmark{FunctionCall}.
for capacity_mb in 0 32 ; do
for i in $(seq 1 $NUM_SAMPLES); do
local id=${GET_TABLE_LOCATIONS_RPC}_${capacity_mb}
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER ${id} $i $log
rate=$(grep -o 'GetTableLocations RPC: .* req/sec' $log | awk '{print $3}')
record_result $BUILD_IDENTIFIER ${id}_req_rate $i $rate
local id=${GET_TABLE_LOCATIONS_DIRECT_CALL}_${capacity_mb}
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER ${id} $i $log
rate=$(grep -o 'GetTableLocations function call: .* req/sec' $log | awk '{print $4}')
record_result $BUILD_IDENTIFIER ${id}_req_rate $i $rate
done
done
for i in $(seq 1 $NUM_SAMPLES); do
local id=$SAME_TABLET_CONCURRENT_WRITES
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
rate=$(grep -o 'write RPC request rate: .* req/sec' $log | awk '{print $5}')
record_result $BUILD_IDENTIFIER ${id}_req_rate $i $rate
overflows=$(grep -o 'total count of RPC queue overflows: .*' $log | awk '{print $7}')
record_result $BUILD_IDENTIFIER ${id}_overflows $i $overflows
done
for i in $(seq 1 $NUM_SAMPLES); do
local id=$CLIENT_METACACHE_PERF
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
local time_per_row=$(grep -o 'Time per row: .* ms' $log | awk '{print $4}')
record_result $BUILD_IDENTIFIER ${id}_time_per_row $i $time_per_row
done
for i in $(seq 1 $NUM_SAMPLES); do
local id=$CLIENT_METACACHE_PERF_SYNTHETIC
local log=$LOGDIR/${id}${i}.log
parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
local time_per_row=$(grep -o 'Time per row: .* ms' $log | awk '{print $4}')
record_result $BUILD_IDENTIFIER ${id}_time_per_row $i $time_per_row
done
popd
popd
popd
}
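# Plot YCSB stats for a given workload and phase ('load' or 'run'):
# overall runtime and throughput, plus per-operation latency percentiles.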
generate_ycsb_plots() {
local WORKLOAD=$1
local PHASE=$2
METRIC_NAME=ycsb-$PHASE-$WORKLOAD
# first plot the overall stats for that phase
OVERALL_FILENAME=$METRIC_NAME-OVERALL
load_stats $OVERALL_FILENAME-runtime_ms > $OUTDIR/$OVERALL_FILENAME-runtime_ms.tsv
write_img_plot $OUTDIR/$OVERALL_FILENAME-runtime_ms.tsv $OVERALL_FILENAME-runtime_ms
load_stats $OVERALL_FILENAME-throughput_ops_sec > $OUTDIR/$OVERALL_FILENAME-throughput_ops_sec.tsv
write_img_plot $OUTDIR/$OVERALL_FILENAME-throughput_ops_sec.tsv $OVERALL_FILENAME-throughput_ops_sec
# now plot the individual operations
OPS="INSERT UPDATE READ"
for op in $OPS; do
OP_FILENAME=$METRIC_NAME-$op
load_stats $OP_FILENAME-average_latency_us > $OUTDIR/$OP_FILENAME-average_latency_us.tsv
write_img_plot $OUTDIR/$OP_FILENAME-average_latency_us.tsv $OP_FILENAME-average_latency_us
load_stats $OP_FILENAME-95th_latency_ms > $OUTDIR/$OP_FILENAME-95th_latency_ms.tsv
write_img_plot $OUTDIR/$OP_FILENAME-95th_latency_ms.tsv $OP_FILENAME-95th_latency_ms
load_stats $OP_FILENAME-99th_latency_ms > $OUTDIR/$OP_FILENAME-99th_latency_ms.tsv
write_img_plot $OUTDIR/$OP_FILENAME-99th_latency_ms.tsv $OP_FILENAME-99th_latency_ms
done
}
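# Load stats for a test name (may contain '%' wildcards) and plot a PNG.
# Args: test name pattern, plot name, optional y-axis label
# (defaults to "runtime (sec)").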
load_and_generate_plot() {
local test_name=$1
local plot_name=$2
local y_axis_label=$3
if [ -z "$y_axis_label" ]; then
local y_axis_label="runtime (sec)"
fi
local plot_file=${OUTDIR}/${plot_name}.tsv
load_stats "$test_name" > $plot_file
write_img_plot $plot_file $plot_name "$y_axis_label"
}
load_and_generate_perf_stat_plots() {
local id="perf_stat_ins_per_cycle"
local plot_file=${OUTDIR}/${id}.tsv
load_stats "${id}_%" > $plot_file
write_img_plot $plot_file "perf-stat-instructions-per-CPU-cycle" "instructions per cycle"
local id="perf_stat_cpu_util"
local plot_file=${OUTDIR}/${id}.tsv
load_stats "${id}_%" > $plot_file
write_img_plot $plot_file "perf-stat-cpu-utilization" "CPUs utilized"
local id="perf_stat_branch_misses"
local plot_file=${OUTDIR}/${id}.tsv
load_stats "${id}_%" > $plot_file
write_img_plot $plot_file "perf-stat-branch-misses" "% of all branches"
}
load_stats_and_generate_plots() {
pushd src
pushd kudu
pushd scripts
load_and_generate_plot "%MultiThreadedTabletTest%" mt-tablet-test-runtime
load_and_generate_plot ${CBTREE_TEST}Insert cb-tree-insert
load_and_generate_plot ${CBTREE_TEST}NotFrozen cb-ctree-not-frozen
load_and_generate_plot ${CBTREE_TEST}Frozen cb-ctree-frozen
load_and_generate_plot "${COMPACT_MERGE_BENCH}%" compact-merge-bench
load_and_generate_plot "${MEMROWSET_BENCH}${INSERT}" memrowset-bench-insert
load_and_generate_plot "${MEMROWSET_BENCH}Scan%" memrowset-bench-scan
load_and_generate_plot "$BLOOM_TEST" bloom-test
load_and_generate_plot "$MT_BLOOM_TEST" mt-bloom-test
load_and_generate_plot "${WIRE_PROTOCOL_TEST}" wire-protocol-test "conversion rate (cycle/cell)"
load_and_generate_plot "${WIRE_PROTOCOL_TEST}_columnar" wire-protocol-test-columnar "conversion rate (cycle/cell)"
load_and_generate_plot "$RPC_BENCH_TEST" rpc-bench-test "request rate (RPC/sec)"
load_and_generate_plot "${TS_INSERT_LATENCY}%" ts-insert-latency "latency (usec)"
load_and_generate_plot "${TS_8THREAD_BENCH}%_latency" ts-8thread-insert-latency "latency (usec)"
load_and_generate_plot "${TS_8THREAD_BENCH}_throughput" ts-8thread-insert-throughput "throughput (row/sec)"
load_and_generate_plot "${TS_8THREAD_BENCH}_cpu_efficiency" ts-8thread-insert-cpu-efficiency "CPU efficiency (row/cpu-sec)"
load_and_generate_plot "${FS_SCANINSERT_MRS}%_insert" fs-mrsonly-insert
load_and_generate_plot "${FS_SCANINSERT_MRS}%_scan%" fs-mrsonly-scan
load_and_generate_plot "${FS_SCANINSERT_DISK}%_insert" fs-withdisk-insert
load_and_generate_plot "${FS_SCANINSERT_DISK}%_scan%" fs-withdisk-scan
load_and_generate_plot "${DENSE_NODE_ITEST}_time%" dense-node-bench-times
load_and_generate_plot "${DENSE_NODE_ITEST}_num%containers%" dense-node-bench-containers "number of log block containers"
load_and_generate_plot "${DENSE_NODE_ITEST}_num_blocks%" dense-node-bench-blocks "number of blocks"
load_and_generate_plot "${DENSE_NODE_ITEST}_num_threads%" dense-node-bench-threads "threads running"
load_and_generate_plot "${DENSE_NODE_ITEST}_num_bytes%" dense-node-bench-bytes "bytes under management"
load_and_generate_plot "${BLOCKING_QUEUE_SYMMETRIC_TEST}%" blocking-queue-symmetric "function call rate (req/sec)"
load_and_generate_plot "${BLOCKING_QUEUE_NON_SYMMETRIC_TEST}%" blocking-queue-non-symmetric "function call rate (req/sec)"
load_and_generate_plot "${GET_TABLE_SCHEMA_RPC}%_req_rate" get-table-schema-rpc "RPC rate (req/sec)"
load_and_generate_plot "${GET_TABLE_SCHEMA_DIRECT_CALL}_authz_req_rate" get-table-schema-dc-authz "function call rate (req/sec)"
load_and_generate_plot "${GET_TABLE_SCHEMA_DIRECT_CALL}_no_authz_req_rate" get-table-schema-dc-no-authz "function call rate (req/sec)"
load_and_generate_plot "${GET_TABLE_LOCATIONS_RPC}%_req_rate" get-table-locations-rpc
load_and_generate_plot "${GET_TABLE_LOCATIONS_DIRECT_CALL}%_req_rate" get-table-locations-dc
load_and_generate_plot "${SAME_TABLET_CONCURRENT_WRITES}_req_rate" same-tablet-concurrent-writes-rate "RPC rate (req/sec)"
load_and_generate_plot "${SAME_TABLET_CONCURRENT_WRITES}_overflows" same-tablet-concurrent-writes-overflows "RPC queue overflows (total count)"
load_and_generate_plot "${CLIENT_METACACHE_PERF}_time_per_row" client-metacache-lookup-perf "per-row apply time (sec)"
load_and_generate_plot "${CLIENT_METACACHE_PERF_SYNTHETIC}_time_per_row" client-metacache-lookup-perf-synthetic "per-row lookup time (sec)"
# Generate all the pngs for all the mt-tablet tests
for i in $(seq 0 $NUM_MT_TABLET_TESTS); do
cat $LOGDIR/${MT_TABLET_TEST}.log | ./graph-metrics.py MultiThreadedTabletTest/$i > $OUTDIR/test$i.tsv
# Don't bail on failure; a single missing graph shouldn't fail the whole run.
write_mttablet_img_plots $OUTDIR/test$i.tsv test$i || true
done
# Generate performance counters plots for all tests run under 'perf stat'.
load_and_generate_perf_stat_plots
if [ "${BENCHMARK_MODE}" = "${MODE_JENKINS}" ]; then
################################################################
# Plot the separately-recorded TPCH and YCSB graphs as well
# (only for Jenkins)
################################################################
# TPC-H 1 runs separately, let's just get those graphs
load_and_generate_plot query_1_1gb tpch1-query
load_and_generate_plot insert_1gb tpch1-insert
# YCSB which runs the 5nodes_workload on a cluster
# First we process the loading phase
generate_ycsb_plots 5nodes_workload load
# Then the running phase
generate_ycsb_plots 5nodes_workload run
fi
# Move all the pngs to OUT_DIR.
mv *.png $OUTDIR/
# Generate an HTML file aggregating the PNGs.
# Mostly for local usage, but somewhat useful to check the Jenkins runs too.
pushd $OUTDIR/
echo -n > "$OUTDIR/$HTML_FILE"
echo "<title>Kudu Benchmarks</title>" >> "$OUTDIR/$HTML_FILE"
echo "<h1 align=center>Kudu Benchmarks</h1>" >> "$OUTDIR/$HTML_FILE"
for png in *.png; do
echo "<img src=$png><br>" >> "$OUTDIR/$HTML_FILE"
done
popd
popd
popd
popd
}
build_run_record() {
local BUILD_IDENTIFIER=$1
build_kudu
run_benchmarks
parse_and_record_all_results "$BUILD_IDENTIFIER"
}
git_checkout() {
local GIT_HASH=$1
git checkout $GIT_HASH
}
run() {
# Parse command-line options.
if [ -n "$1" ]; then
[ "$1" = "-local" ] || usage_and_die
shift
BENCHMARK_MODE=$MODE_LOCAL
# If no hashes are provided, run against the current HEAD.
if [ -z "$1" ]; then
build_run_record "working_tree"
else
# Convert the passed-in git refs into their hashes.
# This allows you to use "HEAD~3 HEAD" as arguments
# and end up with those being evaluated with regard to
# the _current_ branch, instead of evaluating the second
# "HEAD" after checking out the first.
local ref
local hashes
for ref in "$@" ; do
hashes="$hashes $(git rev-parse "$ref")"
done
set $hashes
while [ -n "$1" ]; do
local GIT_HASH="$1"
shift
git_checkout "$GIT_HASH"
build_run_record "$GIT_HASH"
done
fi
else
[ -n "$BUILD_NUMBER" ] || usage_and_die
build_run_record "$BUILD_NUMBER"
fi
# The last step is the same for both modes.
load_stats_and_generate_plots
}
################################################################
# main
################################################################
# Figure out where we are, store in global variables.
LOGDIR="$BASE_DIR/$LOG_DIR_NAME"
OUTDIR="$BASE_DIR/$OUT_DIR_NAME"
# Ensure we are in KUDU_HOME
if [ ! -f "$BASE_DIR/LICENSE.txt" ]; then
echo "Error: must run from top of Kudu source tree"
usage_and_die
fi
if ! check_tools; then
echo "Error: could not find necessary tools to run benchmarks"
exit 1
fi
# Set up environment.
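# Cap resident set size at ~3 GB ('ulimit -m' takes KB).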
ulimit -m $((3000 * 1000))
ulimit -c unlimited # gather core dumps
# Set CPU governor, and restore it on exit.
old_governor=$(ensure_cpu_scaling performance)
restore_governor() {
ensure_cpu_scaling $old_governor >/dev/null
}
trap restore_governor EXIT
# Kick off the benchmark script.
run "$@"
exit 0