blob: 0f89095434d1ffb36d7d84c794b6a7cb392a68ba [file] [log] [blame]
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#ifdef GFLAGS
#ifdef NUMA
#include <numa.h>
#include <numaif.h>
#endif
#ifndef OS_WIN
#include <unistd.h>
#endif
#include <fcntl.h>
#include <gflags/gflags.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <atomic>
#include <condition_variable>
#include <cstddef>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>
#include "db/db_impl.h"
#include "db/version_set.h"
#include "hdfs/env_hdfs.h"
#include "monitoring/histogram.h"
#include "monitoring/statistics.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/rate_limiter.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/utilities/object_registry.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/options_util.h"
#include "rocksdb/utilities/sim_cache.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/write_batch.h"
#include "util/cast_util.h"
#include "util/compression.h"
#include "util/crc32c.h"
#include "util/mutexlock.h"
#include "util/random.h"
#include "util/stderr_logger.h"
#include "util/string_util.h"
#include "util/testutil.h"
#include "util/transaction_test_util.h"
#include "util/xxhash.h"
#include "utilities/blob_db/blob_db.h"
#include "utilities/merge_operators.h"
#include "utilities/persistent_cache/block_cache_tier.h"
#ifdef OS_WIN
#include <io.h> // open/close
#endif
using GFLAGS::ParseCommandLineFlags;
using GFLAGS::RegisterFlagValidator;
using GFLAGS::SetUsageMessage;
DEFINE_string(
benchmarks,
"fillseq,"
"fillseqdeterministic,"
"fillsync,"
"fillrandom,"
"filluniquerandomdeterministic,"
"overwrite,"
"readrandom,"
"newiterator,"
"newiteratorwhilewriting,"
"seekrandom,"
"seekrandomwhilewriting,"
"seekrandomwhilemerging,"
"readseq,"
"readreverse,"
"compact,"
"compactall,"
"readrandom,"
"multireadrandom,"
"readseq,"
"readtocache,"
"readreverse,"
"readwhilewriting,"
"readwhilemerging,"
"readrandomwriterandom,"
"updaterandom,"
"randomwithverify,"
"fill100K,"
"crc32c,"
"xxhash,"
"compress,"
"uncompress,"
"acquireload,"
"fillseekseq,"
"randomtransaction,"
"randomreplacekeys,"
"timeseries",
"Comma-separated list of operations to run in the specified"
" order. Available benchmarks:\n"
"\tfillseq -- write N values in sequential key"
" order in async mode\n"
"\tfillseqdeterministic -- write N values in the specified"
" key order and keep the shape of the LSM tree\n"
"\tfillrandom -- write N values in random key order in async"
" mode\n"
"\tfilluniquerandomdeterministic -- write N values in a random"
" key order and keep the shape of the LSM tree\n"
"\toverwrite -- overwrite N values in random key order in"
" async mode\n"
"\tfillsync -- write N/100 values in random key order in "
"sync mode\n"
"\tfill100K -- write N/1000 100K values in random order in"
" async mode\n"
"\tdeleteseq -- delete N keys in sequential order\n"
"\tdeleterandom -- delete N keys in random order\n"
"\treadseq -- read N times sequentially\n"
"\treadtocache -- 1 thread reading database sequentially\n"
"\treadreverse -- read N times in reverse order\n"
"\treadrandom -- read N times in random order\n"
"\treadmissing -- read N missing keys in random order\n"
"\treadwhilewriting -- 1 writer, N threads doing random "
"reads\n"
"\treadwhilemerging -- 1 merger, N threads doing random "
"reads\n"
"\treadrandomwriterandom -- N threads doing random-read, "
"random-write\n"
"\tprefixscanrandom -- prefix scan N times in random order\n"
"\tupdaterandom -- N threads doing read-modify-write for random "
"keys\n"
"\tappendrandom -- N threads doing read-modify-write with "
"growing values\n"
"\tmergerandom -- same as updaterandom/appendrandom using merge"
" operator. "
"Must be used with merge_operator\n"
"\treadrandommergerandom -- perform N random read-or-merge "
"operations. Must be used with merge_operator\n"
"\tnewiterator -- repeated iterator creation\n"
"\tseekrandom -- N random seeks, call Next seek_nexts times "
"per seek\n"
"\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
"overwrite\n"
"\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
"merge\n"
"\tcrc32c -- repeated crc32c of 4K of data\n"
"\txxhash -- repeated xxHash of 4K of data\n"
"\tacquireload -- load N*1000 times\n"
"\tfillseekseq -- write N values in sequential key, then read "
"them by seeking to each key\n"
"\trandomtransaction -- execute N random transactions and "
"verify correctness\n"
"\trandomreplacekeys -- randomly replaces N keys by deleting "
"the old version and putting the new version\n\n"
"\ttimeseries -- 1 writer generates time series data "
"and multiple readers doing random reads on id\n\n"
"Meta operations:\n"
"\tcompact -- Compact the entire DB; If multiple, randomly choose one\n"
"\tcompactall -- Compact the entire DB\n"
"\tstats -- Print DB stats\n"
"\tresetstats -- Reset DB stats\n"
"\tlevelstats -- Print the number of files and bytes per level\n"
"\tsstables -- Print sstable info\n"
"\theapprofile -- Dump a heap profile (if supported by this"
" port)\n");
DEFINE_int64(num, 1000000, "Number of key/values to place in database");
DEFINE_int64(numdistinct, 1000,
"Number of distinct keys to use. Used in RandomWithVerify to "
"read/write on fewer keys so that gets are more likely to find the"
" key and puts are more likely to update the same key");
DEFINE_int64(merge_keys, -1,
"Number of distinct keys to use for MergeRandom and "
"ReadRandomMergeRandom. "
"If negative, there will be FLAGS_num keys.");
DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
DEFINE_int32(
num_hot_column_families, 0,
"Number of Hot Column Families. If more than 0, only write to this "
"number of column families. After finishing all the writes to them, "
"create new set of column families and insert to them. Only used "
"when num_column_families > 1.");
DEFINE_string(column_family_distribution, "",
"Comma-separated list of percentages, where the ith element "
"indicates the probability of an op using the ith column family. "
"The number of elements must be `num_hot_column_families` if "
"specified; otherwise, it must be `num_column_families`. The "
"sum of elements must be 100. E.g., if `num_column_families=4`, "
"and `num_hot_column_families=0`, a valid list could be "
"\"10,20,30,40\".");
DEFINE_int64(reads, -1, "Number of read operations to do. "
"If negative, do FLAGS_num reads.");
DEFINE_int64(deletes, -1, "Number of delete operations to do. "
"If negative, do FLAGS_num deletions.");
DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
DEFINE_int64(seed, 0, "Seed base for random number generators. "
"When 0 it is deterministic.");
DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
" When 0 then num & reads determine the test duration");
DEFINE_int32(value_size, 100, "Size of each value");
DEFINE_int32(seek_nexts, 0,
"How many times to call Next() after Seek() in "
"fillseekseq, seekrandom, seekrandomwhilewriting and "
"seekrandomwhilemerging");
DEFINE_bool(reverse_iterator, false,
"When true use Prev rather than Next for iterators that do "
"Seek and then Next");
DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
DEFINE_int64(batch_size, 1, "Batch size");
static bool ValidateKeySize(const char* flagname, int32_t value) {
return true;
}
static bool ValidateUint32Range(const char* flagname, uint64_t value) {
if (value > std::numeric_limits<uint32_t>::max()) {
fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
(unsigned long)value);
return false;
}
return true;
}
DEFINE_int32(key_size, 16, "size of each key");
DEFINE_int32(num_multi_db, 0,
"Number of DBs used in the benchmark. 0 means single DB.");
DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
" to this fraction of their original size after compression");
DEFINE_double(read_random_exp_range, 0.0,
"Read random's key will be generated using distribution of "
"num * exp(-r) where r is uniform number from 0 to this value. "
"The larger the number is, the more skewed the reads are. "
"Only used in readrandom and multireadrandom benchmarks.");
DEFINE_bool(histogram, false, "Print histogram of operation timings");
DEFINE_bool(enable_numa, false,
"Make operations aware of NUMA architecture and bind memory "
"and cpus corresponding to nodes together. In NUMA, memory "
"in same node as CPUs are closer when compared to memory in "
"other nodes. Reads can be faster when the process is bound to "
"CPU and memory of same node. Use \"$numactl --hardware\" command "
"to see NUMA memory architecture.");
DEFINE_int64(db_write_buffer_size, rocksdb::Options().db_write_buffer_size,
"Number of bytes to buffer in all memtables before compacting");
DEFINE_bool(cost_write_buffer_to_cache, false,
"The usage of memtable is costed to the block cache");
DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
"Number of bytes to buffer in memtable before compacting");
DEFINE_int32(max_write_buffer_number,
rocksdb::Options().max_write_buffer_number,
"The number of in-memory memtables. Each memtable is of size"
"write_buffer_size.");
DEFINE_int32(min_write_buffer_number_to_merge,
rocksdb::Options().min_write_buffer_number_to_merge,
"The minimum number of write buffers that will be merged together"
"before writing to storage. This is cheap because it is an"
"in-memory merge. If this feature is not enabled, then all these"
"write buffers are flushed to L0 as separate files and this "
"increases read amplification because a get request has to check"
" in all of these files. Also, an in-memory merge may result in"
" writing less data to storage if there are duplicate records "
" in each of these individual write buffers.");
DEFINE_int32(max_write_buffer_number_to_maintain,
rocksdb::Options().max_write_buffer_number_to_maintain,
"The total maximum number of write buffers to maintain in memory "
"including copies of buffers that have already been flushed. "
"Unlike max_write_buffer_number, this parameter does not affect "
"flushing. This controls the minimum amount of write history "
"that will be available in memory for conflict checking when "
"Transactions are used. If this value is too low, some "
"transactions may fail at commit time due to not being able to "
"determine whether there were any write conflicts. Setting this "
"value to 0 will cause write buffers to be freed immediately "
"after they are flushed. If this value is set to -1, "
"'max_write_buffer_number' will be used.");
DEFINE_int32(max_background_jobs,
rocksdb::Options().max_background_jobs,
"The maximum number of concurrent background jobs that can occur "
"in parallel.");
DEFINE_int32(num_bottom_pri_threads, 0,
"The number of threads in the bottom-priority thread pool (used "
"by universal compaction only).");
DEFINE_int32(num_high_pri_threads, 0,
"The maximum number of concurrent background compactions"
" that can occur in parallel.");
DEFINE_int32(num_low_pri_threads, 0,
"The maximum number of concurrent background compactions"
" that can occur in parallel.");
DEFINE_int32(max_background_compactions,
rocksdb::Options().max_background_compactions,
"The maximum number of concurrent background compactions"
" that can occur in parallel.");
DEFINE_int32(base_background_compactions, -1, "DEPRECATED");
DEFINE_uint64(subcompactions, 1,
"Maximum number of subcompactions to divide L0-L1 compactions "
"into.");
static const bool FLAGS_subcompactions_dummy
__attribute__((unused)) = RegisterFlagValidator(&FLAGS_subcompactions,
&ValidateUint32Range);
DEFINE_int32(max_background_flushes,
rocksdb::Options().max_background_flushes,
"The maximum number of concurrent background flushes"
" that can occur in parallel.");
static rocksdb::CompactionStyle FLAGS_compaction_style_e;
DEFINE_int32(compaction_style, (int32_t) rocksdb::Options().compaction_style,
"style of compaction: level-based, universal and fifo");
static rocksdb::CompactionPri FLAGS_compaction_pri_e;
DEFINE_int32(compaction_pri, (int32_t)rocksdb::Options().compaction_pri,
"priority of files to compaction: by size or by data age");
DEFINE_int32(universal_size_ratio, 0,
"Percentage flexibility while comparing file size"
" (for universal compaction only).");
DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a"
" single compaction run (for universal compaction only).");
DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
" in universal style compaction");
DEFINE_int32(universal_max_size_amplification_percent, 0,
"The max size amplification for universal style compaction");
DEFINE_int32(universal_compression_size_percent, -1,
"The percentage of the database to compress for universal "
"compaction. -1 means compress everything.");
DEFINE_bool(universal_allow_trivial_move, false,
"Allow trivial move in universal compaction.");
DEFINE_int64(cache_size, 8 << 20, // 8MB
"Number of bytes to use as a cache of uncompressed data");
DEFINE_int32(cache_numshardbits, 6,
"Number of shards for the block cache"
" is 2 ** cache_numshardbits. Negative means use default settings."
" This is applied only if FLAGS_cache_size is non-negative.");
DEFINE_double(cache_high_pri_pool_ratio, 0.0,
"Ratio of block cache reserve for high pri blocks. "
"If > 0.0, we also enable "
"cache_index_and_filter_blocks_with_high_priority.");
DEFINE_bool(use_clock_cache, false,
"Replace default LRU block cache with clock cache.");
DEFINE_int64(simcache_size, -1,
"Number of bytes to use as a simcache of "
"uncompressed data. Nagative value disables simcache.");
DEFINE_bool(cache_index_and_filter_blocks, false,
"Cache index/filter blocks in block cache.");
DEFINE_bool(partition_index_and_filters, false,
"Partition index and filter blocks.");
DEFINE_int64(metadata_block_size,
rocksdb::BlockBasedTableOptions().metadata_block_size,
"Max partition size when partitioning index/filters");
// The default reduces the overhead of reading time with flash. With HDD, which
// offers much less throughput, however, this number better to be set to 1.
DEFINE_int32(ops_between_duration_checks, 1000,
"Check duration limit every x ops");
DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
"Pin index/filter blocks of L0 files in block cache.");
DEFINE_int32(block_size,
static_cast<int32_t>(rocksdb::BlockBasedTableOptions().block_size),
"Number of bytes in a block.");
DEFINE_int32(block_restart_interval,
rocksdb::BlockBasedTableOptions().block_restart_interval,
"Number of keys between restart points "
"for delta encoding of keys in data block.");
DEFINE_int32(index_block_restart_interval,
rocksdb::BlockBasedTableOptions().index_block_restart_interval,
"Number of keys between restart points "
"for delta encoding of keys in index block.");
DEFINE_int32(read_amp_bytes_per_bit,
rocksdb::BlockBasedTableOptions().read_amp_bytes_per_bit,
"Number of bytes per bit to be used in block read-amp bitmap");
DEFINE_int64(compressed_cache_size, -1,
"Number of bytes to use as a cache of compressed data.");
DEFINE_int64(row_cache_size, 0,
"Number of bytes to use as a cache of individual rows"
" (0 = disabled).");
DEFINE_int32(open_files, rocksdb::Options().max_open_files,
"Maximum number of files to keep open at the same time"
" (use default if == 0)");
DEFINE_int32(file_opening_threads, rocksdb::Options().max_file_opening_threads,
"If open_files is set to -1, this option set the number of "
"threads that will be used to open files during DB::Open()");
DEFINE_bool(new_table_reader_for_compaction_inputs, true,
"If true, uses a separate file handle for compaction inputs");
DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
"Maximum windows randomaccess buffer size");
DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
"Maximum write buffer for Writable File");
DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
" use default settings.");
DEFINE_double(memtable_bloom_size_ratio, 0,
"Ratio of memtable size used for bloom filter. 0 means no bloom "
"filter.");
DEFINE_bool(memtable_use_huge_page, false,
"Try to use huge page in memtables.");
DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
" database. If you set this flag and also specify a benchmark that"
" wants a fresh database, that benchmark will fail.");
DEFINE_bool(show_table_properties, false,
"If true, then per-level table"
" properties will be printed on every stats-interval when"
" stats_interval is set and stats_per_interval is on.");
DEFINE_string(db, "", "Use the db with the following name.");
// Read cache flags
DEFINE_string(read_cache_path, "",
"If not empty string, a read cache will be used in this path");
DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
"Maximum size of the read cache");
DEFINE_bool(read_cache_direct_write, true,
"Whether to use Direct IO for writing to the read cache");
DEFINE_bool(read_cache_direct_read, true,
"Whether to use Direct IO for reading from read cache");
static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
if (value >= 20) {
fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n",
flagname, value);
return false;
}
return true;
}
DEFINE_bool(verify_checksum, true,
"Verify checksum for every block read"
" from storage");
DEFINE_bool(statistics, false, "Database statistics");
DEFINE_string(statistics_string, "", "Serialized statistics string");
static class std::shared_ptr<rocksdb::Statistics> dbstats;
DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
" --num reads.");
DEFINE_bool(finish_after_writes, false, "Write thread terminates after all writes are finished");
DEFINE_bool(sync, false, "Sync all writes to disk");
DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
"Truth key/values used when using verify");
DEFINE_int32(num_levels, 7, "The total number of levels");
DEFINE_int64(target_file_size_base, rocksdb::Options().target_file_size_base,
"Target file size at level-1");
DEFINE_int32(target_file_size_multiplier,
rocksdb::Options().target_file_size_multiplier,
"A multiplier to compute target level-N file size (N >= 2)");
DEFINE_uint64(max_bytes_for_level_base,
rocksdb::Options().max_bytes_for_level_base,
"Max bytes for level-1");
DEFINE_bool(level_compaction_dynamic_level_bytes, false,
"Whether level size base is dynamic");
DEFINE_double(max_bytes_for_level_multiplier, 10,
"A multiplier to compute max bytes for level-N (N >= 2)");
static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
DEFINE_string(max_bytes_for_level_multiplier_additional, "",
"A vector that specifies additional fanout per level");
DEFINE_int32(level0_stop_writes_trigger,
rocksdb::Options().level0_stop_writes_trigger,
"Number of files in level-0"
" that will trigger put stop.");
DEFINE_int32(level0_slowdown_writes_trigger,
rocksdb::Options().level0_slowdown_writes_trigger,
"Number of files in level-0"
" that will slow down writes.");
DEFINE_int32(level0_file_num_compaction_trigger,
rocksdb::Options().level0_file_num_compaction_trigger,
"Number of files in level-0"
" when compactions start");
static bool ValidateInt32Percent(const char* flagname, int32_t value) {
if (value <= 0 || value>=100) {
fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n",
flagname, value);
return false;
}
return true;
}
DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed"
" as percentage) for the ReadRandomWriteRandom workload. The "
"default value 90 means 90% operations out of all reads and writes"
" operations are reads. In other words, 9 gets for every 1 put.");
DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed"
" as percentage) for the ReadRandomMergeRandom workload. The"
" default value 70 means 70% out of all read and merge operations"
" are merges. In other words, 7 merges for every 3 gets.");
DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
"deletes (used in RandomWithVerify only). RandomWithVerify "
"calculates writepercent as (100 - FLAGS_readwritepercent - "
"deletepercent), so deletepercent must be smaller than (100 - "
"FLAGS_readwritepercent)");
DEFINE_bool(optimize_filters_for_hits, false,
"Optimizes bloom filters for workloads for most lookups return "
"a value. For now this doesn't create bloom filters for the max "
"level of the LSM to reduce metadata that should fit in RAM. ");
DEFINE_uint64(delete_obsolete_files_period_micros, 0,
"Ignored. Left here for backward compatibility");
DEFINE_int64(writes_per_range_tombstone, 0,
"Number of writes between range "
"tombstones");
DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");
DEFINE_int64(max_num_range_tombstones, 0,
"Maximum number of range tombstones "
"to insert.");
DEFINE_bool(expand_range_tombstones, false,
"Expand range tombstone into sequential regular tombstones.");
#ifndef ROCKSDB_LITE
DEFINE_bool(optimistic_transaction_db, false,
"Open a OptimisticTransactionDB instance. "
"Required for randomtransaction benchmark.");
DEFINE_bool(use_blob_db, false,
"Open a BlobDB instance. "
"Required for largevalue benchmark.");
DEFINE_bool(transaction_db, false,
"Open a TransactionDB instance. "
"Required for randomtransaction benchmark.");
DEFINE_uint64(transaction_sets, 2,
"Number of keys each transaction will "
"modify (use in RandomTransaction only). Max: 9999");
DEFINE_bool(transaction_set_snapshot, false,
"Setting to true will have each transaction call SetSnapshot()"
" upon creation.");
DEFINE_int32(transaction_sleep, 0,
"Max microseconds to sleep in between "
"reading and writing a value (used in RandomTransaction only). ");
DEFINE_uint64(transaction_lock_timeout, 100,
"If using a transaction_db, specifies the lock wait timeout in"
" milliseconds before failing a transaction waiting on a lock");
DEFINE_string(
options_file, "",
"The path to a RocksDB options file. If specified, then db_bench will "
"run with the RocksDB options in the default column family of the "
"specified options file. "
"Note that with this setting, db_bench will ONLY accept the following "
"RocksDB options related command-line arguments, all other arguments "
"that are related to RocksDB options will be ignored:\n"
"\t--use_existing_db\n"
"\t--statistics\n"
"\t--row_cache_size\n"
"\t--row_cache_numshardbits\n"
"\t--enable_io_prio\n"
"\t--dump_malloc_stats\n"
"\t--num_multi_db\n");
DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
"The limit of total table file sizes to trigger FIFO compaction");
DEFINE_bool(fifo_compaction_allow_compaction, true,
"Allow compaction in FIFO compaction.");
DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
#endif // ROCKSDB_LITE
DEFINE_bool(report_bg_io_stats, false,
"Measure times spents on I/Os while in compactions. ");
DEFINE_bool(use_stderr_info_logger, false,
"Write info logs to stderr instead of to LOG file. ");
static enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
assert(ctype);
if (!strcasecmp(ctype, "none"))
return rocksdb::kNoCompression;
else if (!strcasecmp(ctype, "snappy"))
return rocksdb::kSnappyCompression;
else if (!strcasecmp(ctype, "zlib"))
return rocksdb::kZlibCompression;
else if (!strcasecmp(ctype, "bzip2"))
return rocksdb::kBZip2Compression;
else if (!strcasecmp(ctype, "lz4"))
return rocksdb::kLZ4Compression;
else if (!strcasecmp(ctype, "lz4hc"))
return rocksdb::kLZ4HCCompression;
else if (!strcasecmp(ctype, "xpress"))
return rocksdb::kXpressCompression;
else if (!strcasecmp(ctype, "zstd"))
return rocksdb::kZSTD;
fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
return rocksdb::kSnappyCompression; // default value
}
static std::string ColumnFamilyName(size_t i) {
if (i == 0) {
return rocksdb::kDefaultColumnFamilyName;
} else {
char name[100];
snprintf(name, sizeof(name), "column_family_name_%06zu", i);
return std::string(name);
}
}
DEFINE_string(compression_type, "snappy",
"Algorithm to use to compress the database");
static enum rocksdb::CompressionType FLAGS_compression_type_e =
rocksdb::kSnappyCompression;
DEFINE_int32(compression_level, -1,
"Compression level. For zlib this should be -1 for the "
"default level, or between 0 and 9.");
DEFINE_int32(compression_max_dict_bytes, 0,
"Maximum size of dictionary used to prime the compression "
"library.");
static bool ValidateCompressionLevel(const char* flagname, int32_t value) {
if (value < -1 || value > 9) {
fprintf(stderr, "Invalid value for --%s: %d, must be between -1 and 9\n",
flagname, value);
return false;
}
return true;
}
static const bool FLAGS_compression_level_dummy __attribute__((unused)) =
RegisterFlagValidator(&FLAGS_compression_level, &ValidateCompressionLevel);
DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
" from this level. Levels with number < min_level_to_compress are"
" not compressed. Otherwise, apply compression_type to "
"all levels.");
static bool ValidateTableCacheNumshardbits(const char* flagname,
int32_t value) {
if (0 >= value || value > 20) {
fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val <= 20\n",
flagname, value);
return false;
}
return true;
}
DEFINE_int32(table_cache_numshardbits, 4, "");
#ifndef ROCKSDB_LITE
DEFINE_string(env_uri, "", "URI for registry Env lookup. Mutually exclusive"
" with --hdfs.");
#endif // ROCKSDB_LITE
DEFINE_string(hdfs, "", "Name of hdfs environment. Mutually exclusive with"
" --env_uri.");
static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
"this is greater than zero. When 0 the interval grows over time.");
DEFINE_int64(stats_interval_seconds, 0, "Report stats every N seconds. This "
"overrides stats_interval when both are > 0.");
DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
" this is greater than 0.");
DEFINE_int64(report_interval_seconds, 0,
"If greater than zero, it will write simple stats in CVS format "
"to --report_file every N seconds");
DEFINE_string(report_file, "report.csv",
"Filename where some simple stats are reported to (if "
"--report_interval_seconds is bigger than 0)");
DEFINE_int32(thread_status_per_interval, 0,
"Takes and report a snapshot of the current status of each thread"
" when this is greater than 0.");
DEFINE_int32(perf_level, rocksdb::PerfLevel::kDisable, "Level of perf collection");
static bool ValidateRateLimit(const char* flagname, double value) {
const double EPSILON = 1e-10;
if ( value < -EPSILON ) {
fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
flagname, value);
return false;
}
return true;
}
DEFINE_double(soft_rate_limit, 0.0, "DEPRECATED");
DEFINE_double(hard_rate_limit, 0.0, "DEPRECATED");
DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
"Slowdown writes if pending compaction bytes exceed this number");
DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
"Stop writes if pending compaction bytes exceed this number");
DEFINE_uint64(delayed_write_rate, 8388608u,
"Limited bytes allowed to DB when soft_rate_limit or "
"level0_slowdown_writes_trigger triggers");
DEFINE_bool(enable_pipelined_write, true,
"Allow WAL and memtable writes to be pipelined");
DEFINE_bool(allow_concurrent_memtable_write, true,
"Allow multi-writers to update mem tables in parallel.");
DEFINE_bool(enable_write_thread_adaptive_yield, true,
"Use a yielding spin loop for brief writer thread waits.");
DEFINE_uint64(
write_thread_max_yield_usec, 100,
"Maximum microseconds for enable_write_thread_adaptive_yield operation.");
DEFINE_uint64(write_thread_slow_yield_usec, 3,
"The threshold at which a slow yield is considered a signal that "
"other processes or threads want the core.");
DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
"When hard_rate_limit is set then this is the max time a put will"
" be stalled.");
DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");
DEFINE_bool(rate_limit_bg_reads, false,
"Use options.rate_limiter on compaction reads");
DEFINE_uint64(
benchmark_write_rate_limit, 0,
"If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
"is the global rate in bytes/second.");
DEFINE_uint64(
benchmark_read_rate_limit, 0,
"If non-zero, db_bench will rate-limit the reads from RocksDB. This "
"is the global rate in ops/second.");
DEFINE_uint64(max_compaction_bytes, rocksdb::Options().max_compaction_bytes,
"Max bytes allowed in one compaction");
#ifndef ROCKSDB_LITE
DEFINE_bool(readonly, false, "Run read only benchmarks.");
#endif // ROCKSDB_LITE
DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");
DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
" in MB.");
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
DEFINE_bool(mmap_read, rocksdb::Options().allow_mmap_reads,
"Allow reads to occur via mmap-ing files");
DEFINE_bool(mmap_write, rocksdb::Options().allow_mmap_writes,
"Allow writes to occur via mmap-ing files");
DEFINE_bool(use_direct_reads, rocksdb::Options().use_direct_reads,
"Use O_DIRECT for reading data");
DEFINE_bool(use_direct_io_for_flush_and_compaction,
rocksdb::Options().use_direct_io_for_flush_and_compaction,
"Use O_DIRECT for background flush and compaction I/O");
DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open,
"Advise random access on table file open");
DEFINE_string(compaction_fadvice, "NORMAL",
"Access pattern advice when a file is compacted");
static auto FLAGS_compaction_fadvice_e =
rocksdb::Options().access_hint_on_compaction_start;
DEFINE_bool(use_tailing_iterator, false,
"Use tailing iterator to access a series of keys instead of get");
DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex,
"Use adaptive mutex");
DEFINE_uint64(bytes_per_sync, rocksdb::Options().bytes_per_sync,
"Allows OS to incrementally sync SST files to disk while they are"
" being written, in the background. Issue one request for every"
" bytes_per_sync written. 0 turns it off.");
DEFINE_uint64(wal_bytes_per_sync, rocksdb::Options().wal_bytes_per_sync,
"Allows OS to incrementally sync WAL files to disk while they are"
" being written, in the background. Issue one request for every"
" wal_bytes_per_sync written. 0 turns it off.");
DEFINE_bool(use_single_deletes, true,
"Use single deletes (used in RandomReplaceKeys only).");
DEFINE_double(stddev, 2000.0,
"Standard deviation of normal distribution used for picking keys"
" (used in RandomReplaceKeys only).");
DEFINE_int32(key_id_range, 100000,
"Range of possible value of key id (used in TimeSeries only).");
DEFINE_string(expire_style, "none",
"Style to remove expired time entries. Can be one of the options "
"below: none (do not expired data), compaction_filter (use a "
"compaction filter to remove expired data), delete (seek IDs and "
"remove expired data) (used in TimeSeries only).");
DEFINE_uint64(
time_range, 100000,
"Range of timestamp that store in the database (used in TimeSeries"
" only).");
DEFINE_int32(num_deletion_threads, 1,
"Number of threads to do deletion (used in TimeSeries and delete "
"expire_style only).");
DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
" operations on a key in the memtable");
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
if (value < 0 || value>=2000000000) {
fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
flagname, value);
return false;
}
return true;
}
DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and "
"plain table");
DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
"per prefix, 0 means no special handling of the prefix, "
"i.e. use the prefix comes with the generated random number.");
DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
"If non-zero, enable "
"memtable insert with hint with the given prefix size.");
DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction "
"threads' IO priority");
DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo "
"table becomes an identity function. This is only valid when key "
"is 8 bytes");
DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
enum RepFactory {
kSkipList,
kPrefixHash,
kVectorRep,
kHashLinkedList,
kCuckoo
};
static enum RepFactory StringToRepFactory(const char* ctype) {
assert(ctype);
if (!strcasecmp(ctype, "skip_list"))
return kSkipList;
else if (!strcasecmp(ctype, "prefix_hash"))
return kPrefixHash;
else if (!strcasecmp(ctype, "vector"))
return kVectorRep;
else if (!strcasecmp(ctype, "hash_linkedlist"))
return kHashLinkedList;
else if (!strcasecmp(ctype, "cuckoo"))
return kCuckoo;
fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
return kSkipList;
}
static enum RepFactory FLAGS_rep_factory;
DEFINE_string(memtablerep, "skip_list", "");
DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
DEFINE_bool(use_plain_table, false, "if use plain table "
"instead of block-based table format");
DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
DEFINE_bool(use_hash_search, false, "if use kHashSearch "
"instead of kBinarySearch. "
"This is valid if only we use BlockTable");
DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter "
"instead of kFullFilter for filter block. "
"This is valid if only we use BlockTable");
DEFINE_string(merge_operator, "", "The merge operator to use with the database."
"If a new merge operator is specified, be sure to use fresh"
" database The possible merge operators are defined in"
" utilities/merge_operators.h");
DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try "
"linear search first for this many steps from the previous "
"position");
DEFINE_bool(report_file_operations, false, "if report number of file "
"operations");
static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) =
RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);
static const bool FLAGS_hard_rate_limit_dummy __attribute__((unused)) =
RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
static const bool FLAGS_key_size_dummy __attribute__((unused)) =
RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) =
RegisterFlagValidator(&FLAGS_cache_numshardbits,
&ValidateCacheNumshardbits);
static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) =
RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
DEFINE_int32(disable_seek_compaction, false,
"Not used, left here for backwards compatibility");
static const bool FLAGS_deletepercent_dummy __attribute__((unused)) =
RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) =
RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
&ValidateTableCacheNumshardbits);
namespace rocksdb {
namespace {
struct ReportFileOpCounters {
std::atomic<int> open_counter_;
std::atomic<int> read_counter_;
std::atomic<int> append_counter_;
std::atomic<uint64_t> bytes_read_;
std::atomic<uint64_t> bytes_written_;
};
// A special Env to records and report file operations in db_bench
class ReportFileOpEnv : public EnvWrapper {
public:
explicit ReportFileOpEnv(Env* base) : EnvWrapper(base) { reset(); }
void reset() {
counters_.open_counter_ = 0;
counters_.read_counter_ = 0;
counters_.append_counter_ = 0;
counters_.bytes_read_ = 0;
counters_.bytes_written_ = 0;
}
Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
const EnvOptions& soptions) override {
class CountingFile : public SequentialFile {
private:
unique_ptr<SequentialFile> target_;
ReportFileOpCounters* counters_;
public:
CountingFile(unique_ptr<SequentialFile>&& target,
ReportFileOpCounters* counters)
: target_(std::move(target)), counters_(counters) {}
virtual Status Read(size_t n, Slice* result, char* scratch) override {
counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
Status rv = target_->Read(n, result, scratch);
counters_->bytes_read_.fetch_add(result->size(),
std::memory_order_relaxed);
return rv;
}
virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
};
Status s = target()->NewSequentialFile(f, r, soptions);
if (s.ok()) {
counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
r->reset(new CountingFile(std::move(*r), counters()));
}
return s;
}
Status NewRandomAccessFile(const std::string& f,
unique_ptr<RandomAccessFile>* r,
const EnvOptions& soptions) override {
class CountingFile : public RandomAccessFile {
private:
unique_ptr<RandomAccessFile> target_;
ReportFileOpCounters* counters_;
public:
CountingFile(unique_ptr<RandomAccessFile>&& target,
ReportFileOpCounters* counters)
: target_(std::move(target)), counters_(counters) {}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override {
counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
Status rv = target_->Read(offset, n, result, scratch);
counters_->bytes_read_.fetch_add(result->size(),
std::memory_order_relaxed);
return rv;
}
};
Status s = target()->NewRandomAccessFile(f, r, soptions);
if (s.ok()) {
counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
r->reset(new CountingFile(std::move(*r), counters()));
}
return s;
}
Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
const EnvOptions& soptions) override {
class CountingFile : public WritableFile {
private:
unique_ptr<WritableFile> target_;
ReportFileOpCounters* counters_;
public:
CountingFile(unique_ptr<WritableFile>&& target,
ReportFileOpCounters* counters)
: target_(std::move(target)), counters_(counters) {}
Status Append(const Slice& data) override {
counters_->append_counter_.fetch_add(1, std::memory_order_relaxed);
Status rv = target_->Append(data);
counters_->bytes_written_.fetch_add(data.size(),
std::memory_order_relaxed);
return rv;
}
Status Truncate(uint64_t size) override { return target_->Truncate(size); }
Status Close() override { return target_->Close(); }
Status Flush() override { return target_->Flush(); }
Status Sync() override { return target_->Sync(); }
};
Status s = target()->NewWritableFile(f, r, soptions);
if (s.ok()) {
counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
r->reset(new CountingFile(std::move(*r), counters()));
}
return s;
}
// getter
ReportFileOpCounters* counters() { return &counters_; }
private:
ReportFileOpCounters counters_;
};
} // namespace
// Helper for quickly generating random data.
class RandomGenerator {
private:
std::string data_;
unsigned int pos_;
public:
RandomGenerator() {
// We use a limited amount of data over and over again and ensure
// that it is larger than the compression window (32KB), and also
// large enough to serve all typical value sizes we want to write.
Random rnd(301);
std::string piece;
while (data_.size() < (unsigned)std::max(1048576, FLAGS_value_size)) {
// Add a short fragment that is as compressible as specified
// by FLAGS_compression_ratio.
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
data_.append(piece);
}
pos_ = 0;
}
Slice Generate(unsigned int len) {
assert(len <= data_.size());
if (pos_ + len > data_.size()) {
pos_ = 0;
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
Slice GenerateWithTTL(unsigned int len) {
assert(len <= data_.size());
if (pos_ + len > data_.size()) {
pos_ = 0;
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
};
static void AppendWithSpace(std::string* str, Slice msg) {
if (msg.empty()) return;
if (!str->empty()) {
str->push_back(' ');
}
str->append(msg.data(), msg.size());
}
struct DBWithColumnFamilies {
std::vector<ColumnFamilyHandle*> cfh;
DB* db;
#ifndef ROCKSDB_LITE
OptimisticTransactionDB* opt_txn_db;
#endif // ROCKSDB_LITE
std::atomic<size_t> num_created; // Need to be updated after all the
// new entries in cfh are set.
size_t num_hot; // Number of column families to be queried at each moment.
// After each CreateNewCf(), another num_hot number of new
// Column families will be created and used to be queried.
port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf()
std::vector<int> cfh_idx_to_prob; // ith index holds probability of operating
// on cfh[i].
DBWithColumnFamilies()
: db(nullptr)
#ifndef ROCKSDB_LITE
, opt_txn_db(nullptr)
#endif // ROCKSDB_LITE
{
cfh.clear();
num_created = 0;
num_hot = 0;
}
DBWithColumnFamilies(const DBWithColumnFamilies& other)
: cfh(other.cfh),
db(other.db),
#ifndef ROCKSDB_LITE
opt_txn_db(other.opt_txn_db),
#endif // ROCKSDB_LITE
num_created(other.num_created.load()),
num_hot(other.num_hot),
cfh_idx_to_prob(other.cfh_idx_to_prob) {
}
void DeleteDBs() {
std::for_each(cfh.begin(), cfh.end(),
[](ColumnFamilyHandle* cfhi) { delete cfhi; });
cfh.clear();
#ifndef ROCKSDB_LITE
if (opt_txn_db) {
delete opt_txn_db;
opt_txn_db = nullptr;
} else {
delete db;
db = nullptr;
}
#else
delete db;
db = nullptr;
#endif // ROCKSDB_LITE
}
ColumnFamilyHandle* GetCfh(int64_t rand_num) {
assert(num_hot > 0);
size_t rand_offset = 0;
if (!cfh_idx_to_prob.empty()) {
assert(cfh_idx_to_prob.size() == num_hot);
int sum = 0;
while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
sum += cfh_idx_to_prob[rand_offset];
++rand_offset;
}
assert(rand_offset < cfh_idx_to_prob.size());
} else {
rand_offset = rand_num % num_hot;
}
return cfh[num_created.load(std::memory_order_acquire) - num_hot +
rand_offset];
}
// stage: assume CF from 0 to stage * num_hot has be created. Need to create
// stage * num_hot + 1 to stage * (num_hot + 1).
void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
MutexLock l(&create_cf_mutex);
if ((stage + 1) * num_hot <= num_created) {
// Already created.
return;
}
auto new_num_created = num_created + num_hot;
assert(new_num_created <= cfh.size());
for (size_t i = num_created; i < new_num_created; i++) {
Status s =
db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
if (!s.ok()) {
fprintf(stderr, "create column family error: %s\n",
s.ToString().c_str());
abort();
}
}
num_created.store(new_num_created, std::memory_order_release);
}
};
// a class that reports stats to CSV file
class ReporterAgent {
public:
ReporterAgent(Env* env, const std::string& fname,
uint64_t report_interval_secs)
: env_(env),
total_ops_done_(0),
last_report_(0),
report_interval_secs_(report_interval_secs),
stop_(false) {
auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
if (s.ok()) {
s = report_file_->Append(Header() + "\n");
}
if (s.ok()) {
s = report_file_->Flush();
}
if (!s.ok()) {
fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
s.ToString().c_str());
abort();
}
reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
}
~ReporterAgent() {
{
std::unique_lock<std::mutex> lk(mutex_);
stop_ = true;
stop_cv_.notify_all();
}
reporting_thread_.join();
}
// thread safe
void ReportFinishedOps(int64_t num_ops) {
total_ops_done_.fetch_add(num_ops);
}
private:
std::string Header() const { return "secs_elapsed,interval_qps"; }
void SleepAndReport() {
uint64_t kMicrosInSecond = 1000 * 1000;
auto time_started = env_->NowMicros();
while (true) {
{
std::unique_lock<std::mutex> lk(mutex_);
if (stop_ ||
stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
[&]() { return stop_; })) {
// stopping
break;
}
// else -> timeout, which means time for a report!
}
auto total_ops_done_snapshot = total_ops_done_.load();
// round the seconds elapsed
auto secs_elapsed =
(env_->NowMicros() - time_started + kMicrosInSecond / 2) /
kMicrosInSecond;
std::string report = ToString(secs_elapsed) + "," +
ToString(total_ops_done_snapshot - last_report_) +
"\n";
auto s = report_file_->Append(report);
if (s.ok()) {
s = report_file_->Flush();
}
if (!s.ok()) {
fprintf(stderr,
"Can't write to report file (%s), stopping the reporting\n",
s.ToString().c_str());
break;
}
last_report_ = total_ops_done_snapshot;
}
}
Env* env_;
std::unique_ptr<WritableFile> report_file_;
std::atomic<int64_t> total_ops_done_;
int64_t last_report_;
const uint64_t report_interval_secs_;
rocksdb::port::Thread reporting_thread_;
std::mutex mutex_;
// will notify on stop
std::condition_variable stop_cv_;
bool stop_;
};
enum OperationType : unsigned char {
kRead = 0,
kWrite,
kDelete,
kSeek,
kMerge,
kUpdate,
kCompress,
kUncompress,
kCrc,
kHash,
kOthers
};
static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
OperationTypeString = {
{kRead, "read"},
{kWrite, "write"},
{kDelete, "delete"},
{kSeek, "seek"},
{kMerge, "merge"},
{kUpdate, "update"},
{kCompress, "compress"},
{kCompress, "uncompress"},
{kCrc, "crc"},
{kHash, "hash"},
{kOthers, "op"}
};
class CombinedStats;
class Stats {
private:
int id_;
uint64_t start_;
uint64_t finish_;
double seconds_;
uint64_t done_;
uint64_t last_report_done_;
uint64_t next_report_;
uint64_t bytes_;
uint64_t last_op_finish_;
uint64_t last_report_finish_;
std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
std::hash<unsigned char>> hist_;
std::string message_;
bool exclude_from_merge_;
ReporterAgent* reporter_agent_; // does not own
friend class CombinedStats;
public:
Stats() { Start(-1); }
void SetReporterAgent(ReporterAgent* reporter_agent) {
reporter_agent_ = reporter_agent;
}
void Start(int id) {
id_ = id;
next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
last_op_finish_ = start_;
hist_.clear();
done_ = 0;
last_report_done_ = 0;
bytes_ = 0;
seconds_ = 0;
start_ = FLAGS_env->NowMicros();
finish_ = start_;
last_report_finish_ = start_;
message_.clear();
// When set, stats from this thread won't be merged with others.
exclude_from_merge_ = false;
}
void Merge(const Stats& other) {
if (other.exclude_from_merge_)
return;
for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
auto this_it = hist_.find(it->first);
if (this_it != hist_.end()) {
this_it->second->Merge(*(other.hist_.at(it->first)));
} else {
hist_.insert({ it->first, it->second });
}
}
done_ += other.done_;
bytes_ += other.bytes_;
seconds_ += other.seconds_;
if (other.start_ < start_) start_ = other.start_;
if (other.finish_ > finish_) finish_ = other.finish_;
// Just keep the messages from one thread
if (message_.empty()) message_ = other.message_;
}
void Stop() {
finish_ = FLAGS_env->NowMicros();
seconds_ = (finish_ - start_) * 1e-6;
}
void AddMessage(Slice msg) {
AppendWithSpace(&message_, msg);
}
void SetId(int id) { id_ = id; }
void SetExcludeFromMerge() { exclude_from_merge_ = true; }
void PrintThreadStatus() {
std::vector<ThreadStatus> thread_list;
FLAGS_env->GetThreadList(&thread_list);
fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n",
"ThreadID", "ThreadType", "cfName", "Operation",
"ElapsedTime", "Stage", "State", "OperationProperties");
int64_t current_time = 0;
Env::Default()->GetCurrentTime(&current_time);
for (auto ts : thread_list) {
fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
ts.thread_id,
ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
ts.cf_name.c_str(),
ThreadStatus::GetOperationName(ts.operation_type).c_str(),
ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
ThreadStatus::GetStateName(ts.state_type).c_str());
auto op_properties = ThreadStatus::InterpretOperationProperties(
ts.operation_type, ts.op_properties);
for (const auto& op_prop : op_properties) {
fprintf(stderr, " %s %" PRIu64" |",
op_prop.first.c_str(), op_prop.second);
}
fprintf(stderr, "\n");
}
}
void ResetLastOpTime() {
// Set to now to avoid latency from calls to SleepForMicroseconds
last_op_finish_ = FLAGS_env->NowMicros();
}
void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
enum OperationType op_type = kOthers) {
if (reporter_agent_) {
reporter_agent_->ReportFinishedOps(num_ops);
}
if (FLAGS_histogram) {
uint64_t now = FLAGS_env->NowMicros();
uint64_t micros = now - last_op_finish_;
if (hist_.find(op_type) == hist_.end())
{
auto hist_temp = std::make_shared<HistogramImpl>();
hist_.insert({op_type, std::move(hist_temp)});
}
hist_[op_type]->Add(micros);
if (micros > 20000 && !FLAGS_stats_interval) {
fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_ += num_ops;
if (done_ >= next_report_) {
if (!FLAGS_stats_interval) {
if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 100000) next_report_ += 10000;
else if (next_report_ < 500000) next_report_ += 50000;
else next_report_ += 100000;
fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
} else {
uint64_t now = FLAGS_env->NowMicros();
int64_t usecs_since_last = now - last_report_finish_;
// Determine whether to print status where interval is either
// each N operations or each N seconds.
if (FLAGS_stats_interval_seconds &&
usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
// Don't check again for this many operations
next_report_ += FLAGS_stats_interval;
} else {
fprintf(stderr,
"%s ... thread %d: (%" PRIu64 ",%" PRIu64 ") ops and "
"(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
FLAGS_env->TimeToString(now/1000000).c_str(),
id_,
done_ - last_report_done_, done_,
(done_ - last_report_done_) /
(usecs_since_last / 1000000.0),
done_ / ((now - start_) / 1000000.0),
(now - last_report_finish_) / 1000000.0,
(now - start_) / 1000000.0);
if (id_ == 0 && FLAGS_stats_per_interval) {
std::string stats;
if (db_with_cfh && db_with_cfh->num_created.load()) {
for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
&stats))
fprintf(stderr, "%s\n", stats.c_str());
if (FLAGS_show_table_properties) {
for (int level = 0; level < FLAGS_num_levels; ++level) {
if (db->GetProperty(
db_with_cfh->cfh[i],
"rocksdb.aggregated-table-properties-at-level" +
ToString(level),
&stats)) {
if (stats.find("# entries=0") == std::string::npos) {
fprintf(stderr, "Level[%d]: %s\n", level,
stats.c_str());
}
}
}
}
}
} else if (db) {
if (db->GetProperty("rocksdb.stats", &stats)) {
fprintf(stderr, "%s\n", stats.c_str());
}
if (FLAGS_show_table_properties) {
for (int level = 0; level < FLAGS_num_levels; ++level) {
if (db->GetProperty(
"rocksdb.aggregated-table-properties-at-level" +
ToString(level),
&stats)) {
if (stats.find("# entries=0") == std::string::npos) {
fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
}
}
}
}
}
}
next_report_ += FLAGS_stats_interval;
last_report_finish_ = now;
last_report_done_ = done_;
}
}
if (id_ == 0 && FLAGS_thread_status_per_interval) {
PrintThreadStatus();
}
fflush(stderr);
}
}
void AddBytes(int64_t n) {
bytes_ += n;
}
void Report(const Slice& name) {
// Pretend at least one op was done in case we are running a benchmark
// that does not call FinishedOps().
if (done_ < 1) done_ = 1;
std::string extra;
if (bytes_ > 0) {
// Rate is computed on actual elapsed time, not the sum of per-thread
// elapsed times.
double elapsed = (finish_ - start_) * 1e-6;
char rate[100];
snprintf(rate, sizeof(rate), "%6.1f MB/s",
(bytes_ / 1048576.0) / elapsed);
extra = rate;
}
AppendWithSpace(&extra, message_);
double elapsed = (finish_ - start_) * 1e-6;
double throughput = (double)done_/elapsed;
fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
name.ToString().c_str(),
elapsed * 1e6 / done_,
(long)throughput,
(extra.empty() ? "" : " "),
extra.c_str());
if (FLAGS_histogram) {
for (auto it = hist_.begin(); it != hist_.end(); ++it) {
fprintf(stdout, "Microseconds per %s:\n%s\n",
OperationTypeString[it->first].c_str(),
it->second->ToString().c_str());
}
}
if (FLAGS_report_file_operations) {
ReportFileOpEnv* env = static_cast<ReportFileOpEnv*>(FLAGS_env);
ReportFileOpCounters* counters = env->counters();
fprintf(stdout, "Num files opened: %d\n",
counters->open_counter_.load(std::memory_order_relaxed));
fprintf(stdout, "Num Read(): %d\n",
counters->read_counter_.load(std::memory_order_relaxed));
fprintf(stdout, "Num Append(): %d\n",
counters->append_counter_.load(std::memory_order_relaxed));
fprintf(stdout, "Num bytes read: %" PRIu64 "\n",
counters->bytes_read_.load(std::memory_order_relaxed));
fprintf(stdout, "Num bytes written: %" PRIu64 "\n",
counters->bytes_written_.load(std::memory_order_relaxed));
env->reset();
}
fflush(stdout);
}
};
class CombinedStats {
public:
void AddStats(const Stats& stat) {
uint64_t total_ops = stat.done_;
uint64_t total_bytes_ = stat.bytes_;
double elapsed;
if (total_ops < 1) {
total_ops = 1;
}
elapsed = (stat.finish_ - stat.start_) * 1e-6;
throughput_ops_.emplace_back(total_ops / elapsed);
if (total_bytes_ > 0) {
double mbs = (total_bytes_ / 1048576.0);
throughput_mbs_.emplace_back(mbs / elapsed);
}
}
void Report(const std::string& bench_name) {
const char* name = bench_name.c_str();
int num_runs = static_cast<int>(throughput_ops_.size());
if (throughput_mbs_.size() == throughput_ops_.size()) {
fprintf(stdout,
"%s [AVG %d runs] : %d ops/sec; %6.1f MB/sec\n"
"%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
CalcAvg(throughput_mbs_), name, num_runs,
static_cast<int>(CalcMedian(throughput_ops_)),
CalcMedian(throughput_mbs_));
} else {
fprintf(stdout,
"%s [AVG %d runs] : %d ops/sec\n"
"%s [MEDIAN %d runs] : %d ops/sec\n",
name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)), name,
num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
}
}
private:
double CalcAvg(std::vector<double> data) {
double avg = 0;
for (double x : data) {
avg += x;
}
avg = avg / data.size();
return avg;
}
double CalcMedian(std::vector<double> data) {
assert(data.size() > 0);
std::sort(data.begin(), data.end());
size_t mid = data.size() / 2;
if (data.size() % 2 == 1) {
// Odd number of entries
return data[mid];
} else {
// Even number of entries
return (data[mid] + data[mid - 1]) / 2;
}
}
std::vector<double> throughput_ops_;
std::vector<double> throughput_mbs_;
};
class TimestampEmulator {
private:
std::atomic<uint64_t> timestamp_;
public:
TimestampEmulator() : timestamp_(0) {}
uint64_t Get() const { return timestamp_.load(); }
void Inc() { timestamp_++; }
};
// State shared by all concurrent executions of the same benchmark.
struct SharedState {
port::Mutex mu;
port::CondVar cv;
int total;
int perf_level;
std::shared_ptr<RateLimiter> write_rate_limiter;
std::shared_ptr<RateLimiter> read_rate_limiter;
// Each thread goes through the following states:
// (1) initializing
// (2) waiting for others to be initialized
// (3) running
// (4) done
long num_initialized;
long num_done;
bool start;
SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
};
// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
int tid; // 0..n-1 when running in n threads
Random64 rand; // Has different seeds for different threads
Stats stats;
SharedState* shared;
/* implicit */ ThreadState(int index)
: tid(index),
rand((FLAGS_seed ? FLAGS_seed : 1000) + index) {
}
};
class Duration {
public:
Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
max_seconds_ = max_seconds;
max_ops_= max_ops;
ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
ops_ = 0;
start_at_ = FLAGS_env->NowMicros();
}
int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }
bool Done(int64_t increment) {
if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops
ops_ += increment;
if (max_seconds_) {
// Recheck every appx 1000 ops (exact iff increment is factor of 1000)
auto granularity = FLAGS_ops_between_duration_checks;
if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
uint64_t now = FLAGS_env->NowMicros();
return ((now - start_at_) / 1000000) >= max_seconds_;
} else {
return false;
}
} else {
return ops_ > max_ops_;
}
}
private:
uint64_t max_seconds_;
int64_t max_ops_;
int64_t ops_per_stage_;
int64_t ops_;
uint64_t start_at_;
};
class Benchmark {
private:
std::shared_ptr<Cache> cache_;
std::shared_ptr<Cache> compressed_cache_;
std::shared_ptr<const FilterPolicy> filter_policy_;
const SliceTransform* prefix_extractor_;
DBWithColumnFamilies db_;
std::vector<DBWithColumnFamilies> multi_dbs_;
int64_t num_;
int value_size_;
int key_size_;
int prefix_size_;
int64_t keys_per_prefix_;
int64_t entries_per_batch_;
int64_t writes_per_range_tombstone_;
int64_t range_tombstone_width_;
int64_t max_num_range_tombstones_;
WriteOptions write_options_;
Options open_options_; // keep options around to properly destroy db later
int64_t reads_;
int64_t deletes_;
double read_random_exp_range_;
int64_t writes_;
int64_t readwrites_;
int64_t merge_keys_;
bool report_file_operations_;
bool use_blob_db_;
bool SanityCheck() {
if (FLAGS_compression_ratio > 1) {
fprintf(stderr, "compression_ratio should be between 0 and 1\n");
return false;
}
return true;
}
inline bool CompressSlice(const Slice& input, std::string* compressed) {
bool ok = true;
switch (FLAGS_compression_type_e) {
case rocksdb::kSnappyCompression:
ok = Snappy_Compress(Options().compression_opts, input.data(),
input.size(), compressed);
break;
case rocksdb::kZlibCompression:
ok = Zlib_Compress(Options().compression_opts, 2, input.data(),
input.size(), compressed);
break;
case rocksdb::kBZip2Compression:
ok = BZip2_Compress(Options().compression_opts, 2, input.data(),
input.size(), compressed);
break;
case rocksdb::kLZ4Compression:
ok = LZ4_Compress(Options().compression_opts, 2, input.data(),
input.size(), compressed);
break;
case rocksdb::kLZ4HCCompression:
ok = LZ4HC_Compress(Options().compression_opts, 2, input.data(),
input.size(), compressed);
break;
case rocksdb::kXpressCompression:
ok = XPRESS_Compress(input.data(),
input.size(), compressed);
break;
case rocksdb::kZSTD:
ok = ZSTD_Compress(Options().compression_opts, input.data(),
input.size(), compressed);
break;
default:
ok = false;
}
return ok;
}
void PrintHeader() {
PrintEnvironment();
fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size);
fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n",
FLAGS_value_size,
static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
((static_cast<int64_t>(FLAGS_key_size + FLAGS_value_size) * num_)
/ 1048576.0));
fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
(((FLAGS_key_size + FLAGS_value_size * FLAGS_compression_ratio)
* num_)
/ 1048576.0));
fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
FLAGS_benchmark_write_rate_limit);
fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
FLAGS_benchmark_read_rate_limit);
if (FLAGS_enable_numa) {
fprintf(stderr, "Running in NUMA enabled mode.\n");
#ifndef NUMA
fprintf(stderr, "NUMA is not defined in the system.\n");
exit(1);
#else
if (numa_available() == -1) {
fprintf(stderr, "NUMA is not supported by the system.\n");
exit(1);
}
#endif
}
auto compression = CompressionTypeToString(FLAGS_compression_type_e);
fprintf(stdout, "Compression: %s\n", compression.c_str());
switch (FLAGS_rep_factory) {
case kPrefixHash:
fprintf(stdout, "Memtablerep: prefix_hash\n");
break;
case kSkipList:
fprintf(stdout, "Memtablerep: skip_list\n");
break;
case kVectorRep:
fprintf(stdout, "Memtablerep: vector\n");
break;
case kHashLinkedList:
fprintf(stdout, "Memtablerep: hash_linkedlist\n");
break;
case kCuckoo:
fprintf(stdout, "Memtablerep: cuckoo\n");
break;
}
fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
PrintWarnings(compression.c_str());
fprintf(stdout, "------------------------------------------------\n");
}
void PrintWarnings(const char* compression) {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
fprintf(stdout,
"WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
);
#endif
#ifndef NDEBUG
fprintf(stdout,
"WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
if (FLAGS_compression_type_e != rocksdb::kNoCompression) {
// The test string should not be too small.
const int len = FLAGS_block_size;
std::string input_str(len, 'y');
std::string compressed;
bool result = CompressSlice(Slice(input_str), &compressed);
if (!result) {
fprintf(stdout, "WARNING: %s compression is not enabled\n",
compression);
} else if (compressed.size() >= input_str.size()) {
fprintf(stdout, "WARNING: %s compression is not effective\n",
compression);
}
}
}
// Current the following isn't equivalent to OS_LINUX.
#if defined(__linux)
static Slice TrimSpace(Slice s) {
unsigned int start = 0;
while (start < s.size() && isspace(s[start])) {
start++;
}
unsigned int limit = static_cast<unsigned int>(s.size());
while (limit > start && isspace(s[limit-1])) {
limit--;
}
return Slice(s.data() + start, limit - start);
}
#endif
void PrintEnvironment() {
fprintf(stderr, "RocksDB: version %d.%d\n",
kMajorVersion, kMinorVersion);
#if defined(__linux)
time_t now = time(nullptr);
char buf[52];
// Lint complains about ctime() usage, so replace it with ctime_r(). The
// requirement is to provide a buffer which is at least 26 bytes.
fprintf(stderr, "Date: %s",
ctime_r(&now, buf)); // ctime_r() adds newline
FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
if (cpuinfo != nullptr) {
char line[1000];
int num_cpus = 0;
std::string cpu_type;
std::string cache_size;
while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
const char* sep = strchr(line, ':');
if (sep == nullptr) {
continue;
}
Slice key = TrimSpace(Slice(line, sep - 1 - line));
Slice val = TrimSpace(Slice(sep + 1));
if (key == "model name") {
++num_cpus;
cpu_type = val.ToString();
} else if (key == "cache size") {
cache_size = val.ToString();
}
}
fclose(cpuinfo);
fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
}
#endif
}
static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
const Slice& key) {
const char* pos = key.data();
pos += 8;
uint64_t timestamp = 0;
if (port::kLittleEndian) {
int bytes_to_fill = 8;
for (int i = 0; i < bytes_to_fill; ++i) {
timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
<< ((bytes_to_fill - i - 1) << 3));
}
} else {
memcpy(&timestamp, pos, sizeof(timestamp));
}
return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
}
class ExpiredTimeFilter : public CompactionFilter {
public:
explicit ExpiredTimeFilter(
const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
: timestamp_emulator_(timestamp_emulator) {}
bool Filter(int level, const Slice& key, const Slice& existing_value,
std::string* new_value, bool* value_changed) const override {
return KeyExpired(timestamp_emulator_.get(), key);
}
const char* Name() const override { return "ExpiredTimeFilter"; }
private:
std::shared_ptr<TimestampEmulator> timestamp_emulator_;
};
std::shared_ptr<Cache> NewCache(int64_t capacity) {
if (capacity <= 0) {
return nullptr;
}
if (FLAGS_use_clock_cache) {
auto cache = NewClockCache((size_t)capacity, FLAGS_cache_numshardbits);
if (!cache) {
fprintf(stderr, "Clock cache not supported.");
exit(1);
}
return cache;
} else {
return NewLRUCache((size_t)capacity, FLAGS_cache_numshardbits,
false /*strict_capacity_limit*/,
FLAGS_cache_high_pri_pool_ratio);
}
}
public:
Benchmark()
: cache_(NewCache(FLAGS_cache_size)),
compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
filter_policy_(FLAGS_bloom_bits >= 0
? NewBloomFilterPolicy(FLAGS_bloom_bits,
FLAGS_use_block_based_filter)
: nullptr),
prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
num_(FLAGS_num),
value_size_(FLAGS_value_size),
key_size_(FLAGS_key_size),
prefix_size_(FLAGS_prefix_size),
keys_per_prefix_(FLAGS_keys_per_prefix),
entries_per_batch_(1),
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
read_random_exp_range_(0.0),
writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
readwrites_(
(FLAGS_writes < 0 && FLAGS_reads < 0)
? FLAGS_num
: ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
report_file_operations_(FLAGS_report_file_operations),
#ifndef ROCKSDB_LITE
use_blob_db_(FLAGS_use_blob_db) {
#else
use_blob_db_(false) {
#endif // !ROCKSDB_LITE
// use simcache instead of cache
if (FLAGS_simcache_size >= 0) {
if (FLAGS_cache_numshardbits >= 1) {
cache_ =
NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
} else {
cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
}
}
if (report_file_operations_) {
if (!FLAGS_hdfs.empty()) {
fprintf(stderr,
"--hdfs and --report_file_operations cannot be enabled "
"at the same time");
exit(1);
}
FLAGS_env = new ReportFileOpEnv(rocksdb::Env::Default());
}
if (FLAGS_prefix_size > FLAGS_key_size) {
fprintf(stderr, "prefix size is larger than key size");
exit(1);
}
std::vector<std::string> files;
FLAGS_env->GetChildren(FLAGS_db, &files);
for (size_t i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("heap-")) {
FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
}
}
if (!FLAGS_use_existing_db) {
Options options;
if (!FLAGS_wal_dir.empty()) {
options.wal_dir = FLAGS_wal_dir;
}
#ifndef ROCKSDB_LITE
if (use_blob_db_) {
blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
}
#endif // !ROCKSDB_LITE
DestroyDB(FLAGS_db, options);
if (!FLAGS_wal_dir.empty()) {
FLAGS_env->DeleteDir(FLAGS_wal_dir);
}
if (FLAGS_num_multi_db > 1) {
FLAGS_env->CreateDir(FLAGS_db);
if (!FLAGS_wal_dir.empty()) {
FLAGS_env->CreateDir(FLAGS_wal_dir);
}
}
}
}
~Benchmark() {
db_.DeleteDBs();
delete prefix_extractor_;
if (cache_.get() != nullptr) {
// this will leak, but we're shutting down so nobody cares
cache_->DisownData();
}
}
Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
char* data = new char[key_size_];
const char* const_data = data;
key_guard->reset(const_data);
return Slice(key_guard->get(), key_size_);
}
// Generate key according to the given specification and random number.
// The resulting key will have the following format (if keys_per_prefix_
// is positive), extra trailing bytes are either cut off or padded with '0'.
// The prefix value is derived from key value.
// ----------------------------
// | prefix 00000 | key 00000 |
// ----------------------------
// If keys_per_prefix_ is 0, the key is simply a binary representation of
// random number followed by trailing '0's
// ----------------------------
// | key 00000 |
// ----------------------------
void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
char* start = const_cast<char*>(key->data());
char* pos = start;
if (keys_per_prefix_ > 0) {
int64_t num_prefix = num_keys / keys_per_prefix_;
int64_t prefix = v % num_prefix;
int bytes_to_fill = std::min(prefix_size_, 8);
if (port::kLittleEndian) {
for (int i = 0; i < bytes_to_fill; ++i) {
pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
}
} else {
memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
}
if (prefix_size_ > 8) {
// fill the rest with 0s
memset(pos + 8, '0', prefix_size_ - 8);
}
pos += prefix_size_;
}
int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
if (port::kLittleEndian) {
for (int i = 0; i < bytes_to_fill; ++i) {
pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
}
} else {
memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
}
pos += bytes_to_fill;
if (key_size_ > pos - start) {
memset(pos, '0', key_size_ - (pos - start));
}
}
std::string GetPathForMultiple(std::string base_name, size_t id) {
if (!base_name.empty()) {
#ifndef OS_WIN
if (base_name.back() != '/') {
base_name += '/';
}
#else
if (base_name.back() != '\\') {
base_name += '\\';
}
#endif
}
return base_name + ToString(id);
}
void VerifyDBFromDB(std::string& truth_db_name) {
DBWithColumnFamilies truth_db;
auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
exit(1);
}
ReadOptions ro;
ro.total_order_seek = true;
std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
// Verify that all the key/values in truth_db are retrivable in db with ::Get
fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
std::string value;
s = db_.db->Get(ro, truth_iter->key(), &value);
assert(s.ok());
// TODO(myabandeh): provide debugging hints
assert(Slice(value) == truth_iter->value());
}
// Verify that the db iterator does not give any extra key/value
fprintf(stderr, "Verifying db == truth_db...\n");
for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next(), truth_iter->Next()) {
assert(truth_iter->Valid());
assert(truth_iter->value() == db_iter->value());
}
// No more key should be left unchecked in truth_db
assert(!truth_iter->Valid());
fprintf(stderr, "...Verified\n");
}
void Run() {
if (!SanityCheck()) {
exit(1);
}
Open(&open_options_);
PrintHeader();
std::stringstream benchmark_stream(FLAGS_benchmarks);
std::string name;
std::unique_ptr<ExpiredTimeFilter> filter;
while (std::getline(benchmark_stream, name, ',')) {
// Sanitize parameters
num_ = FLAGS_num;
reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
value_size_ = FLAGS_value_size;
key_size_ = FLAGS_key_size;
entries_per_batch_ = FLAGS_batch_size;
writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
range_tombstone_width_ = FLAGS_range_tombstone_width;
max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
write_options_ = WriteOptions();
read_random_exp_range_ = FLAGS_read_random_exp_range;
if (FLAGS_sync) {
write_options_.sync = true;
}
write_options_.disableWAL = FLAGS_disable_wal;
void (Benchmark::*method)(ThreadState*) = nullptr;
void (Benchmark::*post_process_method)() = nullptr;
bool fresh_db = false;
int num_threads = FLAGS_threads;
int num_repeat = 1;
int num_warmup = 0;
if (!name.empty() && *name.rbegin() == ']') {
auto it = name.find('[');
if (it == std::string::npos) {
fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
exit(1);
}
std::string args = name.substr(it + 1);
args.resize(args.size() - 1);
name.resize(it);
std::string bench_arg;
std::stringstream args_stream(args);
while (std::getline(args_stream, bench_arg, '-')) {
if (bench_arg.empty()) {
continue;
}
if (bench_arg[0] == 'X') {
// Repeat the benchmark n times
std::string num_str = bench_arg.substr(1);
num_repeat = std::stoi(num_str);
} else if (bench_arg[0] == 'W') {
// Warm up the benchmark for n times
std::string num_str = bench_arg.substr(1);
num_warmup = std::stoi(num_str);
}
}
}
// Both fillseqdeterministic and filluniquerandomdeterministic
// fill the levels except the max level with UNIQUE_RANDOM
// and fill the max level with fillseq and filluniquerandom, respectively
if (name == "fillseqdeterministic" ||
name == "filluniquerandomdeterministic") {
if (!FLAGS_disable_auto_compactions) {
fprintf(stderr,
"Please disable_auto_compactions in FillDeterministic "
"benchmark\n");
exit(1);
}
if (num_threads > 1) {
fprintf(stderr,
"filldeterministic multithreaded not supported"
", use 1 thread\n");
num_threads = 1;
}
fresh_db = true;
if (name == "fillseqdeterministic") {
method = &Benchmark::WriteSeqDeterministic;
} else {
method = &Benchmark::WriteUniqueRandomDeterministic;
}
} else if (name == "fillseq") {
fresh_db = true;
method = &Benchmark::WriteSeq;
} else if (name == "fillbatch") {
fresh_db = true;
entries_per_batch_ = 1000;
method = &Benchmark::WriteSeq;
} else if (name == "fillrandom") {
fresh_db = true;
method = &Benchmark::WriteRandom;
} else if (name == "filluniquerandom") {
fresh_db = true;
if (num_threads > 1) {
fprintf(stderr,
"filluniquerandom multithreaded not supported"
", use 1 thread");
num_threads = 1;
}
method = &Benchmark::WriteUniqueRandom;
} else if (name == "overwrite") {
method = &Benchmark::WriteRandom;
} else if (name == "fillsync") {
fresh_db = true;
num_ /= 1000;
write_options_.sync = true;
method = &Benchmark::WriteRandom;
} else if (name == "fill100K") {
fresh_db = true;
num_ /= 1000;
value_size_ = 100 * 1000;
method = &Benchmark::WriteRandom;
} else if (name == "readseq") {
method = &Benchmark::ReadSequential;
} else if (name == "readtocache") {
method = &Benchmark::ReadSequential;
num_threads = 1;
reads_ = num_;
} else if (name == "readreverse") {
method = &Benchmark::ReadReverse;
} else if (name == "readrandom") {
method = &Benchmark::ReadRandom;
} else if (name == "readrandomfast") {
method = &Benchmark::ReadRandomFast;
} else if (name == "multireadrandom") {
fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
entries_per_batch_);
method = &Benchmark::MultiReadRandom;
} else if (name == "readmissing") {
++key_size_;
method = &Benchmark::ReadRandom;
} else if (name == "newiterator") {
method = &Benchmark::IteratorCreation;
} else if (name == "newiteratorwhilewriting") {
num_threads++; // Add extra thread for writing
method = &Benchmark::IteratorCreationWhileWriting;
} else if (name == "seekrandom") {
method = &Benchmark::SeekRandom;
} else if (name == "seekrandomwhilewriting") {
num_threads++; // Add extra thread for writing
method = &Benchmark::SeekRandomWhileWriting;
} else if (name == "seekrandomwhilemerging") {
num_threads++; // Add extra thread for merging
method = &Benchmark::SeekRandomWhileMerging;
} else if (name == "readrandomsmall") {
reads_ /= 1000;
method = &Benchmark::ReadRandom;
} else if (name == "deleteseq") {
method = &Benchmark::DeleteSeq;
} else if (name == "deleterandom") {
method = &Benchmark::DeleteRandom;
} else if (name == "readwhilewriting") {
num_threads++; // Add extra thread for writing
method = &Benchmark::ReadWhileWriting;
} else if (name == "readwhilemerging") {
num_threads++; // Add extra thread for writing
method = &Benchmark::ReadWhileMerging;
} else if (name == "readrandomwriterandom") {
method = &Benchmark::ReadRandomWriteRandom;
} else if (name == "readrandommergerandom") {
if (FLAGS_merge_operator.empty()) {
fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
name.c_str());
exit(1);
}
method = &Benchmark::ReadRandomMergeRandom;
} else if (name == "updaterandom") {
method = &Benchmark::UpdateRandom;
} else if (name == "appendrandom") {
method = &Benchmark::AppendRandom;
} else if (name == "mergerandom") {
if (FLAGS_merge_operator.empty()) {
fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
name.c_str());
exit(1);
}
method = &Benchmark::MergeRandom;
} else if (name == "randomwithverify") {
method = &Benchmark::RandomWithVerify;
} else if (name == "fillseekseq") {
method = &Benchmark::WriteSeqSeekSeq;
} else if (name == "compact") {
method = &Benchmark::Compact;
} else if (name == "compactall") {
CompactAll();
} else if (name == "crc32c") {
method = &Benchmark::Crc32c;
} else if (name == "xxhash") {
method = &Benchmark::xxHash;
} else if (name == "acquireload") {
method = &Benchmark::AcquireLoad;
} else if (name == "compress") {
method = &Benchmark::Compress;
} else if (name == "uncompress") {
method = &Benchmark::Uncompress;
#ifndef ROCKSDB_LITE
} else if (name == "randomtransaction") {
method = &Benchmark::RandomTransaction;
post_process_method = &Benchmark::RandomTransactionVerify;
#endif // ROCKSDB_LITE
} else if (name == "randomreplacekeys") {
fresh_db = true;
method = &Benchmark::RandomReplaceKeys;
} else if (name == "timeseries") {
timestamp_emulator_.reset(new TimestampEmulator());
if (FLAGS_expire_style == "compaction_filter") {
filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
fprintf(stdout, "Compaction filter is used to remove expired data");
open_options_.compaction_filter = filter.get();
}
fresh_db = true;
method = &Benchmark::TimeSeries;
} else if (name == "stats") {
PrintStats("rocksdb.stats");
} else if (name == "resetstats") {
ResetStats();
} else if (name == "verify") {
VerifyDBFromDB(FLAGS_truth_db);
} else if (name == "levelstats") {
PrintStats("rocksdb.levelstats");
} else if (name == "sstables") {
PrintStats("rocksdb.sstables");
} else if (!name.empty()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
exit(1);
}
if (fresh_db) {
if (FLAGS_use_existing_db) {
fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
name.c_str());
method = nullptr;
} else {
if (db_.db != nullptr) {
db_.DeleteDBs();
DestroyDB(FLAGS_db, open_options_);
}
Options options = open_options_;
for (size_t i = 0; i < multi_dbs_.size(); i++) {
delete multi_dbs_[i].db;
if (!open_options_.wal_dir.empty()) {
options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
}
DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
}
multi_dbs_.clear();
}
Open(&open_options_); // use open_options for the last accessed
}
if (method != nullptr) {
fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
if (num_warmup > 0) {
printf("Warming up benchmark by running %d times\n", num_warmup);
}
for (int i = 0; i < num_warmup; i++) {
RunBenchmark(num_threads, name, method);
}
if (num_repeat > 1) {
printf("Running benchmark for %d times\n", num_repeat);
}
CombinedStats combined_stats;
for (int i = 0; i < num_repeat; i++) {
Stats stats = RunBenchmark(num_threads, name, method);
combined_stats.AddStats(stats);
}
if (num_repeat > 1) {
combined_stats.Report(name);
}
}
if (post_process_method != nullptr) {
(this->*post_process_method)();
}
}
if (FLAGS_statistics) {
fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
}
if (FLAGS_simcache_size >= 0) {
fprintf(stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
static_cast_with_check<SimCache, Cache>(cache_.get())
->ToString()
.c_str());
}
}
private:
std::shared_ptr<TimestampEmulator> timestamp_emulator_;
struct ThreadArg {
Benchmark* bm;
SharedSt