// blob: e9ae770cfaf14712a2d495144d37027078705b6e [file] [log] [blame]
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
#include <cstdio>
// Stub entry point used when gflags is unavailable or in ROCKSDB_LITE builds:
// reports the missing dependency on stderr and exits with a failure status.
int main() {
  fputs("Please install gflags to run rocksdb tools\n", stderr);
  return 1;
}
#elif defined(OS_MACOSX) || defined(OS_WIN)
// forward_iterator_bench is blocked on macOS and Windows: the tool builds as
// a no-op binary that exits successfully.
int main() {
  return 0;
}
#else
#include <gflags/gflags.h>
#include <semaphore.h>
#include <atomic>
#include <bitset>
#include <chrono>
#include <climits>
#include <condition_variable>
#include <limits>
#include <mutex>
#include <queue>
#include <random>
#include <thread>
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "port/port.h"
#include "util/testharness.h"
// Largest shard index a Reader can track: it sizes the per-reader bitset of
// pending shards, so --shards must not exceed this value.
const int MAX_SHARDS = 100000;

// Thread counts.
DEFINE_int32(writers, 8, "Number of writer threads");
DEFINE_int32(readers, 8, "Number of reader threads");

// Workload shape.
DEFINE_int64(rate, 100000,
             "Target number of writes per second, summed over all writers");
DEFINE_int64(value_size, 300, "Size in bytes of each written value");
DEFINE_int64(shards, 1000,
             "Number of shards (independent key sequences); at most 100000");

// Storage configuration.
DEFINE_int64(memtable_size, 500000000,
             "Options::write_buffer_size, in bytes");
DEFINE_int64(block_cache_size, 300000000,
             "Capacity of the LRU block cache, in bytes");
DEFINE_int64(block_size, 65536,
             "BlockBasedTableOptions::block_size, in bytes");

// Run control and iterator behaviour.
DEFINE_double(runtime, 300.0,
              "How long the writers run, in seconds; a negative value means "
              "no deadline");
DEFINE_bool(cache_only_first, true,
            "Read each shard with a block-cache-only iterator first and fall "
            "back to a regular iterator on a cache miss");
DEFINE_bool(iterate_upper_bound, true,
            "Set ReadOptions::iterate_upper_bound to the largest key of the "
            "shard being read");
// Process-wide counters updated by the worker threads and reported
// periodically by StatsThread.
//
// The 128-byte pad arrays keep the writer-side counter and the reader-side
// counters apart in memory (presumably to avoid false sharing between the
// threads that bump them).
struct Stats {
  char pad1[128] __attribute__((__unused__));

  // Number of Put() calls completed by all writers.
  std::atomic<uint64_t> written{0};

  char pad2[128] __attribute__((__unused__));

  // Number of keys successfully read and verified by all readers.
  std::atomic<uint64_t> read{0};
  // Number of times a block-cache-only iterator reported Incomplete.
  std::atomic<uint64_t> cache_misses{0};

  char pad3[128] __attribute__((__unused__));
};

// The single global instance, referenced as ::stats.
Stats stats;
// Fixed-size database key: a (shard, seqno) pair.
//
// Both fields are stored big-endian, so comparing the raw 16 bytes
// lexicographically orders keys by shard first and by seqno second. The
// struct is packed and its bytes are passed to the database and copied back
// out of it directly.
struct Key {
  // Zero-initialize so a default-constructed key never holds indeterminate
  // bytes.
  Key() : shard_be(0), seqno_be(0) {}

  // Builds a key from host-order values.
  Key(uint64_t shard_in, uint64_t seqno_in)
      : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {}

  // Accessors return host-order values.
  uint64_t shard() const { return be64toh(shard_be); }
  uint64_t seqno() const { return be64toh(seqno_be); }

 private:
  uint64_t shard_be;
  uint64_t seqno_be;
} __attribute__((__packed__));

static_assert(sizeof(Key) == 2 * sizeof(uint64_t),
              "Key must be exactly two 64-bit fields with no padding");
struct Reader;
struct Writer;

// Per-shard state shared between the shard's writer and its reader.
//
// The pad arrays separate the fields stored by the writer from the fields
// stored by the reader (presumably to avoid false sharing).
struct ShardState {
  char pad1[128] __attribute__((__unused__));

  // Sequence number of the last key the writer has put for this shard.
  std::atomic<uint64_t> last_written{0};
  // The writer and reader assigned to this shard; set by main() before any
  // writer thread is started. Not owned.
  Writer* writer = nullptr;
  Reader* reader = nullptr;

  char pad2[128] __attribute__((__unused__));

  // Sequence number of the last key the reader has read and verified.
  std::atomic<uint64_t> last_read{0};
  // Iterators created lazily by Reader::readOnceFromShard(). it_cacheonly is
  // only created when --cache_only_first is set.
  std::unique_ptr<rocksdb::Iterator> it;
  std::unique_ptr<rocksdb::Iterator> it_cacheonly;
  // Storage for the iterators' upper bound. upper_bound_slice points into
  // upper_bound, so a ShardState must not be moved or copied once the
  // iterators exist.
  Key upper_bound;
  rocksdb::Slice upper_bound_slice;

  char pad3[128] __attribute__((__unused__));
};
// Reads newly written keys on a dedicated thread.
//
// Writers call onWrite(shard) after each Put. That enqueues the shard (at
// most once while it is pending) and posts a semaphore; the reader thread
// wakes up, dequeues one shard and reads everything written to it since the
// previous read, verifying that keys arrive in sequence.
struct Reader {
 public:
  // Starts the reader thread immediately. shard_states and db are not owned
  // and must outlive this object.
  explicit Reader(std::vector<ShardState>* shard_states, rocksdb::DB* db)
      : shard_states_(shard_states), db_(db) {
    sem_init(&sem_, 0, 0);
    // All members are initialized by now, so the thread may safely touch
    // them. Spelled with the same qualified name as the thread_ member.
    thread_ = rocksdb::port::Thread(&Reader::run, this);
  }

  // Thread body: waits for notifications and processes one shard per wakeup
  // until the destructor sets done_.
  void run() {
    while (true) {
      sem_wait(&sem_);
      if (done_.load()) {
        break;
      }
      uint64_t shard;
      {
        std::lock_guard<std::mutex> guard(queue_mutex_);
        if (shards_pending_queue_.empty()) {
          // sem_wait() may return without a matching post (for example when
          // interrupted by a signal); there is nothing to read in that case.
          continue;
        }
        shard = shards_pending_queue_.front();
        shards_pending_queue_.pop();
        shards_pending_set_.reset(shard);
      }
      readOnceFromShard(shard);
    }
  }

  // Reads keys (last_read, last_written] of the given shard.
  //
  // Iterators are created on first use. When --cache_only_first is set, a
  // block-cache-only iterator is tried first; if it reports Incomplete the
  // miss is counted and the regular iterator continues from where the
  // cache-only one stopped.
  void readOnceFromShard(uint64_t shard) {
    ShardState& state = (*shard_states_)[shard];
    if (!state.it) {
      // Initialize iterators
      rocksdb::ReadOptions options;
      options.tailing = true;
      if (FLAGS_iterate_upper_bound) {
        state.upper_bound = Key(shard, std::numeric_limits<uint64_t>::max());
        state.upper_bound_slice =
            rocksdb::Slice(reinterpret_cast<const char*>(&state.upper_bound),
                           sizeof(state.upper_bound));
        options.iterate_upper_bound = &state.upper_bound_slice;
      }
      state.it.reset(db_->NewIterator(options));
      if (FLAGS_cache_only_first) {
        options.read_tier = rocksdb::ReadTier::kBlockCacheTier;
        state.it_cacheonly.reset(db_->NewIterator(options));
      }
    }

    // Snapshot the target once so both iterators aim for the same seqno.
    const uint64_t upto = state.last_written.load();
    for (rocksdb::Iterator* it : {state.it_cacheonly.get(), state.it.get()}) {
      if (it == nullptr) {
        continue;
      }
      if (state.last_read.load() >= upto) {
        break;
      }
      // Only this thread stores last_read, so the first unread seqno is
      // stable for the duration of the loop below.
      const uint64_t first_unread = state.last_read.load() + 1;
      for (uint64_t seq = first_unread; seq <= upto; ++seq) {
        if (seq == first_unread) {
          // Position the iterator once, then advance with Next().
          Key from(shard, first_unread);
          it->Seek(rocksdb::Slice(reinterpret_cast<const char*>(&from),
                                  sizeof(from)));
        } else {
          it->Next();
        }
        if (it->status().IsIncomplete()) {
          ++::stats.cache_misses;
          break;
        }
        assert(it->Valid());
        assert(it->key().size() == sizeof(Key));
        Key key;
        memcpy(&key, it->key().data(), it->key().size());
        // fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n",
        //         shard, seq, key.shard(), key.seqno());
        assert(key.shard() == shard);
        assert(key.seqno() == seq);
        state.last_read.store(seq);
        ++::stats.read;
      }
    }
  }

  // Called by a writer after it has written to `shard`. Enqueues the shard
  // and wakes the reader thread unless the shard is already pending.
  void onWrite(uint64_t shard) {
    std::lock_guard<std::mutex> guard(queue_mutex_);
    if (!shards_pending_set_.test(shard)) {
      shards_pending_queue_.push(shard);
      shards_pending_set_.set(shard);
      sem_post(&sem_);
    }
  }

  // Stops the reader thread (pending shards are not drained), joins it and
  // releases the semaphore.
  ~Reader() {
    done_.store(true);
    sem_post(&sem_);
    thread_.join();
    sem_destroy(&sem_);
  }

 private:
  char pad1[128] __attribute__((__unused__));
  std::vector<ShardState>* shard_states_;
  rocksdb::DB* db_;
  rocksdb::port::Thread thread_;
  // Counts pending wakeups: one post per enqueued shard plus one at shutdown.
  sem_t sem_;
  // Guards shards_pending_set_ and shards_pending_queue_.
  std::mutex queue_mutex_;
  // Bit i is set while shard i sits in shards_pending_queue_.
  std::bitset<MAX_SHARDS + 1> shards_pending_set_;
  std::queue<uint64_t> shards_pending_queue_;
  std::atomic<bool> done_{false};
  char pad2[128] __attribute__((__unused__));
};
struct Writer {
explicit Writer(std::vector<ShardState>* shard_states, rocksdb::DB* db)
: shard_states_(shard_states), db_(db) {}
void start() { thread_ = port::Thread(&Writer::run, this); }
void run() {
std::queue<std::chrono::steady_clock::time_point> workq;
std::chrono::steady_clock::time_point deadline(
std::chrono::steady_clock::now() +
std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime)));
std::vector<uint64_t> my_shards;
for (int i = 1; i <= FLAGS_shards; ++i) {
if ((*shard_states_)[i].writer == this) {
my_shards.push_back(i);
}
}
std::mt19937 rng{std::random_device()()};
std::uniform_int_distribution<int> shard_dist(
0, static_cast<int>(my_shards.size()) - 1);
std::string value(FLAGS_value_size, '*');
while (1) {
auto now = std::chrono::steady_clock::now();
if (FLAGS_runtime >= 0 && now >= deadline) {
break;
}
if (workq.empty()) {
for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) {
std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate);
workq.push(now + offset);
}
}
while (!workq.empty() && workq.front() < now) {
workq.pop();
uint64_t shard = my_shards[shard_dist(rng)];
ShardState& state = (*shard_states_)[shard];
uint64_t seqno = state.last_written.load() + 1;
Key key(shard, seqno);
// fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno);
rocksdb::Status status =
db_->Put(rocksdb::WriteOptions(),
rocksdb::Slice((const char*)&key, sizeof(key)),
rocksdb::Slice(value));
assert(status.ok());
state.last_written.store(seqno);
state.reader->onWrite(shard);
++::stats.written;
}
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
// fprintf(stderr, "Writer done\n");
}
~Writer() { thread_.join(); }
private:
char pad1[128] __attribute__((__unused__));
std::vector<ShardState>* shard_states_;
rocksdb::DB* db_;
rocksdb::port::Thread thread_;
char pad2[128] __attribute__((__unused__));
};
// Prints cumulative and per-interval throughput to stderr about once per
// second on a background thread, until destroyed.
struct StatsThread {
  // Starts the reporting thread. db is not owned and must outlive this
  // object; it is only used to format the current time.
  explicit StatsThread(rocksdb::DB* db)
      : db_(db), thread_(&StatsThread::run, this) {}

  // Thread body: sleeps for up to a second (less when shutdown is requested),
  // then prints totals and the rates since the previous line.
  void run() {
    // using namespace std::chrono;
    auto tstart = std::chrono::steady_clock::now(), tlast = tstart;
    uint64_t wlast = 0, rlast = 0;
    while (!done_.load()) {
      {
        std::unique_lock<std::mutex> lock(cvm_);
        // The predicate makes a shutdown request that arrived before this
        // wait started end the wait immediately instead of after a second.
        cv_.wait_for(lock, std::chrono::seconds(1),
                     [this] { return done_.load(); });
      }
      auto now = std::chrono::steady_clock::now();
      double elapsed =
          std::chrono::duration_cast<std::chrono::duration<double> >(
              now - tlast).count();
      uint64_t w = ::stats.written.load();
      uint64_t r = ::stats.read.load();
      // Arguments are cast to the exact types named by the conversion
      // specifiers.
      fprintf(stderr,
              "%s elapsed %4llds | written %10llu | w/s %10.0f | read %10llu | "
              "r/s %10.0f | cache misses %10llu\n",
              db_->GetEnv()->TimeToString(time(nullptr)).c_str(),
              static_cast<long long>(
                  std::chrono::duration_cast<std::chrono::seconds>(now - tstart)
                      .count()),
              static_cast<unsigned long long>(w), (w - wlast) / elapsed,
              static_cast<unsigned long long>(r), (r - rlast) / elapsed,
              static_cast<unsigned long long>(::stats.cache_misses.load()));
      wlast = w;
      rlast = r;
      tlast = now;
    }
  }

  // Requests shutdown, wakes the thread and waits for it to exit.
  ~StatsThread() {
    {
      std::lock_guard<std::mutex> guard(cvm_);
      done_.store(true);
    }
    cv_.notify_all();
    thread_.join();
  }

 private:
  rocksdb::DB* db_;
  std::mutex cvm_;
  std::condition_variable cv_;
  std::atomic<bool> done_{false};
  // Declared last on purpose: members are initialized in declaration order
  // and the thread starts running inside its constructor, so every member
  // that run() touches must already be initialized.
  rocksdb::port::Thread thread_;
};
int main(int argc, char** argv) {
GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
std::mt19937 rng{std::random_device()()};
rocksdb::Status status;
std::string path = rocksdb::test::TmpDir() + "/forward_iterator_test";
fprintf(stderr, "db path is %s\n", path.c_str());
rocksdb::Options options;
options.create_if_missing = true;
options.compression = rocksdb::CompressionType::kNoCompression;
options.compaction_style = rocksdb::CompactionStyle::kCompactionStyleNone;
options.level0_slowdown_writes_trigger = 99999;
options.level0_stop_writes_trigger = 99999;
options.use_direct_io_for_flush_and_compaction = true;
options.write_buffer_size = FLAGS_memtable_size;
rocksdb::BlockBasedTableOptions table_options;
table_options.block_cache = rocksdb::NewLRUCache(FLAGS_block_cache_size);
table_options.block_size = FLAGS_block_size;
options.table_factory.reset(
rocksdb::NewBlockBasedTableFactory(table_options));
status = rocksdb::DestroyDB(path, options);
assert(status.ok());
rocksdb::DB* db_raw;
status = rocksdb::DB::Open(options, path, &db_raw);
assert(status.ok());
std::unique_ptr<rocksdb::DB> db(db_raw);
std::vector<ShardState> shard_states(FLAGS_shards + 1);
std::deque<Reader> readers;
while (static_cast<int>(readers.size()) < FLAGS_readers) {
readers.emplace_back(&shard_states, db_raw);
}
std::deque<Writer> writers;
while (static_cast<int>(writers.size()) < FLAGS_writers) {
writers.emplace_back(&shard_states, db_raw);
}
// Each shard gets a random reader and random writer assigned to it
for (int i = 1; i <= FLAGS_shards; ++i) {
std::uniform_int_distribution<int> reader_dist(0, FLAGS_readers - 1);
std::uniform_int_distribution<int> writer_dist(0, FLAGS_writers - 1);
shard_states[i].reader = &readers[reader_dist(rng)];
shard_states[i].writer = &writers[writer_dist(rng)];
}
StatsThread stats_thread(db_raw);
for (Writer& w : writers) {
w.start();
}
writers.clear();
readers.clear();
}
#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE)