972f96b3fb
Summary: rocksdb direct io support ``` [gzh@dev11575.prn2 ~/rocksdb] ./db_bench -benchmarks=fillseq --num=1000000 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 5.0 Date: Wed Nov 23 13:17:43 2016 CPU: 40 * Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz CPUCache: 25600 KB Keys: 16 bytes each Values: 100 bytes each (50 bytes after compression) Entries: 1000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 110.6 MB (estimated) FileSize: 62.9 MB (estimated) Write rate: 0 bytes/second Compression: Snappy Memtablerep: skip_list Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags DB path: [/tmp/rocksdbtest-112628/dbbench] fillseq : 4.393 micros/op 227639 ops/sec; 25.2 MB/s [gzh@dev11575.prn2 ~/roc Closes https://github.com/facebook/rocksdb/pull/1564 Differential Revision: D4241093 Pulled By: lightmark fbshipit-source-id: 98c29e3
375 lines
11 KiB
C++
375 lines
11 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under the BSD-style license found in the
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
#ifndef __STDC_FORMAT_MACROS
|
|
#define __STDC_FORMAT_MACROS
|
|
#endif
|
|
|
|
#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
|
|
#include <cstdio>
|
|
int main() {
|
|
fprintf(stderr, "Please install gflags to run rocksdb tools\n");
|
|
return 1;
|
|
}
|
|
#elif defined(OS_MACOSX) || defined(OS_WIN)
|
|
// Block forward_iterator_bench under MAC and Windows
|
|
int main() { return 0; }
|
|
#else
|
|
#include <gflags/gflags.h>
|
|
#include <semaphore.h>
|
|
#include <atomic>
|
|
#include <bitset>
|
|
#include <chrono>
|
|
#include <climits>
|
|
#include <condition_variable>
|
|
#include <limits>
|
|
#include <mutex>
|
|
#include <queue>
|
|
#include <random>
|
|
#include <thread>
|
|
|
|
#include "rocksdb/cache.h"
|
|
#include "rocksdb/db.h"
|
|
#include "rocksdb/status.h"
|
|
#include "rocksdb/table.h"
|
|
#include "util/testharness.h"
|
|
|
|
const int MAX_SHARDS = 100000;
|
|
|
|
DEFINE_int32(writers, 8, "");
|
|
DEFINE_int32(readers, 8, "");
|
|
DEFINE_int64(rate, 100000, "");
|
|
DEFINE_int64(value_size, 300, "");
|
|
DEFINE_int64(shards, 1000, "");
|
|
DEFINE_int64(memtable_size, 500000000, "");
|
|
DEFINE_int64(block_cache_size, 300000000, "");
|
|
DEFINE_int64(block_size, 65536, "");
|
|
DEFINE_double(runtime, 300.0, "");
|
|
DEFINE_bool(cache_only_first, true, "");
|
|
DEFINE_bool(iterate_upper_bound, true, "");
|
|
|
|
struct Stats {
|
|
char pad1[128] __attribute__((__unused__));
|
|
std::atomic<uint64_t> written{0};
|
|
char pad2[128] __attribute__((__unused__));
|
|
std::atomic<uint64_t> read{0};
|
|
std::atomic<uint64_t> cache_misses{0};
|
|
char pad3[128] __attribute__((__unused__));
|
|
} stats;
|
|
|
|
struct Key {
|
|
Key() {}
|
|
Key(uint64_t shard_in, uint64_t seqno_in)
|
|
: shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {}
|
|
|
|
uint64_t shard() const { return be64toh(shard_be); }
|
|
uint64_t seqno() const { return be64toh(seqno_be); }
|
|
|
|
private:
|
|
uint64_t shard_be;
|
|
uint64_t seqno_be;
|
|
} __attribute__((__packed__));
|
|
|
|
struct Reader;
|
|
struct Writer;
|
|
|
|
struct ShardState {
|
|
char pad1[128] __attribute__((__unused__));
|
|
std::atomic<uint64_t> last_written{0};
|
|
Writer* writer;
|
|
Reader* reader;
|
|
char pad2[128] __attribute__((__unused__));
|
|
std::atomic<uint64_t> last_read{0};
|
|
std::unique_ptr<rocksdb::Iterator> it;
|
|
std::unique_ptr<rocksdb::Iterator> it_cacheonly;
|
|
Key upper_bound;
|
|
rocksdb::Slice upper_bound_slice;
|
|
char pad3[128] __attribute__((__unused__));
|
|
};
|
|
|
|
struct Reader {
|
|
public:
|
|
explicit Reader(std::vector<ShardState>* shard_states, rocksdb::DB* db)
|
|
: shard_states_(shard_states), db_(db) {
|
|
sem_init(&sem_, 0, 0);
|
|
thread_ = std::thread(&Reader::run, this);
|
|
}
|
|
|
|
void run() {
|
|
while (1) {
|
|
sem_wait(&sem_);
|
|
if (done_.load()) {
|
|
break;
|
|
}
|
|
|
|
uint64_t shard;
|
|
{
|
|
std::lock_guard<std::mutex> guard(queue_mutex_);
|
|
assert(!shards_pending_queue_.empty());
|
|
shard = shards_pending_queue_.front();
|
|
shards_pending_queue_.pop();
|
|
shards_pending_set_.reset(shard);
|
|
}
|
|
readOnceFromShard(shard);
|
|
}
|
|
}
|
|
|
|
void readOnceFromShard(uint64_t shard) {
|
|
ShardState& state = (*shard_states_)[shard];
|
|
if (!state.it) {
|
|
// Initialize iterators
|
|
rocksdb::ReadOptions options;
|
|
options.tailing = true;
|
|
if (FLAGS_iterate_upper_bound) {
|
|
state.upper_bound = Key(shard, std::numeric_limits<uint64_t>::max());
|
|
state.upper_bound_slice = rocksdb::Slice(
|
|
(const char*)&state.upper_bound, sizeof(state.upper_bound));
|
|
options.iterate_upper_bound = &state.upper_bound_slice;
|
|
}
|
|
|
|
state.it.reset(db_->NewIterator(options));
|
|
|
|
if (FLAGS_cache_only_first) {
|
|
options.read_tier = rocksdb::ReadTier::kBlockCacheTier;
|
|
state.it_cacheonly.reset(db_->NewIterator(options));
|
|
}
|
|
}
|
|
|
|
const uint64_t upto = state.last_written.load();
|
|
for (rocksdb::Iterator* it : {state.it_cacheonly.get(), state.it.get()}) {
|
|
if (it == nullptr) {
|
|
continue;
|
|
}
|
|
if (state.last_read.load() >= upto) {
|
|
break;
|
|
}
|
|
bool need_seek = true;
|
|
for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) {
|
|
if (need_seek) {
|
|
Key from(shard, state.last_read.load() + 1);
|
|
it->Seek(rocksdb::Slice((const char*)&from, sizeof(from)));
|
|
need_seek = false;
|
|
} else {
|
|
it->Next();
|
|
}
|
|
if (it->status().IsIncomplete()) {
|
|
++::stats.cache_misses;
|
|
break;
|
|
}
|
|
assert(it->Valid());
|
|
assert(it->key().size() == sizeof(Key));
|
|
Key key;
|
|
memcpy(&key, it->key().data(), it->key().size());
|
|
// fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n",
|
|
// shard, seq, key.shard(), key.seqno());
|
|
assert(key.shard() == shard);
|
|
assert(key.seqno() == seq);
|
|
state.last_read.store(seq);
|
|
++::stats.read;
|
|
}
|
|
}
|
|
}
|
|
|
|
void onWrite(uint64_t shard) {
|
|
{
|
|
std::lock_guard<std::mutex> guard(queue_mutex_);
|
|
if (!shards_pending_set_.test(shard)) {
|
|
shards_pending_queue_.push(shard);
|
|
shards_pending_set_.set(shard);
|
|
sem_post(&sem_);
|
|
}
|
|
}
|
|
}
|
|
|
|
~Reader() {
|
|
done_.store(true);
|
|
sem_post(&sem_);
|
|
thread_.join();
|
|
}
|
|
|
|
private:
|
|
char pad1[128] __attribute__((__unused__));
|
|
std::vector<ShardState>* shard_states_;
|
|
rocksdb::DB* db_;
|
|
std::thread thread_;
|
|
sem_t sem_;
|
|
std::mutex queue_mutex_;
|
|
std::bitset<MAX_SHARDS + 1> shards_pending_set_;
|
|
std::queue<uint64_t> shards_pending_queue_;
|
|
std::atomic<bool> done_{false};
|
|
char pad2[128] __attribute__((__unused__));
|
|
};
|
|
|
|
struct Writer {
|
|
explicit Writer(std::vector<ShardState>* shard_states, rocksdb::DB* db)
|
|
: shard_states_(shard_states), db_(db) {}
|
|
|
|
void start() { thread_ = std::thread(&Writer::run, this); }
|
|
|
|
void run() {
|
|
std::queue<std::chrono::steady_clock::time_point> workq;
|
|
std::chrono::steady_clock::time_point deadline(
|
|
std::chrono::steady_clock::now() +
|
|
std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime)));
|
|
std::vector<uint64_t> my_shards;
|
|
for (int i = 1; i <= FLAGS_shards; ++i) {
|
|
if ((*shard_states_)[i].writer == this) {
|
|
my_shards.push_back(i);
|
|
}
|
|
}
|
|
|
|
std::mt19937 rng{std::random_device()()};
|
|
std::uniform_int_distribution<int> shard_dist(
|
|
0, static_cast<int>(my_shards.size()) - 1);
|
|
std::string value(FLAGS_value_size, '*');
|
|
|
|
while (1) {
|
|
auto now = std::chrono::steady_clock::now();
|
|
if (FLAGS_runtime >= 0 && now >= deadline) {
|
|
break;
|
|
}
|
|
if (workq.empty()) {
|
|
for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) {
|
|
std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate);
|
|
workq.push(now + offset);
|
|
}
|
|
}
|
|
while (!workq.empty() && workq.front() < now) {
|
|
workq.pop();
|
|
uint64_t shard = my_shards[shard_dist(rng)];
|
|
ShardState& state = (*shard_states_)[shard];
|
|
uint64_t seqno = state.last_written.load() + 1;
|
|
Key key(shard, seqno);
|
|
// fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno);
|
|
rocksdb::Status status =
|
|
db_->Put(rocksdb::WriteOptions(),
|
|
rocksdb::Slice((const char*)&key, sizeof(key)),
|
|
rocksdb::Slice(value));
|
|
assert(status.ok());
|
|
state.last_written.store(seqno);
|
|
state.reader->onWrite(shard);
|
|
++::stats.written;
|
|
}
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
|
}
|
|
// fprintf(stderr, "Writer done\n");
|
|
}
|
|
|
|
~Writer() { thread_.join(); }
|
|
|
|
private:
|
|
char pad1[128] __attribute__((__unused__));
|
|
std::vector<ShardState>* shard_states_;
|
|
rocksdb::DB* db_;
|
|
std::thread thread_;
|
|
char pad2[128] __attribute__((__unused__));
|
|
};
|
|
|
|
struct StatsThread {
|
|
explicit StatsThread(rocksdb::DB* db)
|
|
: db_(db), thread_(&StatsThread::run, this) {}
|
|
|
|
void run() {
|
|
// using namespace std::chrono;
|
|
auto tstart = std::chrono::steady_clock::now(), tlast = tstart;
|
|
uint64_t wlast = 0, rlast = 0;
|
|
while (!done_.load()) {
|
|
{
|
|
std::unique_lock<std::mutex> lock(cvm_);
|
|
cv_.wait_for(lock, std::chrono::seconds(1));
|
|
}
|
|
auto now = std::chrono::steady_clock::now();
|
|
double elapsed =
|
|
std::chrono::duration_cast<std::chrono::duration<double> >(
|
|
now - tlast).count();
|
|
uint64_t w = ::stats.written.load();
|
|
uint64_t r = ::stats.read.load();
|
|
fprintf(stderr,
|
|
"%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | "
|
|
"r/s %10.0f | cache misses %10ld\n",
|
|
db_->GetEnv()->TimeToString(time(nullptr)).c_str(),
|
|
std::chrono::duration_cast<std::chrono::seconds>(now - tstart)
|
|
.count(),
|
|
w, (w - wlast) / elapsed, r, (r - rlast) / elapsed,
|
|
::stats.cache_misses.load());
|
|
wlast = w;
|
|
rlast = r;
|
|
tlast = now;
|
|
}
|
|
}
|
|
|
|
~StatsThread() {
|
|
{
|
|
std::lock_guard<std::mutex> guard(cvm_);
|
|
done_.store(true);
|
|
}
|
|
cv_.notify_all();
|
|
thread_.join();
|
|
}
|
|
|
|
private:
|
|
rocksdb::DB* db_;
|
|
std::mutex cvm_;
|
|
std::condition_variable cv_;
|
|
std::thread thread_;
|
|
std::atomic<bool> done_{false};
|
|
};
|
|
|
|
int main(int argc, char** argv) {
|
|
GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
|
|
|
|
std::mt19937 rng{std::random_device()()};
|
|
rocksdb::Status status;
|
|
std::string path = rocksdb::test::TmpDir() + "/forward_iterator_test";
|
|
fprintf(stderr, "db path is %s\n", path.c_str());
|
|
rocksdb::Options options;
|
|
options.create_if_missing = true;
|
|
options.compression = rocksdb::CompressionType::kNoCompression;
|
|
options.compaction_style = rocksdb::CompactionStyle::kCompactionStyleNone;
|
|
options.level0_slowdown_writes_trigger = 99999;
|
|
options.level0_stop_writes_trigger = 99999;
|
|
options.use_direct_writes = true;
|
|
options.write_buffer_size = FLAGS_memtable_size;
|
|
rocksdb::BlockBasedTableOptions table_options;
|
|
table_options.block_cache = rocksdb::NewLRUCache(FLAGS_block_cache_size);
|
|
table_options.block_size = FLAGS_block_size;
|
|
options.table_factory.reset(
|
|
rocksdb::NewBlockBasedTableFactory(table_options));
|
|
|
|
status = rocksdb::DestroyDB(path, options);
|
|
assert(status.ok());
|
|
rocksdb::DB* db_raw;
|
|
status = rocksdb::DB::Open(options, path, &db_raw);
|
|
assert(status.ok());
|
|
std::unique_ptr<rocksdb::DB> db(db_raw);
|
|
|
|
std::vector<ShardState> shard_states(FLAGS_shards + 1);
|
|
std::deque<Reader> readers;
|
|
while (static_cast<int>(readers.size()) < FLAGS_readers) {
|
|
readers.emplace_back(&shard_states, db_raw);
|
|
}
|
|
std::deque<Writer> writers;
|
|
while (static_cast<int>(writers.size()) < FLAGS_writers) {
|
|
writers.emplace_back(&shard_states, db_raw);
|
|
}
|
|
|
|
// Each shard gets a random reader and random writer assigned to it
|
|
for (int i = 1; i <= FLAGS_shards; ++i) {
|
|
std::uniform_int_distribution<int> reader_dist(0, FLAGS_readers - 1);
|
|
std::uniform_int_distribution<int> writer_dist(0, FLAGS_writers - 1);
|
|
shard_states[i].reader = &readers[reader_dist(rng)];
|
|
shard_states[i].writer = &writers[writer_dist(rng)];
|
|
}
|
|
|
|
StatsThread stats_thread(db_raw);
|
|
for (Writer& w : writers) {
|
|
w.start();
|
|
}
|
|
|
|
writers.clear();
|
|
readers.clear();
|
|
}
|
|
#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE)
|