More improvements to filter_bench (#5968)
Summary:
* Adds support for the plain table filter. This is not critical right now, but it does add an -impl flag that will be useful for new filter implementations, initially targeted at the block-based table (and maybe later ported to the plain table).
* Better mixing of inside vs. outside queries, for more realism.
* A -best_case option, handy for tuning an implementation's inner loop.
* An option for whether to include hashing time in the dry run / net timings.

No modifications to production code, just filter_bench.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5968

Differential Revision: D18139872

Pulled By: pdillinger

fbshipit-source-id: 5b09eba963111b48f9e0525a706e9921070990e8
Parent: b3dc2f3691
Commit: 3f891c40a0
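As a rough illustration of how the new flags might be combined (the binary path and the specific values below are assumptions for illustration, not taken from this change):

    # Tune an implementation's inner loop against the plain table Bloom filter
    # with locality (-impl=1), using the reduced best-case workload.
    ./filter_bench -use_plain_table_bloom -impl=1 -best_case

    # Benchmark the full filter through FilterBitsReader, counting hashing time
    # in the net ns/op figures rather than in the dry run.
    ./filter_bench -impl=0 -net_includes_hashing -m_queries=50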
@@ -16,11 +16,13 @@ int main() {
 #include <sstream>
 #include <vector>
 
+#include "memory/arena.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
 #include "rocksdb/filter_policy.h"
 #include "table/block_based/full_filter_block.h"
 #include "table/block_based/mock_block_based_table.h"
+#include "table/plain/plain_table_bloom.h"
 #include "util/gflags_compat.h"
 #include "util/hash.h"
 #include "util/random.h"
@@ -57,8 +59,24 @@ DEFINE_double(m_queries, 200, "Millions of queries for each test mode");
 DEFINE_bool(use_full_block_reader, false,
             "Use FullFilterBlockReader interface rather than FilterBitsReader");
 
+DEFINE_bool(use_plain_table_bloom, false,
+            "Use PlainTableBloom structure and interface rather than "
+            "FilterBitsReader/FullFilterBlockReader");
+
+DEFINE_uint32(impl, 0,
+              "Select filter implementation. Without -use_plain_table_bloom:"
+              "0 = full filter, 1 = block-based filter. With "
+              "-use_plain_table_bloom: 0 = no locality, 1 = locality.");
+
+DEFINE_bool(net_includes_hashing, false,
+            "Whether query net ns/op times should include hashing. "
+            "(if not, dry run will include hashing) "
+            "(build times always include hashing)");
+
 DEFINE_bool(quick, false, "Run more limited set of tests, fewer queries");
 
+DEFINE_bool(best_case, false, "Run limited tests only for best-case");
+
 DEFINE_bool(allow_bad_fp_rate, false, "Continue even if FP rate is bad");
 
 DEFINE_bool(legend, false,
@@ -73,14 +91,18 @@ void _always_assert_fail(int line, const char *file, const char *expr) {
 #define ALWAYS_ASSERT(cond) \
   ((cond) ? (void)0 : ::_always_assert_fail(__LINE__, __FILE__, #cond))
 
+using rocksdb::Arena;
 using rocksdb::BlockContents;
+using rocksdb::BloomHash;
 using rocksdb::CachableEntry;
 using rocksdb::EncodeFixed32;
 using rocksdb::fastrange32;
 using rocksdb::FilterBitsBuilder;
 using rocksdb::FilterBitsReader;
 using rocksdb::FullFilterBlockReader;
+using rocksdb::GetSliceHash;
 using rocksdb::ParsedFullFilterBlock;
+using rocksdb::PlainTableBloomV1;
 using rocksdb::Random32;
 using rocksdb::Slice;
 using rocksdb::mock::MockBlockBasedTableTester;
@@ -142,6 +164,7 @@ struct FilterInfo {
   uint32_t keys_added_ = 0;
   std::unique_ptr<FilterBitsReader> reader_;
   std::unique_ptr<FullFilterBlockReader> full_block_reader_;
+  std::unique_ptr<PlainTableBloomV1> plain_table_bloom_;
   uint64_t outside_queries_ = 0;
   uint64_t false_positives_ = 0;
 };
@@ -165,6 +188,10 @@ static const std::vector<TestMode> quickTestModes = {
     kRandomFilter,
 };
 
+static const std::vector<TestMode> bestCaseTestModes = {
+    kSingleFilter,
+};
+
 const char *TestModeToString(TestMode tm) {
   switch (tm) {
     case kSingleFilter:
@@ -183,11 +210,23 @@ const char *TestModeToString(TestMode tm) {
   return "Bad TestMode";
 }
 
+// Do just enough to keep some data dependence for the
+// compiler / CPU
+static inline uint32_t NoHash(Slice &s) {
+  uint32_t sz = static_cast<uint32_t>(s.size());
+  if (sz >= 4) {
+    return sz + s.data()[3];
+  } else {
+    return sz;
+  }
+}
+
 struct FilterBench : public MockBlockBasedTableTester {
   std::vector<KeyMaker> kms_;
   std::vector<FilterInfo> infos_;
   Random32 random_;
   std::ostringstream fp_rate_report_;
+  Arena arena_;
 
   FilterBench()
       : MockBlockBasedTableTester(
@@ -200,12 +239,27 @@ struct FilterBench : public MockBlockBasedTableTester {
 
   void Go();
 
-  double RandomQueryTest(bool inside, bool dry_run, TestMode mode);
+  double RandomQueryTest(uint32_t inside_threshold, bool dry_run,
+                         TestMode mode);
 };
 
 void FilterBench::Go() {
-  std::unique_ptr<FilterBitsBuilder> builder(
-      table_options_.filter_policy->GetFilterBitsBuilder());
+  if (FLAGS_use_plain_table_bloom && FLAGS_use_full_block_reader) {
+    throw std::runtime_error(
+        "Can't combine -use_plain_table_bloom and -use_full_block_reader");
+  }
+  if (FLAGS_impl > 1) {
+    throw std::runtime_error("-impl must currently be >= 0 and <= 1");
+  }
+  if (!FLAGS_use_plain_table_bloom && FLAGS_impl == 1) {
+    throw std::runtime_error(
+        "Block-based filter not currently supported by filter_bench");
+  }
+
+  std::unique_ptr<FilterBitsBuilder> builder;
+  if (!FLAGS_use_plain_table_bloom && FLAGS_impl != 1) {
+    builder.reset(table_options_.filter_policy->GetFilterBitsBuilder());
+  }
 
   uint32_t variance_mask = 1;
   while (variance_mask * variance_mask * 4 < FLAGS_average_keys_per_filter) {
@@ -213,9 +267,13 @@ void FilterBench::Go() {
   }
 
   const std::vector<TestMode> &testModes =
-      FLAGS_quick ? quickTestModes : allTestModes;
+      FLAGS_best_case ? bestCaseTestModes
+                      : FLAGS_quick ? quickTestModes : allTestModes;
   if (FLAGS_quick) {
     FLAGS_m_queries /= 7.0;
+  } else if (FLAGS_best_case) {
+    FLAGS_m_queries /= 3.0;
+    FLAGS_working_mem_size_mb /= 10.0;
   }
 
   std::cout << "Building..." << std::endl;
@@ -230,22 +288,35 @@ void FilterBench::Go() {
     uint32_t keys_to_add = FLAGS_average_keys_per_filter +
                            (random_.Next() & variance_mask) -
                            (variance_mask / 2);
-    for (uint32_t i = 0; i < keys_to_add; ++i) {
-      builder->AddKey(kms_[0].Get(filter_id, i));
-    }
     infos_.emplace_back();
     FilterInfo &info = infos_.back();
     info.filter_id_ = filter_id;
-    info.filter_ = builder->Finish(&info.owner_);
     info.keys_added_ = keys_to_add;
+    if (FLAGS_use_plain_table_bloom) {
+      info.plain_table_bloom_.reset(new PlainTableBloomV1());
+      info.plain_table_bloom_->SetTotalBits(
+          &arena_, keys_to_add * FLAGS_bits_per_key, FLAGS_impl,
+          0 /*huge_page*/, nullptr /*logger*/);
+      for (uint32_t i = 0; i < keys_to_add; ++i) {
+        uint32_t hash = GetSliceHash(kms_[0].Get(filter_id, i));
+        info.plain_table_bloom_->AddHash(hash);
+      }
+      info.filter_ = info.plain_table_bloom_->GetRawData();
+    } else {
+      for (uint32_t i = 0; i < keys_to_add; ++i) {
+        builder->AddKey(kms_[0].Get(filter_id, i));
+      }
+      info.filter_ = builder->Finish(&info.owner_);
      info.reader_.reset(
          table_options_.filter_policy->GetFilterBitsReader(info.filter_));
      CachableEntry<ParsedFullFilterBlock> block(
          new ParsedFullFilterBlock(table_options_.filter_policy.get(),
                                    BlockContents(info.filter_)),
-         nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+          nullptr /* cache */, nullptr /* cache_handle */,
+          true /* own_value */);
      info.full_block_reader_.reset(
          new FullFilterBlockReader(table_.get(), std::move(block)));
+    }
     total_memory_used += info.filter_.size();
     total_keys_added += keys_to_add;
   }
@@ -259,7 +330,7 @@ void FilterBench::Go() {
 
   double bpk = total_memory_used * 8.0 / total_keys_added;
   std::cout << "Bits/key actual: " << bpk << std::endl;
-  if (!FLAGS_quick) {
+  if (!FLAGS_quick && !FLAGS_best_case) {
     double tolerable_rate = std::pow(2.0, -(bpk - 1.0) / (1.4 + bpk / 50.0));
     std::cout << "Best possible FP rate %: " << 100.0 * std::pow(2.0, -bpk)
               << std::endl;
@@ -273,13 +344,25 @@ void FilterBench::Go() {
   for (uint32_t i = 0; i < infos_.size(); ++i) {
     FilterInfo &info = infos_[i];
     for (uint32_t j = 0; j < info.keys_added_; ++j) {
-      ALWAYS_ASSERT(info.reader_->MayMatch(kms_[0].Get(info.filter_id_, j)));
+      if (FLAGS_use_plain_table_bloom) {
+        uint32_t hash = GetSliceHash(kms_[0].Get(info.filter_id_, j));
+        ALWAYS_ASSERT(info.plain_table_bloom_->MayContainHash(hash));
+      } else {
+        ALWAYS_ASSERT(
+            info.reader_->MayMatch(kms_[0].Get(info.filter_id_, j)));
+      }
     }
     for (uint32_t j = 0; j < outside_q_per_f; ++j) {
+      if (FLAGS_use_plain_table_bloom) {
+        uint32_t hash =
+            GetSliceHash(kms_[0].Get(info.filter_id_, j | 0x80000000));
+        fps += info.plain_table_bloom_->MayContainHash(hash);
+      } else {
+        fps += info.reader_->MayMatch(
+            kms_[0].Get(info.filter_id_, j | 0x80000000));
+      }
     }
   }
   std::cout << " No FNs :)" << std::endl;
   double prelim_rate = double(fps) / outside_q_per_f / infos_.size();
   std::cout << " Prelim FP rate %: " << (100.0 * prelim_rate) << std::endl;
@@ -290,34 +373,55 @@ void FilterBench::Go() {
   }
 
   std::cout << "----------------------------" << std::endl;
-  std::cout << "Inside queries..." << std::endl;
+  std::cout << "Mixed inside/outside queries..." << std::endl;
+  // 50% each inside and outside
+  uint32_t inside_threshold = UINT32_MAX / 2;
   for (TestMode tm : testModes) {
     random_.Seed(FLAGS_seed + 1);
-    double f = RandomQueryTest(/*inside*/ true, /*dry_run*/ false, tm);
+    double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
     random_.Seed(FLAGS_seed + 1);
-    double d = RandomQueryTest(/*inside*/ true, /*dry_run*/ true, tm);
+    double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
     std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d)
               << std::endl;
   }
+  std::cout << fp_rate_report_.str();
 
   if (!FLAGS_quick) {
     std::cout << "----------------------------" << std::endl;
-    std::cout << "Outside queries..." << std::endl;
+    std::cout << "Inside queries (mostly)..." << std::endl;
+    // Do about 95% inside queries rather than 100% so that branch predictor
+    // can't give itself an artifically crazy advantage.
+    inside_threshold = UINT32_MAX / 20 * 19;
     for (TestMode tm : testModes) {
-      random_.Seed(FLAGS_seed + 2);
-      double f = RandomQueryTest(/*inside*/ false, /*dry_run*/ false, tm);
-      random_.Seed(FLAGS_seed + 2);
-      double d = RandomQueryTest(/*inside*/ false, /*dry_run*/ true, tm);
+      random_.Seed(FLAGS_seed + 1);
+      double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
+      random_.Seed(FLAGS_seed + 1);
+      double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
      std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d)
                << std::endl;
     }
+
+    std::cout << "----------------------------" << std::endl;
+    std::cout << "Outside queries (mostly)..." << std::endl;
+    // Do about 95% outside queries rather than 100% so that branch predictor
+    // can't give itself an artifically crazy advantage.
+    inside_threshold = UINT32_MAX / 20;
+    for (TestMode tm : testModes) {
+      random_.Seed(FLAGS_seed + 2);
+      double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
+      random_.Seed(FLAGS_seed + 2);
+      double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
+      std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d)
+                << std::endl;
+    }
   }
   std::cout << fp_rate_report_.str();
 
   std::cout << "----------------------------" << std::endl;
   std::cout << "Done. (For more info, run with -legend or -help.)" << std::endl;
 }
 
-double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
+double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
+                                    TestMode mode) {
   for (auto &info : infos_) {
     info.outside_queries_ = 0;
     info.false_positives_ = 0;
@@ -368,6 +472,8 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
   rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);
 
   for (uint64_t q = 0; q < max_queries; q += batch_size) {
+    bool inside_this_time = random_.Next() <= inside_threshold;
+
     uint32_t filter_index;
     if (random_.Next() <= primary_filter_threshold) {
       filter_index = random_.Uniformish(num_primary_filters);
@@ -378,7 +484,7 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
     }
     FilterInfo &info = infos_[filter_index];
     for (uint32_t i = 0; i < batch_size; ++i) {
-      if (inside) {
+      if (inside_this_time) {
         batch_slices[i] =
             kms_[i].Get(info.filter_id_, random_.Uniformish(info.keys_added_));
       } else {
@@ -389,14 +495,27 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
       }
     }
     // TODO: implement batched interface to full block reader
-    if (mode == kBatchPrepared && !dry_run && !FLAGS_use_full_block_reader) {
+    // TODO: implement batched interface to plain table bloom
+    if (mode == kBatchPrepared && !FLAGS_use_full_block_reader &&
+        !FLAGS_use_plain_table_bloom) {
       for (uint32_t i = 0; i < batch_size; ++i) {
         batch_results[i] = false;
       }
+      if (dry_run) {
+        for (uint32_t i = 0; i < batch_size; ++i) {
+          batch_results[i] = true;
+          if (FLAGS_net_includes_hashing) {
+            dry_run_hash += NoHash(batch_slices[i]);
+          } else {
+            dry_run_hash ^= BloomHash(batch_slices[i]);
+          }
+        }
+      } else {
        info.reader_->MayMatch(batch_size, batch_slice_ptrs.get(),
                               batch_results.get());
+      }
       for (uint32_t i = 0; i < batch_size; ++i) {
-        if (inside) {
+        if (inside_this_time) {
           ALWAYS_ASSERT(batch_results[i]);
         } else {
           info.false_positives_ += batch_results[i];
@@ -404,11 +523,28 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
       }
     } else {
       for (uint32_t i = 0; i < batch_size; ++i) {
-        if (dry_run) {
-          dry_run_hash ^= rocksdb::BloomHash(batch_slices[i]);
-        } else {
         bool may_match;
-        if (FLAGS_use_full_block_reader) {
+        if (FLAGS_use_plain_table_bloom) {
+          if (dry_run) {
+            if (FLAGS_net_includes_hashing) {
+              dry_run_hash += NoHash(batch_slices[i]);
+            } else {
+              dry_run_hash ^= GetSliceHash(batch_slices[i]);
+            }
+            may_match = true;
+          } else {
+            uint32_t hash = GetSliceHash(batch_slices[i]);
+            may_match = info.plain_table_bloom_->MayContainHash(hash);
+          }
+        } else if (FLAGS_use_full_block_reader) {
+          if (dry_run) {
+            if (FLAGS_net_includes_hashing) {
+              dry_run_hash += NoHash(batch_slices[i]);
+            } else {
+              dry_run_hash ^= BloomHash(batch_slices[i]);
+            }
+            may_match = true;
+          } else {
            may_match = info.full_block_reader_->KeyMayMatch(
                batch_slices[i],
                /*prefix_extractor=*/nullptr,
@@ -416,10 +552,20 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
                /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
                /*get_context=*/nullptr,
                /*lookup_context=*/nullptr);
          }
        } else {
+          if (dry_run) {
+            if (FLAGS_net_includes_hashing) {
+              dry_run_hash += NoHash(batch_slices[i]);
+            } else {
+              dry_run_hash ^= BloomHash(batch_slices[i]);
+            }
+            may_match = true;
+          } else {
            may_match = info.reader_->MayMatch(batch_slices[i]);
+          }
-        if (inside) {
+        }
+        if (inside_this_time) {
          ALWAYS_ASSERT(may_match);
        } else {
          info.false_positives_ += may_match;
@@ -427,7 +573,6 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
-          }
         }
       }
     }
 
   uint64_t elapsed_nanos = timer.ElapsedNanos();
   double ns = double(elapsed_nanos) / max_queries;
@@ -444,7 +589,8 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
     std::cout << "ns/op: " << ns << std::endl;
   }
 
-  if (!inside && !dry_run && mode == kRandomFilter) {
+  if (!dry_run) {
+    fp_rate_report_ = std::ostringstream();
     uint64_t q = 0;
     uint64_t fp = 0;
     double worst_fp_rate = 0.0;
@@ -459,7 +605,7 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
       }
     }
     fp_rate_report_ << " Average FP rate %: " << 100.0 * fp / q << std::endl;
-    if (!FLAGS_quick) {
+    if (!FLAGS_quick && !FLAGS_best_case) {
      fp_rate_report_ << " Worst FP rate %: " << 100.0 * worst_fp_rate
                      << std::endl;
      fp_rate_report_ << " Best FP rate %: " << 100.0 * best_fp_rate
@@ -467,8 +613,6 @@ double FilterBench::RandomQueryTest(bool inside, bool dry_run, TestMode mode) {
      fp_rate_report_ << " Best possible bits/key: "
                      << -std::log(double(fp) / q) / std::log(2.0) << std::endl;
     }
-  } else {
-    fp_rate_report_.clear();
   }
   return ns;
 }