add whole key bloom filter support in memtables (#4985)

Summary:
MyRocks calls `GetForUpdate` on `INSERT` for its unique-key check, and in almost all cases `GetForUpdate` returns an empty result. For such cases, a whole key bloom filter in the memtable is helpful.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4985

Differential Revision: D14118257

Pulled By: miasantreble

fbshipit-source-id: d35cb7109c62fd5ad541a26968e3a3e16d3e85ea
This commit is contained in:
Zhongyi Xie 2019-02-19 12:12:25 -08:00 committed by Facebook Github Bot
parent c2affccc18
commit ed995c6a69
17 changed files with 130 additions and 26 deletions

View File

@ -11,6 +11,7 @@
* Add support for trace sampling.
* Enable properties block checksum verification for block-based tables.
* For all users of dictionary compression, we now generate a separate dictionary for compressing each bottom-level SST file. Previously we reused a single dictionary for a whole compaction to bottom level. The new approach achieves better compression ratios; however, it uses more memory and CPU for buffering/sampling data blocks and training dictionaries.
* Add whole key bloom filter support in memtable.
* Files written by `SstFileWriter` will now use dictionary compression if it is configured in the file writer's `CompressionOptions`.
### Public API Change

View File

@ -786,6 +786,56 @@ TEST_F(DBBloomFilterTest, PrefixExtractorBlockFilter) {
delete iter;
}
TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
// Exercises the memtable_whole_key_filtering option through the
// bloom_memtable_hit_count / bloom_memtable_miss_count perf counters.
// Each of the three phases below looks up key2, which is never written:
//   1. whole key filtering off, fixed 4-byte prefix extractor: key2 shares
//      the prefix "AAAA" with key1, so the lookup is counted as a hit.
//   2. whole key filtering on: the lookup is counted as a miss.
//   3. whole key filtering on, no prefix_extractor: again a miss.
// The perf counters are never reset inside this test, so the expected values
// in later phases are cumulative.
// NOTE(review): the first phase assumes both counters are zero on entry --
// TODO confirm nothing earlier on this thread has bumped them.
const int kMemtableSize = 1 << 20; // 1MB
const int kMemtablePrefixFilterSize = 1 << 13; // 8KB
const int kPrefixLen = 4;
Options options = CurrentOptions();
// express the desired filter size as a fraction of the write buffer size
options.memtable_prefix_bloom_size_ratio =
static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(kPrefixLen));
options.write_buffer_size = kMemtableSize;
// phase 1: prefix bloom filter only
options.memtable_whole_key_filtering = false;
Reopen(options);
// all four keys share the same kPrefixLen-byte prefix "AAAA"
std::string key1("AAAABBBB");
std::string key2("AAAACCCC"); // not in DB
std::string key3("AAAADDDD");
std::string key4("AAAAEEEE");
std::string value1("Value1");
std::string value3("Value3");
std::string value4("Value4");
ASSERT_OK(Put(key1, value1, WriteOptions()));
// check memtable bloom stats
ASSERT_EQ("NOT_FOUND", Get(key2));
ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
// same prefix, bloom filter false positive
ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
// phase 2: enable whole key bloom filter (prefix extractor still set)
options.memtable_whole_key_filtering = true;
Reopen(options);
// check memtable bloom stats
ASSERT_OK(Put(key3, value3, WriteOptions()));
ASSERT_EQ("NOT_FOUND", Get(key2));
// whole key bloom filter kicks in and determines it's a miss
ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
// hit count is unchanged from phase 1
ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
// phase 3: verify whole key filtering does not depend on prefix_extractor
options.prefix_extractor.reset();
Reopen(options);
// check memtable bloom stats
ASSERT_OK(Put(key4, value4, WriteOptions()));
ASSERT_EQ("NOT_FOUND", Get(key2));
// whole key bloom filter kicks in and determines it's a miss
// (second miss overall; hit count still unchanged)
ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count);
ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
}
#ifndef ROCKSDB_LITE
class BloomStatsTestWithParam
: public DBBloomFilterTest,

View File

@ -2333,9 +2333,7 @@ TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) {
std::atomic<int> num_compression_dicts(0);
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
[&](void* /* arg */) {
++num_compression_dicts;
});
[&](void* /* arg */) { ++num_compression_dicts; });
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
Random rnd(301);

View File

@ -51,6 +51,8 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
mutable_cf_options.memtable_prefix_bloom_size_ratio) *
8u),
memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
memtable_whole_key_filtering(
mutable_cf_options.memtable_whole_key_filtering),
inplace_update_support(ioptions.inplace_update_support),
inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
inplace_callback(ioptions.inplace_callback),
@ -109,8 +111,10 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
// something went wrong if we need to flush before inserting anything
assert(!ShouldScheduleFlush());
if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) {
prefix_bloom_.reset(
// use bloom_filter_ for both whole key and prefix bloom filter
if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
moptions_.memtable_prefix_bloom_bits > 0) {
bloom_filter_.reset(
new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
ioptions.bloom_locality, 6 /* hard coded 6 probes */,
moptions_.memtable_huge_page_size, ioptions.info_log));
@ -282,7 +286,7 @@ class MemTableIterator : public InternalIterator {
if (use_range_del_table) {
iter_ = mem.range_del_table_->GetIterator(arena);
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
bloom_ = mem.prefix_bloom_.get();
bloom_ = mem.bloom_filter_.get();
iter_ = mem.table_->GetDynamicPrefixIterator(arena);
} else {
iter_ = mem.table_->GetIterator(arena);
@ -313,7 +317,8 @@ class MemTableIterator : public InternalIterator {
void Seek(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
if (bloom_ != nullptr) {
if (bloom_) {
// iterator should only use prefix bloom filter
if (!bloom_->MayContain(
prefix_extractor_->Transform(ExtractUserKey(k)))) {
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
@ -329,7 +334,7 @@ class MemTableIterator : public InternalIterator {
void SeekForPrev(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
if (bloom_ != nullptr) {
if (bloom_) {
if (!bloom_->MayContain(
prefix_extractor_->Transform(ExtractUserKey(k)))) {
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
@ -515,9 +520,11 @@ bool MemTable::Add(SequenceNumber s, ValueType type,
std::memory_order_relaxed);
}
if (prefix_bloom_) {
assert(prefix_extractor_);
prefix_bloom_->Add(prefix_extractor_->Transform(key));
if (bloom_filter_ && prefix_extractor_) {
bloom_filter_->Add(prefix_extractor_->Transform(key));
}
if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
bloom_filter_->Add(key);
}
// The first sequence number inserted into the memtable
@ -546,9 +553,11 @@ bool MemTable::Add(SequenceNumber s, ValueType type,
post_process_info->num_deletes++;
}
if (prefix_bloom_) {
assert(prefix_extractor_);
prefix_bloom_->AddConcurrently(prefix_extractor_->Transform(key));
if (bloom_filter_ && prefix_extractor_) {
bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key));
}
if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
bloom_filter_->AddConcurrently(key);
}
// atomically update first_seqno_ and earliest_seqno_.
@ -755,16 +764,24 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
Slice user_key = key.user_key();
bool found_final_value = false;
bool merge_in_progress = s->IsMergeInProgress();
bool const may_contain =
nullptr == prefix_bloom_
? false
: prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key));
if (prefix_bloom_ && !may_contain) {
bool may_contain = true;
if (bloom_filter_) {
// when both memtable_whole_key_filtering and prefix_extractor_ are set,
// only do whole key filtering for Get() to save CPU
if (moptions_.memtable_whole_key_filtering) {
may_contain = bloom_filter_->MayContain(user_key);
} else {
assert(prefix_extractor_);
may_contain =
bloom_filter_->MayContain(prefix_extractor_->Transform(user_key));
}
}
if (bloom_filter_ && !may_contain) {
// iter is null if prefix bloom says the key does not exist
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
*seq = kMaxSequenceNumber;
} else {
if (prefix_bloom_) {
if (bloom_filter_) {
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
}
Saver saver;

View File

@ -41,6 +41,7 @@ struct ImmutableMemTableOptions {
size_t arena_block_size;
uint32_t memtable_prefix_bloom_bits;
size_t memtable_huge_page_size;
bool memtable_whole_key_filtering;
bool inplace_update_support;
size_t inplace_update_num_locks;
UpdateStatus (*inplace_callback)(char* existing_value,
@ -274,7 +275,7 @@ class MemTable {
// memtable prefix bloom is disabled, since we can't easily allocate more
// space.
void UpdateWriteBufferSize(size_t new_write_buffer_size) {
if (prefix_bloom_ == nullptr ||
if (bloom_filter_ == nullptr ||
new_write_buffer_size < write_buffer_size_) {
write_buffer_size_.store(new_write_buffer_size,
std::memory_order_relaxed);
@ -454,7 +455,7 @@ class MemTable {
std::vector<port::RWMutex> locks_;
const SliceTransform* const prefix_extractor_;
std::unique_ptr<DynamicBloom> prefix_bloom_;
std::unique_ptr<DynamicBloom> bloom_filter_;
std::atomic<FlushStateEnum> flush_state_;

View File

@ -272,6 +272,15 @@ struct AdvancedColumnFamilyOptions {
// Dynamically changeable through SetOptions() API
double memtable_prefix_bloom_size_ratio = 0.0;
// Enable whole key bloom filter in memtable. Note this will only take effect
// if memtable_prefix_bloom_size_ratio is not 0, because the whole key filter
// shares the memtable bloom filter that is sized by that ratio. Enabling whole
// key filtering can potentially reduce CPU usage for point lookups.
//
// Default: false (disable)
//
// Dynamically changeable through SetOptions() API
bool memtable_whole_key_filtering = false;
// Page size for huge page for the arena used by the memtable. If <=0, it
// won't allocate from huge page but from malloc.
// Users are responsible to reserve huge pages for it to be allocated. For

View File

@ -135,6 +135,8 @@ void MutableCFOptions::Dump(Logger* log) const {
arena_block_size);
ROCKS_LOG_INFO(log, " memtable_prefix_bloom_ratio: %f",
memtable_prefix_bloom_size_ratio);
ROCKS_LOG_INFO(log, " memtable_whole_key_filtering: %d",
memtable_whole_key_filtering);
ROCKS_LOG_INFO(log,
" memtable_huge_page_size: %" ROCKSDB_PRIszt,
memtable_huge_page_size);

View File

@ -131,6 +131,7 @@ struct MutableCFOptions {
arena_block_size(options.arena_block_size),
memtable_prefix_bloom_size_ratio(
options.memtable_prefix_bloom_size_ratio),
memtable_whole_key_filtering(options.memtable_whole_key_filtering),
memtable_huge_page_size(options.memtable_huge_page_size),
max_successive_merges(options.max_successive_merges),
inplace_update_num_locks(options.inplace_update_num_locks),
@ -167,6 +168,7 @@ struct MutableCFOptions {
max_write_buffer_number(0),
arena_block_size(0),
memtable_prefix_bloom_size_ratio(0),
memtable_whole_key_filtering(false),
memtable_huge_page_size(0),
max_successive_merges(0),
inplace_update_num_locks(0),
@ -213,6 +215,7 @@ struct MutableCFOptions {
int max_write_buffer_number;
size_t arena_block_size;
double memtable_prefix_bloom_size_ratio;
bool memtable_whole_key_filtering;
size_t memtable_huge_page_size;
size_t max_successive_merges;
size_t inplace_update_num_locks;

View File

@ -51,6 +51,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
inplace_callback(options.inplace_callback),
memtable_prefix_bloom_size_ratio(
options.memtable_prefix_bloom_size_ratio),
memtable_whole_key_filtering(options.memtable_whole_key_filtering),
memtable_huge_page_size(options.memtable_huge_page_size),
memtable_insert_with_hint_prefix_extractor(
options.memtable_insert_with_hint_prefix_extractor),
@ -325,6 +326,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
ROCKS_LOG_HEADER(
log, " Options.memtable_prefix_bloom_size_ratio: %f",
memtable_prefix_bloom_size_ratio);
ROCKS_LOG_HEADER(log,
" Options.memtable_whole_key_filtering: %d",
memtable_whole_key_filtering);
ROCKS_LOG_HEADER(log, " Options.memtable_huge_page_size: %" ROCKSDB_PRIszt,
memtable_huge_page_size);

View File

@ -142,6 +142,8 @@ ColumnFamilyOptions BuildColumnFamilyOptions(
cf_opts.arena_block_size = mutable_cf_options.arena_block_size;
cf_opts.memtable_prefix_bloom_size_ratio =
mutable_cf_options.memtable_prefix_bloom_size_ratio;
cf_opts.memtable_whole_key_filtering =
mutable_cf_options.memtable_whole_key_filtering;
cf_opts.memtable_huge_page_size = mutable_cf_options.memtable_huge_page_size;
cf_opts.max_successive_merges = mutable_cf_options.max_successive_merges;
cf_opts.inplace_update_num_locks =
@ -1801,6 +1803,10 @@ std::unordered_map<std::string, OptionTypeInfo>
{"memtable_prefix_bloom_probes",
{0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true,
0}},
{"memtable_whole_key_filtering",
{offset_of(&ColumnFamilyOptions::memtable_whole_key_filtering),
OptionType::kBoolean, OptionVerificationType::kNormal, true,
offsetof(struct MutableCFOptions, memtable_whole_key_filtering)}},
{"min_partial_merge_operands",
{0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true,
0}},

View File

@ -435,6 +435,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
"max_write_buffer_number_to_maintain=84;"
"merge_operator=aabcxehazrMergeOperator;"
"memtable_prefix_bloom_size_ratio=0.4642;"
"memtable_whole_key_filtering=true;"
"memtable_insert_with_hint_prefix_extractor=rocksdb.CappedPrefix.13;"
"paranoid_file_checks=true;"
"force_consistency_checks=true;"

View File

@ -90,6 +90,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
{"compaction_measure_io_stats", "false"},
{"inplace_update_num_locks", "25"},
{"memtable_prefix_bloom_size_ratio", "0.26"},
{"memtable_whole_key_filtering", "true"},
{"memtable_huge_page_size", "28"},
{"bloom_locality", "29"},
{"max_successive_merges", "30"},
@ -195,6 +196,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
ASSERT_EQ(new_cf_opt.inplace_update_support, true);
ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U);
ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_size_ratio, 0.26);
ASSERT_EQ(new_cf_opt.memtable_whole_key_filtering, true);
ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);

View File

@ -354,8 +354,7 @@ struct BlockBasedTableBuilder::Rep {
compression_dict(),
compression_ctx(_compression_type),
verify_dict(),
state((_compression_opts.max_dict_bytes > 0)
? State::kBuffered
state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered
: State::kUnbuffered),
use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
!table_opt.block_align),

View File

@ -514,6 +514,8 @@ DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
DEFINE_double(memtable_bloom_size_ratio, 0,
"Ratio of memtable size used for bloom filter. 0 means no bloom "
"filter.");
DEFINE_bool(memtable_whole_key_filtering, false,
"Try to use whole key bloom filter in memtables.");
DEFINE_bool(memtable_use_huge_page, false,
"Try to use huge page in memtables.");
@ -3247,6 +3249,7 @@ void VerifyDBFromDB(std::string& truth_db_name) {
}
options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
options.memtable_insert_with_hint_prefix_extractor.reset(
NewCappedPrefixTransform(

View File

@ -248,6 +248,7 @@ const std::string options_file_content = R"OPTIONS_FILE(
verify_checksums_in_compaction=true
merge_operator=nullptr
memtable_prefix_bloom_bits=0
memtable_whole_key_filtering=true
paranoid_file_checks=false
inplace_update_num_locks=10000
optimize_filters_for_hits=false

View File

@ -210,6 +210,10 @@ DEFINE_double(memtable_prefix_bloom_size_ratio,
"creates prefix blooms for memtables, each with size "
"`write_buffer_size * memtable_prefix_bloom_size_ratio`.");
DEFINE_bool(memtable_whole_key_filtering,
rocksdb::Options().memtable_whole_key_filtering,
"Enable whole key filtering in memtables.");
DEFINE_int32(open_files, rocksdb::Options().max_open_files,
"Maximum number of files to keep open at the same time "
"(use default if == 0)");
@ -2583,6 +2587,8 @@ class StressTest {
FLAGS_max_write_buffer_number_to_maintain;
options_.memtable_prefix_bloom_size_ratio =
FLAGS_memtable_prefix_bloom_size_ratio;
options_.memtable_whole_key_filtering =
FLAGS_memtable_whole_key_filtering;
options_.max_background_compactions = FLAGS_max_background_compactions;
options_.max_background_flushes = FLAGS_max_background_flushes;
options_.compaction_style =

View File

@ -306,6 +306,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) {
cf_opt->purge_redundant_kvs_while_flush = rnd->Uniform(2);
cf_opt->force_consistency_checks = rnd->Uniform(2);
cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2);
cf_opt->memtable_whole_key_filtering = rnd->Uniform(2);
// double options
cf_opt->hard_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13;