Fix major bug with MultiGet, DeleteRange, and memtable Bloom (#9453)

Summary:
MemTable::MultiGet was not considering range tombstones before
querying Bloom filter. This means range tombstones would be skipped for
keys (or prefixes) with no other entries in the memtable. This could cause
old values for a key (in SST files) to still show up until the range tombstone
covering it has been flushed.

This is fixed by essentially disabling the memtable Bloom filter when there
are any range tombstones. (This could be better optimized in the future, but
good enough for now.)

Did some other cleanup/optimization in the same code to (more than) offset
the cost of checking on range tombstones in more cases. There is now
notable improvement when memtable_whole_key_filtering and prefix_extractor
are used together (unusual), and this makes MultiGet closer to the Get
implementation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9453

Test Plan:
new unit test added. Added memtable Bloom to crash test.

Performance testing
--------------------

Build WAL-only DB (recovers to memtable):
```
TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -benchmarks=fillrandom -num=1000000 -write_buffer_size=250000000
```

Query test command, to maximize sensitivity to the changed code:
```
TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -use_existing_db -readonly -benchmarks=multireadrandom -num=10000000 -write_buffer_size=250000000 -memtable_bloom_size_ratio=0.015 -multiread_batched -batch_size=24 -threads=8 -memtable_whole_key_filtering=$MWKF -prefix_size=$PXS
```
(Note -num here is 10x larger for mostly memtable misses)

Before & after run simultaneously, average over 10 iterations per data point, ops/sec.

MWKF=0 PXS=0 (Bloom disabled)
Before: 5724844
After: 6722066

MWKF=0 PXS=7 (prefixes hardly unique; Bloom not useful)
Before: 9981319
After: 10237990

MWKF=0 PXS=8 (prefixes unique; Bloom useful)
Before:  12081715
After: 12117603

MWKF=1 PXS=0 (whole key Bloom useful)
Before: 11944354
After: 12096085

MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes not useful in old version)
Before: 9444299
After: 11826029

MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes useful in old version)
Before: 11784465
After: 11778591

Only in this last case is the 'before' *slightly* faster, perhaps because hashing prefixes is slightly faster than hashing whole keys. Otherwise, 'after' is faster.

Reviewed By: ajkr

Differential Revision: D33805025

Pulled By: pdillinger

fbshipit-source-id: 597523cae4f4eafdf6ae6bb2bc6cb46f83b017bf
This commit is contained in:
Peter Dillinger 2022-01-27 14:53:39 -08:00 committed by Facebook GitHub Bot
parent 1e0e883ca5
commit ea89c77f27
8 changed files with 122 additions and 39 deletions

View File

@ -1,5 +1,8 @@
# Rocksdb Change Log
## Unreleased
### Bug Fixes
* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.)
### Public API changes
* Remove HDFS support from main repo.
* Remove librados support from main repo.

View File

@ -1502,6 +1502,63 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
}
TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
Options options = CurrentOptions();
options.memtable_prefix_bloom_size_ratio = 0.015;
options.memtable_whole_key_filtering = true;
Reopen(options);
std::string key1("AA");
std::string key2("BB");
std::string key3("CC");
std::string key4("DD");
std::string key_not("EE");
std::string value1("Value1");
std::string value2("Value2");
std::string value3("Value3");
std::string value4("Value4");
ASSERT_OK(Put(key1, value1, WriteOptions()));
ASSERT_OK(Put(key2, value2, WriteOptions()));
ASSERT_OK(Flush());
ASSERT_OK(Put(key3, value3, WriteOptions()));
const Snapshot* snapshot = db_->GetSnapshot();
ASSERT_OK(Put(key4, value4, WriteOptions()));
// Delete key2 and key3
ASSERT_OK(
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));
// Read without snapshot
auto results = MultiGet({key_not, key1, key2, key3, key4});
ASSERT_EQ(results[0], "NOT_FOUND");
ASSERT_EQ(results[1], value1);
ASSERT_EQ(results[2], "NOT_FOUND");
ASSERT_EQ(results[3], "NOT_FOUND");
ASSERT_EQ(results[4], value4);
// Also check Get
ASSERT_EQ(Get(key1), value1);
ASSERT_EQ(Get(key2), "NOT_FOUND");
ASSERT_EQ(Get(key3), "NOT_FOUND");
ASSERT_EQ(Get(key4), value4);
// Read with snapshot
results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
ASSERT_EQ(results[0], "NOT_FOUND");
ASSERT_EQ(results[1], value1);
ASSERT_EQ(results[2], value2);
ASSERT_EQ(results[3], value3);
ASSERT_EQ(results[4], "NOT_FOUND");
// Also check Get
ASSERT_EQ(Get(key1, snapshot), value1);
ASSERT_EQ(Get(key2, snapshot), value2);
ASSERT_EQ(Get(key3, snapshot), value3);
ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");
db_->ReleaseSnapshot(snapshot);
}
TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
constexpr size_t kPrefixSize = 8;
const std::string kKey = "key";

View File

@ -33,6 +33,7 @@
#include "rocksdb/iterator.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/types.h"
#include "rocksdb/write_buffer_manager.h"
#include "table/internal_iterator.h"
#include "table/iterator_wrapper.h"
@ -447,11 +448,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
is_range_del_table_empty_.load(std::memory_order_relaxed)) {
return nullptr;
}
return NewRangeTombstoneIteratorInternal(read_options, read_seq);
}
FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
const ReadOptions& read_options, SequenceNumber read_seq) {
auto* unfragmented_iter = new MemTableIterator(
*this, read_options, nullptr /* arena */, true /* use_range_del_table */);
if (unfragmented_iter == nullptr) {
return nullptr;
}
auto fragmented_tombstone_list =
std::make_shared<FragmentedRangeTombstoneList>(
std::unique_ptr<InternalIterator>(unfragmented_iter),
@ -960,53 +963,58 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
}
PERF_TIMER_GUARD(get_from_memtable_time);
// For now, memtable Bloom filter is effectively disabled if there are any
// range tombstones. This is the simplest way to ensure range tombstones are
// handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
bool no_range_del = read_options.ignore_range_deletions ||
is_range_del_table_empty_.load(std::memory_order_relaxed);
MultiGetRange temp_range(*range, range->begin(), range->end());
if (bloom_filter_) {
std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
if (bloom_filter_ && no_range_del) {
bool whole_key =
!prefix_extractor_ || moptions_.memtable_whole_key_filtering;
std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
int num_keys = 0;
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
if (!prefix_extractor_) {
keys[num_keys++] = &iter->ukey_without_ts;
if (whole_key) {
bloom_keys[num_keys] = iter->ukey_without_ts;
range_indexes[num_keys++] = iter.index();
} else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
prefixes.emplace_back(
prefix_extractor_->Transform(iter->ukey_without_ts));
keys[num_keys++] = &prefixes.back();
}
}
bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]);
int idx = 0;
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
if (prefix_extractor_ &&
!prefix_extractor_->InDomain(iter->ukey_without_ts)) {
bloom_keys[num_keys] =
prefix_extractor_->Transform(iter->ukey_without_ts);
range_indexes[num_keys++] = iter.index();
} else {
// TODO: consider not counting these as Bloom hits to more closely
// match bloom_sst_hit_count
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
continue;
}
if (!may_match[idx]) {
temp_range.SkipKey(iter);
}
bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
for (int i = 0; i < num_keys; ++i) {
if (!may_match[i]) {
temp_range.SkipIndex(range_indexes[i]);
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
} else {
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
}
idx++;
}
}
for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
SequenceNumber seq = kMaxSequenceNumber;
bool found_final_value{false};
bool merge_in_progress = iter->s->IsMergeInProgress();
if (!no_range_del) {
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
NewRangeTombstoneIterator(
NewRangeTombstoneIteratorInternal(
read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
if (range_del_iter != nullptr) {
iter->max_covering_tombstone_seq = std::max(
iter->max_covering_tombstone_seq,
range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
}
SequenceNumber dummy_seq;
GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
callback, &iter->is_blob_index, iter->value->GetSelf(),
iter->timestamp, iter->s, &(iter->merge_context), &seq,
iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq,
&found_final_value, &merge_in_progress);
if (!found_final_value && merge_in_progress) {

View File

@ -600,6 +600,10 @@ class MemTable {
std::string* value, std::string* timestamp, Status* s,
MergeContext* merge_context, SequenceNumber* seq,
bool* found_final_value, bool* merge_in_progress);
// Always returns non-null and assumes certain pre-checks are done
FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
const ReadOptions& read_options, SequenceNumber read_seq);
};
extern const char* EncodeKey(std::string* scratch, const Slice& target);

View File

@ -366,10 +366,16 @@ struct AdvancedColumnFamilyOptions {
Slice delta_value,
std::string* merged_value) = nullptr;
// if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
// create prefix bloom for memtable with the size of
// Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
// Bloom filter in memtable to optimize many queries that must go beyond
// the memtable. The size in bytes of the filter is
// write_buffer_size * memtable_prefix_bloom_size_ratio.
// If it is larger than 0.25, it is sanitized to 0.25.
// * If prefix_extractor is set, the filter includes prefixes.
// * If memtable_whole_key_filtering, the filter includes whole keys.
// * If both, the filter includes both.
// * If neither, the feature is disabled.
//
// If this value is larger than 0.25, it is sanitized to 0.25.
//
// Default: 0 (disable)
//

View File

@ -235,9 +235,9 @@ class MultiGetContext {
bool empty() const { return RemainingMask() == 0; }
void SkipKey(const Iterator& iter) {
skip_mask_ |= uint64_t{1} << iter.index_;
}
void SkipIndex(size_t index) { skip_mask_ |= uint64_t{1} << index; }
void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); }
bool IsKeySkipped(const Iterator& iter) const {
return skip_mask_ & (uint64_t{1} << iter.index_);

View File

@ -87,6 +87,7 @@ default_params = {
"partition_filters": lambda: random.randint(0, 1),
"partition_pinning": lambda: random.randint(0, 3),
"pause_background_one_in": 1000000,
"prefix_size" : -1,
"prefixpercent": 5,
"progress_reports": 0,
"readpercent": 45,
@ -155,6 +156,8 @@ default_params = {
"user_timestamp_size": 0,
"secondary_cache_fault_one_in" : lambda: random.choice([0, 0, 32]),
"prepopulate_block_cache" : lambda: random.choice([0, 1]),
"memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]),
"memtable_whole_key_filtering": lambda: random.randint(0, 1),
}
_TEST_DIR_ENV_VAR = 'TEST_TMPDIR'
@ -242,7 +245,6 @@ simple_default_params = {
"memtablerep": "skip_list",
"prefixpercent": 0,
"readpercent": 50,
"prefix_size" : -1,
"target_file_size_base": 16777216,
"target_file_size_multiplier": 1,
"test_batches_snapshots": 0,
@ -401,6 +403,9 @@ def finalize_and_sanitize(src_params):
dest_params["test_batches_snapshots"] = 0
if dest_params.get("test_batches_snapshots") == 0:
dest_params["batch_protection_bytes_per_key"] = 0
if (dest_params.get("prefix_size") == -1 and
dest_params.get("memtable_whole_key_filtering") == 0):
dest_params["memtable_prefix_bloom_size_ratio"] = 0
return dest_params
def gen_cmd_params(args):

View File

@ -65,7 +65,7 @@ class DynamicBloom {
// Multithreaded access to this function is OK
bool MayContain(const Slice& key) const;
void MayContain(int num_keys, Slice** keys, bool* may_match) const;
void MayContain(int num_keys, Slice* keys, bool* may_match) const;
// Multithreaded access to this function is OK
bool MayContainHash(uint32_t hash) const;
@ -120,12 +120,12 @@ inline bool DynamicBloom::MayContain(const Slice& key) const {
return (MayContainHash(BloomHash(key)));
}
inline void DynamicBloom::MayContain(int num_keys, Slice** keys,
inline void DynamicBloom::MayContain(int num_keys, Slice* keys,
bool* may_match) const {
std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
for (int i = 0; i < num_keys; ++i) {
hashes[i] = BloomHash(*keys[i]);
hashes[i] = BloomHash(keys[i]);
size_t a = FastRange32(kLen, hashes[i]);
PREFETCH(data_ + a, 0, 3);
byte_offsets[i] = a;