From ea89c77f27a0339821f66eaa1b8afcf345b9175d Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Thu, 27 Jan 2022 14:53:39 -0800 Subject: [PATCH] Fix major bug with MultiGet, DeleteRange, and memtable Bloom (#9453) Summary: MemTable::MultiGet was not considering range tombstones before querying Bloom filter. This means range tombstones would be skipped for keys (or prefixes) with no other entries in the memtable. This could cause old values for a key (in SST files) to still show up until the range tombstone covering it has been flushed. This is fixed by essentially disabling the memtable Bloom filter when there are any range tombstones. (This could be better optimized in the future, but good enough for now.) Did some other cleanup/optimization in the same code to (more than) offset the cost of checking on range tombstones in more cases. There is now notable improvement when memtable_whole_key_filtering and prefix_extractor are used together (unusual), and this makes MultiGet closer to the Get implementation. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9453 Test Plan: new unit test added. Added memtable Bloom to crash test. Performance testing -------------------- Build WAL-only DB (recovers to memtable): ``` TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -benchmarks=fillrandom -num=1000000 -write_buffer_size=250000000 ``` Query test command, to maximize sensitivity to the changed code: ``` TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -use_existing_db -readonly -benchmarks=multireadrandom -num=10000000 -write_buffer_size=250000000 -memtable_bloom_size_ratio=0.015 -multiread_batched -batch_size=24 -threads=8 -memtable_whole_key_filtering=$MWKF -prefix_size=$PXS ``` (Note -num here is 10x larger for mostly memtable misses) Before & after run simultaneously, average over 10 iterations per data point, ops/sec. MWKF=0 PXS=0 (Bloom disabled) Before: 5724844 After: 6722066 MWKF=0 PXS=7 (prefixes hardly unique; Bloom not useful) Before: 9981319 After: 10237990 MWKF=0 PXS=8 (prefixes unique; Bloom useful) Before: 12081715 After: 12117603 MWKF=1 PXS=0 (whole key Bloom useful) Before: 11944354 After: 12096085 MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes not useful in old version) Before: 9444299 After: 11826029 MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes useful in old version) Before: 11784465 After: 11778591 Only in this last case is the 'before' *slightly* faster, perhaps because hashing prefixes is slightly faster than hashing whole keys. Otherwise, 'after' is faster. Reviewed By: ajkr Differential Revision: D33805025 Pulled By: pdillinger fbshipit-source-id: 597523cae4f4eafdf6ae6bb2bc6cb46f83b017bf --- HISTORY.md | 3 ++ db/db_bloom_filter_test.cc | 57 ++++++++++++++++++++++++++ db/memtable.cc | 66 +++++++++++++++++------------- db/memtable.h | 4 ++ include/rocksdb/advanced_options.h | 12 ++++-- table/multiget_context.h | 6 +-- tools/db_crashtest.py | 7 +++- util/dynamic_bloom.h | 6 +-- 8 files changed, 122 insertions(+), 39 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9e8ebf3d4..5c9da97e1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,8 @@ # Rocksdb Change Log ## Unreleased +### Bug Fixes +* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.) + ### Public API changes * Remove HDFS support from main repo. * Remove librados support from main repo. diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 92f4db438..5b444d841 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -1502,6 +1502,63 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) { ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); } +TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) { + Options options = CurrentOptions(); + options.memtable_prefix_bloom_size_ratio = 0.015; + options.memtable_whole_key_filtering = true; + Reopen(options); + std::string key1("AA"); + std::string key2("BB"); + std::string key3("CC"); + std::string key4("DD"); + std::string key_not("EE"); + std::string value1("Value1"); + std::string value2("Value2"); + std::string value3("Value3"); + std::string value4("Value4"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key2, value2, WriteOptions())); + ASSERT_OK(Flush()); + ASSERT_OK(Put(key3, value3, WriteOptions())); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put(key4, value4, WriteOptions())); + + // Delete key2 and key3 + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ")); + + // Read without snapshot + auto results = MultiGet({key_not, key1, key2, key3, key4}); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], "NOT_FOUND"); + ASSERT_EQ(results[3], "NOT_FOUND"); + ASSERT_EQ(results[4], value4); + + // Also check Get + ASSERT_EQ(Get(key1), value1); + ASSERT_EQ(Get(key2), "NOT_FOUND"); + ASSERT_EQ(Get(key3), "NOT_FOUND"); + ASSERT_EQ(Get(key4), value4); + + // Read with snapshot + results = MultiGet({key_not, key1, key2, key3, key4}, snapshot); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], value2); + ASSERT_EQ(results[3], value3); + ASSERT_EQ(results[4], "NOT_FOUND"); + + // Also check Get + ASSERT_EQ(Get(key1, snapshot), value1); + ASSERT_EQ(Get(key2, snapshot), value2); + ASSERT_EQ(Get(key3, snapshot), value3); + ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND"); + + db_->ReleaseSnapshot(snapshot); +} + TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { constexpr size_t kPrefixSize = 8; const std::string kKey = "key"; diff --git a/db/memtable.cc b/db/memtable.cc index f2803dfc1..3ce44ea1d 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -33,6 +33,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/types.h" #include "rocksdb/write_buffer_manager.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" @@ -447,11 +448,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( is_range_del_table_empty_.load(std::memory_order_relaxed)) { return nullptr; } + return NewRangeTombstoneIteratorInternal(read_options, read_seq); +} + +FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( + const ReadOptions& read_options, SequenceNumber read_seq) { auto* unfragmented_iter = new MemTableIterator( *this, read_options, nullptr /* arena */, true /* use_range_del_table */); - if (unfragmented_iter == nullptr) { - return nullptr; - } auto fragmented_tombstone_list = std::make_shared( std::unique_ptr(unfragmented_iter), @@ -960,53 +963,58 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, } PERF_TIMER_GUARD(get_from_memtable_time); + // For now, memtable Bloom filter is effectively disabled if there are any + // range tombstones. This is the simplest way to ensure range tombstones are + // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0 + bool no_range_del = read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed); MultiGetRange temp_range(*range, range->begin(), range->end()); - if (bloom_filter_) { - std::array keys; - std::array may_match = {{true}}; - autovector prefixes; + if (bloom_filter_ && no_range_del) { + bool whole_key = + !prefix_extractor_ || moptions_.memtable_whole_key_filtering; + std::array bloom_keys; + std::array may_match; + std::array range_indexes; int num_keys = 0; for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - if (!prefix_extractor_) { - keys[num_keys++] = &iter->ukey_without_ts; + if (whole_key) { + bloom_keys[num_keys] = iter->ukey_without_ts; + range_indexes[num_keys++] = iter.index(); } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) { - prefixes.emplace_back( - prefix_extractor_->Transform(iter->ukey_without_ts)); - keys[num_keys++] = &prefixes.back(); + bloom_keys[num_keys] = + prefix_extractor_->Transform(iter->ukey_without_ts); + range_indexes[num_keys++] = iter.index(); + } else { + // TODO: consider not counting these as Bloom hits to more closely + // match bloom_sst_hit_count + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); } } - bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]); - int idx = 0; - for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - if (prefix_extractor_ && - !prefix_extractor_->InDomain(iter->ukey_without_ts)) { - PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); - continue; - } - if (!may_match[idx]) { - temp_range.SkipKey(iter); + bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]); + for (int i = 0; i < num_keys; ++i) { + if (!may_match[i]) { + temp_range.SkipIndex(range_indexes[i]); PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); } else { PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); } - idx++; } } for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - SequenceNumber seq = kMaxSequenceNumber; bool found_final_value{false}; bool merge_in_progress = iter->s->IsMergeInProgress(); - std::unique_ptr range_del_iter( - NewRangeTombstoneIterator( - read_options, GetInternalKeySeqno(iter->lkey->internal_key()))); - if (range_del_iter != nullptr) { + if (!no_range_del) { + std::unique_ptr range_del_iter( + NewRangeTombstoneIteratorInternal( + read_options, GetInternalKeySeqno(iter->lkey->internal_key()))); iter->max_covering_tombstone_seq = std::max( iter->max_covering_tombstone_seq, range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key())); } + SequenceNumber dummy_seq; GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, callback, &iter->is_blob_index, iter->value->GetSelf(), - iter->timestamp, iter->s, &(iter->merge_context), &seq, + iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq, &found_final_value, &merge_in_progress); if (!found_final_value && merge_in_progress) { diff --git a/db/memtable.h b/db/memtable.h index 695cf2d94..52cfcc9d2 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -600,6 +600,10 @@ class MemTable { std::string* value, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* seq, bool* found_final_value, bool* merge_in_progress); + + // Always returns non-null and assumes certain pre-checks are done + FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal( + const ReadOptions& read_options, SequenceNumber read_seq); }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index d7e3205b8..a12bebded 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -366,10 +366,16 @@ struct AdvancedColumnFamilyOptions { Slice delta_value, std::string* merged_value) = nullptr; - // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, - // create prefix bloom for memtable with the size of + // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic + // Bloom filter in memtable to optimize many queries that must go beyond + // the memtable. The size in bytes of the filter is // write_buffer_size * memtable_prefix_bloom_size_ratio. - // If it is larger than 0.25, it is sanitized to 0.25. + // * If prefix_extractor is set, the filter includes prefixes. + // * If memtable_whole_key_filtering, the filter includes whole keys. + // * If both, the filter includes both. + // * If neither, the feature is disabled. + // + // If this value is larger than 0.25, it is sanitized to 0.25. // // Default: 0 (disable) // diff --git a/table/multiget_context.h b/table/multiget_context.h index 3d1ce72bc..6ab337879 100644 --- a/table/multiget_context.h +++ b/table/multiget_context.h @@ -235,9 +235,9 @@ class MultiGetContext { bool empty() const { return RemainingMask() == 0; } - void SkipKey(const Iterator& iter) { - skip_mask_ |= uint64_t{1} << iter.index_; - } + void SkipIndex(size_t index) { skip_mask_ |= uint64_t{1} << index; } + + void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); } bool IsKeySkipped(const Iterator& iter) const { return skip_mask_ & (uint64_t{1} << iter.index_); diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 617745979..5a2becca3 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -87,6 +87,7 @@ default_params = { "partition_filters": lambda: random.randint(0, 1), "partition_pinning": lambda: random.randint(0, 3), "pause_background_one_in": 1000000, + "prefix_size" : -1, "prefixpercent": 5, "progress_reports": 0, "readpercent": 45, @@ -155,6 +156,8 @@ default_params = { "user_timestamp_size": 0, "secondary_cache_fault_one_in" : lambda: random.choice([0, 0, 32]), "prepopulate_block_cache" : lambda: random.choice([0, 1]), + "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]), + "memtable_whole_key_filtering": lambda: random.randint(0, 1), } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR' @@ -242,7 +245,6 @@ simple_default_params = { "memtablerep": "skip_list", "prefixpercent": 0, "readpercent": 50, - "prefix_size" : -1, "target_file_size_base": 16777216, "target_file_size_multiplier": 1, "test_batches_snapshots": 0, @@ -401,6 +403,9 @@ def finalize_and_sanitize(src_params): dest_params["test_batches_snapshots"] = 0 if dest_params.get("test_batches_snapshots") == 0: dest_params["batch_protection_bytes_per_key"] = 0 + if (dest_params.get("prefix_size") == -1 and + dest_params.get("memtable_whole_key_filtering") == 0): + dest_params["memtable_prefix_bloom_size_ratio"] = 0 return dest_params def gen_cmd_params(args): diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index ba0836c4c..542183280 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -65,7 +65,7 @@ class DynamicBloom { // Multithreaded access to this function is OK bool MayContain(const Slice& key) const; - void MayContain(int num_keys, Slice** keys, bool* may_match) const; + void MayContain(int num_keys, Slice* keys, bool* may_match) const; // Multithreaded access to this function is OK bool MayContainHash(uint32_t hash) const; @@ -120,12 +120,12 @@ inline bool DynamicBloom::MayContain(const Slice& key) const { return (MayContainHash(BloomHash(key))); } -inline void DynamicBloom::MayContain(int num_keys, Slice** keys, +inline void DynamicBloom::MayContain(int num_keys, Slice* keys, bool* may_match) const { std::array hashes; std::array byte_offsets; for (int i = 0; i < num_keys; ++i) { - hashes[i] = BloomHash(*keys[i]); + hashes[i] = BloomHash(keys[i]); size_t a = FastRange32(kLen, hashes[i]); PREFETCH(data_ + a, 0, 3); byte_offsets[i] = a;