Fix major bug with MultiGet, DeleteRange, and memtable Bloom (#9453)

Summary: MemTable::MultiGet was not considering range tombstones before querying Bloom filter. This means range tombstones would be skipped for keys (or prefixes) with no other entries in the memtable. This could cause old values for a key (in SST files) to still show up until the range tombstone covering it has been flushed. This is fixed by essentially disabling the memtable Bloom filter when there are any range tombstones. (This could be better optimized in the future, but good enough for now.) Did some other cleanup/optimization in the same code to (more than) offset the cost of checking on range tombstones in more cases. There is now notable improvement when memtable_whole_key_filtering and prefix_extractor are used together (unusual), and this makes MultiGet closer to the Get implementation. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9453 Test Plan: new unit test added. Added memtable Bloom to crash test. Performance testing -------------------- Build WAL-only DB (recovers to memtable): ``` TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -benchmarks=fillrandom -num=1000000 -write_buffer_size=250000000 ``` Query test command, to maximize sensitivity to the changed code: ``` TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -use_existing_db -readonly -benchmarks=multireadrandom -num=10000000 -write_buffer_size=250000000 -memtable_bloom_size_ratio=0.015 -multiread_batched -batch_size=24 -threads=8 -memtable_whole_key_filtering=$MWKF -prefix_size=$PXS ``` (Note -num here is 10x larger for mostly memtable misses) Before & after run simultaneously, average over 10 iterations per data point, ops/sec. MWKF=0 PXS=0 (Bloom disabled) Before: 5724844 After: 6722066 MWKF=0 PXS=7 (prefixes hardly unique; Bloom not useful) Before: 9981319 After: 10237990 MWKF=0 PXS=8 (prefixes unique; Bloom useful) Before: 12081715 After: 12117603 MWKF=1 PXS=0 (whole key Bloom useful) Before: 11944354 After: 12096085 MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes not useful in old version) Before: 9444299 After: 11826029 MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes useful in old version) Before: 11784465 After: 11778591 Only in this last case is the 'before' *slightly* faster, perhaps because hashing prefixes is slightly faster than hashing whole keys. Otherwise, 'after' is faster. Reviewed By: ajkr Differential Revision: D33805025 Pulled By: pdillinger fbshipit-source-id: 597523cae4f4eafdf6ae6bb2bc6cb46f83b017bf
2022-01-27 14:53:39 -08:00 · 2022-01-27 14:53:39 -08:00 · da478f3eae
commit da478f3eae
parent d36eda014f
8 changed files with 123 additions and 39 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -1,4 +1,8 @@
 # Rocksdb Change Log
+## Unreleased
+### Bug Fixes
+* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.)
+
 ## 6.29.0 (01/21/2022)
 Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info.
 ### Public API change
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@ -1502,6 +1502,63 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
 }

+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio = 0.015;
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  std::string key1("AA");
+  std::string key2("BB");
+  std::string key3("CC");
+  std::string key4("DD");
+  std::string key_not("EE");
+  std::string value1("Value1");
+  std::string value2("Value2");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key2, value2, WriteOptions()));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Put(key4, value4, WriteOptions()));
+
+  // Delete key2 and key3
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));
+
+  // Read without snapshot
+  auto results = MultiGet({key_not, key1, key2, key3, key4});
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], "NOT_FOUND");
+  ASSERT_EQ(results[3], "NOT_FOUND");
+  ASSERT_EQ(results[4], value4);
+
+  // Also check Get
+  ASSERT_EQ(Get(key1), value1);
+  ASSERT_EQ(Get(key2), "NOT_FOUND");
+  ASSERT_EQ(Get(key3), "NOT_FOUND");
+  ASSERT_EQ(Get(key4), value4);
+
+  // Read with snapshot
+  results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], value2);
+  ASSERT_EQ(results[3], value3);
+  ASSERT_EQ(results[4], "NOT_FOUND");
+
+  // Also check Get
+  ASSERT_EQ(Get(key1, snapshot), value1);
+  ASSERT_EQ(Get(key2, snapshot), value2);
+  ASSERT_EQ(Get(key3, snapshot), value3);
+  ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
  constexpr size_t kPrefixSize = 8;
  const std::string kKey = "key";
--- a/db/memtable.cc
+++ b/db/memtable.cc
@ -33,6 +33,7 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/internal_iterator.h"
 #include "table/iterator_wrapper.h"
@ -447,11 +448,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
      is_range_del_table_empty_.load(std::memory_order_relaxed)) {
    return nullptr;
  }
+  return NewRangeTombstoneIteratorInternal(read_options, read_seq);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
+    const ReadOptions& read_options, SequenceNumber read_seq) {
  auto* unfragmented_iter = new MemTableIterator(
      *this, read_options, nullptr /* arena */, true /* use_range_del_table */);
-  if (unfragmented_iter == nullptr) {
-    return nullptr;
-  }
  auto fragmented_tombstone_list =
      std::make_shared<FragmentedRangeTombstoneList>(
          std::unique_ptr<InternalIterator>(unfragmented_iter),
@ -960,53 +963,58 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
  }
  PERF_TIMER_GUARD(get_from_memtable_time);

+  // For now, memtable Bloom filter is effectively disabled if there are any
+  // range tombstones. This is the simplest way to ensure range tombstones are
+  // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
+  bool no_range_del = read_options.ignore_range_deletions ||
+                      is_range_del_table_empty_.load(std::memory_order_relaxed);
  MultiGetRange temp_range(*range, range->begin(), range->end());
-  if (bloom_filter_) {
-    std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
-    std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
-    autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
+  if (bloom_filter_ && no_range_del) {
+    bool whole_key =
+        !prefix_extractor_ || moptions_.memtable_whole_key_filtering;
+    std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
+    std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
+    std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
    int num_keys = 0;
    for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-      if (!prefix_extractor_) {
-        keys[num_keys++] = &iter->ukey_without_ts;
+      if (whole_key) {
+        bloom_keys[num_keys] = iter->ukey_without_ts;
+        range_indexes[num_keys++] = iter.index();
      } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
-        prefixes.emplace_back(
-            prefix_extractor_->Transform(iter->ukey_without_ts));
-        keys[num_keys++] = &prefixes.back();
-      }
-    }
-    bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]);
-    int idx = 0;
-    for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-      if (prefix_extractor_ &&
-          !prefix_extractor_->InDomain(iter->ukey_without_ts)) {
+        bloom_keys[num_keys] =
+            prefix_extractor_->Transform(iter->ukey_without_ts);
+        range_indexes[num_keys++] = iter.index();
+      } else {
+        // TODO: consider not counting these as Bloom hits to more closely
+        // match bloom_sst_hit_count
        PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
-        continue;
      }
-      if (!may_match[idx]) {
-        temp_range.SkipKey(iter);
+    }
+    bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
+    for (int i = 0; i < num_keys; ++i) {
+      if (!may_match[i]) {
+        temp_range.SkipIndex(range_indexes[i]);
        PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
      } else {
        PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
      }
-      idx++;
    }
  }
  for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-    SequenceNumber seq = kMaxSequenceNumber;
    bool found_final_value{false};
    bool merge_in_progress = iter->s->IsMergeInProgress();
+    if (!no_range_del) {
      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
-        NewRangeTombstoneIterator(
+          NewRangeTombstoneIteratorInternal(
              read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
-    if (range_del_iter != nullptr) {
      iter->max_covering_tombstone_seq = std::max(
          iter->max_covering_tombstone_seq,
          range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
    }
+    SequenceNumber dummy_seq;
    GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
                 callback, &iter->is_blob_index, iter->value->GetSelf(),
-                 iter->timestamp, iter->s, &(iter->merge_context), &seq,
+                 iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq,
                 &found_final_value, &merge_in_progress);

    if (!found_final_value && merge_in_progress) {
--- a/db/memtable.h
+++ b/db/memtable.h
@ -600,6 +600,10 @@ class MemTable {
                    std::string* value, std::string* timestamp, Status* s,
                    MergeContext* merge_context, SequenceNumber* seq,
                    bool* found_final_value, bool* merge_in_progress);
+
+  // Always returns non-null and assumes certain pre-checks are done
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
+      const ReadOptions& read_options, SequenceNumber read_seq);
 };

 extern const char* EncodeKey(std::string* scratch, const Slice& target);
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@ -366,10 +366,16 @@ struct AdvancedColumnFamilyOptions {
                                   Slice delta_value,
                                   std::string* merged_value) = nullptr;

-  // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
-  // create prefix bloom for memtable with the size of
+  // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
+  // Bloom filter in memtable to optimize many queries that must go beyond
+  // the memtable. The size in bytes of the filter is
  // write_buffer_size * memtable_prefix_bloom_size_ratio.
-  // If it is larger than 0.25, it is sanitized to 0.25.
+  // * If prefix_extractor is set, the filter includes prefixes.
+  // * If memtable_whole_key_filtering, the filter includes whole keys.
+  // * If both, the filter includes both.
+  // * If neither, the feature is disabled.
+  //
+  // If this value is larger than 0.25, it is sanitized to 0.25.
  //
  // Default: 0 (disable)
  //
--- a/table/multiget_context.h
+++ b/table/multiget_context.h
@ -235,9 +235,9 @@ class MultiGetContext {

    bool empty() const { return RemainingMask() == 0; }

-    void SkipKey(const Iterator& iter) {
-      skip_mask_ |= uint64_t{1} << iter.index_;
-    }
+    void SkipIndex(size_t index) { skip_mask_ |= uint64_t{1} << index; }
+
+    void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); }

    bool IsKeySkipped(const Iterator& iter) const {
      return skip_mask_ & (uint64_t{1} << iter.index_);
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@ -87,6 +87,7 @@ default_params = {
    "partition_filters": lambda: random.randint(0, 1),
    "partition_pinning": lambda: random.randint(0, 3),
    "pause_background_one_in": 1000000,
+    "prefix_size" : -1,
    "prefixpercent": 5,
    "progress_reports": 0,
    "readpercent": 45,
@ -155,6 +156,8 @@ default_params = {
    "user_timestamp_size": 0,
    "secondary_cache_fault_one_in" : lambda: random.choice([0, 0, 32]),
    "prepopulate_block_cache" : lambda: random.choice([0, 1]),
+    "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]),
+    "memtable_whole_key_filtering": lambda: random.randint(0, 1),
 }

 _TEST_DIR_ENV_VAR = 'TEST_TMPDIR'
@ -242,7 +245,6 @@ simple_default_params = {
    "memtablerep": "skip_list",
    "prefixpercent": 0,
    "readpercent": 50,
-    "prefix_size" : -1,
    "target_file_size_base": 16777216,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
@ -401,6 +403,9 @@ def finalize_and_sanitize(src_params):
        dest_params["test_batches_snapshots"] = 0
    if dest_params.get("test_batches_snapshots") == 0:
        dest_params["batch_protection_bytes_per_key"] = 0
+    if (dest_params.get("prefix_size") == -1 and
+        dest_params.get("memtable_whole_key_filtering") == 0):
+        dest_params["memtable_prefix_bloom_size_ratio"] = 0
    return dest_params

 def gen_cmd_params(args):
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@ -65,7 +65,7 @@ class DynamicBloom {
  // Multithreaded access to this function is OK
  bool MayContain(const Slice& key) const;

-  void MayContain(int num_keys, Slice** keys, bool* may_match) const;
+  void MayContain(int num_keys, Slice* keys, bool* may_match) const;

  // Multithreaded access to this function is OK
  bool MayContainHash(uint32_t hash) const;
@ -120,12 +120,12 @@ inline bool DynamicBloom::MayContain(const Slice& key) const {
  return (MayContainHash(BloomHash(key)));
 }

-inline void DynamicBloom::MayContain(int num_keys, Slice** keys,
+inline void DynamicBloom::MayContain(int num_keys, Slice* keys,
                                     bool* may_match) const {
  std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
  std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
  for (int i = 0; i < num_keys; ++i) {
-    hashes[i] = BloomHash(*keys[i]);
+    hashes[i] = BloomHash(keys[i]);
    size_t a = FastRange32(kLen, hashes[i]);
    PREFETCH(data_ + a, 0, 3);
    byte_offsets[i] = a;