From 28a9d8910bbc706d9e7dd20c0d1931cc9f55042a Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Tue, 19 Jun 2018 14:06:17 -0700 Subject: [PATCH] Fix the bug with duplicate prefix in partition filters (#4024) Summary: https://github.com/facebook/rocksdb/pull/3764 introduced an optimization feature to skip duplicate prefix entires in full bloom filters. Unfortunately it also introduces a bug in partitioned full filters, where the duplicate prefix should still be inserted if it is in a new partition. The patch fixes the bug by resetting the duplicate detection logic each time a partition is cut. This bug could result into false negatives, which means that DB could skip an existing key. Closes https://github.com/facebook/rocksdb/pull/4024 Differential Revision: D8518866 Pulled By: maysamyabandeh fbshipit-source-id: 044f4d988e606a330ecafd8c79daceb68b8796bf --- HISTORY.md | 1 + table/block_based_table_reader.h | 2 +- table/full_filter_block.cc | 6 ++ table/full_filter_block.h | 1 + table/partitioned_filter_block.cc | 1 + table/partitioned_filter_block_test.cc | 82 +++++++++++++++++++------- 6 files changed, 72 insertions(+), 21 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c633014f6..37d3ce216 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,6 +10,7 @@ ### Bug Fixes * fix deadlock with enable_pipelined_write=true and max_successive_merges > 0 * Fix corruption in non-iterator reads when mmap is used for file reads +* Fix bug with prefix search in partition filters where a shared prefix would be ignored from the later partitions. The bug could report an eixstent key as missing. The bug could be triggered if prefix_extractor is set and partition filters is enabled. ## 5.14.0 (5/16/2018) ### Public API Change diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index cedab053c..f53774f5e 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -347,7 +347,7 @@ class BlockBasedTable : public TableReader { Status VerifyChecksumInBlocks(InternalIterator* index_iter); // Create the filter from the filter block. - FilterBlockReader* ReadFilter( + virtual FilterBlockReader* ReadFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, const bool is_a_filter_partition, const SliceTransform* prefix_extractor = nullptr) const; diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc index 359b19c62..150caa6d1 100644 --- a/table/full_filter_block.cc +++ b/table/full_filter_block.cc @@ -72,8 +72,14 @@ inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { } } +void FullFilterBlockBuilder::Reset() { + last_whole_key_recorded_ = false; + last_prefix_recorded_ = false; +} + Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, Status* status) { + Reset(); // In this impl we ignore BlockHandle *status = Status::OK(); if (num_added_ != 0) { diff --git a/table/full_filter_block.h b/table/full_filter_block.h index 628cefdd8..efa93da12 100644 --- a/table/full_filter_block.h +++ b/table/full_filter_block.h @@ -52,6 +52,7 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { protected: virtual void AddKey(const Slice& key); std::unique_ptr filter_bits_builder_; + virtual void Reset(); private: // important: all of these might point to invalid addresses diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc index 28cc3736b..e9d02ea67 100644 --- a/table/partitioned_filter_block.cc +++ b/table/partitioned_filter_block.cc @@ -49,6 +49,7 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { std::string& index_key = p_index_builder_->GetPartitionKey(); filters.push_back({index_key, filter}); filters_in_partition_ = 0; + Reset(); } void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index 6317e9107..9f012e269 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -30,13 +30,24 @@ class MockedBlockBasedTable : public BlockBasedTable { virtual CachableEntry GetFilter( FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, const bool /* unused */, bool /* unused */, GetContext* /* unused */, - const SliceTransform* /* unused */) const override { + const SliceTransform* prefix_extractor) const override { Slice slice = slices[filter_blk_handle.offset()]; auto obj = new FullFilterBlockReader( - nullptr, true, BlockContents(slice, false, kNoCompression), + prefix_extractor, true, BlockContents(slice, false, kNoCompression), rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); return {obj, nullptr}; } + + virtual FilterBlockReader* ReadFilter( + FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, + const bool /* unused */, + const SliceTransform* prefix_extractor) const override { + Slice slice = slices[filter_blk_handle.offset()]; + auto obj = new FullFilterBlockReader( + prefix_extractor, true, BlockContents(slice, false, kNoCompression), + rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); + return obj; + } }; class PartitionedFilterBlockTest : public testing::Test { @@ -93,7 +104,8 @@ class PartitionedFilterBlockTest : public testing::Test { } PartitionedFilterBlockBuilder* NewBuilder( - PartitionedIndexBuilder* const p_index_builder) { + PartitionedIndexBuilder* const p_index_builder, + const SliceTransform* prefix_extractor = nullptr) { assert(table_options_.block_size_deviation <= 100); auto partition_size = static_cast( ((table_options_.metadata_block_size * @@ -102,7 +114,7 @@ class PartitionedFilterBlockTest : public testing::Test { 100); partition_size = std::max(partition_size, static_cast(1)); return new PartitionedFilterBlockBuilder( - nullptr, table_options_.whole_key_filtering, + prefix_extractor, table_options_.whole_key_filtering, table_options_.filter_policy->GetFilterBitsBuilder(), table_options_.index_block_restart_interval, p_index_builder, partition_size); @@ -111,7 +123,8 @@ class PartitionedFilterBlockTest : public testing::Test { std::unique_ptr table; PartitionedFilterBlockReader* NewReader( - PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, + const SliceTransform* prefix_extractor) { BlockHandle bh; Status status; Slice slice; @@ -126,41 +139,42 @@ class PartitionedFilterBlockTest : public testing::Test { table.reset(new MockedBlockBasedTable(new BlockBasedTable::Rep( ioptions, env_options, table_options_, icomp, false))); auto reader = new PartitionedFilterBlockReader( - nullptr, true, BlockContents(slice, false, kNoCompression), nullptr, - nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq()); + prefix_extractor, true, BlockContents(slice, false, kNoCompression), + nullptr, nullptr, icomp, table.get(), pib->seperator_is_key_plus_seq()); return reader; } void VerifyReader(PartitionedFilterBlockBuilder* builder, - PartitionedIndexBuilder* pib, bool empty = false) { + PartitionedIndexBuilder* pib, bool empty = false, + const SliceTransform* prefix_extractor = nullptr) { std::unique_ptr reader( - NewReader(builder, pib)); + NewReader(builder, pib, prefix_extractor)); // Querying added keys const bool no_io = true; for (auto key : keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE( - reader->KeyMayMatch(key, nullptr, kNotValid, !no_io, &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, + &ikey_slice)); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(keys[0], nullptr, kNotValid, !no_io, - &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, + !no_io, &ikey_slice)); } // querying missing keys for (auto key : missing_keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { - ASSERT_TRUE( - reader->KeyMayMatch(key, nullptr, kNotValid, !no_io, &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, + !no_io, &ikey_slice)); } else { // assuming a good hash function - ASSERT_FALSE( - reader->KeyMayMatch(key, nullptr, kNotValid, !no_io, &ikey_slice)); + ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, + !no_io, &ikey_slice)); } } } @@ -187,10 +201,10 @@ class PartitionedFilterBlockTest : public testing::Test { return CountNumOfIndexPartitions(pib.get()); } - void TestBlockPerTwoKeys() { + void TestBlockPerTwoKeys(const SliceTransform* prefix_extractor = nullptr) { std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr builder( - NewBuilder(pib.get())); + NewBuilder(pib.get(), prefix_extractor)); int i = 0; builder->Add(keys[i]); i++; @@ -203,7 +217,7 @@ class PartitionedFilterBlockTest : public testing::Test { builder->Add(keys[i]); CutABlock(pib.get(), keys[i]); - VerifyReader(builder.get(), pib.get()); + VerifyReader(builder.get(), pib.get(), prefix_extractor); } void TestBlockPerAllKeys() { @@ -281,6 +295,34 @@ TEST_F(PartitionedFilterBlockTest, TwoBlocksPerKey) { } } +// This reproduces the bug that a prefix is the same among multiple consecutive +// blocks but the bug would add it only to the first block. +TEST_F(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr prefix_extractor + (rocksdb::NewFixedPrefixTransform(1)); + std::unique_ptr pib(NewIndexBuilder()); + std::unique_ptr builder( + NewBuilder(pib.get(), prefix_extractor.get())); + const std::string pkeys[3] = {"p-key1", "p-key2", "p-key3"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2]); + std::unique_ptr reader( + NewReader(builder.get(), pib.get(), prefix_extractor.get())); + for (auto key : pkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), + prefix_extractor.get(), kNotValid, + false /*no_io*/, &ikey_slice)); + } +} + TEST_F(PartitionedFilterBlockTest, OneBlockPerKey) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) {