From 685e895652da4b18c61069b423b9b9c5669eea53 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Thu, 31 Oct 2019 14:11:08 -0700 Subject: [PATCH] Prepare filter tests for more implementations (#5967) Summary: This change sets up for alternate implementations underlying BloomFilterPolicy: * Refactor BloomFilterPolicy and expose in internal .h file so that it's easy to iterate over / select implementations for testing, regardless of what the best public interface will look like. Most notably updated db_bloom_filter_test to use this. * Hide FullFilterBitsBuilder from unit tests (alternate derived classes planned); expose the part important for testing (CalculateSpace), as abstract class BuiltinFilterBitsBuilder. (Also cleaned up internally exposed interface to CalculateSpace.) * Rename BloomTest -> BlockBasedBloomTest for clarity (despite ongoing confusion between block-based table and block-based filter) * Assert that block-based filter construction interface is only used on BloomFilterPolicy appropriately constructed. (A couple of tests updated to add ", true".) Pull Request resolved: https://github.com/facebook/rocksdb/pull/5967 Test Plan: make check Differential Revision: D18138704 Pulled By: pdillinger fbshipit-source-id: 55ef9273423b0696309e251f50b8c1b5e9ec7597 --- db/db_bloom_filter_test.cc | 136 +++--- include/rocksdb/filter_policy.h | 4 +- .../block_based_filter_block_test.cc | 2 +- table/block_based/filter_policy.cc | 386 ++++++++++-------- table/block_based/filter_policy_internal.h | 90 ++-- util/bloom_test.cc | 44 +- 6 files changed, 356 insertions(+), 306 deletions(-) diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index beed590ae..d2e88b0e4 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -10,9 +10,14 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "table/block_based/filter_policy_internal.h" namespace rocksdb { +namespace { +using BFP = BloomFilterPolicy; +} // namespace + // DB tests related to bloom filter. class DBBloomFilterTest : public DBTestBase { @@ -20,12 +25,12 @@ class DBBloomFilterTest : public DBTestBase { DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {} }; -class DBBloomFilterTestWithParam - : public DBTestBase, - public testing::WithParamInterface> { +class DBBloomFilterTestWithParam : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { // public testing::WithParamInterface { protected: - bool use_block_based_filter_; + BFP::Impl bfp_impl_; bool partition_filters_; uint32_t format_version_; @@ -35,7 +40,7 @@ class DBBloomFilterTestWithParam ~DBBloomFilterTestWithParam() override {} void SetUp() override { - use_block_based_filter_ = std::get<0>(GetParam()); + bfp_impl_ = std::get<0>(GetParam()); partition_filters_ = std::get<1>(GetParam()); format_version_ = std::get<2>(GetParam()); } @@ -71,8 +76,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { ReadOptions ropts; std::string value; anon::OptionsOverride options_override; - options_override.filter_policy.reset( - NewBloomFilterPolicy(20, use_block_based_filter_)); + options_override.filter_policy.reset(new BFP(20, bfp_impl_)); options_override.partition_filters = partition_filters_; options_override.metadata_block_size = 32; Options options = CurrentOptions(options_override); @@ -432,8 +436,7 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { // trigger reset of table_factory BlockBasedTableOptions table_options; table_options.no_block_cache = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_filter_)); + table_options.filter_policy.reset(new BFP(10, bfp_impl_)); table_options.partition_filters = partition_filters_; if (partition_filters_) { table_options.index_type = @@ -502,24 +505,24 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { #ifndef ROCKSDB_VALGRIND_RUN INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestDefFormatVersion, - ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), - std::make_tuple(false, true, test::kDefaultFormatVersion), - std::make_tuple(false, false, - test::kDefaultFormatVersion))); + ::testing::Values( + std::make_tuple(BFP::kBlock, false, test::kDefaultFormatVersion), + std::make_tuple(BFP::kFull, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kFull, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), - std::make_tuple(false, true, test::kDefaultFormatVersion), - std::make_tuple(false, false, - test::kDefaultFormatVersion))); + ::testing::Values( + std::make_tuple(BFP::kBlock, false, test::kDefaultFormatVersion), + std::make_tuple(BFP::kFull, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kFull, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatLatest, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(true, false, test::kLatestFormatVersion), - std::make_tuple(false, true, test::kLatestFormatVersion), - std::make_tuple(false, false, - test::kLatestFormatVersion))); + ::testing::Values( + std::make_tuple(BFP::kBlock, false, test::kLatestFormatVersion), + std::make_tuple(BFP::kFull, true, test::kLatestFormatVersion), + std::make_tuple(BFP::kFull, false, test::kLatestFormatVersion))); #endif // ROCKSDB_VALGRIND_RUN TEST_F(DBBloomFilterTest, BloomFilterRate) { @@ -640,7 +643,7 @@ namespace { class WrappedBloom : public FilterPolicy { public: explicit WrappedBloom(int bits_per_key) - : filter_(NewBloomFilterPolicy(bits_per_key)), counter_(0) {} + : filter_(NewBloomFilterPolicy(bits_per_key, true)), counter_(0) {} ~WrappedBloom() override { delete filter_; } @@ -858,11 +861,11 @@ TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { #ifndef ROCKSDB_LITE class BloomStatsTestWithParam : public DBBloomFilterTest, - public testing::WithParamInterface> { + public testing::WithParamInterface> { public: BloomStatsTestWithParam() { use_block_table_ = std::get<0>(GetParam()); - use_block_based_builder_ = std::get<1>(GetParam()); + bfp_impl_ = std::get<1>(GetParam()); partition_filters_ = std::get<2>(GetParam()); options_.create_if_missing = true; @@ -873,13 +876,12 @@ class BloomStatsTestWithParam BlockBasedTableOptions table_options; table_options.hash_index_allow_collision = false; if (partition_filters_) { - assert(!use_block_based_builder_); + assert(bfp_impl_ != BFP::kBlock); table_options.partition_filters = partition_filters_; table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder_)); + table_options.filter_policy.reset(new BFP(10, bfp_impl_)); options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); } else { assert(!partition_filters_); // not supported in plain table @@ -902,7 +904,7 @@ class BloomStatsTestWithParam static void TearDownTestCase() {} bool use_block_table_; - bool use_block_based_builder_; + BFP::Impl bfp_impl_; bool partition_filters_; Options options_; }; @@ -1006,7 +1008,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_EQ(value3, iter->value().ToString()); // The seek doesn't check block-based bloom filter because last index key // starts with the same prefix we're seeking to. - uint64_t expected_hits = use_block_based_builder_ ? 1 : 2; + uint64_t expected_hits = bfp_impl_ == BFP::kBlock ? 1 : 2; ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); iter->Seek(key2); @@ -1016,12 +1018,12 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); } -INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam, - ::testing::Values(std::make_tuple(true, true, false), - std::make_tuple(true, false, false), - std::make_tuple(true, false, true), - std::make_tuple(false, false, - false))); +INSTANTIATE_TEST_CASE_P( + BloomStatsTestWithParam, BloomStatsTestWithParam, + ::testing::Values(std::make_tuple(true, BFP::kBlock, false), + std::make_tuple(true, BFP::kFull, false), + std::make_tuple(true, BFP::kFull, true), + std::make_tuple(false, BFP::kFull, false))); namespace { void PrefixScanInit(DBBloomFilterTest* dbtest) { @@ -1328,8 +1330,8 @@ int CountIter(std::unique_ptr& iter, const Slice& key) { // into the same string, or 2) the transformed seek key is of the same length // as the upper bound and two keys are adjacent according to the comparator. TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { - int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllImpls) { + int using_full_builder = bfp_impl != BFP::kBlock; Options options; options.create_if_missing = true; options.prefix_extractor.reset(NewCappedPrefixTransform(4)); @@ -1338,8 +1340,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); table_options.index_shortening = BlockBasedTableOptions:: IndexShorteningMode::kShortenSeparatorsAndSuccessor; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1382,7 +1383,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); // should check bloom filter since upper bound meets requirement ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration); + 2 + using_full_builder); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { @@ -1396,7 +1397,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); // should skip bloom filter since upper bound is too long ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration); + 2 + using_full_builder); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { @@ -1410,7 +1411,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { // should check bloom filter since upper bound matches transformed seek // key ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { @@ -1424,7 +1425,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); // should skip bloom filter since mismatch is found ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); @@ -1438,7 +1439,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}})); @@ -1451,18 +1452,17 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 3 + iteration * 2); + 3 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); } - iteration++; } } // Create multiple SST files each with a different prefix_extractor config, // verify iterators can read all SST files using the latest config. TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { - int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllImpls) { + int using_full_builder = bfp_impl != BFP::kBlock; Options options; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -1470,8 +1470,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { options.statistics = CreateDBStatistics(); // Enable prefix bloom for SST files BlockBasedTableOptions table_options; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -1497,10 +1496,10 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "foo"), 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 1 + iteration); + 1 + using_full_builder); ASSERT_EQ(CountIter(iter, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 1 + iteration); + 1 + using_full_builder); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); // second SST with capped:3 BF @@ -1514,13 +1513,13 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter_tmp(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // both counters are incremented because BF is "not changed" for 1 of the // 2 SST files, so filter is checked once and found no match. ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 3 + iteration * 2); + 3 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); } @@ -1539,24 +1538,24 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); // the first and last BF are checked ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 4 + iteration * 3); + 4 + using_full_builder * 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // only last BF is checked and not found ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 5 + iteration * 3); + 5 + using_full_builder * 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); } // iter_old can only see the first SST, so checked plus 1 ASSERT_EQ(CountIter(iter_old, "foo"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 6 + iteration * 3); + 6 + using_full_builder * 3); // iter was created after the first setoptions call so only full filter // will check the filter ASSERT_EQ(CountIter(iter, "foo"), 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 6 + iteration * 4); + 6 + using_full_builder * 4); { // keys in all three SSTs are visible to iterator @@ -1565,11 +1564,11 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter_all(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_all, "foo"), 9); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 7 + iteration * 5); + 7 + using_full_builder * 5); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 8 + iteration * 5); + 8 + using_full_builder * 5); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); @@ -1581,15 +1580,14 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { // all three SST are checked because the current options has the same as // the remaining SST (capped:3) ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 9 + iteration * 7); + 9 + using_full_builder * 7); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 10 + iteration * 7); + 10 + using_full_builder * 7); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); } // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? - iteration++; } } @@ -1598,7 +1596,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { // as expected TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllImpls) { Options options = CurrentOptions(); options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -1607,8 +1605,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options); ReadOptions read_options; @@ -1657,8 +1654,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { // Verify it's possible to change prefix_extractor at runtime and iterators // behaves as expected TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { - int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllImpls) { Options options; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -1667,8 +1663,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -1719,7 +1714,6 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { ASSERT_EQ(CountIter(iter_old, "abc"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); - iteration++; } } diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index c923ba354..6cc7bff67 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -138,8 +138,8 @@ class FilterPolicy { // // bits_per_key: bits per key in bloom filter. A good value for bits_per_key // is 10, which yields a filter with ~ 1% false positive rate. -// use_block_based_builder: use block based filter rather than full filter. -// If you want to builder full filter, it needs to be set to false. +// use_block_based_builder: use deprecated block based filter (true) rather +// than full or partitioned filter (false). // // Callers must delete the result after any database that is using the // result has been closed. diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc index eb7a2dc1d..677203ae2 100644 --- a/table/block_based/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -240,7 +240,7 @@ class BlockBasedFilterBlockTest : public mock::MockBlockBasedTableTester, public testing::Test { public: BlockBasedFilterBlockTest() - : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10)) {} + : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, true)) {} }; TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index c2acc570a..6a458f72b 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -20,9 +20,66 @@ namespace rocksdb { +namespace { + typedef LegacyLocalityBloomImpl LegacyFullFilterImpl; -class BlockBasedFilterBlockBuilder; -class FullFilterBlockBuilder; + +class FullFilterBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit FullFilterBitsBuilder(const int bits_per_key, const int num_probes); + + // No Copy allowed + FullFilterBitsBuilder(const FullFilterBitsBuilder&) = delete; + void operator=(const FullFilterBitsBuilder&) = delete; + + ~FullFilterBitsBuilder() override; + + void AddKey(const Slice& key) override; + + // Create a filter that for hashes [0, n-1], the filter is allocated here + // When creating filter, it is ensured that + // total_bits = num_lines * CACHE_LINE_SIZE * 8 + // dst len is >= 5, 1 for num_probes, 4 for num_lines + // Then total_bits = (len - 5) * 8, and cache_line_size could be calculated + // +----------------------------------------------------------------+ + // | filter data with length total_bits/8 | + // +----------------------------------------------------------------+ + // | | + // | ... | + // | | + // +----------------------------------------------------------------+ + // | ... | num_probes : 1 byte | num_lines : 4 bytes | + // +----------------------------------------------------------------+ + Slice Finish(std::unique_ptr* buf) override; + + int CalculateNumEntry(const uint32_t bytes) override; + + uint32_t CalculateSpace(const int num_entry) override { + uint32_t dont_care1; + uint32_t dont_care2; + return CalculateSpace(num_entry, &dont_care1, &dont_care2); + } + + private: + friend class FullFilterBlockTest_DuplicateEntries_Test; + int bits_per_key_; + int num_probes_; + std::vector hash_entries_; + + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Implementation-specific variant of public CalculateSpace + uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); +}; FullFilterBitsBuilder::FullFilterBitsBuilder(const int bits_per_key, const int num_probes) @@ -30,35 +87,35 @@ FullFilterBitsBuilder::FullFilterBitsBuilder(const int bits_per_key, assert(bits_per_key_); } - FullFilterBitsBuilder::~FullFilterBitsBuilder() {} +FullFilterBitsBuilder::~FullFilterBitsBuilder() {} - void FullFilterBitsBuilder::AddKey(const Slice& key) { - uint32_t hash = BloomHash(key); - if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { - hash_entries_.push_back(hash); +void FullFilterBitsBuilder::AddKey(const Slice& key) { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } +} + +Slice FullFilterBitsBuilder::Finish(std::unique_ptr* buf) { + uint32_t total_bits, num_lines; + char* data = ReserveSpace(static_cast(hash_entries_.size()), &total_bits, + &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); } } + data[total_bits / 8] = static_cast(num_probes_); + EncodeFixed32(data + total_bits / 8 + 1, static_cast(num_lines)); - Slice FullFilterBitsBuilder::Finish(std::unique_ptr* buf) { - uint32_t total_bits, num_lines; - char* data = ReserveSpace(static_cast(hash_entries_.size()), - &total_bits, &num_lines); - assert(data); + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); - if (total_bits != 0 && num_lines != 0) { - for (auto h : hash_entries_) { - AddHash(h, data, num_lines, total_bits); - } - } - data[total_bits/8] = static_cast(num_probes_); - EncodeFixed32(data + total_bits/8 + 1, static_cast(num_lines)); - - const char* const_data = data; - buf->reset(const_data); - hash_entries_.clear(); - - return Slice(data, total_bits / 8 + 5); - } + return Slice(data, total_bits / 8 + 5); +} uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { uint32_t num_lines = @@ -106,13 +163,11 @@ char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t bytes) { assert(bits_per_key_); assert(bytes > 0); - uint32_t dont_care1, dont_care2; int high = static_cast(bytes * 8 / bits_per_key_ + 1); int low = 1; int n = high; for (; n >= low; n--) { - uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2); - if (sz <= bytes) { + if (CalculateSpace(n) <= bytes) { break; } } @@ -131,7 +186,6 @@ inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, folly::constexpr_log2(CACHE_LINE_SIZE)); } -namespace { class AlwaysTrueFilter : public FilterBitsReader { public: bool MayMatch(const Slice&) override { return true; } @@ -196,148 +250,148 @@ class FullFilterBitsReader : public FilterBitsReader { const uint32_t log2_cache_line_size_; }; +} // namespace -// An implementation of filter policy -class BloomFilterPolicy : public FilterPolicy { - public: - explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder) - : bits_per_key_(bits_per_key), - use_block_based_builder_(use_block_based_builder) { - initialize(); - } - - ~BloomFilterPolicy() override {} - - const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; } - - void CreateFilter(const Slice* keys, int n, std::string* dst) const override { - // Compute bloom filter size (in both bits and bytes) - uint32_t bits = static_cast(n * bits_per_key_); - - // For small n, we can see a very high false positive rate. Fix it - // by enforcing a minimum bloom filter length. - if (bits < 64) bits = 64; - - uint32_t bytes = (bits + 7) / 8; - bits = bytes * 8; - - const size_t init_size = dst->size(); - dst->resize(init_size + bytes, 0); - dst->push_back(static_cast(num_probes_)); // Remember # of probes - char* array = &(*dst)[init_size]; - for (int i = 0; i < n; i++) { - LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes_, - array); - } - } - - bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override { - const size_t len = bloom_filter.size(); - if (len < 2 || len > 0xffffffffU) { - return false; - } - - const char* array = bloom_filter.data(); - const uint32_t bits = static_cast(len - 1) * 8; - - // Use the encoded k so that we can read filters generated by - // bloom filters created using different parameters. - const int k = static_cast(array[len - 1]); - if (k > 30) { - // Reserved for potentially new encodings for short bloom filters. - // Consider it a match. - return true; - } - // NB: using k not num_probes_ - return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k, - array); - } - - FilterBitsBuilder* GetFilterBitsBuilder() const override { - if (use_block_based_builder_) { - return nullptr; - } - - return new FullFilterBitsBuilder(bits_per_key_, num_probes_); - } - - // Read metadata to determine what kind of FilterBitsReader is needed - // and return a new one. - FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { - uint32_t len_with_meta = static_cast(contents.size()); - if (len_with_meta <= 5) { - // filter is empty or broken. Treat like zero keys added. - return new AlwaysFalseFilter(); - } - - char raw_num_probes = contents.data()[len_with_meta - 5]; - // NB: *num_probes > 30 and < 128 probably have not been used, because of - // BloomFilterPolicy::initialize, unless directly calling - // FullFilterBitsBuilder as an API, but we are leaving those cases in - // limbo with FullFilterBitsReader for now. - - if (raw_num_probes < 1) { - // Treat as zero probes (always FP) for now. - // NB: < 0 (or unsigned > 127) effectively reserved for future use. - return new AlwaysTrueFilter(); - } - // else attempt decode for FullFilterBitsReader - - int num_probes = raw_num_probes; - assert(num_probes >= 1); - assert(num_probes <= 127); - - uint32_t len = len_with_meta - 5; - assert(len > 0); - - uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); - uint32_t log2_cache_line_size; - - if (num_lines * CACHE_LINE_SIZE == len) { - // Common case - log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE); - } else if (num_lines == 0 || len % num_lines != 0) { - // Invalid (no solution to num_lines * x == len) - // Treat as zero probes (always FP) for now. - return new AlwaysTrueFilter(); - } else { - // Determine the non-native cache line size (from another system) - log2_cache_line_size = 0; - while ((num_lines << log2_cache_line_size) < len) { - ++log2_cache_line_size; - } - if ((num_lines << log2_cache_line_size) != len) { - // Invalid (block size not a power of two) - // Treat as zero probes (always FP) for now. - return new AlwaysTrueFilter(); - } - } - // if not early return - return new FullFilterBitsReader(contents.data(), num_probes, num_lines, - log2_cache_line_size); - } - - // If choose to use block based builder - bool UseBlockBasedBuilder() { return use_block_based_builder_; } - - private: - int bits_per_key_; - int num_probes_; - const bool use_block_based_builder_; - - void initialize() { - // We intentionally round down to reduce probing cost a little bit - num_probes_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) - if (num_probes_ < 1) num_probes_ = 1; - if (num_probes_ > 30) num_probes_ = 30; - } +const std::vector BloomFilterPolicy::kAllImpls = { + kFull, + kBlock, }; -} // namespace +BloomFilterPolicy::BloomFilterPolicy(int bits_per_key, Impl impl) + : bits_per_key_(bits_per_key), impl_(impl) { + // We intentionally round down to reduce probing cost a little bit + num_probes_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) + if (num_probes_ < 1) num_probes_ = 1; + if (num_probes_ > 30) num_probes_ = 30; +} + +BloomFilterPolicy::~BloomFilterPolicy() {} + +const char* BloomFilterPolicy::Name() const { + return "rocksdb.BuiltinBloomFilter"; +} + +void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + // We should ideally only be using this deprecated interface for + // appropriately constructed BloomFilterPolicy + assert(impl_ == kBlock); + + // Compute bloom filter size (in both bits and bytes) + uint32_t bits = static_cast(n * bits_per_key_); + + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. + if (bits < 64) bits = 64; + + uint32_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast(num_probes_)); // Remember # of probes + char* array = &(*dst)[init_size]; + for (int i = 0; i < n; i++) { + LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes_, + array); + } +} + +bool BloomFilterPolicy::KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const { + const size_t len = bloom_filter.size(); + if (len < 2 || len > 0xffffffffU) { + return false; + } + + const char* array = bloom_filter.data(); + const uint32_t bits = static_cast(len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + const int k = static_cast(array[len - 1]); + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + // NB: using k not num_probes_ + return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k, + array); +} + +FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { + if (impl_ == kBlock) { + return nullptr; + } else { + return new FullFilterBitsBuilder(bits_per_key_, num_probes_); + } +} + +// Read metadata to determine what kind of FilterBitsReader is needed +// and return a new one. +FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + if (len_with_meta <= 5) { + // filter is empty or broken. Treat like zero keys added. + return new AlwaysFalseFilter(); + } + + char raw_num_probes = contents.data()[len_with_meta - 5]; + // NB: *num_probes > 30 and < 128 probably have not been used, because of + // BloomFilterPolicy::initialize, unless directly calling + // FullFilterBitsBuilder as an API, but we are leaving those cases in + // limbo with FullFilterBitsReader for now. + + if (raw_num_probes < 1) { + // Treat as zero probes (always FP) for now. + // NB: < 0 (or unsigned > 127) effectively reserved for future use. + return new AlwaysTrueFilter(); + } + // else attempt decode for FullFilterBitsReader + + int num_probes = raw_num_probes; + assert(num_probes >= 1); + assert(num_probes <= 127); + + uint32_t len = len_with_meta - 5; + assert(len > 0); + + uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); + uint32_t log2_cache_line_size; + + if (num_lines * CACHE_LINE_SIZE == len) { + // Common case + log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE); + } else if (num_lines == 0 || len % num_lines != 0) { + // Invalid (no solution to num_lines * x == len) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } else { + // Determine the non-native cache line size (from another system) + log2_cache_line_size = 0; + while ((num_lines << log2_cache_line_size) < len) { + ++log2_cache_line_size; + } + if ((num_lines << log2_cache_line_size) != len) { + // Invalid (block size not a power of two) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + } + // if not early return + return new FullFilterBitsReader(contents.data(), num_probes, num_lines, + log2_cache_line_size); +} const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, bool use_block_based_builder) { - return new BloomFilterPolicy(bits_per_key, use_block_based_builder); + if (use_block_based_builder) { + return new BloomFilterPolicy(bits_per_key, BloomFilterPolicy::kBlock); + } else { + return new BloomFilterPolicy(bits_per_key, BloomFilterPolicy::kFull); + } } FilterPolicy::~FilterPolicy() { } diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index 458a34412..017c8b1d2 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -18,57 +18,61 @@ namespace rocksdb { class Slice; -class FullFilterBitsBuilder : public FilterBitsBuilder { +// Exposes any extra information needed for testing built-in +// FilterBitsBuilders +class BuiltinFilterBitsBuilder : public FilterBitsBuilder { public: - explicit FullFilterBitsBuilder(const int bits_per_key, const int num_probes); - - // No Copy allowed - FullFilterBitsBuilder(const FullFilterBitsBuilder&) = delete; - void operator=(const FullFilterBitsBuilder&) = delete; - - ~FullFilterBitsBuilder(); - - void AddKey(const Slice& key) override; - - // Create a filter that for hashes [0, n-1], the filter is allocated here - // When creating filter, it is ensured that - // total_bits = num_lines * CACHE_LINE_SIZE * 8 - // dst len is >= 5, 1 for num_probes, 4 for num_lines - // Then total_bits = (len - 5) * 8, and cache_line_size could be calculated - // +----------------------------------------------------------------+ - // | filter data with length total_bits/8 | - // +----------------------------------------------------------------+ - // | | - // | ... | - // | | - // +----------------------------------------------------------------+ - // | ... | num_probes : 1 byte | num_lines : 4 bytes | - // +----------------------------------------------------------------+ - Slice Finish(std::unique_ptr* buf) override; - - int CalculateNumEntry(const uint32_t bytes) override; - // Calculate number of bytes needed for a new filter, including // metadata. Passing the result to CalculateNumEntry should // return >= the num_entry passed in. - uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, - uint32_t* num_lines); + virtual uint32_t CalculateSpace(const int num_entry) = 0; +}; + +// RocksDB built-in filter policy for Bloom or Bloom-like filters. +// This class is considered internal API and subject to change. +// See NewBloomFilterPolicy. +class BloomFilterPolicy : public FilterPolicy { + public: + // An internal marker for which Bloom filter implementation to use. + // This makes it easier for tests to track or to walk over the built-in + // set of Bloom filter policies. The only variance in BloomFilterPolicy + // by implementation is in GetFilterBitsBuilder(), so an enum is practical + // here vs. subclasses. + enum Impl { + // Implementation of Bloom filter for full and partitioned filters. + // Set to 0 in case of value confusion with bool use_block_based_builder + kFull = 0, + // Deprecated block-based Bloom filter implementation. + // Set to 1 in case of value confusion with bool use_block_based_builder + kBlock = 1, + }; + static const std::vector kAllImpls; + + explicit BloomFilterPolicy(int bits_per_key, Impl impl); + + ~BloomFilterPolicy() override; + + const char* Name() const override; + + // Deprecated block-based filter only + void CreateFilter(const Slice* keys, int n, std::string* dst) const override; + + // Deprecated block-based filter only + bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override; + + FilterBitsBuilder* GetFilterBitsBuilder() const override; + + // Read metadata to determine what kind of FilterBitsReader is needed + // and return a new one. This must successfully process any filter data + // generated by a built-in FilterBitsBuilder, regardless of the impl + // chosen for this BloomFilterPolicy. Not compatible with CreateFilter. + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; private: - friend class FullFilterBlockTest_DuplicateEntries_Test; int bits_per_key_; int num_probes_; - std::vector hash_entries_; - - // Get totalbits that optimized for cpu cache line - uint32_t GetTotalBitsForLocality(uint32_t total_bits); - - // Reserve space for new filter - char* ReserveSpace(const int num_entry, uint32_t* total_bits, - uint32_t* num_lines); - - // Assuming single threaded access to this function. - void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); + // Selected implementation for building new SST filters + Impl impl_; }; } // namespace rocksdb diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 9e9b794f1..9d969fea9 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -55,15 +55,15 @@ static int NextLength(int length) { return length; } -class BloomTest : public testing::Test { +class BlockBasedBloomTest : public testing::Test { private: std::unique_ptr policy_; std::string filter_; std::vector keys_; public: - BloomTest() : policy_( - NewBloomFilterPolicy(FLAGS_bits_per_key)) {} + BlockBasedBloomTest() + : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, true)) {} void Reset() { keys_.clear(); @@ -72,7 +72,7 @@ class BloomTest : public testing::Test { void ResetPolicy(const FilterPolicy* policy = nullptr) { if (policy == nullptr) { - policy_.reset(NewBloomFilterPolicy(FLAGS_bits_per_key)); + policy_.reset(NewBloomFilterPolicy(FLAGS_bits_per_key, true)); } else { policy_.reset(policy); } @@ -131,12 +131,12 @@ class BloomTest : public testing::Test { } }; -TEST_F(BloomTest, EmptyFilter) { +TEST_F(BlockBasedBloomTest, EmptyFilter) { ASSERT_TRUE(! Matches("hello")); ASSERT_TRUE(! Matches("world")); } -TEST_F(BloomTest, Small) { +TEST_F(BlockBasedBloomTest, Small) { Add("hello"); Add("world"); ASSERT_TRUE(Matches("hello")); @@ -145,7 +145,7 @@ TEST_F(BloomTest, Small) { ASSERT_TRUE(! Matches("foo")); } -TEST_F(BloomTest, VaryingLengths) { +TEST_F(BlockBasedBloomTest, VaryingLengths) { char buffer[sizeof(int)]; // Count number of filters that significantly exceed the false positive rate @@ -186,45 +186,45 @@ TEST_F(BloomTest, VaryingLengths) { // Ensure the implementation doesn't accidentally change in an // incompatible way -TEST_F(BloomTest, Schema) { +TEST_F(BlockBasedBloomTest, Schema) { char buffer[sizeof(int)]; - ResetPolicy(NewBloomFilterPolicy(8)); // num_probes = 5 + ResetPolicy(NewBloomFilterPolicy(8, true)); // num_probes = 5 for (int key = 0; key < 87; key++) { Add(Key(key, buffer)); } Build(); ASSERT_EQ(BloomHash(FilterData()), 3589896109U); - ResetPolicy(NewBloomFilterPolicy(9)); // num_probes = 6 + ResetPolicy(NewBloomFilterPolicy(9, true)); // num_probes = 6 for (int key = 0; key < 87; key++) { Add(Key(key, buffer)); } Build(); ASSERT_EQ(BloomHash(FilterData()), 969445585); - ResetPolicy(NewBloomFilterPolicy(11)); // num_probes = 7 + ResetPolicy(NewBloomFilterPolicy(11, true)); // num_probes = 7 for (int key = 0; key < 87; key++) { Add(Key(key, buffer)); } Build(); ASSERT_EQ(BloomHash(FilterData()), 1694458207); - ResetPolicy(NewBloomFilterPolicy(10)); // num_probes = 6 + ResetPolicy(NewBloomFilterPolicy(10, true)); // num_probes = 6 for (int key = 0; key < 87; key++) { Add(Key(key, buffer)); } Build(); ASSERT_EQ(BloomHash(FilterData()), 2373646410U); - ResetPolicy(NewBloomFilterPolicy(10)); + ResetPolicy(NewBloomFilterPolicy(10, true)); for (int key = 1; key < 87; key++) { Add(Key(key, buffer)); } Build(); ASSERT_EQ(BloomHash(FilterData()), 1908442116); - ResetPolicy(NewBloomFilterPolicy(10)); + ResetPolicy(NewBloomFilterPolicy(10, true)); for (int key = 1; key < 88; key++) { Add(Key(key, buffer)); } @@ -251,8 +251,9 @@ class FullBloomTest : public testing::Test { Reset(); } - FullFilterBitsBuilder* GetFullFilterBitsBuilder() { - return dynamic_cast(bits_builder_.get()); + BuiltinFilterBitsBuilder* GetBuiltinFilterBitsBuilder() { + // Throws on bad cast + return &dynamic_cast(*bits_builder_.get()); } void Reset() { @@ -322,15 +323,12 @@ class FullBloomTest : public testing::Test { }; TEST_F(FullBloomTest, FilterSize) { - uint32_t dont_care1, dont_care2; - auto full_bits_builder = GetFullFilterBitsBuilder(); - ASSERT_TRUE(full_bits_builder != nullptr); + auto bits_builder = GetBuiltinFilterBitsBuilder(); for (int n = 1; n < 100; n++) { - auto space = full_bits_builder->CalculateSpace(n, &dont_care1, &dont_care2); - auto n2 = full_bits_builder->CalculateNumEntry(space); + auto space = bits_builder->CalculateSpace(n); + auto n2 = bits_builder->CalculateNumEntry(space); ASSERT_GE(n2, n); - auto space2 = - full_bits_builder->CalculateSpace(n2, &dont_care1, &dont_care2); + auto space2 = bits_builder->CalculateSpace(n2); ASSERT_EQ(space, space2); } }