From 45b9bb03317dd19e39d669969cc7e5fd531c77da Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Sun, 2 Jul 2017 10:36:10 -0700 Subject: [PATCH] Cut filter partition based on metadata_block_size Summary: Currently metadata_block_size controls only index partition size. With this patch a partition is cut after any of index or filter partitions reaches metadata_block_size. Closes https://github.com/facebook/rocksdb/pull/2452 Differential Revision: D5275651 Pulled By: maysamyabandeh fbshipit-source-id: 5057e4424b4c8902043782e6bf8c38f0c4f25160 --- include/rocksdb/filter_policy.h | 14 +++- table/block_based_table_builder.cc | 10 ++- table/full_filter_bits_builder.h | 73 +++++++++++++++++++++ table/index_builder.cc | 6 ++ table/index_builder.h | 7 ++ table/partitioned_filter_block.cc | 16 ++++- table/partitioned_filter_block.h | 11 +++- table/partitioned_filter_block_test.cc | 23 ++++++- util/bloom.cc | 88 +++++++++++--------------- util/bloom_test.cc | 20 +++++- 10 files changed, 209 insertions(+), 59 deletions(-) create mode 100644 table/full_filter_bits_builder.h diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 662cf4ef2..0593c04a1 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -20,8 +20,10 @@ #ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ #define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ -#include #include +#include +#include +#include namespace rocksdb { @@ -41,6 +43,16 @@ class FilterBitsBuilder { // The return value of this function would be the filter bits, // The ownership of actual data is set to buf virtual Slice Finish(std::unique_ptr* buf) = 0; + + // Calculate num of entries fit into a space. + virtual int CalculateNumEntry(const uint32_t space) { +#ifndef ROCKSDB_LITE + throw std::runtime_error("CalculateNumEntry not Implemented"); +#else + abort(); +#endif + return 0; + } }; // A class that checks if a key can be in filter diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index e3cc2b8d3..6545c7b8f 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -76,10 +76,18 @@ FilterBlockBuilder* CreateFilterBlockBuilder( } else { if (table_opt.partition_filters) { assert(p_index_builder != nullptr); + // Since after partition cut request from filter builder it takes time + // until index builder actully cuts the partition, we take the lower bound + // as partition size. + assert(table_opt.block_size_deviation <= 100); + auto partition_size = + (const uint32_t)(table_opt.metadata_block_size * + (100 - table_opt.block_size_deviation)); + partition_size = std::max(partition_size, (const uint32_t)1); return new PartitionedFilterBlockBuilder( opt.prefix_extractor, table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, - p_index_builder); + p_index_builder, partition_size); } else { return new FullFilterBlockBuilder(opt.prefix_extractor, table_opt.whole_key_filtering, diff --git a/table/full_filter_bits_builder.h b/table/full_filter_bits_builder.h new file mode 100644 index 000000000..c47a74754 --- /dev/null +++ b/table/full_filter_bits_builder.h @@ -0,0 +1,73 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include + +#include "rocksdb/filter_policy.h" + +namespace rocksdb { + +class Slice; + +class FullFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit FullFilterBitsBuilder(const size_t bits_per_key, + const size_t num_probes); + + ~FullFilterBitsBuilder(); + + virtual void AddKey(const Slice& key) override; + + // Create a filter that for hashes [0, n-1], the filter is allocated here + // When creating filter, it is ensured that + // total_bits = num_lines * CACHE_LINE_SIZE * 8 + // dst len is >= 5, 1 for num_probes, 4 for num_lines + // Then total_bits = (len - 5) * 8, and cache_line_size could be calculated + // +----------------------------------------------------------------+ + // | filter data with length total_bits/8 | + // +----------------------------------------------------------------+ + // | | + // | ... | + // | | + // +----------------------------------------------------------------+ + // | ... | num_probes : 1 byte | num_lines : 4 bytes | + // +----------------------------------------------------------------+ + virtual Slice Finish(std::unique_ptr* buf) override; + + // Calculate num of entries fit into a space. + virtual int CalculateNumEntry(const uint32_t space) override; + + // Calculate space for new filter. This is reverse of CalculateNumEntry. + uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + private: + size_t bits_per_key_; + size_t num_probes_; + std::vector hash_entries_; + + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); + + // No Copy allowed + FullFilterBitsBuilder(const FullFilterBitsBuilder&); + void operator=(const FullFilterBitsBuilder&); +}; + +} // namespace rocksdb diff --git a/table/index_builder.cc b/table/index_builder.cc index fdde77bc8..a67b97895 100644 --- a/table/index_builder.cc +++ b/table/index_builder.cc @@ -77,6 +77,11 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, sub_index_builder_->index_block_builder_)); + partition_cut_requested_ = false; +} + +void PartitionedIndexBuilder::RequestPartitionCut() { + partition_cut_requested_ = true; } void PartitionedIndexBuilder::AddIndexEntry( @@ -102,6 +107,7 @@ void PartitionedIndexBuilder::AddIndexEntry( std::string handle_encoding; block_handle.EncodeTo(&handle_encoding); bool do_flush = + partition_cut_requested_ || flush_policy_->Update(*last_key_in_current_block, handle_encoding); if (do_flush) { entries_.push_back( diff --git a/table/index_builder.h b/table/index_builder.h index 1871abfc5..43d484d76 100644 --- a/table/index_builder.h +++ b/table/index_builder.h @@ -314,6 +314,10 @@ class PartitionedIndexBuilder : public IndexBuilder { std::string& GetPartitionKey() { return sub_index_last_key_; } + // Called when an external entity (such as filter partition builder) request + // cutting the next partition + void RequestPartitionCut(); + private: void MakeNewSubIndexBuilder(); @@ -331,6 +335,9 @@ class PartitionedIndexBuilder : public IndexBuilder { // true if Finish is called once but not complete yet. bool finishing_indexes = false; const BlockBasedTableOptions& table_opt_; + // true if an external entity (such as filter partition builder) request + // cutting the next partition + bool partition_cut_requested_ = true; // true if it should cut the next filter partition block bool cut_filter_block = false; }; diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc index 585700284..c90743daf 100644 --- a/table/partitioned_filter_block.cc +++ b/table/partitioned_filter_block.cc @@ -20,15 +20,25 @@ namespace rocksdb { PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, - PartitionedIndexBuilder* const p_index_builder) + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size) : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, filter_bits_builder), index_on_filter_block_builder_(index_block_restart_interval), - p_index_builder_(p_index_builder) {} + p_index_builder_(p_index_builder) { + filters_per_partition_ = + filter_bits_builder_->CalculateNumEntry(partition_size); +} PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { + // Use == to send the request only once + if (filters_in_partition_ == filters_per_partition_) { + // Currently only index builder is in charge of cutting a partition. We keep + // requesting until it is granted. + p_index_builder_->RequestPartitionCut(); + } if (!p_index_builder_->ShouldCutFilterBlock()) { return; } @@ -36,11 +46,13 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); std::string& index_key = p_index_builder_->GetPartitionKey(); filters.push_back({index_key, filter}); + filters_in_partition_ = 0; } void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { MaybeCutAFilterBlock(); filter_bits_builder_->AddKey(key); + filters_in_partition_++; } Slice PartitionedFilterBlockBuilder::Finish( diff --git a/table/partitioned_filter_block.h b/table/partitioned_filter_block.h index 48f10c015..97bbd1ea8 100644 --- a/table/partitioned_filter_block.h +++ b/table/partitioned_filter_block.h @@ -28,7 +28,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { explicit PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, - PartitionedIndexBuilder* const p_index_builder); + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size); virtual ~PartitionedFilterBlockBuilder(); @@ -51,7 +52,15 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { false; // true if Finish is called once but not complete yet. // The policy of when cut a filter block and Finish it void MaybeCutAFilterBlock(); + // Currently we keep the same number of partitions for filters and indexes. + // This would allow for some potentioal optimizations in future. If such + // optimizations did not realize we can use different number of partitions and + // eliminate p_index_builder_ PartitionedIndexBuilder* const p_index_builder_; + // The desired number of filters per partition + uint32_t filters_per_partition_; + // The current number of filters in the last partition + uint32_t filters_in_partition_; }; class PartitionedFilterBlockReader : public FilterBlockReader { diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index f42f57f39..288a75a95 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -9,6 +9,7 @@ #include "rocksdb/filter_policy.h" +#include "table/full_filter_bits_builder.h" #include "table/index_builder.h" #include "table/partitioned_filter_block.h" #include "util/coding.h" @@ -64,6 +65,16 @@ class PartitionedFilterBlockTest : public testing::Test { return max_index_size; } + uint64_t MaxFilterSize() { + int num_keys = sizeof(keys) / sizeof(*keys); + auto filter_bits_reader = dynamic_cast( + table_options_.filter_policy->GetFilterBitsBuilder()); + uint32_t dont_care1, dont_care2; + auto partition_size = + filter_bits_reader->CalculateSpace(num_keys, &dont_care1, &dont_care2); + return partition_size + table_options_.block_size_deviation; + } + int last_offset = 10; BlockHandle Write(const Slice& slice) { BlockHandle bh(last_offset + 1, slice.size()); @@ -78,10 +89,17 @@ class PartitionedFilterBlockTest : public testing::Test { PartitionedFilterBlockBuilder* NewBuilder( PartitionedIndexBuilder* const p_index_builder) { + uint32_t partition_size = + table_options_.metadata_block_size > + (uint64_t)table_options_.block_size_deviation + ? table_options_.metadata_block_size - + table_options_.block_size_deviation + : 1; return new PartitionedFilterBlockBuilder( nullptr, table_options_.whole_key_filtering, table_options_.filter_policy->GetFilterBitsBuilder(), - table_options_.index_block_restart_interval, p_index_builder); + table_options_.index_block_restart_interval, p_index_builder, + partition_size); } std::unique_ptr table; @@ -261,7 +279,8 @@ TEST_F(PartitionedFilterBlockTest, OneBlockPerKey) { TEST_F(PartitionedFilterBlockTest, PartitionCount) { int num_keys = sizeof(keys) / sizeof(*keys); - table_options_.metadata_block_size = MaxIndexSize(); + table_options_.metadata_block_size = + std::max(MaxIndexSize(), MaxFilterSize()); int partitions = TestBlockPerKey(); ASSERT_EQ(partitions, 1); // A low number ensures cutting a block after each key diff --git a/util/bloom.cc b/util/bloom.cc index efdf1fda4..5101f7931 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -13,49 +13,32 @@ #include "rocksdb/slice.h" #include "table/block_based_filter_block.h" +#include "table/full_filter_bits_builder.h" #include "table/full_filter_block.h" -#include "util/hash.h" #include "util/coding.h" +#include "util/hash.h" namespace rocksdb { class BlockBasedFilterBlockBuilder; class FullFilterBlockBuilder; -namespace { -class FullFilterBitsBuilder : public FilterBitsBuilder { - public: - explicit FullFilterBitsBuilder(const size_t bits_per_key, - const size_t num_probes) - : bits_per_key_(bits_per_key), - num_probes_(num_probes) { - assert(bits_per_key_); +FullFilterBitsBuilder::FullFilterBitsBuilder(const size_t bits_per_key, + const size_t num_probes) + : bits_per_key_(bits_per_key), num_probes_(num_probes) { + assert(bits_per_key_); } - ~FullFilterBitsBuilder() {} + FullFilterBitsBuilder::~FullFilterBitsBuilder() {} - virtual void AddKey(const Slice& key) override { + void FullFilterBitsBuilder::AddKey(const Slice& key) { uint32_t hash = BloomHash(key); if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { hash_entries_.push_back(hash); } } - // Create a filter that for hashes [0, n-1], the filter is allocated here - // When creating filter, it is ensured that - // total_bits = num_lines * CACHE_LINE_SIZE * 8 - // dst len is >= 5, 1 for num_probes, 4 for num_lines - // Then total_bits = (len - 5) * 8, and cache_line_size could be calculated - // +----------------------------------------------------------------+ - // | filter data with length total_bits/8 | - // +----------------------------------------------------------------+ - // | | - // | ... | - // | | - // +----------------------------------------------------------------+ - // | ... | num_probes : 1 byte | num_lines : 4 bytes | - // +----------------------------------------------------------------+ - virtual Slice Finish(std::unique_ptr* buf) override { + Slice FullFilterBitsBuilder::Finish(std::unique_ptr* buf) { uint32_t total_bits, num_lines; char* data = ReserveSpace(static_cast(hash_entries_.size()), &total_bits, &num_lines); @@ -76,27 +59,6 @@ class FullFilterBitsBuilder : public FilterBitsBuilder { return Slice(data, total_bits / 8 + 5); } - private: - size_t bits_per_key_; - size_t num_probes_; - std::vector hash_entries_; - - // Get totalbits that optimized for cpu cache line - uint32_t GetTotalBitsForLocality(uint32_t total_bits); - - // Reserve space for new filter - char* ReserveSpace(const int num_entry, uint32_t* total_bits, - uint32_t* num_lines); - - // Assuming single threaded access to this function. - void AddHash(uint32_t h, char* data, uint32_t num_lines, - uint32_t total_bits); - - // No Copy allowed - FullFilterBitsBuilder(const FullFilterBitsBuilder&); - void operator=(const FullFilterBitsBuilder&); -}; - uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { uint32_t num_lines = (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); @@ -109,10 +71,10 @@ uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { return num_lines * (CACHE_LINE_SIZE * 8); } -char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, - uint32_t* total_bits, uint32_t* num_lines) { +uint32_t FullFilterBitsBuilder::CalculateSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { assert(bits_per_key_); - char* data = nullptr; if (num_entry != 0) { uint32_t total_bits_tmp = num_entry * static_cast(bits_per_key_); @@ -128,12 +90,35 @@ char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, // Reserve space for Filter uint32_t sz = *total_bits / 8; sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + return sz; +} - data = new char[sz]; +char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + char* data = new char[sz]; memset(data, 0, sz); return data; } +int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) { + assert(bits_per_key_); + assert(space > 0); + uint32_t dont_care1, dont_care2; + int high = (int) (space * 8 / bits_per_key_ + 1); + int low = 1; + int n = high; + for (; n >= low; n--) { + uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2); + if (sz <= space) { + break; + } + } + assert(n < high); // High should be an overestimation + return n; +} + inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits) { assert(num_lines > 0 && total_bits > 0); @@ -151,6 +136,7 @@ inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, } } +namespace { class FullFilterBitsReader : public FilterBitsReader { public: explicit FullFilterBitsReader(const Slice& contents) diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 7334439af..e868021e5 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -21,10 +21,11 @@ int main() { #include #include "rocksdb/filter_policy.h" +#include "table/full_filter_bits_builder.h" +#include "util/arena.h" #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" -#include "util/arena.h" using GFLAGS::ParseCommandLineFlags; @@ -197,6 +198,10 @@ class FullBloomTest : public testing::Test { delete policy_; } + FullFilterBitsBuilder* GetFullFilterBitsBuilder() { + return dynamic_cast(bits_builder_.get()); + } + void Reset() { bits_builder_.reset(policy_->GetFilterBitsBuilder()); bits_reader_.reset(nullptr); @@ -237,6 +242,19 @@ class FullBloomTest : public testing::Test { } }; +TEST_F(FullBloomTest, FilterSize) { + uint32_t dont_care1, dont_care2; + auto full_bits_builder = GetFullFilterBitsBuilder(); + for (int n = 1; n < 100; n++) { + auto space = full_bits_builder->CalculateSpace(n, &dont_care1, &dont_care2); + auto n2 = full_bits_builder->CalculateNumEntry(space); + ASSERT_GE(n2, n); + auto space2 = + full_bits_builder->CalculateSpace(n2, &dont_care1, &dont_care2); + ASSERT_EQ(space, space2); + } +} + TEST_F(FullBloomTest, FullEmptyFilter) { // Empty filter is not match, at this level ASSERT_TRUE(!Matches("hello"));