Cut filter partition based on metadata_block_size

Summary:
Currently metadata_block_size controls only the index partition size. With this patch, a partition is cut as soon as either the index partition or the filter partition reaches metadata_block_size.
Closes https://github.com/facebook/rocksdb/pull/2452

Differential Revision: D5275651

Pulled By: maysamyabandeh

fbshipit-source-id: 5057e4424b4c8902043782e6bf8c38f0c4f25160
This commit is contained in:
Maysam Yabandeh 2017-07-02 10:36:10 -07:00 committed by Facebook Github Bot
parent f4ae1bab02
commit 45b9bb0331
10 changed files with 209 additions and 59 deletions

View File

@ -20,8 +20,10 @@
#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ #ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ #define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#include <string>
#include <memory> #include <memory>
#include <stdexcept>
#include <string>
#include <vector>
namespace rocksdb { namespace rocksdb {
@ -41,6 +43,16 @@ class FilterBitsBuilder {
// The return value of this function would be the filter bits, // The return value of this function would be the filter bits,
// The ownership of actual data is set to buf // The ownership of actual data is set to buf
virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0; virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0;
// Calculate the number of entries that fit into a filter of `space` bytes.
//
// This default implementation does not produce an estimate: it throws
// std::runtime_error in regular builds and calls abort() when ROCKSDB_LITE is
// defined. Builders that can answer the question override it.
//
// The parameter name is commented out because this default never reads it;
// leaving it named produces unused-parameter warnings for every user of this
// header that enables them.
virtual int CalculateNumEntry(const uint32_t /*space*/) {
#ifndef ROCKSDB_LITE
  throw std::runtime_error("CalculateNumEntry not Implemented");
#else
  abort();
#endif
  // Not reached on either path above; kept so the function always ends in a
  // return statement.
  return 0;
}
}; };
// A class that checks if a key can be in filter // A class that checks if a key can be in filter

View File

@ -76,10 +76,18 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
} else { } else {
if (table_opt.partition_filters) { if (table_opt.partition_filters) {
assert(p_index_builder != nullptr); assert(p_index_builder != nullptr);
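// After the filter builder requests a partition cut, it takes a while until
// the index builder actually cuts the partition, so we use the lower bound
// as the partition size.
// NOTE(review): the expression below multiplies metadata_block_size by
// (100 - block_size_deviation) without dividing by 100, although
// block_size_deviation looks like a percentage (see the <= 100 assert).
// Confirm the intended units.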
assert(table_opt.block_size_deviation <= 100);
auto partition_size =
(const uint32_t)(table_opt.metadata_block_size *
(100 - table_opt.block_size_deviation));
partition_size = std::max(partition_size, (const uint32_t)1);
return new PartitionedFilterBlockBuilder( return new PartitionedFilterBlockBuilder(
opt.prefix_extractor, table_opt.whole_key_filtering, opt.prefix_extractor, table_opt.whole_key_filtering,
filter_bits_builder, table_opt.index_block_restart_interval, filter_bits_builder, table_opt.index_block_restart_interval,
p_index_builder); p_index_builder, partition_size);
} else { } else {
return new FullFilterBlockBuilder(opt.prefix_extractor, return new FullFilterBlockBuilder(opt.prefix_extractor,
table_opt.whole_key_filtering, table_opt.whole_key_filtering,

View File

@ -0,0 +1,73 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "rocksdb/filter_policy.h"
namespace rocksdb {
class Slice;
// FilterBitsBuilder that collects key hashes through AddKey() and lays them
// out as a single filter in Finish().
class FullFilterBitsBuilder : public FilterBitsBuilder {
 public:
  // bits_per_key must be non-zero (asserted by the constructor).
  explicit FullFilterBitsBuilder(const size_t bits_per_key,
                                 const size_t num_probes);

  ~FullFilterBitsBuilder();

  // Records the hash of `key` for the filter under construction.
  virtual void AddKey(const Slice& key) override;

  // Create a filter that for hashes [0, n-1], the filter is allocated here
  // When creating filter, it is ensured that
  // total_bits = num_lines * CACHE_LINE_SIZE * 8
  // dst len is >= 5, 1 for num_probes, 4 for num_lines
  // Then total_bits = (len - 5) * 8, and cache_line_size could be calculated
  // +----------------------------------------------------------------+
  // |              filter data with length total_bits/8              |
  // +----------------------------------------------------------------+
  // |                                                                |
  // | ...                                                            |
  // |                                                                |
  // +----------------------------------------------------------------+
  // | ...                | num_probes : 1 byte | num_lines : 4 bytes |
  // +----------------------------------------------------------------+
  virtual Slice Finish(std::unique_ptr<const char[]>* buf) override;

  // Calculate the number of entries that fit into `space` bytes.
  virtual int CalculateNumEntry(const uint32_t space) override;

  // Calculate the space, in bytes, needed for a new filter holding num_entry
  // entries. This is the reverse of CalculateNumEntry. The bit count and the
  // number of cache lines chosen for the filter are reported through
  // total_bits and num_lines.
  uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits,
                          uint32_t* num_lines);

 private:
  size_t bits_per_key_;
  size_t num_probes_;
  // Hashes of the keys added so far.
  std::vector<uint32_t> hash_entries_;

  // Get totalbits that optimized for cpu cache line
  uint32_t GetTotalBitsForLocality(uint32_t total_bits);

  // Reserve space for new filter
  char* ReserveSpace(const int num_entry, uint32_t* total_bits,
                     uint32_t* num_lines);

  // Assuming single threaded access to this function.
  void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits);

  // No copying allowed. Deleting the operations (instead of declaring them
  // without a definition) turns any accidental copy into a compile-time
  // error rather than a link-time one.
  FullFilterBitsBuilder(const FullFilterBitsBuilder&) = delete;
  void operator=(const FullFilterBitsBuilder&) = delete;
};
} // namespace rocksdb

View File

@ -77,6 +77,11 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
table_opt_.metadata_block_size, table_opt_.block_size_deviation, table_opt_.metadata_block_size, table_opt_.block_size_deviation,
sub_index_builder_->index_block_builder_)); sub_index_builder_->index_block_builder_));
partition_cut_requested_ = false;
}
void PartitionedIndexBuilder::RequestPartitionCut() {
partition_cut_requested_ = true;
} }
void PartitionedIndexBuilder::AddIndexEntry( void PartitionedIndexBuilder::AddIndexEntry(
@ -102,6 +107,7 @@ void PartitionedIndexBuilder::AddIndexEntry(
std::string handle_encoding; std::string handle_encoding;
block_handle.EncodeTo(&handle_encoding); block_handle.EncodeTo(&handle_encoding);
bool do_flush = bool do_flush =
partition_cut_requested_ ||
flush_policy_->Update(*last_key_in_current_block, handle_encoding); flush_policy_->Update(*last_key_in_current_block, handle_encoding);
if (do_flush) { if (do_flush) {
entries_.push_back( entries_.push_back(

View File

@ -314,6 +314,10 @@ class PartitionedIndexBuilder : public IndexBuilder {
std::string& GetPartitionKey() { return sub_index_last_key_; } std::string& GetPartitionKey() { return sub_index_last_key_; }
// Called when an external entity (such as the filter partition builder)
// requests cutting the next partition
void RequestPartitionCut();
private: private:
void MakeNewSubIndexBuilder(); void MakeNewSubIndexBuilder();
@ -331,6 +335,9 @@ class PartitionedIndexBuilder : public IndexBuilder {
// true if Finish is called once but not complete yet. // true if Finish is called once but not complete yet.
bool finishing_indexes = false; bool finishing_indexes = false;
const BlockBasedTableOptions& table_opt_; const BlockBasedTableOptions& table_opt_;
// true if an external entity (such as the filter partition builder) has
// requested cutting the next partition
bool partition_cut_requested_ = true;
// true if it should cut the next filter partition block // true if it should cut the next filter partition block
bool cut_filter_block = false; bool cut_filter_block = false;
}; };

View File

@ -20,15 +20,25 @@ namespace rocksdb {
PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
const SliceTransform* prefix_extractor, bool whole_key_filtering, const SliceTransform* prefix_extractor, bool whole_key_filtering,
FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
PartitionedIndexBuilder* const p_index_builder) PartitionedIndexBuilder* const p_index_builder,
const uint32_t partition_size)
: FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering,
filter_bits_builder), filter_bits_builder),
index_on_filter_block_builder_(index_block_restart_interval), index_on_filter_block_builder_(index_block_restart_interval),
p_index_builder_(p_index_builder) {} p_index_builder_(p_index_builder) {
filters_per_partition_ =
filter_bits_builder_->CalculateNumEntry(partition_size);
}
PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {}
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() {
// Use == to send the request only once
if (filters_in_partition_ == filters_per_partition_) {
// Currently only the index builder is in charge of cutting a partition. The
// request stays pending there until it is granted.
p_index_builder_->RequestPartitionCut();
}
if (!p_index_builder_->ShouldCutFilterBlock()) { if (!p_index_builder_->ShouldCutFilterBlock()) {
return; return;
} }
@ -36,11 +46,13 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() {
Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); Slice filter = filter_bits_builder_->Finish(&filter_gc.back());
std::string& index_key = p_index_builder_->GetPartitionKey(); std::string& index_key = p_index_builder_->GetPartitionKey();
filters.push_back({index_key, filter}); filters.push_back({index_key, filter});
filters_in_partition_ = 0;
} }
void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
MaybeCutAFilterBlock(); MaybeCutAFilterBlock();
filter_bits_builder_->AddKey(key); filter_bits_builder_->AddKey(key);
filters_in_partition_++;
} }
Slice PartitionedFilterBlockBuilder::Finish( Slice PartitionedFilterBlockBuilder::Finish(

View File

@ -28,7 +28,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
explicit PartitionedFilterBlockBuilder( explicit PartitionedFilterBlockBuilder(
const SliceTransform* prefix_extractor, bool whole_key_filtering, const SliceTransform* prefix_extractor, bool whole_key_filtering,
FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
PartitionedIndexBuilder* const p_index_builder); PartitionedIndexBuilder* const p_index_builder,
const uint32_t partition_size);
virtual ~PartitionedFilterBlockBuilder(); virtual ~PartitionedFilterBlockBuilder();
@ -51,7 +52,15 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
false; // true if Finish is called once but not complete yet. false; // true if Finish is called once but not complete yet.
// The policy of when cut a filter block and Finish it // The policy of when cut a filter block and Finish it
void MaybeCutAFilterBlock(); void MaybeCutAFilterBlock();
// Currently we keep the same number of partitions for filters and indexes.
// This would allow for some potential optimizations in the future. If such
// optimizations are not realized, we can use a different number of
// partitions and eliminate p_index_builder_
PartitionedIndexBuilder* const p_index_builder_; PartitionedIndexBuilder* const p_index_builder_;
// The desired number of filters per partition
uint32_t filters_per_partition_;
// The current number of filters in the last partition. Starts at zero so the
// counter has a defined value when it is first compared and incremented,
// which happens before any partition has been cut.
uint32_t filters_in_partition_ = 0;
}; };
class PartitionedFilterBlockReader : public FilterBlockReader { class PartitionedFilterBlockReader : public FilterBlockReader {

View File

@ -9,6 +9,7 @@
#include "rocksdb/filter_policy.h" #include "rocksdb/filter_policy.h"
#include "table/full_filter_bits_builder.h"
#include "table/index_builder.h" #include "table/index_builder.h"
#include "table/partitioned_filter_block.h" #include "table/partitioned_filter_block.h"
#include "util/coding.h" #include "util/coding.h"
@ -64,6 +65,16 @@ class PartitionedFilterBlockTest : public testing::Test {
return max_index_size; return max_index_size;
} }
uint64_t MaxFilterSize() {
int num_keys = sizeof(keys) / sizeof(*keys);
auto filter_bits_reader = dynamic_cast<rocksdb::FullFilterBitsBuilder*>(
table_options_.filter_policy->GetFilterBitsBuilder());
uint32_t dont_care1, dont_care2;
auto partition_size =
filter_bits_reader->CalculateSpace(num_keys, &dont_care1, &dont_care2);
return partition_size + table_options_.block_size_deviation;
}
int last_offset = 10; int last_offset = 10;
BlockHandle Write(const Slice& slice) { BlockHandle Write(const Slice& slice) {
BlockHandle bh(last_offset + 1, slice.size()); BlockHandle bh(last_offset + 1, slice.size());
@ -78,10 +89,17 @@ class PartitionedFilterBlockTest : public testing::Test {
PartitionedFilterBlockBuilder* NewBuilder( PartitionedFilterBlockBuilder* NewBuilder(
PartitionedIndexBuilder* const p_index_builder) { PartitionedIndexBuilder* const p_index_builder) {
uint32_t partition_size =
table_options_.metadata_block_size >
(uint64_t)table_options_.block_size_deviation
? table_options_.metadata_block_size -
table_options_.block_size_deviation
: 1;
return new PartitionedFilterBlockBuilder( return new PartitionedFilterBlockBuilder(
nullptr, table_options_.whole_key_filtering, nullptr, table_options_.whole_key_filtering,
table_options_.filter_policy->GetFilterBitsBuilder(), table_options_.filter_policy->GetFilterBitsBuilder(),
table_options_.index_block_restart_interval, p_index_builder); table_options_.index_block_restart_interval, p_index_builder,
partition_size);
} }
std::unique_ptr<MockedBlockBasedTable> table; std::unique_ptr<MockedBlockBasedTable> table;
@ -261,7 +279,8 @@ TEST_F(PartitionedFilterBlockTest, OneBlockPerKey) {
TEST_F(PartitionedFilterBlockTest, PartitionCount) { TEST_F(PartitionedFilterBlockTest, PartitionCount) {
int num_keys = sizeof(keys) / sizeof(*keys); int num_keys = sizeof(keys) / sizeof(*keys);
table_options_.metadata_block_size = MaxIndexSize(); table_options_.metadata_block_size =
std::max(MaxIndexSize(), MaxFilterSize());
int partitions = TestBlockPerKey(); int partitions = TestBlockPerKey();
ASSERT_EQ(partitions, 1); ASSERT_EQ(partitions, 1);
// A low number ensures cutting a block after each key // A low number ensures cutting a block after each key

View File

@ -13,49 +13,32 @@
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "table/block_based_filter_block.h" #include "table/block_based_filter_block.h"
#include "table/full_filter_bits_builder.h"
#include "table/full_filter_block.h" #include "table/full_filter_block.h"
#include "util/hash.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/hash.h"
namespace rocksdb { namespace rocksdb {
class BlockBasedFilterBlockBuilder; class BlockBasedFilterBlockBuilder;
class FullFilterBlockBuilder; class FullFilterBlockBuilder;
namespace { FullFilterBitsBuilder::FullFilterBitsBuilder(const size_t bits_per_key,
class FullFilterBitsBuilder : public FilterBitsBuilder { const size_t num_probes)
public: : bits_per_key_(bits_per_key), num_probes_(num_probes) {
explicit FullFilterBitsBuilder(const size_t bits_per_key, assert(bits_per_key_);
const size_t num_probes)
: bits_per_key_(bits_per_key),
num_probes_(num_probes) {
assert(bits_per_key_);
} }
~FullFilterBitsBuilder() {} FullFilterBitsBuilder::~FullFilterBitsBuilder() {}
virtual void AddKey(const Slice& key) override { void FullFilterBitsBuilder::AddKey(const Slice& key) {
uint32_t hash = BloomHash(key); uint32_t hash = BloomHash(key);
if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
hash_entries_.push_back(hash); hash_entries_.push_back(hash);
} }
} }
// Create a filter that for hashes [0, n-1], the filter is allocated here Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
// When creating filter, it is ensured that
// total_bits = num_lines * CACHE_LINE_SIZE * 8
// dst len is >= 5, 1 for num_probes, 4 for num_lines
// Then total_bits = (len - 5) * 8, and cache_line_size could be calculated
// +----------------------------------------------------------------+
// | filter data with length total_bits/8 |
// +----------------------------------------------------------------+
// | |
// | ... |
// | |
// +----------------------------------------------------------------+
// | ... | num_probes : 1 byte | num_lines : 4 bytes |
// +----------------------------------------------------------------+
virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
uint32_t total_bits, num_lines; uint32_t total_bits, num_lines;
char* data = ReserveSpace(static_cast<int>(hash_entries_.size()), char* data = ReserveSpace(static_cast<int>(hash_entries_.size()),
&total_bits, &num_lines); &total_bits, &num_lines);
@ -76,27 +59,6 @@ class FullFilterBitsBuilder : public FilterBitsBuilder {
return Slice(data, total_bits / 8 + 5); return Slice(data, total_bits / 8 + 5);
} }
private:
size_t bits_per_key_;
size_t num_probes_;
std::vector<uint32_t> hash_entries_;
// Get totalbits that optimized for cpu cache line
uint32_t GetTotalBitsForLocality(uint32_t total_bits);
// Reserve space for new filter
char* ReserveSpace(const int num_entry, uint32_t* total_bits,
uint32_t* num_lines);
// Assuming single threaded access to this function.
void AddHash(uint32_t h, char* data, uint32_t num_lines,
uint32_t total_bits);
// No Copy allowed
FullFilterBitsBuilder(const FullFilterBitsBuilder&);
void operator=(const FullFilterBitsBuilder&);
};
uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
uint32_t num_lines = uint32_t num_lines =
(total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
@ -109,10 +71,10 @@ uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
return num_lines * (CACHE_LINE_SIZE * 8); return num_lines * (CACHE_LINE_SIZE * 8);
} }
char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, uint32_t FullFilterBitsBuilder::CalculateSpace(const int num_entry,
uint32_t* total_bits, uint32_t* num_lines) { uint32_t* total_bits,
uint32_t* num_lines) {
assert(bits_per_key_); assert(bits_per_key_);
char* data = nullptr;
if (num_entry != 0) { if (num_entry != 0) {
uint32_t total_bits_tmp = num_entry * static_cast<uint32_t>(bits_per_key_); uint32_t total_bits_tmp = num_entry * static_cast<uint32_t>(bits_per_key_);
@ -128,12 +90,35 @@ char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
// Reserve space for Filter // Reserve space for Filter
uint32_t sz = *total_bits / 8; uint32_t sz = *total_bits / 8;
sz += 5; // 4 bytes for num_lines, 1 byte for num_probes sz += 5; // 4 bytes for num_lines, 1 byte for num_probes
return sz;
}
data = new char[sz]; char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
uint32_t* total_bits,
uint32_t* num_lines) {
uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines);
char* data = new char[sz];
memset(data, 0, sz); memset(data, 0, sz);
return data; return data;
} }
// Returns the largest number of entries whose filter, as sized by
// CalculateSpace(), fits into `space` bytes; returns 0 if not even a single
// entry fits.
//
// The search starts from an overestimate (every available bit spent on keys,
// plus one entry) and walks downwards until the computed size fits.
int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
  assert(bits_per_key_);
  assert(space > 0);
  uint32_t dont_care1, dont_care2;
  // Widen before multiplying: in 32 bits, space * 8 wraps once space reaches
  // 512 MiB, which would turn the intended overestimate into an
  // underestimate.
  const uint64_t bits_available = static_cast<uint64_t>(space) * 8;
  int high = static_cast<int>(bits_available / bits_per_key_ + 1);
  int low = 1;
  int n = high;
  for (; n >= low; n--) {
    uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2);
    if (sz <= space) {
      break;
    }
  }
  assert(n < high);  // High should be an overestimation
  return n;
}
inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
uint32_t num_lines, uint32_t total_bits) { uint32_t num_lines, uint32_t total_bits) {
assert(num_lines > 0 && total_bits > 0); assert(num_lines > 0 && total_bits > 0);
@ -151,6 +136,7 @@ inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
} }
} }
namespace {
class FullFilterBitsReader : public FilterBitsReader { class FullFilterBitsReader : public FilterBitsReader {
public: public:
explicit FullFilterBitsReader(const Slice& contents) explicit FullFilterBitsReader(const Slice& contents)

View File

@ -21,10 +21,11 @@ int main() {
#include <vector> #include <vector>
#include "rocksdb/filter_policy.h" #include "rocksdb/filter_policy.h"
#include "table/full_filter_bits_builder.h"
#include "util/arena.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/testharness.h" #include "util/testharness.h"
#include "util/testutil.h" #include "util/testutil.h"
#include "util/arena.h"
using GFLAGS::ParseCommandLineFlags; using GFLAGS::ParseCommandLineFlags;
@ -197,6 +198,10 @@ class FullBloomTest : public testing::Test {
delete policy_; delete policy_;
} }
// Exposes the current bits builder through its FullFilterBitsBuilder
// interface; the result is nullptr if the builder is of another type.
FullFilterBitsBuilder* GetFullFilterBitsBuilder() {
  auto* const current_builder = bits_builder_.get();
  return dynamic_cast<FullFilterBitsBuilder*>(current_builder);
}
void Reset() { void Reset() {
bits_builder_.reset(policy_->GetFilterBitsBuilder()); bits_builder_.reset(policy_->GetFilterBitsBuilder());
bits_reader_.reset(nullptr); bits_reader_.reset(nullptr);
@ -237,6 +242,19 @@ class FullBloomTest : public testing::Test {
} }
}; };
// Round-trips CalculateSpace() and CalculateNumEntry() for 1..99 entries:
// the entry count recovered from a filter's size must be at least the
// original count and must map back to exactly the same size.
TEST_F(FullBloomTest, FilterSize) {
  uint32_t unused_total_bits, unused_num_lines;
  auto builder = GetFullFilterBitsBuilder();
  int n = 1;
  while (n < 100) {
    const auto bytes_for_n =
        builder->CalculateSpace(n, &unused_total_bits, &unused_num_lines);
    const auto recovered_n = builder->CalculateNumEntry(bytes_for_n);
    ASSERT_GE(recovered_n, n);
    const auto bytes_for_recovered = builder->CalculateSpace(
        recovered_n, &unused_total_bits, &unused_num_lines);
    ASSERT_EQ(bytes_for_n, bytes_for_recovered);
    n++;
  }
}
TEST_F(FullBloomTest, FullEmptyFilter) { TEST_F(FullBloomTest, FullEmptyFilter) {
// Empty filter is not match, at this level // Empty filter is not match, at this level
ASSERT_TRUE(!Matches("hello")); ASSERT_TRUE(!Matches("hello"));