9e4913ce9d
Summary: There was significant untested logic in FullFilterBitsReader in the handling of serialized Bloom filter bits that cannot be generated by FullFilterBitsBuilder in the current compilation. These now test many of those corner-case behaviors, including bad metadata or filters created with different cache line size than the current compiled-in value. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5834 Test Plan: thisisthetest Differential Revision: D17726372 Pulled By: pdillinger fbshipit-source-id: fb7b8003b5a8e6fb4666fe95206128f3d5835fc7
344 lines
11 KiB
C++
344 lines
11 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "rocksdb/filter_policy.h"
|
|
|
|
#include "rocksdb/slice.h"
|
|
#include "table/block_based/block_based_filter_block.h"
|
|
#include "table/block_based/full_filter_block.h"
|
|
#include "table/full_filter_bits_builder.h"
|
|
#include "third-party/folly/folly/ConstexprMath.h"
|
|
#include "util/bloom_impl.h"
|
|
#include "util/coding.h"
|
|
#include "util/hash.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
typedef LegacyLocalityBloomImpl</*ExtraRotates*/ false> LegacyFullFilterImpl;
|
|
class BlockBasedFilterBlockBuilder;
|
|
class FullFilterBlockBuilder;
|
|
|
|
// Construct a builder for the full (non-block-based) Bloom filter format.
// bits_per_key is the target number of filter bits per added key and must be
// non-zero; num_probes is the number of probe bits set/checked per key.
FullFilterBitsBuilder::FullFilterBitsBuilder(const int bits_per_key,
                                             const int num_probes)
    : bits_per_key_(bits_per_key), num_probes_(num_probes) {
  assert(bits_per_key_);
}
|
|
|
|
// Nothing to release beyond the members' own destructors; prefer the
// explicitly-defaulted form over an empty body.
FullFilterBitsBuilder::~FullFilterBitsBuilder() = default;
|
|
|
|
void FullFilterBitsBuilder::AddKey(const Slice& key) {
|
|
uint32_t hash = BloomHash(key);
|
|
if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
|
|
hash_entries_.push_back(hash);
|
|
}
|
|
}
|
|
|
|
// Serialize all buffered hashes into a filter block with layout:
//   [ filter bits: total_bits/8 bytes ][ num_probes: 1 byte ][ num_lines: 4 bytes ]
// Ownership of the allocation is handed to *buf; the returned Slice points
// into that buffer. The internal hash buffer is cleared for reuse.
Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
  uint32_t total_bits;
  uint32_t num_lines;
  char* const data = ReserveSpace(static_cast<int>(hash_entries_.size()),
                                  &total_bits, &num_lines);
  assert(data);

  // An empty filter has total_bits == num_lines == 0 and carries only the
  // 5-byte metadata trailer.
  if (total_bits > 0 && num_lines > 0) {
    for (const uint32_t h : hash_entries_) {
      AddHash(h, data, num_lines, total_bits);
    }
  }

  // Append the metadata trailer directly after the bit array.
  char* const meta = data + total_bits / 8;
  meta[0] = static_cast<char>(num_probes_);
  EncodeFixed32(meta + 1, static_cast<uint32_t>(num_lines));

  const char* const_data = data;
  buf->reset(const_data);
  hash_entries_.clear();

  return Slice(data, total_bits / 8 + 5);
}
|
|
|
|
uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
|
|
uint32_t num_lines =
|
|
(total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
|
|
|
|
// Make num_lines an odd number to make sure more bits are involved
|
|
// when determining which block.
|
|
if (num_lines % 2 == 0) {
|
|
num_lines++;
|
|
}
|
|
return num_lines * (CACHE_LINE_SIZE * 8);
|
|
}
|
|
|
|
uint32_t FullFilterBitsBuilder::CalculateSpace(const int num_entry,
|
|
uint32_t* total_bits,
|
|
uint32_t* num_lines) {
|
|
assert(bits_per_key_);
|
|
if (num_entry != 0) {
|
|
uint32_t total_bits_tmp = static_cast<uint32_t>(num_entry * bits_per_key_);
|
|
|
|
*total_bits = GetTotalBitsForLocality(total_bits_tmp);
|
|
*num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
|
|
assert(*total_bits > 0 && *total_bits % 8 == 0);
|
|
} else {
|
|
// filter is empty, just leave space for metadata
|
|
*total_bits = 0;
|
|
*num_lines = 0;
|
|
}
|
|
|
|
// Reserve space for Filter
|
|
uint32_t sz = *total_bits / 8;
|
|
sz += 5; // 4 bytes for num_lines, 1 byte for num_probes
|
|
return sz;
|
|
}
|
|
|
|
char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
|
|
uint32_t* total_bits,
|
|
uint32_t* num_lines) {
|
|
uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines);
|
|
char* data = new char[sz];
|
|
memset(data, 0, sz);
|
|
return data;
|
|
}
|
|
|
|
int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
|
|
assert(bits_per_key_);
|
|
assert(space > 0);
|
|
uint32_t dont_care1, dont_care2;
|
|
int high = static_cast<int>(space * 8 / bits_per_key_ + 1);
|
|
int low = 1;
|
|
int n = high;
|
|
for (; n >= low; n--) {
|
|
uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2);
|
|
if (sz <= space) {
|
|
break;
|
|
}
|
|
}
|
|
assert(n < high); // High should be an overestimation
|
|
return n;
|
|
}
|
|
|
|
// Set num_probes_ bits for hash h in data, all within the single cache line
// selected from h (legacy cache-local layout via LegacyFullFilterImpl).
// total_bits is used only by the assert below.
inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
                                           uint32_t num_lines,
                                           uint32_t total_bits) {
#ifdef NDEBUG
  // Silence unused-parameter warning when asserts are compiled out.
  static_cast<void>(total_bits);
#endif
  assert(num_lines > 0 && total_bits > 0);

  LegacyFullFilterImpl::AddHash(h, num_lines, num_probes_, data,
                                folly::constexpr_log2(CACHE_LINE_SIZE));
}
|
|
|
|
namespace {
|
|
// Reader for filter blocks produced by FullFilterBitsBuilder::Finish.
// Tolerates corrupt or foreign metadata: a broken filter degrades to
// "always may match" rather than crashing (see constructor sanitization).
class FullFilterBitsReader : public FilterBitsReader {
 public:
  // Does NOT take ownership of "contents"; the caller must keep the
  // underlying buffer alive for the lifetime of this reader.
  explicit FullFilterBitsReader(const Slice& contents)
      : data_(contents.data()),
        data_len_(static_cast<uint32_t>(contents.size())),
        num_probes_(0),
        num_lines_(0),
        log2_cache_line_size_(0) {
    assert(data_);
    GetFilterMeta(contents, &num_probes_, &num_lines_);
    // Sanitize broken parameter
    if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) {
      // Data length is not a whole multiple of the stored line count:
      // metadata is inconsistent, so disable the filter (treat as match-all).
      num_lines_ = 0;
      num_probes_ = 0;
    } else if (num_lines_ != 0) {
      // Recover the cache line size the filter was BUILT with (it may differ
      // from the compiled-in CACHE_LINE_SIZE): find the shift k such that
      // (data_len_ - 5) >> k equals the stored line count.
      while (true) {
        uint32_t num_lines_at_curr_cache_size =
            (data_len_ - 5) >> log2_cache_line_size_;
        if (num_lines_at_curr_cache_size == 0) {
          // The cache line size seems not a power of two. It's not supported
          // and indicates a corruption so disable using this filter.
          // Removed for unit testing corruption: assert(false);
          num_lines_ = 0;
          num_probes_ = 0;
          break;
        }
        if (num_lines_at_curr_cache_size == num_lines_) {
          break;
        }
        ++log2_cache_line_size_;
      }
    }
  }
  // No Copy allowed
  FullFilterBitsReader(const FullFilterBitsReader&) = delete;
  void operator=(const FullFilterBitsReader&) = delete;

  ~FullFilterBitsReader() override {}

  // "contents" contains the data built by a preceding call to
  // FilterBitsBuilder::Finish. MayMatch must return true if the key was
  // passed to FilterBitsBuilder::AddKey. This method may return true or false
  // if the key was not on the list, but it should aim to return false with a
  // high probability.
  bool MayMatch(const Slice& key) override {
    if (data_len_ <= 5) { // remain same with original filter
      return false;
    }
    // Other Error params, including a broken filter, regarded as match
    if (num_probes_ == 0 || num_lines_ == 0) return true;
    uint32_t hash = BloomHash(key);
    uint32_t byte_offset;
    // First resolve the cache line for this hash, then probe inside it.
    LegacyFullFilterImpl::PrepareHashMayMatch(
        hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_);
    return LegacyFullFilterImpl::HashMayMatchPrepared(
        hash, num_probes_, data_ + byte_offset, log2_cache_line_size_);
  }

  // Batch form of MayMatch: may_match[i] is set for each keys[i].
  // Split into a prepare pass and a probe pass so the line lookups can
  // overlap (prefetch-style) before the bit probes run.
  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
    if (data_len_ <= 5) { // remain same with original filter
      for (int i = 0; i < num_keys; ++i) {
        may_match[i] = false;
      }
      return;
    }
    for (int i = 0; i < num_keys; ++i) {
      may_match[i] = true;
    }
    // Other Error params, including a broken filter, regarded as match
    if (num_probes_ == 0 || num_lines_ == 0) return;
    uint32_t hashes[MultiGetContext::MAX_BATCH_SIZE];
    uint32_t byte_offsets[MultiGetContext::MAX_BATCH_SIZE];
    for (int i = 0; i < num_keys; ++i) {
      hashes[i] = BloomHash(*keys[i]);
      LegacyFullFilterImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_,
                                                /*out*/ &byte_offsets[i],
                                                log2_cache_line_size_);
    }

    for (int i = 0; i < num_keys; ++i) {
      if (!LegacyFullFilterImpl::HashMayMatchPrepared(hashes[i], num_probes_,
                                                      data_ + byte_offsets[i],
                                                      log2_cache_line_size_)) {
        may_match[i] = false;
      }
    }
  }

 private:
  // Filter meta data
  const char* data_;            // filter bytes (not owned)
  uint32_t data_len_;           // total length including 5-byte trailer
  int num_probes_;              // probes per key; 0 marks a disabled filter
  uint32_t num_lines_;          // cache lines of filter data; 0 if disabled
  uint32_t log2_cache_line_size_;  // line size used at BUILD time (log2 bytes)

  // Get num_probes, and num_lines from filter
  // If filter format broken, set both to 0.
  void GetFilterMeta(const Slice& filter, int* num_probes, uint32_t* num_lines);
};
|
|
|
|
void FullFilterBitsReader::GetFilterMeta(const Slice& filter, int* num_probes,
|
|
uint32_t* num_lines) {
|
|
uint32_t len = static_cast<uint32_t>(filter.size());
|
|
if (len <= 5) {
|
|
// filter is empty or broken
|
|
*num_probes = 0;
|
|
*num_lines = 0;
|
|
return;
|
|
}
|
|
|
|
*num_probes = filter.data()[len - 5];
|
|
*num_lines = DecodeFixed32(filter.data() + len - 4);
|
|
}
|
|
|
|
// An implementation of filter policy
|
|
// An implementation of filter policy
// Supports both the original block-based filter format (CreateFilter /
// KeyMayMatch) and the full-filter format (GetFilterBitsBuilder /
// GetFilterBitsReader), selected by use_block_based_builder.
class BloomFilterPolicy : public FilterPolicy {
 public:
  // bits_per_key: target filter bits per key. use_block_based_builder: when
  // true, GetFilterBitsBuilder() returns nullptr so callers fall back to the
  // block-based format.
  explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder)
      : bits_per_key_(bits_per_key), hash_func_(BloomHash),
        use_block_based_builder_(use_block_based_builder) {
    initialize();
  }

  ~BloomFilterPolicy() override {}

  const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; }

  // Build a block-based filter over keys[0..n-1] and append it to *dst.
  // The last appended byte records num_probes_ so readers built with
  // different parameters can still interpret the filter.
  void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
    // Compute bloom filter size (in both bits and bytes)
    uint32_t bits = static_cast<uint32_t>(n * bits_per_key_);

    // For small n, we can see a very high false positive rate. Fix it
    // by enforcing a minimum bloom filter length.
    if (bits < 64) bits = 64;

    // Round bits up to a whole number of bytes.
    uint32_t bytes = (bits + 7) / 8;
    bits = bytes * 8;

    const size_t init_size = dst->size();
    dst->resize(init_size + bytes, 0);
    dst->push_back(static_cast<char>(num_probes_));  // Remember # of probes
    char* array = &(*dst)[init_size];
    for (int i = 0; i < n; i++) {
      LegacyNoLocalityBloomImpl::AddHash(hash_func_(keys[i]), bits, num_probes_,
                                         array);
    }
  }

  // Probe a block-based filter previously produced by CreateFilter.
  bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override {
    const size_t len = bloom_filter.size();
    // Too short to hold even one data byte plus the trailing k byte, or too
    // long to index with 32 bits: treat as no match.
    if (len < 2 || len > 0xffffffffU) {
      return false;
    }

    const char* array = bloom_filter.data();
    const uint32_t bits = static_cast<uint32_t>(len - 1) * 8;

    // Use the encoded k so that we can read filters generated by
    // bloom filters created using different parameters.
    const int k = static_cast<uint8_t>(array[len - 1]);
    if (k > 30) {
      // Reserved for potentially new encodings for short bloom filters.
      // Consider it a match.
      return true;
    }
    // NB: using k not num_probes_
    return LegacyNoLocalityBloomImpl::HashMayMatch(hash_func_(key), bits, k,
                                                   array);
  }

  // Returns a full-filter builder, or nullptr when configured for the
  // block-based format.
  FilterBitsBuilder* GetFilterBitsBuilder() const override {
    if (use_block_based_builder_) {
      return nullptr;
    }

    return new FullFilterBitsBuilder(bits_per_key_, num_probes_);
  }

  FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
    return new FullFilterBitsReader(contents);
  }

  // If choose to use block based builder
  bool UseBlockBasedBuilder() { return use_block_based_builder_; }

 private:
  int bits_per_key_;
  int num_probes_;  // derived from bits_per_key_ in initialize()
  uint32_t (*hash_func_)(const Slice& key);

  const bool use_block_based_builder_;

  void initialize() {
    // We intentionally round down to reduce probing cost a little bit
    num_probes_ = static_cast<int>(bits_per_key_ * 0.69);  // 0.69 =~ ln(2)
    if (num_probes_ < 1) num_probes_ = 1;
    // k > 30 is reserved for future encodings (see KeyMayMatch).
    if (num_probes_ > 30) num_probes_ = 30;
  }
};
|
|
|
|
} // namespace
|
|
|
|
// Public factory for the built-in Bloom filter policy. The caller owns the
// returned object. use_block_based_builder selects the original block-based
// filter format over the full-filter format.
const FilterPolicy* NewBloomFilterPolicy(int bits_per_key,
                                         bool use_block_based_builder) {
  return new BloomFilterPolicy(bits_per_key, use_block_based_builder);
}
|
|
|
|
} // namespace rocksdb
|