rocksdb/util/bloom.cc
Peter Dillinger 68626249c3 Refactor/consolidate legacy Bloom implementation details (#5784)
Summary:
Refactoring to consolidate implementation details of legacy
Bloom filters. This helps to organize and document some related,
obscure code.

Also added make/cpp var TEST_CACHE_LINE_SIZE so that it's easy to
compile and run unit tests for non-native cache line size. (Fixed a
related test failure in db_properties_test.)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5784

Test Plan:
make check, including Recently added Bloom schema unit tests
(in ./plain_table_db_test && ./bloom_test), and including with
TEST_CACHE_LINE_SIZE=128U and TEST_CACHE_LINE_SIZE=256U. Tested the
schema tests with temporary fault injection into new implementations.

Some performance testing with modified unit tests suggest a small to moderate
improvement in speed.

Differential Revision: D17381384

Pulled By: pdillinger

fbshipit-source-id: ee42586da996798910fc45ac0b6289147f16d8df
2019-09-16 16:17:09 -07:00

344 lines
11 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/full_filter_bits_builder.h"
#include "third-party/folly/folly/ConstexprMath.h"
#include "util/bloom_impl.h"
#include "util/coding.h"
#include "util/hash.h"
namespace rocksdb {
typedef LegacyLocalityBloomImpl</*ExtraRotates*/ false> LegacyFullFilterImpl;
class BlockBasedFilterBlockBuilder;
class FullFilterBlockBuilder;
FullFilterBitsBuilder::FullFilterBitsBuilder(const int bits_per_key,
const int num_probes)
: bits_per_key_(bits_per_key), num_probes_(num_probes) {
assert(bits_per_key_);
}
FullFilterBitsBuilder::~FullFilterBitsBuilder() {}
void FullFilterBitsBuilder::AddKey(const Slice& key) {
uint32_t hash = BloomHash(key);
if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
hash_entries_.push_back(hash);
}
}
Slice FullFilterBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
uint32_t total_bits, num_lines;
char* data = ReserveSpace(static_cast<int>(hash_entries_.size()),
&total_bits, &num_lines);
assert(data);
if (total_bits != 0 && num_lines != 0) {
for (auto h : hash_entries_) {
AddHash(h, data, num_lines, total_bits);
}
}
data[total_bits/8] = static_cast<char>(num_probes_);
EncodeFixed32(data + total_bits/8 + 1, static_cast<uint32_t>(num_lines));
const char* const_data = data;
buf->reset(const_data);
hash_entries_.clear();
return Slice(data, total_bits / 8 + 5);
}
uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
uint32_t num_lines =
(total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
// Make num_lines an odd number to make sure more bits are involved
// when determining which block.
if (num_lines % 2 == 0) {
num_lines++;
}
return num_lines * (CACHE_LINE_SIZE * 8);
}
uint32_t FullFilterBitsBuilder::CalculateSpace(const int num_entry,
uint32_t* total_bits,
uint32_t* num_lines) {
assert(bits_per_key_);
if (num_entry != 0) {
uint32_t total_bits_tmp = static_cast<uint32_t>(num_entry * bits_per_key_);
*total_bits = GetTotalBitsForLocality(total_bits_tmp);
*num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
assert(*total_bits > 0 && *total_bits % 8 == 0);
} else {
// filter is empty, just leave space for metadata
*total_bits = 0;
*num_lines = 0;
}
// Reserve space for Filter
uint32_t sz = *total_bits / 8;
sz += 5; // 4 bytes for num_lines, 1 byte for num_probes
return sz;
}
char* FullFilterBitsBuilder::ReserveSpace(const int num_entry,
uint32_t* total_bits,
uint32_t* num_lines) {
uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines);
char* data = new char[sz];
memset(data, 0, sz);
return data;
}
int FullFilterBitsBuilder::CalculateNumEntry(const uint32_t space) {
assert(bits_per_key_);
assert(space > 0);
uint32_t dont_care1, dont_care2;
int high = static_cast<int>(space * 8 / bits_per_key_ + 1);
int low = 1;
int n = high;
for (; n >= low; n--) {
uint32_t sz = CalculateSpace(n, &dont_care1, &dont_care2);
if (sz <= space) {
break;
}
}
assert(n < high); // High should be an overestimation
return n;
}
inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
uint32_t num_lines, uint32_t total_bits) {
#ifdef NDEBUG
static_cast<void>(total_bits);
#endif
assert(num_lines > 0 && total_bits > 0);
LegacyFullFilterImpl::AddHash(h, num_lines, num_probes_, data,
folly::constexpr_log2(CACHE_LINE_SIZE));
}
namespace {
class FullFilterBitsReader : public FilterBitsReader {
public:
explicit FullFilterBitsReader(const Slice& contents)
: data_(contents.data()),
data_len_(static_cast<uint32_t>(contents.size())),
num_probes_(0),
num_lines_(0),
log2_cache_line_size_(0) {
assert(data_);
GetFilterMeta(contents, &num_probes_, &num_lines_);
// Sanitize broken parameter
if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) {
num_lines_ = 0;
num_probes_ = 0;
} else if (num_lines_ != 0) {
while (true) {
uint32_t num_lines_at_curr_cache_size =
(data_len_ - 5) >> log2_cache_line_size_;
if (num_lines_at_curr_cache_size == 0) {
// The cache line size seems not a power of two. It's not supported
// and indicates a corruption so disable using this filter.
assert(false);
num_lines_ = 0;
num_probes_ = 0;
break;
}
if (num_lines_at_curr_cache_size == num_lines_) {
break;
}
++log2_cache_line_size_;
}
}
}
// No Copy allowed
FullFilterBitsReader(const FullFilterBitsReader&) = delete;
void operator=(const FullFilterBitsReader&) = delete;
~FullFilterBitsReader() override {}
// "contents" contains the data built by a preceding call to
// FilterBitsBuilder::Finish. MayMatch must return true if the key was
// passed to FilterBitsBuilder::AddKey. This method may return true or false
// if the key was not on the list, but it should aim to return false with a
// high probability.
bool MayMatch(const Slice& key) override {
if (data_len_ <= 5) { // remain same with original filter
return false;
}
// Other Error params, including a broken filter, regarded as match
if (num_probes_ == 0 || num_lines_ == 0) return true;
uint32_t hash = BloomHash(key);
uint32_t byte_offset;
LegacyFullFilterImpl::PrepareHashMayMatch(
hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_);
return LegacyFullFilterImpl::HashMayMatchPrepared(
hash, num_probes_, data_ + byte_offset, log2_cache_line_size_);
}
virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
if (data_len_ <= 5) { // remain same with original filter
for (int i = 0; i < num_keys; ++i) {
may_match[i] = false;
}
return;
}
for (int i = 0; i < num_keys; ++i) {
may_match[i] = true;
}
// Other Error params, including a broken filter, regarded as match
if (num_probes_ == 0 || num_lines_ == 0) return;
uint32_t hashes[MultiGetContext::MAX_BATCH_SIZE];
uint32_t byte_offsets[MultiGetContext::MAX_BATCH_SIZE];
for (int i = 0; i < num_keys; ++i) {
hashes[i] = BloomHash(*keys[i]);
LegacyFullFilterImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_,
/*out*/ &byte_offsets[i],
log2_cache_line_size_);
}
for (int i = 0; i < num_keys; ++i) {
if (!LegacyFullFilterImpl::HashMayMatchPrepared(hashes[i], num_probes_,
data_ + byte_offsets[i],
log2_cache_line_size_)) {
may_match[i] = false;
}
}
}
private:
// Filter meta data
const char* data_;
uint32_t data_len_;
int num_probes_;
uint32_t num_lines_;
uint32_t log2_cache_line_size_;
// Get num_probes, and num_lines from filter
// If filter format broken, set both to 0.
void GetFilterMeta(const Slice& filter, int* num_probes, uint32_t* num_lines);
};
void FullFilterBitsReader::GetFilterMeta(const Slice& filter, int* num_probes,
uint32_t* num_lines) {
uint32_t len = static_cast<uint32_t>(filter.size());
if (len <= 5) {
// filter is empty or broken
*num_probes = 0;
*num_lines = 0;
return;
}
*num_probes = filter.data()[len - 5];
*num_lines = DecodeFixed32(filter.data() + len - 4);
}
// An implementation of filter policy
class BloomFilterPolicy : public FilterPolicy {
public:
explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder)
: bits_per_key_(bits_per_key), hash_func_(BloomHash),
use_block_based_builder_(use_block_based_builder) {
initialize();
}
~BloomFilterPolicy() override {}
const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; }
void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
// Compute bloom filter size (in both bits and bytes)
uint32_t bits = static_cast<uint32_t>(n * bits_per_key_);
// For small n, we can see a very high false positive rate. Fix it
// by enforcing a minimum bloom filter length.
if (bits < 64) bits = 64;
uint32_t bytes = (bits + 7) / 8;
bits = bytes * 8;
const size_t init_size = dst->size();
dst->resize(init_size + bytes, 0);
dst->push_back(static_cast<char>(num_probes_)); // Remember # of probes
char* array = &(*dst)[init_size];
for (int i = 0; i < n; i++) {
LegacyNoLocalityBloomImpl::AddHash(hash_func_(keys[i]), bits, num_probes_,
array);
}
}
bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override {
const size_t len = bloom_filter.size();
if (len < 2 || len > 0xffffffffU) {
return false;
}
const char* array = bloom_filter.data();
const uint32_t bits = static_cast<uint32_t>(len - 1) * 8;
// Use the encoded k so that we can read filters generated by
// bloom filters created using different parameters.
const int k = static_cast<uint8_t>(array[len - 1]);
if (k > 30) {
// Reserved for potentially new encodings for short bloom filters.
// Consider it a match.
return true;
}
// NB: using k not num_probes_
return LegacyNoLocalityBloomImpl::HashMayMatch(hash_func_(key), bits, k,
array);
}
FilterBitsBuilder* GetFilterBitsBuilder() const override {
if (use_block_based_builder_) {
return nullptr;
}
return new FullFilterBitsBuilder(bits_per_key_, num_probes_);
}
FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
return new FullFilterBitsReader(contents);
}
// If choose to use block based builder
bool UseBlockBasedBuilder() { return use_block_based_builder_; }
private:
int bits_per_key_;
int num_probes_;
uint32_t (*hash_func_)(const Slice& key);
const bool use_block_based_builder_;
void initialize() {
// We intentionally round down to reduce probing cost a little bit
num_probes_ = static_cast<int>(bits_per_key_ * 0.69); // 0.69 =~ ln(2)
if (num_probes_ < 1) num_probes_ = 1;
if (num_probes_ > 30) num_probes_ = 30;
}
};
} // namespace
const FilterPolicy* NewBloomFilterPolicy(int bits_per_key,
bool use_block_based_builder) {
return new BloomFilterPolicy(bits_per_key, use_block_based_builder);
}
} // namespace rocksdb