rocksdb/util/bloom.cc
Dhruba Borthakur d11b637f34
bits_per_key is already configurable. It defines how many bloom bits will be used for every key in the database.
My change in this patch is to make the hash code used for blooms configurable as well: one can specify a modified
hash function that inspects only parts of the key to generate the hash used by the bloom filter.

Test Plan: none

Differential Revision: https://reviews.facebook.net/D4059
2012-07-09 23:06:07 -07:00
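
The commit message above motivates the hash_func_ member introduced in the class below: a caller can supply its own hash so the bloom probes are computed from only part of each key. The following is a minimal sketch of such a hash, not part of this change; the 8-byte prefix length, the PrefixBloomHash name, and the idea of exposing the two-argument BloomFilterPolicy constructor to callers (it sits in an anonymous namespace in this file, so a public factory overload would be needed) are all assumptions made for illustration.

// Sketch only: hash just the first few bytes of the key, so every key that
// shares that prefix probes the same bloom bits.  The prefix length and the
// function name are assumptions for illustration.
#include "leveldb/slice.h"
#include "util/hash.h"

static uint32_t PrefixBloomHash(const leveldb::Slice& key) {
  const size_t kPrefixLen = 8;  // assumed prefix length
  const size_t n = (key.size() < kPrefixLen) ? key.size() : kPrefixLen;
  return leveldb::Hash(key.data(), n, 0xbc9f1d34);  // same seed as BloomHash below
}

Given a public way to construct the policy, such a function would be wired up as BloomFilterPolicy(bits_per_key, PrefixBloomHash).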


// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/filter_policy.h"
#include "leveldb/slice.h"
#include "util/hash.h"
namespace leveldb {
namespace {
static uint32_t BloomHash(const Slice& key) {
return Hash(key.data(), key.size(), 0xbc9f1d34);
}
class BloomFilterPolicy : public FilterPolicy {
 private:
  size_t bits_per_key_;
  size_t k_;
  uint32_t (*hash_func_)(const Slice& key);

  void initialize() {
    // We intentionally round down to reduce probing cost a little bit
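    // With n keys and m = n * bits_per_key_ filter bits, k = (m/n) * ln 2
    // probes minimizes the false positive rate, roughly 0.6185^bits_per_key.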
    k_ = static_cast<size_t>(bits_per_key_ * 0.69);  // 0.69 =~ ln(2)
    if (k_ < 1) k_ = 1;
    if (k_ > 30) k_ = 30;
  }

 public:
  explicit BloomFilterPolicy(int bits_per_key,
                             uint32_t (*hash_func)(const Slice& key))
      : bits_per_key_(bits_per_key), hash_func_(hash_func) {
    initialize();
  }

  explicit BloomFilterPolicy(int bits_per_key)
      : bits_per_key_(bits_per_key) {
    hash_func_ = BloomHash;
    initialize();
  }

  virtual const char* Name() const {
    return "leveldb.BuiltinBloomFilter";
  }

  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
    // Compute bloom filter size (in both bits and bytes)
    size_t bits = n * bits_per_key_;

    // For small n, we can see a very high false positive rate.  Fix it
    // by enforcing a minimum bloom filter length.
    if (bits < 64) bits = 64;

    size_t bytes = (bits + 7) / 8;
    bits = bytes * 8;

    const size_t init_size = dst->size();
    dst->resize(init_size + bytes, 0);
    dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
    char* array = &(*dst)[init_size];
    for (size_t i = 0; i < n; i++) {
      // Use double-hashing to generate a sequence of hash values.
      // See analysis in [Kirsch,Mitzenmacher 2006].
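      // Concretely, the probe positions are h, h + delta, h + 2*delta, ...,
      // with delta (a rotation of h) standing in for the second hash.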
      uint32_t h = hash_func_(keys[i]);
      const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
      for (size_t j = 0; j < k_; j++) {
        const uint32_t bitpos = h % bits;
        array[bitpos/8] |= (1 << (bitpos % 8));
        h += delta;
      }
    }
  }

  virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
    const size_t len = bloom_filter.size();
    if (len < 2) return false;

    const char* array = bloom_filter.data();
    const size_t bits = (len - 1) * 8;

    // Use the encoded k so that we can read filters generated by
    // bloom filters created using different parameters.
    const size_t k = array[len-1];
    if (k > 30) {
      // Reserved for potentially new encodings for short bloom filters.
      // Consider it a match.
      return true;
    }

    uint32_t h = hash_func_(key);
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (size_t j = 0; j < k; j++) {
      const uint32_t bitpos = h % bits;
      if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
      h += delta;
    }
    return true;
  }
};
}

const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) {
  return new BloomFilterPolicy(bits_per_key);
}

} // namespace leveldb
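
For context, a hedged usage sketch of the unchanged public entry point: the built-in policy is attached to a database through leveldb::Options::filter_policy, which the DB does not take ownership of. The database path and the choice of 10 bits per key (roughly a 1% false positive rate with this policy) are assumptions for the example.

#include <string>

#include "leveldb/db.h"
#include "leveldb/filter_policy.h"
#include "leveldb/options.h"

int main() {
  leveldb::Options options;
  options.create_if_missing = true;
  options.filter_policy = leveldb::NewBloomFilterPolicy(10);  // ~1% false positives

  leveldb::DB* db = NULL;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/bloom_example", &db);  // path is made up
  if (s.ok()) {
    // Point reads that miss are usually rejected by the filter without a disk seek.
    std::string value;
    s = db->Get(leveldb::ReadOptions(), "some-key", &value);
  }
  delete db;
  delete options.filter_policy;  // Options does not own the policy
  return 0;
}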