dynamic_bloom: replace some divide (remainder) operations with shifts in locality mode, and other improvements
Summary: This patch changes the meaning of options.bloom_locality: 0 disables the cache line optimization, and any positive number means CACHE_LINE_SIZE is used as the block size (previously the block size was CACHE_LINE_SIZE*options.bloom_locality). With this change, the divide (remainder) operations inside a block can be replaced by shifts. Performance is improved: https://reviews.facebook.net/P471 Also, improve the basic algorithm in two ways: (1) make sure the number of blocks is an odd number; (2) rotate the hash bits after every probe in locality mode. Since the divisor is 2^n, without this rotation we would never be able to use all the bits of the hash. Improvement in false positive rate: https://reviews.facebook.net/P459 Test Plan: make all check Reviewers: ljin, haobo Reviewed By: haobo Subscribers: dhruba, yhchiang, igor, leveldb Differential Revision: https://reviews.facebook.net/D18843
This commit is contained in:
parent
91ddd587cc
commit
462796697c
|
@ -547,12 +547,9 @@ struct ColumnFamilyOptions {
|
||||||
|
|
||||||
// Control locality of bloom filter probes to improve cache miss rate.
|
// Control locality of bloom filter probes to improve cache miss rate.
|
||||||
// This option only applies to memtable prefix bloom and plaintable
|
// This option only applies to memtable prefix bloom and plaintable
|
||||||
// prefix bloom. It essentially limits the max number of cache lines each
|
// prefix bloom. It essentially limits every bloom check to one cache line.
|
||||||
// bloom filter check can touch.
|
// This optimization is turned off when set to 0; any positive number turns
|
||||||
// This optimization is turned off when set to 0. The number should never
|
// it on.
|
||||||
// be greater than number of probes. This option can boost performance
|
|
||||||
// for in-memory workload but should use with care since it can cause
|
|
||||||
// higher false positive rate.
|
|
||||||
// Default: 0
|
// Default: 0
|
||||||
uint32_t bloom_locality;
|
uint32_t bloom_locality;
|
||||||
|
|
||||||
|
|
|
@ -17,34 +17,41 @@ namespace {
|
||||||
// Default key hash for the bloom filter (used when the caller passes no
// hash_func): hashes the key's bytes with Hash(), passing the fixed constant
// 0xbc9f1d34 (presumably the seed -- confirm against Hash()'s declaration).
static uint32_t BloomHash(const Slice& key) {
|
static uint32_t BloomHash(const Slice& key) {
|
||||||
return Hash(key.data(), key.size(), 0xbc9f1d34);
|
return Hash(key.data(), key.size(), 0xbc9f1d34);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint32_t GetNumBlocks(uint32_t total_bits) {
|
||||||
|
uint32_t num_blocks = (total_bits + CACHE_LINE_SIZE * 8 - 1) /
|
||||||
|
(CACHE_LINE_SIZE * 8) * (CACHE_LINE_SIZE * 8);
|
||||||
|
// Make num_blocks an odd number to make sure more bits are involved
|
||||||
|
// when determining which block.
|
||||||
|
if (num_blocks % 2 == 0) {
|
||||||
|
num_blocks++;
|
||||||
|
}
|
||||||
|
return num_blocks;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t cl_per_block,
|
DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t locality,
|
||||||
uint32_t num_probes,
|
uint32_t num_probes,
|
||||||
uint32_t (*hash_func)(const Slice& key),
|
uint32_t (*hash_func)(const Slice& key),
|
||||||
size_t huge_page_tlb_size, Logger* logger)
|
size_t huge_page_tlb_size, Logger* logger)
|
||||||
: kBlocked(cl_per_block > 0),
|
: kTotalBits(((locality > 0) ? GetNumBlocks(total_bits) : total_bits + 7) /
|
||||||
kBitsPerBlock(std::min(cl_per_block, num_probes) * CACHE_LINE_SIZE * 8),
|
|
||||||
kTotalBits((kBlocked ? (total_bits + kBitsPerBlock - 1) / kBitsPerBlock *
|
|
||||||
kBitsPerBlock
|
|
||||||
: total_bits + 7) /
|
|
||||||
8 * 8),
|
8 * 8),
|
||||||
kNumBlocks(kBlocked ? kTotalBits / kBitsPerBlock : 1),
|
kNumBlocks((locality > 0) ? kTotalBits / (CACHE_LINE_SIZE * 8) : 0),
|
||||||
kNumProbes(num_probes),
|
kNumProbes(num_probes),
|
||||||
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
|
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
|
||||||
assert(kBlocked ? kTotalBits > 0 : kTotalBits >= kBitsPerBlock);
|
assert(kNumBlocks > 0 || kTotalBits > 0);
|
||||||
assert(kNumProbes > 0);
|
assert(kNumProbes > 0);
|
||||||
|
|
||||||
uint32_t sz = kTotalBits / 8;
|
uint32_t sz = kTotalBits / 8;
|
||||||
if (kBlocked) {
|
if (kNumBlocks > 0) {
|
||||||
sz += CACHE_LINE_SIZE - 1;
|
sz += CACHE_LINE_SIZE - 1;
|
||||||
}
|
}
|
||||||
raw_ = reinterpret_cast<unsigned char*>(
|
raw_ = reinterpret_cast<unsigned char*>(
|
||||||
arena_.AllocateAligned(sz, huge_page_tlb_size, logger));
|
arena_.AllocateAligned(sz, huge_page_tlb_size, logger));
|
||||||
memset(raw_, 0, sz);
|
memset(raw_, 0, sz);
|
||||||
if (kBlocked && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
|
if (kNumBlocks > 0 && (reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE)) {
|
||||||
data_ = raw_ + CACHE_LINE_SIZE -
|
data_ = raw_ + CACHE_LINE_SIZE -
|
||||||
reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
|
reinterpret_cast<uint64_t>(raw_) % CACHE_LINE_SIZE;
|
||||||
} else {
|
} else {
|
||||||
data_ = raw_;
|
data_ = raw_;
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
|
#include "port/port.h"
|
||||||
#include <util/arena.h>
|
#include <util/arena.h>
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
|
@ -19,15 +20,14 @@ class DynamicBloom {
|
||||||
public:
|
public:
|
||||||
// total_bits: fixed total bits for the bloom
|
// total_bits: fixed total bits for the bloom
|
||||||
// num_probes: number of hash probes for a single key
|
// num_probes: number of hash probes for a single key
|
||||||
// cl_per_block: block size in cache lines. When this is non-zero, a
|
// locality: If positive, optimize for cache line locality, 0 otherwise.
|
||||||
// query/set is done within a block to improve cache locality.
|
|
||||||
// hash_func: customized hash function
|
// hash_func: customized hash function
|
||||||
// huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
|
// huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
|
||||||
// within this page size. Need to reserve huge pages for
|
// within this page size. Need to reserve huge pages for
|
||||||
// it to be allocated, like:
|
// it to be allocated, like:
|
||||||
// sysctl -w vm.nr_hugepages=20
|
// sysctl -w vm.nr_hugepages=20
|
||||||
// See linux doc Documentation/vm/hugetlbpage.txt
|
// See linux doc Documentation/vm/hugetlbpage.txt
|
||||||
explicit DynamicBloom(uint32_t total_bits, uint32_t cl_per_block = 0,
|
explicit DynamicBloom(uint32_t total_bits, uint32_t locality = 0,
|
||||||
uint32_t num_probes = 6,
|
uint32_t num_probes = 6,
|
||||||
uint32_t (*hash_func)(const Slice& key) = nullptr,
|
uint32_t (*hash_func)(const Slice& key) = nullptr,
|
||||||
size_t huge_page_tlb_size = 0,
|
size_t huge_page_tlb_size = 0,
|
||||||
|
@ -48,8 +48,6 @@ class DynamicBloom {
|
||||||
bool MayContainHash(uint32_t hash);
|
bool MayContainHash(uint32_t hash);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const bool kBlocked;
|
|
||||||
const uint32_t kBitsPerBlock;
|
|
||||||
const uint32_t kTotalBits;
|
const uint32_t kTotalBits;
|
||||||
const uint32_t kNumBlocks;
|
const uint32_t kNumBlocks;
|
||||||
const uint32_t kNumProbes;
|
const uint32_t kNumProbes;
|
||||||
|
@ -69,13 +67,18 @@ inline bool DynamicBloom::MayContain(const Slice& key) {
|
||||||
|
|
||||||
inline bool DynamicBloom::MayContainHash(uint32_t h) {
|
inline bool DynamicBloom::MayContainHash(uint32_t h) {
|
||||||
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
||||||
if (kBlocked) {
|
if (kNumBlocks != 0) {
|
||||||
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock;
|
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
|
||||||
for (uint32_t i = 0; i < kNumProbes; ++i) {
|
for (uint32_t i = 0; i < kNumProbes; ++i) {
|
||||||
const uint32_t bitpos = b + h % kBitsPerBlock;
|
// Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
|
||||||
|
// to a simple AND operation by the compiler.
|
||||||
|
const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
|
||||||
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
|
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
// Rotate h so that we don't reuse the same bytes.
|
||||||
|
h = h / (CACHE_LINE_SIZE * 8) +
|
||||||
|
(h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
|
||||||
h += delta;
|
h += delta;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -92,11 +95,16 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) {
|
||||||
|
|
||||||
inline void DynamicBloom::AddHash(uint32_t h) {
|
inline void DynamicBloom::AddHash(uint32_t h) {
|
||||||
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
||||||
if (kBlocked) {
|
if (kNumBlocks != 0) {
|
||||||
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * kBitsPerBlock;
|
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
|
||||||
for (uint32_t i = 0; i < kNumProbes; ++i) {
|
for (uint32_t i = 0; i < kNumProbes; ++i) {
|
||||||
const uint32_t bitpos = b + h % kBitsPerBlock;
|
// Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
|
||||||
|
// to a simple AND operation by the compiler.
|
||||||
|
const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
|
||||||
data_[bitpos / 8] |= (1 << (bitpos % 8));
|
data_[bitpos / 8] |= (1 << (bitpos % 8));
|
||||||
|
// Rotate h so that we don't reuse the same bytes.
|
||||||
|
h = h / (CACHE_LINE_SIZE * 8) +
|
||||||
|
(h % (CACHE_LINE_SIZE * 8)) * (0x20000000U / CACHE_LINE_SIZE);
|
||||||
h += delta;
|
h += delta;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -91,17 +91,16 @@ TEST(DynamicBloomTest, VaryingLengths) {
|
||||||
fprintf(stderr, "bits_per_key: %d num_probes: %d\n",
|
fprintf(stderr, "bits_per_key: %d num_probes: %d\n",
|
||||||
FLAGS_bits_per_key, num_probes);
|
FLAGS_bits_per_key, num_probes);
|
||||||
|
|
||||||
for (uint32_t cl_per_block = 0; cl_per_block < num_probes;
|
for (uint32_t enable_locality = 0; enable_locality < 2; ++enable_locality) {
|
||||||
++cl_per_block) {
|
|
||||||
for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
|
for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
|
||||||
uint32_t bloom_bits = 0;
|
uint32_t bloom_bits = 0;
|
||||||
if (cl_per_block == 0) {
|
if (enable_locality == 0) {
|
||||||
bloom_bits = std::max(num * FLAGS_bits_per_key, 64U);
|
bloom_bits = std::max(num * FLAGS_bits_per_key, 64U);
|
||||||
} else {
|
} else {
|
||||||
bloom_bits = std::max(num * FLAGS_bits_per_key,
|
bloom_bits = std::max(num * FLAGS_bits_per_key,
|
||||||
cl_per_block * CACHE_LINE_SIZE * 8);
|
enable_locality * CACHE_LINE_SIZE * 8);
|
||||||
}
|
}
|
||||||
DynamicBloom bloom(bloom_bits, cl_per_block, num_probes);
|
DynamicBloom bloom(bloom_bits, enable_locality, num_probes);
|
||||||
for (uint64_t i = 0; i < num; i++) {
|
for (uint64_t i = 0; i < num; i++) {
|
||||||
bloom.Add(Key(i, buffer));
|
bloom.Add(Key(i, buffer));
|
||||||
ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
|
ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
|
||||||
|
@ -123,8 +122,10 @@ TEST(DynamicBloomTest, VaryingLengths) {
|
||||||
}
|
}
|
||||||
double rate = result / 10000.0;
|
double rate = result / 10000.0;
|
||||||
|
|
||||||
fprintf(stderr, "False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, "
|
fprintf(stderr,
|
||||||
"cl per block = %u\n", rate*100.0, num, bloom_bits, cl_per_block);
|
"False positives: %5.2f%% @ num = %6u, bloom_bits = %6u, "
|
||||||
|
"enable locality?%u\n",
|
||||||
|
rate * 100.0, num, bloom_bits, enable_locality);
|
||||||
|
|
||||||
if (rate > 0.0125)
|
if (rate > 0.0125)
|
||||||
mediocre_filters++; // Allowed, but not too often
|
mediocre_filters++; // Allowed, but not too often
|
||||||
|
@ -173,20 +174,20 @@ TEST(DynamicBloomTest, perf) {
|
||||||
elapsed / count);
|
elapsed / count);
|
||||||
ASSERT_TRUE(count == num_keys);
|
ASSERT_TRUE(count == num_keys);
|
||||||
|
|
||||||
for (uint32_t cl_per_block = 1; cl_per_block <= num_probes;
|
// Locality enabled version
|
||||||
++cl_per_block) {
|
DynamicBloom blocked_bloom(num_keys * 10, 1, num_probes);
|
||||||
DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, num_probes);
|
|
||||||
|
|
||||||
timer.Start();
|
timer.Start();
|
||||||
for (uint64_t i = 1; i <= num_keys; ++i) {
|
for (uint64_t i = 1; i <= num_keys; ++i) {
|
||||||
blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
|
blocked_bloom.Add(Slice(reinterpret_cast<const char*>(&i), 8));
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t elapsed = timer.ElapsedNanos();
|
elapsed = timer.ElapsedNanos();
|
||||||
fprintf(stderr, "blocked bloom(%d), avg add latency %" PRIu64 "\n",
|
fprintf(stderr,
|
||||||
cl_per_block, elapsed / num_keys);
|
"blocked bloom(enable locality), avg add latency %" PRIu64 "\n",
|
||||||
|
elapsed / num_keys);
|
||||||
|
|
||||||
uint64_t count = 0;
|
count = 0;
|
||||||
timer.Start();
|
timer.Start();
|
||||||
for (uint64_t i = 1; i <= num_keys; ++i) {
|
for (uint64_t i = 1; i <= num_keys; ++i) {
|
||||||
if (blocked_bloom.MayContain(
|
if (blocked_bloom.MayContain(
|
||||||
|
@ -196,11 +197,11 @@ TEST(DynamicBloomTest, perf) {
|
||||||
}
|
}
|
||||||
|
|
||||||
elapsed = timer.ElapsedNanos();
|
elapsed = timer.ElapsedNanos();
|
||||||
fprintf(stderr, "blocked bloom(%d), avg query latency %" PRIu64 "\n",
|
fprintf(stderr,
|
||||||
cl_per_block, elapsed / count);
|
"blocked bloom(enable locality), avg query latency %" PRIu64 "\n",
|
||||||
|
elapsed / count);
|
||||||
ASSERT_TRUE(count == num_keys);
|
ASSERT_TRUE(count == num_keys);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
Loading…
Reference in New Issue
Block a user