rocksdb/util/dynamic_bloom.h
Vijay Nadimpalli 4c49e38f15 MultiGet batching in memtable (#5818)
Summary:
RocksDB has a MultiGet() API that implements batched key lookup for higher performance (https://github.com/facebook/rocksdb/blob/master/include/rocksdb/db.h#L468). Currently, batching is implemented in BlockBasedTableReader::MultiGet() for SST file lookups. One of the ways it improves performance is by pipelining bloom filter lookups (by prefetching required cachelines for all the keys in the batch, and then doing the probe) and thus hiding the cache miss latency. The same concept can be extended to the memtable as well. This PR involves implementing a pipelined bloom filter lookup in DynamicBloom, and implementing MemTable::MultiGet() that can leverage it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5818

Test Plan:
Existing tests

Performance Test:
Ran the below command which fills up the memtable and makes sure there are no flushes and then call multiget. Ran it on master and on the new change and see atleast 1% performance improvement across all the test runs I did. Sometimes the improvement was upto 5%.

TEST_TMPDIR=/data/users/$USER/benchmarks/feature/ numactl -C 10 ./db_bench -benchmarks="fillseq,multireadrandom" -num=600000 -compression_type="none" -level_compaction_dynamic_level_bytes -write_buffer_size=200000000 -target_file_size_base=200000000 -max_bytes_for_level_base=16777216 -reads=90000 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 -statistics -memtable_whole_key_filtering=true -memtable_bloom_size_ratio=10

Differential Revision: D17578869

Pulled By: vjnadimpalli

fbshipit-source-id: 23dc651d9bf49db11d22375bf435708875a1f192
2019-10-10 09:39:39 -07:00

215 lines
7.4 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <array>
#include <string>
#include "port/port.h"
#include "rocksdb/slice.h"
#include "table/multiget_context.h"
#include "util/hash.h"
#include <atomic>
#include <memory>
namespace rocksdb {
class Slice;
class Allocator;
class Logger;
// A Bloom filter intended only to be used in memory, never serialized in a way
// that could lead to schema incompatibility. Supports opt-in lock-free
// concurrent access.
//
// This implementation is also intended for applications generally preferring
// speed vs. maximum accuracy: roughly 0.9x BF op latency for 1.1x FP rate.
// For 1% FP rate, that means that the latency of a look-up triggered by an FP
// should be less than roughly 100x the cost of a Bloom filter op.
//
// For simplicity and performance, the current implementation requires
// num_probes to be a multiple of two and <= 10.
//
class DynamicBloom {
public:
// allocator: pass allocator to bloom filter, hence trace the usage of memory
// total_bits: fixed total bits for the bloom
// num_probes: number of hash probes for a single key
// hash_func: customized hash function
// huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
// within this page size. Need to reserve huge pages for
// it to be allocated, like:
// sysctl -w vm.nr_hugepages=20
// See linux doc Documentation/vm/hugetlbpage.txt
explicit DynamicBloom(Allocator* allocator, uint32_t total_bits,
uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
Logger* logger = nullptr);
~DynamicBloom() {}
// Assuming single threaded access to this function.
void Add(const Slice& key);
// Like Add, but may be called concurrent with other functions.
void AddConcurrently(const Slice& key);
// Assuming single threaded access to this function.
void AddHash(uint32_t hash);
// Like AddHash, but may be called concurrent with other functions.
void AddHashConcurrently(uint32_t hash);
// Multithreaded access to this function is OK
bool MayContain(const Slice& key) const;
void MayContain(int num_keys, Slice** keys, bool* may_match) const;
// Multithreaded access to this function is OK
bool MayContainHash(uint32_t hash) const;
void Prefetch(uint32_t h);
private:
// Length of the structure, in 64-bit words. For this structure, "word"
// will always refer to 64-bit words.
uint32_t kLen;
// We make the k probes in pairs, two for each 64-bit read/write. Thus,
// this stores k/2, the number of words to double-probe.
const uint32_t kNumDoubleProbes;
std::atomic<uint64_t>* data_;
// or_func(ptr, mask) should effect *ptr |= mask with the appropriate
// concurrency safety, working with bytes.
template <typename OrFunc>
void AddHash(uint32_t hash, const OrFunc& or_func);
bool DoubleProbe(uint32_t h32, size_t a) const;
};
inline void DynamicBloom::Add(const Slice& key) { AddHash(BloomHash(key)); }
inline void DynamicBloom::AddConcurrently(const Slice& key) {
AddHashConcurrently(BloomHash(key));
}
inline void DynamicBloom::AddHash(uint32_t hash) {
AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
ptr->store(ptr->load(std::memory_order_relaxed) | mask,
std::memory_order_relaxed);
});
}
inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
// Happens-before between AddHash and MaybeContains is handled by
// access to versions_->LastSequence(), so all we have to do here is
// avoid races (so we don't give the compiler a license to mess up
// our code) and not lose bits. std::memory_order_relaxed is enough
// for that.
if ((mask & ptr->load(std::memory_order_relaxed)) != mask) {
ptr->fetch_or(mask, std::memory_order_relaxed);
}
});
}
inline bool DynamicBloom::MayContain(const Slice& key) const {
return (MayContainHash(BloomHash(key)));
}
inline void DynamicBloom::MayContain(int num_keys, Slice** keys,
bool* may_match) const {
std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
for (int i = 0; i < num_keys; ++i) {
hashes[i] = BloomHash(*keys[i]);
size_t a = fastrange32(kLen, hashes[i]);
PREFETCH(data_ + a, 0, 3);
byte_offsets[i] = a;
}
for (int i = 0; i < num_keys; i++) {
may_match[i] = DoubleProbe(hashes[i], byte_offsets[i]);
}
}
#if defined(_MSC_VER)
#pragma warning(push)
// local variable is initialized but not referenced
#pragma warning(disable : 4189)
#endif
inline void DynamicBloom::Prefetch(uint32_t h32) {
size_t a = fastrange32(kLen, h32);
PREFETCH(data_ + a, 0, 3);
}
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
// Speed hacks in this implementation:
// * Uses fastrange instead of %
// * Minimum logic to determine first (and all) probed memory addresses.
// (Uses constant bit-xor offsets from the starting probe address.)
// * (Major) Two probes per 64-bit memory fetch/write.
// Code simplification / optimization: only allow even number of probes.
// * Very fast and effective (murmur-like) hash expansion/re-mixing. (At
// least on recent CPUs, integer multiplication is very cheap. Each 64-bit
// remix provides five pairs of bit addresses within a uint64_t.)
// Code simplification / optimization: only allow up to 10 probes, from a
// single 64-bit remix.
//
// The FP rate penalty for this implementation, vs. standard Bloom filter, is
// roughly 1.12x on top of the 1.15x penalty for a 512-bit cache-local Bloom.
// This implementation does not explicitly use the cache line size, but is
// effectively cache-local (up to 16 probes) because of the bit-xor offsetting.
//
// NB: could easily be upgraded to support a 64-bit hash and
// total_bits > 2^32 (512MB). (The latter is a bad idea without the former,
// because of false positives.)
inline bool DynamicBloom::MayContainHash(uint32_t h32) const {
size_t a = fastrange32(kLen, h32);
PREFETCH(data_ + a, 0, 3);
return DoubleProbe(h32, a);
}
inline bool DynamicBloom::DoubleProbe(uint32_t h32, size_t byte_offset) const {
// Expand/remix with 64-bit golden ratio
uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
for (unsigned i = 0;; ++i) {
// Two bit probes per uint64_t probe
uint64_t mask =
((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63));
uint64_t val = data_[byte_offset ^ i].load(std::memory_order_relaxed);
if (i + 1 >= kNumDoubleProbes) {
return (val & mask) == mask;
} else if ((val & mask) != mask) {
return false;
}
h = (h >> 12) | (h << 52);
}
}
template <typename OrFunc>
inline void DynamicBloom::AddHash(uint32_t h32, const OrFunc& or_func) {
size_t a = fastrange32(kLen, h32);
PREFETCH(data_ + a, 0, 3);
// Expand/remix with 64-bit golden ratio
uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
for (unsigned i = 0;; ++i) {
// Two bit probes per uint64_t probe
uint64_t mask =
((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63));
or_func(&data_[a ^ i], mask);
if (i + 1 >= kNumDoubleProbes) {
return;
}
h = (h >> 12) | (h << 52);
}
}
} // rocksdb