ea89c77f27
Summary: MemTable::MultiGet was not considering range tombstones before querying Bloom filter. This means range tombstones would be skipped for keys (or prefixes) with no other entries in the memtable. This could cause old values for a key (in SST files) to still show up until the range tombstone covering it has been flushed. This is fixed by essentially disabling the memtable Bloom filter when there are any range tombstones. (This could be better optimized in the future, but good enough for now.) Did some other cleanup/optimization in the same code to (more than) offset the cost of checking on range tombstones in more cases. There is now notable improvement when memtable_whole_key_filtering and prefix_extractor are used together (unusual), and this makes MultiGet closer to the Get implementation. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9453 Test Plan: new unit test added. Added memtable Bloom to crash test. Performance testing -------------------- Build WAL-only DB (recovers to memtable): ``` TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -benchmarks=fillrandom -num=1000000 -write_buffer_size=250000000 ``` Query test command, to maximize sensitivity to the changed code: ``` TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -use_existing_db -readonly -benchmarks=multireadrandom -num=10000000 -write_buffer_size=250000000 -memtable_bloom_size_ratio=0.015 -multiread_batched -batch_size=24 -threads=8 -memtable_whole_key_filtering=$MWKF -prefix_size=$PXS ``` (Note -num here is 10x larger for mostly memtable misses) Before & after run simultaneously, average over 10 iterations per data point, ops/sec. MWKF=0 PXS=0 (Bloom disabled) Before: 5724844 After: 6722066 MWKF=0 PXS=7 (prefixes hardly unique; Bloom not useful) Before: 9981319 After: 10237990 MWKF=0 PXS=8 (prefixes unique; Bloom useful) Before: 12081715 After: 12117603 MWKF=1 PXS=0 (whole key Bloom useful) Before: 11944354 After: 12096085 MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes not useful in old version) Before: 9444299 After: 11826029 MWKF=1 PXS=7 (whole key Bloom useful in new version; prefixes useful in old version) Before: 11784465 After: 11778591 Only in this last case is the 'before' *slightly* faster, perhaps because hashing prefixes is slightly faster than hashing whole keys. Otherwise, 'after' is faster. Reviewed By: ajkr Differential Revision: D33805025 Pulled By: pdillinger fbshipit-source-id: 597523cae4f4eafdf6ae6bb2bc6cb46f83b017bf
215 lines
7.5 KiB
C++
215 lines
7.5 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#include <array>
|
|
#include <string>
|
|
#include "port/port.h"
|
|
#include "rocksdb/slice.h"
|
|
#include "table/multiget_context.h"
|
|
#include "util/hash.h"
|
|
|
|
#include <atomic>
|
|
#include <memory>
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
class Slice;
|
|
class Allocator;
|
|
class Logger;
|
|
|
|
// A Bloom filter intended only to be used in memory, never serialized in a way
|
|
// that could lead to schema incompatibility. Supports opt-in lock-free
|
|
// concurrent access.
|
|
//
|
|
// This implementation is also intended for applications generally preferring
|
|
// speed vs. maximum accuracy: roughly 0.9x BF op latency for 1.1x FP rate.
|
|
// For 1% FP rate, that means that the latency of a look-up triggered by an FP
|
|
// should be less than roughly 100x the cost of a Bloom filter op.
|
|
//
|
|
// For simplicity and performance, the current implementation requires
|
|
// num_probes to be a multiple of two and <= 10.
|
|
//
|
|
class DynamicBloom {
|
|
public:
|
|
// allocator: pass allocator to bloom filter, hence trace the usage of memory
|
|
// total_bits: fixed total bits for the bloom
|
|
// num_probes: number of hash probes for a single key
|
|
// hash_func: customized hash function
|
|
// huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
|
|
// within this page size. Need to reserve huge pages for
|
|
// it to be allocated, like:
|
|
// sysctl -w vm.nr_hugepages=20
|
|
// See linux doc Documentation/vm/hugetlbpage.txt
|
|
explicit DynamicBloom(Allocator* allocator, uint32_t total_bits,
|
|
uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
|
|
Logger* logger = nullptr);
|
|
|
|
~DynamicBloom() {}
|
|
|
|
// Assuming single threaded access to this function.
|
|
void Add(const Slice& key);
|
|
|
|
// Like Add, but may be called concurrent with other functions.
|
|
void AddConcurrently(const Slice& key);
|
|
|
|
// Assuming single threaded access to this function.
|
|
void AddHash(uint32_t hash);
|
|
|
|
// Like AddHash, but may be called concurrent with other functions.
|
|
void AddHashConcurrently(uint32_t hash);
|
|
|
|
// Multithreaded access to this function is OK
|
|
bool MayContain(const Slice& key) const;
|
|
|
|
void MayContain(int num_keys, Slice* keys, bool* may_match) const;
|
|
|
|
// Multithreaded access to this function is OK
|
|
bool MayContainHash(uint32_t hash) const;
|
|
|
|
void Prefetch(uint32_t h);
|
|
|
|
private:
|
|
// Length of the structure, in 64-bit words. For this structure, "word"
|
|
// will always refer to 64-bit words.
|
|
uint32_t kLen;
|
|
// We make the k probes in pairs, two for each 64-bit read/write. Thus,
|
|
// this stores k/2, the number of words to double-probe.
|
|
const uint32_t kNumDoubleProbes;
|
|
|
|
std::atomic<uint64_t>* data_;
|
|
|
|
// or_func(ptr, mask) should effect *ptr |= mask with the appropriate
|
|
// concurrency safety, working with bytes.
|
|
template <typename OrFunc>
|
|
void AddHash(uint32_t hash, const OrFunc& or_func);
|
|
|
|
bool DoubleProbe(uint32_t h32, size_t a) const;
|
|
};
|
|
|
|
inline void DynamicBloom::Add(const Slice& key) { AddHash(BloomHash(key)); }
|
|
|
|
inline void DynamicBloom::AddConcurrently(const Slice& key) {
|
|
AddHashConcurrently(BloomHash(key));
|
|
}
|
|
|
|
inline void DynamicBloom::AddHash(uint32_t hash) {
|
|
AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
|
|
ptr->store(ptr->load(std::memory_order_relaxed) | mask,
|
|
std::memory_order_relaxed);
|
|
});
|
|
}
|
|
|
|
inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
|
|
AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
|
|
// Happens-before between AddHash and MaybeContains is handled by
|
|
// access to versions_->LastSequence(), so all we have to do here is
|
|
// avoid races (so we don't give the compiler a license to mess up
|
|
// our code) and not lose bits. std::memory_order_relaxed is enough
|
|
// for that.
|
|
if ((mask & ptr->load(std::memory_order_relaxed)) != mask) {
|
|
ptr->fetch_or(mask, std::memory_order_relaxed);
|
|
}
|
|
});
|
|
}
|
|
|
|
inline bool DynamicBloom::MayContain(const Slice& key) const {
|
|
return (MayContainHash(BloomHash(key)));
|
|
}
|
|
|
|
inline void DynamicBloom::MayContain(int num_keys, Slice* keys,
|
|
bool* may_match) const {
|
|
std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
|
|
std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
hashes[i] = BloomHash(keys[i]);
|
|
size_t a = FastRange32(kLen, hashes[i]);
|
|
PREFETCH(data_ + a, 0, 3);
|
|
byte_offsets[i] = a;
|
|
}
|
|
|
|
for (int i = 0; i < num_keys; i++) {
|
|
may_match[i] = DoubleProbe(hashes[i], byte_offsets[i]);
|
|
}
|
|
}
|
|
|
|
#if defined(_MSC_VER)
|
|
#pragma warning(push)
|
|
// local variable is initialized but not referenced
|
|
#pragma warning(disable : 4189)
|
|
#endif
|
|
inline void DynamicBloom::Prefetch(uint32_t h32) {
|
|
size_t a = FastRange32(kLen, h32);
|
|
PREFETCH(data_ + a, 0, 3);
|
|
}
|
|
#if defined(_MSC_VER)
|
|
#pragma warning(pop)
|
|
#endif
|
|
|
|
// Speed hacks in this implementation:
|
|
// * Uses fastrange instead of %
|
|
// * Minimum logic to determine first (and all) probed memory addresses.
|
|
// (Uses constant bit-xor offsets from the starting probe address.)
|
|
// * (Major) Two probes per 64-bit memory fetch/write.
|
|
// Code simplification / optimization: only allow even number of probes.
|
|
// * Very fast and effective (murmur-like) hash expansion/re-mixing. (At
|
|
// least on recent CPUs, integer multiplication is very cheap. Each 64-bit
|
|
// remix provides five pairs of bit addresses within a uint64_t.)
|
|
// Code simplification / optimization: only allow up to 10 probes, from a
|
|
// single 64-bit remix.
|
|
//
|
|
// The FP rate penalty for this implementation, vs. standard Bloom filter, is
|
|
// roughly 1.12x on top of the 1.15x penalty for a 512-bit cache-local Bloom.
|
|
// This implementation does not explicitly use the cache line size, but is
|
|
// effectively cache-local (up to 16 probes) because of the bit-xor offsetting.
|
|
//
|
|
// NB: could easily be upgraded to support a 64-bit hash and
|
|
// total_bits > 2^32 (512MB). (The latter is a bad idea without the former,
|
|
// because of false positives.)
|
|
|
|
inline bool DynamicBloom::MayContainHash(uint32_t h32) const {
|
|
size_t a = FastRange32(kLen, h32);
|
|
PREFETCH(data_ + a, 0, 3);
|
|
return DoubleProbe(h32, a);
|
|
}
|
|
|
|
inline bool DynamicBloom::DoubleProbe(uint32_t h32, size_t byte_offset) const {
|
|
// Expand/remix with 64-bit golden ratio
|
|
uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
|
|
for (unsigned i = 0;; ++i) {
|
|
// Two bit probes per uint64_t probe
|
|
uint64_t mask =
|
|
((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63));
|
|
uint64_t val = data_[byte_offset ^ i].load(std::memory_order_relaxed);
|
|
if (i + 1 >= kNumDoubleProbes) {
|
|
return (val & mask) == mask;
|
|
} else if ((val & mask) != mask) {
|
|
return false;
|
|
}
|
|
h = (h >> 12) | (h << 52);
|
|
}
|
|
}
|
|
|
|
template <typename OrFunc>
|
|
inline void DynamicBloom::AddHash(uint32_t h32, const OrFunc& or_func) {
|
|
size_t a = FastRange32(kLen, h32);
|
|
PREFETCH(data_ + a, 0, 3);
|
|
// Expand/remix with 64-bit golden ratio
|
|
uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
|
|
for (unsigned i = 0;; ++i) {
|
|
// Two bit probes per uint64_t probe
|
|
uint64_t mask =
|
|
((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63));
|
|
or_func(&data_[a ^ i], mask);
|
|
if (i + 1 >= kNumDoubleProbes) {
|
|
return;
|
|
}
|
|
h = (h >> 12) | (h << 52);
|
|
}
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|