rocksdb/memtable/skiplistrep.cc
Baptiste Lemaire e3a96c4823 Memtable sampling for mempurge heuristic. (#8628)
Summary:
Changes the API of the MemPurge process: the `bool experimental_allow_mempurge` and `experimental_mempurge_policy` flags have been replaced by a `double experimental_mempurge_threshold` option.
This API change reflects another major change introduced in this PR: the MemPurgeDecider() function now works by sampling the memtables being flushed to estimate the overall amount of useful payload (payload minus garbage), and then comparing this useful-payload estimate with the `double experimental_mempurge_threshold` value.
Therefore, when the value of this option is `0.0` (the default), mempurge is simply deactivated. On the other hand, a value of `DBL_MAX` is equivalent to always going through a mempurge, regardless of the garbage-ratio estimate.
At the moment, a `double experimental_mempurge_threshold` value other than `0.0` or `DBL_MAX` is only supported with the `SkipList` memtable representation.
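For illustration, a minimal sketch of opting in from user code (an assumption, not part of this diff: the option is taken to be exposed as `Options::experimental_mempurge_threshold`):

    #include <cfloat>  // DBL_MAX

    #include "rocksdb/options.h"

    ROCKSDB_NAMESPACE::Options options;
    options.experimental_mempurge_threshold = 0.0;      // default: mempurge deactivated
    options.experimental_mempurge_threshold = DBL_MAX;  // always attempt a mempurge
    // Any other value enables the sampling-based MemPurgeDecider(), which
    // compares the useful-payload estimate against the threshold
    // (SkipList memtables only).
    options.experimental_mempurge_threshold = 1.0;      // illustrative in-between value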
Regarding the sampling, this PR includes the introduction of a `MemTable::UniqueRandomSample` function that collects (approximately) random entries from the memtable by using the new `SkipList::Iterator::RandomSeek()` under the hood, or by iterating through each memtable entry, depending on the target sample size and the total number of entries.
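A hedged usage sketch of the sampling entry point (the `rep` pointer and all numbers are hypothetical; the branch choice mirrors the implementation below):

    // rep is a MemTableRep* backed by the SkipList representation.
    std::unordered_set<const char*> sample;
    uint64_t num_entries = 1000000;      // N: total entries in the memtable
    uint64_t target_sample_size = 1000;  // m: requested sample size
    rep->UniqueRandomSample(num_entries, target_sample_size, &sample);
    // Here m == sqrt(N), so the sampler draws entries via RandomSeek();
    // for m > sqrt(N) it would switch to a single linear pass instead.
    // Note: sample.size() is approximately m, not guaranteed exact.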
The unit tests have been adapted to support this new API.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8628

Reviewed By: pdillinger

Differential Revision: D30149315

Pulled By: bjlemaire

fbshipit-source-id: 1feef5390c95db6f4480ab4434716533d3947f27
2021-08-10 18:09:03 -07:00

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#include <cmath>
#include <random>
#include "db/memtable.h"
#include "memory/arena.h"
#include "memtable/inlineskiplist.h"
#include "rocksdb/memtablerep.h"
namespace ROCKSDB_NAMESPACE {
namespace {
class SkipListRep : public MemTableRep {
InlineSkipList<const MemTableRep::KeyComparator&> skip_list_;
const MemTableRep::KeyComparator& cmp_;
const SliceTransform* transform_;
const size_t lookahead_;
friend class LookaheadIterator;
public:
explicit SkipListRep(const MemTableRep::KeyComparator& compare,
Allocator* allocator, const SliceTransform* transform,
const size_t lookahead)
: MemTableRep(allocator),
skip_list_(compare, allocator),
cmp_(compare),
transform_(transform),
lookahead_(lookahead) {}
KeyHandle Allocate(const size_t len, char** buf) override {
*buf = skip_list_.AllocateKey(len);
return static_cast<KeyHandle>(*buf);
}
// Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list.
void Insert(KeyHandle handle) override {
skip_list_.Insert(static_cast<char*>(handle));
}
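// Like Insert(), but returns false (without inserting) if an entry that
// compares equal to key is already in the list.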
bool InsertKey(KeyHandle handle) override {
return skip_list_.Insert(static_cast<char*>(handle));
}
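// Hint-based insert variants below: *hint is an opaque insert-position
// cache owned by the caller, used to speed up consecutive inserts into
// nearby keys.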
void InsertWithHint(KeyHandle handle, void** hint) override {
skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
}
bool InsertKeyWithHint(KeyHandle handle, void** hint) override {
return skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
}
void InsertWithHintConcurrently(KeyHandle handle, void** hint) override {
skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle), hint);
}
bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override {
return skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle),
hint);
}
void InsertConcurrently(KeyHandle handle) override {
skip_list_.InsertConcurrently(static_cast<char*>(handle));
}
bool InsertKeyConcurrently(KeyHandle handle) override {
return skip_list_.InsertConcurrently(static_cast<char*>(handle));
}
// Returns true iff an entry that compares equal to key is in the list.
bool Contains(const char* key) const override {
return skip_list_.Contains(key);
}
size_t ApproximateMemoryUsage() override {
// All memory is allocated through allocator; nothing to report here
return 0;
}
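// Seek to the entry matching k's memtable key, then invoke callback_func
// on each entry from there on until the callback returns false or the
// list is exhausted.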
void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry)) override {
SkipListRep::Iterator iter(&skip_list_);
Slice dummy_slice;
for (iter.Seek(dummy_slice, k.memtable_key().data());
iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) {
}
}
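// Estimate the number of entries between start_ikey and end_ikey by
// differencing the skip list's EstimateCount() of nodes preceding each
// bound.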
uint64_t ApproximateNumEntries(const Slice& start_ikey,
const Slice& end_ikey) override {
std::string tmp;
uint64_t start_count =
skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey));
uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey));
return (end_count >= start_count) ? (end_count - start_count) : 0;
}
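// Collect approximately target_sample_size distinct entries from the
// memtable into *entries; num_entries is the total number of entries in
// the memtable.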
void UniqueRandomSample(const uint64_t& num_entries,
const uint64_t& target_sample_size,
std::unordered_set<const char*>* entries) override {
entries->clear();
// Avoid divide-by-0.
assert(target_sample_size > 0);
assert(num_entries > 0);
// NOTE: the size of entries is not enforced to be exactly
// target_sample_size at the end of this function, it might be slightly
// greater or smaller.
SkipListRep::Iterator iter(&skip_list_);
// There are two methods to create the subset of samples (size m)
// from the table containing N elements:
// 1. Iterate linearly through the N memtable entries. For each entry i,
//    add it to the sample set with probability
//    (target_sample_size - entries.size()) / (N - i).
// 2. Pick m random elements without repetition.
// We pick Option 1 when m > sqrt(N), and Option 2 otherwise.
if (target_sample_size >
static_cast<uint64_t>(std::sqrt(1.0 * num_entries))) {
Random* rnd = Random::GetTLSInstance();
iter.SeekToFirst();
uint64_t counter = 0, num_samples_left = target_sample_size;
for (; iter.Valid() && (num_samples_left > 0); iter.Next(), counter++) {
// Add entry to sample set with probability
// num_samples_left/(num_entries - counter).
if (rnd->Next() % (num_entries - counter) < num_samples_left) {
entries->insert(iter.key());
num_samples_left--;
}
}
} else {
// Option 2: pick m random elements with no duplicates.
// If Option 2 is picked, then target_sample_size<sqrt(N)
// Using a set spares the need to check for duplicates.
for (uint64_t i = 0; i < target_sample_size; i++) {
// We give it 5 attempts to find a non-duplicate.
// With 5 attempts, the probability of returning an `entries` set
// of size target_sample_size is:
// PROD_{i=1}^{target_sample_size-1} [1-(i/N)^5]
// which is monotonically increasing with N in the worst case
// of target_sample_size=sqrt(N), and is always >99.9% for N>4.
// At worst, for the final pick, when m=sqrt(N), there is
// a probability p=1/sqrt(N) of finding a duplicate.
for (uint64_t j = 0; j < 5; j++) {
iter.RandomSeek();
// unordered_set::insert returns pair<iterator, bool>.
// The second element is true if the insertion took place, i.e. the
// element was not already in the set, and false otherwise.
if ((entries->insert(iter.key())).second) {
break;
}
}
}
}
}
~SkipListRep() override {}
// Iteration over the contents of a skip list
class Iterator : public MemTableRep::Iterator {
InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
public:
// Initialize an iterator over the specified list.
// The returned iterator is not valid.
explicit Iterator(
const InlineSkipList<const MemTableRep::KeyComparator&>* list)
: iter_(list) {}
~Iterator() override {}
// Returns true iff the iterator is positioned at a valid node.
bool Valid() const override { return iter_.Valid(); }
// Returns the key at the current position.
// REQUIRES: Valid()
const char* key() const override { return iter_.key(); }
// Advances to the next position.
// REQUIRES: Valid()
void Next() override { iter_.Next(); }
// Advances to the previous position.
// REQUIRES: Valid()
void Prev() override { iter_.Prev(); }
// Advance to the first entry with a key >= target
void Seek(const Slice& user_key, const char* memtable_key) override {
if (memtable_key != nullptr) {
iter_.Seek(memtable_key);
} else {
iter_.Seek(EncodeKey(&tmp_, user_key));
}
}
// Retreat to the last entry with a key <= target
void SeekForPrev(const Slice& user_key, const char* memtable_key) override {
if (memtable_key != nullptr) {
iter_.SeekForPrev(memtable_key);
} else {
iter_.SeekForPrev(EncodeKey(&tmp_, user_key));
}
}
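// Position the iterator at a randomly chosen entry in the list.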
void RandomSeek() override { iter_.RandomSeek(); }
// Position at the first entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToFirst() override { iter_.SeekToFirst(); }
// Position at the last entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToLast() override { iter_.SeekToLast(); }
protected:
std::string tmp_; // For passing to EncodeKey
};
// Iterator over the contents of a skip list which also keeps track of the
// previously visited node. In Seek(), it examines a few nodes after it
// first, falling back to O(log n) search from the head of the list only if
// the target key hasn't been found.
class LookaheadIterator : public MemTableRep::Iterator {
public:
explicit LookaheadIterator(const SkipListRep& rep) :
rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {}
~LookaheadIterator() override {}
bool Valid() const override { return iter_.Valid(); }
const char* key() const override {
assert(Valid());
return iter_.key();
}
void Next() override {
assert(Valid());
bool advance_prev = true;
if (prev_.Valid()) {
auto k1 = rep_.UserKey(prev_.key());
auto k2 = rep_.UserKey(iter_.key());
if (k1.compare(k2) == 0) {
// same user key, don't move prev_
advance_prev = false;
} else if (rep_.transform_) {
// only advance prev_ if it has the same prefix as iter_
auto t1 = rep_.transform_->Transform(k1);
auto t2 = rep_.transform_->Transform(k2);
advance_prev = t1.compare(t2) == 0;
}
}
if (advance_prev) {
prev_ = iter_;
}
iter_.Next();
}
void Prev() override {
assert(Valid());
iter_.Prev();
prev_ = iter_;
}
void Seek(const Slice& internal_key, const char* memtable_key) override {
const char *encoded_key =
(memtable_key != nullptr) ?
memtable_key : EncodeKey(&tmp_, internal_key);
if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) {
// prev_.key() is smaller or equal to our target key; do a quick
// linear search (at most lookahead_ steps) starting from prev_
iter_ = prev_;
size_t cur = 0;
while (cur++ <= rep_.lookahead_ && iter_.Valid()) {
if (rep_.cmp_(encoded_key, iter_.key()) <= 0) {
return;
}
Next();
}
}
iter_.Seek(encoded_key);
prev_ = iter_;
}
void SeekForPrev(const Slice& internal_key,
const char* memtable_key) override {
const char* encoded_key = (memtable_key != nullptr)
? memtable_key
: EncodeKey(&tmp_, internal_key);
iter_.SeekForPrev(encoded_key);
prev_ = iter_;
}
void SeekToFirst() override {
iter_.SeekToFirst();
prev_ = iter_;
}
void SeekToLast() override {
iter_.SeekToLast();
prev_ = iter_;
}
protected:
std::string tmp_; // For passing to EncodeKey
private:
const SkipListRep& rep_;
InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
InlineSkipList<const MemTableRep::KeyComparator&>::Iterator prev_;
};
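// Return an iterator over the skip list, placed in *arena when one is
// provided. A LookaheadIterator is returned when lookahead_ > 0.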
MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
if (lookahead_ > 0) {
void *mem =
arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator))
: operator new(sizeof(SkipListRep::LookaheadIterator));
return new (mem) SkipListRep::LookaheadIterator(*this);
} else {
void *mem =
arena ? arena->AllocateAligned(sizeof(SkipListRep::Iterator))
: operator new(sizeof(SkipListRep::Iterator));
return new (mem) SkipListRep::Iterator(&skip_list_);
}
}
};
}  // anonymous namespace
MemTableRep* SkipListFactory::CreateMemTableRep(
const MemTableRep::KeyComparator& compare, Allocator* allocator,
const SliceTransform* transform, Logger* /*logger*/) {
return new SkipListRep(compare, allocator, transform, lookahead_);
}
} // namespace ROCKSDB_NAMESPACE