krad 1f0142ce19 Persistent Read Cache (Part 2) Data structure for building persistent read cache index
Summary:
We expect the persistent read cache to perform at speeds upto 8 GB/s. In order
to accomplish that, we need build a index mechanism which operate in the order
of multiple millions per sec rate.

This patch provide the basic data structure to accomplish that:

(1) Hash table implementation with lock contention spread
    It is based on the StripedHashSet<T> implementation in
    The Art of multiprocessor programming by Maurice Henry & Nir Shavit
(2) LRU implementation
    Place holder algorithm for further optimizing
(3) Evictable Hash Table implementation
    Building block for building index data structure that evicts data like files
    etc

TODO:
(1) Figure if the sharded hash table and LRU can be used instead
(2) Figure if we need to support configurable eviction algorithm for
EvictableHashTable

Test Plan: Run unit tests

Subscribers: andrewkr, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D55785
2016-05-17 13:18:47 -07:00

227 lines
6.3 KiB
C++

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include <assert.h>
#include <sys/mman.h>
#include <list>
#include <vector>
#include "include/rocksdb/env.h"
#include "port/port_posix.h"
#include "util/mutexlock.h"
namespace rocksdb {
// HashTable<T, Hash, Equal>
//
// Traditional implementation of hash table with syncronization built on top
// don't perform very well in multi-core scenarios. This is an implementation
// designed for multi-core scenarios with high lock contention.
//
// |<-------- alpha ------------->|
// Buckets Collision list
// ---- +----+ +---+---+--- ...... ---+---+---+
// / | |--->| | | | | |
// / +----+ +---+---+--- ...... ---+---+---+
// / | |
// Locks/ +----+
// +--+/ . .
// | | . .
// +--+ . .
// | | . .
// +--+ . .
// | | . .
// +--+ . .
// \ +----+
// \ | |
// \ +----+
// \ | |
// \---- +----+
//
// The lock contention is spread over an array of locks. This helps improve
// concurrent access. The spine is designed for a certain capacity and load
// factor. When the capacity planning is done correctly we can expect
// O(load_factor = 1) insert, access and remove time.
//
// Micro benchmark on debug build gives about .5 Million/sec rate of insert,
// erase and lookup in parallel (total of about 1.5 Million ops/sec). If the
// blocks were of 4K, the hash table can support a virtual throughput of
// 6 GB/s.
//
// T Object type (contains both key and value)
// Hash Function that returns an hash from type T
// Equal Returns if two objects are equal
// (We need explicit equal for pointer type)
//
template <class T, class Hash, class Equal>
class HashTable {
public:
explicit HashTable(const size_t capacity = 1024 * 1024,
const float load_factor = 2.0, const uint32_t nlocks = 256)
: nbuckets_(load_factor ? capacity / load_factor : 0), nlocks_(nlocks) {
// pre-conditions
assert(capacity);
assert(load_factor);
assert(nbuckets_);
assert(nlocks_);
buckets_.reset(new Bucket[nbuckets_]);
mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));
// initialize locks
locks_.reset(new port::RWMutex[nlocks_]);
mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));
// post-conditions
assert(buckets_);
assert(locks_);
}
virtual ~HashTable() { AssertEmptyBuckets(); }
//
// Insert given record to hash table
//
bool Insert(const T& t) {
const uint64_t h = Hash()(t);
const uint32_t bucket_idx = h % nbuckets_;
const uint32_t lock_idx = bucket_idx % nlocks_;
WriteLock _(&locks_[lock_idx]);
auto& bucket = buckets_[bucket_idx];
return Insert(&bucket, t);
}
// Lookup hash table
//
// Please note that read lock should be held by the caller. This is because
// the caller owns the data, and should hold the read lock as long as he
// operates on the data.
bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
const uint64_t h = Hash()(t);
const uint32_t bucket_idx = h % nbuckets_;
const uint32_t lock_idx = bucket_idx % nlocks_;
port::RWMutex& lock = locks_[lock_idx];
lock.ReadLock();
auto& bucket = buckets_[bucket_idx];
if (Find(&bucket, t, ret)) {
*ret_lock = &lock;
return true;
}
lock.ReadUnlock();
return false;
}
//
// Erase a given key from the hash table
//
bool Erase(const T& t, T* ret) {
const uint64_t h = Hash()(t);
const uint32_t bucket_idx = h % nbuckets_;
const uint32_t lock_idx = bucket_idx % nlocks_;
WriteLock _(&locks_[lock_idx]);
auto& bucket = buckets_[bucket_idx];
return Erase(&bucket, t, ret);
}
// Fetch the mutex associated with a key
// This call is used to hold the lock for a given data for extended period of
// time.
port::RWMutex* GetMutex(const T& t) {
const uint64_t h = Hash()(t);
const uint32_t bucket_idx = h % nbuckets_;
const uint32_t lock_idx = bucket_idx % nlocks_;
return &locks_[lock_idx];
}
void Clear(void (*fn)(T)) {
for (uint32_t i = 0; i < nbuckets_; ++i) {
const uint32_t lock_idx = i % nlocks_;
WriteLock _(&locks_[lock_idx]);
for (auto& t : buckets_[i].list_) {
(*fn)(t);
}
buckets_[i].list_.clear();
}
}
protected:
// Models bucket of keys that hash to the same bucket number
struct Bucket {
std::list<T> list_;
};
// Substitute for std::find with custom comparator operator
typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {
for (auto it = list->begin(); it != list->end(); ++it) {
if (Equal()(*it, t)) {
return it;
}
}
return list->end();
}
bool Insert(Bucket* bucket, const T& t) {
// Check if the key already exists
auto it = Find(&bucket->list_, t);
if (it != bucket->list_.end()) {
return false;
}
// insert to bucket
bucket->list_.push_back(t);
return true;
}
bool Find(Bucket* bucket, const T& t, T* ret) {
auto it = Find(&bucket->list_, t);
if (it != bucket->list_.end()) {
if (ret) {
*ret = *it;
}
return true;
}
return false;
}
bool Erase(Bucket* bucket, const T& t, T* ret) {
auto it = Find(&bucket->list_, t);
if (it != bucket->list_.end()) {
if (ret) {
*ret = *it;
}
bucket->list_.erase(it);
return true;
}
return false;
}
// assert that all buckets are empty
void AssertEmptyBuckets() {
#ifndef NDEBUG
for (size_t i = 0; i < nbuckets_; ++i) {
WriteLock _(&locks_[i % nlocks_]);
assert(buckets_[i].list_.empty());
}
#endif
}
const uint32_t nbuckets_; // No. of buckets in the spine
std::unique_ptr<Bucket[]> buckets_; // Spine of the hash buckets
const uint32_t nlocks_; // No. of locks
std::unique_ptr<port::RWMutex[]> locks_; // Granular locks
};
} // namespace rocksdb