// Copyright (c) 2013, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // #pragma once #ifndef ROCKSDB_LITE #include <assert.h> #include <list> #include <vector> #ifdef OS_LINUX #include <sys/mman.h> #endif #include "include/rocksdb/env.h" #include "util/mutexlock.h" namespace rocksdb { // HashTable<T, Hash, Equal> // // Traditional implementation of hash table with synchronization built on top // don't perform very well in multi-core scenarios. This is an implementation // designed for multi-core scenarios with high lock contention. // // |<-------- alpha ------------->| // Buckets Collision list // ---- +----+ +---+---+--- ...... ---+---+---+ // / | |--->| | | | | | // / +----+ +---+---+--- ...... ---+---+---+ // / | | // Locks/ +----+ // +--+/ . . // | | . . // +--+ . . // | | . . // +--+ . . // | | . . // +--+ . . // \ +----+ // \ | | // \ +----+ // \ | | // \---- +----+ // // The lock contention is spread over an array of locks. This helps improve // concurrent access. The spine is designed for a certain capacity and load // factor. When the capacity planning is done correctly we can expect // O(load_factor = 1) insert, access and remove time. // // Micro benchmark on debug build gives about .5 Million/sec rate of insert, // erase and lookup in parallel (total of about 1.5 Million ops/sec). If the // blocks were of 4K, the hash table can support a virtual throughput of // 6 GB/s. // // T Object type (contains both key and value) // Hash Function that returns an hash from type T // Equal Returns if two objects are equal // (We need explicit equal for pointer type) // template <class T, class Hash, class Equal> class HashTable { public: explicit HashTable(const size_t capacity = 1024 * 1024, const float load_factor = 2.0, const uint32_t nlocks = 256) : nbuckets_( static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)), nlocks_(nlocks) { // pre-conditions assert(capacity); assert(load_factor); assert(nbuckets_); assert(nlocks_); buckets_.reset(new Bucket[nbuckets_]); #ifdef OS_LINUX mlock(buckets_.get(), nbuckets_ * sizeof(Bucket)); #endif // initialize locks locks_.reset(new port::RWMutex[nlocks_]); #ifdef OS_LINUX mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex)); #endif // post-conditions assert(buckets_); assert(locks_); } virtual ~HashTable() { AssertEmptyBuckets(); } // // Insert given record to hash table // bool Insert(const T& t) { const uint64_t h = Hash()(t); const uint32_t bucket_idx = h % nbuckets_; const uint32_t lock_idx = bucket_idx % nlocks_; WriteLock _(&locks_[lock_idx]); auto& bucket = buckets_[bucket_idx]; return Insert(&bucket, t); } // Lookup hash table // // Please note that read lock should be held by the caller. This is because // the caller owns the data, and should hold the read lock as long as he // operates on the data. bool Find(const T& t, T* ret, port::RWMutex** ret_lock) { const uint64_t h = Hash()(t); const uint32_t bucket_idx = h % nbuckets_; const uint32_t lock_idx = bucket_idx % nlocks_; port::RWMutex& lock = locks_[lock_idx]; lock.ReadLock(); auto& bucket = buckets_[bucket_idx]; if (Find(&bucket, t, ret)) { *ret_lock = &lock; return true; } lock.ReadUnlock(); return false; } // // Erase a given key from the hash table // bool Erase(const T& t, T* ret) { const uint64_t h = Hash()(t); const uint32_t bucket_idx = h % nbuckets_; const uint32_t lock_idx = bucket_idx % nlocks_; WriteLock _(&locks_[lock_idx]); auto& bucket = buckets_[bucket_idx]; return Erase(&bucket, t, ret); } // Fetch the mutex associated with a key // This call is used to hold the lock for a given data for extended period of // time. port::RWMutex* GetMutex(const T& t) { const uint64_t h = Hash()(t); const uint32_t bucket_idx = h % nbuckets_; const uint32_t lock_idx = bucket_idx % nlocks_; return &locks_[lock_idx]; } void Clear(void (*fn)(T)) { for (uint32_t i = 0; i < nbuckets_; ++i) { const uint32_t lock_idx = i % nlocks_; WriteLock _(&locks_[lock_idx]); for (auto& t : buckets_[i].list_) { (*fn)(t); } buckets_[i].list_.clear(); } } protected: // Models bucket of keys that hash to the same bucket number struct Bucket { std::list<T> list_; }; // Substitute for std::find with custom comparator operator typename std::list<T>::iterator Find(std::list<T>* list, const T& t) { for (auto it = list->begin(); it != list->end(); ++it) { if (Equal()(*it, t)) { return it; } } return list->end(); } bool Insert(Bucket* bucket, const T& t) { // Check if the key already exists auto it = Find(&bucket->list_, t); if (it != bucket->list_.end()) { return false; } // insert to bucket bucket->list_.push_back(t); return true; } bool Find(Bucket* bucket, const T& t, T* ret) { auto it = Find(&bucket->list_, t); if (it != bucket->list_.end()) { if (ret) { *ret = *it; } return true; } return false; } bool Erase(Bucket* bucket, const T& t, T* ret) { auto it = Find(&bucket->list_, t); if (it != bucket->list_.end()) { if (ret) { *ret = *it; } bucket->list_.erase(it); return true; } return false; } // assert that all buckets are empty void AssertEmptyBuckets() { #ifndef NDEBUG for (size_t i = 0; i < nbuckets_; ++i) { WriteLock _(&locks_[i % nlocks_]); assert(buckets_[i].list_.empty()); } #endif } const uint32_t nbuckets_; // No. of buckets in the spine std::unique_ptr<Bucket[]> buckets_; // Spine of the hash buckets const uint32_t nlocks_; // No. of locks std::unique_ptr<port::RWMutex[]> locks_; // Granular locks }; } // namespace rocksdb #endif