Modified the LRU cache eviction code so that it doesn't evict blocks which have external references

Summary:
Currently, blocks which have more than one reference (ie referenced by something other than cache itself) are evicted from cache. This doesn't make much sense:
- blocks are still in RAM, so the RAM usage reported by the cache is incorrect
- if the same block is needed by another iterator, it will be loaded and decompressed again

This diff changes the reference counting scheme a bit. Previously, if the cache contained the block, this was accounted for in its refcount. After this change, the refcount is only used to track external references. There is a boolean flag which indicates whether or not the block is contained in the cache.
This diff also changes how LRU list is used. Previously, both hashtable and the LRU list contained all blocks. After this change, the LRU list contains blocks with the refcount==0, ie those which can be evicted from the cache.

Note that this change still allows the cache to grow beyond its capacity. This happens when all blocks are pinned (i.e., refcount > 0). This is consistent with the current behavior. The cache's insert function never fails. I spent lots of time trying to make table_reader and other places work with an insert that might fail. It turned out to be pretty hard. It might really destabilize some customers, so finally, I decided against doing this.

The table_cache_remove_scan_count_limit option will be unneeded after this change, but I will remove it in a following diff, if this one gets approved.

Test Plan: Ran tests, made sure they pass

Reviewers: sdong, ljin

Differential Revision: https://reviews.facebook.net/D25503
This commit is contained in:
Alexey Maykov 2014-10-21 11:49:13 -07:00
parent 0ab0242f37
commit ee95cae9a4
3 changed files with 176 additions and 135 deletions

View File

@ -1,5 +1,8 @@
# Rocksdb Change Log # Rocksdb Change Log
### Unreleased Features
* Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted
### 3.9.0 (12/8/2014) ### 3.9.0 (12/8/2014)
### New Features ### New Features

View File

@ -26,8 +26,27 @@ namespace {
// LRU cache implementation // LRU cache implementation
// An entry is a variable length heap-allocated structure. Entries // An entry is a variable length heap-allocated structure.
// are kept in a circular doubly linked list ordered by access time. // Entries are referenced by cache and/or by any external entity.
// The cache keeps all its entries in table. Some elements
// are also stored on LRU list.
//
// LRUHandle can be in these states:
// 1. Referenced externally AND in hash table.
// In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
// 2. Not referenced externally and in hash table. In that case the entry is
// in the LRU and can be freed. (refs == 1 && in_cache == true)
// 3. Referenced externally and not in hash table. In that case the entry is
// not on the LRU list and not in the table. (refs >= 1 && in_cache == false)
//
// All newly created LRUHandles are in state 1. If you call LRUCache::Release
// on entry in state 1, it will go into state 2. To move from state 1 to
// state 3, either call LRUCache::Erase or LRUCache::Insert with the same key.
// To move from state 2 to state 1, use LRUCache::Lookup.
// Before destruction, make sure that no handles are in state 1. This means
// that any successful LRUCache::Lookup/LRUCache::Insert have a matching
// LRUCache::Release (to move into state 2) or LRUCache::Erase (for state 3)
struct LRUHandle { struct LRUHandle {
void* value; void* value;
void (*deleter)(const Slice&, void* value); void (*deleter)(const Slice&, void* value);
@ -36,7 +55,9 @@ struct LRUHandle {
LRUHandle* prev; LRUHandle* prev;
size_t charge; // TODO(opt): Only allow uint32_t? size_t charge; // TODO(opt): Only allow uint32_t?
size_t key_length; size_t key_length;
uint32_t refs; uint32_t refs; // a number of refs to this entry
// cache itself is counted as 1
bool in_cache; // true, if this entry is referenced by the hash table
uint32_t hash; // Hash of key(); used for fast sharding and comparisons uint32_t hash; // Hash of key(); used for fast sharding and comparisons
char key_data[1]; // Beginning of key char key_data[1]; // Beginning of key
@ -49,6 +70,12 @@ struct LRUHandle {
return Slice(key_data, key_length); return Slice(key_data, key_length);
} }
} }
void Free() {
assert((refs == 1 && in_cache) || (refs == 0 && !in_cache));
(*deleter)(key(), value);
free(this);
}
}; };
// We provide our own simple hash table since it removes a whole bunch // We provide our own simple hash table since it removes a whole bunch
@ -59,7 +86,28 @@ struct LRUHandle {
class HandleTable { class HandleTable {
public: public:
HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); } HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); }
~HandleTable() { delete[] list_; }
template <typename T>
void ApplyToAllCacheEntries(T func) {
for (uint32_t i = 0; i < length_; i++) {
LRUHandle* h = list_[i];
while (h != nullptr) {
auto n = h->next_hash;
assert(h->in_cache);
func(h);
h = n;
}
}
}
~HandleTable() {
ApplyToAllCacheEntries([](LRUHandle* h) {
if (h->refs == 1) {
h->Free();
}
});
delete[] list_;
}
LRUHandle* Lookup(const Slice& key, uint32_t hash) { LRUHandle* Lookup(const Slice& key, uint32_t hash) {
return *FindPointer(key, hash); return *FindPointer(key, hash);
@ -173,8 +221,6 @@ class LRUCache {
// Just reduce the reference count by 1. // Just reduce the reference count by 1.
// Return true if last reference // Return true if last reference
bool Unref(LRUHandle* e); bool Unref(LRUHandle* e);
// Call deleter and free
void FreeEntry(LRUHandle* e);
// Initialized before use. // Initialized before use.
size_t capacity_; size_t capacity_;
@ -188,6 +234,7 @@ class LRUCache {
// Dummy head of LRU list. // Dummy head of LRU list.
// lru.prev is newest entry, lru.next is oldest entry. // lru.prev is newest entry, lru.next is oldest entry.
// LRU contains items which can be evicted, i.e., referenced only by the cache
LRUHandle lru_; LRUHandle lru_;
HandleTable table_; HandleTable table_;
@ -200,16 +247,7 @@ LRUCache::LRUCache()
lru_.prev = &lru_; lru_.prev = &lru_;
} }
LRUCache::~LRUCache() { LRUCache::~LRUCache() {}
for (LRUHandle* e = lru_.next; e != &lru_; ) {
LRUHandle* next = e->next;
assert(e->refs == 1); // Error if caller has an unreleased handle
if (Unref(e)) {
FreeEntry(e);
}
e = next;
}
}
bool LRUCache::Unref(LRUHandle* e) { bool LRUCache::Unref(LRUHandle* e) {
assert(e->refs > 0); assert(e->refs > 0);
@ -217,47 +255,48 @@ bool LRUCache::Unref(LRUHandle* e) {
return e->refs == 0; return e->refs == 0;
} }
void LRUCache::FreeEntry(LRUHandle* e) { // Call deleter and free
assert(e->refs == 0);
(*e->deleter)(e->key(), e->value);
free(e);
}
void LRUCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t), void LRUCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
bool thread_safe) { bool thread_safe) {
if (thread_safe) { if (thread_safe) {
mutex_.Lock(); mutex_.Lock();
} }
for (auto e = lru_.next; e != &lru_; e = e->next) { table_.ApplyToAllCacheEntries([callback](LRUHandle* h) {
callback(e->value, e->charge); callback(h->value, h->charge);
} });
if (thread_safe) { if (thread_safe) {
mutex_.Unlock(); mutex_.Unlock();
} }
} }
void LRUCache::LRU_Remove(LRUHandle* e) { void LRUCache::LRU_Remove(LRUHandle* e) {
assert(e->next != nullptr);
assert(e->prev != nullptr);
e->next->prev = e->prev; e->next->prev = e->prev;
e->prev->next = e->next; e->prev->next = e->next;
usage_ -= e->charge; e->prev = e->next = nullptr;
} }
void LRUCache::LRU_Append(LRUHandle* e) { void LRUCache::LRU_Append(LRUHandle* e) {
// Make "e" newest entry by inserting just before lru_ // Make "e" newest entry by inserting just before lru_
assert(e->next == nullptr);
assert(e->prev == nullptr);
e->next = &lru_; e->next = &lru_;
e->prev = lru_.prev; e->prev = lru_.prev;
e->prev->next = e; e->prev->next = e;
e->next->prev = e; e->next->prev = e;
usage_ += e->charge;
} }
Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
LRUHandle* e = table_.Lookup(key, hash); LRUHandle* e = table_.Lookup(key, hash);
if (e != nullptr) { if (e != nullptr) {
assert(e->in_cache);
if (e->refs == 1) {
LRU_Remove(e);
}
e->refs++; e->refs++;
LRU_Remove(e);
LRU_Append(e);
} }
return reinterpret_cast<Cache::Handle*>(e); return reinterpret_cast<Cache::Handle*>(e);
} }
@ -268,9 +307,31 @@ void LRUCache::Release(Cache::Handle* handle) {
{ {
MutexLock l(&mutex_); MutexLock l(&mutex_);
last_reference = Unref(e); last_reference = Unref(e);
if (last_reference) {
usage_ -= e->charge;
}
if (e->refs == 1 && e->in_cache) {
// The item is still in cache, and nobody else holds a reference to it
if (usage_ > capacity_) {
// the cache is full
// The LRU list must be empty since the cache is full
assert(lru_.next == &lru_);
// take this opportunity and remove the item
table_.Remove(e->key(), e->hash);
e->in_cache = false;
Unref(e);
usage_ -= e->charge;
last_reference = true;
} else {
// put the item on the list to be potentially freed
LRU_Append(e);
}
}
} }
// free outside of mutex
if (last_reference) { if (last_reference) {
FreeEntry(e); e->Free();
} }
} }
@ -278,8 +339,11 @@ Cache::Handle* LRUCache::Insert(
const Slice& key, uint32_t hash, void* value, size_t charge, const Slice& key, uint32_t hash, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) { void (*deleter)(const Slice& key, void* value)) {
LRUHandle* e = reinterpret_cast<LRUHandle*>( // Allocate the memory here outside of the mutex
malloc(sizeof(LRUHandle)-1 + key.size())); // If the cache is full, we'll have to release it
// It shouldn't happen very often though.
LRUHandle* e =
reinterpret_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
autovector<LRUHandle*> last_reference_list; autovector<LRUHandle*> last_reference_list;
e->value = value; e->value = value;
@ -288,47 +352,40 @@ Cache::Handle* LRUCache::Insert(
e->key_length = key.size(); e->key_length = key.size();
e->hash = hash; e->hash = hash;
e->refs = 2; // One from LRUCache, one for the returned handle e->refs = 2; // One from LRUCache, one for the returned handle
e->next = e->prev = nullptr;
e->in_cache = true;
memcpy(e->key_data, key.data(), key.size()); memcpy(e->key_data, key.data(), key.size());
{ {
MutexLock l(&mutex_); MutexLock l(&mutex_);
LRU_Append(e);
LRUHandle* old = table_.Insert(e);
if (old != nullptr) {
LRU_Remove(old);
if (Unref(old)) {
last_reference_list.push_back(old);
}
}
if (remove_scan_count_limit_ > 0) {
// Try to free the space by evicting the entries that are only
// referenced by the cache first.
LRUHandle* cur = lru_.next;
for (unsigned int scanCount = 0;
usage_ > capacity_ && cur != &lru_
&& scanCount < remove_scan_count_limit_; scanCount++) {
LRUHandle* next = cur->next;
if (cur->refs <= 1) {
LRU_Remove(cur);
table_.Remove(cur->key(), cur->hash);
if (Unref(cur)) {
last_reference_list.push_back(cur);
}
}
cur = next;
}
}
// Free the space following strict LRU policy until enough space // Free the space following strict LRU policy until enough space
// is freed. // is freed or the lru list is empty
while (usage_ > capacity_ && lru_.next != &lru_) { while (usage_ + charge > capacity_ && lru_.next != &lru_) {
old = lru_.next; LRUHandle* old = lru_.next;
assert(old->in_cache);
assert(old->refs ==
1); // LRU list contains elements which may be evicted
LRU_Remove(old); LRU_Remove(old);
table_.Remove(old->key(), old->hash); table_.Remove(old->key(), old->hash);
old->in_cache = false;
Unref(old);
usage_ -= old->charge;
last_reference_list.push_back(old);
}
// insert into the cache
// note that the cache might get larger than its capacity if not enough
// space was freed
LRUHandle* old = table_.Insert(e);
usage_ += e->charge;
if (old != nullptr) {
old->in_cache = false;
if (Unref(old)) { if (Unref(old)) {
usage_ -= old->charge;
// old is on LRU because it's in cache and its reference count
// was just 1 (Unref returned 0)
LRU_Remove(old);
last_reference_list.push_back(old); last_reference_list.push_back(old);
} }
} }
@ -337,7 +394,7 @@ Cache::Handle* LRUCache::Insert(
// we free the entries here outside of mutex for // we free the entries here outside of mutex for
// performance reasons // performance reasons
for (auto entry : last_reference_list) { for (auto entry : last_reference_list) {
FreeEntry(entry); entry->Free();
} }
return reinterpret_cast<Cache::Handle*>(e); return reinterpret_cast<Cache::Handle*>(e);
@ -350,14 +407,21 @@ void LRUCache::Erase(const Slice& key, uint32_t hash) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
e = table_.Remove(key, hash); e = table_.Remove(key, hash);
if (e != nullptr) { if (e != nullptr) {
LRU_Remove(e);
last_reference = Unref(e); last_reference = Unref(e);
if (last_reference) {
usage_ -= e->charge;
}
if (last_reference && e->in_cache) {
LRU_Remove(e);
}
e->in_cache = false;
} }
} }
// mutex not held here // mutex not held here
// last_reference will only be true if e != nullptr // last_reference will only be true if e != nullptr
if (last_reference) { if (last_reference) {
FreeEntry(e); e->Free();
} }
} }

View File

@ -190,25 +190,30 @@ TEST(CacheTest, EntriesArePinned) {
Insert(100, 101); Insert(100, 101);
Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
ASSERT_EQ(1, cache_->GetUsage());
Insert(100, 102); Insert(100, 102);
Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
ASSERT_EQ(0U, deleted_keys_.size()); ASSERT_EQ(0U, deleted_keys_.size());
ASSERT_EQ(2, cache_->GetUsage());
cache_->Release(h1); cache_->Release(h1);
ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]); ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(101, deleted_values_[0]); ASSERT_EQ(101, deleted_values_[0]);
ASSERT_EQ(1, cache_->GetUsage());
Erase(100); Erase(100);
ASSERT_EQ(-1, Lookup(100)); ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(1, cache_->GetUsage());
cache_->Release(h2); cache_->Release(h2);
ASSERT_EQ(2U, deleted_keys_.size()); ASSERT_EQ(2U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[1]); ASSERT_EQ(100, deleted_keys_[1]);
ASSERT_EQ(102, deleted_values_[1]); ASSERT_EQ(102, deleted_values_[1]);
ASSERT_EQ(0, cache_->GetUsage());
} }
TEST(CacheTest, EvictionPolicy) { TEST(CacheTest, EvictionPolicy) {
@ -273,76 +278,28 @@ TEST(CacheTest, EvictionPolicyRef) {
cache_->Release(h204); cache_->Release(h204);
} }
TEST(CacheTest, EvictionPolicyRef2) { TEST(CacheTest, ErasedHandleState) {
std::vector<Cache::Handle*> handles; // insert a key and get two handles
Insert(100, 1000);
Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(h1, h2);
ASSERT_EQ(DecodeValue(cache_->Value(h1)), 1000);
ASSERT_EQ(DecodeValue(cache_->Value(h2)), 1000);
Insert(100, 101); // delete the key from the cache
// Insert entries much more than Cache capacity Erase(100);
for (int i = 0; i < kCacheSize + 100; i++) { // can no longer find in the cache
Insert(1000 + i, 2000 + i);
if (i < kCacheSize ) {
handles.push_back(cache_->Lookup(EncodeKey(1000 + i)));
}
}
// Make sure referenced keys are also possible to be deleted
// if there are not sufficient non-referenced keys
for (int i = 0; i < 5; i++) {
ASSERT_EQ(-1, Lookup(1000 + i));
}
for (int i = kCacheSize; i < kCacheSize + 100; i++) {
ASSERT_EQ(2000 + i, Lookup(1000 + i));
}
ASSERT_EQ(-1, Lookup(100)); ASSERT_EQ(-1, Lookup(100));
// Cleaning up all the handles // release one handle
while (handles.size() > 0) { cache_->Release(h1);
cache_->Release(handles.back()); // still can't find in cache
handles.pop_back(); ASSERT_EQ(-1, Lookup(100));
}
cache_->Release(h2);
} }
TEST(CacheTest, EvictionPolicyRefLargeScanLimit) {
std::vector<Cache::Handle*> handles2;
// Cache2 has a cache RemoveScanCountLimit higher than cache size
// so it would trigger a boundary condition.
// Populate the cache with 10 more keys than its size.
// Reference all keys except one close to the end.
for (int i = 0; i < kCacheSize2 + 10; i++) {
Insert2(1000 + i, 2000+i);
if (i != kCacheSize2 ) {
handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i)));
}
}
// Make sure referenced keys are also possible to be deleted
// if there are not sufficient non-referenced keys
for (int i = 0; i < 3; i++) {
ASSERT_EQ(-1, Lookup2(1000 + i));
}
// The non-referenced value is deleted even if it's accessed
// recently.
ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2));
// Other values recently accessed are not deleted since they
// are referenced.
for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) {
if (i != kCacheSize2) {
ASSERT_EQ(2000 + i, Lookup2(1000 + i));
}
}
// Cleaning up all the handles
while (handles2.size() > 0) {
cache2_->Release(handles2.back());
handles2.pop_back();
}
}
TEST(CacheTest, HeavyEntries) { TEST(CacheTest, HeavyEntries) {
// Add a bunch of light and heavy entries and then count the combined // Add a bunch of light and heavy entries and then count the combined
// size of items still in the cache, which must be approximately the // size of items still in the cache, which must be approximately the
@ -392,7 +349,7 @@ void deleter(const Slice& key, void* value) {
} }
} // namespace } // namespace
TEST(CacheTest, BadEviction) { TEST(CacheTest, OverCapacity) {
int n = 10; int n = 10;
// a LRUCache with n entries and one shard only // a LRUCache with n entries and one shard only
@ -411,15 +368,32 @@ TEST(CacheTest, BadEviction) {
std::string key = ToString(i+1); std::string key = ToString(i+1);
auto h = cache->Lookup(key); auto h = cache->Lookup(key);
std::cout << key << (h?" found\n":" not found\n"); std::cout << key << (h?" found\n":" not found\n");
// Only the first entry should be missing ASSERT_TRUE(h != nullptr);
ASSERT_TRUE(h || i == 0);
if (h) cache->Release(h); if (h) cache->Release(h);
} }
// the cache is over capacity since nothing could be evicted
ASSERT_EQ(n + 1, cache->GetUsage());
for (int i = 0; i < n+1; i++) { for (int i = 0; i < n+1; i++) {
cache->Release(handles[i]); cache->Release(handles[i]);
} }
std::cout << "Poor entries\n";
// cache is under capacity now since elements were released
ASSERT_EQ(n, cache->GetUsage());
// element 0 is evicted and the rest is there
// This is consistent with the LRU policy since the element 0
// was released first
for (int i = 0; i < n+1; i++) {
std::string key = ToString(i+1);
auto h = cache->Lookup(key);
if (h) {
ASSERT_NE(i, 0);
cache->Release(h);
} else {
ASSERT_EQ(i, 0);
}
}
} }
namespace { namespace {