Modified the LRU cache eviction code so that it doesn't evict blocks which have external references

Summary:
Currently, blocks which have more than one reference (ie referenced by something other than cache itself) are evicted from cache. This doesn't make much sense:
- blocks are still in RAM, so the RAM usage reported by the cache is incorrect
- if the same block is needed by another iterator, it will be loaded and decompressed again

This diff changes the reference counting scheme a bit. Previously, if the cache contained the block, this was accounted for in its refcount. After this change, the refcount is only used to track external references. There is a boolean flag which indicates whether or not the block is contained in the cache.
This diff also changes how LRU list is used. Previously, both hashtable and the LRU list contained all blocks. After this change, the LRU list contains blocks with the refcount==0, ie those which can be evicted from the cache.

Note that this change still allows the cache to grow beyond its capacity. This happens when all blocks are pinned (i.e., refcount > 0). This is consistent with the current behavior. The cache's insert function never fails. I spent lots of time trying to make table_reader and other places work with an insert that might fail. It turned out to be pretty hard. It might really destabilize some customers, so finally, I decided against doing this.

The table_cache_remove_scan_count_limit option will be unneeded after this change, but I will remove it in a following diff, if this one gets approved.

Test Plan: Ran tests, made sure they pass

Reviewers: sdong, ljin

Differential Revision: https://reviews.facebook.net/D25503
This commit is contained in:
Alexey Maykov 2014-10-21 11:49:13 -07:00
parent 0ab0242f37
commit ee95cae9a4
3 changed files with 176 additions and 135 deletions

View File

@ -1,5 +1,8 @@
# Rocksdb Change Log # Rocksdb Change Log
### Unreleased Features
* Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted
### 3.9.0 (12/8/2014) ### 3.9.0 (12/8/2014)
### New Features ### New Features

View File

@ -26,8 +26,27 @@ namespace {
// LRU cache implementation // LRU cache implementation
// An entry is a variable length heap-allocated structure. Entries // An entry is a variable length heap-allocated structure.
// are kept in a circular doubly linked list ordered by access time. // Entries are referenced by cache and/or by any external entity.
// The cache keeps all its entries in table. Some elements
// are also stored on LRU list.
//
// LRUHandle can be in these states:
// 1. Referenced externally AND in hash table.
// In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
// 2. Not referenced externally and in hash table. In that case the entry is
// in the LRU and can be freed. (refs == 1 && in_cache == true)
// 3. Referenced externally and not in hash table. In that case the entry is
// not on the LRU list and not in the table. (refs >= 1 && in_cache == false)
//
// All newly created LRUHandles are in state 1. If you call LRUCache::Release
// on entry in state 1, it will go into state 2. To move from state 1 to
// state 3, either call LRUCache::Erase or LRUCache::Insert with the same key.
// To move from state 2 to state 1, use LRUCache::Lookup.
// Before destruction, make sure that no handles are in state 1. This means
// that any successful LRUCache::Lookup/LRUCache::Insert have a matching
// LRUCache::Release (to move into state 2) or LRUCache::Erase (for state 3)
struct LRUHandle { struct LRUHandle {
void* value; void* value;
void (*deleter)(const Slice&, void* value); void (*deleter)(const Slice&, void* value);
@ -36,7 +55,9 @@ struct LRUHandle {
LRUHandle* prev; LRUHandle* prev;
size_t charge; // TODO(opt): Only allow uint32_t? size_t charge; // TODO(opt): Only allow uint32_t?
size_t key_length; size_t key_length;
uint32_t refs; uint32_t refs; // a number of refs to this entry
// cache itself is counted as 1
bool in_cache; // true, if this entry is referenced by the hash table
uint32_t hash; // Hash of key(); used for fast sharding and comparisons uint32_t hash; // Hash of key(); used for fast sharding and comparisons
char key_data[1]; // Beginning of key char key_data[1]; // Beginning of key
@ -49,6 +70,12 @@ struct LRUHandle {
return Slice(key_data, key_length); return Slice(key_data, key_length);
} }
} }
void Free() {
assert((refs == 1 && in_cache) || (refs == 0 && !in_cache));
(*deleter)(key(), value);
free(this);
}
}; };
// We provide our own simple hash table since it removes a whole bunch // We provide our own simple hash table since it removes a whole bunch
@ -59,7 +86,28 @@ struct LRUHandle {
class HandleTable { class HandleTable {
public: public:
HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); } HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); }
~HandleTable() { delete[] list_; }
template <typename T>
void ApplyToAllCacheEntries(T func) {
for (uint32_t i = 0; i < length_; i++) {
LRUHandle* h = list_[i];
while (h != nullptr) {
auto n = h->next_hash;
assert(h->in_cache);
func(h);
h = n;
}
}
}
~HandleTable() {
ApplyToAllCacheEntries([](LRUHandle* h) {
if (h->refs == 1) {
h->Free();
}
});
delete[] list_;
}
LRUHandle* Lookup(const Slice& key, uint32_t hash) { LRUHandle* Lookup(const Slice& key, uint32_t hash) {
return *FindPointer(key, hash); return *FindPointer(key, hash);
@ -173,8 +221,6 @@ class LRUCache {
// Just reduce the reference count by 1. // Just reduce the reference count by 1.
// Return true if last reference // Return true if last reference
bool Unref(LRUHandle* e); bool Unref(LRUHandle* e);
// Call deleter and free
void FreeEntry(LRUHandle* e);
// Initialized before use. // Initialized before use.
size_t capacity_; size_t capacity_;
@ -188,6 +234,7 @@ class LRUCache {
// Dummy head of LRU list. // Dummy head of LRU list.
// lru.prev is newest entry, lru.next is oldest entry. // lru.prev is newest entry, lru.next is oldest entry.
// LRU contains items which can be evicted, i.e., referenced only by the cache
LRUHandle lru_; LRUHandle lru_;
HandleTable table_; HandleTable table_;
@ -200,16 +247,7 @@ LRUCache::LRUCache()
lru_.prev = &lru_; lru_.prev = &lru_;
} }
LRUCache::~LRUCache() { LRUCache::~LRUCache() {}
for (LRUHandle* e = lru_.next; e != &lru_; ) {
LRUHandle* next = e->next;
assert(e->refs == 1); // Error if caller has an unreleased handle
if (Unref(e)) {
FreeEntry(e);
}
e = next;
}
}
bool LRUCache::Unref(LRUHandle* e) { bool LRUCache::Unref(LRUHandle* e) {
assert(e->refs > 0); assert(e->refs > 0);
@ -217,47 +255,48 @@ bool LRUCache::Unref(LRUHandle* e) {
return e->refs == 0; return e->refs == 0;
} }
void LRUCache::FreeEntry(LRUHandle* e) { // Call deleter and free
assert(e->refs == 0);
(*e->deleter)(e->key(), e->value);
free(e);
}
void LRUCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t), void LRUCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
bool thread_safe) { bool thread_safe) {
if (thread_safe) { if (thread_safe) {
mutex_.Lock(); mutex_.Lock();
} }
for (auto e = lru_.next; e != &lru_; e = e->next) { table_.ApplyToAllCacheEntries([callback](LRUHandle* h) {
callback(e->value, e->charge); callback(h->value, h->charge);
} });
if (thread_safe) { if (thread_safe) {
mutex_.Unlock(); mutex_.Unlock();
} }
} }
void LRUCache::LRU_Remove(LRUHandle* e) { void LRUCache::LRU_Remove(LRUHandle* e) {
assert(e->next != nullptr);
assert(e->prev != nullptr);
e->next->prev = e->prev; e->next->prev = e->prev;
e->prev->next = e->next; e->prev->next = e->next;
usage_ -= e->charge; e->prev = e->next = nullptr;
} }
void LRUCache::LRU_Append(LRUHandle* e) { void LRUCache::LRU_Append(LRUHandle* e) {
// Make "e" newest entry by inserting just before lru_ // Make "e" newest entry by inserting just before lru_
assert(e->next == nullptr);
assert(e->prev == nullptr);
e->next = &lru_; e->next = &lru_;
e->prev = lru_.prev; e->prev = lru_.prev;
e->prev->next = e; e->prev->next = e;
e->next->prev = e; e->next->prev = e;
usage_ += e->charge;
} }
Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
LRUHandle* e = table_.Lookup(key, hash); LRUHandle* e = table_.Lookup(key, hash);
if (e != nullptr) { if (e != nullptr) {
assert(e->in_cache);
if (e->refs == 1) {
LRU_Remove(e);
}
e->refs++; e->refs++;
LRU_Remove(e);
LRU_Append(e);
} }
return reinterpret_cast<Cache::Handle*>(e); return reinterpret_cast<Cache::Handle*>(e);
} }
@ -268,9 +307,31 @@ void LRUCache::Release(Cache::Handle* handle) {
{ {
MutexLock l(&mutex_); MutexLock l(&mutex_);
last_reference = Unref(e); last_reference = Unref(e);
if (last_reference) {
usage_ -= e->charge;
}
if (e->refs == 1 && e->in_cache) {
// The item is still in cache, and nobody else holds a reference to it
if (usage_ > capacity_) {
// the cache is full
// The LRU list must be empty since the cache is full
assert(lru_.next == &lru_);
// take this opportunity and remove the item
table_.Remove(e->key(), e->hash);
e->in_cache = false;
Unref(e);
usage_ -= e->charge;
last_reference = true;
} else {
// put the item on the list to be potentially freed
LRU_Append(e);
}
}
} }
// free outside of mutex
if (last_reference) { if (last_reference) {
FreeEntry(e); e->Free();
} }
} }
@ -278,8 +339,11 @@ Cache::Handle* LRUCache::Insert(
const Slice& key, uint32_t hash, void* value, size_t charge, const Slice& key, uint32_t hash, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) { void (*deleter)(const Slice& key, void* value)) {
LRUHandle* e = reinterpret_cast<LRUHandle*>( // Allocate the memory here outside of the mutex
malloc(sizeof(LRUHandle)-1 + key.size())); // If the cache is full, we'll have to release it
// It shouldn't happen very often though.
LRUHandle* e =
reinterpret_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
autovector<LRUHandle*> last_reference_list; autovector<LRUHandle*> last_reference_list;
e->value = value; e->value = value;
@ -288,47 +352,40 @@ Cache::Handle* LRUCache::Insert(
e->key_length = key.size(); e->key_length = key.size();
e->hash = hash; e->hash = hash;
e->refs = 2; // One from LRUCache, one for the returned handle e->refs = 2; // One from LRUCache, one for the returned handle
e->next = e->prev = nullptr;
e->in_cache = true;
memcpy(e->key_data, key.data(), key.size()); memcpy(e->key_data, key.data(), key.size());
{ {
MutexLock l(&mutex_); MutexLock l(&mutex_);
LRU_Append(e);
LRUHandle* old = table_.Insert(e);
if (old != nullptr) {
LRU_Remove(old);
if (Unref(old)) {
last_reference_list.push_back(old);
}
}
if (remove_scan_count_limit_ > 0) {
// Try to free the space by evicting the entries that are only
// referenced by the cache first.
LRUHandle* cur = lru_.next;
for (unsigned int scanCount = 0;
usage_ > capacity_ && cur != &lru_
&& scanCount < remove_scan_count_limit_; scanCount++) {
LRUHandle* next = cur->next;
if (cur->refs <= 1) {
LRU_Remove(cur);
table_.Remove(cur->key(), cur->hash);
if (Unref(cur)) {
last_reference_list.push_back(cur);
}
}
cur = next;
}
}
// Free the space following strict LRU policy until enough space // Free the space following strict LRU policy until enough space
// is freed. // is freed or the lru list is empty
while (usage_ > capacity_ && lru_.next != &lru_) { while (usage_ + charge > capacity_ && lru_.next != &lru_) {
old = lru_.next; LRUHandle* old = lru_.next;
assert(old->in_cache);
assert(old->refs ==
1); // LRU list contains elements which may be evicted
LRU_Remove(old); LRU_Remove(old);
table_.Remove(old->key(), old->hash); table_.Remove(old->key(), old->hash);
old->in_cache = false;
Unref(old);
usage_ -= old->charge;
last_reference_list.push_back(old);
}
// insert into the cache
// note that the cache might get larger than its capacity if not enough
// space was freed
LRUHandle* old = table_.Insert(e);
usage_ += e->charge;
if (old != nullptr) {
old->in_cache = false;
if (Unref(old)) { if (Unref(old)) {
usage_ -= old->charge;
// old is on LRU because it's in cache and its reference count
// was just 1 (Unref returned 0)
LRU_Remove(old);
last_reference_list.push_back(old); last_reference_list.push_back(old);
} }
} }
@ -337,7 +394,7 @@ Cache::Handle* LRUCache::Insert(
// we free the entries here outside of mutex for // we free the entries here outside of mutex for
// performance reasons // performance reasons
for (auto entry : last_reference_list) { for (auto entry : last_reference_list) {
FreeEntry(entry); entry->Free();
} }
return reinterpret_cast<Cache::Handle*>(e); return reinterpret_cast<Cache::Handle*>(e);
@ -350,14 +407,21 @@ void LRUCache::Erase(const Slice& key, uint32_t hash) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
e = table_.Remove(key, hash); e = table_.Remove(key, hash);
if (e != nullptr) { if (e != nullptr) {
LRU_Remove(e);
last_reference = Unref(e); last_reference = Unref(e);
if (last_reference) {
usage_ -= e->charge;
}
if (last_reference && e->in_cache) {
LRU_Remove(e);
}
e->in_cache = false;
} }
} }
// mutex not held here // mutex not held here
// last_reference will only be true if e != nullptr // last_reference will only be true if e != nullptr
if (last_reference) { if (last_reference) {
FreeEntry(e); e->Free();
} }
} }

View File

@ -190,25 +190,30 @@ TEST(CacheTest, EntriesArePinned) {
Insert(100, 101); Insert(100, 101);
Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
ASSERT_EQ(1, cache_->GetUsage());
Insert(100, 102); Insert(100, 102);
Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
ASSERT_EQ(0U, deleted_keys_.size()); ASSERT_EQ(0U, deleted_keys_.size());
ASSERT_EQ(2, cache_->GetUsage());
cache_->Release(h1); cache_->Release(h1);
ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]); ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(101, deleted_values_[0]); ASSERT_EQ(101, deleted_values_[0]);
ASSERT_EQ(1, cache_->GetUsage());
Erase(100); Erase(100);
ASSERT_EQ(-1, Lookup(100)); ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(1, cache_->GetUsage());
cache_->Release(h2); cache_->Release(h2);
ASSERT_EQ(2U, deleted_keys_.size()); ASSERT_EQ(2U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[1]); ASSERT_EQ(100, deleted_keys_[1]);
ASSERT_EQ(102, deleted_values_[1]); ASSERT_EQ(102, deleted_values_[1]);
ASSERT_EQ(0, cache_->GetUsage());
} }
TEST(CacheTest, EvictionPolicy) { TEST(CacheTest, EvictionPolicy) {
@ -273,76 +278,28 @@ TEST(CacheTest, EvictionPolicyRef) {
cache_->Release(h204); cache_->Release(h204);
} }
TEST(CacheTest, EvictionPolicyRef2) { TEST(CacheTest, ErasedHandleState) {
std::vector<Cache::Handle*> handles; // insert a key and get two handles
Insert(100, 1000);
Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(h1, h2);
ASSERT_EQ(DecodeValue(cache_->Value(h1)), 1000);
ASSERT_EQ(DecodeValue(cache_->Value(h2)), 1000);
Insert(100, 101); // delete the key from the cache
// Insert entries much more than Cache capacity Erase(100);
for (int i = 0; i < kCacheSize + 100; i++) { // can no longer find in the cache
Insert(1000 + i, 2000 + i);
if (i < kCacheSize ) {
handles.push_back(cache_->Lookup(EncodeKey(1000 + i)));
}
}
// Make sure referenced keys are also possible to be deleted
// if there are not sufficient non-referenced keys
for (int i = 0; i < 5; i++) {
ASSERT_EQ(-1, Lookup(1000 + i));
}
for (int i = kCacheSize; i < kCacheSize + 100; i++) {
ASSERT_EQ(2000 + i, Lookup(1000 + i));
}
ASSERT_EQ(-1, Lookup(100)); ASSERT_EQ(-1, Lookup(100));
// Cleaning up all the handles // release one handle
while (handles.size() > 0) { cache_->Release(h1);
cache_->Release(handles.back()); // still can't find in cache
handles.pop_back(); ASSERT_EQ(-1, Lookup(100));
}
cache_->Release(h2);
} }
TEST(CacheTest, EvictionPolicyRefLargeScanLimit) {
std::vector<Cache::Handle*> handles2;
// Cache2 has a cache RemoveScanCountLimit higher than cache size
// so it would trigger a boundary condition.
// Populate the cache with 10 more keys than its size.
// Reference all keys except one close to the end.
for (int i = 0; i < kCacheSize2 + 10; i++) {
Insert2(1000 + i, 2000+i);
if (i != kCacheSize2 ) {
handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i)));
}
}
// Make sure referenced keys are also possible to be deleted
// if there are not sufficient non-referenced keys
for (int i = 0; i < 3; i++) {
ASSERT_EQ(-1, Lookup2(1000 + i));
}
// The non-referenced value is deleted even if it's accessed
// recently.
ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2));
// Other values recently accessed are not deleted since they
// are referenced.
for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) {
if (i != kCacheSize2) {
ASSERT_EQ(2000 + i, Lookup2(1000 + i));
}
}
// Cleaning up all the handles
while (handles2.size() > 0) {
cache2_->Release(handles2.back());
handles2.pop_back();
}
}
TEST(CacheTest, HeavyEntries) { TEST(CacheTest, HeavyEntries) {
// Add a bunch of light and heavy entries and then count the combined // Add a bunch of light and heavy entries and then count the combined
// size of items still in the cache, which must be approximately the // size of items still in the cache, which must be approximately the
@ -392,7 +349,7 @@ void deleter(const Slice& key, void* value) {
} }
} // namespace } // namespace
TEST(CacheTest, BadEviction) { TEST(CacheTest, OverCapacity) {
int n = 10; int n = 10;
// a LRUCache with n entries and one shard only // a LRUCache with n entries and one shard only
@ -411,15 +368,32 @@ TEST(CacheTest, BadEviction) {
std::string key = ToString(i+1); std::string key = ToString(i+1);
auto h = cache->Lookup(key); auto h = cache->Lookup(key);
std::cout << key << (h?" found\n":" not found\n"); std::cout << key << (h?" found\n":" not found\n");
// Only the first entry should be missing ASSERT_TRUE(h != nullptr);
ASSERT_TRUE(h || i == 0);
if (h) cache->Release(h); if (h) cache->Release(h);
} }
// the cache is over capacity since nothing could be evicted
ASSERT_EQ(n + 1, cache->GetUsage());
for (int i = 0; i < n+1; i++) { for (int i = 0; i < n+1; i++) {
cache->Release(handles[i]); cache->Release(handles[i]);
} }
std::cout << "Poor entries\n";
// cache is under capacity now since elements were released
ASSERT_EQ(n, cache->GetUsage());
// element 0 is evicted and the rest is there
// This is consistent with the LRU policy since the element 0
// was released first
for (int i = 0; i < n+1; i++) {
std::string key = ToString(i+1);
auto h = cache->Lookup(key);
if (h) {
ASSERT_NE(i, 0);
cache->Release(h);
} else {
ASSERT_EQ(i, 0);
}
}
} }
namespace { namespace {