Charge block cache for cache internal usage (#5797)

Summary:
For our default block cache, each additional entry has extra memory overhead: an LRUHandle (currently 72 bytes) plus the cache key (two varint64s, file id and offset). This usage is not negligible. For example, with block_size=4k the overhead accounts for an extra ~2% of cache memory usage (roughly 72 bytes of handle plus a ~16-byte key against a 4096-byte block). This patch charges the cache for that extra usage, reducing untracked memory usage outside the block cache. The feature is enabled by default and can be disabled by passing kDontChargeCacheMetadata to the cache constructor.
This PR builds on https://github.com/facebook/rocksdb/issues/4258
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5797
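
A minimal sketch of opting out, using the metadata_charge_policy field that this PR adds to LRUCacheOptions (the capacity value is an arbitrary example):

  LRUCacheOptions co;
  co.capacity = 8 << 20;  // 8 MB
  // Charge only the values themselves, not the per-entry
  // LRUHandle/key metadata, i.e. the pre-patch accounting.
  co.metadata_charge_policy = kDontChargeCacheMetadata;
  std::shared_ptr<Cache> cache = NewLRUCache(co);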

Test Plan:
- Existing tests are updated either to disable the feature, when a test depends too heavily on the old way of accounting usage, or to increase the cache capacity to account for the additional charge of metadata.
- The Usage tests in cache_test.cc are augmented to test the cache usage under kFullChargeCacheMetadata.

Differential Revision: D17396833

Pulled By: maysamyabandeh

fbshipit-source-id: 7684ccb9f8a40ca595e4f5efcdb03623afea0c6f
Maysam Yabandeh 2019-09-16 15:14:51 -07:00 committed by Facebook Github Bot
parent 94d62d771e
commit 638d239507
25 changed files with 289 additions and 120 deletions

HISTORY.md

@@ -11,6 +11,7 @@
* When user uses options.force_consistency_check in RocksDB, instead of crashing the process, we now pass the error back to the user without killing the process.
* Add an option `memtable_insert_hint_per_batch` to WriteOptions. If it is true, each WriteBatch will maintain its own insert hints for each memtable in concurrent write. See include/rocksdb/options.h for more details.
* The `sst_dump` command line tool `recompress` command now displays how many blocks were compressed and how many were not, in particular how many were not compressed because the compression ratio was not met (12.5% threshold for GoodCompressionRatio), as seen in the `number.block.not_compressed` counter stat since version 6.0.0.
* Block cache usage now takes into account the per-entry metadata overhead. This results in more accurate accounting of memory. A side effect of this feature is that fewer items fit into a block cache of the same size, which can raise the cache miss rate. This can be remedied by increasing the block cache size, or by passing kDontChargeCacheMetadata to its constructor to restore the old behavior.
### Public API Change
* Added max_write_buffer_size_to_maintain option to better control memory usage of immutable memtables.
* Added a lightweight API GetCurrentWalFile() to get last live WAL filename and size. Meant to be used as a helper for backup/restore tooling in a larger ecosystem such as MySQL with a MyRocks storage engine.
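
The arithmetic behind the block cache metadata entry above, as a rough sketch (the 72-byte handle size is quoted from this PR's summary; the key size is an estimate for two varint64s):

  // Approximate per-entry metadata overhead relative to a 4 KB block:
  size_t handle_bytes = 72;    // sizeof(LRUHandle), per the PR summary
  size_t key_bytes = 2 * 10;   // two varint64s, at most 10 bytes each
  double extra = double(handle_bytes + key_bytes) / 4096;  // ~2.2%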

cache/cache_test.cc

@@ -86,14 +86,22 @@ class CacheTest : public testing::TestWithParam<std::string> {
return nullptr;
}
std::shared_ptr<Cache> NewCache(size_t capacity, int num_shard_bits,
bool strict_capacity_limit) {
std::shared_ptr<Cache> NewCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) {
auto type = GetParam();
if (type == kLRU) {
return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, 0.0);
LRUCacheOptions co;
co.capacity = capacity;
co.num_shard_bits = num_shard_bits;
co.strict_capacity_limit = strict_capacity_limit;
co.high_pri_pool_ratio = 0;
co.metadata_charge_policy = charge_policy;
return NewLRUCache(co);
}
if (type == kClock) {
return NewClockCache(capacity, num_shard_bits, strict_capacity_limit);
return NewClockCache(capacity, num_shard_bits, strict_capacity_limit,
charge_policy);
}
return nullptr;
}
@@ -143,10 +151,15 @@ class CacheTest : public testing::TestWithParam<std::string> {
};
CacheTest* CacheTest::current_;
class LRUCacheTest : public CacheTest {};
TEST_P(CacheTest, UsageTest) {
// cache is std::shared_ptr and will be automatically cleaned up.
const uint64_t kCapacity = 100000;
auto cache = NewCache(kCapacity, 8, false);
auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata);
ASSERT_EQ(0, cache->GetUsage());
ASSERT_EQ(0, precise_cache->GetUsage());
size_t usage = 0;
char value[10] = "abcdef";
@@ -155,31 +168,45 @@ TEST_P(CacheTest, UsageTest) {
std::string key(i, 'a');
auto kv_size = key.size() + 5;
cache->Insert(key, reinterpret_cast<void*>(value), kv_size, dumbDeleter);
precise_cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
dumbDeleter);
usage += kv_size;
ASSERT_EQ(usage, cache->GetUsage());
ASSERT_LT(usage, precise_cache->GetUsage());
}
cache->EraseUnRefEntries();
precise_cache->EraseUnRefEntries();
ASSERT_EQ(0, cache->GetUsage());
ASSERT_EQ(0, precise_cache->GetUsage());
// make sure the cache will be overloaded
for (uint64_t i = 1; i < kCapacity; ++i) {
auto key = ToString(i);
cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
dumbDeleter);
precise_cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
dumbDeleter);
}
// the usage should be close to the capacity
ASSERT_GT(kCapacity, cache->GetUsage());
ASSERT_GT(kCapacity, precise_cache->GetUsage());
ASSERT_LT(kCapacity * 0.95, cache->GetUsage());
ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage());
}
TEST_P(CacheTest, PinnedUsageTest) {
// cache is std::shared_ptr and will be automatically cleaned up.
const uint64_t kCapacity = 100000;
auto cache = NewCache(kCapacity, 8, false);
const uint64_t kCapacity = 200000;
auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata);
size_t pinned_usage = 0;
char value[10] = "abcdef";
std::forward_list<Cache::Handle*> unreleased_handles;
std::forward_list<Cache::Handle*> unreleased_handles_in_precise_cache;
// Add entries. Unpin some of them after insertion. Then, pin some of them
// again. Check GetPinnedUsage().
@@ -187,40 +214,72 @@ TEST_P(CacheTest, PinnedUsageTest) {
std::string key(i, 'a');
auto kv_size = key.size() + 5;
Cache::Handle* handle;
Cache::Handle* handle_in_precise_cache;
cache->Insert(key, reinterpret_cast<void*>(value), kv_size, dumbDeleter,
&handle);
assert(handle);
precise_cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
dumbDeleter, &handle_in_precise_cache);
assert(handle_in_precise_cache);
pinned_usage += kv_size;
ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
if (i % 2 == 0) {
cache->Release(handle);
precise_cache->Release(handle_in_precise_cache);
pinned_usage -= kv_size;
ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
} else {
unreleased_handles.push_front(handle);
unreleased_handles_in_precise_cache.push_front(handle_in_precise_cache);
}
if (i % 3 == 0) {
unreleased_handles.push_front(cache->Lookup(key));
auto x = precise_cache->Lookup(key);
assert(x);
unreleased_handles_in_precise_cache.push_front(x);
// If i % 2 == 0, then the entry was unpinned before Lookup, so pinned
// usage increased
if (i % 2 == 0) {
pinned_usage += kv_size;
}
ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
}
}
auto precise_cache_pinned_usage = precise_cache->GetPinnedUsage();
ASSERT_LT(pinned_usage, precise_cache_pinned_usage);
// check that overloading the cache does not change the pinned usage
for (uint64_t i = 1; i < 2 * kCapacity; ++i) {
auto key = ToString(i);
cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
dumbDeleter);
precise_cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
dumbDeleter);
}
ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
cache->EraseUnRefEntries();
precise_cache->EraseUnRefEntries();
ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
// release handles for pinned entries to prevent memory leaks
for (auto handle : unreleased_handles) {
cache->Release(handle);
}
for (auto handle : unreleased_handles_in_precise_cache) {
precise_cache->Release(handle);
}
ASSERT_EQ(0, cache->GetPinnedUsage());
ASSERT_EQ(0, precise_cache->GetPinnedUsage());
cache->EraseUnRefEntries();
precise_cache->EraseUnRefEntries();
ASSERT_EQ(0, cache->GetUsage());
ASSERT_EQ(0, precise_cache->GetUsage());
}
TEST_P(CacheTest, HitAndMiss) {
@@ -550,10 +609,10 @@ TEST_P(CacheTest, SetCapacity) {
}
}
TEST_P(CacheTest, SetStrictCapacityLimit) {
TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
// test1: set the flag to false. Insert more keys than capacity. See if they
// all go through.
std::shared_ptr<Cache> cache = NewLRUCache(5, 0, false);
std::shared_ptr<Cache> cache = NewCache(5, 0, false);
std::vector<Cache::Handle*> handles(10);
Status s;
for (size_t i = 0; i < 10; i++) {
@@ -579,7 +638,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) {
}
// test3: init with flag being true.
std::shared_ptr<Cache> cache2 = NewLRUCache(5, 0, true);
std::shared_ptr<Cache> cache2 = NewCache(5, 0, true);
for (size_t i = 0; i < 5; i++) {
std::string key = ToString(i + 1);
s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
@@ -697,13 +756,14 @@ TEST_P(CacheTest, GetCharge) {
}
#ifdef SUPPORT_CLOCK_CACHE
std::shared_ptr<Cache> (*new_clock_cache_func)(size_t, int,
bool) = NewClockCache;
std::shared_ptr<Cache> (*new_clock_cache_func)(
size_t, int, bool, CacheMetadataChargePolicy) = NewClockCache;
INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
testing::Values(kLRU, kClock));
#else
INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU));
#endif // SUPPORT_CLOCK_CACHE
INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU));
} // namespace rocksdb

cache/clock_cache.cc

@@ -13,8 +13,9 @@
namespace rocksdb {
std::shared_ptr<Cache> NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/,
bool /*strict_capacity_limit*/) {
std::shared_ptr<Cache> NewClockCache(
size_t /*capacity*/, int /*num_shard_bits*/, bool /*strict_capacity_limit*/,
CacheMetadataChargePolicy /*metadata_charge_policy*/) {
// Clock cache not supported.
return nullptr;
}
@@ -35,6 +36,7 @@ std::shared_ptr<Cache> NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/
#include "tbb/concurrent_hash_map.h"
#include "cache/sharded_cache.h"
#include "port/malloc.h"
#include "port/port.h"
#include "util/autovector.h"
#include "util/mutexlock.h"
@@ -202,6 +204,27 @@ struct CacheHandle {
deleter = a.deleter;
return *this;
}
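// Compute the total charge recorded against the cache for this entry: the
// caller-supplied charge plus, under kFullChargeCacheMetadata, the memory
// used by the handle itself and its separately allocated key.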
inline static size_t CalcTotalCharge(
Slice key, size_t charge,
CacheMetadataChargePolicy metadata_charge_policy) {
size_t meta_charge = 0;
if (metadata_charge_policy == kFullChargeCacheMetadata) {
meta_charge += sizeof(CacheHandle);
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
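// The key was copied into its own heap buffer; charge the allocator's
// reported usable size for it.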
meta_charge +=
malloc_usable_size(static_cast<void*>(const_cast<char*>(key.data())));
#else
meta_charge += key.size();
#endif
}
return charge + meta_charge;
}
inline size_t CalcTotalCharge(
CacheMetadataChargePolicy metadata_charge_policy) {
return CalcTotalCharge(key, charge, metadata_charge_policy);
}
};
// Key of hash map. We store hash value with the key for convenience.
@@ -404,11 +427,12 @@ void ClockCacheShard::RecycleHandle(CacheHandle* handle,
assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0);
context->to_delete_key.push_back(handle->key.data());
context->to_delete_value.emplace_back(*handle);
size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
handle->key.clear();
handle->value = nullptr;
handle->deleter = nullptr;
recycle_.push_back(handle);
usage_.fetch_sub(handle->charge, std::memory_order_relaxed);
usage_.fetch_sub(total_charge, std::memory_order_relaxed);
}
void ClockCacheShard::Cleanup(const CleanupContext& context) {
@@ -434,7 +458,8 @@ bool ClockCacheShard::Ref(Cache::Handle* h) {
std::memory_order_relaxed)) {
if (CountRefs(flags) == 0) {
// No reference count before the operation.
pinned_usage_.fetch_add(handle->charge, std::memory_order_relaxed);
size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed);
}
return true;
}
@@ -454,7 +479,8 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage,
assert(CountRefs(flags) > 0);
if (CountRefs(flags) == 1) {
// this is the last reference.
pinned_usage_.fetch_sub(handle->charge, std::memory_order_relaxed);
size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
// Cleanup if it is the last reference.
if (!InCache(flags)) {
MutexLock l(&mutex_);
@@ -539,8 +565,10 @@ CacheHandle* ClockCacheShard::Insert(
const Slice& key, uint32_t hash, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value), bool hold_reference,
CleanupContext* context) {
size_t total_charge =
CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_);
MutexLock l(&mutex_);
bool success = EvictFromCache(charge, context);
bool success = EvictFromCache(total_charge, context);
bool strict = strict_capacity_limit_.load(std::memory_order_relaxed);
if (!success && (strict || !hold_reference)) {
context->to_delete_key.push_back(key.data());
@@ -575,9 +603,9 @@ CacheHandle* ClockCacheShard::Insert(
}
table_.insert(HashTable::value_type(CacheKey(key, hash), handle));
if (hold_reference) {
pinned_usage_.fetch_add(charge, std::memory_order_relaxed);
pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed);
}
usage_.fetch_add(charge, std::memory_order_relaxed);
usage_.fetch_add(total_charge, std::memory_order_relaxed);
return handle;
}
@@ -674,10 +702,14 @@ void ClockCacheShard::EraseUnRefEntries() {
class ClockCache final : public ShardedCache {
public:
ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit)
ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy)
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
int num_shards = 1 << num_shard_bits;
shards_ = new ClockCacheShard[num_shards];
for (int i = 0; i < num_shards; i++) {
shards_[i].set_metadata_charge_policy(metadata_charge_policy);
}
SetCapacity(capacity);
SetStrictCapacityLimit(strict_capacity_limit);
}
@@ -714,13 +746,14 @@ class ClockCache final : public ShardedCache {
} // end anonymous namespace
std::shared_ptr<Cache> NewClockCache(size_t capacity, int num_shard_bits,
bool strict_capacity_limit) {
std::shared_ptr<Cache> NewClockCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy) {
if (num_shard_bits < 0) {
num_shard_bits = GetDefaultCacheShardBits(capacity);
}
return std::make_shared<ClockCache>(capacity, num_shard_bits,
strict_capacity_limit);
return std::make_shared<ClockCache>(
capacity, num_shard_bits, strict_capacity_limit, metadata_charge_policy);
}
} // namespace rocksdb

cache/lru_cache.cc

@@ -97,7 +97,8 @@ void LRUHandleTable::Resize() {
LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio,
bool use_adaptive_mutex)
bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy)
: capacity_(0),
high_pri_pool_usage_(0),
strict_capacity_limit_(strict_capacity_limit),
@@ -106,6 +107,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
usage_(0),
lru_usage_(0),
mutex_(use_adaptive_mutex) {
set_metadata_charge_policy(metadata_charge_policy);
// Make empty circular linked list
lru_.next = &lru_;
lru_.prev = &lru_;
@@ -124,7 +126,9 @@ void LRUCacheShard::EraseUnRefEntries() {
LRU_Remove(old);
table_.Remove(old->key(), old->hash);
old->SetInCache(false);
usage_ -= old->charge;
size_t total_charge = old->CalcTotalCharge(metadata_charge_policy_);
assert(usage_ >= total_charge);
usage_ -= total_charge;
last_reference_list.push_back(old);
}
}
@@ -180,16 +184,19 @@ void LRUCacheShard::LRU_Remove(LRUHandle* e) {
e->next->prev = e->prev;
e->prev->next = e->next;
e->prev = e->next = nullptr;
lru_usage_ -= e->charge;
size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
assert(lru_usage_ >= total_charge);
lru_usage_ -= total_charge;
if (e->InHighPriPool()) {
assert(high_pri_pool_usage_ >= e->charge);
high_pri_pool_usage_ -= e->charge;
assert(high_pri_pool_usage_ >= total_charge);
high_pri_pool_usage_ -= total_charge;
}
}
void LRUCacheShard::LRU_Insert(LRUHandle* e) {
assert(e->next == nullptr);
assert(e->prev == nullptr);
size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) {
// Insert "e" to head of LRU list.
e->next = &lru_;
@@ -197,7 +204,7 @@ void LRUCacheShard::LRU_Insert(LRUHandle* e) {
e->prev->next = e;
e->next->prev = e;
e->SetInHighPriPool(true);
high_pri_pool_usage_ += e->charge;
high_pri_pool_usage_ += total_charge;
MaintainPoolSize();
} else {
// Insert "e" to the head of low-pri pool. Note that when
@@ -209,7 +216,7 @@ void LRUCacheShard::LRU_Insert(LRUHandle* e) {
e->SetInHighPriPool(false);
lru_low_pri_ = e;
}
lru_usage_ += e->charge;
lru_usage_ += total_charge;
}
void LRUCacheShard::MaintainPoolSize() {
@@ -218,6 +225,7 @@ void LRUCacheShard::MaintainPoolSize() {
lru_low_pri_ = lru_low_pri_->next;
assert(lru_low_pri_ != &lru_);
lru_low_pri_->SetInHighPriPool(false);
assert(high_pri_pool_usage_ >= lru_low_pri_->charge);
high_pri_pool_usage_ -= lru_low_pri_->charge;
}
}
@@ -231,7 +239,9 @@ void LRUCacheShard::EvictFromLRU(size_t charge,
LRU_Remove(old);
table_.Remove(old->key(), old->hash);
old->SetInCache(false);
usage_ -= old->charge;
size_t old_total_charge = old->CalcTotalCharge(metadata_charge_policy_);
assert(usage_ >= old_total_charge);
usage_ -= old_total_charge;
deleted->push_back(old);
}
}
@@ -311,7 +321,9 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) {
}
}
if (last_reference) {
usage_ -= e->charge;
size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
assert(usage_ >= total_charge);
usage_ -= total_charge;
}
}
@@ -345,15 +357,16 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
e->SetInCache(true);
e->SetPriority(priority);
memcpy(e->key_data, key.data(), key.size());
size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
{
MutexLock l(&mutex_);
// Free the space following strict LRU policy until enough space
// is freed or the lru list is empty
EvictFromLRU(charge, &last_reference_list);
EvictFromLRU(total_charge, &last_reference_list);
if ((usage_ + charge) > capacity_ &&
if ((usage_ + total_charge) > capacity_ &&
(strict_capacity_limit_ || handle == nullptr)) {
if (handle == nullptr) {
// Don't insert the entry but still return ok, as if the entry inserted
@@ -369,14 +382,17 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
// Insert into the cache. Note that the cache might get larger than its
// capacity if not enough space was freed up.
LRUHandle* old = table_.Insert(e);
usage_ += e->charge;
usage_ += total_charge;
if (old != nullptr) {
assert(old->InCache());
old->SetInCache(false);
if (!old->HasRefs()) {
// old is on LRU because it's in cache and its reference count is 0
LRU_Remove(old);
usage_ -= old->charge;
size_t old_total_charge =
old->CalcTotalCharge(metadata_charge_policy_);
assert(usage_ >= old_total_charge);
usage_ -= old_total_charge;
last_reference_list.push_back(old);
}
}
@@ -409,7 +425,9 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
if (!e->HasRefs()) {
// The entry is in LRU since it's in hash and has no external references
LRU_Remove(e);
usage_ -= e->charge;
size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
assert(usage_ >= total_charge);
usage_ -= total_charge;
last_reference = true;
}
}
@@ -447,7 +465,8 @@ std::string LRUCacheShard::GetPrintableOptions() const {
LRUCache::LRUCache(size_t capacity, int num_shard_bits,
bool strict_capacity_limit, double high_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> allocator,
bool use_adaptive_mutex)
bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy)
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
std::move(allocator)) {
num_shards_ = 1 << num_shard_bits;
@@ -457,7 +476,7 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits,
for (int i = 0; i < num_shards_; i++) {
new (&shards_[i])
LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio,
use_adaptive_mutex);
use_adaptive_mutex, metadata_charge_policy);
}
}
@@ -526,15 +545,15 @@ std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits,
cache_opts.strict_capacity_limit,
cache_opts.high_pri_pool_ratio,
cache_opts.memory_allocator,
cache_opts.use_adaptive_mutex);
cache_opts.memory_allocator, cache_opts.use_adaptive_mutex,
cache_opts.metadata_charge_policy);
}
std::shared_ptr<Cache> NewLRUCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator,
bool use_adaptive_mutex) {
std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy) {
if (num_shard_bits >= 20) {
return nullptr; // the cache cannot be sharded into too many fine pieces
}
@@ -545,10 +564,9 @@ std::shared_ptr<Cache> NewLRUCache(
if (num_shard_bits < 0) {
num_shard_bits = GetDefaultCacheShardBits(capacity);
}
return std::make_shared<LRUCache>(capacity, num_shard_bits,
strict_capacity_limit, high_pri_pool_ratio,
std::move(memory_allocator),
use_adaptive_mutex);
return std::make_shared<LRUCache>(
capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy);
}
} // namespace rocksdb

cache/lru_cache.h

@@ -12,6 +12,7 @@
#include "cache/sharded_cache.h"
#include "port/malloc.h"
#include "port/port.h"
#include "util/autovector.h"
@@ -128,6 +129,22 @@ struct LRUHandle {
}
delete[] reinterpret_cast<char*>(this);
}
// Calculate the total charge, including the memory usage of metadata
inline size_t CalcTotalCharge(
CacheMetadataChargePolicy metadata_charge_policy) {
assert(key_length);
size_t meta_charge = 0;
if (metadata_charge_policy == kFullChargeCacheMetadata) {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
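// The handle and its inlined key share a single char[] allocation (see
// sizeof(LRUHandle) - 1 + key_length below), so malloc_usable_size(this)
// covers both.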
meta_charge += malloc_usable_size(static_cast<void*>(this));
#else
// This is the size that is used when a new handle is created
meta_charge += sizeof(LRUHandle) - 1 + key_length;
#endif
}
return charge + meta_charge;
}
};
// We provide our own simple hash table since it removes a whole bunch
@@ -176,7 +193,8 @@ class LRUHandleTable {
class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
public:
LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio, bool use_adaptive_mutex);
double high_pri_pool_ratio, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy);
virtual ~LRUCacheShard() override = default;
// Separate from constructor so caller can easily make an array of LRUCache
@@ -297,7 +315,9 @@ class LRUCache
LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
bool use_adaptive_mutex = kDefaultToAdaptiveMutex);
bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
CacheMetadataChargePolicy metadata_charge_policy =
kDontChargeCacheMetadata);
virtual ~LRUCache();
virtual const char* Name() const override { return "LRUCache"; }
virtual CacheShard* GetShard(int shard) override;

cache/lru_cache_test.cc

@@ -31,7 +31,8 @@ class LRUCacheTest : public testing::Test {
cache_ = reinterpret_cast<LRUCacheShard*>(
port::cacheline_aligned_alloc(sizeof(LRUCacheShard)));
new (cache_) LRUCacheShard(capacity, false /*strict_capacity_limit*/,
high_pri_pool_ratio, use_adaptive_mutex);
high_pri_pool_ratio, use_adaptive_mutex,
kDontChargeCacheMetadata);
}
void Insert(const std::string& key,

cache/sharded_cache.h

@@ -40,6 +40,13 @@ class CacheShard {
bool thread_safe) = 0;
virtual void EraseUnRefEntries() = 0;
virtual std::string GetPrintableOptions() const { return ""; }
void set_metadata_charge_policy(
CacheMetadataChargePolicy metadata_charge_policy) {
metadata_charge_policy_ = metadata_charge_policy;
}
protected:
CacheMetadataChargePolicy metadata_charge_policy_ = kDontChargeCacheMetadata;
};
// Generic cache interface which shards cache by hash of keys. 2^num_shard_bits

db/db_block_cache_test.cc

@@ -380,8 +380,13 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
options.statistics = rocksdb::CreateDBStatistics();
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
LRUCacheOptions co;
// 500 bytes are enough to hold the first two blocks
std::shared_ptr<Cache> cache = NewLRUCache(500, 0, false);
co.capacity = 500;
co.num_shard_bits = 0;
co.strict_capacity_limit = false;
co.metadata_charge_policy = kDontChargeCacheMetadata;
std::shared_ptr<Cache> cache = NewLRUCache(co);
table_options.block_cache = cache;
table_options.filter_policy.reset(NewBloomFilterPolicy(20, true));
options.table_factory.reset(new BlockBasedTableFactory(table_options));

db/db_impl/db_impl.cc

@@ -240,8 +240,11 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
const int table_cache_size = (mutable_db_options_.max_open_files == -1)
? TableCache::kInfiniteCapacity
: mutable_db_options_.max_open_files - 10;
table_cache_ = NewLRUCache(table_cache_size,
immutable_db_options_.table_cache_numshardbits);
LRUCacheOptions co;
co.capacity = table_cache_size;
co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
co.metadata_charge_policy = kDontChargeCacheMetadata;
table_cache_ = NewLRUCache(co);
versions_.reset(new VersionSet(dbname_, &immutable_db_options_, env_options_,
table_cache_.get(), write_buffer_manager_,

db/db_iterator_test.cc

@@ -1070,7 +1070,8 @@ TEST_P(DBIteratorTest, IndexWithFirstKey) {
BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
table_options.flush_block_policy_factory =
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
table_options.block_cache = NewLRUCache(1000); // fits all blocks
table_options.block_cache =
NewLRUCache(8000); // fits all blocks and their cache metadata overhead
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);

db/db_properties_test.cc

@@ -1631,7 +1631,11 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) {
// Test with empty block cache.
constexpr size_t kCapacity = 100;
auto block_cache = NewLRUCache(kCapacity, 0 /*num_shard_bits*/);
LRUCacheOptions co;
co.capacity = kCapacity;
co.num_shard_bits = 0;
co.metadata_charge_policy = kDontChargeCacheMetadata;
auto block_cache = NewLRUCache(co);
table_options.block_cache = block_cache;
table_options.no_block_cache = false;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));

db/db_test2.cc

@@ -3780,7 +3780,7 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
TEST_F(DBTest2, RowCacheSnapshot) {
Options options = CurrentOptions();
options.statistics = rocksdb::CreateDBStatistics();
options.row_cache = NewLRUCache(8192);
options.row_cache = NewLRUCache(8 * 8192);
DestroyAndReopen(options);
ASSERT_OK(Put("foo", "bar1"));

env/env_test.cc

@@ -11,13 +11,6 @@
#include <sys/ioctl.h>
#endif
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include <sys/types.h>
#include <iostream>
@@ -39,6 +32,7 @@
#include "env/env_chroot.h"
#include "logging/log_buffer.h"
#include "port/malloc.h"
#include "port/port.h"
#include "rocksdb/env.h"
#include "test_util/sync_point.h"

include/rocksdb/cache.h

@@ -36,6 +36,13 @@ class Cache;
extern const bool kDefaultToAdaptiveMutex;
enum CacheMetadataChargePolicy {
kDontChargeCacheMetadata,
kFullChargeCacheMetadata
};
const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy =
kFullChargeCacheMetadata;
struct LRUCacheOptions {
// Capacity of the cache.
size_t capacity = 0;
@@ -76,17 +83,23 @@ struct LRUCacheOptions {
// -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise.
bool use_adaptive_mutex = kDefaultToAdaptiveMutex;
CacheMetadataChargePolicy metadata_charge_policy =
kDefaultCacheMetadataChargePolicy;
LRUCacheOptions() {}
LRUCacheOptions(size_t _capacity, int _num_shard_bits,
bool _strict_capacity_limit, double _high_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
bool _use_adaptive_mutex = kDefaultToAdaptiveMutex)
bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
CacheMetadataChargePolicy _metadata_charge_policy =
kDefaultCacheMetadataChargePolicy)
: capacity(_capacity),
num_shard_bits(_num_shard_bits),
strict_capacity_limit(_strict_capacity_limit),
high_pri_pool_ratio(_high_pri_pool_ratio),
memory_allocator(std::move(_memory_allocator)),
use_adaptive_mutex(_use_adaptive_mutex) {}
use_adaptive_mutex(_use_adaptive_mutex),
metadata_charge_policy(_metadata_charge_policy) {}
};
// Create a new cache with a fixed size capacity. The cache is sharded
@@ -101,7 +114,9 @@ extern std::shared_ptr<Cache> NewLRUCache(
size_t capacity, int num_shard_bits = -1,
bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
bool use_adaptive_mutex = kDefaultToAdaptiveMutex);
bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
CacheMetadataChargePolicy metadata_charge_policy =
kDefaultCacheMetadataChargePolicy);
extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts);
@@ -110,10 +125,11 @@ extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts);
// more detail.
//
// Return nullptr if it is not supported.
extern std::shared_ptr<Cache> NewClockCache(size_t capacity,
int num_shard_bits = -1,
bool strict_capacity_limit = false);
extern std::shared_ptr<Cache> NewClockCache(
size_t capacity, int num_shard_bits = -1,
bool strict_capacity_limit = false,
CacheMetadataChargePolicy metadata_charge_policy =
kDefaultCacheMetadataChargePolicy);
class Cache {
public:
// Depending on implementation, cache entries with high priority could be less

memory/arena.cc

@@ -8,18 +8,12 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "memory/arena.h"
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#ifndef OS_WIN
#include <sys/mman.h>
#endif
#include <algorithm>
#include "logging/logging.h"
#include "port/malloc.h"
#include "port/port.h"
#include "rocksdb/env.h"
#include "test_util/sync_point.h"

memtable/write_buffer_manager_test.cc

@@ -51,8 +51,12 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) {
}
TEST_F(WriteBufferManagerTest, CacheCost) {
LRUCacheOptions co;
// 1GB cache
std::shared_ptr<Cache> cache = NewLRUCache(1024 * 1024 * 1024, 4);
co.capacity = 1024 * 1024 * 1024;
co.num_shard_bits = 4;
co.metadata_charge_policy = kDontChargeCacheMetadata;
std::shared_ptr<Cache> cache = NewLRUCache(co);
// A write buffer manager of size 50MB
std::unique_ptr<WriteBufferManager> wbf(
new WriteBufferManager(50 * 1024 * 1024, cache));

port/malloc.h (new file)

@@ -0,0 +1,17 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif // OS_FREEBSD
#endif // ROCKSDB_MALLOC_USABLE_SIZE
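
With this header in place, a file that needs malloc_usable_size() includes "port/malloc.h" once instead of repeating the platform #ifdef block, as the removals in env_test.cc, memory/arena.cc, and the table/block_based files elsewhere in this diff show.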

table/block_based/block.h

@@ -12,16 +12,10 @@
#include <stdint.h>
#include <string>
#include <vector>
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "port/malloc.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

table/block_based/full_filter_block.cc

@@ -6,15 +6,8 @@
#include <array>
#include "table/block_based/full_filter_block.h"
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include "monitoring/perf_context_imp.h"
#include "port/malloc.h"
#include "port/port.h"
#include "rocksdb/filter_policy.h"
#include "table/block_based/block_based_table_reader.h"

table/block_based/partitioned_filter_block.cc

@@ -5,16 +5,10 @@
#include "table/block_based/partitioned_filter_block.h"
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include <utility>
#include "monitoring/perf_context_imp.h"
#include "port/malloc.h"
#include "port/port.h"
#include "rocksdb/filter_policy.h"
#include "table/block_based/block.h"

table/format.h

@@ -10,13 +10,6 @@
#pragma once
#include <stdint.h>
#include <string>
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
#ifdef OS_FREEBSD
#include <malloc_np.h>
#else
#include <malloc.h>
#endif
#endif
#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"
@@ -27,6 +20,7 @@
#include "memory/memory_allocator.h"
#include "options/cf_options.h"
#include "port/malloc.h"
#include "port/port.h" // noexcept
#include "table/persistent_cache_options.h"
#include "util/crc32c.h"

table/table_test.cc

@@ -2599,7 +2599,11 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
// Enable the cache for index/filter blocks
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
table_options.block_cache = NewLRUCache(2048, 2);
LRUCacheOptions co;
co.capacity = 2048;
co.num_shard_bits = 2;
co.metadata_charge_policy = kDontChargeCacheMetadata;
table_options.block_cache = NewLRUCache(co);
table_options.cache_index_and_filter_blocks = true;
options.table_factory.reset(new BlockBasedTableFactory(table_options));
std::vector<std::string> keys;

utilities/simulator_cache/cache_simulator_test.cc

@@ -313,10 +313,13 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) {
get.sst_fd_number = 0;
get.get_from_user_specified_snapshot = Boolean::kFalse;
std::shared_ptr<Cache> sim_cache =
NewLRUCache(/*capacity=*/16, /*num_shard_bits=*/1,
/*strict_capacity_limit=*/false,
/*high_pri_pool_ratio=*/0);
LRUCacheOptions co;
co.capacity = 16;
co.num_shard_bits = 1;
co.strict_capacity_limit = false;
co.high_pri_pool_ratio = 0;
co.metadata_charge_policy = kDontChargeCacheMetadata;
std::shared_ptr<Cache> sim_cache = NewLRUCache(co);
std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
new HybridRowBlockCacheSimulator(
nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true));

utilities/simulator_cache/sim_cache.cc

@@ -331,8 +331,11 @@ class SimCacheImpl : public SimCache {
// For instrumentation purpose, use NewSimCache instead
std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache,
size_t sim_capacity, int num_shard_bits) {
return NewSimCache(NewLRUCache(sim_capacity, num_shard_bits), cache,
num_shard_bits);
LRUCacheOptions co;
co.capacity = sim_capacity;
co.num_shard_bits = num_shard_bits;
co.metadata_charge_policy = kDontChargeCacheMetadata;
return NewSimCache(NewLRUCache(co), cache, num_shard_bits);
}
std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> sim_cache,

utilities/simulator_cache/sim_cache_test.cc

@@ -77,8 +77,12 @@ TEST_F(SimCacheTest, SimCache) {
auto table_options = GetTableOptions();
auto options = GetOptions(table_options);
InitTable(options);
std::shared_ptr<SimCache> simCache =
NewSimCache(NewLRUCache(0, 0, false), 20000, 0);
LRUCacheOptions co;
co.capacity = 0;
co.num_shard_bits = 0;
co.strict_capacity_limit = false;
co.metadata_charge_policy = kDontChargeCacheMetadata;
std::shared_ptr<SimCache> simCache = NewSimCache(NewLRUCache(co), 20000, 0);
table_options.block_cache = simCache;
options.table_factory.reset(new BlockBasedTableFactory(table_options));
Reopen(options);
@@ -142,8 +146,10 @@ TEST_F(SimCacheTest, SimCacheLogging) {
auto table_options = GetTableOptions();
auto options = GetOptions(table_options);
options.disable_auto_compactions = true;
std::shared_ptr<SimCache> sim_cache =
NewSimCache(NewLRUCache(1024 * 1024), 20000, 0);
LRUCacheOptions co;
co.capacity = 1024 * 1024;
co.metadata_charge_policy = kDontChargeCacheMetadata;
std::shared_ptr<SimCache> sim_cache = NewSimCache(NewLRUCache(co), 20000, 0);
table_options.block_cache = sim_cache;
options.table_factory.reset(new BlockBasedTableFactory(table_options));
Reopen(options);