Merge pull request #8213 from anand1976/nvm_cache_initial

Allow cache_bench and db_bench to use 3rd party tiered cache
More build failure fixes
2021-04-28 12:35:02 -07:00 · 2021-04-28 11:01:48 -07:00 · 2021-04-28 10:03:08 -07:00 · 2021-04-27 17:20:52 -07:00 · 2021-04-26 13:35:48 -07:00 · 2021-04-26 12:56:12 -07:00
22 changed files with 1315 additions and 480 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1348,7 +1348,8 @@ if(WITH_BENCHMARK_TOOLS)
    ${ROCKSDB_LIB} ${THIRDPARTY_LIBS})
  add_executable(cache_bench${ARTIFACT_SUFFIX}
-    cache/cache_bench.cc)
+    cache/cache_bench.cc
    cache/cache_bench_tool.cc)
  target_link_libraries(cache_bench${ARTIFACT_SUFFIX}
    ${ROCKSDB_LIB} ${GFLAGS_LIB})
--- a/HISTORY.md
+++ b/HISTORY.md
@ -43,6 +43,7 @@
 * Added BackupEngine::GetBackupInfo / GetLatestBackupInfo for querying individual backups.
 * Made the Ribbon filter a long-term supported feature in terms of the SST schema(compatible with version >= 6.15.0) though the API for enabling it is expected to change.
 ## 6.19.0 (03/21/2021)
 ### Bug Fixes
 * Fixed the truncation error found in APIs/tools when dumping block-based SST files in a human-readable format. After fix, the block-based table can be fully dumped as a readable file.
--- a/7
+++ b/7
@ -491,13 +491,14 @@ VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
 TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST)
 BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES))
 CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES))
 TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES))
 ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES))
 STRESS_OBJECTS =  $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES))
 # Exclude build_version.cc -- a generated source file -- from all sources.  Not needed for dependencies
-ALL_SOURCES  = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc
+ALL_SOURCES  = $(LIB_SOURCES) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc
-ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES)
+ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES)
 ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES)
 TESTS = $(patsubst %.cc, %, $(notdir $(TEST_MAIN_SOURCES)))
@ -1252,7 +1253,7 @@ folly_synchronization_distributed_mutex_test: $(OBJ_DIR)/third-party/folly/folly
 	$(AM_LINK)
 endif
-cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(LIBRARY)
+cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(CACHE_BENCH_OBJECTS) $(LIBRARY)
 	$(AM_LINK)
 persistent_cache_bench: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_bench.o $(LIBRARY)
--- a/15
+++ b/15
@ -794,6 +794,21 @@ cpp_library(
    link_whole = False,
 )
 cpp_library(
    name = "rocksdb_cache_bench_tools_lib",
    srcs = ["cache/cache_bench_tool.cc"],
    auto_headers = AutoHeaders.RECURSIVE_GLOB,
    arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS,
    compiler_flags = ROCKSDB_COMPILER_FLAGS,
    os_deps = ROCKSDB_OS_DEPS,
    os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS,
    preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS,
    include_paths = ROCKSDB_INCLUDE_PATHS,
    deps = [":rocksdb_lib"],
    external_deps = ROCKSDB_EXTERNAL_DEPS,
    link_whole = False,
 )
 cpp_library(
    name = "rocksdb_stress_lib",
    srcs = [
--- a/buckifier/buckify_rocksdb.py
+++ b/buckifier/buckify_rocksdb.py
@ -173,6 +173,11 @@ def generate_targets(repo_path, deps_map):
        src_mk.get("ANALYZER_LIB_SOURCES", []) +
        ["test_util/testutil.cc"],
        [":rocksdb_lib"])
    # rocksdb_cache_bench_tools_lib
    TARGETS.add_library(
        "rocksdb_cache_bench_tools_lib",
        src_mk.get("CACHE_BENCH_LIB_SOURCES", []),
        [":rocksdb_lib"])
    # rocksdb_stress_lib
    TARGETS.add_rocksdb_library(
        "rocksdb_stress_lib",
--- a/cache/cache_bench.cc
+++ b/cache/cache_bench.cc
@ -1,7 +1,11 @@
-//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  Copyright (c) 2013-present, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under both the GPLv2 (found in the
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 //
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #ifndef GFLAGS
 #include <cstdio>
@ -10,375 +14,8 @@ int main() {
  return 1;
 }
 #else
-
+#include <rocksdb/cache_bench_tool.h>
 #include <sys/types.h>
 #include <cinttypes>
 #include <cstdio>
 #include <limits>
 #include "port/port.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/system_clock.h"
 #include "util/coding.h"
 #include "util/gflags_compat.h"
 #include "util/hash.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
 static constexpr uint32_t KiB = uint32_t{1} << 10;
 static constexpr uint32_t MiB = KiB << 10;
 static constexpr uint64_t GiB = MiB << 10;
 DEFINE_uint32(threads, 16, "Number of concurrent threads to run.");
 DEFINE_uint64(cache_size, 1 * GiB,
              "Number of bytes to use as a cache of uncompressed data.");
 DEFINE_uint32(num_shard_bits, 6, "shard_bits.");
 DEFINE_double(resident_ratio, 0.25,
              "Ratio of keys fitting in cache to keyspace.");
 DEFINE_uint64(ops_per_thread, 0,
              "Number of operations per thread. (Default: 5 * keyspace size)");
 DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
 DEFINE_uint32(skew, 5, "Degree of skew in key selection");
 DEFINE_bool(populate_cache, true, "Populate cache before operations");
 DEFINE_uint32(lookup_insert_percent, 87,
              "Ratio of lookup (+ insert on not found) to total workload "
              "(expressed as a percentage)");
 DEFINE_uint32(insert_percent, 2,
              "Ratio of insert to total workload (expressed as a percentage)");
 DEFINE_uint32(lookup_percent, 10,
              "Ratio of lookup to total workload (expressed as a percentage)");
 DEFINE_uint32(erase_percent, 1,
              "Ratio of erase to total workload (expressed as a percentage)");
 DEFINE_bool(use_clock_cache, false, "");
 namespace ROCKSDB_NAMESPACE {
 class CacheBench;
 namespace {
 // State shared by all concurrent executions of the same benchmark.
 class SharedState {
 public:
  explicit SharedState(CacheBench* cache_bench)
      : cv_(&mu_),
        num_initialized_(0),
        start_(false),
        num_done_(0),
        cache_bench_(cache_bench) {}
  ~SharedState() {}
  port::Mutex* GetMutex() {
    return &mu_;
  }
  port::CondVar* GetCondVar() {
    return &cv_;
  }
  CacheBench* GetCacheBench() const {
    return cache_bench_;
  }
  void IncInitialized() {
    num_initialized_++;
  }
  void IncDone() {
    num_done_++;
  }
  bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; }
  bool AllDone() const { return num_done_ >= FLAGS_threads; }
  void SetStart() {
    start_ = true;
  }
  bool Started() const {
    return start_;
  }
 private:
  port::Mutex mu_;
  port::CondVar cv_;
  uint64_t num_initialized_;
  bool start_;
  uint64_t num_done_;
  CacheBench* cache_bench_;
 };
 // Per-thread state for concurrent executions of the same benchmark.
 struct ThreadState {
  uint32_t tid;
  Random64 rnd;
  SharedState* shared;
  ThreadState(uint32_t index, SharedState* _shared)
      : tid(index), rnd(1000 + index), shared(_shared) {}
 };
 struct KeyGen {
  char key_data[27];
  Slice GetRand(Random64& rnd, uint64_t max_key) {
    uint64_t raw = rnd.Next();
    // Skew according to setting
    for (uint32_t i = 0; i < FLAGS_skew; ++i) {
      raw = std::min(raw, rnd.Next());
    }
    uint64_t key = FastRange64(raw, max_key);
    // Variable size and alignment
    size_t off = key % 8;
    key_data[0] = char{42};
    EncodeFixed64(key_data + 1, key);
    key_data[9] = char{11};
    EncodeFixed64(key_data + 10, key);
    key_data[18] = char{4};
    EncodeFixed64(key_data + 19, key);
    return Slice(&key_data[off], sizeof(key_data) - off);
  }
 };
 char* createValue(Random64& rnd) {
  char* rv = new char[FLAGS_value_bytes];
  // Fill with some filler data, and take some CPU time
  for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
    EncodeFixed64(rv + i, rnd.Next());
  }
  return rv;
 }
 void deleter(const Slice& /*key*/, void* value) {
  delete[] static_cast<char*>(value);
 }
 }  // namespace
 class CacheBench {
  static constexpr uint64_t kHundredthUint64 =
      std::numeric_limits<uint64_t>::max() / 100U;
 public:
  CacheBench()
      : max_key_(static_cast<uint64_t>(FLAGS_cache_size / FLAGS_resident_ratio /
                                       FLAGS_value_bytes)),
        lookup_insert_threshold_(kHundredthUint64 *
                                 FLAGS_lookup_insert_percent),
        insert_threshold_(lookup_insert_threshold_ +
                          kHundredthUint64 * FLAGS_insert_percent),
        lookup_threshold_(insert_threshold_ +
                          kHundredthUint64 * FLAGS_lookup_percent),
        erase_threshold_(lookup_threshold_ +
                         kHundredthUint64 * FLAGS_erase_percent) {
    if (erase_threshold_ != 100U * kHundredthUint64) {
      fprintf(stderr, "Percentages must add to 100.\n");
      exit(1);
    }
    if (FLAGS_use_clock_cache) {
      cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits);
      if (!cache_) {
        fprintf(stderr, "Clock cache not supported.\n");
        exit(1);
      }
    } else {
      cache_ = NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits);
    }
    if (FLAGS_ops_per_thread == 0) {
      FLAGS_ops_per_thread = 5 * max_key_;
    }
  }
  ~CacheBench() {}
  void PopulateCache() {
    Random64 rnd(1);
    KeyGen keygen;
    for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
      cache_->Insert(keygen.GetRand(rnd, max_key_), createValue(rnd),
                     FLAGS_value_bytes, &deleter);
    }
  }
  bool Run() {
    ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default();
    const auto& clock = env->GetSystemClock();
    PrintEnv();
    SharedState shared(this);
    std::vector<std::unique_ptr<ThreadState> > threads(FLAGS_threads);
    for (uint32_t i = 0; i < FLAGS_threads; i++) {
      threads[i].reset(new ThreadState(i, &shared));
      env->StartThread(ThreadBody, threads[i].get());
    }
    {
      MutexLock l(shared.GetMutex());
      while (!shared.AllInitialized()) {
        shared.GetCondVar()->Wait();
      }
      // Record start time
      uint64_t start_time = clock->NowMicros();
      // Start all threads
      shared.SetStart();
      shared.GetCondVar()->SignalAll();
      // Wait threads to complete
      while (!shared.AllDone()) {
        shared.GetCondVar()->Wait();
      }
      // Record end time
      uint64_t end_time = clock->NowMicros();
      double elapsed = static_cast<double>(end_time - start_time) * 1e-6;
      uint32_t qps = static_cast<uint32_t>(
          static_cast<double>(FLAGS_threads * FLAGS_ops_per_thread) / elapsed);
      fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps);
    }
    return true;
  }
 private:
  std::shared_ptr<Cache> cache_;
  const uint64_t max_key_;
  // Cumulative thresholds in the space of a random uint64_t
  const uint64_t lookup_insert_threshold_;
  const uint64_t insert_threshold_;
  const uint64_t lookup_threshold_;
  const uint64_t erase_threshold_;
  static void ThreadBody(void* v) {
    ThreadState* thread = static_cast<ThreadState*>(v);
    SharedState* shared = thread->shared;
    {
      MutexLock l(shared->GetMutex());
      shared->IncInitialized();
      if (shared->AllInitialized()) {
        shared->GetCondVar()->SignalAll();
      }
      while (!shared->Started()) {
        shared->GetCondVar()->Wait();
      }
    }
    thread->shared->GetCacheBench()->OperateCache(thread);
    {
      MutexLock l(shared->GetMutex());
      shared->IncDone();
      if (shared->AllDone()) {
        shared->GetCondVar()->SignalAll();
      }
    }
  }
  void OperateCache(ThreadState* thread) {
    // To use looked-up values
    uint64_t result = 0;
    // To hold handles for a non-trivial amount of time
    Cache::Handle* handle = nullptr;
    KeyGen gen;
    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
      Slice key = gen.GetRand(thread->rnd, max_key_);
      uint64_t random_op = thread->rnd.Next();
      if (random_op < lookup_insert_threshold_) {
        if (handle) {
          cache_->Release(handle);
          handle = nullptr;
        }
        // do lookup
        handle = cache_->Lookup(key);
        if (handle) {
          // do something with the data
          result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                             FLAGS_value_bytes);
        } else {
          // do insert
          cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes,
                         &deleter, &handle);
        }
      } else if (random_op < insert_threshold_) {
        if (handle) {
          cache_->Release(handle);
          handle = nullptr;
        }
        // do insert
        cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes,
                       &deleter, &handle);
      } else if (random_op < lookup_threshold_) {
        if (handle) {
          cache_->Release(handle);
          handle = nullptr;
        }
        // do lookup
        handle = cache_->Lookup(key);
        if (handle) {
          // do something with the data
          result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                             FLAGS_value_bytes);
        }
      } else if (random_op < erase_threshold_) {
        // do erase
        cache_->Erase(key);
      } else {
        // Should be extremely unlikely (noop)
        assert(random_op >= kHundredthUint64 * 100U);
      }
    }
    if (handle) {
      cache_->Release(handle);
      handle = nullptr;
    }
  }
  void PrintEnv() const {
    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
    printf("Number of threads   : %u\n", FLAGS_threads);
    printf("Ops per thread      : %" PRIu64 "\n", FLAGS_ops_per_thread);
    printf("Cache size          : %" PRIu64 "\n", FLAGS_cache_size);
    printf("Num shard bits      : %u\n", FLAGS_num_shard_bits);
    printf("Max key             : %" PRIu64 "\n", max_key_);
    printf("Resident ratio      : %g\n", FLAGS_resident_ratio);
    printf("Skew degree         : %u\n", FLAGS_skew);
    printf("Populate cache      : %d\n", int{FLAGS_populate_cache});
    printf("Lookup+Insert pct   : %u%%\n", FLAGS_lookup_insert_percent);
    printf("Insert percentage   : %u%%\n", FLAGS_insert_percent);
    printf("Lookup percentage   : %u%%\n", FLAGS_lookup_percent);
    printf("Erase percentage    : %u%%\n", FLAGS_erase_percent);
    printf("----------------------------\n");
  }
 };
 }  // namespace ROCKSDB_NAMESPACE
 int main(int argc, char** argv) {
-  ParseCommandLineFlags(&argc, &argv, true);
+  return ROCKSDB_NAMESPACE::cache_bench_tool(argc, argv);
  if (FLAGS_threads <= 0) {
    fprintf(stderr, "threads number <= 0\n");
    exit(1);
  }
  ROCKSDB_NAMESPACE::CacheBench bench;
  if (FLAGS_populate_cache) {
    bench.PopulateCache();
    printf("Population complete\n");
    printf("----------------------------\n");
  }
  if (bench.Run()) {
    return 0;
  } else {
    return 1;
  }
 }
 #endif  // GFLAGS
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@ -0,0 +1,504 @@
 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under both the GPLv2 (found in the
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 #ifdef GFLAGS
 #include "rocksdb/cache_bench_tool.h"
 #include <sys/types.h>
 #include <cinttypes>
 #include <cstdio>
 #include <limits>
 #include "options/configurable_helper.h"
 #include "port/port.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/system_clock.h"
 #include "rocksdb/tiered_cache.h"
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/options_type.h"
 #include "util/coding.h"
 #include "util/gflags_compat.h"
 #include "util/hash.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
 #include "util/stop_watch.h"
 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
 static constexpr uint32_t KiB = uint32_t{1} << 10;
 static constexpr uint32_t MiB = KiB << 10;
 static constexpr uint64_t GiB = MiB << 10;
 DEFINE_uint32(threads, 16, "Number of concurrent threads to run.");
 DEFINE_uint64(cache_size, 1 * GiB,
              "Number of bytes to use as a cache of uncompressed data.");
 DEFINE_uint32(num_shard_bits, 6, "shard_bits.");
 DEFINE_double(resident_ratio, 0.25,
              "Ratio of keys fitting in cache to keyspace.");
 DEFINE_uint64(ops_per_thread, 0,
              "Number of operations per thread. (Default: 5 * keyspace size)");
 DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
 DEFINE_uint32(skew, 5, "Degree of skew in key selection");
 DEFINE_bool(populate_cache, true, "Populate cache before operations");
 DEFINE_uint32(lookup_insert_percent, 87,
              "Ratio of lookup (+ insert on not found) to total workload "
              "(expressed as a percentage)");
 DEFINE_uint32(insert_percent, 2,
              "Ratio of insert to total workload (expressed as a percentage)");
 DEFINE_uint32(lookup_percent, 10,
              "Ratio of lookup to total workload (expressed as a percentage)");
 DEFINE_uint32(erase_percent, 1,
              "Ratio of erase to total workload (expressed as a percentage)");
 DEFINE_bool(use_clock_cache, false, "");
 DEFINE_bool(skewed, false, "If true, skew the key access distribution");
 #ifndef ROCKSDB_LITE
 DEFINE_string(tiered_cache_uri, "",
              "Full URI for creating a custom NVM cache object");
 static class std::shared_ptr<ROCKSDB_NAMESPACE::TieredCache> tiered_cache;
 #endif
 namespace ROCKSDB_NAMESPACE {
 class CacheBench;
 namespace {
 // State shared by all concurrent executions of the same benchmark.
 class SharedState {
 public:
  explicit SharedState(CacheBench* cache_bench)
      : cv_(&mu_),
        num_initialized_(0),
        start_(false),
        num_done_(0),
        cache_bench_(cache_bench) {}
  ~SharedState() {}
  port::Mutex* GetMutex() { return &mu_; }
  port::CondVar* GetCondVar() { return &cv_; }
  CacheBench* GetCacheBench() const { return cache_bench_; }
  void IncInitialized() { num_initialized_++; }
  void IncDone() { num_done_++; }
  bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; }
  bool AllDone() const { return num_done_ >= FLAGS_threads; }
  void SetStart() { start_ = true; }
  bool Started() const { return start_; }
 private:
  port::Mutex mu_;
  port::CondVar cv_;
  uint64_t num_initialized_;
  bool start_;
  uint64_t num_done_;
  CacheBench* cache_bench_;
 };
 class Stats {
 private:
  uint64_t hits;
  uint64_t misses;
  uint64_t lookup_us;
  uint64_t insert_us;
 public:
  Stats() : hits(0), misses(0), lookup_us(0), insert_us(0) {}
  void AddHits(uint64_t inc) { hits += inc; }
  void AddMisses(uint64_t inc) { misses += inc; }
  void AddLookupUs(uint64_t lookup) { lookup_us += lookup; }
  void AddInsertUs(uint64_t insert) { insert_us += insert; }
  void Merge(const Stats& other) {
    hits += other.hits;
    misses += other.misses;
    lookup_us += other.lookup_us;
    insert_us += other.insert_us;
  }
  void Report() {
    fprintf(stdout, "%llu hits, %llu misses, %llu lookup_us, %llu insert_us\n",
            (unsigned long long)hits, (unsigned long long)misses,
            (unsigned long long)lookup_us, (unsigned long long)insert_us);
    fflush(stdout);
  }
 };
 // Per-thread state for concurrent executions of the same benchmark.
 struct ThreadState {
  uint32_t tid;
  Random64 rnd;
  Stats stats;
  SharedState* shared;
  ThreadState(uint32_t index, SharedState* _shared)
      : tid(index), rnd(1000 + index), shared(_shared) {}
 };
 struct KeyGen {
  char key_data[27];
  Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) {
    uint64_t key = 0;
    if (!FLAGS_skewed) {
      uint64_t raw = rnd.Next();
      // Skew according to setting
      for (uint32_t i = 0; i < FLAGS_skew; ++i) {
        raw = std::min(raw, rnd.Next());
      }
      key = FastRange64(raw, max_key);
    } else {
      key = rnd.Skewed(max_log);
      if (key > max_key) {
        key -= max_key;
      }
    }
    // Variable size and alignment
    size_t off = key % 8;
    key_data[0] = char{42};
    EncodeFixed64(key_data + 1, key);
    key_data[9] = char{11};
    EncodeFixed64(key_data + 10, key);
    key_data[18] = char{4};
    EncodeFixed64(key_data + 19, key);
    return Slice(&key_data[off], sizeof(key_data) - off);
  }
 };
 char* createValue(Random64& rnd) {
  char* rv = new char[FLAGS_value_bytes];
  // Fill with some filler data, and take some CPU time
  for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
    EncodeFixed64(rv + i, rnd.Next());
  }
  return rv;
 }
 void helperCallback(Cache::SizeCallback* size_cb,
                    Cache::SaveToCallback* save_cb,
                    Cache::DeletionCallback* del_cb) {
  if (size_cb) {
    *size_cb = [](void* /*obj*/) -> size_t { return FLAGS_value_bytes; };
  }
  if (save_cb) {
    *save_cb = [](void* obj, size_t /*offset*/, size_t size,
                  void* out) -> Status {
      memcpy(out, obj, size);
      return Status::OK();
    };
  }
  if (del_cb) {
    *del_cb = [](const Slice& /*key*/, void* obj) -> void {
      delete[] static_cast<char*>(obj);
    };
  }
 }
 }  // namespace
 class CacheBench {
  static constexpr uint64_t kHundredthUint64 =
      std::numeric_limits<uint64_t>::max() / 100U;
 public:
  CacheBench()
      : max_key_(static_cast<uint64_t>(FLAGS_cache_size / FLAGS_resident_ratio /
                                       FLAGS_value_bytes)),
        lookup_insert_threshold_(kHundredthUint64 *
                                 FLAGS_lookup_insert_percent),
        insert_threshold_(lookup_insert_threshold_ +
                          kHundredthUint64 * FLAGS_insert_percent),
        lookup_threshold_(insert_threshold_ +
                          kHundredthUint64 * FLAGS_lookup_percent),
        erase_threshold_(lookup_threshold_ +
                         kHundredthUint64 * FLAGS_erase_percent),
        skewed_(FLAGS_skewed) {
    if (erase_threshold_ != 100U * kHundredthUint64) {
      fprintf(stderr, "Percentages must add to 100.\n");
      exit(1);
    }
    max_log_ = 0;
    if (skewed_) {
      uint64_t max_key = max_key_;
      while (max_key >>= 1) max_log_++;
      if (max_key > (1u << max_log_)) max_log_++;
    }
    if (FLAGS_use_clock_cache) {
      cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits);
      if (!cache_) {
        fprintf(stderr, "Clock cache not supported.\n");
        exit(1);
      }
    } else {
      LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false, 0.5);
 #ifndef ROCKSDB_LITE
      if (!FLAGS_tiered_cache_uri.empty()) {
        Status s = ObjectRegistry::NewInstance()->NewSharedObject<TieredCache>(
            FLAGS_tiered_cache_uri, &tiered_cache);
        if (tiered_cache == nullptr) {
          fprintf(stderr,
                  "No tiered cache registered matching string: %s status=%s\n",
                  FLAGS_tiered_cache_uri.c_str(), s.ToString().c_str());
          exit(1);
        }
        opts.tiered_cache = tiered_cache;
      }
 #endif
      cache_ = NewLRUCache(opts);
    }
    if (FLAGS_ops_per_thread == 0) {
      FLAGS_ops_per_thread = 5 * max_key_;
    }
  }
  ~CacheBench() {}
  void PopulateCache() {
    Random64 rnd(1);
    KeyGen keygen;
    for (uint64_t i = 0; i < 10 * FLAGS_cache_size; i += FLAGS_value_bytes) {
      cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_), createValue(rnd),
                     helperCallback, FLAGS_value_bytes);
    }
  }
  bool Run() {
    ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default();
    const auto& clock = env->GetSystemClock();
    PrintEnv();
    SharedState shared(this);
    std::vector<std::unique_ptr<ThreadState> > threads(FLAGS_threads);
    for (uint32_t i = 0; i < FLAGS_threads; i++) {
      threads[i].reset(new ThreadState(i, &shared));
      env->StartThread(ThreadBody, threads[i].get());
    }
    {
      MutexLock l(shared.GetMutex());
      while (!shared.AllInitialized()) {
        shared.GetCondVar()->Wait();
      }
      // Record start time
      uint64_t start_time = clock->NowMicros();
      // Start all threads
      shared.SetStart();
      shared.GetCondVar()->SignalAll();
      // Wait threads to complete
      while (!shared.AllDone()) {
        shared.GetCondVar()->Wait();
      }
      // Record end time
      uint64_t end_time = clock->NowMicros();
      double elapsed = static_cast<double>(end_time - start_time) * 1e-6;
      uint32_t qps = static_cast<uint32_t>(
          static_cast<double>(FLAGS_threads * FLAGS_ops_per_thread) / elapsed);
      fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps);
      Stats merge_stats;
      for (uint32_t i = 0; i < FLAGS_threads; ++i) {
        merge_stats.Merge(threads[i]->stats);
      }
      merge_stats.Report();
    }
    return true;
  }
 private:
  std::shared_ptr<Cache> cache_;
  const uint64_t max_key_;
  // Cumulative thresholds in the space of a random uint64_t
  const uint64_t lookup_insert_threshold_;
  const uint64_t insert_threshold_;
  const uint64_t lookup_threshold_;
  const uint64_t erase_threshold_;
  const bool skewed_;
  int max_log_;
  static void ThreadBody(void* v) {
    ThreadState* thread = static_cast<ThreadState*>(v);
    SharedState* shared = thread->shared;
    {
      MutexLock l(shared->GetMutex());
      shared->IncInitialized();
      if (shared->AllInitialized()) {
        shared->GetCondVar()->SignalAll();
      }
      while (!shared->Started()) {
        shared->GetCondVar()->Wait();
      }
    }
    thread->shared->GetCacheBench()->OperateCache(thread);
    {
      MutexLock l(shared->GetMutex());
      shared->IncDone();
      if (shared->AllDone()) {
        shared->GetCondVar()->SignalAll();
      }
    }
  }
  void OperateCache(ThreadState* thread) {
    // To use looked-up values
    uint64_t result = 0;
    // To hold handles for a non-trivial amount of time
    Cache::Handle* handle = nullptr;
    KeyGen gen;
    ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default();
    SystemClock* clock = env->GetSystemClock().get();
    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
      Slice key = gen.GetRand(thread->rnd, max_key_, max_log_);
      uint64_t random_op = thread->rnd.Next();
      Cache::CreateCallback create_cb =
          [](void* buf, size_t size, void** out_obj, size_t* charge) -> Status {
        *out_obj = reinterpret_cast<void*>(new char[size]);
        memcpy(*out_obj, buf, size);
        *charge = size;
        return Status::OK();
      };
      if (random_op < lookup_insert_threshold_) {
        if (handle) {
          cache_->Release(handle);
          handle = nullptr;
        }
        // do lookup
        {
          uint64_t elapsed = 0;
          {
            StopWatch sw(clock, nullptr, 0, &elapsed);
            handle = cache_->Lookup(key, helperCallback, create_cb,
                                    Cache::Priority::LOW, true);
          }
          thread->stats.AddLookupUs(elapsed);
        }
        if (handle) {
          thread->stats.AddHits(1);
          // do something with the data
          result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                             FLAGS_value_bytes);
        } else {
          thread->stats.AddMisses(1);
          // do insert
          {
            uint64_t elapsed = 0;
            {
              StopWatch sw(clock, nullptr, 0, &elapsed);
              cache_->Insert(key, createValue(thread->rnd), helperCallback,
                             FLAGS_value_bytes, &handle);
            }
            thread->stats.AddInsertUs(elapsed);
          }
        }
      } else if (random_op < insert_threshold_) {
        if (handle) {
          cache_->Release(handle);
          handle = nullptr;
        }
        // do insert
        {
          uint64_t elapsed = 0;
          {
            StopWatch sw(clock, nullptr, 0, &elapsed);
            cache_->Insert(key, createValue(thread->rnd), helperCallback,
                           FLAGS_value_bytes, &handle);
          }
          thread->stats.AddInsertUs(elapsed);
        }
      } else if (random_op < lookup_threshold_) {
        if (handle) {
          cache_->Release(handle);
          handle = nullptr;
        }
        // do lookup
        {
          uint64_t elapsed = 0;
          {
            StopWatch sw(clock, nullptr, 0, &elapsed);
            handle = cache_->Lookup(key, helperCallback, create_cb,
                                    Cache::Priority::LOW, true);
          }
          thread->stats.AddLookupUs(elapsed);
        }
        if (handle) {
          thread->stats.AddHits(1);
          // do something with the data
          result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                             FLAGS_value_bytes);
        }
      } else if (random_op < erase_threshold_) {
        // do erase
        cache_->Erase(key);
      } else {
        // Should be extremely unlikely (noop)
        assert(random_op >= kHundredthUint64 * 100U);
      }
    }
    if (handle) {
      cache_->Release(handle);
      handle = nullptr;
    }
  }
  void PrintEnv() const {
    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
    printf("Number of threads   : %u\n", FLAGS_threads);
    printf("Ops per thread      : %" PRIu64 "\n", FLAGS_ops_per_thread);
    printf("Cache size          : %" PRIu64 "\n", FLAGS_cache_size);
    printf("Num shard bits      : %u\n", FLAGS_num_shard_bits);
    printf("Max key             : %" PRIu64 "\n", max_key_);
    printf("Resident ratio      : %g\n", FLAGS_resident_ratio);
    printf("Skew degree         : %u\n", FLAGS_skew);
    printf("Populate cache      : %d\n", int{FLAGS_populate_cache});
    printf("Lookup+Insert pct   : %u%%\n", FLAGS_lookup_insert_percent);
    printf("Insert percentage   : %u%%\n", FLAGS_insert_percent);
    printf("Lookup percentage   : %u%%\n", FLAGS_lookup_percent);
    printf("Erase percentage    : %u%%\n", FLAGS_erase_percent);
    printf("----------------------------\n");
  }
 };
 int cache_bench_tool(int argc, char** argv) {
  ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_threads <= 0) {
    fprintf(stderr, "threads number <= 0\n");
    exit(1);
  }
  ROCKSDB_NAMESPACE::CacheBench bench;
  if (FLAGS_populate_cache) {
    bench.PopulateCache();
    printf("Population complete\n");
    printf("----------------------------\n");
  }
  if (bench.Run()) {
    return 0;
  } else {
    return 1;
  }
 }
 }  // namespace ROCKSDB_NAMESPACE
 #endif
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@ -271,7 +271,27 @@ class ClockCacheShard final : public CacheShard {
  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
                void (*deleter)(const Slice& key, void* value),
                Cache::Handle** handle, Cache::Priority priority) override;
  Status Insert(const Slice& key, uint32_t hash, void* value,
                Cache::CacheItemHelperCallback helper_cb, size_t charge,
                Cache::Handle** handle, Cache::Priority priority) override {
    Cache::DeletionCallback delete_cb;
    (*helper_cb)(nullptr, nullptr, &delete_cb);
    return Insert(key, hash, value, charge, delete_cb, handle, priority);
  }
  Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
  Cache::Handle* Lookup(const Slice& key, uint32_t hash,
                        Cache::CacheItemHelperCallback /*helper_cb*/,
                        const Cache::CreateCallback& /*create_cb*/,
                        Cache::Priority /*priority*/, bool /*wait*/) override {
    return Lookup(key, hash);
  }
  bool Release(Cache::Handle* handle, bool /*useful*/,
               bool force_erase) override {
    return Release(handle, force_erase);
  }
  bool isReady(Cache::Handle* /*handle*/) override { return true; }
  void Wait(Cache::Handle* /*handle*/) override {}
  // If the entry in in cache, increase reference count and return true.
  // Return false otherwise.
  //
@ -748,6 +768,8 @@ class ClockCache final : public ShardedCache {
  void DisownData() override { shards_ = nullptr; }
  void WaitAll(std::vector<Handle*>& /*handles*/) override {}
 private:
  ClockCacheShard* shards_;
 };
--- a/cache/lru_cache.cc
+++ b/cache/lru_cache.cc
@ -97,7 +97,8 @@ void LRUHandleTable::Resize() {
 LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
                             double high_pri_pool_ratio,
                             bool use_adaptive_mutex,
-                             CacheMetadataChargePolicy metadata_charge_policy)
+                             CacheMetadataChargePolicy metadata_charge_policy,
                             const std::shared_ptr<TieredCache>& tiered_cache)
    : capacity_(0),
      high_pri_pool_usage_(0),
      strict_capacity_limit_(strict_capacity_limit),
@ -105,7 +106,8 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
      high_pri_pool_capacity_(0),
      usage_(0),
      lru_usage_(0),
-      mutex_(use_adaptive_mutex) {
+      mutex_(use_adaptive_mutex),
      tiered_cache_(tiered_cache) {
  set_metadata_charge_policy(metadata_charge_policy);
  // Make empty circular linked list
  lru_.next = &lru_;
@ -256,8 +258,14 @@ void LRUCacheShard::SetCapacity(size_t capacity) {
    EvictFromLRU(0, &last_reference_list);
  }
  // Try to insert the evicted entries into tiered cache
  // Free the entries outside of mutex for performance reasons
  for (auto entry : last_reference_list) {
    if (tiered_cache_ && entry->IsTieredCacheCompatible() &&
        !entry->IsPromoted()) {
      tiered_cache_->Insert(entry->key(), entry->value, entry->info_.helper_cb)
          .PermitUncheckedError();
    }
    entry->Free();
  }
 }
@ -267,9 +275,81 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
  strict_capacity_limit_ = strict_capacity_limit;
 }
-Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
+Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle) {
  Status s = Status::OK();
  autovector<LRUHandle*> last_reference_list;
  size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
  {
    MutexLock l(&mutex_);
-  LRUHandle* e = table_.Lookup(key, hash);
+
    // Free the space following strict LRU policy until enough space
    // is freed or the lru list is empty
    EvictFromLRU(total_charge, &last_reference_list);
    if ((usage_ + total_charge) > capacity_ &&
        (strict_capacity_limit_ || handle == nullptr)) {
      if (handle == nullptr) {
        // Don't insert the entry but still return ok, as if the entry inserted
        // into cache and get evicted immediately.
        e->SetInCache(false);
        last_reference_list.push_back(e);
      } else {
        delete[] reinterpret_cast<char*>(e);
        *handle = nullptr;
        s = Status::Incomplete("Insert failed due to LRU cache being full.");
      }
    } else {
      // Insert into the cache. Note that the cache might get larger than its
      // capacity if not enough space was freed up.
      LRUHandle* old = table_.Insert(e);
      usage_ += total_charge;
      if (old != nullptr) {
        s = Status::OkOverwritten();
        assert(old->InCache());
        old->SetInCache(false);
        if (!old->HasRefs()) {
          // old is on LRU because it's in cache and its reference count is 0
          LRU_Remove(old);
          size_t old_total_charge =
              old->CalcTotalCharge(metadata_charge_policy_);
          assert(usage_ >= old_total_charge);
          usage_ -= old_total_charge;
          last_reference_list.push_back(old);
        }
      }
      if (handle == nullptr) {
        LRU_Insert(e);
      } else {
        e->Ref();
        *handle = reinterpret_cast<Cache::Handle*>(e);
      }
    }
  }
  // Try to insert the evicted entries into NVM cache
  // Free the entries here outside of mutex for performance reasons
  for (auto entry : last_reference_list) {
    if (tiered_cache_ && entry->IsTieredCacheCompatible() &&
        !entry->IsPromoted()) {
      tiered_cache_->Insert(entry->key(), entry->value, entry->info_.helper_cb)
          .PermitUncheckedError();
    }
    entry->Free();
  }
  return s;
 }
 Cache::Handle* LRUCacheShard::Lookup(
    const Slice& key, uint32_t hash,
    ShardedCache::CacheItemHelperCallback helper_cb,
    const ShardedCache::CreateCallback& create_cb, Cache::Priority priority,
    bool wait) {
  LRUHandle* e = nullptr;
  {
    MutexLock l(&mutex_);
    e = table_.Lookup(key, hash);
    if (e != nullptr) {
      assert(e->InCache());
      if (!e->HasRefs()) {
@ -279,6 +359,43 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) {
      e->Ref();
      e->SetHit();
    }
  }
  // If handle table lookup failed, then allocate a handle outside the
  // mutex if we're going to lookup in the NVM cache
  // Only support synchronous for now
  // TODO: Support asynchronous lookup in NVM cache
  if (!e && tiered_cache_ && helper_cb && wait) {
    assert(create_cb);
    std::unique_ptr<TieredCacheHandle> tiered_handle =
        tiered_cache_->Lookup(key, create_cb, wait);
    if (tiered_handle != nullptr) {
      e = reinterpret_cast<LRUHandle*>(
          new char[sizeof(LRUHandle) - 1 + key.size()]);
      e->flags = 0;
      e->SetPromoted(true);
      e->SetTieredCacheCompatible(true);
      e->info_.helper_cb = helper_cb;
      e->key_length = key.size();
      e->hash = hash;
      e->refs = 0;
      e->next = e->prev = nullptr;
      e->SetInCache(true);
      e->SetPriority(priority);
      memcpy(e->key_data, key.data(), key.size());
      e->value = tiered_handle->Value();
      e->charge = tiered_handle->Size();
      // This call could nullify e if the cache is over capacity and
      // strict_capacity_limit_ is true. In such a case, the caller will try
      // to insert later, which might again fail, but its ok as this should
      // not be common
      InsertItem(e, reinterpret_cast<Cache::Handle**>(&e))
          .PermitUncheckedError();
    }
  }
  return reinterpret_cast<Cache::Handle*>(e);
 }
@ -338,81 +455,32 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) {
 Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
                             size_t charge,
                             void (*deleter)(const Slice& key, void* value),
                             Cache::CacheItemHelperCallback helper_cb,
                             Cache::Handle** handle, Cache::Priority priority) {
  // Allocate the memory here outside of the mutex
  // If the cache is full, we'll have to release it
  // It shouldn't happen very often though.
  LRUHandle* e = reinterpret_cast<LRUHandle*>(
      new char[sizeof(LRUHandle) - 1 + key.size()]);
  Status s = Status::OK();
  autovector<LRUHandle*> last_reference_list;
  e->value = value;
-  e->deleter = deleter;
+  e->flags = 0;
  if (helper_cb) {
    e->SetTieredCacheCompatible(true);
    e->info_.helper_cb = helper_cb;
  } else {
    e->info_.deleter = deleter;
  }
  e->charge = charge;
  e->key_length = key.size();
  e->flags = 0;
  e->hash = hash;
  e->refs = 0;
  e->next = e->prev = nullptr;
  e->SetInCache(true);
  e->SetPriority(priority);
  memcpy(e->key_data, key.data(), key.size());
  size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_);
-  {
+  return InsertItem(e, handle);
    MutexLock l(&mutex_);
    // Free the space following strict LRU policy until enough space
    // is freed or the lru list is empty
    EvictFromLRU(total_charge, &last_reference_list);
    if ((usage_ + total_charge) > capacity_ &&
        (strict_capacity_limit_ || handle == nullptr)) {
      if (handle == nullptr) {
        // Don't insert the entry but still return ok, as if the entry inserted
        // into cache and get evicted immediately.
        e->SetInCache(false);
        last_reference_list.push_back(e);
      } else {
        delete[] reinterpret_cast<char*>(e);
        *handle = nullptr;
        s = Status::Incomplete("Insert failed due to LRU cache being full.");
      }
    } else {
      // Insert into the cache. Note that the cache might get larger than its
      // capacity if not enough space was freed up.
      LRUHandle* old = table_.Insert(e);
      usage_ += total_charge;
      if (old != nullptr) {
        s = Status::OkOverwritten();
        assert(old->InCache());
        old->SetInCache(false);
        if (!old->HasRefs()) {
          // old is on LRU because it's in cache and its reference count is 0
          LRU_Remove(old);
          size_t old_total_charge =
              old->CalcTotalCharge(metadata_charge_policy_);
          assert(usage_ >= old_total_charge);
          usage_ -= old_total_charge;
          last_reference_list.push_back(old);
        }
      }
      if (handle == nullptr) {
        LRU_Insert(e);
      } else {
        e->Ref();
        *handle = reinterpret_cast<Cache::Handle*>(e);
      }
    }
  }
  // Free the entries here outside of mutex for performance reasons
  for (auto entry : last_reference_list) {
    entry->Free();
  }
  return s;
 }
 void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
@ -468,7 +536,8 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits,
                   bool strict_capacity_limit, double high_pri_pool_ratio,
                   std::shared_ptr<MemoryAllocator> allocator,
                   bool use_adaptive_mutex,
-                   CacheMetadataChargePolicy metadata_charge_policy)
+                   CacheMetadataChargePolicy metadata_charge_policy,
                   const std::shared_ptr<TieredCache>& tiered_cache)
    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
                   std::move(allocator)) {
  num_shards_ = 1 << num_shard_bits;
@ -478,7 +547,7 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits,
  for (int i = 0; i < num_shards_; i++) {
    new (&shards_[i])
        LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio,
-                      use_adaptive_mutex, metadata_charge_policy);
+                      use_adaptive_mutex, metadata_charge_policy, tiered_cache);
  }
 }
@ -543,19 +612,12 @@ double LRUCache::GetHighPriPoolRatio() {
  return result;
 }
 std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
  return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits,
                     cache_opts.strict_capacity_limit,
                     cache_opts.high_pri_pool_ratio,
                     cache_opts.memory_allocator, cache_opts.use_adaptive_mutex,
                     cache_opts.metadata_charge_policy);
 }
 std::shared_ptr<Cache> NewLRUCache(
    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
    double high_pri_pool_ratio,
    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
-    CacheMetadataChargePolicy metadata_charge_policy) {
+    CacheMetadataChargePolicy metadata_charge_policy,
    const std::shared_ptr<TieredCache>& tiered_cache) {
  if (num_shard_bits >= 20) {
    return nullptr;  // the cache cannot be sharded into too many fine pieces
  }
@ -568,7 +630,25 @@ std::shared_ptr<Cache> NewLRUCache(
  }
  return std::make_shared<LRUCache>(
      capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
-      std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy);
+      std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy,
      tiered_cache);
 }
 std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
  return NewLRUCache(
      cache_opts.capacity, cache_opts.num_shard_bits,
      cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio,
      cache_opts.memory_allocator, cache_opts.use_adaptive_mutex,
      cache_opts.metadata_charge_policy, cache_opts.tiered_cache);
 }
 std::shared_ptr<Cache> NewLRUCache(
    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
    double high_pri_pool_ratio,
    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
    CacheMetadataChargePolicy metadata_charge_policy) {
  return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
                     high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
                     metadata_charge_policy, nullptr);
 }
 }  // namespace ROCKSDB_NAMESPACE
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@ -11,9 +11,9 @@
 #include <string>
 #include "cache/sharded_cache.h"
 #include "port/malloc.h"
 #include "port/port.h"
 #include "rocksdb/tiered_cache.h"
 #include "util/autovector.h"
 namespace ROCKSDB_NAMESPACE {
@ -49,7 +49,14 @@ namespace ROCKSDB_NAMESPACE {
 struct LRUHandle {
  void* value;
  union Info {
    Info() {}
    ~Info() {}
    void (*deleter)(const Slice&, void* value);
    ShardedCache::CacheItemHelperCallback helper_cb;
    // This needs to be explicitly constructed and destructed
    std::unique_ptr<TieredCacheHandle> tiered_handle;
  } info_;
  LRUHandle* next_hash;
  LRUHandle* next;
  LRUHandle* prev;
@ -69,6 +76,12 @@ struct LRUHandle {
    IN_HIGH_PRI_POOL = (1 << 2),
    // Whether this entry has had any lookups (hits).
    HAS_HIT = (1 << 3),
    // Can this be inserted into the tiered cache
    IS_TIERED_CACHE_COMPATIBLE = (1 << 4),
    // Is the handle still being read from a lower tier
    IS_PENDING = (1 << 5),
    // Has the item been promoted from a lower tier
    IS_PROMOTED = (1 << 6),
  };
  uint8_t flags;
@ -95,6 +108,11 @@ struct LRUHandle {
  bool IsHighPri() const { return flags & IS_HIGH_PRI; }
  bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; }
  bool HasHit() const { return flags & HAS_HIT; }
  bool IsTieredCacheCompatible() const {
    return flags & IS_TIERED_CACHE_COMPATIBLE;
  }
  bool IsPending() const { return flags & IS_PENDING; }
  bool IsPromoted() const { return flags & IS_PROMOTED; }
  void SetInCache(bool in_cache) {
    if (in_cache) {
@ -122,10 +140,38 @@ struct LRUHandle {
  void SetHit() { flags |= HAS_HIT; }
  void SetTieredCacheCompatible(bool tiered) {
    if (tiered) {
      flags |= IS_TIERED_CACHE_COMPATIBLE;
    } else {
      flags &= ~IS_TIERED_CACHE_COMPATIBLE;
    }
  }
  void SetIncomplete(bool incomp) {
    if (incomp) {
      flags |= IS_PENDING;
    } else {
      flags &= ~IS_PENDING;
    }
  }
  void SetPromoted(bool promoted) {
    if (promoted) {
      flags |= IS_PROMOTED;
    } else {
      flags &= ~IS_PROMOTED;
    }
  }
  void Free() {
    assert(refs == 0);
-    if (deleter) {
+    if (!IsTieredCacheCompatible() && info_.deleter) {
-      (*deleter)(key(), value);
+      (*info_.deleter)(key(), value);
    } else if (IsTieredCacheCompatible()) {
      ShardedCache::DeletionCallback del_cb;
      (*info_.helper_cb)(nullptr, nullptr, &del_cb);
      (*del_cb)(key(), value);
    }
    delete[] reinterpret_cast<char*>(this);
  }
@ -193,7 +239,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
 public:
  LRUCacheShard(size_t capacity, bool strict_capacity_limit,
                double high_pri_pool_ratio, bool use_adaptive_mutex,
-                CacheMetadataChargePolicy metadata_charge_policy);
+                CacheMetadataChargePolicy metadata_charge_policy,
                const std::shared_ptr<TieredCache>& tiered_cache);
  virtual ~LRUCacheShard() override = default;
  // Separate from constructor so caller can easily make an array of LRUCache
@ -212,8 +259,32 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
                        size_t charge,
                        void (*deleter)(const Slice& key, void* value),
                        Cache::Handle** handle,
-                        Cache::Priority priority) override;
+                        Cache::Priority priority) override {
-  virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
+    return Insert(key, hash, value, charge, deleter, nullptr, handle, priority);
  }
  virtual Status Insert(const Slice& key, uint32_t hash, void* value,
                        Cache::CacheItemHelperCallback helper_cb, size_t charge,
                        Cache::Handle** handle,
                        Cache::Priority priority) override {
    return Insert(key, hash, value, charge, nullptr, helper_cb, handle,
                  priority);
  }
  // If helper_cb is null, the values of the following arguments don't
  // matter
  virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash,
                                ShardedCache::CacheItemHelperCallback helper_cb,
                                const ShardedCache::CreateCallback& create_cb,
                                ShardedCache::Priority priority,
                                bool wait) override;
  virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override {
    return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true);
  }
  virtual bool Release(Cache::Handle* handle, bool /*useful*/,
                       bool force_erase) override {
    return Release(handle, force_erase);
  }
  virtual bool isReady(Cache::Handle* /*handle*/) override { return true; }
  virtual void Wait(Cache::Handle* /*handle*/) override {}
  virtual bool Ref(Cache::Handle* handle) override;
  virtual bool Release(Cache::Handle* handle,
                       bool force_erase = false) override;
@ -243,6 +314,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
  double GetHighPriPoolRatio();
 private:
  Status InsertItem(LRUHandle* item, Cache::Handle** handle);
  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
                void (*deleter)(const Slice& key, void* value),
                Cache::CacheItemHelperCallback helper_cb,
                Cache::Handle** handle, Cache::Priority priority);
  void LRU_Remove(LRUHandle* e);
  void LRU_Insert(LRUHandle* e);
@ -303,6 +379,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
  // We don't count mutex_ as the cache's internal state so semantically we
  // don't mind mutex_ invoking the non-const actions.
  mutable port::Mutex mutex_;
  std::shared_ptr<TieredCache> tiered_cache_;
 };
 class LRUCache
@ -316,7 +394,8 @@ class LRUCache
           std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
           bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
           CacheMetadataChargePolicy metadata_charge_policy =
-               kDontChargeCacheMetadata);
+               kDontChargeCacheMetadata,
           const std::shared_ptr<TieredCache>& tiered_cache = nullptr);
  virtual ~LRUCache();
  virtual const char* Name() const override { return "LRUCache"; }
  virtual CacheShard* GetShard(int shard) override;
@ -325,6 +404,7 @@ class LRUCache
  virtual size_t GetCharge(Handle* handle) const override;
  virtual uint32_t GetHash(Handle* handle) const override;
  virtual void DisownData() override;
  virtual void WaitAll(std::vector<Handle*>& /*handles*/) override {}
  //  Retrieves number of elements in LRU, for unit test purpose only
  size_t TEST_GetLRUSize();
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@ -7,8 +7,12 @@
 #include <string>
 #include <vector>
 #include "port/port.h"
 #include "rocksdb/cache.h"
 #include "test_util/testharness.h"
 #include "util/coding.h"
 #include "util/random.h"
 namespace ROCKSDB_NAMESPACE {
@ -30,9 +34,9 @@ class LRUCacheTest : public testing::Test {
    DeleteCache();
    cache_ = reinterpret_cast<LRUCacheShard*>(
        port::cacheline_aligned_alloc(sizeof(LRUCacheShard)));
-    new (cache_) LRUCacheShard(capacity, false /*strict_capacity_limit*/,
+    new (cache_) LRUCacheShard(
-                               high_pri_pool_ratio, use_adaptive_mutex,
+        capacity, false /*strict_capcity_limit*/, high_pri_pool_ratio,
-                               kDontChargeCacheMetadata);
+        use_adaptive_mutex, kDontChargeCacheMetadata, nullptr /*tiered_cache*/);
  }
  void Insert(const std::string& key,
@ -191,6 +195,175 @@ TEST_F(LRUCacheTest, EntriesWithPriority) {
  ValidateLRUList({"e", "f", "g", "Z", "d"}, 2);
 }
 class TestTieredCache : public TieredCache {
 public:
  TestTieredCache(size_t capacity) : num_inserts_(0), num_lookups_(0) {
    cache_ = NewLRUCache(capacity, 0, false, 0.5, nullptr,
                         kDefaultToAdaptiveMutex, kDontChargeCacheMetadata);
  }
  ~TestTieredCache() { cache_.reset(); }
  std::string Name() override { return "TestTieredCache"; }
  Status Insert(const Slice& key, void* value,
                Cache::CacheItemHelperCallback helper_cb) override {
    Cache::SizeCallback size_cb;
    Cache::SaveToCallback save_cb;
    size_t size;
    char* buf;
    Status s;
    num_inserts_++;
    (*helper_cb)(&size_cb, &save_cb, nullptr);
    size = (*size_cb)(value);
    buf = new char[size + sizeof(uint64_t)];
    EncodeFixed64(buf, size);
    s = (*save_cb)(value, 0, size, buf + sizeof(uint64_t));
    EXPECT_OK(s);
    return cache_->Insert(key, buf, size,
                          [](const Slice& /*key*/, void* val) -> void {
                            delete[] reinterpret_cast<char*>(val);
                          });
  }
  std::unique_ptr<TieredCacheHandle> Lookup(
      const Slice& key, const Cache::CreateCallback& create_cb,
      bool /*wait*/) override {
    std::unique_ptr<TieredCacheHandle> tiered_handle;
    Cache::Handle* handle = cache_->Lookup(key);
    num_lookups_++;
    if (handle) {
      void* value;
      size_t charge;
      char* ptr = (char*)cache_->Value(handle);
      size_t size = DecodeFixed64(ptr);
      ptr += sizeof(uint64_t);
      Status s = create_cb(ptr, size, &value, &charge);
      EXPECT_OK(s);
      tiered_handle.reset(
          new TestTieredCacheHandle(cache_.get(), handle, value, charge));
    }
    return tiered_handle;
  }
  void Erase(const Slice& /*key*/) override {}
  void WaitAll(std::vector<TieredCacheHandle*> /*handles*/) override {}
  std::string GetPrintableOptions() const override { return ""; }
  uint32_t num_inserts() { return num_inserts_; }
  uint32_t num_lookups() { return num_lookups_; }
 private:
  class TestTieredCacheHandle : public TieredCacheHandle {
   public:
    TestTieredCacheHandle(Cache* cache, Cache::Handle* handle, void* value,
                          size_t size)
        : cache_(cache), handle_(handle), value_(value), size_(size) {}
    ~TestTieredCacheHandle() { cache_->Release(handle_); }
    bool isReady() override { return true; }
    void Wait() override {}
    void* Value() override { return value_; }
    size_t Size() override { return size_; }
   private:
    Cache* cache_;
    Cache::Handle* handle_;
    void* value_;
    size_t size_;
  };
  std::shared_ptr<Cache> cache_;
  uint32_t num_inserts_;
  uint32_t num_lookups_;
 };
 TEST_F(LRUCacheTest, TestTieredCache) {
  LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex,
                       kDontChargeCacheMetadata);
  std::shared_ptr<TestTieredCache> tiered_cache(new TestTieredCache(2048));
  opts.tiered_cache = tiered_cache;
  std::shared_ptr<Cache> cache = NewLRUCache(opts);
  class TestItem {
   public:
    TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) {
      memcpy(buf_.get(), buf, size);
    }
    ~TestItem() {}
    char* Buf() { return buf_.get(); }
    size_t Size() { return size_; }
   private:
    std::unique_ptr<char[]> buf_;
    size_t size_;
  };
  Cache::CacheItemHelperCallback helper_cb =
      [](Cache::SizeCallback* size_cb, Cache::SaveToCallback* saveto_cb,
         Cache::DeletionCallback* del_cb) -> void {
    if (size_cb) {
      *size_cb = [](void* obj) -> size_t {
        return reinterpret_cast<TestItem*>(obj)->Size();
      };
    }
    if (saveto_cb) {
      *saveto_cb = [](void* obj, size_t offset, size_t size,
                      void* out) -> Status {
        TestItem* item = reinterpret_cast<TestItem*>(obj);
        char* buf = item->Buf();
        EXPECT_EQ(size, item->Size());
        EXPECT_EQ(offset, 0);
        memcpy(out, buf, size);
        return Status::OK();
      };
    }
    if (del_cb) {
      *del_cb = [](const Slice& /*key*/, void* obj) -> void {
        delete reinterpret_cast<TestItem*>(obj);
      };
    }
  };
  int create_count = 0;
  Cache::CreateCallback test_item_creator =
      [&create_count](void* buf, size_t size, void** out_obj,
                      size_t* charge) -> Status {
    create_count++;
    *out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size));
    *charge = size;
    return Status::OK();
  };
  Random rnd(301);
  std::string str1 = rnd.RandomString(1020);
  TestItem* item1 = new TestItem(str1.data(), str1.length());
  ASSERT_OK(cache->Insert("k1", item1, helper_cb, str1.length()));
  std::string str2 = rnd.RandomString(1020);
  TestItem* item2 = new TestItem(str2.data(), str2.length());
  // k2 should be demoted to NVM
  ASSERT_OK(cache->Insert("k2", item2, helper_cb, str2.length()));
  Cache::Handle* handle;
  handle = cache->Lookup("k2", helper_cb, test_item_creator,
                         Cache::Priority::LOW, true);
  ASSERT_NE(handle, nullptr);
  cache->Release(handle);
  // This lookup should promote k1 and demote k2
  handle = cache->Lookup("k1", helper_cb, test_item_creator,
                         Cache::Priority::LOW, true);
  ASSERT_NE(handle, nullptr);
  cache->Release(handle);
  ASSERT_EQ(tiered_cache->num_inserts(), 2u);
  ASSERT_EQ(tiered_cache->num_lookups(), 1u);
 }
 }  // namespace ROCKSDB_NAMESPACE
 int main(int argc, char** argv) {
--- a/cache/sharded_cache.cc
+++ b/cache/sharded_cache.cc
@ -51,11 +51,39 @@ Status ShardedCache::Insert(const Slice& key, void* value, size_t charge,
      ->Insert(key, hash, value, charge, deleter, handle, priority);
 }
 Status ShardedCache::Insert(const Slice& key, void* value,
                            CacheItemHelperCallback helper_cb, size_t charge,
                            Handle** handle, Priority priority) {
  uint32_t hash = HashSlice(key);
  return GetShard(Shard(hash))
      ->Insert(key, hash, value, helper_cb, charge, handle, priority);
 }
 Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) {
  uint32_t hash = HashSlice(key);
  return GetShard(Shard(hash))->Lookup(key, hash);
 }
 Cache::Handle* ShardedCache::Lookup(const Slice& key,
                                    CacheItemHelperCallback helper_cb,
                                    const CreateCallback& create_cb,
                                    Priority priority, bool wait,
                                    Statistics* /*stats*/) {
  uint32_t hash = HashSlice(key);
  return GetShard(Shard(hash))
      ->Lookup(key, hash, helper_cb, create_cb, priority, wait);
 }
 bool ShardedCache::isReady(Handle* handle) {
  uint32_t hash = GetHash(handle);
  return GetShard(Shard(hash))->isReady(handle);
 }
 void ShardedCache::Wait(Handle* handle) {
  uint32_t hash = GetHash(handle);
  GetShard(Shard(hash))->Wait(handle);
 }
 bool ShardedCache::Ref(Handle* handle) {
  uint32_t hash = GetHash(handle);
  return GetShard(Shard(hash))->Ref(handle);
@ -66,6 +94,11 @@ bool ShardedCache::Release(Handle* handle, bool force_erase) {
  return GetShard(Shard(hash))->Release(handle, force_erase);
 }
 bool ShardedCache::Release(Handle* handle, bool useful, bool force_erase) {
  uint32_t hash = GetHash(handle);
  return GetShard(Shard(hash))->Release(handle, useful, force_erase);
 }
 void ShardedCache::Erase(const Slice& key) {
  uint32_t hash = HashSlice(key);
  GetShard(Shard(hash))->Erase(key, hash);
--- a/cache/sharded_cache.h
+++ b/cache/sharded_cache.h
@ -28,9 +28,20 @@ class CacheShard {
                        size_t charge,
                        void (*deleter)(const Slice& key, void* value),
                        Cache::Handle** handle, Cache::Priority priority) = 0;
  virtual Status Insert(const Slice& key, uint32_t hash, void* value,
                        Cache::CacheItemHelperCallback helper_cb, size_t charge,
                        Cache::Handle** handle, Cache::Priority priority) = 0;
  virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) = 0;
  virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash,
                                Cache::CacheItemHelperCallback helper_cb,
                                const Cache::CreateCallback& create_cb,
                                Cache::Priority priority, bool wait) = 0;
  virtual bool Release(Cache::Handle* handle, bool useful,
                       bool force_erase) = 0;
  virtual bool isReady(Cache::Handle* handle) = 0;
  virtual void Wait(Cache::Handle* handle) = 0;
  virtual bool Ref(Cache::Handle* handle) = 0;
-  virtual bool Release(Cache::Handle* handle, bool force_erase = false) = 0;
+  virtual bool Release(Cache::Handle* handle, bool force_erase) = 0;
  virtual void Erase(const Slice& key, uint32_t hash) = 0;
  virtual void SetCapacity(size_t capacity) = 0;
  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
@ -62,6 +73,7 @@ class ShardedCache : public Cache {
  virtual const CacheShard* GetShard(int shard) const = 0;
  virtual void* Value(Handle* handle) override = 0;
  virtual size_t GetCharge(Handle* handle) const override = 0;
  virtual void WaitAll(std::vector<Handle*>& handles) override = 0;
  virtual uint32_t GetHash(Handle* handle) const = 0;
  virtual void DisownData() override = 0;
@ -72,7 +84,18 @@ class ShardedCache : public Cache {
  virtual Status Insert(const Slice& key, void* value, size_t charge,
                        void (*deleter)(const Slice& key, void* value),
                        Handle** handle, Priority priority) override;
  virtual Status Insert(const Slice& key, void* value,
                        CacheItemHelperCallback helper_cb, size_t chargge,
                        Handle** handle = nullptr,
                        Priority priority = Priority::LOW) override;
  virtual Handle* Lookup(const Slice& key, Statistics* stats) override;
  virtual Handle* Lookup(const Slice& key, CacheItemHelperCallback helper_cb,
                         const CreateCallback& create_cb, Priority priority,
                         bool wait, Statistics* stats = nullptr) override;
  virtual bool Release(Handle* handle, bool useful,
                       bool force_erase = false) override;
  virtual bool isReady(Handle* handle) override;
  virtual void Wait(Handle* handle) override;
  virtual bool Ref(Handle* handle) override;
  virtual bool Release(Handle* handle, bool force_erase = false) override;
  virtual void Erase(const Slice& key) override;
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@ -2820,6 +2820,7 @@ class DBBasicTestMultiGet : public DBTestBase {
    const char* Name() const override { return "MyBlockCache"; }
    using Cache::Insert;
    Status Insert(const Slice& key, void* value, size_t charge,
                  void (*deleter)(const Slice& key, void* value),
                  Handle** handle = nullptr,
@ -2828,6 +2829,7 @@ class DBBasicTestMultiGet : public DBTestBase {
      return target_->Insert(key, value, charge, deleter, handle, priority);
    }
    using Cache::Lookup;
    Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
      num_lookups_++;
      Handle* handle = target_->Lookup(key, stats);
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@ -446,6 +446,7 @@ class MockCache : public LRUCache {
                 false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) {
  }
  using ShardedCache::Insert;
  Status Insert(const Slice& key, void* value, size_t charge,
                void (*deleter)(const Slice& key, void* value), Handle** handle,
                Priority priority) override {
@ -533,6 +534,7 @@ class LookupLiarCache : public CacheWrapper {
  explicit LookupLiarCache(std::shared_ptr<Cache> target)
      : CacheWrapper(std::move(target)) {}
  using Cache::Lookup;
  Handle* Lookup(const Slice& key, Statistics* stats) override {
    if (nth_lookup_not_found_ == 1) {
      nth_lookup_not_found_ = 0;
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@ -826,6 +826,7 @@ class CacheWrapper : public Cache {
  const char* Name() const override { return target_->Name(); }
  using Cache::Insert;
  Status Insert(const Slice& key, void* value, size_t charge,
                void (*deleter)(const Slice& key, void* value),
                Handle** handle = nullptr,
@ -833,12 +834,14 @@ class CacheWrapper : public Cache {
    return target_->Insert(key, value, charge, deleter, handle, priority);
  }
  using Cache::Lookup;
  Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
    return target_->Lookup(key, stats);
  }
  bool Ref(Handle* handle) override { return target_->Ref(handle); }
  using Cache::Release;
  bool Release(Handle* handle, bool force_erase = false) override {
    return target_->Release(handle, force_erase);
  }
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@ -23,8 +23,11 @@
 #pragma once
 #include <stdint.h>
 #include <functional>
 #include <memory>
 #include <string>
 #include "rocksdb/memory_allocator.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/statistics.h"
@ -34,6 +37,7 @@ namespace ROCKSDB_NAMESPACE {
 class Cache;
 struct ConfigOptions;
 class TieredCache;
 extern const bool kDefaultToAdaptiveMutex;
@ -87,6 +91,9 @@ struct LRUCacheOptions {
  CacheMetadataChargePolicy metadata_charge_policy =
      kDefaultCacheMetadataChargePolicy;
  // A TieredCache instance to use a the non-volatile tier
  std::shared_ptr<TieredCache> tiered_cache;
  LRUCacheOptions() {}
  LRUCacheOptions(size_t _capacity, int _num_shard_bits,
                  bool _strict_capacity_limit, double _high_pri_pool_ratio,
@ -137,6 +144,57 @@ class Cache {
  // likely to get evicted than low priority entries.
  enum class Priority { HIGH, LOW };
  // A set of callbacks to allow objects in the volatile block cache to be
  // be persisted in a NVM cache tier. Since the volatile cache holds C++
  // objects and the NVM cache may only hold flat data that doesn't need
  // relocation, these callbacks need to be provided by the user of the block
  // cache to do the conversion.
  // The CacheItemHelperCallback is passed to Insert(). When invoked, it
  // returns the callback functions for size, saving and deletion of the
  // object. We do it this way so that the cache implementation only needs to
  // save one function pointer in its metadata per object, the
  // CacheItemHelperCallback pointer which is a C-style function pointer.
  // Saving multiple std::function objects will take up 32 bytes per
  // function, even if its not bound to an object and does no capture. The
  // other alternative is to take a pointer to another object that implements
  // this interface, but that would create issues with managing the object
  // lifecycle.
  //
  // All the callbacks are C-style function pointers in order to simplify
  // lifecycle management. Objects in the cache can outlive the parent DB,
  // so anything required for these operations should be contained in the
  // object itself.
  //
  // The SizeCallback takes a void* pointer to the object and returns the size
  // of the persistable data. It can be used by the NVM cache to allocate
  // memory if needed.
  typedef size_t (*SizeCallback)(void* obj);
  // The SaveToCallback takes a void* object pointer and saves the persistable
  // data into a buffer. The NVM cache may decide to not store it in a
  // contiguous buffer, in which case this callback will be called multiple
  // times with increasing offset
  typedef ROCKSDB_NAMESPACE::Status (*SaveToCallback)(void* obj, size_t offset,
                                                      size_t size, void* out);
  // DeletionCallback is a function pointer that deletes the cached
  // object. The signature matches the old deleter function.
  typedef void (*DeletionCallback)(const Slice&, void*);
  // A callback function that returns the size, save to, and deletion
  // callbacks. Fill any of size_cb, saveto_cb, del_cb that is non-null
  typedef void (*CacheItemHelperCallback)(SizeCallback* size_cb,
                                          SaveToCallback* saveto_cb,
                                          DeletionCallback* del_cb);
  // The CreateCallback is passed by the block cache user to Lookup(). It
  // takes in a buffer from the NVM cache and constructs an object using
  // it. The callback doesn't have ownership of the buffer and should
  // copy the contents into its own buffer.
  typedef std::function<ROCKSDB_NAMESPACE::Status(
      void* buf, size_t size, void** out_obj, size_t* charge)>
      CreateCallback;
  Cache(std::shared_ptr<MemoryAllocator> allocator = nullptr)
      : memory_allocator_(std::move(allocator)) {}
  // No copying allowed
@ -170,8 +228,8 @@ class Cache {
  // The type of the Cache
  virtual const char* Name() const = 0;
-  // Insert a mapping from key->value into the cache and assign it
+  // Insert a mapping from key->value into the volatile cache only
-  // the specified charge against the total cache capacity.
+  // and assign it // the specified charge against the total cache capacity.
  // If strict_capacity_limit is true and cache reaches its full capacity,
  // return Status::Incomplete.
  //
@ -287,6 +345,79 @@ class Cache {
  MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); }
  // Insert a mapping from key->value into the volatile cache and assign it
  // the specified charge against the total cache capacity.
  // If strict_capacity_limit is true and cache reaches its full capacity,
  // return Status::Incomplete.
  //
  // If handle is not nullptr, returns a handle that corresponds to the
  // mapping. The caller must call this->Release(handle) when the returned
  // mapping is no longer needed. In case of error caller is responsible to
  // cleanup the value (i.e. calling "deleter").
  //
  // If handle is nullptr, it is as if Release is called immediately after
  // insert. In case of error value will be cleanup.
  //
  // Regardless of whether the item was inserted into the volatile cache,
  // it will attempt to insert it into the NVM cache if one is configured.
  // The block cache implementation must support the NVM tier, otherwise
  // the item is only inserted into the volatile tier. It may
  // defer the insertion to NVM as it sees fit. The NVM
  // cache may or may not write it to NVM depending on its admission
  // policy.
  //
  // When the inserted entry is no longer needed, the key and
  // value will be passed to "deleter".
  virtual Status Insert(const Slice& key, void* value,
                        CacheItemHelperCallback helper_cb, size_t charge,
                        Handle** handle = nullptr,
                        Priority priority = Priority::LOW) {
    DeletionCallback delete_cb = nullptr;
    assert(helper_cb != nullptr);
    (*helper_cb)(nullptr, nullptr, &delete_cb);
    return Insert(key, value, charge, delete_cb, handle, priority);
  }
  // Lookup the key in the volatile and NVM tiers (if one is configured).
  // The create_cb callback function object will be used to contruct the
  // cached object.
  // If none of the tiers have the mapping for the key, rturns nullptr.
  // Else, returns a handle that corresponds to the mapping.
  //
  // The handle returned may not be ready. The caller should call isReady()
  // to check if the item value is ready, and call Wait() or WaitAll() if
  // its not ready. The caller should then call Value() to check if the
  // item was successfully retrieved. If unsuccessful (perhaps due to an
  // IO error), Value() will return nullptr.
  virtual Handle* Lookup(const Slice& key,
                         CacheItemHelperCallback /*helper_cb*/,
                         const CreateCallback& /*create_cb*/,
                         Priority /*priority*/, bool /*wait*/,
                         Statistics* stats = nullptr) {
    return Lookup(key, stats);
  }
  // Release a mapping returned by a previous Lookup(). The "useful"
  // parameter specifies whether the data was actually used or not,
  // which may be used by the cache implementation to decide whether
  // to consider it as a hit for retention purposes.
  virtual bool Release(Handle* handle, bool /*useful*/, bool force_erase) {
    return Release(handle, force_erase);
  }
  // Determines if the handle returned by Lookup() has a valid value yet.
  virtual bool isReady(Handle* /*handle*/) { return true; }
  // If the handle returned by Lookup() is not ready yet, wait till it
  // becomes ready.
  // Note: A ready handle doesn't necessarily mean it has a valid value. The
  // user should call Value() and check for nullptr.
  virtual void Wait(Handle* /*handle*/) {}
  // Wait for a vector of handles to become ready. As with Wait(), the user
  // should check the Value() of each handle for nullptr
  virtual void WaitAll(std::vector<Handle*>& /*handles*/) {}
 private:
  std::shared_ptr<MemoryAllocator> memory_allocator_;
 };
--- a/include/rocksdb/cache_bench_tool.h
+++ b/include/rocksdb/cache_bench_tool.h
@ -0,0 +1,14 @@
 // Copyright (c) 2013-present, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under both the GPLv2 (found in the
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 #pragma once
 #include "rocksdb/rocksdb_namespace.h"
 #include "rocksdb/status.h"
 #include "rocksdb/types.h"
 namespace ROCKSDB_NAMESPACE {
 int cache_bench_tool(int argc, char** argv);
 }  // namespace ROCKSDB_NAMESPACE
--- a/include/rocksdb/tiered_cache.h
+++ b/include/rocksdb/tiered_cache.h
@ -0,0 +1,79 @@
 // Copyright (c) 2021, Facebook, Inc.  All rights reserved.
 //  This source code is licensed under both the GPLv2 (found in the
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 #pragma once
 #include <stdint.h>
 #include <memory>
 #include <string>
 #include "rocksdb/cache.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
 namespace ROCKSDB_NAMESPACE {
 // A handle for lookup result. The handle may not be immediately ready or
 // have a valid value. The caller must call isReady() to determine if its
 // ready, and call Wait() in order to block until it becomes ready.
 // The caller must call value() after it becomes ready to determine if the
 // handle successfullly read the item.
 class TieredCacheHandle {
 public:
  virtual ~TieredCacheHandle() {}
  // Returns whether the handle is ready or not
  virtual bool isReady() = 0;
  // Block until handle becomes ready
  virtual void Wait() = 0;
  // Return the value. If nullptr, it means the lookup was unsuccessful
  virtual void* Value() = 0;
  // Return the size of value
  virtual size_t Size() = 0;
 };
 // TieredCache
 //
 // Cache interface for caching blocks on a stackable tiers (which can include
 // non-volatile mediums)
 class TieredCache {
 public:
  virtual ~TieredCache() {}
  virtual std::string Name() = 0;
  static const char* Type() { return "TieredCache"; }
  // Insert the given value into this tier. The value is not written
  // directly. Rather, the SaveToCallback provided by helper_cb will be
  // used to extract the persistable data in value, which will be written
  // to this tier. The implementation may or may not write it to cache
  // depending on the admission control policy, even if the return status is
  // success.
  virtual Status Insert(const Slice& key, void* value,
                        Cache::CacheItemHelperCallback helper_cb) = 0;
  // Lookup the data for the given key in this tier. The create_cb
  // will be used to create the object. The handle returned may not be
  // ready yet, unless wait=true, in which case Lookup() will block until
  // the handle is ready
  virtual std::unique_ptr<TieredCacheHandle> Lookup(
      const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0;
  // At the discretion of the implementation, erase the data associated
  // with key
  virtual void Erase(const Slice& key) = 0;
  // Wait for a collection of handles to become ready
  virtual void WaitAll(std::vector<TieredCacheHandle*> handles) = 0;
  virtual std::string GetPrintableOptions() const = 0;
 };
 }  // namespace ROCKSDB_NAMESPACE
--- a/src.mk
+++ b/src.mk
@ -319,6 +319,9 @@ MOCK_LIB_SOURCES =                                              \
 BENCH_LIB_SOURCES =                                             \
  tools/db_bench_tool.cc                                        \
 CACHE_BENCH_LIB_SOURCES =					\
  cache/cache_bench_tool.cc                                     \
 STRESS_LIB_SOURCES =                                            \
  db_stress_tool/batched_ops_stress.cc                         \
  db_stress_tool/cf_consistency_stress.cc                      \
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@ -56,8 +56,10 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/stats_history.h"
 #include "rocksdb/tiered_cache.h"
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/optimistic_transaction_db.h"
 #include "rocksdb/utilities/options_type.h"
 #include "rocksdb/utilities/options_util.h"
 #include "rocksdb/utilities/sim_cache.h"
 #include "rocksdb/utilities/transaction.h"
@ -1412,6 +1414,12 @@ DEFINE_bool(read_with_latest_user_timestamp, true,
            "If true, always use the current latest timestamp for read. If "
            "false, choose a random timestamp from the past.");
 #ifndef ROCKSDB_LITE
 DEFINE_string(tiered_cache_uri, "",
              "Full URI for creating a custom tiered cache object");
 static class std::shared_ptr<ROCKSDB_NAMESPACE::TieredCache> tiered_cache;
 #endif
 static const bool FLAGS_soft_rate_limit_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);
@ -2773,22 +2781,37 @@ class Benchmark {
      }
      return cache;
    } else {
-      if (FLAGS_use_cache_memkind_kmem_allocator) {
+      LRUCacheOptions opts(
 #ifdef MEMKIND
        return NewLRUCache(
          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
-            std::make_shared<MemkindKmemAllocator>());
+#ifdef MEMKIND
-
+          FLAGS_use_cache_memkind_kmem_allocator
              ? std::make_shared<MemkindKmemAllocator>()
              : nullptr
 #else
          nullptr
 #endif
      );
      if (FLAGS_use_cache_memkind_kmem_allocator) {
 #ifndef MEMKIND
        fprintf(stderr, "Memkind library is not linked with the binary.");
        exit(1);
 #endif
      } else {
        return NewLRUCache(
            static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
            false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio);
      }
 #ifndef ROCKSDB_LITE
      if (!FLAGS_tiered_cache_uri.empty()) {
        Status s = ObjectRegistry::NewInstance()->NewSharedObject<TieredCache>(
            FLAGS_tiered_cache_uri, &tiered_cache);
        if (tiered_cache == nullptr) {
          fprintf(stderr,
                  "No tiered cache registered matching string: %s status=%s\n",
                  FLAGS_tiered_cache_uri.c_str(), s.ToString().c_str());
          exit(1);
        }
        opts.tiered_cache = tiered_cache;
      }
 #endif
      return NewLRUCache(opts);
    }
  }
--- a/utilities/simulator_cache/sim_cache.cc
+++ b/utilities/simulator_cache/sim_cache.cc
@ -167,6 +167,7 @@ class SimCacheImpl : public SimCache {
    cache_->SetStrictCapacityLimit(strict_capacity_limit);
  }
  using Cache::Insert;
  Status Insert(const Slice& key, void* value, size_t charge,
                void (*deleter)(const Slice& key, void* value), Handle** handle,
                Priority priority) override {
@ -193,6 +194,7 @@ class SimCacheImpl : public SimCache {
    return cache_->Insert(key, value, charge, deleter, handle, priority);
  }
  using Cache::Lookup;
  Handle* Lookup(const Slice& key, Statistics* stats) override {
    Handle* h = key_only_cache_->Lookup(key);
    if (h != nullptr) {
@ -213,6 +215,7 @@ class SimCacheImpl : public SimCache {
  bool Ref(Handle* handle) override { return cache_->Ref(handle); }
  using Cache::Release;
  bool Release(Handle* handle, bool force_erase = false) override {
    return cache_->Release(handle, force_erase);
  }
Author	SHA1	Message	Date
anand1976	bd35a845ca	Merge pull request #8213 from anand1976/nvm_cache_initial Allow cache_bench and db_bench to use 3rd party tiered cache	2021-04-28 12:35:02 -07:00
anand76	befd813e95	More build failure fixes	2021-04-28 11:01:48 -07:00
anand76	5c04610b9c	Fix CircleCI build failures	2021-04-28 10:03:08 -07:00
anand76	8f0de0554f	Address comments	2021-04-27 17:20:52 -07:00
anand76	d0a0c91017	Fix build issues from merge	2021-04-26 13:35:48 -07:00
anand76	354588fea4	Merge branch 'nvm_cache_proto' of https://github.com/facebook/rocksdb into nvm_cache_initial	2021-04-26 12:56:12 -07:00
anand76	99ad5553eb	Fix comment	2021-04-26 12:21:55 -07:00
anand76	1aedd57a62	Fix comments and an ASSERT_STATUS_CHECKED bug	2021-04-26 12:21:55 -07:00
anand76	b8a1b8f361	Address comments	2021-04-26 12:21:41 -07:00
anand76	ecd3693770	Fix formatting and a ASSERT_STATUS_CHECKED test failure	2021-04-26 12:20:20 -07:00
anand76	27ec9bc869	Fix some CircleCI failures	2021-04-26 12:20:20 -07:00
anand76	abe8a29a37	Fix some build and test failures	2021-04-26 12:20:20 -07:00
anand76	1e0b641048	Add missing nvm_cache.h header file	2021-04-26 12:20:20 -07:00
anand76	7c22a2c549	Initial support for NVM cache in LRUCache Summary: Only support synchronous lookup currently.	2021-04-26 12:20:20 -07:00
Zhichao Cao	03384bc9d2	try the push	2021-04-26 12:20:20 -07:00
anand76	09df74c540	Allow cache_bench and db_bench to use 3rd party tiered cache	2021-04-20 21:33:09 -07:00
anand76	e295344ae3	Fix comment	2021-04-01 10:31:43 -07:00
anand76	b42d4a8ad4	Fix comments and an ASSERT_STATUS_CHECKED bug	2021-04-01 10:23:13 -07:00
anand76	8015fc9871	Address comments	2021-03-31 12:13:52 -07:00
anand76	b2c302597d	Fix formatting and a ASSERT_STATUS_CHECKED test failure	2021-03-30 12:34:19 -07:00
anand76	2db4e48211	Fix some CircleCI failures	2021-03-30 09:21:15 -07:00
anand76	1b0a1abafa	Fix some build and test failures	2021-03-29 14:28:04 -07:00
anand76	c1ae1f143e	Add missing nvm_cache.h header file	2021-03-29 11:37:16 -07:00
anand76	9cc94bfbd3	Initial support for NVM cache in LRUCache Summary: Only support synchronous lookup currently.	2021-03-25 14:19:56 -07:00
anand1976	33970a392c	Merge pull request #8102 from zhichao-cao/nvm_try try the push	2021-03-25 09:42:26 -07:00
Zhichao Cao	5732c039a4	try the push	2021-03-24 13:27:12 -07:00
anand1976	0ae90f8e28	Merge pull request #8101 from anand1976/nvm_branch_trial Trial merge to nvm_cache_proto branch	2021-03-24 13:16:26 -07:00
anand76	5a165416ac	Update HISTORY.md	2021-03-24 13:14:14 -07:00