Cache simulator: Optimize hybrid row-block cache. (#5616)

Summary:
This PR optimizes the hybrid row-block cache simulator. If a Get request hits the cache, we treat all its future accesses as hits.

Consider a Get request (no snapshot) that accesses multiple files, e.g., file1, file2, and file3. We construct the row key as "fdnumber_key_0". Before this PR, if the request hit the cache when searching for the key in file1, we continued to process its accesses in file2 and file3, which is unnecessary.

With this PR, if "file1_key_0" is in the cache, we treat all future accesses of this Get request as hits.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5616

Differential Revision: D16453187

Pulled By: HaoyuHuang

fbshipit-source-id: 56f3169cc322322305baaf5543226a0824fae19f
This commit is contained in:
haoyuhuang 2019-07-29 10:52:32 -07:00 committed by Facebook Github Bot
parent 80d7067cb2
commit e648c1d9eb
3 changed files with 186 additions and 27 deletions

View File

@ -122,14 +122,26 @@ void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) {
// TODO (haoyu): We only support Get for now. We need to extend the tracing
// for MultiGet, i.e., non-data block accesses must log all keys in a
// MultiGet.
bool is_cache_miss = false;
bool is_cache_miss = true;
bool admitted = false;
if (access.caller == TableReaderCaller::kUserGet &&
access.get_id != BlockCacheTraceHelper::kReservedGetId) {
// This is a Get/MultiGet request.
// This is a Get request.
const std::string& row_key = BlockCacheTraceHelper::ComputeRowKey(access);
if (getid_getkeys_map_[access.get_id].find(row_key) ==
getid_getkeys_map_[access.get_id].end()) {
GetRequestStatus& status = getid_status_map_[access.get_id];
if (status.is_complete) {
// This Get request completes.
// Skip future accesses to its index/filter/data
// blocks. These block lookups are unnecessary if we observe a hit for the
// referenced key-value pair already. Thus, we treat these lookups as
// hits. This also ensures that the total number of accesses is the same
// when comparing to other policies.
miss_ratio_stats_.UpdateMetrics(access.access_timestamp,
/*is_user_access=*/true,
/*is_cache_miss=*/false);
return;
}
if (status.row_key_status.find(row_key) == status.row_key_status.end()) {
// This is the first time that this key is accessed. Look up the key-value
// pair first. Do not update the miss/accesses metrics here since it will
// be updated later.
@ -144,37 +156,30 @@ void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) {
} else if (admitted) {
result = InsertResult::ADMITTED;
}
getid_getkeys_map_[access.get_id][row_key] =
std::make_pair(is_cache_miss, result);
status.row_key_status[row_key] = result;
}
std::pair<bool, InsertResult> miss_inserted =
getid_getkeys_map_[access.get_id][row_key];
if (!miss_inserted.first) {
// This is a cache hit. Skip future accesses to its index/filter/data
// blocks. These block lookups are unnecessary if we observe a hit for the
// referenced key-value pair already. Thus, we treat these lookups as
// hits. This is also to ensure the total number of accesses are the same
// when comparing to other policies.
if (!is_cache_miss) {
// A cache hit.
status.is_complete = true;
miss_ratio_stats_.UpdateMetrics(access.access_timestamp,
/*is_user_access=*/true,
/*is_cache_miss=*/false);
return;
}
// The key-value pair observes a cache miss. We need to access its
// The row key-value pair observes a cache miss. We need to access its
// index/filter/data blocks.
InsertResult inserted = status.row_key_status[row_key];
AccessKVPair(
access.block_key, access.block_type, ComputeBlockPriority(access),
access.block_key, access.block_size, ComputeBlockPriority(access),
access,
/*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert,
/*is_user_access=*/true, &is_cache_miss, &admitted,
/*update_metrics=*/true);
if (access.referenced_data_size > 0 &&
miss_inserted.second == InsertResult::ADMITTED) {
if (access.referenced_data_size > 0 && inserted == InsertResult::ADMITTED) {
sim_cache_->Insert(row_key, /*value=*/nullptr,
access.referenced_data_size, /*deleter=*/nullptr,
/*handle=*/nullptr, Cache::Priority::HIGH);
getid_getkeys_map_[access.get_id][row_key] =
std::make_pair(true, InsertResult::INSERTED);
status.row_key_status[row_key] = InsertResult::INSERTED;
}
return;
}

View File

@ -47,6 +47,7 @@ class MissRatioStats {
return static_cast<double>(num_misses_ * 100.0 / num_accesses_);
}
uint64_t total_accesses() const { return num_accesses_; }
uint64_t total_misses() const { return num_misses_; }
const std::map<uint64_t, uint64_t>& num_accesses_timeline() const {
return num_accesses_timeline_;
@ -63,6 +64,7 @@ class MissRatioStats {
return static_cast<double>(user_misses_ * 100.0 / user_accesses_);
}
uint64_t user_accesses() const { return user_accesses_; }
uint64_t user_misses() const { return user_misses_; }
void UpdateMetrics(uint64_t timestamp_in_ms, bool is_user_access,
bool is_cache_miss);
@ -168,17 +170,24 @@ class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator {
NO_INSERT,
};
// A map stores get_id to a map of row keys. For each row key, it stores a
// boolean and an enum. The first bool is true when we observe a miss upon the
// first time we encounter the row key. The second arg is INSERTED when the
// We set is_complete to true when the referenced row-key of a get request
// hits the cache. If is_complete is true, we treat future accesses of this
// get request as hits.
//
// For each row key, it stores an enum. It is INSERTED when the
// kv-pair has been inserted into the cache, ADMITTED if it should be inserted
// but has not been yet, NO_INSERT if it should not be inserted.
//
// A kv-pair is in ADMITTED state when we encounter this kv-pair but do not
// know its size. This may happen if the first access on the referenced key is
// an index/filter block.
std::map<uint64_t, std::map<std::string, std::pair<bool, InsertResult>>>
getid_getkeys_map_;
// Per-Get-request bookkeeping used while replaying block cache accesses.
struct GetRequestStatus {
// Set to true once a row key referenced by this Get request hits the cache.
// While true, the remaining accesses of the request are counted as hits.
bool is_complete = false;
// Insertion state (see InsertResult) of each row key this request has
// touched so far, keyed by the row key.
std::map<std::string, InsertResult> row_key_status;
};
// A map stores get_id to a map of row keys.
std::map<uint64_t, GetRequestStatus> getid_status_map_;
bool insert_blocks_upon_row_kvpair_miss_;
};

View File

@ -14,6 +14,7 @@ namespace rocksdb {
namespace {
const std::string kBlockKeyPrefix = "test-block-";
const std::string kRefKeyPrefix = "test-get-";
const std::string kRefKeySequenceNumber = std::string(8, 'c');
const uint64_t kGetId = 1;
const uint64_t kGetBlockId = 100;
const uint64_t kCompactionBlockId = 1000;
@ -38,12 +39,12 @@ class CacheSimulatorTest : public testing::Test {
record.cf_name = "test";
record.caller = TableReaderCaller::kUserGet;
record.level = 6;
record.sst_fd_number = kGetBlockId;
record.sst_fd_number = 0;
record.get_id = getid;
record.is_cache_hit = Boolean::kFalse;
record.no_insert = Boolean::kFalse;
record.referenced_key =
kRefKeyPrefix + std::to_string(kGetId) + std::string(8, 'c');
kRefKeyPrefix + std::to_string(kGetId) + kRefKeySequenceNumber;
record.referenced_key_exist_in_block = Boolean::kTrue;
record.referenced_data_size = 100;
record.num_keys_in_block = 300;
@ -66,6 +67,29 @@ class CacheSimulatorTest : public testing::Test {
return record;
}
// Verifies the state of the simulated cache after a sequence of accesses.
//
// Checks that the cache usage and the total access/miss counters equal the
// expected values, that every block key in `blocks` is present in
// `sim_cache`, and that the row key derived from every entry in `keys` is
// present as well.
//
// Each entry in `keys` is the variable part of a referenced key: the full
// referenced key is kRefKeyPrefix + key + kRefKeySequenceNumber, and the row
// key looked up is "0_" + <user key> + "_0".
// NOTE(review): this presumably mirrors the "fdnumber_key_0" layout produced
// by BlockCacheTraceHelper::ComputeRowKey for sst_fd_number 0 -- confirm.
void AssertCache(const std::shared_ptr<Cache>& sim_cache,
                 const MissRatioStats& miss_ratio_stats,
                 uint64_t expected_usage, uint64_t expected_num_accesses,
                 uint64_t expected_num_misses,
                 const std::vector<std::string>& blocks,
                 const std::vector<std::string>& keys) {
  EXPECT_EQ(expected_usage, sim_cache->GetUsage());
  EXPECT_EQ(expected_num_accesses, miss_ratio_stats.total_accesses());
  EXPECT_EQ(expected_num_misses, miss_ratio_stats.total_misses());

  // EXPECT_NE is non-fatal, so execution continues after a failed lookup.
  // Only release handles that were actually obtained; handing a null handle
  // to Release() is not guaranteed to be safe and would turn a reported test
  // failure into a crash.
  auto expect_cached = [&sim_cache](const std::string& cache_key) {
    auto handle = sim_cache->Lookup(cache_key);
    EXPECT_NE(nullptr, handle);
    if (handle != nullptr) {
      sim_cache->Release(handle);
    }
  };

  for (auto const& block : blocks) {
    expect_cached(block);
  }
  for (auto const& key : keys) {
    const std::string referenced_key =
        kRefKeyPrefix + key + kRefKeySequenceNumber;
    expect_cached("0_" + ExtractUserKey(referenced_key).ToString() + "_0");
  }
}
Env* env_;
};
@ -277,6 +301,127 @@ TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) {
}
}
// Replays a sequence of Get accesses against a HybridRowBlockCacheSimulator
// backed by an LRU cache of capacity 16 and, after each step, checks the
// cache usage, the total access/miss counters, and which block keys and row
// keys are cached. The AssertCache arguments after the stats object are:
// expected usage, expected accesses, expected misses, blocks, keys.
TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) {
BlockCacheTraceRecord get = GenerateGetRecord(kGetId);
get.block_size = 1;
get.referenced_data_size = 0;
get.access_timestamp = 0;
get.block_key = "1";
get.get_id = 1;
get.get_from_user_specified_snapshot = Boolean::kFalse;
get.referenced_key =
kRefKeyPrefix + std::to_string(1) + kRefKeySequenceNumber;
get.no_insert = Boolean::kFalse;
get.sst_fd_number = 0;
// NOTE(review): get_from_user_specified_snapshot was already set above; this
// second assignment is redundant.
get.get_from_user_specified_snapshot = Boolean::kFalse;
std::shared_ptr<Cache> sim_cache =
NewLRUCache(/*capacity=*/16, /*num_shard_bits=*/1,
/*strict_capacity_limit=*/false,
/*high_pri_pool_ratio=*/0);
std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
new HybridRowBlockCacheSimulator(
nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true));
// Expect a miss. The row key-value pair is not inserted yet because its size
// is unknown (referenced_data_size is 0).
cache_simulator->Access(get);
AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 1, 1, 1, {"1"},
{});
// Same Get request, different block, and the referenced data size is now
// known: block 2 and row key K1 are both expected to be cached afterwards.
get.access_timestamp += 1;
get.referenced_data_size = 1;
get.block_key = "2";
cache_simulator->Access(get);
AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 3, 2, 2,
{"1", "2"}, {"1"});
get.access_timestamp += 1;
get.block_key = "3";
// K1 should not be inserted again.
cache_simulator->Access(get);
AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 3, 3,
{"1", "2", "3"}, {"1"});
// A second get request referencing the same key.
get.access_timestamp += 1;
get.get_id = 2;
get.block_key = "4";
get.referenced_data_size = 0;
cache_simulator->Access(get);
AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 4, 3,
{"1", "2", "3"}, {"1"});
// A third get request searches three files, three different keys.
// And the second key observes a hit.
get.access_timestamp += 1;
get.referenced_data_size = 1;
get.get_id = 3;
get.block_key = "3";
get.referenced_key = kRefKeyPrefix + "2" + kRefKeySequenceNumber;
// K2 should observe a miss. Block 3 observes a hit.
cache_simulator->Access(get);
AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 5, 3,
{"1", "2", "3"}, {"1", "2"});
get.access_timestamp += 1;
get.referenced_data_size = 1;
get.get_id = 3;
get.block_key = "4";
// NOTE(review): referenced_data_size is assigned twice in this step; the
// second assignment is redundant.
get.referenced_data_size = 1;
get.referenced_key = kRefKeyPrefix + "1" + kRefKeySequenceNumber;
// K1 should observe a hit.
cache_simulator->Access(get);
AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 6, 3,
{"1", "2", "3"}, {"1", "2"});
get.access_timestamp += 1;
get.referenced_data_size = 1;
get.get_id = 3;
get.block_key = "4";
// NOTE(review): referenced_data_size is assigned twice in this step; the
// second assignment is redundant.
get.referenced_data_size = 1;
get.referenced_key = kRefKeyPrefix + "3" + kRefKeySequenceNumber;
// K3 would observe a miss.
// However, as the get is already complete, we should not access K3 any more.
cache_simulator->Access(get);
AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 7, 3,
{"1", "2", "3"}, {"1", "2"});
// A fourth get request searches one file and two blocks. One row key.
get.access_timestamp += 1;
get.get_id = 4;
get.block_key = "5";
get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber;
get.referenced_data_size = 1;
cache_simulator->Access(get);
AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 7, 8, 4,
{"1", "2", "3", "5"}, {"1", "2", "4"});
// Row keys K1, K2 and K4 must all be present before the eviction phase.
for (auto const& key : {"1", "2", "4"}) {
auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0");
ASSERT_NE(nullptr, handle);
sim_cache->Release(handle);
}
// A bunch of insertions which evict cached row keys.
for (uint32_t i = 6; i < 100; i++) {
get.access_timestamp += 1;
get.get_id = 0;
get.block_key = std::to_string(i);
cache_simulator->Access(get);
}
get.get_id = 4;
// A different block.
get.block_key = "100";
// Same row key and should not be inserted again.
get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber;
get.referenced_data_size = 1;
cache_simulator->Access(get);
AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 16, 103, 99, {},
{});
// After the evictions none of the previously cached row keys may remain.
for (auto const& key : {"1", "2", "4"}) {
auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key + "_0");
ASSERT_EQ(nullptr, handle);
}
}
TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) {
uint64_t block_id = 100;
BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId);