Block cache trace analysis: Write time series graphs in csv files (#5490)
Summary: This PR adds a feature to the block cache trace analysis tool to write statistics into csv files.
1. The analysis tool supports grouping the number of accesses per second by various labels, e.g., block, column family, block type, or a combination of them.
2. It also computes reuse distance and reuse interval.
Reuse distance: the cumulative size of unique blocks read between two consecutive accesses on the same block.
Reuse interval: the time between two consecutive accesses on the same block.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5490
Differential Revision: D15901322
Pulled By: HaoyuHuang
fbshipit-source-id: b5454fea408a32757a80be63de6fe1c8149ca70e
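
To make the two metrics concrete, here is an editor's sketch (not part of this change; the block names, sizes, and timestamps are invented) that computes both for a toy access sequence:

// Editor's illustration of reuse distance and reuse interval.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Access {
  std::string block;
  uint64_t size;       // block size in bytes (made up)
  uint64_t timestamp;  // access time in seconds (made up)
};

int main() {
  const std::vector<Access> trace = {
      {"A", 4096, 1}, {"B", 8192, 2}, {"C", 4096, 3}, {"A", 4096, 7}};
  std::map<std::string, std::set<std::string>> unique_since_last_access;
  std::map<std::string, uint64_t> block_size;
  std::map<std::string, uint64_t> last_access_time;
  for (const auto& a : trace) {
    block_size[a.block] = a.size;
    if (last_access_time.count(a.block) > 0) {
      // Reuse distance: cumulative size of the unique blocks read since the
      // previous access to this block (here B and C: 8192 + 4096 = 12288).
      uint64_t distance = 0;
      for (const auto& b : unique_since_last_access[a.block]) {
        distance += block_size[b];
      }
      // Reuse interval: time between two consecutive accesses (7 - 1 = 6).
      const uint64_t interval = a.timestamp - last_access_time[a.block];
      std::cout << a.block << ": reuse distance " << distance
                << " bytes, reuse interval " << interval << " seconds\n";
      unique_since_last_access[a.block].clear();
    }
    last_access_time[a.block] = a.timestamp;
    // Every other tracked block has now seen one more unique block.
    for (auto& entry : unique_since_last_access) {
      if (entry.first != a.block) {
        entry.second.insert(a.block);
      }
    }
    unique_since_last_access[a.block];  // make sure this block is tracked
  }
  return 0;
}

For the sequence A, B, C, A, the second access to A has a reuse distance of 12288 bytes (the sizes of B and C) and a reuse interval of 6 seconds.
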
This commit is contained in: parent acb80534ca, commit 554a6456aa
tools/block_cache_trace_analyzer.cc

@@ -11,7 +11,6 @@
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <set>
 #include <sstream>
 #include "monitoring/histogram.h"
 #include "util/gflags_compat.h"
@@ -42,12 +41,70 @@ DEFINE_bool(print_data_block_access_count_stats, false,
 DEFINE_int32(cache_sim_warmup_seconds, 0,
              "The number of seconds to warmup simulated caches. The hit/miss "
              "counters are reset after the warmup completes.");
-DEFINE_string(output_miss_ratio_curve_path, "",
-              "The output file to save the computed miss ratios. File format: "
-              "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses");
+DEFINE_string(
+    block_cache_analysis_result_dir, "",
+    "The directory that saves block cache analysis results. It contains 1) an "
+    "mrc file that saves the computed miss ratios for simulated caches. Its "
+    "format is "
+    "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses. 2) Several "
+    "\"label_access_timeline\" files that contain the number of accesses per "
+    "second grouped by the label. File format: "
+    "time,label_1_access_per_second,label_2_access_per_second,...,label_N_"
+    "access_per_second where N is the number of unique labels found in the "
+    "trace. 3) Several \"label_reuse_distance\" and \"label_reuse_interval\" "
+    "csv files that contain the reuse distance/interval grouped by label. File "
+    "format: bucket,label_1,label_2,...,label_N. Each bucket appears twice: "
+    "the first set of rows gives absolute counts and the second set gives "
+    "percentages.");
+DEFINE_string(
+    timeline_labels, "",
+    "Group the number of accesses per block per second using these labels. "
+    "Possible labels are a combination of the following: cf (column family), "
+    "sst, level, bt (block type), caller, block. For example, label \"cf_bt\" "
+    "means the number of accesses per second is grouped by unique pairs of "
+    "cf and bt. A label \"all\" contains the aggregated number of accesses "
+    "per second across all possible labels.");
+DEFINE_string(reuse_distance_labels, "",
+              "Group the reuse distance of a block using these labels. Reuse "
+              "distance is defined as the cumulative size of unique blocks "
+              "read between two consecutive accesses on the same block.");
+DEFINE_string(
+    reuse_distance_buckets, "",
+    "Group blocks by their reuse distances given these buckets. For "
+    "example, if 'reuse_distance_buckets' is '1K,1M,1G', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse distance less than 1KB, between 1K and 1M, and between "
+    "1M and 1G, respectively. The last bucket contains the number of blocks "
+    "with reuse distance larger than 1G.");
+DEFINE_string(
+    reuse_interval_labels, "",
+    "Group the reuse interval of a block using these labels. Reuse "
+    "interval is defined as the time between two consecutive accesses "
+    "on the same block.");
+DEFINE_string(
+    reuse_interval_buckets, "",
+    "Group blocks by their reuse interval given these buckets. For "
+    "example, if 'reuse_interval_buckets' is '1,10,100', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse interval less than 1 second, between 1 second and 10 "
+    "seconds, and between 10 seconds and 100 seconds, respectively. The last "
+    "bucket contains the number of blocks with reuse interval longer than 100 "
+    "seconds.");
 
 namespace rocksdb {
 namespace {
 
+const std::string kMissRatioCurveFileName = "mrc";
+const std::string kGroupbyBlock = "block";
+const std::string kGroupbyColumnFamily = "cf";
+const std::string kGroupbySSTFile = "sst";
+const std::string kGroupbyBlockType = "bt";
+const std::string kGroupbyCaller = "caller";
+const std::string kGroupbyLevel = "level";
+const std::string kGroupbyAll = "all";
+const std::set<std::string> kGroupbyLabels{
+    kGroupbyBlock, kGroupbyColumnFamily, kGroupbySSTFile, kGroupbyLevel,
+    kGroupbyBlockType, kGroupbyCaller, kGroupbyAll};
 
 std::string block_type_to_string(TraceType type) {
   switch (type) {
     case kBlockTraceFilterBlock:
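
The bucket flags above drive a simple histogram: the analyzer pre-creates one counter per bucket boundary (plus a catch-all for values beyond the last boundary) and bins each observed reuse distance or interval with std::map::upper_bound. The following is an editor's sketch of that scheme, not code from this commit; the boundaries and observed values are made up:

#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <set>
#include <vector>

int main() {
  // Roughly what -reuse_distance_buckets=1K,1M,1G expands to: three
  // boundaries plus a catch-all bucket for anything larger than the last one.
  std::set<uint64_t> boundaries = {1024, 1024 * 1024, 1024 * 1024 * 1024};
  boundaries.insert(std::numeric_limits<uint64_t>::max());

  // One counter per bucket, keyed by the bucket's exclusive upper bound.
  std::map<uint64_t, uint64_t> bucket_counts;
  for (uint64_t b : boundaries) {
    bucket_counts[b] = 0;
  }

  // Bin a few made-up reuse distances (in bytes).
  const std::vector<uint64_t> observed = {512, 4096, 3000000000ULL};
  for (uint64_t value : observed) {
    // upper_bound returns the first bucket whose bound is strictly greater
    // than the value, so a value equal to a bound lands in the next bucket.
    bucket_counts.upper_bound(value)->second += 1;
  }

  for (const auto& kv : bucket_counts) {
    std::cout << "values < " << kv.first << ": " << kv.second << "\n";
  }
  return 0;
}

A value equal to a boundary falls into the next bucket, which matches the "less than" wording of the flag descriptions above.
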
@@ -146,8 +203,9 @@ void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) {
     trace_start_time_ = access.access_timestamp;
   }
   // access.access_timestamp is in microseconds.
-  if (!warmup_complete_ && trace_start_time_ + warmup_seconds_ * 1000000 <=
-                               access.access_timestamp) {
+  if (!warmup_complete_ &&
+      trace_start_time_ + warmup_seconds_ * kMicrosInSecond <=
+          access.access_timestamp) {
     for (auto& sim_cache : sim_caches_) {
       sim_cache->reset_counter();
     }
@@ -162,14 +220,16 @@ void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) {
   }
 }
 
-void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const {
+void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const {
   if (!cache_simulator_) {
     return;
   }
-  if (output_miss_ratio_curve_path_.empty()) {
+  if (output_dir_.empty()) {
     return;
   }
-  std::ofstream out(output_miss_ratio_curve_path_);
+  const std::string output_miss_ratio_curve_path =
+      output_dir_ + "/" + kMissRatioCurveFileName;
+  std::ofstream out(output_miss_ratio_curve_path);
   if (!out.is_open()) {
     return;
   }
@@ -203,14 +263,345 @@ void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const {
   out.close();
 }
 
+std::set<std::string> BlockCacheTraceAnalyzer::ParseLabelStr(
+    const std::string& label_str) const {
+  std::stringstream ss(label_str);
+  std::set<std::string> labels;
+  // label_str is in the form of "label1_label2_label3", e.g., cf_bt.
+  while (ss.good()) {
+    std::string label_name;
+    getline(ss, label_name, '_');
+    if (kGroupbyLabels.find(label_name) == kGroupbyLabels.end()) {
+      // Unknown label name.
+      fprintf(stderr, "Unknown label name %s, label string %s\n",
+              label_name.c_str(), label_str.c_str());
+      return {};
+    }
+    labels.insert(label_name);
+  }
+  return labels;
+}
+
+std::string BlockCacheTraceAnalyzer::BuildLabel(
+    const std::set<std::string>& labels, const std::string& cf_name,
+    uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller,
+    const std::string& block_key) const {
+  std::map<std::string, std::string> label_value_map;
+  label_value_map[kGroupbyAll] = kGroupbyAll;
+  label_value_map[kGroupbyLevel] = std::to_string(level);
+  label_value_map[kGroupbyCaller] = caller_to_string(caller);
+  label_value_map[kGroupbySSTFile] = std::to_string(fd);
+  label_value_map[kGroupbyBlockType] = block_type_to_string(type);
+  label_value_map[kGroupbyColumnFamily] = cf_name;
+  label_value_map[kGroupbyBlock] = block_key;
+  // Concatenate the label values.
+  std::string label;
+  for (auto const& l : labels) {
+    label += label_value_map[l];
+    label += "-";
+  }
+  if (!label.empty()) {
+    label.pop_back();
+  }
+  return label;
+}
+
+void BlockCacheTraceAnalyzer::WriteAccessTimeline(
+    const std::string& label_str) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  uint64_t start_time = port::kMaxUint64;
+  uint64_t end_time = 0;
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_access_timeline;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          for (auto const& timeline :
+               block_access_info.second.caller_num_accesses_timeline) {
+            const TableReaderCaller caller = timeline.first;
+            const std::string& block_key = block_access_info.first;
+            const std::string label =
+                BuildLabel(labels, cf_name, fd, level, type, caller, block_key);
+            for (auto const& naccess : timeline.second) {
+              const uint64_t timestamp = naccess.first;
+              const uint64_t num = naccess.second;
+              label_access_timeline[label][timestamp] += num;
+              start_time = std::min(start_time, timestamp);
+              end_time = std::max(end_time, timestamp);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // We have label_access_timeline now. Write them into a file.
+  const std::string output_path =
+      output_dir_ + "/" + label_str + "_access_timeline";
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("time");
+  for (auto const& label : label_access_timeline) {
+    header += ",";
+    header += label.first;
+  }
+  out << header << std::endl;
+  std::string row;
+  for (uint64_t now = start_time; now <= end_time; now++) {
+    row = std::to_string(now);
+    for (auto const& label : label_access_timeline) {
+      auto it = label.second.find(now);
+      row += ",";
+      if (it != label.second.end()) {
+        row += std::to_string(it->second);
+      } else {
+        row += "0";
+      }
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
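// Editor's aside (not part of this diff): a file produced by
// WriteAccessTimeline above has a "time,label_1,...,label_N" header followed
// by one row per second between the first and last access, with an explicit 0
// for seconds in which a label saw no accesses. Below is a minimal sketch of
// reading such a file back; the path is hypothetical.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::ifstream in("/tmp/analysis/cf_access_timeline");  // hypothetical path
  if (!in.is_open()) {
    return 1;
  }
  std::string header;
  std::getline(in, header);  // "time,label_1_access_per_second,..."
  std::string line;
  while (std::getline(in, line)) {
    std::stringstream ss(line);
    std::string field;
    std::getline(ss, field, ',');  // first column: timestamp in seconds
    const std::string second = field;
    uint64_t total = 0;
    while (std::getline(ss, field, ',')) {
      total += std::stoull(field);  // remaining columns: per-label counts
    }
    std::cout << "second " << second << ": " << total << " accesses\n";
  }
  return 0;
}
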
+void BlockCacheTraceAnalyzer::WriteReuseDistance(
+    const std::string& label_str,
+    const std::set<uint64_t>& distance_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_distance_num_reuses;
+  uint64_t total_num_reuses = 0;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          const std::string& block_key = block_access_info.first;
+          const std::string label = BuildLabel(
+              labels, cf_name, fd, level, type,
+              TableReaderCaller::kMaxBlockCacheLookupCaller, block_key);
+          if (label_distance_num_reuses.find(label) ==
+              label_distance_num_reuses.end()) {
+            // The first time we encounter this label.
+            for (auto const& distance_bucket : distance_buckets) {
+              label_distance_num_reuses[label][distance_bucket] = 0;
+            }
+          }
+          for (auto const& reuse_distance :
+               block_access_info.second.reuse_distance_count) {
+            label_distance_num_reuses[label]
+                .upper_bound(reuse_distance.first)
+                ->second += reuse_distance.second;
+            total_num_reuses += reuse_distance.second;
+          }
+        }
+      }
+    }
+  }
+
+  // We have label_distance_num_reuses now. Write it into a file.
+  const std::string output_path =
+      output_dir_ + "/" + label_str + "_reuse_distance";
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("bucket");
+  for (auto const& label_it : label_distance_num_reuses) {
+    header += ",";
+    header += label_it.first;
+  }
+  out << header << std::endl;
+  // Absolute values.
+  for (auto const& bucket : distance_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_distance_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(it->second);
+    }
+    out << row << std::endl;
+  }
+  // Percentage values.
+  for (auto const& bucket : distance_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_distance_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(percent(it->second, total_num_reuses));
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats(
+    const std::string& label, const std::set<uint64_t>& time_buckets,
+    const std::map<uint64_t, uint64_t> timeline,
+    std::map<std::string, std::map<uint64_t, uint64_t>>* label_time_num_reuses,
+    uint64_t* total_num_reuses) const {
+  assert(label_time_num_reuses);
+  assert(total_num_reuses);
+  if (label_time_num_reuses->find(label) == label_time_num_reuses->end()) {
+    // The first time we encounter this label.
+    for (auto const& time_bucket : time_buckets) {
+      (*label_time_num_reuses)[label][time_bucket] = 0;
+    }
+  }
+  auto it = timeline.begin();
+  uint64_t prev_timestamp = it->first;
+  const uint64_t prev_num = it->second;
+  it++;
+  // Reused within one second.
+  if (prev_num > 1) {
+    (*label_time_num_reuses)[label].upper_bound(1)->second += prev_num - 1;
+    *total_num_reuses += prev_num - 1;
+  }
+  while (it != timeline.end()) {
+    const uint64_t timestamp = it->first;
+    const uint64_t num = it->second;
+    const uint64_t reuse_interval = timestamp - prev_timestamp;
+    (*label_time_num_reuses)[label].upper_bound(reuse_interval)->second += num;
+    *total_num_reuses += num;
+    // Move to the next access so the interval is measured between
+    // consecutive accesses.
+    prev_timestamp = timestamp;
+    it++;
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseInterval(
+    const std::string& label_str,
+    const std::set<uint64_t>& time_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_time_num_reuses;
+  uint64_t total_num_reuses = 0;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          const std::string& block_key = block_access_info.first;
+          if (labels.find(kGroupbyCaller) != labels.end()) {
+            for (auto const& timeline :
+                 block_access_info.second.caller_num_accesses_timeline) {
+              const TableReaderCaller caller = timeline.first;
+              const std::string label = BuildLabel(labels, cf_name, fd, level,
+                                                   type, caller, block_key);
+              UpdateReuseIntervalStats(label, time_buckets, timeline.second,
+                                       &label_time_num_reuses,
+                                       &total_num_reuses);
+            }
+            continue;
+          }
+          // Does not group by caller so we need to flatten the access timeline.
+          const std::string label = BuildLabel(
+              labels, cf_name, fd, level, type,
+              TableReaderCaller::kMaxBlockCacheLookupCaller, block_key);
+          std::map<uint64_t, uint64_t> timeline;
+          for (auto const& caller_timeline :
+               block_access_info.second.caller_num_accesses_timeline) {
+            for (auto const& time_naccess : caller_timeline.second) {
+              timeline[time_naccess.first] += time_naccess.second;
+            }
+          }
+          UpdateReuseIntervalStats(label, time_buckets, timeline,
+                                   &label_time_num_reuses, &total_num_reuses);
+        }
+      }
+    }
+  }
+
+  // We have label_time_num_reuses now. Write it into a file.
+  const std::string output_path =
+      output_dir_ + "/" + label_str + "_reuse_interval";
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("bucket");
+  for (auto const& label_it : label_time_num_reuses) {
+    header += ",";
+    header += label_it.first;
+  }
+  out << header << std::endl;
+  // Absolute values.
+  for (auto const& bucket : time_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_time_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(it->second);
+    }
+    out << row << std::endl;
+  }
+  // Percentage values.
+  for (auto const& bucket : time_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_time_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(percent(it->second, total_num_reuses));
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
 BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer(
-    const std::string& trace_file_path,
-    const std::string& output_miss_ratio_curve_path,
+    const std::string& trace_file_path, const std::string& output_dir,
     std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator)
-    : trace_file_path_(trace_file_path),
-      output_miss_ratio_curve_path_(output_miss_ratio_curve_path),
-      cache_simulator_(std::move(cache_simulator)) {
-  env_ = rocksdb::Env::Default();
+    : env_(rocksdb::Env::Default()),
+      trace_file_path_(trace_file_path),
+      output_dir_(output_dir),
+      cache_simulator_(std::move(cache_simulator)) {}
 
+void BlockCacheTraceAnalyzer::ComputeReuseDistance(
+    BlockAccessInfo* info) const {
+  assert(info);
+  if (info->num_accesses == 0) {
+    return;
+  }
+  uint64_t reuse_distance = 0;
+  for (auto const& block_key : info->unique_blocks_since_last_access) {
+    auto const& it = block_info_map_.find(block_key);
+    // This block must exist.
+    assert(it != block_info_map_.end());
+    reuse_distance += it->second->block_size;
+  }
+  info->reuse_distance_count[reuse_distance] += 1;
+  // We clear this hash set since this is the second access on this block.
+  info->unique_blocks_since_last_access.clear();
+}
+
 void BlockCacheTraceAnalyzer::RecordAccess(
@@ -223,7 +614,23 @@ void BlockCacheTraceAnalyzer::RecordAccess(
       file_aggr.block_type_aggregates_map[access.block_type];
   BlockAccessInfo& block_access_info =
       block_type_aggr.block_access_info_map[access.block_key];
+  ComputeReuseDistance(&block_access_info);
   block_access_info.AddAccess(access);
+  block_info_map_[access.block_key] = &block_access_info;
+
+  // Add this block to all existing blocks.
+  for (auto& cf_aggregates : cf_aggregates_map_) {
+    for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      for (auto& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        for (auto& existing_block :
+             block_type_aggregates.second.block_access_info_map) {
+          existing_block.second.unique_blocks_since_last_access.insert(
+              access.block_key);
+        }
+      }
+    }
+  }
 }
 
 Status BlockCacheTraceAnalyzer::Analyze() {
@@ -659,6 +1066,18 @@ std::vector<CacheConfiguration> parse_cache_config_file(
   return configs;
 }
 
+std::set<uint64_t> parse_buckets(const std::string& bucket_str) {
+  std::set<uint64_t> buckets;
+  std::stringstream ss(bucket_str);
+  while (ss.good()) {
+    std::string bucket;
+    getline(ss, bucket, ',');
+    buckets.insert(ParseUint64(bucket));
+  }
+  buckets.insert(port::kMaxUint64);
+  return buckets;
+}
+
 int block_cache_trace_analyzer_tool(int argc, char** argv) {
   ParseCommandLineFlags(&argc, &argv, true);
   if (FLAGS_block_cache_trace_path.empty()) {
@@ -678,7 +1097,7 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) {
        warmup_seconds, downsample_ratio, cache_configs));
   }
   BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path,
-                                   FLAGS_output_miss_ratio_curve_path,
+                                   FLAGS_block_cache_analysis_result_dir,
                                    std::move(cache_simulator));
   Status s = analyzer.Analyze();
   if (!s.IsIncomplete()) {
@@ -701,7 +1120,38 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) {
     analyzer.PrintDataBlockAccessStats();
   }
   print_break_lines(/*num_break_lines=*/3);
-  analyzer.PrintMissRatioCurves();
+  analyzer.WriteMissRatioCurves();
 
+  if (!FLAGS_timeline_labels.empty()) {
+    std::stringstream ss(FLAGS_timeline_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      analyzer.WriteAccessTimeline(label);
+    }
+  }
+
+  if (!FLAGS_reuse_distance_labels.empty() &&
+      !FLAGS_reuse_distance_buckets.empty()) {
+    std::set<uint64_t> buckets = parse_buckets(FLAGS_reuse_distance_buckets);
+    std::stringstream ss(FLAGS_reuse_distance_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      analyzer.WriteReuseDistance(label, buckets);
+    }
+  }
+
+  if (!FLAGS_reuse_interval_labels.empty() &&
+      !FLAGS_reuse_interval_buckets.empty()) {
+    std::set<uint64_t> buckets = parse_buckets(FLAGS_reuse_interval_buckets);
+    std::stringstream ss(FLAGS_reuse_interval_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      analyzer.WriteReuseInterval(label, buckets);
+    }
+  }
   return 0;
 }

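Taken together, the analysis can now be requested per label from the command line. A hypothetical invocation (the paths are placeholders; the flags are the ones defined in this change, and the bucket strings accept the same K/M/G-suffixed values the test below uses):

./block_cache_trace_analyzer \
    -block_cache_trace_path=/path/to/block_cache_trace \
    -block_cache_analysis_result_dir=/path/to/result_dir \
    -timeline_labels=cf,cf_bt \
    -reuse_distance_labels=bt \
    -reuse_distance_buckets=1K,1M,1G \
    -reuse_interval_labels=cf \
    -reuse_interval_buckets=1,10,100

Without -block_cache_sim_config_path no caches are simulated, so the mrc file is not produced and only the *_access_timeline, *_reuse_distance, and *_reuse_interval files appear in the result directory.
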
tools/block_cache_trace_analyzer.h

@@ -6,6 +6,7 @@
 #pragma once
 
 #include <map>
+#include <set>
 #include <vector>
 
 #include "rocksdb/env.h"
@@ -14,6 +15,8 @@
 
 namespace rocksdb {
 
+const uint64_t kMicrosInSecond = 1000000;
+
 class BlockCacheTraceAnalyzer;
 
 // A cache configuration provided by user.
@@ -73,6 +76,14 @@ struct BlockAccessInfo {
       non_exist_key_num_access_map;  // for keys do not exist in this block.
   uint64_t num_referenced_key_exist_in_block = 0;
   std::map<TableReaderCaller, uint64_t> caller_num_access_map;
+  // caller:timestamp:number_of_accesses. The granularity of the timestamp is
+  // seconds.
+  std::map<TableReaderCaller, std::map<uint64_t, uint64_t>>
+      caller_num_accesses_timeline;
+  // Unique blocks since the last access.
+  std::set<std::string> unique_blocks_since_last_access;
+  // Number of reuses grouped by reuse distance.
+  std::map<uint64_t, uint64_t> reuse_distance_count;
 
   void AddAccess(const BlockCacheTraceRecord& access) {
     if (first_access_time == 0) {
@@ -82,10 +93,13 @@ struct BlockAccessInfo {
     block_size = access.block_size;
     caller_num_access_map[access.caller]++;
     num_accesses++;
+    // access.access_timestamp is in microseconds.
+    const uint64_t timestamp_in_seconds =
+        access.access_timestamp / kMicrosInSecond;
+    caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1;
     if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type,
                                                         access.caller)) {
       num_keys = access.num_keys_in_block;
 
       if (access.referenced_key_exist_in_block == Boolean::kTrue) {
        key_num_access_map[access.referenced_key]++;
        num_referenced_key_exist_in_block++;
@@ -115,8 +129,7 @@ struct ColumnFamilyAccessInfoAggregate {
 class BlockCacheTraceAnalyzer {
  public:
   BlockCacheTraceAnalyzer(
-      const std::string& trace_file_path,
-      const std::string& output_miss_ratio_curve_path,
+      const std::string& trace_file_path, const std::string& output_dir,
       std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator);
   ~BlockCacheTraceAnalyzer() = default;
   // No copy and move.
@@ -165,7 +178,24 @@ class BlockCacheTraceAnalyzer {
   // accesses on keys exist in a data block and its break down by column family.
   void PrintDataBlockAccessStats() const;
 
-  void PrintMissRatioCurves() const;
+  // Write miss ratio curves of simulated cache configurations into a csv file
+  // saved in 'output_dir'.
+  void WriteMissRatioCurves() const;
+
+  // Write the access timeline into a csv file saved in 'output_dir'.
+  void WriteAccessTimeline(const std::string& label) const;
+
+  // Write the reuse distance into a csv file saved in 'output_dir'. Reuse
+  // distance is defined as the cumulative size of unique blocks read between
+  // two consecutive accesses on the same block.
+  void WriteReuseDistance(const std::string& label_str,
+                          const std::set<uint64_t>& distance_buckets) const;
+
+  // Write the reuse interval into a csv file saved in 'output_dir'. Reuse
+  // interval is defined as the time between two consecutive accesses on the
+  // same block.
+  void WriteReuseInterval(const std::string& label_str,
+                          const std::set<uint64_t>& time_buckets) const;
 
   const std::map<std::string, ColumnFamilyAccessInfoAggregate>&
   TEST_cf_aggregates_map() const {
@@ -173,15 +203,33 @@ class BlockCacheTraceAnalyzer {
   }
 
  private:
+  std::set<std::string> ParseLabelStr(const std::string& label_str) const;
+
+  std::string BuildLabel(const std::set<std::string>& labels,
+                         const std::string& cf_name, uint64_t fd,
+                         uint32_t level, TraceType type,
+                         TableReaderCaller caller,
+                         const std::string& block_key) const;
+
+  void ComputeReuseDistance(BlockAccessInfo* info) const;
+
   void RecordAccess(const BlockCacheTraceRecord& access);
 
+  void UpdateReuseIntervalStats(
+      const std::string& label, const std::set<uint64_t>& time_buckets,
+      const std::map<uint64_t, uint64_t> timeline,
+      std::map<std::string, std::map<uint64_t, uint64_t>>*
+          label_time_num_reuses,
+      uint64_t* total_num_reuses) const;
+
   rocksdb::Env* env_;
   const std::string trace_file_path_;
-  const std::string output_miss_ratio_curve_path_;
+  const std::string output_dir_;
 
   BlockCacheTraceHeader header_;
   std::unique_ptr<BlockCacheTraceSimulator> cache_simulator_;
   std::map<std::string, ColumnFamilyAccessInfoAggregate> cf_aggregates_map_;
+  std::map<std::string, BlockAccessInfo*> block_info_map_;
 };
 
 int block_cache_trace_analyzer_tool(int argc, char** argv);

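Because the header exposes the analyzer as a class, the csv writers can also be driven programmatically rather than through block_cache_trace_analyzer_tool. A hedged editor's sketch follows; the include path, trace path, output directory, and bucket choices are assumptions, and error handling is minimal:

// Editor's sketch, not part of this commit. Paths below are hypothetical.
#include <cstdint>
#include <limits>

#include "tools/block_cache_trace_analyzer.h"  // assumed include path

int main() {
  // No cache simulator: miss ratio curves are skipped, csv outputs still work.
  rocksdb::BlockCacheTraceAnalyzer analyzer(
      "/tmp/block_cache_trace",             // trace written by the tracer
      "/tmp/block_cache_analysis_results",  // output_dir for the csv files
      /*cache_simulator=*/nullptr);
  rocksdb::Status s = analyzer.Analyze();
  if (!s.ok() && !s.IsIncomplete()) {
    return 1;  // the tool treats Incomplete (end of trace) as success
  }
  analyzer.WriteAccessTimeline("cf");
  analyzer.WriteReuseDistance(
      "bt", {1024, 1024 * 1024, std::numeric_limits<uint64_t>::max()});
  analyzer.WriteReuseInterval(
      "cf", {1, 10, 100, std::numeric_limits<uint64_t>::max()});
  return 0;
}
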
tools/block_cache_trace_analyzer_test.cc

@@ -49,7 +49,13 @@ class BlockCacheTracerTest : public testing::Test {
     EXPECT_OK(env_->CreateDir(test_path_));
     trace_file_path_ = test_path_ + "/block_cache_trace";
     block_cache_sim_config_path_ = test_path_ + "/block_cache_sim_config";
-    output_miss_ratio_curve_path_ = test_path_ + "/out_miss_ratio_curve";
+    timeline_labels_ =
+        "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller";
+    reuse_distance_labels_ =
+        "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller";
+    reuse_distance_buckets_ = "1,1K,1M,1G";
+    reuse_interval_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt";
+    reuse_interval_buckets_ = "1,10,100,1000";
   }
 
   ~BlockCacheTracerTest() override {
@@ -85,11 +91,12 @@ class BlockCacheTracerTest : public testing::Test {
     assert(writer);
     for (uint32_t i = 0; i < nblocks; i++) {
       uint32_t key_id = from_key_id + i;
+      uint32_t timestamp = (key_id + 1) * kMicrosInSecond;
       BlockCacheTraceRecord record;
       record.block_type = block_type;
       record.block_size = kBlockSize + key_id;
       record.block_key = kBlockKeyPrefix + std::to_string(key_id);
-      record.access_timestamp = env_->NowMicros();
+      record.access_timestamp = timestamp;
       record.cf_id = kCFId;
       record.cf_name = kDefaultColumnFamilyName;
       record.caller = GetCaller(key_id);
@@ -146,11 +153,17 @@ class BlockCacheTracerTest : public testing::Test {
         "./block_cache_trace_analyzer",
         "-block_cache_trace_path=" + trace_file_path_,
         "-block_cache_sim_config_path=" + block_cache_sim_config_path_,
-        "-output_miss_ratio_curve_path=" + output_miss_ratio_curve_path_,
+        "-block_cache_analysis_result_dir=" + test_path_,
         "-print_block_size_stats",
         "-print_access_count_stats",
         "-print_data_block_access_count_stats",
-        "-cache_sim_warmup_seconds=0"};
+        "-cache_sim_warmup_seconds=0",
+        "-timeline_labels=" + timeline_labels_,
+        "-reuse_distance_labels=" + reuse_distance_labels_,
+        "-reuse_distance_buckets=" + reuse_distance_buckets_,
+        "-reuse_interval_labels=" + reuse_interval_labels_,
+        "-reuse_interval_buckets=" + reuse_interval_buckets_,
+    };
     char arg_buffer[kArgBufferSize];
     char* argv[kMaxArgCount];
     int argc = 0;
@@ -168,10 +181,14 @@ class BlockCacheTracerTest : public testing::Test {
 
   Env* env_;
   EnvOptions env_options_;
-  std::string output_miss_ratio_curve_path_;
   std::string block_cache_sim_config_path_;
   std::string trace_file_path_;
   std::string test_path_;
+  std::string timeline_labels_;
+  std::string reuse_distance_labels_;
+  std::string reuse_distance_buckets_;
+  std::string reuse_interval_labels_;
+  std::string reuse_interval_buckets_;
 };
 
 TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) {
@@ -199,7 +216,8 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) {
     // Validate the cache miss ratios.
    const std::vector<uint64_t> expected_capacities{1024, 1024 * 1024,
                                                    1024 * 1024 * 1024};
-    std::ifstream infile(output_miss_ratio_curve_path_);
+    const std::string mrc_path = test_path_ + "/mrc";
+    std::ifstream infile(mrc_path);
     uint32_t config_index = 0;
     std::string line;
     // Read header.
@@ -224,8 +242,91 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) {
     }
     ASSERT_EQ(expected_capacities.size(), config_index);
     infile.close();
+    ASSERT_OK(env_->DeleteFile(mrc_path));
   }
+  {
+    // Validate the timeline csv files.
+    const uint32_t expected_num_lines = 50;
+    std::stringstream ss(timeline_labels_);
+    while (ss.good()) {
+      std::string l;
+      ASSERT_TRUE(getline(ss, l, ','));
+      const std::string timeline_file =
+          test_path_ + "/" + l + "_access_timeline";
+      std::ifstream infile(timeline_file);
+      std::string line;
+      uint32_t nlines = 0;
+      ASSERT_TRUE(getline(infile, line));
+      uint64_t expected_time = 1;
+      while (getline(infile, line)) {
+        std::stringstream ss_naccess(line);
+        uint32_t naccesses = 0;
+        std::string substr;
+        uint32_t time = 0;
+        while (ss_naccess.good()) {
+          ASSERT_TRUE(getline(ss_naccess, substr, ','));
+          if (time == 0) {
+            time = ParseUint32(substr);
+            continue;
+          }
+          naccesses += ParseUint32(substr);
+        }
+        nlines++;
+        ASSERT_EQ(1, naccesses);
+        ASSERT_EQ(expected_time, time);
+        expected_time += 1;
+      }
+      ASSERT_EQ(expected_num_lines, nlines);
+      ASSERT_OK(env_->DeleteFile(timeline_file));
+    }
+  }
+  {
+    // Validate the reuse_interval and reuse_distance csv files.
+    std::map<std::string, std::string> test_reuse_csv_files;
+    test_reuse_csv_files["_reuse_interval"] = reuse_interval_labels_;
+    test_reuse_csv_files["_reuse_distance"] = reuse_distance_labels_;
+    for (auto const& test : test_reuse_csv_files) {
+      const std::string& file_suffix = test.first;
+      const std::string& labels = test.second;
+      const uint32_t expected_num_rows = 10;
+      const uint32_t expected_num_rows_absolute_values = 5;
+      const uint32_t expected_reused_blocks = 0;
+      std::stringstream ss(labels);
+      while (ss.good()) {
+        std::string l;
+        ASSERT_TRUE(getline(ss, l, ','));
+        const std::string reuse_csv_file = test_path_ + "/" + l + file_suffix;
+        std::ifstream infile(reuse_csv_file);
+        std::string line;
+        ASSERT_TRUE(getline(infile, line));
+        uint32_t nblocks = 0;
+        double npercentage = 0;
+        uint32_t nrows = 0;
+        while (getline(infile, line)) {
+          std::stringstream ss_naccess(line);
+          bool label_read = false;
+          nrows++;
+          while (ss_naccess.good()) {
+            std::string substr;
+            ASSERT_TRUE(getline(ss_naccess, substr, ','));
+            if (!label_read) {
+              label_read = true;
+              continue;
+            }
+            if (nrows < expected_num_rows_absolute_values) {
+              nblocks += ParseUint32(substr);
+            } else {
+              npercentage += ParseDouble(substr);
+            }
+          }
+        }
+        ASSERT_EQ(expected_num_rows, nrows);
+        ASSERT_EQ(expected_reused_blocks, nblocks);
+        ASSERT_LT(npercentage, 0);
+        ASSERT_OK(env_->DeleteFile(reuse_csv_file));
+      }
+    }
+  }
-  ASSERT_OK(env_->DeleteFile(output_miss_ratio_curve_path_));
   ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_));
 }