Block cache analyzer: python script to plot graphs (#5673)

Summary:
This PR updates the Python script that plots graphs from the stats output of the block cache analyzer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5673

Test Plan: Manually ran the script to generate graphs.

Differential Revision: D16657145

Pulled By: HaoyuHuang

fbshipit-source-id: fd510b5fd4307835f9a986fac545734dbe003d28
This commit is contained in:
haoyuhuang 2019-08-05 18:31:42 -07:00 committed by Facebook Github Bot
parent b1a02ffeab
commit f4a616ebf9

View File

@ -1,12 +1,17 @@
#!/usr/bin/env python3
import csv
import math
import os
import random
import sys
import matplotlib
matplotlib.use("Agg")
import matplotlib.backends.backend_pdf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Make sure a legend has the same color across all generated graphs.
@ -19,7 +24,7 @@ def get_cmap(n, name="hsv"):
color_index = 0
bar_color_maps = {}
colors = []
n_colors = 60
n_colors = 360
linear_colors = get_cmap(n_colors)
for i in range(n_colors):
colors.append(linear_colors(i))
@ -35,19 +40,20 @@ def num_to_gb(n):
return "{0:.2f}".format(float(n) / one_gb)
def plot_miss_ratio_graphs(csv_result_dir, output_result_dir):
mrc_file_path = csv_result_dir + "/mrc"
if not os.path.exists(mrc_file_path):
return
def plot_miss_stats_graphs(
csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
miss_ratios = {}
print("Processing file {}".format(mrc_file_path))
for file in os.listdir(csv_result_dir):
if not file.startswith(file_prefix):
continue
if not file.endswith(file_suffix):
continue
print("Processing file {}/{}".format(csv_result_dir, file))
mrc_file_path = csv_result_dir + "/" + file
with open(mrc_file_path, "r") as csvfile:
rows = csv.reader(csvfile, delimiter=",")
is_header = False
for row in rows:
if not is_header:
is_header = True
continue
cache_name = row[0]
num_shard_bits = int(row[1])
ghost_capacity = int(row[2])
@ -58,18 +64,71 @@ def plot_miss_ratio_graphs(csv_result_dir, output_result_dir):
miss_ratios[config] = {}
miss_ratios[config]["x"] = []
miss_ratios[config]["y"] = []
miss_ratios[config]["x"].append(num_to_gb(capacity))
miss_ratios[config]["x"].append(capacity)
miss_ratios[config]["y"].append(miss_ratio)
fig = plt.figure()
for config in miss_ratios:
plt.plot(miss_ratios[config]["x"], miss_ratios[config]["y"], label=config)
plt.xlabel("Cache capacity (GB)")
plt.ylabel("Miss Ratio (%)")
# plt.xscale('log', basex=2)
plt.plot(
miss_ratios[config]["x"], miss_ratios[config]["y"], label=config
)
plt.xlabel("Cache capacity")
plt.ylabel(ylabel)
plt.xscale("log", basex=2)
plt.ylim(ymin=0)
plt.title("RocksDB block cache miss ratios")
plt.title("{}".format(file))
plt.legend()
fig.savefig(output_result_dir + "/mrc.pdf", bbox_inches="tight")
fig.savefig(
output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
)
def plot_miss_stats_diff_lru_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot each cache configuration's miss stat relative to the LRU baseline.

    Every file in ``csv_result_dir`` whose name starts with ``file_prefix``
    and ends with ``file_suffix`` is read as a headerless CSV with the columns
    ``cache_name, num_shard_bits, ghost_capacity, capacity, value``. Rows are
    grouped by the configuration key ``"<cache_name>-<num_shard_bits>-
    <ghost_capacity>"``. For every configuration the value of the
    ``"lru-0-0"`` configuration at the same capacity is subtracted, and the
    differences are drawn against capacity on a log2 x axis. The figure is
    saved as ``<output_result_dir>/<pdf_file_name>.pdf``.

    Returns ``None`` without producing a figure when no ``"lru-0-0"`` rows
    were found.
    """
    miss_ratios = {}
    # Name of the last stats file that was actually read; used as the title so
    # that an unrelated directory entry never ends up in the graph.
    last_matched_file = None
    for file in os.listdir(csv_result_dir):
        if not (file.startswith(file_prefix) and file.endswith(file_suffix)):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        last_matched_file = file
        with open(csv_result_dir + "/" + file, "r") as csvfile:
            for row in csv.reader(csvfile, delimiter=","):
                # Parse the whole row before touching the accumulated data.
                cache_name = row[0]
                num_shard_bits = int(row[1])
                ghost_capacity = int(row[2])
                capacity = int(row[3])
                miss_ratio = float(row[4])
                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
                series = miss_ratios.setdefault(config, {"x": [], "y": []})
                series["x"].append(capacity)
                series["y"].append(miss_ratio)

    baseline = miss_ratios.get("lru-0-0")
    if baseline is None:
        return

    fig = plt.figure()
    for config, series in miss_ratios.items():
        # Index this configuration by capacity. setdefault keeps the first
        # value seen for a capacity, the same value a front-to-back scan
        # would pick.
        value_at_capacity = {}
        for capacity, value in zip(series["x"], series["y"]):
            value_at_capacity.setdefault(capacity, value)
        # Capacities the configuration has no sample for keep a difference of 0.
        diffs = [
            value_at_capacity[capacity] - base_value
            if capacity in value_at_capacity
            else 0
            for capacity, base_value in zip(baseline["x"], baseline["y"])
        ]
        plt.plot(baseline["x"], diffs, label=config)
    plt.xlabel("Cache capacity")
    plt.ylabel(ylabel)
    # NOTE(review): Matplotlib 3.3 renamed `basex` to `base`; the old keyword
    # is kept here to match the other plotting functions in this file --
    # confirm against the Matplotlib version this script is run with.
    plt.xscale("log", basex=2)
    plt.title("{}".format(last_matched_file))
    plt.legend()
    fig.savefig(
        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
    )
    # This function is called many times per run; release the figure so open
    # figures do not pile up.
    plt.close(fig)
def sanitize(label):
@ -143,6 +202,7 @@ def read_data_for_plot(csvfile, vertical):
def plot_line_charts(
csv_result_dir,
output_result_dir,
filename_prefix,
filename_suffix,
pdf_name,
xlabel,
@ -151,11 +211,14 @@ def plot_line_charts(
vertical,
legend,
):
global color_index, bar_color_maps, colors
pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name)
for file in os.listdir(csv_result_dir):
if not file.endswith(filename_suffix):
continue
print("Processing file {}".format(file))
if not file.startswith(filename_prefix):
continue
print("Processing file {}/{}".format(csv_result_dir, file))
with open(csv_result_dir + "/" + file, "r") as csvfile:
x, labels, label_stats = read_data_for_plot(csvfile, vertical)
if len(x) == 0 or len(labels) == 0:
@ -163,10 +226,15 @@ def plot_line_charts(
# plot figure
fig = plt.figure()
for label_index in label_stats:
# Assign a unique color to this label.
if labels[label_index] not in bar_color_maps:
bar_color_maps[labels[label_index]] = colors[color_index]
color_index += 1
plt.plot(
[int(x[i]) for i in range(len(x))],
label_stats[label_index],
[int(x[i]) for i in range(len(x) - 1)],
label_stats[label_index][:-1],
label=labels[label_index],
color=bar_color_maps[labels[label_index]],
)
# Translate time unit into x labels.
@ -239,10 +307,29 @@ def plot_stacked_bar_charts(
pdf.close()
def plot_access_timeline(csv_result_dir, output_result_dir):
def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title):
    """Write one annotated heatmap page per matching CSV file into a PDF.

    Every file in ``csv_result_dir`` whose name ends with ``filename_suffix``
    is loaded with pandas and pivoted so that the ``label`` column supplies
    the rows, the ``corr`` column supplies the columns and the ``value``
    column fills the cells. Each table is drawn with seaborn and appended as
    a page to ``<output_result_dir>/<pdf_name>``. The page title is ``title``
    followed by the source file name.
    """
    pdf_path = "{}/{}".format(output_result_dir, pdf_name)
    # The context manager closes the PDF even when a CSV fails to load.
    with matplotlib.backends.backend_pdf.PdfPages(pdf_path) as pdf:
        for file in os.listdir(csv_result_dir):
            if not file.endswith(filename_suffix):
                continue
            csv_file_name = "{}/{}".format(csv_result_dir, file)
            print("Processing file {}/{}".format(csv_result_dir, file))
            corr_table = pd.read_csv(csv_file_name)
            # Pass the axes by keyword: pandas 2.x no longer accepts them
            # positionally, and keywords also work on older releases.
            corr_table = corr_table.pivot(
                index="label", columns="corr", values="value"
            )
            fig = plt.figure()
            sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2")
            plt.title("{} filename:{}".format(title, file))
            pdf.savefig(fig)
            # Release the figure once its page is written so open figures do
            # not accumulate across files.
            plt.close(fig)
def plot_timeline(csv_result_dir, output_result_dir):
plot_line_charts(
csv_result_dir,
output_result_dir,
filename_prefix="",
filename_suffix="access_timeline",
pdf_name="access_time.pdf",
xlabel="Time",
@ -253,6 +340,109 @@ def plot_access_timeline(csv_result_dir, output_result_dir):
)
def convert_to_0_if_nan(n):
    """Return 0.0 when ``n`` is NaN; otherwise hand ``n`` back unchanged."""
    return 0.0 if math.isnan(n) else n
def plot_correlation(csv_result_dir, output_result_dir):
    """Compute Spearman correlations per input file and plot them as heatmaps.

    For every file in ``csv_result_dir`` whose name ends with
    ``correlation_input``, six Spearman correlations between "history"
    columns and "future" columns are computed with pandas. NaN results are
    written as 0.0. The rows are appended to
    ``<csv_result_dir>/<first "_"-separated token of the file name>_correlation_output``
    in ``label,corr,value`` form, where ``label`` is the part of the file name
    between that first token and the ``_correlation_input`` suffix.

    Once all inputs are processed the output files are closed and
    ``plot_heatmap`` renders them into ``correlation.pdf`` inside
    ``output_result_dir``.
    """
    # (heatmap column tag, history column, future column), in output order.
    correlation_pairs = (
        ("LA+A", "num_accesses_since_last_access", "num_accesses_till_next_access"),
        ("PA+A", "num_past_accesses", "num_accesses_till_next_access"),
        ("LT+A", "elapsed_time_since_last_access", "num_accesses_till_next_access"),
        ("LA+T", "num_accesses_since_last_access", "elapsed_time_till_next_access"),
        ("LT+T", "elapsed_time_since_last_access", "elapsed_time_till_next_access"),
        ("PA+T", "num_past_accesses", "elapsed_time_till_next_access"),
    )
    # Processing the correlation input first.
    # Maps an output path to its open file handle.
    label_str_file = {}
    try:
        for file in os.listdir(csv_result_dir):
            if not file.endswith("correlation_input"):
                continue
            csv_file_name = "{}/{}".format(csv_result_dir, file)
            print("Processing file {}/{}".format(csv_result_dir, file))
            corr_table = pd.read_csv(csv_file_name)
            label_str = file.split("_")[0]
            label = file[len(label_str) + 1 :]
            label = label[: len(label) - len("_correlation_input")]
            output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str)
            if output_file not in label_str_file:
                # First input for this output file: create it and write the header.
                f = open(output_file, "w+")
                label_str_file[output_file] = f
                f.write("label,corr,value\n")
            f = label_str_file[output_file]
            for tag, history_column, future_column in correlation_pairs:
                correlation = corr_table[history_column].corr(
                    corr_table[future_column], method="spearman"
                )
                f.write(
                    "{},{},{}\n".format(label, tag, convert_to_0_if_nan(correlation))
                )
    finally:
        # Close (and flush) every output file, even if an input was malformed.
        for f in label_str_file.values():
            f.close()

    plot_heatmap(
        csv_result_dir,
        output_result_dir,
        "correlation_output",
        "correlation.pdf",
        "Correlation",
    )
def plot_reuse_graphs(csv_result_dir, output_result_dir):
plot_stacked_bar_charts(
csv_result_dir,
@ -301,6 +491,7 @@ def plot_reuse_graphs(csv_result_dir, output_result_dir):
plot_line_charts(
csv_result_dir,
output_result_dir,
filename_prefix="",
filename_suffix="reuse_blocks_timeline",
pdf_name="reuse_blocks_timeline.pdf",
xlabel="",
@ -370,14 +561,90 @@ def plot_access_count_summary(csv_result_dir, output_result_dir):
vertical=True,
x_prefix="< ",
)
plot_line_charts(
csv_result_dir,
output_result_dir,
filename_prefix="",
filename_suffix="skewness",
pdf_name="skew.pdf",
xlabel="",
ylabel="Percentage of accesses",
title="Skewness",
vertical=True,
legend=False,
)
def plot_miss_ratio_timeline(csv_result_dir, output_result_dir):
    """Plot the hourly miss and policy-selection timelines as line charts.

    Calls ``plot_line_charts`` once per chart below. Each call picks up the
    files in ``csv_result_dir`` whose names end with the given suffix and
    writes the named PDF into ``output_result_dir``. Each chart is generated
    exactly once.
    """
    # (filename_suffix, pdf_name, ylabel, title)
    charts = (
        (
            "3600_miss_ratio_timeline",
            "miss_ratio_timeline.pdf",
            "Miss Ratio (%)",
            "Miss ratio timeline",
        ),
        (
            "3600_miss_timeline",
            "miss_timeline.pdf",
            "# of misses ",
            "Miss timeline",
        ),
        (
            "3600_policy_timeline",
            "policy_timeline.pdf",
            "# of times a policy is selected ",
            "Policy timeline",
        ),
        (
            "3600_policy_ratio_timeline",
            "policy_ratio_timeline.pdf",
            "Percentage of times a policy is selected ",
            "Policy timeline",
        ),
    )
    for filename_suffix, pdf_name, ylabel, title in charts:
        plot_line_charts(
            csv_result_dir,
            output_result_dir,
            filename_prefix="",
            filename_suffix=filename_suffix,
            pdf_name=pdf_name,
            xlabel="Time",
            ylabel=ylabel,
            title=title,
            vertical=False,
            legend=True,
        )
if __name__ == "__main__":
if len(sys.argv) < 3:
print(
"Must provide two arguments: 1) The directory that saves a list of "
"directories which contain block cache trace analyzer result files "
"2) the directory to save plotted graphs."
"Must provide two arguments: \n"
"1) The directory that saves a list of "
"directories which contain block cache trace analyzer result files. \n"
"2) the directory to save plotted graphs. \n"
)
exit(1)
csv_result_dir = sys.argv[1]
@ -396,8 +663,59 @@ if __name__ == "__main__":
print("Processing experiment dir: {}".format(csv_relative_dir))
if not os.path.exists(result_dir):
os.makedirs(result_dir)
plot_miss_ratio_graphs(csv_abs_dir, result_dir)
plot_access_timeline(csv_abs_dir, result_dir)
plot_access_count_summary(csv_abs_dir, result_dir)
plot_timeline(csv_abs_dir, result_dir)
plot_miss_ratio_timeline(csv_result_dir, output_result_dir)
plot_correlation(csv_abs_dir, result_dir)
plot_reuse_graphs(csv_abs_dir, result_dir)
plot_percentage_access_summary(csv_abs_dir, result_dir)
plot_access_count_summary(csv_abs_dir, result_dir)
plot_miss_stats_graphs(
csv_abs_dir,
result_dir,
file_prefix="",
file_suffix="mrc",
ylabel="Miss ratio (%)",
pdf_file_name="mrc",
)
plot_miss_stats_diff_lru_graphs(
csv_abs_dir,
result_dir,
file_prefix="",
file_suffix="mrc",
ylabel="Miss ratio (%)",
pdf_file_name="mrc_diff_lru",
)
# The following stats are only available in pysim.
for time_unit in ["1", "60", "3600"]:
plot_miss_stats_graphs(
csv_abs_dir,
result_dir,
file_prefix="ml_{}_".format(time_unit),
file_suffix="p95mb",
ylabel="p95 number of byte miss per {} seconds".format(time_unit),
pdf_file_name="p95mb_per{}_seconds".format(time_unit),
)
plot_miss_stats_graphs(
csv_abs_dir,
result_dir,
file_prefix="ml_{}_".format(time_unit),
file_suffix="avgmb",
ylabel="Average number of byte miss per {} seconds".format(time_unit),
pdf_file_name="avgmb_per{}_seconds".format(time_unit),
)
plot_miss_stats_diff_lru_graphs(
csv_abs_dir,
result_dir,
file_prefix="ml_{}_".format(time_unit),
file_suffix="p95mb",
ylabel="p95 number of byte miss per {} seconds".format(time_unit),
pdf_file_name="p95mb_per{}_seconds_diff_lru".format(time_unit),
)
plot_miss_stats_diff_lru_graphs(
csv_abs_dir,
result_dir,
file_prefix="ml_{}_".format(time_unit),
file_suffix="avgmb",
ylabel="Average number of byte miss per {} seconds".format(time_unit),
pdf_file_name="avgmb_per{}_seconds_diff_lru".format(time_unit),
)