70c7302fb5
Summary: This PR implements cache eviction using reinforcement learning. It includes two implementations: 1. An implementation of Thompson Sampling for the Bernoulli Bandit [1]. 2. An implementation of LinUCB with disjoint linear models [2]. The idea is that a cache uses multiple eviction policies, e.g., MRU, LRU, and LFU. The cache learns which eviction policy is the best and uses it upon a cache miss. Thompson Sampling is contextless and does not include any features. LinUCB includes features such as level, block type, caller, column family id to decide which eviction policy to use. [1] Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found. Trends Mach. Learn. 11, 1 (July 2018), 1-96. DOI: https://doi.org/10.1561/2200000070 [2] Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. A contextual-bandit approach to personalized news article recommendation. In Proceedings of the 19th international conference on World wide web (WWW '10). ACM, New York, NY, USA, 661-670. DOI=http://dx.doi.org/10.1145/1772690.1772758 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5610 Differential Revision: D16435067 Pulled By: HaoyuHuang fbshipit-source-id: 6549239ae14115c01cb1e70548af9e46d8dc21bb
404 lines
12 KiB
Python
404 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
import csv
|
|
import os
|
|
import random
|
|
import sys
|
|
|
|
import matplotlib.backends.backend_pdf
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
|
|
|
|
# Make sure a legend has the same color across all generated graphs.
|
|
def get_cmap(n, name="hsv"):
    """Return a colormap that maps each index in 0, 1, ..., n-1 to a distinct
    color.

    Args:
        n: number of discrete entries the colormap is resampled to.
        name: a standard matplotlib colormap name.

    Returns:
        The matplotlib colormap object; call it with an integer index to get
        the color assigned to that index.
    """
    # matplotlib.cm.get_cmap (reached here as plt.cm.get_cmap) was deprecated
    # in matplotlib 3.7 and removed in 3.9. pyplot.get_cmap accepts the same
    # (name, lut) arguments and is available in both old and new releases.
    return plt.get_cmap(name, n)
|
|
|
|
|
|
# Module-level color state shared by all stacked bar charts, so that a label
# that already received a color keeps it in every later graph.
n_colors = 60
linear_colors = get_cmap(n_colors)
colors = [linear_colors(index) for index in range(n_colors)]
# Shuffle the colors so that adjacent bars in a graph are obvious to differentiate.
random.shuffle(colors)
# Position of the next unassigned entry in `colors`.
color_index = 0
# Maps a legend label to the color that was assigned to it.
bar_color_maps = {}
|
|
|
|
|
|
def num_to_gb(n):
    """Format a byte count as a string expressed in gigabytes (1024^3 bytes).

    Args:
        n: the number of bytes; any value accepted by float().

    Returns:
        The whole number of gigabytes without a fractional part (e.g. "2")
        when n is an exact multiple of one gigabyte, otherwise the value
        formatted with two decimal places (e.g. "0.50").
    """
    one_gb = 1024 * 1024 * 1024
    whole_gb, remainder = divmod(float(n), one_gb)
    if remainder == 0:
        # "/" is true division in Python 3, so formatting n / one_gb would
        # render "1.0" rather than "1"; format the integral quotient instead.
        return "{}".format(int(whole_gb))
    # Keep two decimal points.
    return "{0:.2f}".format(float(n) / one_gb)
|
|
|
|
|
|
def plot_miss_ratio_graphs(csv_result_dir, output_result_dir):
    """Plot the miss ratios recorded in the "mrc" file of csv_result_dir and
    save the graph as "mrc.pdf" in output_result_dir.

    The first row of the file is treated as a header and skipped. Every other
    row is read as cache_name, num_shard_bits, ghost_capacity, capacity,
    miss_ratio. Rows that share the same cache_name, num_shard_bits and
    ghost_capacity are drawn as one line, with the capacity (converted by
    num_to_gb) on the x axis and the miss ratio on the y axis.

    Nothing is done when the "mrc" file does not exist.

    Args:
        csv_result_dir: directory that contains the "mrc" csv file.
        output_result_dir: directory that "mrc.pdf" is written to.
    """
    mrc_file_path = os.path.join(csv_result_dir, "mrc")
    if not os.path.exists(mrc_file_path):
        return
    miss_ratios = {}
    print("Processing file {}".format(mrc_file_path))
    with open(mrc_file_path, "r") as csvfile:
        rows = csv.reader(csvfile, delimiter=",")
        # Skip the header row.
        next(rows, None)
        for row in rows:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            cache_name = row[0]
            num_shard_bits = int(row[1])
            ghost_capacity = int(row[2])
            capacity = int(row[3])
            miss_ratio = float(row[4])
            config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
            curve = miss_ratios.setdefault(config, {"x": [], "y": []})
            curve["x"].append(num_to_gb(capacity))
            curve["y"].append(miss_ratio)
    fig = plt.figure()
    try:
        for config, curve in miss_ratios.items():
            plt.plot(curve["x"], curve["y"], label=config)
        plt.xlabel("Cache capacity (GB)")
        plt.ylabel("Miss Ratio (%)")
        # plt.xscale('log', basex=2)
        plt.ylim(ymin=0)
        plt.title("RocksDB block cache miss ratios")
        plt.legend()
        fig.savefig(os.path.join(output_result_dir, "mrc.pdf"), bbox_inches="tight")
    finally:
        # pyplot keeps a reference to every figure until it is closed; this
        # function runs once per experiment directory, so release it here.
        plt.close(fig)
|
|
|
|
|
|
def sanitize(label):
    """Turn a raw csv label into text that is safe to show in a graph.

    Leading underscores are removed, and a label that contains the decimal
    value of uint64_max is replaced by "max".

    Args:
        label: the raw label string.

    Returns:
        The cleaned label string.
    """
    # matplotlib cannot plot legends that are prefixed with "_", so the
    # leading underscores are dropped here.
    stripped = label.lstrip("_")
    # The value of uint64_max in c++.
    return "max" if "18446744073709551615" in stripped else stripped
|
|
|
|
|
|
# Read the csv file vertically, i.e., group the data by columns.
|
|
def read_data_for_plot_vertical(csvfile):
    """Read a csv file and group its data by columns.

    The first row is the header: every cell after the first one is a label.
    In each following row the first cell is an x value and the remaining
    cells are the numbers of the labels at the same column position.

    Args:
        csvfile: an iterable of csv lines, e.g. an open text file.

    Returns:
        A tuple (x, labels, label_stats). x is the list of sanitized x
        values, labels is the list of sanitized labels, and label_stats maps
        the 0-based index of a label to the list of its floats, one per data
        row. An input without any non-blank row yields ([], [], {}).
    """
    # Blank lines carry no data; csv.reader reports them as empty lists.
    data_rows = [row for row in csv.reader(csvfile, delimiter=",") if row]
    if not data_rows:
        # Callers skip files that produce no x values, so report "no data"
        # instead of failing on the missing header row.
        return [], [], {}
    header, body = data_rows[0], data_rows[1:]
    labels = [sanitize(name) for name in header[1:]]
    label_stats = {index: [] for index in range(len(labels))}
    x = []
    for row in body:
        x.append(sanitize(row[0]))
        for index, value in enumerate(row[1:]):
            label_stats[index].append(float(value))
    return x, labels, label_stats
|
|
|
|
|
|
# Read the csv file horizontally, i.e., group the data by rows.
|
|
def read_data_for_plot_horizontal(csvfile):
    """Read a csv file and group its data by rows.

    The first row is the header: every cell after the first one is an x
    value. In each following row the first cell is a label and the remaining
    cells are the numbers of that label.

    Args:
        csvfile: an iterable of csv lines, e.g. an open text file.

    Returns:
        A tuple (x, labels, label_stats). x is the list of sanitized x
        values, labels is the list of sanitized labels, and label_stats maps
        the 0-based index of a label to the list of its floats. An input
        without any non-blank row yields ([], [], {}).
    """
    # Blank lines carry no data; csv.reader reports them as empty lists,
    # which have no label cell to read.
    data_rows = [row for row in csv.reader(csvfile, delimiter=",") if row]
    if not data_rows:
        # Callers skip files that produce no x values, so report "no data"
        # instead of failing on the missing header row.
        return [], [], {}
    header, body = data_rows[0], data_rows[1:]
    x = [sanitize(name) for name in header[1:]]
    labels = []
    label_stats = {}
    for index, row in enumerate(body):
        labels.append(sanitize(row[0]))
        label_stats[index] = [float(value) for value in row[1:]]
    return x, labels, label_stats
|
|
|
|
|
|
def read_data_for_plot(csvfile, vertical):
    """Read plot data from csvfile with the reader selected by `vertical`.

    Args:
        csvfile: the csv input handed to the selected reader.
        vertical: when true the data is grouped by columns
            (read_data_for_plot_vertical), otherwise by rows
            (read_data_for_plot_horizontal).

    Returns:
        Whatever the selected reader returns: (x, labels, label_stats).
    """
    reader = read_data_for_plot_vertical if vertical else read_data_for_plot_horizontal
    return reader(csvfile)
|
|
|
|
|
|
def plot_line_charts(
    csv_result_dir,
    output_result_dir,
    filename_suffix,
    pdf_name,
    xlabel,
    ylabel,
    title,
    vertical,
    legend,
):
    """Draw one line chart per matching csv file and collect them in one pdf.

    Every file in csv_result_dir whose name ends with filename_suffix is read
    with read_data_for_plot; files that yield no x values or no labels are
    skipped. Each remaining file becomes one page with one line per label.

    Args:
        csv_result_dir: directory that is scanned for csv files.
        output_result_dir: directory the pdf is written to.
        filename_suffix: only files whose name ends with this are plotted.
        pdf_name: file name of the generated pdf.
        xlabel: x axis text. It is only applied, with " (Minute)" or
            " (Hour)" appended, when the file name contains "_60" or "_3600".
        ylabel: y axis text.
        title: prefix of the page title; the file name is appended.
        vertical: passed to read_data_for_plot to select the csv layout.
        legend: whether to draw a legend.
    """
    pdf_path = os.path.join(output_result_dir, pdf_name)
    # The context manager closes the pdf even when a file fails to plot.
    with matplotlib.backends.backend_pdf.PdfPages(pdf_path) as pdf:
        for file_name in os.listdir(csv_result_dir):
            if not file_name.endswith(filename_suffix):
                continue
            print("Processing file {}".format(file_name))
            with open(os.path.join(csv_result_dir, file_name), "r") as csvfile:
                x, labels, label_stats = read_data_for_plot(csvfile, vertical)
            if len(x) == 0 or len(labels) == 0:
                continue
            # The x values are the same for every label; convert them once.
            x_values = [int(value) for value in x]
            # plot figure
            fig = plt.figure()
            try:
                for label_index in label_stats:
                    plt.plot(
                        x_values,
                        label_stats[label_index],
                        label=labels[label_index],
                    )

                # Translate time unit into x labels.
                if "_60" in file_name:
                    plt.xlabel("{} (Minute)".format(xlabel))
                if "_3600" in file_name:
                    plt.xlabel("{} (Hour)".format(xlabel))
                plt.ylabel(ylabel)
                plt.title("{} {}".format(title, file_name))
                if legend:
                    plt.legend()
                pdf.savefig(fig)
            finally:
                # pyplot keeps every figure alive until it is closed; one is
                # created per file, so release it once the page is saved.
                plt.close(fig)
|
|
|
|
|
|
def plot_stacked_bar_charts(
    csv_result_dir,
    output_result_dir,
    filename_suffix,
    pdf_name,
    xlabel,
    ylabel,
    title,
    vertical,
    x_prefix,
):
    """Draw one stacked bar chart per matching csv file into a single pdf.

    Every file in csv_result_dir whose name ends with filename_suffix is read
    with read_data_for_plot; files that yield no x values or no data series
    are skipped. Each remaining file becomes one page where the series of all
    labels are stacked on top of each other at every x position.

    A label gets its color from the module-level `colors` list the first time
    it is seen; the assignment is remembered in `bar_color_maps` so the label
    keeps that color in later charts.

    Args:
        csv_result_dir: directory that is scanned for csv files.
        output_result_dir: directory the pdf is written to.
        filename_suffix: only files whose name ends with this are plotted.
        pdf_name: file name of the generated pdf.
        xlabel: x axis text.
        ylabel: y axis text.
        title: prefix of the page title; the file name is appended.
        vertical: passed to read_data_for_plot to select the csv layout.
        x_prefix: text put in front of every x tick label.
    """
    # Only color_index is rebound here; bar_color_maps is mutated in place
    # and colors is only read.
    global color_index
    pdf_path = os.path.join(output_result_dir, pdf_name)
    # The context manager closes the pdf even when a file fails to plot.
    with matplotlib.backends.backend_pdf.PdfPages(pdf_path) as pdf:
        for file_name in os.listdir(csv_result_dir):
            if not file_name.endswith(filename_suffix):
                continue
            with open(os.path.join(csv_result_dir, file_name), "r") as csvfile:
                print("Processing file {}/{}".format(csv_result_dir, file_name))
                x, labels, label_stats = read_data_for_plot(csvfile, vertical)
            if len(x) == 0 or len(label_stats) == 0:
                continue
            # Plot figure
            fig = plt.figure()
            try:
                ind = np.arange(len(x))  # the x locations for the groups
                width = 0.5  # the width of the bars: can also be len(x) sequence
                bars = []
                # Running height of the stack at every x position.
                bottom_bars = [0] * len(label_stats[0])
                for i in range(len(label_stats)):
                    label = labels[i]
                    # Assign a unique color to this label.
                    if label not in bar_color_maps:
                        # Wrap around instead of raising IndexError once more
                        # labels were seen than colors are available.
                        bar_color_maps[label] = colors[color_index % len(colors)]
                        color_index += 1
                    p = plt.bar(
                        ind,
                        label_stats[i],
                        width,
                        bottom=bottom_bars,
                        color=bar_color_maps[label],
                    )
                    bars.append(p[0])
                    for j, value in enumerate(label_stats[i]):
                        bottom_bars[j] += value
                plt.xlabel(xlabel)
                plt.ylabel(ylabel)
                plt.xticks(
                    ind, [x_prefix + value for value in x], rotation=20, fontsize=8
                )
                plt.legend(bars, labels)
                plt.title("{} filename:{}".format(title, file_name))
                pdf.savefig(fig)
            finally:
                # pyplot keeps every figure alive until it is closed; one is
                # created per file, so release it once the page is saved.
                plt.close(fig)
|
|
|
|
|
|
def plot_access_timeline(csv_result_dir, output_result_dir):
    """Plot the files ending with "access_timeline" as line charts.

    The charts are written to "access_time.pdf" in output_result_dir.

    Args:
        csv_result_dir: directory that contains the csv files.
        output_result_dir: directory the pdf is written to.
    """
    chart_options = {
        "filename_suffix": "access_timeline",
        "pdf_name": "access_time.pdf",
        "xlabel": "Time",
        "ylabel": "Throughput",
        "title": "Access timeline with group by label",
        "vertical": False,
        "legend": True,
    }
    plot_line_charts(csv_result_dir, output_result_dir, **chart_options)
|
|
|
|
|
|
def plot_reuse_graphs(csv_result_dir, output_result_dir):
    """Plot the reuse related csv files of csv_result_dir.

    Four groups of files are drawn as stacked bar charts (average reuse
    interval by accesses and by blocks, reuse interval, reuse lifetime),
    followed by the reuse blocks timeline as line charts. Every group is
    written to its own pdf in output_result_dir.

    Args:
        csv_result_dir: directory that contains the csv files.
        output_result_dir: directory the pdfs are written to.
    """
    # (filename_suffix, pdf_name, xlabel, ylabel, title), in plotting order.
    stacked_bar_specs = (
        (
            "avg_reuse_interval_naccesses",
            "avg_reuse_interval_naccesses.pdf",
            "",
            "Percentage of accesses",
            "Average reuse interval",
        ),
        (
            "avg_reuse_interval",
            "avg_reuse_interval.pdf",
            "",
            "Percentage of blocks",
            "Average reuse interval",
        ),
        (
            "access_reuse_interval",
            "reuse_interval.pdf",
            "Seconds",
            "Percentage of accesses",
            "Reuse interval",
        ),
        (
            "reuse_lifetime",
            "reuse_lifetime.pdf",
            "Seconds",
            "Percentage of blocks",
            "Reuse lifetime",
        ),
    )
    for suffix, pdf_file, x_text, y_text, chart_title in stacked_bar_specs:
        plot_stacked_bar_charts(
            csv_result_dir,
            output_result_dir,
            filename_suffix=suffix,
            pdf_name=pdf_file,
            xlabel=x_text,
            ylabel=y_text,
            title=chart_title,
            vertical=True,
            x_prefix="< ",
        )
    timeline_options = {
        "filename_suffix": "reuse_blocks_timeline",
        "pdf_name": "reuse_blocks_timeline.pdf",
        "xlabel": "",
        "ylabel": "Percentage of blocks",
        "title": "Reuse blocks timeline",
        "vertical": False,
        "legend": False,
    }
    plot_line_charts(csv_result_dir, output_result_dir, **timeline_options)
|
|
|
|
|
|
def plot_percentage_access_summary(csv_result_dir, output_result_dir):
    """Plot the percentage summary csv files as stacked bar charts.

    Four groups of files are plotted, each into its own pdf in
    output_result_dir. All of them use an empty x label, an empty title and
    no x tick prefix.

    Args:
        csv_result_dir: directory that contains the csv files.
        output_result_dir: directory the pdfs are written to.
    """
    # (filename_suffix, pdf_name, ylabel), in plotting order.
    summary_specs = (
        (
            "percentage_of_accesses_summary",
            "percentage_access.pdf",
            "Percentage of accesses",
        ),
        ("percent_ref_keys", "percent_ref_keys.pdf", "Percentage of blocks"),
        (
            "percent_data_size_on_ref_keys",
            "percent_data_size_on_ref_keys.pdf",
            "Percentage of blocks",
        ),
        (
            "percent_accesses_on_ref_keys",
            "percent_accesses_on_ref_keys.pdf",
            "Percentage of blocks",
        ),
    )
    for suffix, pdf_file, y_text in summary_specs:
        plot_stacked_bar_charts(
            csv_result_dir,
            output_result_dir,
            filename_suffix=suffix,
            pdf_name=pdf_file,
            xlabel="",
            ylabel=y_text,
            title="",
            vertical=True,
            x_prefix="",
        )
|
|
|
|
|
|
def plot_access_count_summary(csv_result_dir, output_result_dir):
    """Plot the files ending with "access_count_summary" as stacked bar charts.

    The charts are written to "access_count_summary.pdf" in
    output_result_dir.

    Args:
        csv_result_dir: directory that contains the csv files.
        output_result_dir: directory the pdf is written to.
    """
    chart_options = {
        "filename_suffix": "access_count_summary",
        "pdf_name": "access_count_summary.pdf",
        "xlabel": "Access count",
        "ylabel": "Percentage of blocks",
        "title": "",
        "vertical": True,
        "x_prefix": "< ",
    }
    plot_stacked_bar_charts(csv_result_dir, output_result_dir, **chart_options)
|
|
|
|
|
|
# Script entry point: every sub-directory of the first argument is treated as
# one experiment, and its graphs are written to a sub-directory of the same
# name below the second argument.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print(
            "Must provide two arguments: 1) The directory that saves a list of "
            "directories which contain block cache trace analyzer result files "
            "2) the directory to save plotted graphs."
        )
        # The bare exit() builtin is a helper installed by the site module
        # for interactive use; sys.exit is the call meant for programs.
        sys.exit(1)
    csv_result_dir = sys.argv[1]
    output_result_dir = sys.argv[2]
    print(
        "Processing directory {} and save graphs to {}.".format(
            csv_result_dir, output_result_dir
        )
    )
    for csv_relative_dir in os.listdir(csv_result_dir):
        csv_abs_dir = os.path.join(csv_result_dir, csv_relative_dir)
        result_dir = os.path.join(output_result_dir, csv_relative_dir)
        if not os.path.isdir(csv_abs_dir):
            print("{} is not a directory".format(csv_abs_dir))
            continue
        print("Processing experiment dir: {}".format(csv_relative_dir))
        # exist_ok avoids the check-then-create race of testing for the
        # directory first.
        os.makedirs(result_dir, exist_ok=True)
        plot_miss_ratio_graphs(csv_abs_dir, result_dir)
        plot_access_timeline(csv_abs_dir, result_dir)
        plot_reuse_graphs(csv_abs_dir, result_dir)
        plot_percentage_access_summary(csv_abs_dir, result_dir)
        plot_access_count_summary(csv_abs_dir, result_dir)
|