diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py
index 22d56b932..0fdaa4158 100644
--- a/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py
+++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py
@@ -1,12 +1,17 @@
 #!/usr/bin/env python3
 import csv
+import math
 import os
 import random
 import sys

+import matplotlib
+matplotlib.use("Agg")
 import matplotlib.backends.backend_pdf
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
+import seaborn as sns


 # Make sure a legend has the same color across all generated graphs.
@@ -19,7 +24,7 @@ def get_cmap(n, name="hsv"):
 color_index = 0
 bar_color_maps = {}
 colors = []
-n_colors = 60
+n_colors = 360
 linear_colors = get_cmap(n_colors)
 for i in range(n_colors):
     colors.append(linear_colors(i))
@@ -35,41 +40,95 @@ def num_to_gb(n):
     return "{0:.2f}".format(float(n) / one_gb)


-def plot_miss_ratio_graphs(csv_result_dir, output_result_dir):
-    mrc_file_path = csv_result_dir + "/mrc"
-    if not os.path.exists(mrc_file_path):
-        return
+def plot_miss_stats_graphs(
+    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
+):
     miss_ratios = {}
-    print("Processing file {}".format(mrc_file_path))
-    with open(mrc_file_path, "r") as csvfile:
-        rows = csv.reader(csvfile, delimiter=",")
-        is_header = False
-        for row in rows:
-            if not is_header:
-                is_header = True
-                continue
-            cache_name = row[0]
-            num_shard_bits = int(row[1])
-            ghost_capacity = int(row[2])
-            capacity = int(row[3])
-            miss_ratio = float(row[4])
-            config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
-            if config not in miss_ratios:
-                miss_ratios[config] = {}
-                miss_ratios[config]["x"] = []
-                miss_ratios[config]["y"] = []
-            miss_ratios[config]["x"].append(num_to_gb(capacity))
-            miss_ratios[config]["y"].append(miss_ratio)
+    for file in os.listdir(csv_result_dir):
+        if not file.startswith(file_prefix):
+            continue
+        if not file.endswith(file_suffix):
+            continue
+        print("Processing file {}/{}".format(csv_result_dir, file))
+        mrc_file_path = csv_result_dir + "/" + file
+        with open(mrc_file_path, "r") as csvfile:
+            rows = csv.reader(csvfile, delimiter=",")
+            for row in rows:
+                cache_name = row[0]
+                num_shard_bits = int(row[1])
+                ghost_capacity = int(row[2])
+                capacity = int(row[3])
+                miss_ratio = float(row[4])
+                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
+                if config not in miss_ratios:
+                    miss_ratios[config] = {}
+                    miss_ratios[config]["x"] = []
+                    miss_ratios[config]["y"] = []
+                miss_ratios[config]["x"].append(capacity)
+                miss_ratios[config]["y"].append(miss_ratio)
+    fig = plt.figure()
+    for config in miss_ratios:
+        plt.plot(
+            miss_ratios[config]["x"], miss_ratios[config]["y"], label=config
+        )
+    plt.xlabel("Cache capacity")
+    plt.ylabel(ylabel)
+    plt.xscale("log", basex=2)
+    plt.ylim(ymin=0)
+    plt.title("{}".format(file))
+    plt.legend()
+    fig.savefig(
+        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
+    )
+
+
+def plot_miss_stats_diff_lru_graphs(
+    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
+):
+    miss_ratios = {}
+    for file in os.listdir(csv_result_dir):
+        if not file.startswith(file_prefix):
+            continue
+        if not file.endswith(file_suffix):
+            continue
+        print("Processing file {}/{}".format(csv_result_dir, file))
+        mrc_file_path = csv_result_dir + "/" + file
+        with open(mrc_file_path, "r") as csvfile:
+            rows = csv.reader(csvfile, delimiter=",")
+            for row in rows:
+                cache_name = row[0]
+                num_shard_bits = int(row[1])
+                ghost_capacity = int(row[2])
+                capacity = int(row[3])
+                miss_ratio = float(row[4])
+                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
+                if config not in miss_ratios:
+                    miss_ratios[config] = {}
+                    miss_ratios[config]["x"] = []
+                    miss_ratios[config]["y"] = []
+                miss_ratios[config]["x"].append(capacity)
+                miss_ratios[config]["y"].append(miss_ratio)
+    if "lru-0-0" not in miss_ratios:
+        return
     fig = plt.figure()
     for config in miss_ratios:
-        plt.plot(miss_ratios[config]["x"], miss_ratios[config]["y"], label=config)
-    plt.xlabel("Cache capacity (GB)")
-    plt.ylabel("Miss Ratio (%)")
-    # plt.xscale('log', basex=2)
-    plt.ylim(ymin=0)
-    plt.title("RocksDB block cache miss ratios")
+        diffs = [0] * len(miss_ratios["lru-0-0"]["x"])
+        for i in range(len(miss_ratios["lru-0-0"]["x"])):
+            for j in range(len(miss_ratios[config]["x"])):
+                if miss_ratios["lru-0-0"]["x"][i] == miss_ratios[config]["x"][j]:
+                    diffs[i] = (
+                        miss_ratios[config]["y"][j] - miss_ratios["lru-0-0"]["y"][i]
+                    )
+                    break
+        plt.plot(miss_ratios["lru-0-0"]["x"], diffs, label=config)
+    plt.xlabel("Cache capacity")
+    plt.ylabel(ylabel)
+    plt.xscale("log", basex=2)
+    plt.title("{}".format(file))
     plt.legend()
-    fig.savefig(output_result_dir + "/mrc.pdf", bbox_inches="tight")
+    fig.savefig(
+        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
+    )


 def sanitize(label):
@@ -143,6 +202,7 @@ def read_data_for_plot(csvfile, vertical):
 def plot_line_charts(
     csv_result_dir,
     output_result_dir,
+    filename_prefix,
     filename_suffix,
     pdf_name,
     xlabel,
@@ -151,11 +211,14 @@ def plot_line_charts(
     vertical,
     legend,
 ):
+    global color_index, bar_color_maps, colors
     pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name)
     for file in os.listdir(csv_result_dir):
         if not file.endswith(filename_suffix):
             continue
-        print("Processing file {}".format(file))
+        if not file.startswith(filename_prefix):
+            continue
+        print("Processing file {}/{}".format(csv_result_dir, file))
         with open(csv_result_dir + "/" + file, "r") as csvfile:
             x, labels, label_stats = read_data_for_plot(csvfile, vertical)
             if len(x) == 0 or len(labels) == 0:
@@ -163,10 +226,15 @@
             # plot figure
             fig = plt.figure()
             for label_index in label_stats:
+                # Assign a unique color to this label.
+                if labels[label_index] not in bar_color_maps:
+                    bar_color_maps[labels[label_index]] = colors[color_index]
+                    color_index += 1
                 plt.plot(
-                    [int(x[i]) for i in range(len(x))],
-                    label_stats[label_index],
+                    [int(x[i]) for i in range(len(x) - 1)],
+                    label_stats[label_index][:-1],
                     label=labels[label_index],
+                    color=bar_color_maps[labels[label_index]],
                 )

             # Translate time unit into x labels.
@@ -239,10 +307,29 @@ def plot_stacked_bar_charts(
     pdf.close()


-def plot_access_timeline(csv_result_dir, output_result_dir):
+def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title):
+    pdf = matplotlib.backends.backend_pdf.PdfPages(
+        "{}/{}".format(output_result_dir, pdf_name)
+    )
+    for file in os.listdir(csv_result_dir):
+        if not file.endswith(filename_suffix):
+            continue
+        csv_file_name = "{}/{}".format(csv_result_dir, file)
+        print("Processing file {}/{}".format(csv_result_dir, file))
+        corr_table = pd.read_csv(csv_file_name)
+        corr_table = corr_table.pivot("label", "corr", "value")
+        fig = plt.figure()
+        sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2")
+        plt.title("{} filename:{}".format(title, file))
+        pdf.savefig(fig)
+    pdf.close()
+
+
+def plot_timeline(csv_result_dir, output_result_dir):
     plot_line_charts(
         csv_result_dir,
         output_result_dir,
+        filename_prefix="",
         filename_suffix="access_timeline",
         pdf_name="access_time.pdf",
         xlabel="Time",
@@ -253,6 +340,109 @@
     )


+def convert_to_0_if_nan(n):
+    if math.isnan(n):
+        return 0.0
+    return n
+
+
+def plot_correlation(csv_result_dir, output_result_dir):
+    # Process the correlation input files first.
+    label_str_file = {}
+    for file in os.listdir(csv_result_dir):
+        if not file.endswith("correlation_input"):
+            continue
+        csv_file_name = "{}/{}".format(csv_result_dir, file)
+        print("Processing file {}/{}".format(csv_result_dir, file))
+        corr_table = pd.read_csv(csv_file_name)
+        label_str = file.split("_")[0]
+        label = file[len(label_str) + 1 :]
+        label = label[: len(label) - len("_correlation_input")]
+
+        output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str)
+        if output_file not in label_str_file:
+            f = open(output_file, "w+")
+            label_str_file[output_file] = f
+            f.write("label,corr,value\n")
+        f = label_str_file[output_file]
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "LA+A",
+                convert_to_0_if_nan(
+                    corr_table["num_accesses_since_last_access"].corr(
+                        corr_table["num_accesses_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "PA+A",
+                convert_to_0_if_nan(
+                    corr_table["num_past_accesses"].corr(
+                        corr_table["num_accesses_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "LT+A",
+                convert_to_0_if_nan(
+                    corr_table["elapsed_time_since_last_access"].corr(
+                        corr_table["num_accesses_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "LA+T",
+                convert_to_0_if_nan(
+                    corr_table["num_accesses_since_last_access"].corr(
+                        corr_table["elapsed_time_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "LT+T",
+                convert_to_0_if_nan(
+                    corr_table["elapsed_time_since_last_access"].corr(
+                        corr_table["elapsed_time_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "PA+T",
+                convert_to_0_if_nan(
+                    corr_table["num_past_accesses"].corr(
+                        corr_table["elapsed_time_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+    for label_str in label_str_file:
+        label_str_file[label_str].close()
+
+    plot_heatmap(
+        csv_result_dir,
+        output_result_dir,
+        "correlation_output",
+        "correlation.pdf",
+        "Correlation",
+    )
+
+
 def plot_reuse_graphs(csv_result_dir, output_result_dir):
     plot_stacked_bar_charts(
         csv_result_dir,
@@ -301,6 +491,7 @@ def plot_reuse_graphs(csv_result_dir, output_result_dir):
     plot_line_charts(
         csv_result_dir,
         output_result_dir,
+        filename_prefix="",
         filename_suffix="reuse_blocks_timeline",
         pdf_name="reuse_blocks_timeline.pdf",
         xlabel="",
@@ -370,14 +561,78 @@ def plot_access_count_summary(csv_result_dir, output_result_dir):
         vertical=True,
         x_prefix="< ",
     )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="skewness",
+        pdf_name="skew.pdf",
+        xlabel="",
+        ylabel="Percentage of accesses",
+        title="Skewness",
+        vertical=True,
+        legend=False,
+    )
+
+
+def plot_miss_ratio_timeline(csv_result_dir, output_result_dir):
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="3600_miss_ratio_timeline",
+        pdf_name="miss_ratio_timeline.pdf",
+        xlabel="Time",
+        ylabel="Miss Ratio (%)",
+        title="Miss ratio timeline",
+        vertical=False,
+        legend=True,
+    )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="3600_miss_timeline",
+        pdf_name="miss_timeline.pdf",
+        xlabel="Time",
+        ylabel="# of misses",
+        title="Miss timeline",
+        vertical=False,
+        legend=True,
+    )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="3600_policy_timeline",
+        pdf_name="policy_timeline.pdf",
+        xlabel="Time",
+        ylabel="# of times a policy is selected",
+        title="Policy timeline",
+        vertical=False,
+        legend=True,
+    )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="3600_policy_ratio_timeline",
+        pdf_name="policy_ratio_timeline.pdf",
+        xlabel="Time",
+        ylabel="Percentage of times a policy is selected",
+        title="Policy ratio timeline",
+        vertical=False,
+        legend=True,
+    )


 if __name__ == "__main__":
     if len(sys.argv) < 3:
         print(
-            "Must provide two arguments: 1) The directory that saves a list of "
-            "directories which contain block cache trace analyzer result files "
-            "2) the directory to save plotted graphs."
+            "Must provide two arguments:\n"
+            "1) The directory that contains a list of "
+            "directories, each holding block cache trace analyzer result files.\n"
+            "2) The directory to save the plotted graphs.\n"
         )
         exit(1)
     csv_result_dir = sys.argv[1]
@@ -396,8 +651,59 @@ if __name__ == "__main__":
         print("Processing experiment dir: {}".format(csv_relative_dir))
         if not os.path.exists(result_dir):
             os.makedirs(result_dir)
-        plot_miss_ratio_graphs(csv_abs_dir, result_dir)
-        plot_access_timeline(csv_abs_dir, result_dir)
+        plot_access_count_summary(csv_abs_dir, result_dir)
+        plot_timeline(csv_abs_dir, result_dir)
+        plot_miss_ratio_timeline(csv_abs_dir, result_dir)
+        plot_correlation(csv_abs_dir, result_dir)
         plot_reuse_graphs(csv_abs_dir, result_dir)
         plot_percentage_access_summary(csv_abs_dir, result_dir)
-        plot_access_count_summary(csv_abs_dir, result_dir)
+        plot_miss_stats_graphs(
+            csv_abs_dir,
+            result_dir,
+            file_prefix="",
+            file_suffix="mrc",
+            ylabel="Miss ratio (%)",
+            pdf_file_name="mrc",
+        )
+        plot_miss_stats_diff_lru_graphs(
+            csv_abs_dir,
+            result_dir,
+            file_prefix="",
+            file_suffix="mrc",
+            ylabel="Miss ratio (%)",
+            pdf_file_name="mrc_diff_lru",
+        )
+        # The following stats are only available in pysim.
+        for time_unit in ["1", "60", "3600"]:
+            plot_miss_stats_graphs(
+                csv_abs_dir,
+                result_dir,
+                file_prefix="ml_{}_".format(time_unit),
+                file_suffix="p95mb",
+                ylabel="p95 number of bytes missed per {} seconds".format(time_unit),
+                pdf_file_name="p95mb_per{}_seconds".format(time_unit),
+            )
+            plot_miss_stats_graphs(
+                csv_abs_dir,
+                result_dir,
+                file_prefix="ml_{}_".format(time_unit),
+                file_suffix="avgmb",
+                ylabel="Average number of bytes missed per {} seconds".format(time_unit),
+                pdf_file_name="avgmb_per{}_seconds".format(time_unit),
+            )
+            plot_miss_stats_diff_lru_graphs(
+                csv_abs_dir,
+                result_dir,
+                file_prefix="ml_{}_".format(time_unit),
+                file_suffix="p95mb",
+                ylabel="p95 number of bytes missed per {} seconds".format(time_unit),
+                pdf_file_name="p95mb_per{}_seconds_diff_lru".format(time_unit),
+            )
+            plot_miss_stats_diff_lru_graphs(
+                csv_abs_dir,
+                result_dir,
+                file_prefix="ml_{}_".format(time_unit),
+                file_suffix="avgmb",
+                ylabel="Average number of bytes missed per {} seconds".format(time_unit),
+                pdf_file_name="avgmb_per{}_seconds_diff_lru".format(time_unit),
+            )
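
A note for reviewers, not part of the patch: the capacity matching in
plot_miss_stats_diff_lru_graphs is quadratic in the number of measured
capacities. The sketch below shows the same diff-vs-LRU computation with a
dict keyed by capacity; the config names and numbers are made up for
illustration.

# Hypothetical standalone sketch; mirrors the miss_ratios dict that
# plot_miss_stats_diff_lru_graphs builds ("x" holds capacities, "y" miss ratios).
miss_ratios = {
    "lru-0-0": {"x": [1024, 2048, 4096], "y": [40.0, 30.0, 20.0]},  # made-up data
    "lru-1-4096": {"x": [1024, 4096], "y": [38.5, 18.0]},  # made-up data
}
baseline = dict(zip(miss_ratios["lru-0-0"]["x"], miss_ratios["lru-0-0"]["y"]))
for config, stats in miss_ratios.items():
    lookup = dict(zip(stats["x"], stats["y"]))
    # A config with no measurement at a baseline capacity contributes 0,
    # matching the diffs = [0] * len(...) initialization in the patch.
    diffs = [
        lookup.get(capacity, baseline[capacity]) - baseline[capacity]
        for capacity in miss_ratios["lru-0-0"]["x"]
    ]
    print(config, diffs)
# Prints:
# lru-0-0 [0.0, 0.0, 0.0]
# lru-1-4096 [-1.5, 0.0, -2.0]

One compatibility caveat: plt.xscale("log", basex=2) and the positional
pandas pivot("label", "corr", "value") call rely on the Matplotlib and pandas
APIs of the time; Matplotlib 3.3+ renamed basex to base, and pandas 2.0 made
the pivot arguments keyword-only (index=, columns=, values=).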