From b4e59a48fd204d1fceb6e2efd01d196ce7e8931f Mon Sep 17 00:00:00 2001
From: Levi Tamasi <ltamasi@fb.com>
Date: Tue, 12 Oct 2021 11:33:36 -0700
Subject: [PATCH] Add a benchmarking wrapper script for BlobDB (#9015)

Summary:
The patch adds a new BlobDB benchmarking script called `run_blob_bench.sh`.
It is a thin wrapper around `benchmark.sh` (similar to `run_flash_bench.sh`).
It calls `benchmark.sh` once for each of six workloads: two write-only ones
(bulk load and overwrite), two read/write ones (point lookups while writing
and range scans while writing), and two read-only ones (point lookups and
range scans).

Note: this is a simpler/cleaned up/reworked version of the script used to produce the
benchmark results in http://rocksdb.org/blog/2021/05/26/integrated-blob-db.html .
The new version takes advantage of several recent `benchmark.sh` improvements
like the ability to pass in arbitrary `db_bench` options or the possibility of using a
job ID.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9015

Test Plan: Ran the script manually with different parameter combinations.

Reviewed By: riversand963

Differential Revision: D31555277

Pulled By: ltamasi

fbshipit-source-id: 0e151b2f7b2cf6f66ed7f95455571492ad7ea87f
---
 tools/benchmark.sh      |   2 +-
 tools/run_blob_bench.sh | 195 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100755 tools/run_blob_bench.sh

diff --git a/tools/benchmark.sh b/tools/benchmark.sh
index 665990dfb..80e36f495 100755
--- a/tools/benchmark.sh
+++ b/tools/benchmark.sh
@@ -446,7 +446,7 @@ function run_fillseq {
        --disable_wal=$1 \
        --seed=$( date +%s ) \
        2>&1 | tee -a $log_file_name"
-  
+
   if [[ "$job_id" != "" ]]; then
     echo "Job ID: ${job_id}" > $log_file_name
     echo $cmd | tee -a $log_file_name
diff --git a/tools/run_blob_bench.sh b/tools/run_blob_bench.sh
new file mode 100755
index 000000000..b3657438a
--- /dev/null
+++ b/tools/run_blob_bench.sh
@@ -0,0 +1,195 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# BlobDB benchmark script
+#
+# REQUIRES: benchmark.sh is in the tools subdirectory
+#
+# After the execution of this script, log files are available in $output_dir.
+# report.tsv provides high level statistics.
+#
+# Should be run from the parent of the tools directory. The command line is:
+#   [$env_vars] tools/run_blob_bench.sh
+#
+# This runs the following sequence of BlobDB performance tests:
+#   phase 1) write-only - bulkload+compact, overwrite+waitforcompaction
+#   phase 2) read-write - readwhilewriting, fwdrangewhilewriting
+#   phase 3) read-only - readrandom, fwdrange
+#
+
+# Exit Codes
+EXIT_INVALID_ARGS=1
+
+# Size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+T=$((1024 * G))
+
+display_usage() {  # prints the help text and the supported environment variables to stdout
+  printf '%s\n' "usage: run_blob_bench.sh [--help]"
+  printf '\n'
+  printf '%s\n' "Runs the following sequence of BlobDB benchmark tests using tools/benchmark.sh:"
+  printf '\tPhase 1: write-only tests: bulkload+compact, overwrite+waitforcompaction\n'
+  printf '\tPhase 2: read-write tests: readwhilewriting, fwdrangewhilewriting\n'
+  printf '\tPhase 3: read-only tests: readrandom, fwdrange\n'
+  printf '\n'
+  printf '%s\n' "Environment Variables:"
+  printf '\tJOB_ID\t\t\t\tIdentifier for the benchmark job, will appear in the results (default: empty)\n'
+  printf '\tDB_DIR\t\t\t\tPath for the RocksDB data directory (mandatory)\n'
+  printf '\tWAL_DIR\t\t\t\tPath for the RocksDB WAL directory (mandatory)\n'
+  printf '\tOUTPUT_DIR\t\t\tPath for the benchmark results (mandatory)\n'
+  printf '\tNUM_THREADS\t\t\tNumber of threads (default: 16)\n'
+  printf '\tCOMPRESSION_TYPE\t\tCompression type for the SST files (default: lz4)\n'
+  printf '\tDB_SIZE\t\t\t\tRaw (uncompressed) database size (default: 1 TB)\n'
+  printf '\tVALUE_SIZE\t\t\tValue size (default: 1 KB)\n'
+  printf '\tNUM_KEYS\t\t\tNumber of keys (default: raw database size divided by value size)\n'
+  printf '\tDURATION\t\t\tIndividual duration for read-write/read-only tests in seconds (default: 1800)\n'
+  printf '\tWRITE_BUFFER_SIZE\t\tWrite buffer (memtable) size (default: 1 GB)\n'
+  printf '\tENABLE_BLOB_FILES\t\tEnable blob files (default: 1)\n'
+  printf '\tMIN_BLOB_SIZE\t\t\tSize threshold for storing values in blob files (default: 0)\n'
+  printf '\tBLOB_FILE_SIZE\t\t\tBlob file size (default: same as write buffer size)\n'
+  printf '\tBLOB_COMPRESSION_TYPE\t\tCompression type for the blob files (default: lz4)\n'
+  printf '\tENABLE_BLOB_GC\t\t\tEnable blob garbage collection (default: 1)\n'
+  printf '\tBLOB_GC_AGE_CUTOFF\t\tBlob garbage collection age cutoff (default: 0.25)\n'
+  printf '\tBLOB_GC_FORCE_THRESHOLD\t\tThreshold for forcing garbage collection of the oldest blob files (default: 1.0)\n'
+  printf '\tTARGET_FILE_SIZE_BASE\t\tTarget SST file size for compactions (default: write buffer size, scaled down if blob files are enabled)\n'
+  printf '\tMAX_BYTES_FOR_LEVEL_BASE\tMaximum size for the base level (default: 8 * target SST file size)\n'
+}
+
+if [ $# -ge 1 ]; then
+  if [ "$1" == "--help" ]; then
+    display_usage
+    exit
+  fi
+  # The script takes no other arguments: report the usage as an error on stderr.
+  display_usage >&2
+  exit $EXIT_INVALID_ARGS
+fi
+
+# shellcheck disable=SC2153
+if [ -z "$DB_DIR" ]; then
+  echo "DB_DIR is not defined" >&2  # diagnostics go to stderr, not stdout
+  exit $EXIT_INVALID_ARGS
+fi
+
+# shellcheck disable=SC2153
+if [ -z "$WAL_DIR" ]; then
+  echo "WAL_DIR is not defined" >&2
+  exit $EXIT_INVALID_ARGS
+fi
+
+# shellcheck disable=SC2153
+if [ -z "$OUTPUT_DIR" ]; then
+  echo "OUTPUT_DIR is not defined" >&2
+  exit $EXIT_INVALID_ARGS
+fi
+
+# shellcheck disable=SC2153
+job_id=$JOB_ID  # optional; stays empty when JOB_ID is not set
+
+db_dir=$DB_DIR
+wal_dir=$WAL_DIR
+output_dir=$OUTPUT_DIR
+
+num_threads=${NUM_THREADS:-16}
+
+compression_type=${COMPRESSION_TYPE:-lz4}
+
+db_size=${DB_SIZE:-$((1 * T))}
+value_size=${VALUE_SIZE:-$((1 * K))}
+num_keys=${NUM_KEYS:-$((db_size / value_size))}  # integer division
+
+duration=${DURATION:-1800}  # seconds
+
+write_buffer_size=${WRITE_BUFFER_SIZE:-$((1 * G))}
+
+enable_blob_files=${ENABLE_BLOB_FILES:-1}
+min_blob_size=${MIN_BLOB_SIZE:-0}
+blob_file_size=${BLOB_FILE_SIZE:-$write_buffer_size}
+blob_compression_type=${BLOB_COMPRESSION_TYPE:-lz4}
+enable_blob_garbage_collection=${ENABLE_BLOB_GC:-1}
+blob_garbage_collection_age_cutoff=${BLOB_GC_AGE_CUTOFF:-0.25}
+blob_garbage_collection_force_threshold=${BLOB_GC_FORCE_THRESHOLD:-1.0}
+# NOTE(review): 32 presumably approximates the bytes left in the SST per value once it moves to a blob file -- confirm
+if [ "$enable_blob_files" == "1" ]; then
+  target_file_size_base=${TARGET_FILE_SIZE_BASE:-$((32 * write_buffer_size / value_size))}
+else
+  target_file_size_base=${TARGET_FILE_SIZE_BASE:-$write_buffer_size}
+fi
+
+max_bytes_for_level_base=${MAX_BYTES_FOR_LEVEL_BASE:-$((8 * target_file_size_base))}
+
+echo "======================== Benchmark setup ========================"
+printf 'Job ID:\t\t\t\t\t%s\n' "$job_id"  # %s prints values verbatim; echo -e would expand backslashes in them
+printf 'Data directory:\t\t\t\t%s\n' "$db_dir"
+printf 'WAL directory:\t\t\t\t%s\n' "$wal_dir"
+printf 'Output directory:\t\t\t%s\n' "$output_dir"
+printf 'Number of threads:\t\t\t%s\n' "$num_threads"
+printf 'Compression type for SST files:\t\t%s\n' "$compression_type"
+printf 'Raw database size:\t\t\t%s\n' "$db_size"
+printf 'Value size:\t\t\t\t%s\n' "$value_size"
+printf 'Number of keys:\t\t\t\t%s\n' "$num_keys"
+printf 'Duration of read-write/read-only tests:\t%s\n' "$duration"
+printf 'Write buffer size:\t\t\t%s\n' "$write_buffer_size"
+printf 'Blob files enabled:\t\t\t%s\n' "$enable_blob_files"
+printf 'Blob size threshold:\t\t\t%s\n' "$min_blob_size"
+printf 'Blob file size:\t\t\t\t%s\n' "$blob_file_size"
+printf 'Compression type for blob files:\t%s\n' "$blob_compression_type"
+printf 'Blob GC enabled:\t\t\t%s\n' "$enable_blob_garbage_collection"
+printf 'Blob GC age cutoff:\t\t\t%s\n' "$blob_garbage_collection_age_cutoff"
+printf 'Blob GC force threshold:\t\t%s\n' "$blob_garbage_collection_force_threshold"
+printf 'Target SST file size:\t\t\t%s\n' "$target_file_size_base"
+printf 'Maximum size of base level:\t\t%s\n' "$max_bytes_for_level_base"
+echo "================================================================="
+
+rm -rf -- "${db_dir:?}"  # start clean; -- stops option parsing, :? aborts on an empty path
+rm -rf -- "${wal_dir:?}"
+rm -rf -- "${output_dir:?}"
+
+ENV_VARS=(  # one NAME=VALUE word per element, handed to env as-is (no env -S needed)
+  "JOB_ID=$job_id"
+  "DB_DIR=$db_dir"
+  "WAL_DIR=$wal_dir"
+  "OUTPUT_DIR=$output_dir"
+  "NUM_THREADS=$num_threads"
+  "COMPRESSION_TYPE=$compression_type"
+  "VALUE_SIZE=$value_size"
+  "NUM_KEYS=$num_keys")
+# Same environment plus DURATION, used for the read-write and read-only tests
+ENV_VARS_D=("${ENV_VARS[@]}" "DURATION=$duration")
+
+PARAMS="\
+  --enable_blob_files=$enable_blob_files \
+  --min_blob_size=$min_blob_size \
+  --blob_file_size=$blob_file_size \
+  --blob_compression_type=$blob_compression_type \
+  --write_buffer_size=$write_buffer_size \
+  --target_file_size_base=$target_file_size_base \
+  --max_bytes_for_level_base=$max_bytes_for_level_base"
+
+PARAMS_GC="$PARAMS \
+  --enable_blob_garbage_collection=$enable_blob_garbage_collection \
+  --blob_garbage_collection_age_cutoff=$blob_garbage_collection_age_cutoff \
+  --blob_garbage_collection_force_threshold=$blob_garbage_collection_force_threshold"
+
+# bulk load (using fillrandom) + compact; DURATION is removed from the environment
+env -u DURATION "${ENV_VARS[@]}" ./tools/benchmark.sh bulkload "$PARAMS"
+
+# overwrite + waitforcompaction; DURATION is removed from the environment
+env -u DURATION "${ENV_VARS[@]}" ./tools/benchmark.sh overwrite "$PARAMS_GC"
+
+# readwhilewriting
+env "${ENV_VARS_D[@]}" ./tools/benchmark.sh readwhilewriting "$PARAMS_GC"
+
+# fwdrangewhilewriting
+env "${ENV_VARS_D[@]}" ./tools/benchmark.sh fwdrangewhilewriting "$PARAMS_GC"
+
+# readrandom
+env "${ENV_VARS_D[@]}" ./tools/benchmark.sh readrandom "$PARAMS_GC"
+
+# fwdrange
+env "${ENV_VARS_D[@]}" ./tools/benchmark.sh fwdrange "$PARAMS_GC"
+
+# copy the LOG* files from the data directory into the output directory
+cp "$db_dir"/LOG* "$output_dir/"