70c7302fb5
Summary: This PR implements cache eviction using reinforcement learning. It includes two implementations: 1. An implementation of Thompson Sampling for the Bernoulli Bandit [1]. 2. An implementation of LinUCB with disjoint linear models [2]. The idea is that a cache uses multiple eviction policies, e.g., MRU, LRU, and LFU. The cache learns which eviction policy is the best and uses it upon a cache miss. Thompson Sampling is contextless and does not include any features. LinUCB includes features such as level, block type, caller, column family id to decide which eviction policy to use. [1] Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband, and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found. Trends Mach. Learn. 11, 1 (July 2018), 1-96. DOI: https://doi.org/10.1561/2200000070 [2] Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010. A contextual-bandit approach to personalized news article recommendation. In Proceedings of the 19th international conference on World wide web (WWW '10). ACM, New York, NY, USA, 661-670. DOI=http://dx.doi.org/10.1145/1772690.1772758 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5610 Differential Revision: D16435067 Pulled By: HaoyuHuang fbshipit-source-id: 6549239ae14115c01cb1e70548af9e46d8dc21bb
119 lines
3.7 KiB
Bash
119 lines
3.7 KiB
Bash
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#
# A shell script to run a batch of pysims and combine individual pysim
# output files.
#
# Usage: bash block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs
#   trace_file_path: The file path that stores the traces.
#   result_dir:      The directory to store pysim results. The output files
#                    from a pysim are stored in result_dir/ml.
#   downsample_size: The downsample size used to collect the trace.
#   warmup_seconds:  The number of seconds used for warmup.
#   max_jobs:        The max number of concurrent pysims to run.

if [ $# -ne 5 ]; then
  # Report misuse on stderr and exit non-zero so callers/CI can detect it
  # (the original printed to stdout and exited 0, masking the error).
  echo "Usage: ./block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs" >&2
  exit 1
fi
# Positional arguments (count validated above; values may still be empty).
trace_file="$1"
result_dir="$2"
downsample_size="$3"
warmup_seconds="$4"
max_jobs="$5"
# Number of pysim jobs we believe are currently running.
current_jobs=0

# Per-pysim output files are written under result_dir/ml.
# ${result_dir:?} aborts the script if result_dir is empty, so the
# recursive delete below can never resolve to the root-level path "/ml".
ml_tmp_result_dir="${result_dir:?}/ml"

# Start from a clean per-pysim output directory.
rm -rf "$ml_tmp_result_dir"
mkdir -p "$result_dir"
mkdir -p "$ml_tmp_result_dir"
# Launch one pysim per (cache_type, cache_size) pair, keeping at most
# $max_jobs python processes alive at any time.
for cache_type in "ts" "linucb" "ts_hybrid" "linucb_hybrid"
do
  for cache_size in "16M" "256M" "1G" "2G" "4G" "8G" "12G" "16G"
  do
    # Throttle: poll every 10s until the number of running pysims drops
    # below the cap.
    while [ "$current_jobs" -ge "$max_jobs" ]
    do
      sleep 10
      echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
      # Count live pysim python processes; the final 'grep -cv grep' drops
      # the grep commands themselves from the count.
      current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep)
      echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
    done
    output="log-ml-$cache_type-$cache_size"
    echo "Running simulation for $cache_type and cache size $cache_size. Number of running jobs: $current_jobs. "
    # The redirect target is quoted (the original's unquoted '>& $dir/$out'
    # breaks if result_dir contains spaces); '> file 2>&1' is the portable
    # spelling of bash's '>&'.
    nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" > "$ml_tmp_result_dir/$output" 2>&1 &
    current_jobs=$((current_jobs+1))
  done
done
# Barrier: poll every 10s until every pysim launched above has exited.
while (( current_jobs > 0 )); do
  sleep 10
  echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
  # Re-count live pysim python processes, excluding the grep commands
  # themselves from the tally.
  current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep)
  echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
done
echo "Combine individual pysim output files"

# Remove stale combined files from a previous run. The glob must stay
# OUTSIDE the quotes: the original 'rm -rf "$result_dir/ml_*"' passed the
# literal string 'ml_*' to rm and never deleted anything.
rm -rf "$result_dir"/ml_*
mrc_file="$result_dir/ml_mrc"
# Two passes over the per-pysim files: the "header-" pass copies exactly
# one header line into each combined file, then the "data-" pass appends
# all data rows.
for header in "header-" "data-"
do
  for file_path in "$ml_tmp_result_dir"/*
  do
    # Match and parse on the basename. The original compared the FULL path
    # against "${header}..." prefixes, which could never match, so no
    # output was ever combined; it would also have split the directory
    # components into the '-'-separated element list below.
    fn="${file_path##*/}"
    sum_file=""
    time_unit=""
    capacity=""
    if [[ $fn == *"timeline"* ]]; then
      # Timeline file names are '-'-separated; locate the "timeline"
      # element, then pick out the time unit (next element) and capacity
      # (two elements further) that follow it.
      IFS='-' read -ra elements <<< "$fn"
      time_unit_index=0
      capacity_index=0
      for i in "${elements[@]}"
      do
        if [[ $i == "timeline" ]]; then
          break
        fi
        time_unit_index=$((time_unit_index+1))
      done
      time_unit_index=$((time_unit_index+1))
      capacity_index=$((time_unit_index+2))
      time_unit="${elements[$time_unit_index]}_"
      capacity="${elements[$capacity_index]}_"
    fi

    # Route the file to its combined output based on its name.
    if [[ $fn == "${header}ml-policy-timeline"* ]]; then
      sum_file="$result_dir/ml_${capacity}${time_unit}policy_timeline"
    fi
    if [[ $fn == "${header}ml-policy-ratio-timeline"* ]]; then
      sum_file="$result_dir/ml_${capacity}${time_unit}policy_ratio_timeline"
    fi
    if [[ $fn == "${header}ml-miss-timeline"* ]]; then
      sum_file="$result_dir/ml_${capacity}${time_unit}miss_timeline"
    fi
    if [[ $fn == "${header}ml-miss-ratio-timeline"* ]]; then
      sum_file="$result_dir/ml_${capacity}${time_unit}miss_ratio_timeline"
    fi
    if [[ $fn == "${header}ml-mrc"* ]]; then
      sum_file="$mrc_file"
    fi
    if [[ $sum_file == "" ]]; then
      continue
    fi
    if [[ $header == "header-" ]]; then
      # Keep only one header line per combined file.
      if [ -e "$sum_file" ]; then
        continue
      fi
    fi
    cat "$file_path" >> "$sum_file"
  done
done
echo "Done"

# Sort the combined MRC file by cache_type (field 1) and cache_size
# (field 4, numeric), comma-separated.
tmp_file="$result_dir/tmp_mrc"
# sort reads the file directly (no useless cat), and mv replaces the
# unsorted file in one step instead of the original cat-then-rm; -f keeps
# the previous 'rm -rf' semantics of never prompting. rm -rf on a plain
# file was also the wrong tool.
sort -t ',' -k1,1 -k4,4n "$mrc_file" > "$tmp_file"
mv -f "$tmp_file" "$mrc_file"