Added simple monitoring script to monitor overusage of memory in db_bench

Summary: RocksDB can use more memory than it was asked to. Monitor and report.

Test Plan: run the program with conditions that simulate the overusage. It should report that the process is using more memory than needed.

Reviewers: yhchiang, rven, sdong, igor

Reviewed By: igor

Subscribers: dhruba

Differential Revision: https://reviews.facebook.net/D33249
This commit is contained in:
Ramki Balasubramanian 2015-02-11 18:40:11 -08:00
parent 5f00af4570
commit 5d1151deba
2 changed files with 319 additions and 0 deletions

102
tools/dbench_monitor Executable file
View File

@ -0,0 +1,102 @@
#!/bin/bash
#
#(c) 2004-present, Facebook Inc. All rights reserved.
#
#see LICENSE file for more information on use/redistribution rights.
#
#
#dbench_monitor: monitor db_bench process for violation of memory utilization
#
#default usage will monitor 'virtual memory size'. See below for standard options
#passed to db_bench during this test.
#
# See also: ./pflag for the actual monitoring script that does the work
#
#NOTE:
# You may end up with some /tmp/ files if db_bench OR
# this script OR ./pflag was killed unceremoniously
#
# If you see the script taking a long time, trying "kill"
# will usually cleanly exit.
#
#
# Locations used by the rest of the script, all derived from where this
# script itself lives. "$0" is quoted so a path containing spaces or glob
# characters is handled correctly.
DIR=$(dirname "$0")
# File that receives db_bench's stdout/stderr; the name ends in this shell's PID.
LOG="/tmp/$(basename "$0").$$"
DB_BENCH="$DIR/../db_bench"
PFLAG="${DIR}/pflag"
# usage: write the help text for this wrapper to stdout and terminate the
# script. The here-document is unquoted on purpose so that $0 expands to the
# name the script was invoked with.
usage() {
  cat <<HELP
Usage: $0 [-h]
-h: prints this help message
This program will run the db_bench script to monitor memory usage
using the 'pflag' program. It launches db_bench with default settings
for certain arguments. You can change the defaults passed to
'db_bench' program, by setting the following environment
variables:
bs [block_size]
ztype [compression_type]
benches [benchmarks]
reads [reads]
threads [threads]
cs [cache_size]
vsize [value_size]
comp [compression_ratio]
num [num]
See the code for more info
HELP
  # Leave with the status of the cat above.
  exit
}
# ---- main flow -------------------------------------------------------------
# Help is handled first so that -h works even when db_bench is not built yet.
[ "x$1" = "x-h" ] && usage

# Refuse to continue without an executable db_bench next to this tools dir.
if [ ! -x "${DB_BENCH}" ]; then
  echo "WARNING: ${DB_BENCH} doesn't exist, abort!" >&2
  exit 255
fi

# PID stays empty until db_bench is launched, so the trap below never runs
# kill without a target.
PID=
# On HUP/INT/QUIT/TERM: remove the log, stop db_bench if it is running, and
# leave with a failure status instead of falling through into the script.
trap 'rm -f "${LOG}"; [ -n "${PID}" ] && kill "${PID}" 2>/dev/null; echo "Interrupted, exiting"; exit 1' 1 2 3 15
touch "$LOG"

# db_bench settings; each one can be overridden from the environment.
: "${bs:=16384}"
: "${ztype:=zlib}"
: "${benches:=readwhilewriting}"
: "${reads:=$((1*1024*1024))}"
: "${threads:=8}"
: "${vsize:=2000}"
: "${comp:=0.5}"
: "${num:=10000}"
: "${cs:=$((1*1024*1024*1024))}"

# The command line is built once so the DEBUG echo and the real launch can
# never drift apart.
bench_args=(
  --block_size="$bs"
  --compression_type="$ztype"
  --benchmarks="$benches"
  --reads="$reads"
  --threads="$threads"
  --cache_size="$cs"
  --value_size="$vsize"
  --compression_ratio="$comp"
  --num="$num"
  --use_existing_db
)

DEBUG=1 #Set to 0 (or empty) to remove chattiness
if [ -n "$DEBUG" ] && [ "$DEBUG" != "0" ]; then
  #
  #NOTE: under some circumstances, --use_existing_db may leave LOCK files under ${TMPDIR}/rocksdb/*
  #cleanup the dir and re-run
  #
  echo DEBUG: Will run "$DB_BENCH" "${bench_args[@]}"
fi

"$DB_BENCH" "${bench_args[@]}" >"$LOG" 2>&1 &
PID=$!
# "$?" after "&" does not carry db_bench's own exit status, so probe the child
# instead. This catches a failed fork (empty PID) or a child that is already
# gone; a db_bench that fails later is seen by pflag as a vanished process.
if ! kill -0 "$PID" 2>/dev/null; then
  echo "WARNING: ${DB_BENCH} did not launch successfully! Abort!" >&2
  rm -f "$LOG"
  exit 1
fi

#
#Start the monitoring. No -x or -l is passed, so pflag uses its own default
#metric and limit; the cache size ($cs) is not forwarded to it.
#You could also monitor RSS and CPUTIME (bsdtime). Try 'pflag -h' for how to do this
#
"${PFLAG}" -p "$PID" -v
rm -f "$LOG"

217
tools/pflag Executable file
View File

@ -0,0 +1,217 @@
#!/bin/bash
#
#(c) 2004-present, Facebook, all rights reserved.
# See the LICENSE file for usage and distribution rights.
#
# On HUP/INT/QUIT/TERM announce the interruption and leave with a failure
# status (a bare "exit" would report echo's status, i.e. success).
trap 'echo "Caught exception, dying"; exit 1' 1 2 3 15
ME=$(basename "$0")
SERVER=$(hostname)
#parameters used
#
Dump_Config=0   # 1 when -d was given: print the configuration and exit
DEBUG=          # non-empty enables verbose() output
OS=$(uname -s)  # looked up via PATH rather than a fixed /bin location
VMEM=           # the three names below are filled in by oscheck
RSS=
CPU=
VERBOSE=
PID=            # cleared so a PID variable in the environment is never monitored by accident
VAR=            # metric to watch
LIMIT=          # threshold for VAR
ACTION=         # what to do when the threshold is reached
N=              # number of monitoring cycles
WAIT=           # seconds between cycles
#
# oscheck: set the metric names for the running OS.
# Reads:  OS
# Writes: VMEM, RSS, CPU (only when OS is exactly "Linux")
# Any other OS value is handed to die with an explanatory message.
#
oscheck() {
  if [ "${OS}" = "Linux" ]; then
    VMEM=vsz
    RSS=rss
    CPU=bsdtime
    return
  fi
  die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
}
# verbose: echo the arguments to stderr, but only when DEBUG is non-empty.
verbose() {
  [ -z "$DEBUG" ] && return 0
  echo "$@" >&2
}
# warn: echo the arguments to stderr unconditionally.
warn() {
  >&2 echo "$@"
}
# die: print "ERROR: " followed by the arguments on stderr and terminate the
# script with a non-zero status. A bare "exit" here would reuse echo's status
# and make fatal errors look like success to the caller.
die() {
  echo "ERROR: " "$@" >&2
  exit 1
}
# dump_config: print the effective settings of this run to stdout, one per
# line, headed by the script name, ${HOSTNAME} and the current date.
# Reads: ME, HOSTNAME, PID, VAR, LIMIT, WAIT, N, ACTION
dump_config() {
  printf '%s\n' \
    "$ME running on ${HOSTNAME} at $(date)" \
    "Configuration for this run:" \
    "PID to monitor : ${PID}" \
    "Resource monitored : ${VAR}" \
    "Resource limit : ${LIMIT}" \
    "Check every : ${WAIT} seconds" \
    "No. of times run : ${N}" \
    "What to do : ${ACTION}"
}
# usage [message...]: print an optional message followed by the option
# summary, then terminate the script.
#   - no message (plain help request): text on stdout, exit status 0
#   - with a message (usage error):    text on stderr, exit status 1
# so that callers can tell a usage error apart from a successful -h.
usage() {
  local status=0 fd=1
  if [ $# -gt 0 ]; then
    status=1
    fd=2
  fi
  cat <<USAGE >&"$fd"
$@
Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait] [-v] [-d] [-h]
Monitor a process for set of violations. Options:
-p: PID of process to monitor
-x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM
-l: what is the threshold/limit for the metric that is being sensed.
Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
NOTE: defaults to 1GB
-a: action. Currently {warn|die|kill} are supported.
The default action is to 'warn'. Here is the behavior:
warn: complain if usage exceeds threshold, but continue monitoring
kill: complain, kill the db_bench process and exit
die: if usage exceeds threshold, die immediately
-n: number of cycles to monitor. Default is to monitor until PID no longer exists.
-w: wait time per cycle of monitoring. Default is 5 seconds.
-v: verbose messaging
-d: print the configuration for this run and exit
-h: print this help message and exit
USAGE
  exit "$status"
}
# set_defaults_if_noopt_given: give every setting that is still unset or
# empty its default value; settings that already hold a value are untouched.
#   VAR=vsz  LIMIT=1024000  WAIT=5  N=999999  ACTION=warn
set_defaults_if_noopt_given() {
  [ -n "$VAR" ]    || VAR=vsz
  [ -n "$LIMIT" ]  || LIMIT=1024000
  [ -n "$WAIT" ]   || WAIT=5
  [ -n "$N" ]      || N=999999
  [ -n "$ACTION" ] || ACTION=warn
}
# validate_options: a PID is required unless the run only dumps the
# configuration (Dump_Config is 1); otherwise hand over to usage with an
# explanatory message.
validate_options() {
  if [ -z "$PID" ] && [ "$Dump_Config" -ne 1 ]; then
    usage "PID is mandatory"
  fi
}
###### START
# parse_options ARGS...: translate command-line flags into the global
# settings (Dump_Config, ACTION, DEBUG, PID, VAR, LIMIT, WAIT, N).
#   - The wait time is accepted as -w, the documented flag. -t is kept as an
#     alias because the option string used to declare "t:", which left -w
#     rejected and -t silently ignored.
#   - The leading ":" in the option string selects getopts' silent mode, so a
#     missing argument arrives as ":" and an unknown flag as "?"; both are
#     reported through usage with a message naming the offending option.
parse_options() {
  local opt OPTARG OPTIND=1
  while getopts ":p:x:l:a:n:w:t:vhd" opt; do
    case "$opt" in
      d)
        Dump_Config=1
        ;;
      h)
        usage
        ;;
      a)
        ACTION=${OPTARG}
        ;;
      v)
        DEBUG=1
        ;;
      p)
        PID=${OPTARG}
        ;;
      x)
        VAR=${OPTARG}
        ;;
      l)
        LIMIT=${OPTARG}
        ;;
      w|t)
        WAIT=${OPTARG}
        ;;
      n)
        N=${OPTARG}
        ;;
      :)
        usage "Option -${OPTARG} requires an argument"
        ;;
      \?)
        usage "Unknown option -${OPTARG}"
        ;;
    esac
  done
}
parse_options "$@"
# ---- main flow -------------------------------------------------------------
# 1. Resolve OS-specific metric names, fill in defaults, validate.
# 2. With -d: print the configuration as given and stop.
# 3. Otherwise poll the process up to N times, WAIT seconds apart, and apply
#    ACTION every time the metric reaches LIMIT.

# to_units VALUE: print VALUE as a plain integer so it can be compared with
# the shell's integer tests. Accepted forms:
#   123        -> 123
#   100m 1.5g  -> scaled by 1024 / 1024*1024 (suffix is case-insensitive)
#   5:04       -> colon-separated time folded into seconds
# Returns non-zero and prints nothing when VALUE has none of these forms.
to_units() {
  awk -v v="$1" 'BEGIN {
    gsub(/[[:space:]]/, "", v)
    if (v == "") exit 1
    if (v ~ /^[0-9]+(:[0-9]+)+$/) {
      n = split(v, part, ":")
      total = 0
      for (i = 1; i <= n; i++) total = total * 60 + part[i]
      printf "%.0f\n", total
      exit 0
    }
    mult = 1
    suffix = tolower(substr(v, length(v)))
    if (suffix == "m") { mult = 1024; v = substr(v, 1, length(v) - 1) }
    else if (suffix == "g") { mult = 1024 * 1024; v = substr(v, 1, length(v) - 1) }
    if (v !~ /^[0-9]+(\.[0-9]+)?$/ && v !~ /^\.[0-9]+$/) exit 1
    printf "%.0f\n", v * mult
  }'
}

oscheck
set_defaults_if_noopt_given
validate_options
if [ "$Dump_Config" -eq 1 ]; then
  dump_config
  exit
fi

# The usage text advertises VMEM/RSS/CPU; translate them to the column names
# oscheck selected. Anything else is passed to ps unchanged.
case "${VAR}" in
  VMEM|vmem) VAR=${VMEM:-vsz} ;;
  RSS)       VAR=${RSS:-rss} ;;
  CPU|cpu)   VAR=${CPU:-bsdtime} ;;
esac

# Reject settings the loop cannot act on before any monitoring starts.
case "${ACTION}" in
  warn|kill|die) ;;
  *) die "Unknown action '${ACTION}' (expected warn, kill or die)" ;;
esac
case "${N}" in
  ''|*[!0-9]*) die "Cycle count '${N}' is not a whole number" ;;
esac
raw_limit=${LIMIT}
LIMIT=$(to_units "${raw_limit}") || die "Cannot interpret limit '${raw_limit}'"

verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"
cycle=0
while [ "$cycle" -lt "$N" ]; do
  cycle=$((cycle + 1))

  # "-o NAME=" prints the single column without a header line.
  raw=$(ps -p "$PID" -o "${VAR}=")
  if [ -z "${raw//[[:space:]]/}" ]; then
    # ps printed nothing for this PID: treat the process as gone.
    warn "Process $PID ended without incident."
    exit 0
  fi
  VAL=$(to_units "$raw") || die "Cannot interpret '${VAR}' value '${raw}' for PID ${PID}"

  if [ "$VAL" -lt "$LIMIT" ]; then
    echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
    sleep "$WAIT"
    continue
  fi

  # Threshold reached: apply the configured action.
  case "${ACTION}" in
    kill)
      warn "`date` WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}; killing process ${PID}"
      kill "${PID}" || kill -3 "${PID}"
      exit
      ;;
    warn)
      warn "`date` WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
      # Wait before the next sample; without this a breached limit turns the
      # loop into a busy loop that floods stderr.
      sleep "$WAIT"
      ;;
    die)
      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
      warn "The process details are below: "
      warn "`ps -p ${PID} -o pid,ppid,bsdtime,rss,vsz,cmd,args`"
      warn ""
      #should we send email/notify someone? TODO... for now, bail.
      exit 255
      ;;
  esac
done
verbose "Stopped after ${N} monitoring cycles"