Added simple monitoring script to monitor overusage of memory in db_bench
Summary: RocksDB may use more memory than it was asked to. Monitor and report. Test Plan: run the program with conditions that simulate the overusage. It should report that the process is using more memory than needed. Reviewers: yhchiang, rven, sdong, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D33249
This commit is contained in:
parent
5f00af4570
commit
5d1151deba
102
tools/dbench_monitor
Executable file
102
tools/dbench_monitor
Executable file
@ -0,0 +1,102 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#(c) 2004-present, Facebook Inc. All rights reserved.
|
||||
#
|
||||
#see LICENSE file for more information on use/redistribution rights.
|
||||
#
|
||||
|
||||
#
|
||||
#dbench_monitor: monitor db_bench process for violation of memory utilization
|
||||
#
|
||||
#default usage will monitor 'virtual memory size'. See below for standard options
|
||||
#passed to db_bench during this test.
|
||||
#
|
||||
# See also: ./pflag for the actual monitoring script that does the work
|
||||
#
|
||||
#NOTE:
|
||||
# You may end up with some /tmp/ files if db_bench OR
|
||||
# this script OR ./pflag was killed unceremoniously
|
||||
#
|
||||
# If you see the script taking a long time, trying "kill"
|
||||
# will usually cleanly exit.
|
||||
#
|
||||
#
|
||||
# Paths are resolved relative to this script's own location.
# "$0" is quoted so an install path containing spaces still works.
DIR=$(dirname -- "$0")
# Scratch file that receives db_bench's stdout/stderr; the shell PID suffix
# keeps the name distinct per run.
LOG="/tmp/$(basename -- "$0").$$"
# Benchmark binary, expected one directory above this script.
DB_BENCH="${DIR}/../db_bench"
# Companion script (same directory) that does the actual monitoring.
PFLAG="${DIR}/pflag"
|
||||
|
||||
# usage: print the help text for this wrapper on stdout, then terminate the
# script. The exit status is that of the `cat` that printed the text.
# The heredoc delimiter is unquoted so that $0 expands to the invoked name.
usage() {
  cat <<HELP

Usage: $0 [-h]

-h: prints this help message

This program will run the db_bench script to monitor memory usage
using the 'pflag' program. It launches db_bench with default settings
for certain arguments. You can change the defaults passed to
'db_bench' program, by setting the following environment
variables:

bs [block_size]
ztype [compression_type]
benches [benchmarks]
reads [reads]
threads [threads]
cs [cache_size]
vsize [value_size]
comp [compression_ratio]
num [num]

See the code for more info

HELP
  exit
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main flow: launch db_bench in the background, hand its PID to pflag, and
# clean up the scratch log when monitoring finishes.
# ---------------------------------------------------------------------------

# Serve -h first so the help text is available even before db_bench is built.
[ "x$1" = "x-h" ] && usage

if [ ! -x "${DB_BENCH}" ]; then
  echo "WARNING: ${DB_BENCH} doesn't exist, abort!" >&2
  exit 1
fi

# On HUP/INT/QUIT/TERM: remove the scratch log, stop db_bench if it has been
# started, and leave with a failure status. PID is cleared up front so the
# handler never runs `kill` with an empty or inherited value.
PID=
trap 'rm -f "${LOG}"; [ -n "${PID}" ] && kill "${PID}" 2>/dev/null; echo "Interrupted, exiting"; exit 1' 1 2 3 15

touch "$LOG"

# db_bench settings; each can be overridden from the environment.
: "${bs:=16384}"
: "${ztype:=zlib}"
: "${benches:=readwhilewriting}"
: "${reads:=$((1 * 1024 * 1024))}"
: "${threads:=8}"
: "${vsize:=2000}"
: "${comp:=0.5}"
: "${num:=10000}"
: "${cs:=$((1 * 1024 * 1024 * 1024))}"

DEBUG=1 #Set to 0 to remove chattiness

# Build the command line once so the debug echo and the real launch can
# never drift apart.
db_bench_args=(
  --block_size="$bs"
  --compression_type="$ztype"
  --benchmarks="$benches"
  --reads="$reads"
  --threads="$threads"
  --cache_size="$cs"
  --value_size="$vsize"
  --compression_ratio="$comp"
  --num="$num"
  --use_existing_db
)

# Both an empty value and 0 silence the debug line.
if [ "${DEBUG:-0}" != "0" ]; then
  #
  #NOTE: under some circumstances, --use_existing_db may leave LOCK files under ${TMPDIR}/rocksdb/*
  #cleanup the dir and re-run
  #
  echo DEBUG: Will run "$DB_BENCH" "${db_bench_args[@]}"
fi

"$DB_BENCH" "${db_bench_args[@]}" >"$LOG" 2>&1 &
PID=$!

# `$?` right after `&` only says the job was started, so probe the child
# itself. If it is already gone, its exit status tells us whether the launch
# failed or it merely finished very quickly.
if ! kill -0 "$PID" 2>/dev/null; then
  if ! wait "$PID"; then
    echo "WARNING: ${DB_BENCH} did not launch successfully! Abort!" >&2
    cat "$LOG" >&2
    rm -f "$LOG"
    exit 1
  fi
fi

#
#Start the monitoring. "vsz" is monitored for up to cache_size ($cs) of virtual mem.
#$cs is in bytes while ps reports vsz in KiB, hence the division.
#You could also monitor RSS and CPUTIME (bsdtime). Try 'pflag -h' for how to do this
#
"${PFLAG}" -p "$PID" -l "$((cs / 1024))" -v

rm -f "$LOG"
|
217
tools/pflag
Executable file
217
tools/pflag
Executable file
@ -0,0 +1,217 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#(c) 2004-present, Facebook, all rights reserved.
|
||||
# See the LICENSE file for usage and distribution rights.
|
||||
#
|
||||
|
||||
# Leave with a failure status when interrupted (HUP/INT/QUIT/TERM).
trap 'echo "Caught exception, dying"; exit 1' 1 2 3 15

ME=$(basename -- "$0")
SERVER=$(hostname)

#parameters used
#
Dump_Config=0 # set to 1 by -d: print the configuration and exit
DEBUG=        # set to 1 by -v: enables verbose() output
OS=$(uname -s)
VMEM= # ps column names for the supported metrics; filled in by oscheck
RSS=
CPU=
VERBOSE=
PID=    # -p; cleared so a PID exported in the environment is never picked up
VAR=    # -x; metric to watch
LIMIT=  # -l; threshold for VAR
ACTION= # -a; what to do on a breach
N=      # -n; number of cycles
WAIT=   # -w; seconds between cycles
|
||||
|
||||
#
|
||||
#supported OS: Linux only for now. Easy to add
|
||||
#
|
||||
# oscheck: select the ps column names used for each metric on this platform.
# Globals: OS (read); VMEM, RSS, CPU (written).
# Calls die for any OS other than Linux.
oscheck() {
  case "${OS}" in
    Linux)
      VMEM=vsz
      RSS=rss
      CPU=bsdtime
      ;;
    *)
      die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
      ;;
  esac
}
|
||||
|
||||
|
||||
# verbose: write the arguments to stderr as one line, but only when DEBUG is
# non-empty. printf is used so a message such as "-n" is printed verbatim
# instead of being taken as an echo option.
verbose() {
  [ -n "$DEBUG" ] || return 0
  printf '%s\n' "$*" >&2
}
|
||||
|
||||
# warn: write the arguments to stderr as one line. printf is used so a
# message such as "-n" is printed verbatim instead of being taken as an
# echo option.
warn() {
  printf '%s\n' "$*" >&2
}
|
||||
|
||||
# die: report a fatal error on stderr and terminate the script with a
# non-zero status so callers can tell the run failed.
die() {
  printf 'ERROR:  %s\n' "$*" >&2
  exit 1
}
|
||||
|
||||
# dump_config: print the settings in effect for this run on stdout.
# Globals read: ME, SERVER (falling back to HOSTNAME), PID, VAR, LIMIT,
# WAIT, N, ACTION.
dump_config() {
  cat <<EOCONFIG
$ME running on ${SERVER:-${HOSTNAME}} at $(date)

Configuration for this run:
PID to monitor : ${PID}
Resource monitored : ${VAR}
Resource limit : ${LIMIT}
Check every : ${WAIT} seconds
No. of times run : ${N}
What to do : ${ACTION}
EOCONFIG
}
|
||||
|
||||
# usage [message...]: print an optional message followed by the help text on
# stdout, then terminate the script. The exit status is 0 for a plain help
# request and 1 when a message was given, i.e. when reporting a usage error.
usage() {
  cat <<USAGE
$*

Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] [-l limit] [-a {warn|die|kill}] [-n cycles] [-w wait] [-v] [-d] [-h]

Monitor a process for set of violations. Options:

-p: PID of process to monitor

-x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM

-l: what is the threshold/limit for the metric that is being sensed.
Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
NOTE: defaults to 1GB

-a: action. Currently {warn|die|kill} are supported.
The default action is to 'warn'. Here is the behavior:

warn: complain if usage exceeds threshold, but continue monitoring
kill: complain, kill the db_bench process and exit
die: if usage exceeds threshold, die immediately

-n: number of cycles to monitor. Default is to monitor until PID no longer exists.

-w: wait time per cycle of monitoring. Default is 5 seconds.

-v: verbose messaging

-d: print the configuration for this run and exit

-h: print this help message and exit

USAGE
  if [ $# -gt 0 ]; then
    exit 1
  fi
  exit 0
}
|
||||
|
||||
#set default values if none given
|
||||
# set_defaults_if_noopt_given: give every tunable that is still unset or
# empty its default value; values that are already set are left untouched.
# Globals written: VAR, LIMIT, WAIT, N, ACTION.
set_defaults_if_noopt_given() {
  : "${VAR:=vsz}"
  : "${LIMIT:=1024000}"
  : "${WAIT:=5}"
  : "${N:=999999}"
  : "${ACTION:=warn}"
}
|
||||
|
||||
# validate_options: reject unusable option combinations via usage (which
# terminates the script).
#   - a PID is required, and must be numeric, unless only a configuration
#     dump was requested (Dump_Config=1);
#   - the action, when set, must be one of warn, die or kill.
# Globals read: PID, Dump_Config, ACTION.
validate_options() {
  if [ "${Dump_Config:-0}" -ne 1 ]; then
    if [ -z "$PID" ]; then
      usage "PID is mandatory"
    fi
    case "$PID" in
      *[!0-9]*) usage "PID must be a number, got '${PID}'" ;;
    esac
  fi

  # An unrecognised action would otherwise only surface when the limit is
  # first breached.
  case "${ACTION:-warn}" in
    warn | die | kill) ;;
    *) usage "Unknown action '${ACTION}'" ;;
  esac

  return 0
}
|
||||
|
||||
###### START
|
||||
|
||||
|
||||
# parse_options ARGS...: read the command-line flags into the global settings
# (Dump_Config, ACTION, DEBUG, PID, VAR, LIMIT, WAIT, N).
# The option string starts with ':' so getopts stays silent and reports a
# missing argument as ':' and an unknown flag as '?'; both end in usage.
parse_options() {
  local opt OPTIND=1

  # -w must be listed here for its handler below to be reachable. -t is the
  # letter the option string used to carry in its place, so it is still
  # accepted and treated as an alias of -w.
  while getopts ":p:x:l:a:n:w:t:vhd" opt; do
    case "$opt" in
      d) Dump_Config=1 ;;
      h) usage ;;
      a) ACTION=${OPTARG} ;;
      v) DEBUG=1 ;;
      p) PID=${OPTARG} ;;
      x) VAR=${OPTARG} ;;
      l) LIMIT=${OPTARG} ;;
      w | t) WAIT=${OPTARG} ;;
      n) N=${OPTARG} ;;
      :) usage "Option -${OPTARG} requires an argument" ;;
      \?) usage "Unknown option -${OPTARG}" ;;
    esac
  done
}

parse_options "$@"
|
||||
|
||||
# normalize_metric VALUE: print VALUE as a plain integer where possible.
#   NNNm / NNN.Nm -> KiB (x 1024)          e.g. 100m -> 102400
#   NNNg / NNN.Ng -> KiB (x 1024 x 1024)   e.g. 1.5g -> 1572864
#   MM:SS         -> seconds               e.g. 5:04 -> 304
# Anything else is printed with blanks removed; callers check for digits.
normalize_metric() {
  printf '%s\n' "$1" | awk '
    { gsub(/[ \t]/, "") }
    /^[0-9]+:[0-9]+$/ {
      split($0, parts, ":")
      printf "%d\n", parts[1] * 60 + parts[2]
      next
    }
    /^[0-9]*\.?[0-9]+[mM]$/ {
      printf "%d\n", substr($0, 1, length($0) - 1) * 1024
      next
    }
    /^[0-9]*\.?[0-9]+[gG]$/ {
      printf "%d\n", substr($0, 1, length($0) - 1) * 1024 * 1024
      next
    }
    { print }
  '
}

oscheck
set_defaults_if_noopt_given
validate_options

# -x is documented as VMEM/RSS/CPU; translate those to the ps columns chosen
# by oscheck. Any other value is passed to ps unchanged, as before.
case "$VAR" in
  VMEM | vmem) VAR=${VMEM:-$VAR} ;;
  RSS) VAR=${RSS:-$VAR} ;;
  CPU | cpu) VAR=${CPU:-$VAR} ;;
esac

# -l is documented as accepting 100m / 1.5g / 5:04; reduce it to the same
# integer unit the readings are compared in.
LIMIT=$(normalize_metric "$LIMIT")
case "$LIMIT" in
  '' | *[!0-9]*) die "Limit must be a number, NNNm, NNNg or MM:SS (got '${LIMIT}')" ;;
esac
case "$N" in
  '' | *[!0-9]*) die "Number of cycles must be a number (got '${N}')" ;;
esac

if [ "$Dump_Config" -eq 1 ]; then
  dump_config
  exit
fi

verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"

cycle=0
while [ "$cycle" -lt "$N" ]; do
  cycle=$((cycle + 1))

  # One ps call returns the process state and the metric. Empty "name="
  # headers suppress the header line.
  read -r state raw <<<"$(ps -o stat= -o "${VAR}=" -p "$PID")"

  # No output means the PID is gone; state Z means it has exited but has not
  # been reaped by its parent yet.
  case "$state" in
    '' | Z*)
      warn "Process $PID ended without incident."
      break
      ;;
  esac

  VAL=$(normalize_metric "$raw")
  case "$VAL" in
    '' | *[!0-9]*) die "Cannot interpret ${VAR} value '${raw}' for PID ${PID}" ;;
  esac

  if [ "$VAL" -lt "$LIMIT" ]; then
    echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
    sleep "$WAIT"
    continue
  fi

  # The limit has been reached or exceeded.
  case "$ACTION" in
    kill)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}; killing ${PID}"
      kill "${PID}" || kill -3 "${PID}"
      exit
      ;;
    warn)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
      # Pause before the next reading so a sustained breach does not turn
      # into a tight loop of warnings.
      sleep "$WAIT"
      ;;
    die)
      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
      warn "The process details are below: "
      warn "$(ps -p "${PID}" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
      warn ""

      #should we send email/notify someone? TODO... for now, bail.

      exit 1
      ;;
    *)
      die "Unknown action '${ACTION}'"
      ;;
  esac
done

if [ "$cycle" -ge "$N" ]; then
  verbose "Stopped after ${N} cycles"
fi
|
||||
|
Loading…
Reference in New Issue
Block a user