#!/bin/bash
#
#(c) 2004-present, Facebook, all rights reserved.
# See the LICENSE file for usage and distribution rights.
#
# Periodically samples one metric of a single process with ps(1) and, when the
# sampled value reaches a limit, warns, kills the process, or exits, depending
# on the selected action. Run with -h for the option summary.
#

# Leave with a non-zero status when interrupted so callers can tell the run
# did not finish normally.
trap 'echo "Caught exception, dying" >&2; exit 1' HUP INT QUIT TERM

ME=$(basename "$0")
SERVER=$(hostname)

#parameters used
#
Dump_Config=0   # 1 when -d was given: print the configuration and exit
DEBUG=          # non-empty when -v was given
OS=$(/bin/uname -s)
VMEM=           # ps column used for virtual memory (set by oscheck)
RSS=            # ps column used for resident memory (set by oscheck)
CPU=            # ps column used for CPU time (set by oscheck)
PID=            # process to monitor (-p)
VAR=            # ps column being sampled (-x)
LIMIT=          # threshold as given by the user (-l)
ACTION=         # warn | die | kill (-a)
N=              # number of samples to take, 0 = no limit (-n)
WAIT=           # seconds to sleep between samples (-w)

#
#supported OS: Linux only for now. Easy to add
#
# Sets VMEM/RSS/CPU to the ps(1) column names for this OS; exits via die()
# on any other OS.
oscheck() {
  case "${OS}" in
    Linux)
      VMEM=vsz
      RSS=rss
      CPU=bsdtime
      ;;
    *)
      die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
      ;;
  esac
}

# Print the arguments to stderr, but only when -v was given.
verbose() {
  if [[ -n "${DEBUG}" ]]; then
    echo "$@" >&2
  fi
}

# Print the arguments to stderr.
warn() {
  echo "$@" >&2
}

# Print an error to stderr and terminate with status 1.
die() {
  echo "ERROR: " "$@" >&2
  exit 1
}

# Print the effective configuration of this run to stdout.
dump_config() {
  local cycles=${N}
  if [[ "${cycles}" == "0" ]]; then
    cycles="unlimited"
  fi
  cat <<EOCONFIG
$ME running on ${SERVER} at $(date)

Configuration for this run:
  PID to monitor     : ${PID}
  Resource monitored : ${VAR}
  Resource limit     : ${LIMIT}
  Check every        : ${WAIT} seconds
  No. of times run   : ${cycles}
  What to do         : ${ACTION}
EOCONFIG
}

# Print the option summary to stdout.
print_usage() {
  cat <<USAGE
Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait] [-v] [-d] [-h]

Monitor a process for set of violations. Options:

  -p: PID of process to monitor

  -x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM

  -l: what is the threshold/limit for the metric that is being sensed.
    Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
    NOTE: defaults to 1GB

  -a: action. Currently {warn|die|kill} are supported.
    The default action is to 'warn'. Here is the behavior:

    warn: complain if usage exceeds threshold, but continue monitoring
    kill: complain, kill the monitored process and exit
    die:  if usage exceeds threshold, die immediately

  -n: number of cycles to monitor. Default is to monitor until PID no longer exists.

  -w: wait time per cycle of monitoring. Default is 5 seconds.

  -v: verbose messaging

  -d: print the configuration for this run and exit

  -h: show this help
USAGE
}

# usage [message...]
# Without arguments: print the help text to stdout and exit 0.
# With arguments: report them as an error, print the help text to stderr and
# exit 2.
usage() {
  if (( $# > 0 )); then
    warn "ERROR: $*"
    warn ""
    print_usage >&2
    exit 2
  fi
  print_usage
  exit 0
}

#set default values if none given
set_defaults_if_noopt_given() {
  : "${VAR:=vsz}"
  : "${LIMIT:=1024000}"
  : "${WAIT:=5}"
  : "${N:=0}"          # 0 = keep sampling until the process is gone
  : "${ACTION:=warn}"
}

# Translate the documented metric names (VMEM/RSS/CPU) into the ps column
# names chosen by oscheck. Any other value is handed to ps unchanged, so raw
# column names such as "vsz" keep working.
resolve_metric() {
  case "${VAR}" in
    VMEM | vmem) VAR=${VMEM} ;;
    RSS) VAR=${RSS} ;;
    CPU | cpu) VAR=${CPU} ;;
  esac
}

# to_comparable value
# Write VALUE to stdout as a plain non-negative integer that can be compared
# numerically, or return 1 if VALUE has an unexpected shape.
#   CPU metric   : colon-separated fields (e.g. "5:04") folded base 60, so
#                  MM:SS becomes seconds; a bare number is taken as seconds.
#   other metrics: a number with an optional k/m/g suffix, scaled so that a
#                  bare number (what ps prints for vsz/rss) is left as is,
#                  "m" multiplies by 1024 and "g" by 1024*1024.
to_comparable() {
  local raw=$1
  local cpu_re='^[0-9]+(:[0-9]+)*$'
  local mem_re='^([0-9]+\.?[0-9]*|\.[0-9]+)[kKmMgG]?$'

  if [[ "${VAR}" == "${CPU}" ]]; then
    [[ "${raw}" =~ ${cpu_re} ]] || return 1
    awk -v t="${raw}" 'BEGIN {
      n = split(t, part, ":")
      total = 0
      for (i = 1; i <= n; i++) total = total * 60 + part[i]
      printf "%.0f\n", total
    }'
  else
    [[ "${raw}" =~ ${mem_re} ]] || return 1
    awk -v v="${raw}" 'BEGIN {
      unit = tolower(substr(v, length(v)))
      scale = 1
      if (unit == "m") scale = 1024
      else if (unit == "g") scale = 1024 * 1024
      if (unit ~ /[kmg]/) v = substr(v, 1, length(v) - 1)
      printf "%.0f\n", v * scale
    }'
  fi
}

# Reject option values the monitoring loop cannot work with. The PID is only
# mandatory when we are actually going to monitor (not for -d).
validate_options() {
  local int_re='^[0-9]+$'
  local wait_re='^[0-9]+(\.[0-9]+)?$'

  if [[ -z "${PID}" && "${Dump_Config}" -ne 1 ]]; then
    usage "PID is mandatory"
  fi
  if [[ -n "${PID}" && ! "${PID}" =~ ${int_re} ]]; then
    usage "PID must be a number, got '${PID}'"
  fi
  case "${ACTION}" in
    warn | die | kill) ;;
    *) usage "Unknown action '${ACTION}'" ;;
  esac
  if [[ ! "${N}" =~ ${int_re} ]]; then
    usage "Number of cycles must be a non-negative integer, got '${N}'"
  fi
  N=$((10#${N}))   # force base 10 so values like "08" are not read as octal
  if [[ ! "${WAIT}" =~ ${wait_re} ]]; then
    usage "Wait time must be a number of seconds, got '${WAIT}'"
  fi
  if ! to_comparable "${LIMIT}" >/dev/null; then
    usage "Cannot interpret limit '${LIMIT}' for metric '${VAR}'"
  fi
}

# on_breach shown_value
# Carry out ACTION after a sample reached the limit. Returns only for the
# 'warn' action; 'kill' and 'die' terminate the script.
on_breach() {
  local shown=$1
  case "${ACTION}" in
    kill)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${shown}; killing PID ${PID}"
      kill "${PID}" || kill -QUIT "${PID}"
      exit $?
      ;;
    warn)
      warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${shown}"
      ;;
    die)
      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
      warn "The process details are below: "
      warn "$(ps -p "${PID}" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
      warn ""
      #should we send email/notify someone? TODO... for now, bail.
      exit 255
      ;;
  esac
}

# Sample the metric every WAIT seconds until the process is gone, N samples
# have been taken (when N > 0), or the configured action ends the script.
monitor() {
  local threshold sample current
  local cycle=0

  threshold=$(to_comparable "${LIMIT}") \
    || die "Cannot interpret limit '${LIMIT}' for metric '${VAR}'"

  if (( N > 0 )); then
    verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"
  else
    verbose "Monitoring until PID ${PID} exits, Waiting ${WAIT} seconds each iteration"
  fi

  while :; do
    cycle=$((cycle + 1))

    # "column=" asks ps for the column with an empty header, i.e. no header line.
    sample=$(/bin/ps -p "${PID}" -o "${VAR}=" 2>/dev/null)
    sample=${sample//[[:space:]]/}
    if [[ -z "${sample}" ]]; then
      warn "Process $PID ended without incident."
      return 0
    fi

    current=$(to_comparable "${sample}") \
      || die "Cannot interpret value '${sample}' reported by ps for '${VAR}'"

    # A zero reading has always been treated as "process is gone" for the
    # non-CPU metrics; keep that. For CPU, zero is a normal reading.
    if [[ "${VAR}" != "${CPU}" ]] && (( current == 0 )); then
      warn "Process $PID ended without incident."
      return 0
    fi

    if (( current >= threshold )); then
      on_breach "${sample}"
    else
      echo "Value of '${VAR}' (${sample}) is less than ${LIMIT} for PID ${PID}"
    fi

    if (( N > 0 && cycle >= N )); then
      verbose "Completed ${N} monitoring cycles for PID ${PID}."
      return 0
    fi

    # Sleep on every path, including after a warning, so the 'warn' action
    # does not turn into a busy loop.
    sleep "${WAIT}"
  done
}

###### START
main() {
  local opt
  while getopts ":p:x:l:a:n:w:t:vhd" opt; do
    case "${opt}" in
      d) Dump_Config=1 ;;
      h) usage ;;
      a) ACTION=${OPTARG} ;;
      v) DEBUG=1 ;;
      p) PID=${OPTARG} ;;
      x) VAR=${OPTARG} ;;
      l) LIMIT=${OPTARG} ;;
      # -t was accepted by the option string in place of the documented -w;
      # keep it as an alias so existing invocations still parse.
      w | t) WAIT=${OPTARG} ;;
      n) N=${OPTARG} ;;
      :) usage "Option -${OPTARG} requires an argument" ;;
      \?) usage "Unknown option -${OPTARG}" ;;
    esac
  done

  oscheck
  set_defaults_if_noopt_given
  resolve_metric
  validate_options

  if [[ "${Dump_Config}" -eq 1 ]]; then
    dump_config
    exit 0
  fi

  monitor
}

main "$@"