#!/bin/bash
#
#(c) 2004-present, Facebook, all rights reserved. 
# See the LICENSE file for usage and distribution rights.
#

# On HUP/INT/QUIT/TERM (1 2 3 15): report and stop with an explicit failure
# status. A bare `exit` here leaves the status dependent on whatever command
# ran last, which can look like success to a calling script.
trap 'echo "Caught exception, dying"; exit 1' 1 2 3 15

# Script name (shown by usage/dump_config) and host name (shown in warnings).
# "$0" is quoted so that a script path containing spaces is handled correctly.
ME=$(basename -- "$0")
SERVER=$(hostname)

#parameters used
#
Dump_Config=0        # set to 1 by -d: print the configuration and exit
DEBUG=               # non-empty (set by -v) enables verbose() output
OS=$(/bin/uname -s)  # kernel name, checked by oscheck()
VMEM=                # ps column names for each metric; filled in by oscheck()
RSS=
CPU=
VERBOSE=             # initialised here; not referenced in the visible code
VAR=                 # metric to monitor (-x)
LIMIT=               # threshold for the metric (-l)
ACTION=              # what to do on a breach (-a)
N=                   # number of monitoring cycles (-n)
WAIT=                # seconds to wait per cycle (-w)

#
# Select the ps(1) column names for the running platform.
# Only Linux is handled; any other value of OS is reported through die().
# Sets the globals VMEM, RSS and CPU.
#
oscheck() {
  if [ "${OS}" = "Linux" ]; then
    VMEM=vsz
    RSS=rss
    CPU=bsdtime
  else
    die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
  fi
}


# Print the arguments on stderr, but only when DEBUG is non-empty.
verbose() {
  if [ -z "${DEBUG}" ]; then
    return 0
  fi
  echo "$@" >&2
}

# Print the arguments on stderr unconditionally.
warn() {
  >&2 echo "$@"
}

# Print an error message on stderr and terminate the script with a failure
# status. The status is given explicitly: a bare `exit` would reuse the
# status of the preceding (successful) echo and report success.
die() {
  echo "ERROR: " "$@" >&2
  exit 1
}

# Print a banner (script name, host, current date) followed by the effective
# settings for this run on stdout.
dump_config() {
  local banner
  banner="$ME running on ${HOSTNAME} at $(date)"

  printf '%s\n' \
    "${banner}" \
    "" \
    "Configuration for this run:" \
    "  PID to monitor     : ${PID}" \
    "  Resource monitored : ${VAR}" \
    "  Resource limit     : ${LIMIT}" \
    "  Check every        : ${WAIT} seconds" \
    "  No. of times run   : ${N}" \
    "  What to do         : ${ACTION}"
}

# Print an optional message followed by the help text on stdout, then exit.
#
# Arguments: an optional message describing what was wrong with the
#            command line; it is printed as the first line.
# Exit code: 1 when a message was given (the caller is reporting an error),
#            0 when called without arguments (plain help request, e.g. -h).
usage() {
  local status=0
  if [ "$#" -gt 0 ]; then
    status=1
  fi

  cat <<USAGE
$*

Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait]

Monitor a process for set of violations. Options:

  -p: PID of process to monitor

  -x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM

  -l: what is the threshold/limit for the metric that is being sensed.
    Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
    NOTE: defaults to 1GB

  -a: action. Currently {warn|die|kill} are supported. 
    The default action is to 'warn'. Here is the behavior:

    warn: complain if usage exceeds threshold, but continue monitoring
    kill: complain, kill the db_bench process and exit
    die:  if usage exceeds threshold, die immediately

  -n: number of cycles to monitor. Default is to monitor until PID no longer exists.

  -w: wait time per cycle of monitoring. Default is 5 seconds.

  -v: verbose messaging

USAGE

  exit "${status}"
}

# Give every option that is still unset or empty its default value:
# metric vsz, limit 1024000, 5 seconds per cycle, 999999 cycles, action warn.
set_defaults_if_noopt_given() {
  VAR=${VAR:-vsz}
  LIMIT=${LIMIT:-1024000}
  WAIT=${WAIT:-5}
  N=${N:-999999}
  ACTION=${ACTION:-warn}
}

# Check the parsed options before monitoring starts; any problem is reported
# through usage(), which prints the message and exits.
#
# Globals read: Dump_Config, PID, ACTION, WAIT, N (expected to already hold
# their defaults, i.e. this runs after set_defaults_if_noopt_given).
#
# Nothing is checked when only the configuration dump (-d) was requested.
validate_options() {
  local re_int='^[0-9]+$'
  local re_num='^[0-9]+([.][0-9]+)?$'

  if [[ "${Dump_Config}" == "1" ]]; then
    return 0
  fi

  if [[ -z "${PID}" ]]; then
    usage "PID is mandatory"
  fi
  # A non-numeric PID makes ps print nothing, which the monitor would
  # misreport as the process having ended.
  if [[ ! "${PID}" =~ ${re_int} ]]; then
    usage "PID must be a number, got '${PID}'"
  fi

  # An unrecognised action would otherwise only surface at the first breach.
  case "${ACTION}" in
    warn|die|kill) ;;
    *) usage "Unknown action '${ACTION}'; expected warn, die or kill" ;;
  esac

  if [[ ! "${WAIT}" =~ ${re_num} ]]; then
    usage "Wait time must be a number of seconds, got '${WAIT}'"
  fi
  if [[ ! "${N}" =~ ${re_int} ]]; then
    usage "Number of cycles must be a non-negative integer, got '${N}'"
  fi
}

###### START


  # Parse the command line into the global option variables
  # (Dump_Config, ACTION, DEBUG, PID, VAR, LIMIT, WAIT, N).
  #
  # The option string starts with ':' so getopts stays silent and reports
  # problems through the ':' (missing argument) and '?' (unknown option)
  # cases; both end in usage(), which exits.
  parse_options() {
    local opt OPTARG OPTIND=1

    # 'w:' is declared so that the documented -w option is accepted;
    # previously only 't:' was listed and -w was rejected as unknown.
    while getopts ":p:x:l:a:n:w:t:vhd" opt; do
      case "${opt}" in
        d)
          Dump_Config=1
          ;;
        h)
          usage
          ;;
        a)
          ACTION=${OPTARG}
          ;;
        v)
          DEBUG=1
          ;;
        p)
          PID=${OPTARG}
          ;;
        x)
          VAR=${OPTARG}
          ;;
        l)
          LIMIT=${OPTARG}
          ;;
        w)
          WAIT=${OPTARG}
          ;;
        n)
          N=${OPTARG}
          ;;
        t)
          # NOTE(review): -t was accepted but never had any effect; it is
          # still accepted and ignored so existing invocations keep working.
          # Confirm whether it can be dropped.
          ;;
        :)
          usage "Option -${OPTARG} requires an argument"
          ;;
        \?)
          usage "Unknown option: -${OPTARG}"
          ;;
      esac
    done
  }

  parse_options "$@"

# Pick the platform's ps column names, fill in defaults for options that
# were not given, then check the options -- in that order.
oscheck
set_defaults_if_noopt_given
validate_options

# When Dump_Config is 1 (-d), print the effective configuration and stop
# without monitoring anything.
if [ "${Dump_Config}" -eq 1 ]; then
  dump_config
  exit
fi

# Map the metric name given with -x to the ps(1) column to read.
# VMEM/RSS/CPU resolve to the column names chosen by oscheck() (falling back
# to the Linux names); anything else is passed to ps unchanged.
metric_column() {
  case "$1" in
    VMEM|vmem) echo "${VMEM:-vsz}" ;;
    RSS|rss)   echo "${RSS:-rss}" ;;
    CPU|cpu)   echo "${CPU:-bsdtime}" ;;
    *)         echo "$1" ;;
  esac
}

# Convert a limit or a ps reading to a plain integer so two values can be
# compared numerically. Accepted forms:
#   N, N.F       plain number (a fraction is truncated)
#   Nk           same as N
#   Nm / Ng      N * 1024 / N * 1024 * 1024 (same factors the perl filter used)
#   M:SS         minutes:seconds, converted to seconds
# Prints the integer on stdout; returns 1 (printing nothing) otherwise.
to_number() {
  local raw=$1
  local re_time='^([0-9]+):([0-9]+)$'
  local re_size='^([0-9]+)([.]([0-9]+))?([kKmMgG]?)$'
  local whole frac unit scale=1 result

  if [[ ${raw} =~ ${re_time} ]]; then
    # 10# forces base 10 so that "04" is not read as octal.
    echo $(( 10#${BASH_REMATCH[1]} * 60 + 10#${BASH_REMATCH[2]} ))
    return 0
  fi

  [[ ${raw} =~ ${re_size} ]] || return 1
  whole=${BASH_REMATCH[1]}
  frac=${BASH_REMATCH[3]}
  unit=${BASH_REMATCH[4]}

  case "${unit}" in
    m|M) scale=1024 ;;
    g|G) scale=$(( 1024 * 1024 )) ;;
  esac

  result=$(( 10#${whole} * scale ))
  if [[ -n ${frac} ]]; then
    # Integer arithmetic only: add frac/10^digits of one scale unit.
    result=$(( result + 10#${frac} * scale / 10 ** ${#frac} ))
  fi
  echo "${result}"
}

# Print the current value of ps column $2 for PID $1 with the padding
# removed; prints nothing when ps has no output for that PID.
read_metric() {
  /bin/ps h -p "$1" -o "$2" | tr -d ' \n'
}

# Poll the process every WAIT seconds, for at most N cycles, and apply ACTION
# whenever the monitored value reaches LIMIT:
#   warn  report the breach on stderr and keep monitoring
#   kill  report the breach, signal the process and exit
#   die   report the process details and exit with status 255
# Returns 0 when ps no longer reports the process or the cycles are used up.
monitor() {
  local column limit raw value cycle=0

  column=$(metric_column "${VAR}")
  limit=$(to_number "${LIMIT}") \
    || die "Cannot parse limit '${LIMIT}' (expected e.g. 512000, 100m, 1.5g or 5:04)"

  verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"

  while (( cycle < N )); do
    cycle=$(( cycle + 1 ))

    raw=$(read_metric "${PID}" "${column}")
    if [[ -z ${raw} ]]; then
      # No output from ps is the end condition; a reading of zero is a
      # valid value (e.g. 0:00 of CPU time) and is monitored like any other.
      warn "Process $PID ended without incident."
      return 0
    fi
    value=$(to_number "${raw}") \
      || die "Cannot parse ${VAR} value '${raw}' reported for PID ${PID}"

    if (( value < limit )); then
      echo "Value of '${VAR}' (${raw}) is less than ${LIMIT} for PID ${PID}"
      sleep "${WAIT}"
      continue
    fi

    case "${ACTION}" in
      kill)
        warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${raw}"
        kill "${PID}" || kill -3 "${PID}"
        exit
        ;;
      warn)
        warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${raw}"
        # Wait here as well; otherwise a process that stays over the limit
        # makes this loop spin without pause.
        sleep "${WAIT}"
        ;;
      die)
        warn "WARNING: dying without killing process ${PID} on ${SERVER}"
        warn "The process details are below: "
        warn "$(ps -p "${PID}" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
        warn ""

        #should we send email/notify someone? TODO... for now, bail.

        exit 255
        ;;
      *)
        die "Unknown action '${ACTION}' (expected warn, die or kill)"
        ;;
    esac
  done

  verbose "Stopped after ${cycle} monitoring cycles for PID ${PID}"
  return 0
}

monitor