#!/usr/bin/perl -wT
#################################
# Description: use qstat to gather information about the PBS system
# Author: Colin Morey
# Institute: Manchester University
# Created: 2006-03-15
# License: GPL
# $Id:$
###################################
#
use strict;
use Time::HiRes qw (gettimeofday);
use File::Basename;
use Getopt::Long;

# Taint mode (-T) refuses to run external commands unless PATH is trusted
# and the shell-influencing variables below are absent (see perlsec).
$ENV{PATH} = '/usr/bin:/bin';
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};

my $starttime = [gettimeofday];

####################
# NOTE: Change this if your nagios libexec directory differs.
####################
use lib "/usr/nagios/libexec";
use utils qw(%ERRORS);

####################
# variable declaration
####################
my $myversion = '$Revision: 1.5 $';
my $myname    = "check_pbs.pl";
my $qstat     = '/usr/bin/qstat';
my $queue     = {};
my ($help, $verbose, $version);
my ($hostname, $queuename) = ('localhost');
my %thresholds = ();

####################
# command line handling
####################
Getopt::Long::Configure("bundling_override");

# An unparsable command line must not be ignored: a mistyped threshold
# option would otherwise be dropped and the check would always report OK.
GetOptions(
    "help|h"        => \$help,
    "version|V"     => \$version,
    "verbose|v+"    => \$verbose,
    "hostname|H:s"  => \$hostname,
    "queuename|Q:s" => \$queuename,
    "twarn|tw=i"    => \$thresholds{'twarn'},
    "tmax|tm=i"     => \$thresholds{'tmax'},
    "qwarn|qw=i"    => \$thresholds{'qwarn'},
    "qmax|qm=i"     => \$thresholds{'qmax'},
    "hwarn|hw=i"    => \$thresholds{'hwarn'},
    "hmax|hm=i"     => \$thresholds{'hmax'},
    "wwarn|ww=i"    => \$thresholds{'wwarn'},
    "wmax|wm=i"     => \$thresholds{'wmax'}
) or usage();

if (defined $version) {
    print "$myname UNKNOWN: version: $myversion\n";
    exit $ERRORS{'UNKNOWN'};
}

$verbose = 0 unless (defined $verbose);
usage() if ($help);

if (defined $hostname && '' eq $hostname) {
    do_debug(0, $myname, "option -H requires a hostname, defaulting to localhost");
    $hostname = 'localhost';
}

# because we're running with -T we need to sanitise $hostname.
# The first character must be alphanumeric so that the value can never be
# mistaken for a command line option by qstat.
if ($hostname =~ /\A([[:alnum:]][-[:alnum:].]*)\z/) {
    $hostname = $1;
}
else {
    do_critical($myname, "Bad Data in hostname field");
}

do_critical($myname, "option -Q requires the name of a queue")
    if (defined $queuename && '' eq $queuename);

do_debug(2, $myname, "checking for existance and executablity of qstat command");
do_critical($myname, "Cannot find \"qstat\"") unless (-x $qstat);

####################
# gather the data
####################
$queue = get_stats($myname, $hostname, $queuename, $queue)
    || do_critical($myname, qq|Call to get_stats("$hostname,$queuename") failed: $!|);

my $endtime = [gettimeofday];
my $elapsed = Time::HiRes::tv_interval($starttime, $endtime);
$elapsed = 1000 * 1000 * $elapsed;    # convert seconds to us.
my $perfdata = 'exectime=' . $elapsed . 'us';

# The wait counter is shown as an absolute value here; a negative counter
# is reported separately as a warning further down.
my $jobstatus = qq| Total jobs:$queue->{'Total'}, Jobs Queued:$queue->{'Queued'}, Jobs Waiting:|
    . abs($queue->{'Wait'})
    . qq|, Jobs Halted:$queue->{'Hold'}|;

####################
# begin threshold checks
####################
if (!defined $queuename) {
    $queuename = 'PBS';
}

# Each entry: option prefix, key in %$queue, text used in the message.
# The order matters: the first exceeded threshold decides the message.
my @checks = (
    [ 'q', 'Queued', 'Number of queued jobs' ],
    [ 'h', 'Hold',   'Number of halted jobs' ],
    [ 'w', 'Wait',   'Number of jobs waiting' ],
    [ 't', 'Total',  'Total number of jobs' ],
);

# All critical ("max") thresholds are evaluated before any warning ("warn")
# threshold so that the most severe state wins.
for my $level ([ 'max', \&do_critical ], [ 'warn', \&do_warn ]) {
    my ($suffix, $report) = @{$level};
    for my $check (@checks) {
        my ($prefix, $field, $label) = @{$check};
        my $limit = $thresholds{ $prefix . $suffix };
        next unless (defined $limit);
        next unless ($queue->{$field} > $limit);
        $report->(
            $myname,
            "$queuename on $hostname checked, $label higher than $limit. $jobstatus |$perfdata "
        );
    }
}

do_warn($myname,
    "$queuename on $hostname checked, Negative number of Waiting Jobs found. $jobstatus |$perfdata ")
    if (check_neg($queue->{'Wait'}));
####################
# end threshold checks
####################

print "$myname OK: $queuename on $hostname checked $jobstatus |$perfdata\n";
exit $ERRORS{"OK"};

####################
# usage: print the help text and exit with the UNKNOWN state
####################
sub usage {
    print <<"EOT";
usage $myname [OPTIONS]

GENERAL
  -h,  --help       print this text and exit
  -V,  --version    show the version of the script and exit
  -v,  --verbose    increase verbosity, use multiple times for added effect
  -H,  --hostname   specify the hostname to query, defaults to localhost
  -Q,  --queuename  specify the name of the queue you want to monitor
  -tw, --twarn      total number of jobs in the queue before issuing a warning
  -tm, --tmax       total number of jobs in the queue before issuing a critical
  -qw, --qwarn      number of queued jobs before issuing a warning
  -qm, --qmax       number of queued jobs before issuing a critical
  -hw, --hwarn      number of halted jobs before issuing a warning
  -hm, --hmax       number of halted jobs before issuing a critical
  -ww, --wwarn      number of jobs waiting before issuing a warning
  -wm, --wmax       number of jobs waiting before issuing a critical
EOT

    # Invalid command line arguments were supplied to the plugin or low-level failures
    # internal to the plugin (such as unable to fork, or open a tcp socket) that prevent
    # it from performing the specified operation. Higher-level errors (such as name
    # resolution errors, socket timeouts, etc) are outside of the control of plugins
    # and should generally NOT be reported as UNKNOWN states.
    exit $ERRORS{'UNKNOWN'};
}
####################
# usage
####################

####################
# get_stats: run qstat and parse the counters of one server or queue
#
# Arguments: script name (for messages), hostname (already untainted),
#            queue name (undef means "query the server summary"),
#            hash reference to fill in.
# Returns:   the same hash reference with the column values stored under
#            Maximum/Total/Queued/Running/Hold/Wait/Trn/Ext plus either
#            Server/Status (server mode) or Queue/Ena/Str/Type (queue mode).
# Exits CRITICAL (via do_critical) when qstat fails, when the wanted line
# is not found, or when the counters used by the checks are not integers.
####################
sub get_stats {
    my ($myname, $hostname, $queuename, $queue) = @_;
    do_debug(2, $myname, "Entering get_stats");

    # Only the first 16 characters of the hostname are searched for in the
    # server summary.
    my $shortname = substr($hostname, 0, 16);

    # -Q lists the queues of the server, -B prints the server summary.
    my $command = defined $queuename
        ? "$qstat -Q \@$hostname"
        : "$qstat -B $hostname";

    my $output = `$command 2>&1`;
    my $status = $?;
    $output = '' unless (defined $output);

    do_debug(1, $myname, "Ouput from \"$command\" exit code:$status\n$output");
    do_critical($myname, qq|Unable to open "$command"|) if ($output eq '');

    # $? is -1 when the command could not be started at all, so test for
    # "not zero" rather than "greater than zero".
    do_critical($myname, qq|ERROR: "$command" failed: Dumping output\n$output|)
        if ($status != 0);

    # now lets read in the output, dropping the two header lines
    my @lines = split("\n", $output);
    splice(@lines, 0, 2);
    do_debug(1, $myname, "Dumping remaining queue lines\n" . join('-=-', @lines));

    my @wantedlines;
    if (defined $queuename) {
        do_debug(2, $myname, "looking for queuename $queuename:");

        # Prefer the line whose first column is exactly the requested queue
        # (or its first 16 characters, in case qstat shortens long names),
        # so that e.g. "short" does not pick "veryshort".
        my $shortqueue = substr($queuename, 0, 16);
        @wantedlines = grep {
            my ($name) = split(' ', $_);
            defined $name && ($name eq $queuename || $name eq $shortqueue)
        } @lines;

        # Fall back to a literal substring search for backward compatibility.
        @wantedlines = grep { index($_, $queuename) >= 0 } @lines
            unless (@wantedlines);

        do_debug(2, $myname, "found " . join('-=-', @wantedlines));
    }
    else {
        do_debug(2, $myname, "looking for shortname $shortname:");

        # \Q..\E: the dots of a hostname must match literally.
        @wantedlines = grep { /\Q$shortname\E/ } @lines;
    }

    do_critical($myname,
        "Unable to find queue name in qstat output, run with -vvv for more information")
        if (!defined $wantedlines[0]);

    # If several lines matched, the first one is used.
    my $wantedline = $wantedlines[0];
    $wantedline =~ s/ {2,}/ /g;
    do_debug(1, $myname, "Dumping wanted queue lines\n$wantedline");

    my @fields = split(' ', $wantedline);
    if (!defined $queuename) {
        @{$queue}{qw(Server Maximum Total Queued Running Hold Wait Trn Ext Status)} = @fields;
    }
    else {
        @{$queue}{qw(Queue Maximum Total Ena Str Queued Running Hold Wait Trn Ext Type)} = @fields;
    }

    # Refuse to report on output we could not parse: the threshold checks
    # compare these four counters numerically (Wait may be negative).
    for my $field (qw(Total Queued Hold Wait)) {
        my $value = $queue->{$field};
        do_critical($myname,
            qq|Unexpected value for "$field" in qstat output, run with -vvv for more information|)
            unless (defined $value && $value =~ /\A-?\d+\z/);
    }

    do_debug(2, $myname, "Leaving get_stats");
    return $queue;
}
####################
# get_stats
####################

####################
# do_critical take message and die with the correct exit code
####################
sub do_critical {
    my ($myname, $errmsg) = @_;
    print "$myname Critical: $errmsg \n";
    exit $ERRORS{"CRITICAL"};
}
####################
# do_critical
####################

####################
# do_warn take message and die with the correct exit code
####################
sub do_warn {
    my ($myname, $errmsg) = @_;
    print "$myname Warning: $errmsg \n";
    exit $ERRORS{"WARNING"};
}
####################
# do_warn
####################

####################
# do_debug take message and issue debug message
# The message is printed only when the verbosity (number of -v options)
# is greater than the given debug level.
####################
sub do_debug {
    my ($debug_level, $myname, $msg) = @_;
    if ($verbose > $debug_level) {
        print "$myname Debug: $msg \n";
    }
}
####################
# do_debug
####################

####################
# check_neg return 1 when the given number is negative, 0 otherwise
# (an undefined value counts as not negative)
####################
sub check_neg {
    my $num = shift();
    return (defined $num && $num < 0) ? 1 : 0;
}
####################
# check_neg
####################