diff options
Diffstat (limited to 'plugins')
-rwxr-xr-x | plugins/check_remote_host.py | 144 | ||||
-rwxr-xr-x | plugins/gluster_host_service_handler.py | 45 |
2 files changed, 50 insertions, 139 deletions
diff --git a/plugins/check_remote_host.py b/plugins/check_remote_host.py index 7350e27..31ff9dd 100755 --- a/plugins/check_remote_host.py +++ b/plugins/check_remote_host.py @@ -2,13 +2,13 @@ # # check_remote_host.py -- nagios plugin uses Mklivestatus to get the overall # status -# of a host. The entities considered for the status of the host are - -# 1. Host is reachable -# 2. LV/Inode Service status -# 3. CPU Utilization -# 4. Memory Utilization -# 5. Network Utilization -# 6. Swap Utilization +# of a host. The services considered by default for the status of the host +# are - +# 1. LV/Inode Service status +# 2. CPU Utilization +# 3. Memory Utilization +# 4. Network Utilization +# 5. Swap Utilization # # Copyright (C) 2014 Red Hat Inc # @@ -29,90 +29,35 @@ import os import sys -import shlex -import subprocess -import socket import getopt +#import socket +import json + +import livestatus STATUS_OK = 0 STATUS_WARNING = 1 STATUS_CRITICAL = 2 STATUS_UNKNOWN = 3 -_checkPingCommand = "/usr/lib64/nagios/plugins/check_ping" _commandStatusStrs = {STATUS_OK: 'OK', STATUS_WARNING: 'WARNING', STATUS_CRITICAL: 'CRITICAL', STATUS_UNKNOWN: 'UNKNOWN'} -_socketPath = '/var/spool/nagios/cmd/live' - - -# Class for exception definition -class checkPingCmdExecFailedException(Exception): - message = "check_ping command failed" - - def __init__(self, rc=0, out=(), err=()): - self.rc = rc - self.out = out - self.err = err - - def __str__(self): - o = '\n'.join(self.out) - e = '\n'.join(self.err) - if o and e: - m = o + '\n' + e - else: - m = o or e - s = self.message - if m: - s += '\nerror: ' + m - if self.rc: - s += '\nreturn code: %s' % self.rc - return s - -# Method to execute a command -def execCmd(command): - proc = subprocess.Popen(command, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - (out, err) = proc.communicate() - return (proc.returncode, out, err) - - -# Method to check the ing status of the host -def getPingStatus(hostAddr): - cmd = "%s -H %s" % (_checkPingCommand, hostAddr) - cmd += " -w 3000.0,80% -c 5000.0,100%" - - try: - (rc, out, err) = execCmd(shlex.split(cmd)) - except (OSError, ValueError) as e: - raise checkPingCmdExecFailedException(err=[str(e)]) - - if rc != 0: - raise checkPingCmdExecFailedException(rc, [out], [err]) - - return rc +# Load the host monitoring services list +def loadSrvcList(): + srvc_list = [] + with open("/etc/nagios/gluster/host-monitoring-services.in") as data_file: + srvc_list = json.load(data_file)['serviceList'] + return srvc_list # Method to execute livestatus def checkLiveStatus(hostAddr, srvc): - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.connect(_socketPath) - - # Write command to socket - cmd = "GET services\nColumns: state\nFilter: " - "description = %s\nFilter: host_address = %s\n" % (srvc, hostAddr) - s.send(cmd) + cmd = "GET services\nColumns: state\nFilter: " \ + "description = %s\n" \ + "Filter: host_address = %s" % (srvc, hostAddr) - # Close socket - s.shutdown(socket.SHUT_WR) - - # Read the answer - answer = s.recv(1000000) - - # Parse the answer into a table - table = [line.split(';') for line in answer.split('\n')[:-1]] + table = livestatus.readLiveStatus(cmd) if len(table) > 0 and len(table[0]) > 0: return int(table[0][0]) @@ -150,43 +95,18 @@ if __name__ == "__main__": showUsage() sys.exit(STATUS_CRITICAL) - # Check ping status of the node, if its not reachable exit - try: - pingStatus = getPingStatus(hostAddr) - except (checkPingCmdExecFailedException) as e: - print "Host Status %s - Host not reachable" % \ - (_commandStatusStrs[STATUS_UNKNOWN]) - sys.exit(_commandStatusStrs[STATUS_UNKNOWN]) - - if pingStatus != STATUS_OK: - print "Host Status %s - Host not reachable" % \ - (_commandStatusStrs[STATUS_UNKNOWN]) - sys.exit(pingStatus) - - # Check the various performance statuses for the host - diskPerfStatus = checkLiveStatus(hostAddr, 'Disk Utilization') - cpuPerfStatus = checkLiveStatus(hostAddr, 'Cpu Utilization') - memPerfStatus = checkLiveStatus(hostAddr, 'Memory Utilization') - swapPerfStatus = checkLiveStatus(hostAddr, 'Swap Utilization') - nwPerfStatus = checkLiveStatus(hostAddr, 'Network Utilization') - - # Calculate the consolidated status for the host based on above status - # details - finalStatus = pingStatus | diskPerfStatus | cpuPerfStatus | \ - memPerfStatus | swapPerfStatus | nwPerfStatus - - # Get the list of ciritical services + # Load the services list + srvc_list = loadSrvcList() + + # Calculate the consolidated status for the host based on above + # status of individual services + finalStatus = STATUS_OK criticalSrvcs = [] - if diskPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Disk Utilization') - if cpuPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Cpu Utilization') - if memPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Memory Utilization') - if swapPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Swap Utilization') - if nwPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Network Utilization') + for srvc in srvc_list: + srvc_status = checkLiveStatus(hostAddr, srvc) + finalStatus = finalStatus | srvc_status + if srvc_status == STATUS_CRITICAL: + criticalSrvcs.append(srvc) # Return the status if finalStatus == STATUS_CRITICAL: diff --git a/plugins/gluster_host_service_handler.py b/plugins/gluster_host_service_handler.py index 283ac69..2a62108 100755 --- a/plugins/gluster_host_service_handler.py +++ b/plugins/gluster_host_service_handler.py @@ -23,9 +23,10 @@ import os import sys import datetime -import socket import getopt +import livestatus + STATUS_OK = "OK" STATUS_WARNING = "WARNING" STATUS_CRITICAL = "CRITICAL" @@ -37,14 +38,15 @@ statusCodes = {STATUS_OK: 0, STATUS_WARNING: 1, STATUS_CRITICAL: 2, NAGIOS_COMMAND_FILE = "/var/spool/nagios/cmd/nagios.cmd" SRVC_LIST = ['Disk Utilization', 'Cpu Utilization', 'Memory Utilization', 'Swap Utilization', 'Network Utilization'] -_socketPath = '/var/spool/nagios/cmd/live' # Shows the usage of the script def showUsage(): - usage = "Usage: %s -s <Service State (OK/WARNING/CRITICAL/UNKNOWN)> " - "-t <Service State Type (SOFT/HARD)> -a <No of Service attempts> " - "-l <Host Address> -n <Service Name>\n" % os.path.basename(sys.argv[0]) + usage = "Usage: %s -s <Service State (OK/WARNING/CRITICAL/UNKNOWN)> " \ + "-t <Service State Type (SOFT/HARD)>" \ + " -a <No of Service attempts> " \ + "-l <Host Address>" \ + " -n <Service Name>\n" % os.path.basename(sys.argv[0]) sys.stderr.write(usage) @@ -52,12 +54,13 @@ def showUsage(): def update_host_state(hostAddr, srvcName, statusCode): now = datetime.datetime.now() if statusCode == statusCodes[STATUS_WARNING]: - cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;Host Status WARNING - " - "Service(s) ['%s'] in CRITICAL state\n" % (now, hostAddr, statusCode, - srvcName) + cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;" \ + "Host Status WARNING - " \ + "Service(s) ['%s'] in CRITICAL state\n" \ + % (now, hostAddr, statusCode, srvcName) else: - cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;Host Status OK - " - "Services in good health\n" % (now, hostAddr, statusCode) + cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;Host Status OK - " \ + "Services in good health\n" % (now, hostAddr, statusCode) f = open(NAGIOS_COMMAND_FILE, "w") f.write(cmdStr) @@ -66,22 +69,10 @@ def update_host_state(hostAddr, srvcName, statusCode): # Method to execute livestatus def checkLiveStatus(hostAddr, srvc): - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.connect(_socketPath) - - # Write command to socket - cmd = "GET services\nColumns: state\nFilter: " - "description = %s\nFilter: host_address = %s\n" % (srvc, hostAddr) - s.send(cmd) - - # Close socket - s.shutdown(socket.SHUT_WR) - - # Read the answer - answer = s.recv(1000) + cmd = "GET services\nColumns: state\nFilter: " \ + "description = %s\nFilter: host_address = %s" % (srvc, hostAddr) - # Parse the answer into a table - table = [line.split(';') for line in answer.split('\n')[:-1]] + table = livestatus.readLiveStatus(cmd) if len(table) > 0 and len(table[0]) > 0: return int(table[0][0]) @@ -141,8 +132,8 @@ if __name__ == "__main__": if srvcState == STATUS_CRITICAL: if srvcStateType == SRVC_STATE_TYPE_SOFT: if int(attempts) == 3: - print "Updating the host status to warning " - "(3rd SOFT critical state)..." + print "Updating the host status to warning " \ + "(3rd SOFT critical state)..." update_host_state(hostAddr, srvcName, statusCodes[STATUS_WARNING]) elif srvcStateType == SRVC_STATE_TYPE_HARD: |