From 795fc58ef57127f257ea06c8c77ff8fba0662a4e Mon Sep 17 00:00:00 2001 From: Shubhendu Tripathi Date: Fri, 14 Mar 2014 12:56:53 +0530 Subject: nagios-server-addons: Check remote host plugin Plugins to check the remote host status based on various services. Change-Id: I1e260829901aa8dd831f0ca1d58609addb9bcf1b Signed-off-by: Shubhendu Tripathi Reviewed-on: https://cuckoo.blr.redhat.com:8443/9 Reviewed-by: Sahina Bose Tested-by: Sahina Bose --- config/Makefile.am | 1 + config/host-monitoring-services.in | 8 ++ plugins/check_remote_host.py | 144 +++++++------------------------- plugins/gluster_host_service_handler.py | 45 ++++------ tests/test_check_remote_host.py | 47 +++-------- 5 files changed, 72 insertions(+), 173 deletions(-) create mode 100644 config/host-monitoring-services.in diff --git a/config/Makefile.am b/config/Makefile.am index 7e19136..eea3ea2 100644 --- a/config/Makefile.am +++ b/config/Makefile.am @@ -5,6 +5,7 @@ glusternagiosconf_DATA = \ gluster-host-services.cfg \ gluster-templates.cfg \ gluster-contacts.cfg \ + host-monitoring-services.in \ $(NULL) glusternagiosdefaultconfdir = $(sysconfdir)/nagios/gluster/default diff --git a/config/host-monitoring-services.in b/config/host-monitoring-services.in new file mode 100644 index 0000000..52ce8e0 --- /dev/null +++ b/config/host-monitoring-services.in @@ -0,0 +1,8 @@ +{serviceList: [ + 'Cpu Utilization', + 'Disk Utilization', + 'Memory Utilization', + 'Network Utilization', + 'Swap Utilization', + ] +} diff --git a/plugins/check_remote_host.py b/plugins/check_remote_host.py index 7350e27..31ff9dd 100755 --- a/plugins/check_remote_host.py +++ b/plugins/check_remote_host.py @@ -2,13 +2,13 @@ # # check_remote_host.py -- nagios plugin uses Mklivestatus to get the overall # status -# of a host. The entities considered for the status of the host are - -# 1. Host is reachable -# 2. LV/Inode Service status -# 3. CPU Utilization -# 4. Memory Utilization -# 5. Network Utilization -# 6. 
Swap Utilization +# of a host. The services considered by default for the status of the host +# are - +# 1. LV/Inode Service status +# 2. CPU Utilization +# 3. Memory Utilization +# 4. Network Utilization +# 5. Swap Utilization # # Copyright (C) 2014 Red Hat Inc # @@ -29,90 +29,35 @@ import os import sys -import shlex -import subprocess -import socket import getopt +#import socket +import json + +import livestatus STATUS_OK = 0 STATUS_WARNING = 1 STATUS_CRITICAL = 2 STATUS_UNKNOWN = 3 -_checkPingCommand = "/usr/lib64/nagios/plugins/check_ping" _commandStatusStrs = {STATUS_OK: 'OK', STATUS_WARNING: 'WARNING', STATUS_CRITICAL: 'CRITICAL', STATUS_UNKNOWN: 'UNKNOWN'} -_socketPath = '/var/spool/nagios/cmd/live' - - -# Class for exception definition -class checkPingCmdExecFailedException(Exception): - message = "check_ping command failed" - - def __init__(self, rc=0, out=(), err=()): - self.rc = rc - self.out = out - self.err = err - - def __str__(self): - o = '\n'.join(self.out) - e = '\n'.join(self.err) - if o and e: - m = o + '\n' + e - else: - m = o or e - s = self.message - if m: - s += '\nerror: ' + m - if self.rc: - s += '\nreturn code: %s' % self.rc - return s - -# Method to execute a command -def execCmd(command): - proc = subprocess.Popen(command, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - (out, err) = proc.communicate() - return (proc.returncode, out, err) - - -# Method to check the ing status of the host -def getPingStatus(hostAddr): - cmd = "%s -H %s" % (_checkPingCommand, hostAddr) - cmd += " -w 3000.0,80% -c 5000.0,100%" - - try: - (rc, out, err) = execCmd(shlex.split(cmd)) - except (OSError, ValueError) as e: - raise checkPingCmdExecFailedException(err=[str(e)]) - - if rc != 0: - raise checkPingCmdExecFailedException(rc, [out], [err]) - - return rc +# Load the host monitoring services list +def loadSrvcList(): + srvc_list = [] + with open("/etc/nagios/gluster/host-monitoring-services.in") as data_file: + srvc_list = 
json.load(data_file)['serviceList'] + return srvc_list # Method to execute livestatus def checkLiveStatus(hostAddr, srvc): - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.connect(_socketPath) - - # Write command to socket - cmd = "GET services\nColumns: state\nFilter: " - "description = %s\nFilter: host_address = %s\n" % (srvc, hostAddr) - s.send(cmd) + cmd = "GET services\nColumns: state\nFilter: " \ + "description = %s\n" \ + "Filter: host_address = %s" % (srvc, hostAddr) - # Close socket - s.shutdown(socket.SHUT_WR) - - # Read the answer - answer = s.recv(1000000) - - # Parse the answer into a table - table = [line.split(';') for line in answer.split('\n')[:-1]] + table = livestatus.readLiveStatus(cmd) if len(table) > 0 and len(table[0]) > 0: return int(table[0][0]) @@ -150,43 +95,18 @@ if __name__ == "__main__": showUsage() sys.exit(STATUS_CRITICAL) - # Check ping status of the node, if its not reachable exit - try: - pingStatus = getPingStatus(hostAddr) - except (checkPingCmdExecFailedException) as e: - print "Host Status %s - Host not reachable" % \ - (_commandStatusStrs[STATUS_UNKNOWN]) - sys.exit(_commandStatusStrs[STATUS_UNKNOWN]) - - if pingStatus != STATUS_OK: - print "Host Status %s - Host not reachable" % \ - (_commandStatusStrs[STATUS_UNKNOWN]) - sys.exit(pingStatus) - - # Check the various performance statuses for the host - diskPerfStatus = checkLiveStatus(hostAddr, 'Disk Utilization') - cpuPerfStatus = checkLiveStatus(hostAddr, 'Cpu Utilization') - memPerfStatus = checkLiveStatus(hostAddr, 'Memory Utilization') - swapPerfStatus = checkLiveStatus(hostAddr, 'Swap Utilization') - nwPerfStatus = checkLiveStatus(hostAddr, 'Network Utilization') - - # Calculate the consolidated status for the host based on above status - # details - finalStatus = pingStatus | diskPerfStatus | cpuPerfStatus | \ - memPerfStatus | swapPerfStatus | nwPerfStatus - - # Get the list of ciritical services + # Load the services list + srvc_list = loadSrvcList() + + # 
Calculate the consolidated status for the host based on above + # status of individual services + finalStatus = STATUS_OK criticalSrvcs = [] - if diskPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Disk Utilization') - if cpuPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Cpu Utilization') - if memPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Memory Utilization') - if swapPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Swap Utilization') - if nwPerfStatus == STATUS_CRITICAL: - criticalSrvcs.append('Network Utilization') + for srvc in srvc_list: + srvc_status = checkLiveStatus(hostAddr, srvc) + finalStatus = finalStatus | srvc_status + if srvc_status == STATUS_CRITICAL: + criticalSrvcs.append(srvc) # Return the status if finalStatus == STATUS_CRITICAL: diff --git a/plugins/gluster_host_service_handler.py b/plugins/gluster_host_service_handler.py index 283ac69..2a62108 100755 --- a/plugins/gluster_host_service_handler.py +++ b/plugins/gluster_host_service_handler.py @@ -23,9 +23,10 @@ import os import sys import datetime -import socket import getopt +import livestatus + STATUS_OK = "OK" STATUS_WARNING = "WARNING" STATUS_CRITICAL = "CRITICAL" @@ -37,14 +38,15 @@ statusCodes = {STATUS_OK: 0, STATUS_WARNING: 1, STATUS_CRITICAL: 2, NAGIOS_COMMAND_FILE = "/var/spool/nagios/cmd/nagios.cmd" SRVC_LIST = ['Disk Utilization', 'Cpu Utilization', 'Memory Utilization', 'Swap Utilization', 'Network Utilization'] -_socketPath = '/var/spool/nagios/cmd/live' # Shows the usage of the script def showUsage(): - usage = "Usage: %s -s " - "-t -a " - "-l -n \n" % os.path.basename(sys.argv[0]) + usage = "Usage: %s -s " \ + "-t " \ + " -a " \ + "-l " \ + " -n \n" % os.path.basename(sys.argv[0]) sys.stderr.write(usage) @@ -52,12 +54,13 @@ def showUsage(): def update_host_state(hostAddr, srvcName, statusCode): now = datetime.datetime.now() if statusCode == statusCodes[STATUS_WARNING]: - cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;Host Status WARNING - " - 
"Service(s) ['%s'] in CRITICAL state\n" % (now, hostAddr, statusCode, - srvcName) + cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;" \ + "Host Status WARNING - " \ + "Service(s) ['%s'] in CRITICAL state\n" \ + % (now, hostAddr, statusCode, srvcName) else: - cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;Host Status OK - " - "Services in good health\n" % (now, hostAddr, statusCode) + cmdStr = "[%s] PROCESS_HOST_CHECK_RESULT;%s;%s;Host Status OK - " \ + "Services in good health\n" % (now, hostAddr, statusCode) f = open(NAGIOS_COMMAND_FILE, "w") f.write(cmdStr) @@ -66,22 +69,10 @@ def update_host_state(hostAddr, srvcName, statusCode): # Method to execute livestatus def checkLiveStatus(hostAddr, srvc): - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.connect(_socketPath) - - # Write command to socket - cmd = "GET services\nColumns: state\nFilter: " - "description = %s\nFilter: host_address = %s\n" % (srvc, hostAddr) - s.send(cmd) - - # Close socket - s.shutdown(socket.SHUT_WR) - - # Read the answer - answer = s.recv(1000) + cmd = "GET services\nColumns: state\nFilter: " \ + "description = %s\nFilter: host_address = %s" % (srvc, hostAddr) - # Parse the answer into a table - table = [line.split(';') for line in answer.split('\n')[:-1]] + table = livestatus.readLiveStatus(cmd) if len(table) > 0 and len(table[0]) > 0: return int(table[0][0]) @@ -141,8 +132,8 @@ if __name__ == "__main__": if srvcState == STATUS_CRITICAL: if srvcStateType == SRVC_STATE_TYPE_SOFT: if int(attempts) == 3: - print "Updating the host status to warning " - "(3rd SOFT critical state)..." + print "Updating the host status to warning " \ + "(3rd SOFT critical state)..." 
update_host_state(hostAddr, srvcName, statusCodes[STATUS_WARNING]) elif srvcStateType == SRVC_STATE_TYPE_HARD: diff --git a/tests/test_check_remote_host.py b/tests/test_check_remote_host.py index c5c602d..f7965f3 100644 --- a/tests/test_check_remote_host.py +++ b/tests/test_check_remote_host.py @@ -19,49 +19,28 @@ # import mock +import socket +import plugins from testrunner import PluginsTestCase as TestCaseBase -from plugins.check_remote_host import * -class TestHello(TestCaseBase): - # Method to test the execCmd() method - @mock.patch('check_remote_host.subprocess.Popen') - def testExecCmd(self, mock_popen): - reference = subprocess.Popen('any command', close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - out = "sample output" - err = "" - reference.communicate.return_value = (out, err) - self.assertTrue(reference.communicate, "communicate called") - - # Method to test the getPingStatus() method - @mock.patch('check_remote_host.execCmd') - def testGetPingStatus(self, mock_execCmd): - rc = 0 - out = "sample output" - err = "" - mock_execCmd.return_value = (rc, out, err) - getPingStatus('dummy host') - mock_execCmd.assert_called_with([ - '/usr/lib64/nagios/plugins/check_ping', '-H', 'dummy', 'host', - '-w', '3000.0,80%', '-c', '5000.0,100%']) - self.assertRaises(OSError, execCmd, - ['/usr/lib64/nagios/plugins/check_ping', '-H', - 'dummy', 'host', '-w', '3000.0,80%', '-c', - '5000.0,100%']) - +class TestCheckRemoteHost(TestCaseBase): # Method to test the checkLiveStatus() method - @mock.patch('check_remote_host.socket.socket') + @mock.patch('plugins.check_remote_host.livestatus.socket.socket') def testCheckLiveStatus(self, mock_socket): reference = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) self.assertTrue(mock_socket, "called") reference.recv.return_value = "0\n" - checkLiveStatus("dummy host", "dummy srvc") - reference.connect.assert_called_with('/var/spool/nagios/cmd/live') + plugins.check_remote_host.checkLiveStatus("dummy host", "dummy 
srvc") + reference.connect.assert_called_with("${localstatedir}/" + "spool/nagios/cmd/live") reference.send.assert_called_with("GET services\nColumns: state\n" "Filter: description = dummy srvc\n" "Filter: host_address = " - "dummy host\n") - self.assertEquals(0, checkLiveStatus("dummy host", "dummy srvc")) + "dummy host\n" + "Separators: 10 124 44 59") + self.assertEquals(0, + plugins. + check_remote_host. + checkLiveStatus("dummy host", "dummy srvc")) -- cgit