diff options
author | Timothy Asir <tjeyasin@redhat.com> | 2014-04-29 18:28:14 +0530 |
---|---|---|
committer | Sahina Bose <sabose@redhat.com> | 2014-05-02 06:16:03 -0700 |
commit | d646c986a3ba54570c9a0d367d106deeb0a80e38 (patch) | |
tree | b721ee3b3d1b9f53e10ac417b77f5040a6e2a9bb /plugins/check_proc_status.py | |
parent | ccec0742af257e13effafa30a1184541c3cf5b65 (diff) |
Run check gluster process status as a daemon
Enhanced to send Gluster-specific process status only
when a change is detected in a service's status, or for any
critical status, to avoid flooding the Nagios server with logs.
Change-Id: I26e389ae2d1ccba1b5ccadc45d202d3b5219c74a
Signed-off-by: Timothy Asir <tjeyasin@redhat.com>
Reviewed-on: http://review.gluster.org/7602
Reviewed-by: Sahina Bose <sabose@redhat.com>
Tested-by: Timothy Asir <tim.gluster@gmail.com>
Diffstat (limited to 'plugins/check_proc_status.py')
-rwxr-xr-x | plugins/check_proc_status.py | 158 |
1 files changed, 115 insertions, 43 deletions
diff --git a/plugins/check_proc_status.py b/plugins/check_proc_status.py index 2ac1bc3..95a9b96 100755 --- a/plugins/check_proc_status.py +++ b/plugins/check_proc_status.py @@ -19,7 +19,11 @@ import sys import errno import socket +import lockfile +import logging import psutil +import time +from daemon import runner import nscautils import glusternagios @@ -47,7 +51,8 @@ _glusterdService = "Gluster Management Daemon" _quotadService = "Gluster Quota Daemon" -def sendBrickStatus(hostName, volInfo): +def getBrickStatus(hostName, volInfo): + bricks = {} hostUuid = glustercli.hostUUIDGet() status = None for volumeName, volumeInfo in volInfo.iteritems(): @@ -78,15 +83,15 @@ def sendBrickStatus(hostName, volInfo): msg = "OK: Brick %s" % brickPath elif status != utils.PluginStatusCode.UNKNOWN: msg = "CRITICAL: Brick %s is down" % brickPath - nscautils.send_to_nsca(hostName, brickService, status, msg) + bricks[brickService] = [status, msg] + return bricks -def sendNfsStatus(hostName, volInfo): +def getNfsStatus(hostName, volInfo): # if nfs is already running we need not to check further status, msg, error = utils.execCmd(_checkNfsCmd) if status == utils.PluginStatusCode.OK: - nscautils.send_to_nsca(hostName, _nfsService, status, msg) - return + return status, msg[0] if len(msg) > 0 else "" # if nfs is not running and any of the volume uses nfs # then its required to alert the user @@ -101,36 +106,34 @@ def sendNfsStatus(hostName, volInfo): else: msg = "OK: No gluster volume uses nfs" status = utils.PluginStatusCode.OK - nscautils.send_to_nsca(hostName, _nfsService, status, msg) + return status, msg -def sendSmbStatus(hostName, volInfo): +def getSmbStatus(hostName, volInfo): status, msg, error = utils.execCmd(_checkSmbCmd) if status == utils.PluginStatusCode.OK: - nscautils.send_to_nsca(hostName, _smbService, status, msg) - return + return status, msg[0] if len(msg) > 0 else "" # if smb is not running and any of the volume uses smb # then its required to alert the use for k, 
v in volInfo.iteritems(): - cifsStatus = v.get('options', {}).get('user.cifs', '') - smbStatus = v.get('options', {}).get('user.smb', '') - if cifsStatus == 'disable' or smbStatus == 'disable': + cifsStatus = v.get('options', {}).get('user.cifs', 'enable') + smbStatus = v.get('options', {}).get('user.smb', 'enable') + if cifsStatus == 'enable' and smbStatus == 'enable': msg = "CRITICAL: Process smb is not running" status = utils.PluginStatusCode.CRITICAL break else: msg = "OK: No gluster volume uses smb" status = utils.PluginStatusCode.OK - nscautils.send_to_nsca(hostName, _smbService, status, msg) + return status, msg -def sendQuotadStatus(hostName, volInfo): +def getQuotadStatus(hostName, volInfo): # if quota is already running we need not to check further status, msg, error = utils.execCmd(_checkQuotaCmd) if status == utils.PluginStatusCode.OK: - nscautils.send_to_nsca(hostName, _quotadService, status, msg) - return + return status, msg[0] if len(msg) > 0 else "" # if quota is not running and any of the volume uses quota # then the quotad process should be running in the host @@ -143,14 +146,13 @@ def sendQuotadStatus(hostName, volInfo): else: msg = "OK: Quota not enabled" status = utils.PluginStatusCode.OK - nscautils.send_to_nsca(hostName, _quotadService, status, msg) + return status, msg -def sendShdStatus(hostName, volInfo): +def getShdStatus(hostName, volInfo): status, msg, error = utils.execCmd(_checkShdCmd) if status == utils.PluginStatusCode.OK: - nscautils.send_to_nsca(hostName, _shdService, status, msg) - return + return status, msg[0] if len(msg) > 0 else "" hostUuid = glustercli.hostUUIDGet() for volumeName, volumeInfo in volInfo.iteritems(): @@ -164,7 +166,7 @@ def sendShdStatus(hostName, volInfo): else: msg = "OK: Process Gluster Self Heal Daemon" status = utils.PluginStatusCode.OK - nscautils.send_to_nsca(hostName, _shdService, status, msg) + return status, msg def hasBricks(hostUuid, bricks): @@ -174,31 +176,101 @@ def hasBricks(hostUuid, 
bricks): return False -if __name__ == '__main__': - hostName = nscautils.getCurrentHostNameInNagiosServer() - if not hostName: - hostName = socket.getfqdn() - if hostName == "localhost.localdomain" or hostName == "localhost": - sys.stderr.write("failed to find localhost fqdn") +class App(): + def __init__(self): + self.stdin_path = '/dev/null' + self.stdout_path = '/dev/tty' + self.stderr_path = '/dev/null' + self.pidfile_path = '/var/run/glusterpmd.pid' + self.pidfile_timeout = 5 - ### service check ### - status, msg, error = utils.execCmd(_checkGlusterdCmd) - nscautils.send_to_nsca(hostName, _glusterdService, status, msg) + def run(self): + hostName = nscautils.getCurrentHostNameInNagiosServer() + sleepTime = int(nscautils.getProcessMonitorSleepTime()) + glusterdStatus = None + nfsStatus = None + smbStatus = None + shdStatus = None + quotaStatus = None + brickStatus = {} + while True: + if not hostName: + hostName = nscautils.getCurrentHostNameInNagiosServer() + if not hostName: + logger.warn("Hostname is not configured") + time.sleep(sleepTime) + continue + status, msg, error = utils.execCmd(_checkGlusterdCmd) + if status != glusterdStatus or \ + status == utils.PluginStatusCode.CRITICAL: + glusterdStatus = status + msg = msg[0] if len(msg) > 0 else "" + nscautils.send_to_nsca(hostName, _glusterdService, status, msg) - # Get the volume status only if glusterfs is running to avoid - # unusual delay - if status != utils.PluginStatusCode.OK: - sys.exit(status) + # Get the volume status only if glusterfs is running to avoid + # unusual delay + if status != utils.PluginStatusCode.OK: + logger.warn("Glusterd is not running") + time.sleep(sleepTime) + continue - try: - volInfo = glustercli.volumeInfo() - except glusternagios.glustercli.GlusterCmdFailedException as e: - sys.exit(utils.PluginStatusCode.UNKNOWN) + try: + volInfo = glustercli.volumeInfo() + except glusternagios.glustercli.GlusterCmdFailedException: + logger.error("failed to find volume info") + 
time.sleep(sleepTime) + continue + + status, msg = getNfsStatus(hostName, volInfo) + if status != nfsStatus or \ + status == utils.PluginStatusCode.CRITICAL: + nfsStatus = status + nscautils.send_to_nsca(hostName, _nfsService, status, msg) - sendNfsStatus(hostName, volInfo) - sendSmbStatus(hostName, volInfo) - sendShdStatus(hostName, volInfo) - sendQuotadStatus(hostName, volInfo) - sendBrickStatus(hostName, volInfo) + status, msg = getSmbStatus(hostName, volInfo) + if status != smbStatus or \ + status == utils.PluginStatusCode.CRITICAL: + smbStatus = status + nscautils.send_to_nsca(hostName, _smbService, status, msg) + status, msg = getShdStatus(hostName, volInfo) + if status != shdStatus or \ + status == utils.PluginStatusCode.CRITICAL: + shdStatus = status + nscautils.send_to_nsca(hostName, _shdService, status, msg) + + status, msg = getQuotadStatus(hostName, volInfo) + if status != quotaStatus or \ + status == utils.PluginStatusCode.CRITICAL: + quotaStatus = status + nscautils.send_to_nsca(hostName, _quotadService, status, msg) + + brick = getBrickStatus(hostName, volInfo) + # brickInfo contains status, and message + for brickService, brickInfo in brick.iteritems(): + if brickInfo[0] != brickStatus.get(brickService, [None])[0] \ + or brickInfo[0] == utils.PluginStatusCode.CRITICAL: + brickStatus[brickService] = brickInfo + nscautils.send_to_nsca(hostName, brickService, + brickInfo[0], brickInfo[1]) + time.sleep(sleepTime) + +if __name__ == '__main__': + app = App() + logger = logging.getLogger("GlusterProcLog") + logger.setLevel(logging.INFO) + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s") + handler = logging.FileHandler("/var/log/glusterpmd.log") + handler.setFormatter(formatter) + logger.addHandler(handler) + + daemonRunner = runner.DaemonRunner(app) + daemonRunner.daemon_context.files_preserve = [handler.stream] + try: + daemonRunner.do_action() + except lockfile.LockTimeout: + logger.error("failed to aquire lock") + 
except runner.DaemonRunnerStopFailureError: + logger.error("failed to get the lock file") sys.exit(utils.PluginStatusCode.OK) |