diff options
author | Timothy Asir <tjeyasin@redhat.com> | 2014-05-02 19:02:08 +0530 |
---|---|---|
committer | Sahina Bose <sabose@redhat.com> | 2014-05-05 04:28:43 -0700 |
commit | 41d8289255f669c5a033a34b61612300e8fe838d (patch) | |
tree | d6f7949b63587b5643755d6b156a4577bcc50e3d | |
parent | 8831c86a8acbd1c9f46719ad7b8f26ec325d1376 (diff) |
Enhanced check proc and added CTDB service check
CTDB is node based and any gluster volume can be configured
to be used by CTDB. Normally when CTDB starts it will start
Samba automatically. However one can switch off SMB at any
given time. But if SMB/NFS are not running on any node,
then CTDB is also not required.
This patch checks for the CTDB process and will send an
appropriate message to Nagios based on its status
using 'ctdb nodestatus' command.
Currently there is no direct way to find whether CTDB
is required to run on a given node. This is because
the CTDB volume name / CTDB volume mount path can have
any name.
It would be easier if gluster-cli provided a
tag/set command to indicate whether a given volume is
used for CTDB or is CTDB enabled.
Change-Id: Iccb98296ebd902838acc63805bbe20cd77cdcc61
Signed-off-by: Timothy Asir <tjeyasin@redhat.com>
Reviewed-on: http://review.gluster.org/7647
Tested-by: Timothy Asir <tim.gluster@gmail.com>
Reviewed-by: Kanagaraj M <kmayilsa@redhat.com>
Reviewed-by: Sahina Bose <sabose@redhat.com>
-rwxr-xr-x | plugins/check_proc_status.py | 63 |
1 files changed, 50 insertions, 13 deletions
diff --git a/plugins/check_proc_status.py b/plugins/check_proc_status.py index bea18ab..00f3f1d 100755 --- a/plugins/check_proc_status.py +++ b/plugins/check_proc_status.py @@ -23,6 +23,7 @@ import logging import psutil import time from daemon import runner +from logging import handlers import nscautils import glusternagios @@ -37,20 +38,22 @@ _glusterVolPath = "/var/lib/glusterd/vols" _checkNfsCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", "nfs"] _checkShdCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", "glustershd"] -_checkSmbCmd = [_checkProc.cmd, "-C", "smb"] +_checkSmbCmd = [_checkProc.cmd, "-c", "1:", "-C", "smbd"] _checkQuotaCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", "quotad"] _checkBrickCmd = [_checkProc.cmd, "-C", "glusterfsd"] _checkGlusterdCmd = [_checkProc.cmd, "-c", "1:", "-w", "1:1", "-C", "glusterd"] +_checkCtdbCmd = [_checkProc.cmd, "-c", "1:", "-C", "ctdbd"] _nfsService = "Glusterfs NFS Daemon" _shdService = "Glusterfs Self-Heal Daemon" _smbService = "CIFS" _brickService = "Brick Status - " _glusterdService = "Gluster Management Daemon" _quotadService = "Gluster Quota Daemon" +_ctdbdService = "CTDB" -def getBrickStatus(hostName, volInfo): +def getBrickStatus(volInfo): bricks = {} hostUuid = glustercli.hostUUIDGet() status = None @@ -86,7 +89,7 @@ def getBrickStatus(hostName, volInfo): return bricks -def getNfsStatus(hostName, volInfo): +def getNfsStatus(volInfo): # if nfs is already running we need not to check further status, msg, error = utils.execCmd(_checkNfsCmd) if status == utils.PluginStatusCode.OK: @@ -108,13 +111,39 @@ def getNfsStatus(hostName, volInfo): return status, msg -def getSmbStatus(hostName, volInfo): +def getCtdbStatus(smbStatus, nfsStatus): + if smbStatus != utils.PluginStatusCode.OK and \ + nfsStatus != utils.PluginStatusCode.OK: + return (utils.PluginStatusCode.OK, + "CTDB ignored as SMB and NFS are not running") + + status, msg, error = utils.execCmd(_checkCtdbCmd) + if status != 
utils.PluginStatusCode.OK: + return utils.PluginStatusCode.UNKNOWN, "CTDB not configured" + + # CTDB, SMB/NFS are running + status, msg, error = utils.execCmd(['ctdb', 'nodestatus']) + if status == utils.PluginStatusCode.OK: + if len(msg) > -1: + message = msg[0].split() + if len(message) > 1: + msg = "Node status: %s" % message[2] + if message[2] == 'UNHEALTHY': + status = utils.PluginStatusCode.WARNING + elif message[2] in ['DISCONNECTED', 'BANNED', 'INACTIVE']: + status = utils.PluginStatusCode.CRITICAL + else: + status = utils.PluginStatusCode.UNKNOWN + return status, msg + + +def getSmbStatus(volInfo): status, msg, error = utils.execCmd(_checkSmbCmd) if status == utils.PluginStatusCode.OK: return status, msg[0] if len(msg) > 0 else "" # if smb is not running and any of the volume uses smb - # then its required to alert the use + # then its required to alert the user for k, v in volInfo.iteritems(): cifsStatus = v.get('options', {}).get('user.cifs', 'enable') smbStatus = v.get('options', {}).get('user.smb', 'enable') @@ -128,7 +157,7 @@ def getSmbStatus(hostName, volInfo): return status, msg -def getQuotadStatus(hostName, volInfo): +def getQuotadStatus(volInfo): # if quota is already running we need not to check further status, msg, error = utils.execCmd(_checkQuotaCmd) if status == utils.PluginStatusCode.OK: @@ -148,7 +177,7 @@ def getQuotadStatus(hostName, volInfo): return status, msg -def getShdStatus(hostName, volInfo): +def getShdStatus(volInfo): status, msg, error = utils.execCmd(_checkShdCmd) if status == utils.PluginStatusCode.OK: return status, msg[0] if len(msg) > 0 else "" @@ -191,6 +220,7 @@ class App(): smbStatus = None shdStatus = None quotaStatus = None + ctdbStatus = None brickStatus = {} while True: if not hostName: @@ -220,31 +250,37 @@ class App(): time.sleep(sleepTime) continue - status, msg = getNfsStatus(hostName, volInfo) + status, msg = getNfsStatus(volInfo) if status != nfsStatus or \ status == utils.PluginStatusCode.CRITICAL: nfsStatus 
= status nscautils.send_to_nsca(hostName, _nfsService, status, msg) - status, msg = getSmbStatus(hostName, volInfo) + status, msg = getSmbStatus(volInfo) if status != smbStatus or \ status == utils.PluginStatusCode.CRITICAL: smbStatus = status nscautils.send_to_nsca(hostName, _smbService, status, msg) - status, msg = getShdStatus(hostName, volInfo) + status, msg = getCtdbStatus(smbStatus, nfsStatus) + if status != ctdbStatus or \ + status == utils.PluginStatusCode.CRITICAL: + ctdbStatus = status + nscautils.send_to_nsca(hostName, _ctdbdService, status, msg) + + status, msg = getShdStatus(volInfo) if status != shdStatus or \ status == utils.PluginStatusCode.CRITICAL: shdStatus = status nscautils.send_to_nsca(hostName, _shdService, status, msg) - status, msg = getQuotadStatus(hostName, volInfo) + status, msg = getQuotadStatus(volInfo) if status != quotaStatus or \ status == utils.PluginStatusCode.CRITICAL: quotaStatus = status nscautils.send_to_nsca(hostName, _quotadService, status, msg) - brick = getBrickStatus(hostName, volInfo) + brick = getBrickStatus(volInfo) # brickInfo contains status, and message for brickService, brickInfo in brick.iteritems(): if brickInfo[0] != brickStatus.get(brickService, [None])[0] \ @@ -260,7 +296,8 @@ if __name__ == '__main__': logger.setLevel(logging.INFO) formatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(message)s") - handler = logging.FileHandler("/var/log/glusterpmd.log") + handler = handlers.TimedRotatingFileHandler( + "/var/log/glusterpmd.log", 'midnight') handler.setFormatter(formatter) logger.addHandler(handler) |