-rw-r--r--  gluster-nagios-addons.spec.in        |   2
-rw-r--r--  plugins/Makefile.am                  |   2
-rwxr-xr-x  plugins/check_gluster_proc_status.py |  90
-rwxr-xr-x  plugins/check_proc_status.py         | 182
-rwxr-xr-x  plugins/check_proc_util.py           | 196
5 files changed, 300 insertions(+), 172 deletions(-)
diff --git a/gluster-nagios-addons.spec.in b/gluster-nagios-addons.spec.in
index 6e305bf..363ead4 100644
--- a/gluster-nagios-addons.spec.in
+++ b/gluster-nagios-addons.spec.in
@@ -144,6 +144,8 @@ command[check_interfaces]=%{_libdir}/nagios/plugins/gluster/network.py -e lo -e
 command[check_brick_usage]=%{_libdir}/nagios/plugins/gluster/check_disk_and_inode.py -w 80 -c 90 -u MB -n -i \$ARG1\$
 command[check_vol_utilization]=sudo %{_libdir}/nagios/plugins/gluster/check_vol_utilization.py \$ARG1\$ -w \$ARG2\$ -c \$ARG3\$
 command[check_vol_status]=sudo %{_libdir}/nagios/plugins/gluster/check_volume_status.py -v \$ARG1\$ -t \$ARG2\$
+command[check_proc_status]=sudo %{_libdir}/nagios/plugins/gluster/check_gluster_proc_status.py -t \$ARG1\$
+command[check_brick_status]=sudo %{_libdir}/nagios/plugins/gluster/check_gluster_proc_status.py -t BRICK -v \$ARG1\$ -b \$ARG2\$
 ###Auto Discovery related
 command[discoverpeers]=sudo %{_libdir}/nagios/plugins/gluster/discoverpeers.py
 command[discover_volume_list]=sudo %{_libdir}/nagios/plugins/gluster/discover_volumes.py -l
diff --git a/plugins/Makefile.am b/plugins/Makefile.am
index c809b99..5f993aa 100644
--- a/plugins/Makefile.am
+++ b/plugins/Makefile.am
@@ -24,6 +24,8 @@ dist_glusternagiosplugins_PYTHON = \
 	discover_volumes.py \
 	discoverhostparams.py \
 	configure_gluster_node.py \
+	check_gluster_proc_status.py \
+	check_proc_util.py \
 	__init__.py \
 	memory.py \
 	network.py \
diff --git a/plugins/check_gluster_proc_status.py b/plugins/check_gluster_proc_status.py
new file mode 100755
index 0000000..bc15672
--- /dev/null
+++ b/plugins/check_gluster_proc_status.py
@@ -0,0 +1,90 @@
+#!/usr/bin/python
+# Copyright (C) 2014 Red Hat Inc
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+#
+
+import argparse
+
+import check_proc_util
+from glusternagios import utils
+from glusternagios import glustercli
+
+
+_NFS = "NFS"
+_SMB = "CIFS"
+_CTDB = "CTDB"
+_SHD = "SHD"
+_QUOTA = "QUOTA"
+_BRICK = "BRICK"
+_GLUSTERD = "GLUSTERD"
+
+
+def parse_input():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-t", "--type", action="store", dest="type",
+                        required=True,
+                        help="Type of status to be shown. Possible values:",
+                        choices=[_NFS, _SMB, _CTDB, _SHD, _QUOTA, _BRICK,
+                                 _GLUSTERD])
+    parser.add_argument("-v", "--volume", action="store", required=False,
+                        help="Name of the volume for status")
+    parser.add_argument("-b", "--brickPath", action="store", required=False,
+                        help="Brick Path")
+    args = parser.parse_args()
+    return args
+
+
+def _findBrickName(volInfo, brickPath):
+    hostUuid = glustercli.hostUUIDGet()
+    for volumeName, volumeInfo in volInfo.iteritems():
+        for brick in volumeInfo['bricksInfo']:
+            if brick.get('hostUuid') == hostUuid \
+                    and brick['name'].split(':')[1] == brickPath:
+                return brick['name']
+
+
+if __name__ == '__main__':
+    args = parse_input()
+    status, msg = check_proc_util.getGlusterdStatus()
+    if status == utils.PluginStatusCode.OK:
+        if args.type == _NFS:
+            status, msg = check_proc_util.getNfsStatus(glustercli.volumeInfo())
+        elif args.type == _SMB:
+            status, msg = check_proc_util.getSmbStatus(glustercli.volumeInfo())
+        elif args.type == _SHD:
+            status, msg = check_proc_util.getShdStatus(glustercli.volumeInfo())
+        elif args.type == _QUOTA:
+            status, msg = check_proc_util.getQuotadStatus(
+                glustercli.volumeInfo())
+        elif args.type == _CTDB:
+            volInfo = glustercli.volumeInfo()
+            nfsStatus, nfsMsg = check_proc_util.getNfsStatus(volInfo)
+            smbStatus, smbMsg = check_proc_util.getSmbStatus(volInfo)
+            status, msg = check_proc_util.getCtdbStatus(smbStatus, nfsStatus)
+        elif args.type == _BRICK:
+            volInfo = glustercli.volumeInfo(args.volume)
+            brickName = _findBrickName(volInfo, args.brickPath)
+            if brickName:
+                status, msg = check_proc_util.getBrickStatus(args.volume,
+                                                             brickName)
+            else:
+                status = utils.PluginStatusCode.CRITICAL
+                msg = "Brick - %s not found" % args.brickPath
+    elif args.type != _GLUSTERD:
+        msg = "UNKNOWN: Could not determine %s status " % args.type
+        status = utils.PluginStatusCode.UNKNOWN
+    print msg
+    exit(status)
diff --git a/plugins/check_proc_status.py b/plugins/check_proc_status.py
index 83bde1f..8895b0f 100755
--- a/plugins/check_proc_status.py
+++ b/plugins/check_proc_status.py
@@ -17,34 +17,19 @@
 #
 
 import sys
-import errno
 import lockfile
 import logging
-import psutil
 import time
 from daemon import runner
 from logging import handlers
 
 import nscautils
+import check_proc_util
 import glusternagios
 
 from glusternagios import utils
 from glusternagios import glustercli
-from glusternagios import storage
 
-_checkProc = utils.CommandPath('check_proc',
-                               '/usr/lib64/nagios/plugins/check_procs')
-
-_glusterVolPath = "/var/lib/glusterd/vols"
-_checkNfsCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", "nfs"]
-_checkShdCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a",
-               "glustershd"]
-_checkSmbCmd = [_checkProc.cmd, "-c", "1:", "-C", "smbd"]
-_checkQuotaCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a",
-                 "quotad"]
-_checkBrickCmd = [_checkProc.cmd, "-C", "glusterfsd"]
-_checkGlusterdCmd = [_checkProc.cmd, "-c", "1:", "-w", "1:1", "-C", "glusterd"]
-_checkCtdbCmd = [_checkProc.cmd, "-c", "1:", "-C", "ctdbd"]
 _nfsService = "NFS"
 _shdService = "Self-Heal"
 _smbService = "CIFS"
@@ -59,166 +44,20 @@ checkIdeSmartCmdPath = utils.CommandPath(
     'check_ide_smart', '/usr/lib64/nagios/plugins/check_ide_smart')
 
 
 def getBrickStatus(volInfo):
     bricks = {}
     hostUuid = glustercli.hostUUIDGet()
-    status = None
     for volumeName, volumeInfo in volInfo.iteritems():
         if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE:
             continue
         for brick in volumeInfo['bricksInfo']:
             if brick.get('hostUuid') != hostUuid:
                 continue
+            status, msg = check_proc_util.getBrickStatus(volumeName,
+                                                         brick['name'])
             brickPath = brick['name'].split(':')[1]
             brickService = _brickService % brickPath
-            pidFile = brick['name'].replace(
-                ":/", "-").replace("/", "-") + ".pid"
-            try:
-                with open("%s/%s/run/%s" % (
-                        _glusterVolPath, volumeName, pidFile)) as f:
-                    if psutil.pid_exists(int(f.read().strip())):
-                        status = utils.PluginStatusCode.OK
-                        #Now check the status of the underlying physical disk
-                        brickDevice = storage.getBrickDeviceName(
-                            brick['name'].split(":")[1])
-                        disk = storage.getDisksForBrick(
-                            brickDevice)
-                        cmd = [checkIdeSmartCmdPath.cmd, "-d", disk, "-n"]
-                        rc, out, err = utils.execCmd(cmd)
-                        if rc == utils.PluginStatusCode.CRITICAL and \
-                           "tests failed" in out[0]:
-                            status = utils.PluginStatusCode.WARNING
-                            msg = "WARNING: Brick %s: %s" % (
-                                brick['name'], out[0])
-                    else:
-                        status = utils.PluginStatusCode.CRITICAL
-            except IOError, e:
-                if e.errno == errno.ENOENT:
-                    status = utils.PluginStatusCode.CRITICAL
-                else:
-                    status = utils.PluginStatusCode.UNKNOWN
-                    msg = "UNKNOWN: Brick %s: %s" % (brickPath, str(e))
-            finally:
-                if status == utils.PluginStatusCode.OK:
-                    msg = "OK: Brick %s" % brickPath
-                elif status != utils.PluginStatusCode.UNKNOWN:
-                    msg = "CRITICAL: Brick %s is down" % brickPath
-            bricks[brickService] = [status, msg]
+            bricks[brickService] = [status, msg]
     return bricks
 
 
-def getNfsStatus(volInfo):
-    # if nfs is already running we need not to check further
-    status, msg, error = utils.execCmd(_checkNfsCmd)
-    if status == utils.PluginStatusCode.OK:
-        return status, msg[0] if len(msg) > 0 else ""
-
-    # if nfs is not running and any of the volume uses nfs
-    # then its required to alert the user
-    for volume, volumeInfo in volInfo.iteritems():
-        if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE:
-            continue
-        nfsStatus = volumeInfo.get('options', {}).get('nfs.disable', 'off')
-        if nfsStatus == 'off':
-            msg = "CRITICAL: Process glusterfs-nfs is not running"
-            status = utils.PluginStatusCode.CRITICAL
-            break
-    else:
-        msg = "OK: No gluster volume uses nfs"
-        status = utils.PluginStatusCode.OK
-    return status, msg
-
-
-def getCtdbStatus(smbStatus, nfsStatus):
-    if smbStatus != utils.PluginStatusCode.OK and \
-       nfsStatus != utils.PluginStatusCode.OK:
-        return (utils.PluginStatusCode.OK,
-                "CTDB ignored as SMB and NFS are not running")
-
-    status, msg, error = utils.execCmd(_checkCtdbCmd)
-    if status != utils.PluginStatusCode.OK:
-        return utils.PluginStatusCode.UNKNOWN, "CTDB not configured"
-
-    # CTDB, SMB/NFS are running
-    status, msg, error = utils.execCmd(['ctdb', 'nodestatus'])
-    if status == utils.PluginStatusCode.OK:
-        if len(msg) > -1:
-            message = msg[0].split()
-            if len(message) > 1:
-                msg = "Node status: %s" % message[2]
-                if message[2] == 'UNHEALTHY':
-                    status = utils.PluginStatusCode.WARNING
-                elif message[2] in ['DISCONNECTED', 'BANNED', 'INACTIVE']:
-                    status = utils.PluginStatusCode.CRITICAL
-    else:
-        status = utils.PluginStatusCode.UNKNOWN
-    return status, msg
-
-
-def getSmbStatus(volInfo):
-    status, msg, error = utils.execCmd(_checkSmbCmd)
-    if status == utils.PluginStatusCode.OK:
-        return status, msg[0] if len(msg) > 0 else ""
-
-    # if smb is not running and any of the volume uses smb
-    # then its required to alert the user
-    for k, v in volInfo.iteritems():
-        cifsStatus = v.get('options', {}).get('user.cifs', 'enable')
-        smbStatus = v.get('options', {}).get('user.smb', 'enable')
-        if cifsStatus == 'enable' and smbStatus == 'enable':
-            msg = "CRITICAL: Process smb is not running"
-            status = utils.PluginStatusCode.CRITICAL
-            break
-    else:
-        msg = "OK: No gluster volume uses smb"
-        status = utils.PluginStatusCode.OK
-    return status, msg
-
-
-def getQuotadStatus(volInfo):
-    # if quota is already running we need not to check further
-    status, msg, error = utils.execCmd(_checkQuotaCmd)
-    if status == utils.PluginStatusCode.OK:
-        return status, msg[0] if len(msg) > 0 else ""
-
-    # if quota is not running and any of the volume uses quota
-    # then the quotad process should be running in the host
-    for k, v in volInfo.iteritems():
-        quotadStatus = v.get('options', {}).get('features.quota', '')
-        if quotadStatus == 'on':
-            msg = "CRITICAL: Process quotad is not running"
-            utils.PluginStatusCode.CRITICAL
-            break
-    else:
-        msg = "OK: Quota not enabled"
-        status = utils.PluginStatusCode.OK
-    return status, msg
-
-
-def getShdStatus(volInfo):
-    status, msg, error = utils.execCmd(_checkShdCmd)
-    if status == utils.PluginStatusCode.OK:
-        return status, msg[0] if len(msg) > 0 else ""
-
-    hostUuid = glustercli.hostUUIDGet()
-    for volumeName, volumeInfo in volInfo.iteritems():
-        if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE:
-            continue
-        if hasBricks(hostUuid, volumeInfo['bricksInfo']) and \
-           int(volumeInfo['replicaCount']) > 1:
-            status = utils.PluginStatusCode.CRITICAL
-            msg = "CRITICAL: Gluster Self Heal Daemon not running"
-            break
-    else:
-        msg = "OK: Process Gluster Self Heal Daemon"
-        status = utils.PluginStatusCode.OK
-    return status, msg
-
-
-def hasBricks(hostUuid, bricks):
-    for brick in bricks:
-        if brick['hostUuid'] == hostUuid:
-            return True
-    return False
-
-
 class App():
     def __init__(self):
         self.stdin_path = '/dev/null'
@@ -244,11 +83,10 @@ class App():
                 logger.warn("Hostname is not configured")
                 time.sleep(sleepTime)
                 continue
-            status, msg, error = utils.execCmd(_checkGlusterdCmd)
+            status, msg = check_proc_util.getGlusterdStatus()
             if status != glusterdStatus or \
                status == utils.PluginStatusCode.CRITICAL:
                 glusterdStatus = status
-                msg = msg[0] if len(msg) > 0 else ""
                 nscautils.send_to_nsca(hostName, _glusterdService, status, msg)
 
             # Get the volume status only if glusterfs is running to avoid
@@ -265,31 +103,31 @@
                 time.sleep(sleepTime)
                 continue
 
-            status, msg = getNfsStatus(volInfo)
+            status, msg = check_proc_util.getNfsStatus(volInfo)
             if status != nfsStatus or \
                status == utils.PluginStatusCode.CRITICAL:
                 nfsStatus = status
                 nscautils.send_to_nsca(hostName, _nfsService, status, msg)
 
-            status, msg = getSmbStatus(volInfo)
+            status, msg = check_proc_util.getSmbStatus(volInfo)
             if status != smbStatus or \
                status == utils.PluginStatusCode.CRITICAL:
                 smbStatus = status
                 nscautils.send_to_nsca(hostName, _smbService, status, msg)
 
-            status, msg = getCtdbStatus(smbStatus, nfsStatus)
+            status, msg = check_proc_util.getCtdbStatus(smbStatus, nfsStatus)
             if status != ctdbStatus or \
                status == utils.PluginStatusCode.CRITICAL:
                 ctdbStatus = status
                 nscautils.send_to_nsca(hostName, _ctdbdService, status, msg)
 
-            status, msg = getShdStatus(volInfo)
+            status, msg = check_proc_util.getShdStatus(volInfo)
             if status != shdStatus or \
                status == utils.PluginStatusCode.CRITICAL:
                 shdStatus = status
                 nscautils.send_to_nsca(hostName, _shdService, status, msg)
 
-            status, msg = getQuotadStatus(volInfo)
+            status, msg = check_proc_util.getQuotadStatus(volInfo)
             if status != quotaStatus or \
                status == utils.PluginStatusCode.CRITICAL:
                 quotaStatus = status
diff --git a/plugins/check_proc_util.py b/plugins/check_proc_util.py
new file mode 100755
index 0000000..20f57eb
--- /dev/null
+++ b/plugins/check_proc_util.py
@@ -0,0 +1,196 @@
+#!/usr/bin/python
+# Copyright (C) 2014 Red Hat Inc
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+#
+
+import errno
+import psutil
+
+
+from glusternagios import utils
+from glusternagios import glustercli
+from glusternagios import storage
+
+
+_checkProc = utils.CommandPath('check_proc',
+                               '/usr/lib64/nagios/plugins/check_procs')
+
+_glusterVolPath = "/var/lib/glusterd/vols"
+_checkNfsCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a", "nfs"]
+_checkShdCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a",
+               "glustershd"]
+_checkSmbCmd = [_checkProc.cmd, "-c", "1:", "-C", "smbd"]
+_checkQuotaCmd = [_checkProc.cmd, "-c", "1:", "-C", "glusterfs", "-a",
+                 "quotad"]
+_checkBrickCmd = [_checkProc.cmd, "-C", "glusterfsd"]
+_checkGlusterdCmd = [_checkProc.cmd, "-c", "1:", "-w", "1:1", "-C", "glusterd"]
+_checkCtdbCmd = [_checkProc.cmd, "-c", "1:", "-C", "ctdbd"]
+checkIdeSmartCmdPath = utils.CommandPath(
+    'check_ide_smart', '/usr/lib64/nagios/plugins/check_ide_smart')
+
+
+def getBrickStatus(volumeName, brickName):
+    status = None
+    brickPath = brickName.split(':')[1]
+    pidFile = brickName.replace(":/", "-").replace("/", "-") + ".pid"
+    try:
+        with open("%s/%s/run/%s" % (
+                _glusterVolPath, volumeName, pidFile)) as f:
+            if psutil.pid_exists(int(f.read().strip())):
+                status = utils.PluginStatusCode.OK
+                brickDevice = storage.getBrickDeviceName(brickPath)
+                disk = storage.getDisksForBrick(brickDevice)
+                cmd = [checkIdeSmartCmdPath.cmd, "-d", disk, "-n"]
+                rc, out, err = utils.execCmd(cmd)
+                if rc == utils.PluginStatusCode.CRITICAL and \
+                   "tests failed" in out[0]:
+                    status = utils.PluginStatusCode.WARNING
+                    msg = "WARNING: Brick %s: %s" % (brickPath, out[0])
+            else:
+                status = utils.PluginStatusCode.CRITICAL
+    except IOError as e:
+        if e.errno == errno.ENOENT:
+            status = utils.PluginStatusCode.CRITICAL
+        else:
+            status = utils.PluginStatusCode.UNKNOWN
+            msg = "UNKNOWN: Brick %s: %s" % (brickPath, str(e))
+    finally:
+        if status == utils.PluginStatusCode.OK:
+            msg = "OK: Brick %s" % brickPath
+        elif status == utils.PluginStatusCode.CRITICAL:
+            msg = "CRITICAL: Brick %s is down" % brickPath
+    return status, msg
+
+
+def getNfsStatus(volInfo):
+    # if nfs is already running we need not to check further
+    status, msg, error = utils.execCmd(_checkNfsCmd)
+    if status == utils.PluginStatusCode.OK:
+        return status, msg[0] if len(msg) > 0 else ""
+
+    # if nfs is not running and any of the volume uses nfs
+    # then its required to alert the user
+    for volume, volumeInfo in volInfo.iteritems():
+        if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE:
+            continue
+        nfsStatus = volumeInfo.get('options', {}).get('nfs.disable', 'off')
+        if nfsStatus == 'off':
+            msg = "CRITICAL: Process glusterfs-nfs is not running"
+            status = utils.PluginStatusCode.CRITICAL
+            break
+    else:
+        msg = "OK: No gluster volume uses nfs"
+        status = utils.PluginStatusCode.OK
+    return status, msg
+
+
+def getCtdbStatus(smbStatus, nfsStatus):
+    if smbStatus != utils.PluginStatusCode.OK and \
+       nfsStatus != utils.PluginStatusCode.OK:
+        return (utils.PluginStatusCode.OK,
+                "CTDB ignored as SMB and NFS are not running")
+
+    status, msg, error = utils.execCmd(_checkCtdbCmd)
+    if status != utils.PluginStatusCode.OK:
+        return utils.PluginStatusCode.UNKNOWN, "CTDB not configured"
+
+    # CTDB, SMB/NFS are running
+    status, msg, error = utils.execCmd(['ctdb', 'nodestatus'])
+    if status == utils.PluginStatusCode.OK:
+        if len(msg) > -1:
+            message = msg[0].split()
+            if len(message) > 1:
+                msg = "Node status: %s" % message[2]
+                if message[2] == 'UNHEALTHY':
+                    status = utils.PluginStatusCode.WARNING
+                elif message[2] in ['DISCONNECTED', 'BANNED', 'INACTIVE']:
+                    status = utils.PluginStatusCode.CRITICAL
+    else:
+        status = utils.PluginStatusCode.UNKNOWN
+    return status, msg
+
+
+def getSmbStatus(volInfo):
+    status, msg, error = utils.execCmd(_checkSmbCmd)
+    if status == utils.PluginStatusCode.OK:
+        return status, msg[0] if len(msg) > 0 else ""
+
+    # if smb is not running and any of the volume uses smb
+    # then its required to alert the user
+    for k, v in volInfo.iteritems():
+        cifsStatus = v.get('options', {}).get('user.cifs', 'enable')
+        smbStatus = v.get('options', {}).get('user.smb', 'enable')
+        if cifsStatus == 'enable' and smbStatus == 'enable':
+            msg = "CRITICAL: Process smb is not running"
+            status = utils.PluginStatusCode.CRITICAL
+            break
+    else:
+        msg = "OK: No gluster volume uses smb"
+        status = utils.PluginStatusCode.OK
+    return status, msg
+
+
+def getQuotadStatus(volInfo):
+    # if quota is already running we need not to check further
+    status, msg, error = utils.execCmd(_checkQuotaCmd)
+    if status == utils.PluginStatusCode.OK:
+        return status, msg[0] if len(msg) > 0 else ""
+
+    # if quota is not running and any of the volume uses quota
+    # then the quotad process should be running in the host
+    for k, v in volInfo.iteritems():
+        quotadStatus = v.get('options', {}).get('features.quota', '')
+        if quotadStatus == 'on':
+            msg = "CRITICAL: Process quotad is not running"
+            utils.PluginStatusCode.CRITICAL
+            break
+    else:
+        msg = "OK: Quota not enabled"
+        status = utils.PluginStatusCode.OK
+    return status, msg
+
+
+def getShdStatus(volInfo):
+    status, msg, error = utils.execCmd(_checkShdCmd)
+    if status == utils.PluginStatusCode.OK:
+        return status, msg[0] if len(msg) > 0 else ""
+
+    hostUuid = glustercli.hostUUIDGet()
+    for volumeName, volumeInfo in volInfo.iteritems():
+        if volumeInfo['volumeStatus'] == glustercli.VolumeStatus.OFFLINE:
+            continue
+        if hasBricks(hostUuid, volumeInfo['bricksInfo']) and \
+           int(volumeInfo['replicaCount']) > 1:
+            status = utils.PluginStatusCode.CRITICAL
+            msg = "CRITICAL: Gluster Self Heal Daemon not running"
+            break
+    else:
+        msg = "OK: Process Gluster Self Heal Daemon"
+        status = utils.PluginStatusCode.OK
+    return status, msg
+
+
+def getGlusterdStatus():
+    status, msg, error = utils.execCmd(_checkGlusterdCmd)
+    msg = msg[0] if len(msg) > 0 else ""
+    return status, msg
+
+
+def hasBricks(hostUuid, bricks):
+    for brick in bricks:
+        if brick['hostUuid'] == hostUuid:
+            return True
+    return False
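
For context, a minimal sketch of how the refactored helpers are meant to be consumed outside the NSCA daemon, mirroring what the new check_gluster_proc_status.py does for "-t SHD". This driver script is illustrative only (it is not part of the change); it uses only functions shown in the diff above:

#!/usr/bin/python
# Illustrative one-shot Nagios check built on the new check_proc_util
# module (Python 2, like the rest of the codebase); not part of the
# change itself.
import check_proc_util
from glusternagios import utils
from glusternagios import glustercli

# Check glusterd first; the per-service checks are only meaningful
# while glusterd itself is running, as the plugin above does.
status, msg = check_proc_util.getGlusterdStatus()
if status == utils.PluginStatusCode.OK:
    # Each helper takes the volume-info dict from glustercli and
    # returns a (Nagios exit code, message) pair.
    status, msg = check_proc_util.getShdStatus(glustercli.volumeInfo())
print msg       # the first line of output is what Nagios displays
exit(status)    # standard Nagios codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN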