From 775323c1b2fa9a557d3ea74d57e843575f7b1278 Mon Sep 17 00:00:00 2001
From: Csaba Henk
Date: Thu, 21 Apr 2011 16:43:49 +0000
Subject: syncdaemon: have the monitor kill the worker if it does not connect in 60 sec

Signed-off-by: Csaba Henk
Signed-off-by: Anand Avati

BUG: 2736 (gsyncd hangs if crash occurs in the non-main thread)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2736
---
 xlators/features/marker/utils/syncdaemon/gsyncd.py | 6 +++++
 xlators/features/marker/utils/syncdaemon/master.py | 3 +++
 .../features/marker/utils/syncdaemon/monitor.py | 28 ++++++++++++++++++----
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
index c52d1a218..60980f546 100644
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py
@@ -140,6 +140,7 @@ def main_i():
     op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local)
     # duh. need to specify dest or value will be mapped to None :S
     op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True))
+    op.add_option('--feedback-fd', dest='feedback_fd', type=int, help=SUPPRESS_HELP, action='callback', callback=store_local)
     op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True))
     op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont'))
     op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a),
@@ -246,6 +247,11 @@ def main_i():
     gconf.__dict__.update(opts.__dict__)
     gconf.configinterface = gcnf
 
+    ffd = rconf.get('feedback_fd')
+    if ffd:
+        gconf.feedback_fd = ffd
+        fcntl.fcntl(int(ffd), fcntl.F_SETFD, fcntl.FD_CLOEXEC)
+
     #normalize loglevel
     lvl0 = gconf.log_level
     if isinstance(lvl0, str):
diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py
index 35dc4ee06..76f924ed3 100644
--- a/xlators/features/marker/utils/syncdaemon/master.py
+++ b/xlators/features/marker/utils/syncdaemon/master.py
@@ -87,6 +87,9 @@ class GMaster(object):
         self.terminate = False
 
     def crawl_loop(self):
+        ffd = getattr(gconf, 'feedback_fd', None)
+        if ffd:
+            os.close(int(ffd))
         timo = int(gconf.timeout or 0)
         if timo > 0:
             def keep_alive():
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
index 5e5d22f4f..a86acdc75 100644
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -2,6 +2,8 @@ import os
 import sys
 import time
 import logging
+import select
+from signal import SIGKILL
 
 from gconf import gconf
 from syncdutils import update_file
@@ -35,12 +37,30 @@ class Monitor(object):
             if os.WIFEXITED(r):
                 return os.WEXITSTATUS(r)
             return 1
+        conn_timeout = 60
         while ret in (0, 1):
-            logging.info('-' * 60)
+            logging.info('-' * conn_timeout)
             logging.info('starting gsyncd worker')
-            cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv)
-            time.sleep(60)
-            ret = nwait(cpid, os.WNOHANG)
+            pr, pw = os.pipe()
+            cpid = os.fork()
+            if cpid == 0:
+                os.close(pr)
+                os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
+            os.close(pw)
+            t0 = time.time()
+            select.select((pr,), (), (), conn_timeout)
+            os.close(pr)
+            et = time.time() - t0
+            if et < conn_timeout:
+                et2 = conn_timeout - et
+                logging.debug("worker got connected in %d sec, "
+                              "waiting %d more to make sure it's fine" % (et, et2))
+                time.sleep(et2)
+                ret = nwait(cpid, os.WNOHANG)
+            else:
+                logging.debug("worker not confirmed in %d sec, aborting it" % et)
+                os.kill(cpid, SIGKILL)
+                ret = nwait(cpid)
             if ret == None:
                 self.set_state('OK')
                 ret = nwait(cpid)
-- 
cgit