diff options
author | Csaba Henk <csaba@gluster.com> | 2011-04-21 16:43:49 +0000 |
---|---|---|
committer | Anand Avati <avati@gluster.com> | 2011-04-22 04:05:20 -0700 |
commit | 775323c1b2fa9a557d3ea74d57e843575f7b1278 (patch) | |
tree | 58ceec9deac4c3b947a58a025255720abba45223 | |
parent | de809504282731332c9bc0fc7f7da5be34f206ce (diff) |
syncdaemon: have the monitor kill the worker if it does not connect in 60 sec
Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Anand Avati <avati@gluster.com>
BUG: 2736 (gsyncd hangs if crash occurs in the non-main thread)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2736
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/gsyncd.py | 6 | ||||
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/master.py | 3 | ||||
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/monitor.py | 28 |
3 files changed, 33 insertions, 4 deletions
```diff
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
index c52d1a21811..60980f54659 100644
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py
@@ -140,6 +140,7 @@ def main_i():
     op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local)
     # duh. need to specify dest or value will be mapped to None :S
     op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True))
+    op.add_option('--feedback-fd', dest='feedback_fd', type=int, help=SUPPRESS_HELP, action='callback', callback=store_local)
     op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True))
     op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont'))
     op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a),
@@ -246,6 +247,11 @@ def main_i():
     gconf.__dict__.update(opts.__dict__)
     gconf.configinterface = gcnf
 
+    ffd = rconf.get('feedback_fd')
+    if ffd:
+        gconf.feedback_fd = ffd
+        fcntl.fcntl(int(ffd), fcntl.F_SETFD, fcntl.FD_CLOEXEC)
+
     #normalize loglevel
     lvl0 = gconf.log_level
     if isinstance(lvl0, str):
diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py
index 35dc4ee06aa..76f924ed37a 100644
--- a/xlators/features/marker/utils/syncdaemon/master.py
+++ b/xlators/features/marker/utils/syncdaemon/master.py
@@ -87,6 +87,9 @@ class GMaster(object):
         self.terminate = False
 
     def crawl_loop(self):
+        ffd = getattr(gconf, 'feedback_fd', None)
+        if ffd:
+            os.close(int(ffd))
         timo = int(gconf.timeout or 0)
         if timo > 0:
             def keep_alive():
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
index 5e5d22f4f91..a86acdc7566 100644
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -2,6 +2,8 @@ import os
 import sys
 import time
 import logging
+import select
+from signal import SIGKILL
 from gconf import gconf
 from syncdutils import update_file
 
@@ -35,12 +37,30 @@ class Monitor(object):
             if os.WIFEXITED(r):
                 return os.WEXITSTATUS(r)
             return 1
+        conn_timeout = 60
         while ret in (0, 1):
-            logging.info('-' * 60)
+            logging.info('-' * conn_timeout)
             logging.info('starting gsyncd worker')
-            cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv)
-            time.sleep(60)
-            ret = nwait(cpid, os.WNOHANG)
+            pr, pw = os.pipe()
+            cpid = os.fork()
+            if cpid == 0:
+                os.close(pr)
+                os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
+            os.close(pw)
+            t0 = time.time()
+            select.select((pr,), (), (), conn_timeout)
+            os.close(pr)
+            et = time.time() - t0
+            if et < conn_timeout:
+                et2 = conn_timeout - et
+                logging.debug("worker got connected in %d sec, "
+                              "waiting %d more to make sure it's fine" % (et, et2))
+                time.sleep(et2)
+                ret = nwait(cpid, os.WNOHANG)
+            else:
+                logging.debug("worker not confirmed in %d sec, aborting it" % et)
+                os.kill(cpid, SIGKILL)
+                ret = nwait(cpid)
             if ret == None:
                 self.set_state('OK')
                 ret = nwait(cpid)
```