summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Csaba Henk <csaba@gluster.com> 2011-04-21 16:43:49 +0000
committer Anand Avati <avati@gluster.com> 2011-04-22 04:05:20 -0700
commit 775323c1b2fa9a557d3ea74d57e843575f7b1278 (patch)
tree 58ceec9deac4c3b947a58a025255720abba45223
parent de809504282731332c9bc0fc7f7da5be34f206ce (diff)
syncdaemon: have the monitor kill the worker if it does not connect in 60 sec
Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Anand Avati <avati@gluster.com>
BUG: 2736 (gsyncd hangs if crash occurs in the non-main thread)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2736
-rw-r--r-- xlators/features/marker/utils/syncdaemon/gsyncd.py 6
-rw-r--r-- xlators/features/marker/utils/syncdaemon/master.py 3
-rw-r--r-- xlators/features/marker/utils/syncdaemon/monitor.py 28
3 files changed, 33 insertions, 4 deletions
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
index c52d1a21811..60980f54659 100644
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py
@@ -140,6 +140,7 @@ def main_i():
op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local)
# duh. need to specify dest or value will be mapped to None :S
op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True))
+ op.add_option('--feedback-fd', dest='feedback_fd', type=int, help=SUPPRESS_HELP, action='callback', callback=store_local)
op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True))
op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont'))
op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a),
@@ -246,6 +247,11 @@ def main_i():
gconf.__dict__.update(opts.__dict__)
gconf.configinterface = gcnf
+ ffd = rconf.get('feedback_fd')
+ if ffd:
+ gconf.feedback_fd = ffd
+ fcntl.fcntl(int(ffd), fcntl.F_SETFD, fcntl.FD_CLOEXEC)
+
#normalize loglevel
lvl0 = gconf.log_level
if isinstance(lvl0, str):
diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py
index 35dc4ee06aa..76f924ed37a 100644
--- a/xlators/features/marker/utils/syncdaemon/master.py
+++ b/xlators/features/marker/utils/syncdaemon/master.py
@@ -87,6 +87,9 @@ class GMaster(object):
self.terminate = False
def crawl_loop(self):
+ ffd = getattr(gconf, 'feedback_fd', None)
+ if ffd:
+ os.close(int(ffd))
timo = int(gconf.timeout or 0)
if timo > 0:
def keep_alive():
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
index 5e5d22f4f91..a86acdc7566 100644
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -2,6 +2,8 @@ import os
import sys
import time
import logging
+import select
+from signal import SIGKILL
from gconf import gconf
from syncdutils import update_file
@@ -35,12 +37,30 @@ class Monitor(object):
if os.WIFEXITED(r):
return os.WEXITSTATUS(r)
return 1
+ conn_timeout = 60
while ret in (0, 1):
- logging.info('-' * 60)
+ logging.info('-' * conn_timeout)
logging.info('starting gsyncd worker')
- cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv)
- time.sleep(60)
- ret = nwait(cpid, os.WNOHANG)
+ pr, pw = os.pipe()
+ cpid = os.fork()
+ if cpid == 0:
+ os.close(pr)
+ os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
+ os.close(pw)
+ t0 = time.time()
+ select.select((pr,), (), (), conn_timeout)
+ os.close(pr)
+ et = time.time() - t0
+ if et < conn_timeout:
+ et2 = conn_timeout - et
+ logging.debug("worker got connected in %d sec, "
+ "waiting %d more to make sure it's fine" % (et, et2))
+ time.sleep(et2)
+ ret = nwait(cpid, os.WNOHANG)
+ else:
+ logging.debug("worker not confirmed in %d sec, aborting it" % et)
+ os.kill(cpid, SIGKILL)
+ ret = nwait(cpid)
if ret == None:
self.set_state('OK')
ret = nwait(cpid)