summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Csaba Henk <csaba@gluster.com> 2011-05-31 11:06:54 +0000
committer: Anand Avati <avati@gluster.com> 2011-05-31 10:01:41 -0700
commit: a2b30a1848ee69854c1de87cd1e3d1d74e96a964 (patch)
tree: 7e95edf1b1c8af125be4f04f8ed8b96ac99848b1
parent: e58cb9aa2f21b6453f9533a174fae0cbfda1e98c (diff)
syncdaemon: some refactor on monitor
- detect faulty state early
- keep the feedback fd in gsyncd module

Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Anand Avati <avati@gluster.com>

BUG: 2537 (gsync autorestart)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537

BUG: 2537 (gsync autorestart)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537
-rw-r--r--  xlators/features/marker/utils/syncdaemon/gsyncd.py  | 5
-rw-r--r--  xlators/features/marker/utils/syncdaemon/master.py  | 3
-rw-r--r--  xlators/features/marker/utils/syncdaemon/monitor.py | 22
3 files changed, 17 insertions, 13 deletions
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
index 60980f54659..193af9d5f37 100644
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py
@@ -249,8 +249,7 @@ def main_i():
ffd = rconf.get('feedback_fd')
if ffd:
- gconf.feedback_fd = ffd
- fcntl.fcntl(int(ffd), fcntl.F_SETFD, fcntl.FD_CLOEXEC)
+ fcntl.fcntl(ffd, fcntl.F_SETFD, fcntl.FD_CLOEXEC)
#normalize loglevel
lvl0 = gconf.log_level
@@ -292,6 +291,8 @@ def main_i():
# complete remote connection in child
remote.connect_remote(go_daemon='done')
local.connect()
+ if ffd:
+ os.close(ffd)
local.service_loop(*[r for r in [remote] if r])
logging.info("exiting.")
diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py
index 76f924ed37a..35dc4ee06aa 100644
--- a/xlators/features/marker/utils/syncdaemon/master.py
+++ b/xlators/features/marker/utils/syncdaemon/master.py
@@ -87,9 +87,6 @@ class GMaster(object):
self.terminate = False
def crawl_loop(self):
- ffd = getattr(gconf, 'feedback_fd', None)
- if ffd:
- os.close(int(ffd))
timo = int(gconf.timeout or 0)
if timo > 0:
def keep_alive():
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
index a86acdc7566..365e91435fd 100644
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -48,17 +48,23 @@ class Monitor(object):
os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
os.close(pw)
t0 = time.time()
- select.select((pr,), (), (), conn_timeout)
+ so = select.select((pr,), (), (), conn_timeout)[0]
os.close(pr)
- et = time.time() - t0
- if et < conn_timeout:
- et2 = conn_timeout - et
- logging.debug("worker got connected in %d sec, "
- "waiting %d more to make sure it's fine" % (et, et2))
- time.sleep(et2)
+ if so:
ret = nwait(cpid, os.WNOHANG)
+ if ret != None:
+ logging.debug("worker died before establishing connection")
+ else:
+ logging.debug("worker seems to be connected (?? racy check)")
+ while time.time() < t0 + conn_timeout:
+ ret = nwait(cpid, os.WNOHANG)
+ if ret != None:
+ logging.debug("worker died in startup phase")
+ break
+ time.sleep(1)
else:
- logging.debug("worker not confirmed in %d sec, aborting it" % et)
+ logging.debug("worker not confirmed in %d sec, aborting it" % \
+ conn_timeout)
os.kill(cpid, SIGKILL)
ret = nwait(cpid)
if ret == None: