diff options
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/gsyncd.py | 5 | ||||
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/master.py | 3 | ||||
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/monitor.py | 22 |
3 files changed, 17 insertions, 13 deletions
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py index 963e7101bf4..a502c200998 100644 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py @@ -252,8 +252,7 @@ def main_i(): ffd = rconf.get('feedback_fd') if ffd: - gconf.feedback_fd = ffd - fcntl.fcntl(int(ffd), fcntl.F_SETFD, fcntl.FD_CLOEXEC) + fcntl.fcntl(ffd, fcntl.F_SETFD, fcntl.FD_CLOEXEC) #normalize loglevel lvl0 = gconf.log_level @@ -295,6 +294,8 @@ def main_i(): # complete remote connection in child remote.connect_remote(go_daemon='done') local.connect() + if ffd: + os.close(ffd) local.service_loop(*[r for r in [remote] if r]) logging.info("exiting.") diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py index 76f924ed37a..35dc4ee06aa 100644 --- a/xlators/features/marker/utils/syncdaemon/master.py +++ b/xlators/features/marker/utils/syncdaemon/master.py @@ -87,9 +87,6 @@ class GMaster(object): self.terminate = False def crawl_loop(self): - ffd = getattr(gconf, 'feedback_fd', None) - if ffd: - os.close(int(ffd)) timo = int(gconf.timeout or 0) if timo > 0: def keep_alive(): diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py index a86acdc7566..365e91435fd 100644 --- a/xlators/features/marker/utils/syncdaemon/monitor.py +++ b/xlators/features/marker/utils/syncdaemon/monitor.py @@ -48,17 +48,23 @@ class Monitor(object): os.execv(sys.executable, argv + ['--feedback-fd', str(pw)]) os.close(pw) t0 = time.time() - select.select((pr,), (), (), conn_timeout) + so = select.select((pr,), (), (), conn_timeout)[0] os.close(pr) - et = time.time() - t0 - if et < conn_timeout: - et2 = conn_timeout - et - logging.debug("worker got connected in %d sec, " - "waiting %d more to make sure it's fine" % (et, et2)) - time.sleep(et2) + if so: ret = nwait(cpid, os.WNOHANG) + if ret != None: + logging.debug("worker died before establishing connection") + else: + logging.debug("worker seems to be connected (?? racy check)") + while time.time() < t0 + conn_timeout: + ret = nwait(cpid, os.WNOHANG) + if ret != None: + logging.debug("worker died in startup phase") + break + time.sleep(1) else: - logging.debug("worker not confirmed in %d sec, aborting it" % et) + logging.debug("worker not confirmed in %d sec, aborting it" % \ + conn_timeout) os.kill(cpid, SIGKILL) ret = nwait(cpid) if ret == None: |