summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
author: Csaba Henk <csaba@gluster.com>, 2011-04-29 06:24:45 +0000
committer: Anand Avati <avati@gluster.com>, 2011-05-09 13:27:03 -0700
commit: 6f2ed82bfcfe494ef273bff55cf69480a7472cc5 (patch)
tree: e911f81c9bcc82628b3dc3e651616349dbc0f6b3 /xlators
parent: 9ffd9c51f3833e04a60373648ba3c69c2ed7a68a (diff)
syncdaemon: some refactor on monitor
- detect faulty state early
- keep the feedback fd in gsyncd module

Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Anand Avati <avati@gluster.com>

BUG: 2537 (gsync autorestart)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537
Diffstat (limited to 'xlators')
-rw-r--r-- xlators/features/marker/utils/syncdaemon/gsyncd.py 5
-rw-r--r-- xlators/features/marker/utils/syncdaemon/master.py 3
-rw-r--r-- xlators/features/marker/utils/syncdaemon/monitor.py 22
3 files changed, 17 insertions, 13 deletions
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
index 963e7101b..a502c2009 100644
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py
@@ -252,8 +252,7 @@ def main_i():
ffd = rconf.get('feedback_fd')
if ffd:
- gconf.feedback_fd = ffd
- fcntl.fcntl(int(ffd), fcntl.F_SETFD, fcntl.FD_CLOEXEC)
+ fcntl.fcntl(ffd, fcntl.F_SETFD, fcntl.FD_CLOEXEC)
#normalize loglevel
lvl0 = gconf.log_level
@@ -295,6 +294,8 @@ def main_i():
# complete remote connection in child
remote.connect_remote(go_daemon='done')
local.connect()
+ if ffd:
+ os.close(ffd)
local.service_loop(*[r for r in [remote] if r])
logging.info("exiting.")
diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py
index 76f924ed3..35dc4ee06 100644
--- a/xlators/features/marker/utils/syncdaemon/master.py
+++ b/xlators/features/marker/utils/syncdaemon/master.py
@@ -87,9 +87,6 @@ class GMaster(object):
self.terminate = False
def crawl_loop(self):
- ffd = getattr(gconf, 'feedback_fd', None)
- if ffd:
- os.close(int(ffd))
timo = int(gconf.timeout or 0)
if timo > 0:
def keep_alive():
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
index a86acdc75..365e91435 100644
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ b/xlators/features/marker/utils/syncdaemon/monitor.py
@@ -48,17 +48,23 @@ class Monitor(object):
os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
os.close(pw)
t0 = time.time()
- select.select((pr,), (), (), conn_timeout)
+ so = select.select((pr,), (), (), conn_timeout)[0]
os.close(pr)
- et = time.time() - t0
- if et < conn_timeout:
- et2 = conn_timeout - et
- logging.debug("worker got connected in %d sec, "
- "waiting %d more to make sure it's fine" % (et, et2))
- time.sleep(et2)
+ if so:
ret = nwait(cpid, os.WNOHANG)
+ if ret != None:
+ logging.debug("worker died before establishing connection")
+ else:
+ logging.debug("worker seems to be connected (?? racy check)")
+ while time.time() < t0 + conn_timeout:
+ ret = nwait(cpid, os.WNOHANG)
+ if ret != None:
+ logging.debug("worker died in startup phase")
+ break
+ time.sleep(1)
else:
- logging.debug("worker not confirmed in %d sec, aborting it" % et)
+ logging.debug("worker not confirmed in %d sec, aborting it" % \
+ conn_timeout)
os.kill(cpid, SIGKILL)
ret = nwait(cpid)
if ret == None: