summaryrefslogtreecommitdiffstats
path: root/geo-replication/syncdaemon
diff options
context:
space:
mode:
author: Kotresh H R <khiremat@redhat.com> 2014-05-30 17:03:30 +0530
committer: Venky Shankar <vshankar@redhat.com> 2014-06-17 21:47:07 -0700
commit: b6c8b1b47838228611e1a619890effe8b2dd2ad3 (patch)
tree: 8ab49bb31ec6de4d56678097ae726454707dc3e2 /geo-replication/syncdaemon
parent: 93832829016a0a51a8938c0c89c6bd09b3229c9f (diff)
feature/geo-rep: Fix for changelog agent becoming zombie.
The monitor process spawns the changelog agent but does not wait on it, so the agent becomes a zombie. When a worker dies or is killed, the monitor respawns both the worker and its corresponding agent, leaving the earlier changelog agent in a zombie state. This patch addresses the issue by waiting on the agent process in the monitor process. Change-Id: I571b7d6487133848edca67e7446f1caa70ae01c9 BUG: 1103643 Signed-off-by: Kotresh H R <khiremat@redhat.com> Reviewed-on: http://review.gluster.org/7956 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Aravinda VK <avishwan@redhat.com> Reviewed-by: Venky Shankar <vshankar@redhat.com> Tested-by: Venky Shankar <vshankar@redhat.com>
Diffstat (limited to 'geo-replication/syncdaemon')
-rw-r--r--geo-replication/syncdaemon/monitor.py16
1 files changed, 12 insertions, 4 deletions
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index f3700c1a390..0bde216d761 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -189,21 +189,22 @@ class Monitor(object):
self.lock.release()
os.close(pw)
- t0 = time.time()
- so = select((pr,), (), (), conn_timeout)[0]
- os.close(pr)
-
# close all RPC pipes in monitor
os.close(ra)
os.close(wa)
os.close(rw)
os.close(ww)
+ t0 = time.time()
+ so = select((pr,), (), (), conn_timeout)[0]
+ os.close(pr)
+
if so:
ret = nwait(cpid, os.WNOHANG)
if ret is not None:
logging.info("worker(%s) died before establishing "
"connection" % w[0])
+ nwait(apid) #wait for agent
else:
logging.debug("worker(%s) connected" % w[0])
while time.time() < t0 + conn_timeout:
@@ -211,15 +212,20 @@ class Monitor(object):
if ret is not None:
logging.info("worker(%s) died in startup "
"phase" % w[0])
+ nwait(apid) #wait for agent
break
time.sleep(1)
else:
logging.info("worker(%s) not confirmed in %d sec, "
"aborting it" % (w[0], conn_timeout))
os.kill(cpid, signal.SIGKILL)
+ nwait(apid) #wait for agent
ret = nwait(cpid)
if ret is None:
self.set_state(self.ST_STABLE, w)
+ #If worker dies, agent terminates on EOF.
+ #So lets wait for agent first.
+ nwait(apid)
ret = nwait(cpid)
if exit_signalled(ret):
ret = 0
@@ -249,6 +255,8 @@ class Monitor(object):
self.lock.acquire()
for cpid in cpids:
os.kill(cpid, signal.SIGKILL)
+ for apid in agents:
+ os.kill(apid, signal.SIGKILL)
self.lock.release()
finalize(exval=1)
t = Thread(target=wmon, args=[wx])