From b6c8b1b47838228611e1a619890effe8b2dd2ad3 Mon Sep 17 00:00:00 2001 From: Kotresh H R Date: Fri, 30 May 2014 17:03:30 +0530 Subject: feature/geo-rep: Fix for changelog agent becoming zombie. Monitor process spawns changelog agent and is not wait on it, hence becoming zombie. When worker is dies/killed, it respawns both worker and corresponding agent leaving the earlier changelog agent in zombie state. This patch addresses this issue by waiting on agent process in montor process. Change-Id: I571b7d6487133848edca67e7446f1caa70ae01c9 BUG: 1103643 Signed-off-by: Kotresh H R Reviewed-on: http://review.gluster.org/7956 Tested-by: Gluster Build System Reviewed-by: Aravinda VK Reviewed-by: Venky Shankar Tested-by: Venky Shankar --- geo-replication/syncdaemon/monitor.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'geo-replication') diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py index f3700c1a390..0bde216d761 100644 --- a/geo-replication/syncdaemon/monitor.py +++ b/geo-replication/syncdaemon/monitor.py @@ -189,21 +189,22 @@ class Monitor(object): self.lock.release() os.close(pw) - t0 = time.time() - so = select((pr,), (), (), conn_timeout)[0] - os.close(pr) - # close all RPC pipes in monitor os.close(ra) os.close(wa) os.close(rw) os.close(ww) + t0 = time.time() + so = select((pr,), (), (), conn_timeout)[0] + os.close(pr) + if so: ret = nwait(cpid, os.WNOHANG) if ret is not None: logging.info("worker(%s) died before establishing " "connection" % w[0]) + nwait(apid) #wait for agent else: logging.debug("worker(%s) connected" % w[0]) while time.time() < t0 + conn_timeout: @@ -211,15 +212,20 @@ class Monitor(object): if ret is not None: logging.info("worker(%s) died in startup " "phase" % w[0]) + nwait(apid) #wait for agent break time.sleep(1) else: logging.info("worker(%s) not confirmed in %d sec, " "aborting it" % (w[0], conn_timeout)) os.kill(cpid, signal.SIGKILL) + nwait(apid) #wait for agent ret = nwait(cpid) if ret is None: self.set_state(self.ST_STABLE, w) + #If worker dies, agent terminates on EOF. + #So lets wait for agent first. + nwait(apid) ret = nwait(cpid) if exit_signalled(ret): ret = 0 @@ -249,6 +255,8 @@ class Monitor(object): self.lock.acquire() for cpid in cpids: os.kill(cpid, signal.SIGKILL) + for apid in agents: + os.kill(apid, signal.SIGKILL) self.lock.release() finalize(exval=1) t = Thread(target=wmon, args=[wx]) -- cgit