diff options
author | Kotresh H R <khiremat@redhat.com> | 2014-05-30 17:03:30 +0530 |
---|---|---|
committer | Venky Shankar <vshankar@redhat.com> | 2014-06-17 21:47:07 -0700 |
commit | b6c8b1b47838228611e1a619890effe8b2dd2ad3 (patch) | |
tree | 8ab49bb31ec6de4d56678097ae726454707dc3e2 | |
parent | 93832829016a0a51a8938c0c89c6bd09b3229c9f (diff) |
feature/geo-rep: Fix for changelog agent becoming zombie.
The monitor process spawns the changelog agent but does not
wait on it, so the agent becomes a zombie. When a worker
dies or is killed, the monitor respawns both the worker and the
corresponding agent, leaving the earlier changelog agent in zombie
state. This patch addresses the issue by waiting
on the agent process in the monitor process.
Change-Id: I571b7d6487133848edca67e7446f1caa70ae01c9
BUG: 1103643
Signed-off-by: Kotresh H R <khiremat@redhat.com>
Reviewed-on: http://review.gluster.org/7956
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Aravinda VK <avishwan@redhat.com>
Reviewed-by: Venky Shankar <vshankar@redhat.com>
Tested-by: Venky Shankar <vshankar@redhat.com>
-rw-r--r-- | geo-replication/syncdaemon/monitor.py | 16 |
1 files changed, 12 insertions, 4 deletions
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py index f3700c1a390..0bde216d761 100644 --- a/geo-replication/syncdaemon/monitor.py +++ b/geo-replication/syncdaemon/monitor.py @@ -189,21 +189,22 @@ class Monitor(object): self.lock.release() os.close(pw) - t0 = time.time() - so = select((pr,), (), (), conn_timeout)[0] - os.close(pr) - # close all RPC pipes in monitor os.close(ra) os.close(wa) os.close(rw) os.close(ww) + t0 = time.time() + so = select((pr,), (), (), conn_timeout)[0] + os.close(pr) + if so: ret = nwait(cpid, os.WNOHANG) if ret is not None: logging.info("worker(%s) died before establishing " "connection" % w[0]) + nwait(apid) #wait for agent else: logging.debug("worker(%s) connected" % w[0]) while time.time() < t0 + conn_timeout: @@ -211,15 +212,20 @@ class Monitor(object): if ret is not None: logging.info("worker(%s) died in startup " "phase" % w[0]) + nwait(apid) #wait for agent break time.sleep(1) else: logging.info("worker(%s) not confirmed in %d sec, " "aborting it" % (w[0], conn_timeout)) os.kill(cpid, signal.SIGKILL) + nwait(apid) #wait for agent ret = nwait(cpid) if ret is None: self.set_state(self.ST_STABLE, w) + #If worker dies, agent terminates on EOF. + #So lets wait for agent first. + nwait(apid) ret = nwait(cpid) if exit_signalled(ret): ret = 0 @@ -249,6 +255,8 @@ class Monitor(object): self.lock.acquire() for cpid in cpids: os.kill(cpid, signal.SIGKILL) + for apid in agents: + os.kill(apid, signal.SIGKILL) self.lock.release() finalize(exval=1) t = Thread(target=wmon, args=[wx]) |