feature/geo-rep: Fix for changelog agent becoming zombie.

The monitor process spawns the changelog agent but does not wait on it, so the agent becomes a zombie. When a worker dies or is killed, the monitor respawns both the worker and its corresponding agent, leaving the earlier changelog agent in zombie state. This patch addresses the issue by waiting on the agent process in the monitor process. Change-Id: I571b7d6487133848edca67e7446f1caa70ae01c9 BUG: 1103643 Signed-off-by: Kotresh H R <khiremat@redhat.com> Reviewed-on: http://review.gluster.org/7956 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Aravinda VK <avishwan@redhat.com> Reviewed-by: Venky Shankar <vshankar@redhat.com> Tested-by: Venky Shankar <vshankar@redhat.com>
author: Kotresh H R <khiremat@redhat.com> 2014-05-30 17:03:30 +0530
committer: Venky Shankar <vshankar@redhat.com> 2014-06-17 21:47:07 -0700
commit: b6c8b1b47838228611e1a619890effe8b2dd2ad3 (patch)
tree: 8ab49bb31ec6de4d56678097ae726454707dc3e2 /geo-replication
parent: 93832829016a0a51a8938c0c89c6bd09b3229c9f (diff)
1 files changed, 12 insertions, 4 deletions
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index f3700c1a390..0bde216d761 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -189,21 +189,22 @@ class Monitor(object):
             self.lock.release()
             os.close(pw)
 
-            t0 = time.time()
-            so = select((pr,), (), (), conn_timeout)[0]
-            os.close(pr)
-
             # close all RPC pipes in monitor
             os.close(ra)
             os.close(wa)
             os.close(rw)
             os.close(ww)
 
+            t0 = time.time()
+            so = select((pr,), (), (), conn_timeout)[0]
+            os.close(pr)
+
             if so:
                 ret = nwait(cpid, os.WNOHANG)
                 if ret is not None:
                     logging.info("worker(%s) died before establishing "
                                  "connection" % w[0])
+                    nwait(apid) #wait for agent
                 else:
                     logging.debug("worker(%s) connected" % w[0])
                     while time.time() < t0 + conn_timeout:
@@ -211,15 +212,20 @@ class Monitor(object):
                         if ret is not None:
                             logging.info("worker(%s) died in startup "
                                          "phase" % w[0])
+                            nwait(apid) #wait for agent
                             break
                         time.sleep(1)
             else:
                 logging.info("worker(%s) not confirmed in %d sec, "
                              "aborting it" % (w[0], conn_timeout))
                 os.kill(cpid, signal.SIGKILL)
+                nwait(apid) #wait for agent
                 ret = nwait(cpid)
             if ret is None:
                 self.set_state(self.ST_STABLE, w)
+                #If worker dies, agent terminates on EOF.
+                #So lets wait for agent first.
+                nwait(apid)
                 ret = nwait(cpid)
             if exit_signalled(ret):
                 ret = 0
@@ -249,6 +255,8 @@ class Monitor(object):
                 self.lock.acquire()
                 for cpid in cpids:
                     os.kill(cpid, signal.SIGKILL)
+                for apid in agents:
+                    os.kill(apid, signal.SIGKILL)
                 self.lock.release()
                 finalize(exval=1)
             t = Thread(target=wmon, args=[wx])
author	Kotresh H R <khiremat@redhat.com>	2014-05-30 17:03:30 +0530
committer	Venky Shankar <vshankar@redhat.com>	2014-06-17 21:47:07 -0700
commit	b6c8b1b47838228611e1a619890effe8b2dd2ad3 (patch)
tree	8ab49bb31ec6de4d56678097ae726454707dc3e2 /geo-replication
parent	93832829016a0a51a8938c0c89c6bd09b3229c9f (diff)