diff options
author | Aravinda VK <avishwan@redhat.com> | 2014-03-19 12:00:44 +0530 |
---|---|---|
committer | Vijay Bellur <vbellur@redhat.com> | 2014-03-24 10:30:32 -0700 |
commit | 853a90f9d7399e4afdb685946a809e9dd30a1b98 (patch) | |
tree | 63eb62d606b49c4e7d408d41d1c29b163811372a | |
parent | 29cbd30acd5d7a66451df09c7b94ff4bbe8b0e18 (diff) |
geo-rep: Fix ValueError - signal only works in main thread
When a worker process was not confirmed within 60 seconds of starting,
the monitor thread was terminated instead of stopping and restarting
the worker.
Before terminating, the monitor thread tries to add a signal handler for
SIGTERM so it can clean up before exiting. Signal handlers can only be
installed from the main thread, so a ValueError was raised.
This patch no longer terminates the monitor thread; instead, it only
kills and restarts the worker.
BUG: 1078068
Change-Id: Icf0df7ef492da636d0d20e42750747e404d897df
Signed-off-by: Aravinda VK <avishwan@redhat.com>
Reviewed-on: http://review.gluster.org/7294
Reviewed-on: http://review.gluster.org/7313
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
-rw-r--r-- | geo-replication/syncdaemon/monitor.py | 15 |
1 files changed, 7 insertions, 8 deletions
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py index 0c3a42fa6b7..b0262ee30a8 100644 --- a/geo-replication/syncdaemon/monitor.py +++ b/geo-replication/syncdaemon/monitor.py @@ -146,20 +146,20 @@ class Monitor(object): if so: ret = nwait(cpid, os.WNOHANG) if ret != None: - logging.debug("worker died before establishing connection") + logging.info("worker(%s) died before establishing " \ + "connection" % w[0]) else: - logging.debug("worker seems to be connected (?? racy check)") + logging.debug("worker(%s) connected" % w[0]) while time.time() < t0 + conn_timeout: ret = nwait(cpid, os.WNOHANG) if ret != None: - logging.debug("worker died in startup phase") + logging.info("worker(%s) died in startup " \ + "phase" % w[0]) break time.sleep(1) else: - logging.debug("worker not confirmed in %d sec, aborting it" % \ - conn_timeout) - self.terminate() - time.sleep(1) + logging.info("worker(%s) not confirmed in %d sec, " \ + "aborting it" % (w[0], conn_timeout)) os.kill(cpid, signal.SIGKILL) ret = nwait(cpid) if ret == None: @@ -188,7 +188,6 @@ class Monitor(object): for wx in wspx: def wmon(w): cpid, _ = self.monitor(w, argv, cpids) - terminate() time.sleep(1) self.lock.acquire() for cpid in cpids: |