diff options
-rw-r--r-- | geo-replication/syncdaemon/gsyncdstatus.py | 1 | ||||
-rw-r--r-- | geo-replication/syncdaemon/monitor.py | 18 |
2 files changed, 15 insertions, 4 deletions
diff --git a/geo-replication/syncdaemon/gsyncdstatus.py b/geo-replication/syncdaemon/gsyncdstatus.py index e8a810f4b38..87fa09c070c 100644 --- a/geo-replication/syncdaemon/gsyncdstatus.py +++ b/geo-replication/syncdaemon/gsyncdstatus.py @@ -103,6 +103,7 @@ class LockedOpen(object): return f def __exit__(self, _exc_type, _exc_value, _traceback): + fcntl.flock(self.fileobj, fcntl.LOCK_UN) self.fileobj.close() diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py index 52ae256fb14..97274f32422 100644 --- a/geo-replication/syncdaemon/monitor.py +++ b/geo-replication/syncdaemon/monitor.py @@ -105,10 +105,6 @@ class Monitor(object): master, "%s::%s" % (slave_host, slave_vol)) - - set_monitor_status(gconf.get("state-file"), self.ST_STARTED) - self.status[w[0]['dir']].set_worker_status(self.ST_INIT) - ret = 0 def nwait(p, o=0): @@ -153,6 +149,7 @@ class Monitor(object): # Spawn the worker and agent in lock to avoid fd leak self.lock.acquire() + self.status[w[0]['dir']].set_worker_status(self.ST_INIT) logging.info(lf('starting gsyncd worker', brick=w[0]['dir'], slave_node=remote_host)) @@ -349,6 +346,19 @@ class Monitor(object): t = Thread(target=wmon, args=[wx]) t.start() ta.append(t) + + # monitor status was being updated in each monitor thread. It + # should not be done as it can cause deadlock for a worker start. + # set_monitor_status uses flock to synchronize multiple instances + # updating the file. Since each monitor thread forks worker and + # agent, these processes can hold the reference to fd of status + # file causing deadlock to workers which start later as flock + # will not be released until all references to the same fd are closed. + # It will also cause fd leaks. + + self.lock.acquire() + set_monitor_status(gconf.get("state-file"), self.ST_STARTED) + self.lock.release() for t in ta: t.join() |