 geo-replication/syncdaemon/gsyncdstatus.py |  1 +
 geo-replication/syncdaemon/monitor.py      | 18 ++++++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/geo-replication/syncdaemon/gsyncdstatus.py b/geo-replication/syncdaemon/gsyncdstatus.py
index e75f9dcd62c..e93e0519890 100644
--- a/geo-replication/syncdaemon/gsyncdstatus.py
+++ b/geo-replication/syncdaemon/gsyncdstatus.py
@@ -100,6 +100,7 @@ class LockedOpen(object):
         return f
 
     def __exit__(self, _exc_type, _exc_value, _traceback):
+        fcntl.flock(self.fileobj, fcntl.LOCK_UN)
         self.fileobj.close()
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index 40818427bfe..047efa75346 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -105,10 +105,6 @@ class Monitor(object):
                                                     master,
                                                     "%s::%s" % (slave_host,
                                                                 slave_vol))
-
-        set_monitor_status(gconf.get("state-file"), self.ST_STARTED)
-        self.status[w[0]['dir']].set_worker_status(self.ST_INIT)
-
         ret = 0
 
         def nwait(p, o=0):
@@ -153,6 +149,7 @@ class Monitor(object):
 
             # Spawn the worker and agent in lock to avoid fd leak
             self.lock.acquire()
+            self.status[w[0]['dir']].set_worker_status(self.ST_INIT)
             logging.info(lf('starting gsyncd worker',
                             brick=w[0]['dir'],
                             slave_node=remote_host))
@@ -349,6 +346,19 @@ class Monitor(object):
             t = Thread(target=wmon, args=[wx])
             t.start()
             ta.append(t)
+
+        # Monitor status was being updated in each monitor thread. It
+        # should not be done as it can cause a deadlock for a worker start.
+        # set_monitor_status uses flock to synchronize multiple instances
+        # updating the file. Since each monitor thread forks worker and
+        # agent, these processes can hold the reference to the fd of the
+        # status file, causing a deadlock for workers which start later, as
+        # flock will not be released until all references to the same fd
+        # are closed. It will also cause fd leaks.
+
+        self.lock.acquire()
+        set_monitor_status(gconf.get("state-file"), self.ST_STARTED)
+        self.lock.release()
         for t in ta:
             t.join()
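
The comment added to monitor.py hinges on a detail of flock(2) semantics: the lock belongs to the open file description, so a child forked while the lock is held keeps it alive until every inherited reference to that fd is closed. The standalone sketch below is not part of the patch; the path /tmp/flock-demo.status and both helper names are made up for illustration. It reproduces the scenario on a POSIX system: a second attempt to lock the status file blocks until the forked child exits.

import fcntl
import os
import time

STATUS_FILE = "/tmp/flock-demo.status"   # hypothetical path, demo only


def update_status(value):
    # Mimics a set_monitor_status-style helper: take an exclusive flock,
    # write, and let close() drop the lock when no other fd references it.
    with open(STATUS_FILE, "w") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        f.write(value)


def fork_worker_while_locked():
    # Take the lock, then fork a long-lived "worker" that inherits the fd.
    f = open(STATUS_FILE, "w")
    fcntl.flock(f, fcntl.LOCK_EX)
    pid = os.fork()
    if pid == 0:
        # Child: keeps running (and keeps the inherited fd open), like a
        # forked worker/agent process would.
        time.sleep(10)
        os._exit(0)
    # Parent closes its copy of the fd, but the lock survives through the
    # child's inherited copy of the same open file description.
    f.close()
    return pid


if __name__ == "__main__":
    open(STATUS_FILE, "w").close()
    fork_worker_while_locked()
    # Blocks for ~10 seconds, until the child exits and its fd is closed:
    # the situation the monitor.py comment describes for later workers.
    update_status("Started")
    print("status updated, lock finally released")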

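The one-line change in gsyncdstatus.py follows from the same semantics: closing the parent's fd does not release the lock while a forked child still holds a duplicate, whereas an explicit LOCK_UN drops it immediately. Below is a minimal sketch of a LockedOpen-style context manager, assuming simplified __enter__/__exit__ logic rather than the real gsyncd implementation, to show where the explicit unlock fits.

import fcntl


class LockedOpenSketch(object):
    """Open a file and hold an exclusive flock for the 'with' block."""

    def __init__(self, filename, *open_args, **open_kwargs):
        self.filename = filename
        self.open_args = open_args
        self.open_kwargs = open_kwargs
        self.fileobj = None

    def __enter__(self):
        f = open(self.filename, *self.open_args, **self.open_kwargs)
        fcntl.flock(f, fcntl.LOCK_EX)
        self.fileobj = f
        return f

    def __exit__(self, _exc_type, _exc_value, _traceback):
        # Unlock explicitly first: this releases the lock even if a forked
        # child still holds a duplicate of the fd; then close our reference.
        fcntl.flock(self.fileobj, fcntl.LOCK_UN)
        self.fileobj.close()


if __name__ == "__main__":
    with LockedOpenSketch("/tmp/locked-open-demo", "w") as f:
        f.write("Started\n")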