diff options
author | Kotresh HR <khiremat@redhat.com> | 2017-04-04 15:39:46 -0400 |
---|---|---|
committer | Aravinda VK <avishwan@redhat.com> | 2017-04-07 02:09:34 -0400 |
commit | e01025973c73e2bd0eda8cfed22b75617305d740 (patch) | |
tree | 9afdb57aaf3474a54bc222c657ed3de00f40cf4c /geo-replication | |
parent | cbcb1d33de8c4bd7250a5c038e8f95456772add1 (diff) |
geo-rep: Improve worker log messages
The monitor process expects the worker to establish an SSH tunnel to the slave
node and mount the master volume locally within 60 secs, and to acknowledge the
monitor process by closing the feedback fd. If something goes wrong and the worker
does not close the feedback fd within 60 secs, the monitor kills the worker.
But there was no clue in the log messages about the actual issue. This patch
adds log messages that indicate whether the worker hung during SSH connection
setup or during the master volume mount.
Change-Id: Id08a12fa6f3bba1d4fe8036728dbc290e6c14c8c
BUG: 1261689
Signed-off-by: Kotresh HR <khiremat@redhat.com>
Reviewed-on: https://review.gluster.org/16997
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Aravinda VK <avishwan@redhat.com>
Diffstat (limited to 'geo-replication')
-rw-r--r-- | geo-replication/syncdaemon/gsyncd.py | 1 | ||||
-rw-r--r-- | geo-replication/syncdaemon/monitor.py | 9 | ||||
-rw-r--r-- | geo-replication/syncdaemon/resource.py | 8 |
3 files changed, 16 insertions, 2 deletions
diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py index adc48f146a6..ac39a79128b 100644 --- a/geo-replication/syncdaemon/gsyncd.py +++ b/geo-replication/syncdaemon/gsyncd.py @@ -777,6 +777,7 @@ def main_i(): remote.connect_remote(go_daemon='done') local.connect() if ffd: + logging.info ("Closing feedback fd, waking up the monitor") os.close(ffd) local.service_loop(*[r for r in [remote] if r]) diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py index d23d4542fd6..c54c07d600c 100644 --- a/geo-replication/syncdaemon/monitor.py +++ b/geo-replication/syncdaemon/monitor.py @@ -369,8 +369,13 @@ class Monitor(object): time.sleep(1) else: - logging.info("worker(%s) not confirmed in %d sec, " - "aborting it" % (w[0]['dir'], conn_timeout)) + logging.info("worker(%s) not confirmed in %d sec, aborting it. " + "Gsyncd invocation on remote slave via SSH or " + "gluster master mount might have hung. Please " + "check the above logs for exact issue and check " + "master or slave volume for errors. Restarting " + "master/slave volume accordingly might help." 
+ % (w[0]['dir'], conn_timeout)) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(apid) # wait for agent ret = nwait(cpid) diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py index 0e718b28344..eb295ad8601 100644 --- a/geo-replication/syncdaemon/resource.py +++ b/geo-replication/syncdaemon/resource.py @@ -1452,6 +1452,8 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): with given backend """ + logging.info ("Mounting gluster volume locally...") + t0 = time.time() label = getattr(gconf, 'mountbroker', None) if not label and not privileged(): label = syncdutils.getusername() @@ -1462,6 +1464,8 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): ['log-file=' + gconf.gluster_log_file, 'volfile-server=' + self.host, 'volfile-id=' + self.volume, 'client-pid=-1'] mounter(params).inhibit(*[l for l in [label] if l]) + logging.info ("Mounted gluster volume. Time taken: {0:.4f} " + "secs".format((time.time() - t0))) def connect_remote(self, *a, **kw): sup(self, *a, **kw) @@ -1723,10 +1727,14 @@ class SSH(AbstractUrl, SlaveRemote): self.inner_rsc.url) deferred = go_daemon == 'postconn' + logging.info ("Initializing SSH connection between master and slave...") + t0 = time.time() ret = sup(self, gconf.ssh_command.split() + ["-p", str(gconf.ssh_port)] + gconf.ssh_ctl_args + [self.remote_addr], slave=self.inner_rsc.url, deferred=deferred) + logging.info ("SSH connection between master and slave established. " + "Time taken: {0:.4f} secs".format((time.time() - t0))) if deferred: # send a message to peer so that we can wait for |