| author | Aravinda VK <avishwan@redhat.com> | 2015-03-12 16:07:13 +0530 |
|---|---|---|
| committer | Vijay Bellur <vbellur@redhat.com> | 2015-05-05 02:15:24 -0700 |
| commit | 98b69412e92742e0638ef8bd76223671386f5a39 | |
| tree | 4d02c8989c50c7b219404900bc7beac327b19dca /geo-replication/syncdaemon/monitor.py | |
| parent | e02ac3c28241ff004d6cfbfc03975822146ce5dd | |
geo-rep: Status Enhancements
Discussion on gluster-devel:
http://www.gluster.org/pipermail/gluster-devel/2015-April/044301.html
MASTER NODE - Master Volume node
MASTER VOL - Master Volume name
MASTER BRICK - Master Volume brick
SLAVE USER - Slave user to which the Geo-rep session is established
SLAVE - <SLAVE_NODE>::<SLAVE_VOL> as used in the Geo-rep Create command
SLAVE NODE - Slave node to which the Master worker is connected
STATUS - Worker status (Created, Initializing, Active, Passive, Faulty,
         Paused, Stopped)
CRAWL STATUS - Crawl type (Hybrid Crawl, History Crawl, Changelog Crawl)
LAST_SYNCED - Last synced time (local time in CLI output, UTC in XML output)
ENTRY - Number of Entry operations pending (resets on worker restart)
DATA - Number of Data operations pending (resets on worker restart)
META - Number of Meta operations pending (resets on worker restart)
FAILURES - Number of failures
CHECKPOINT TIME - Time the checkpoint was set (local time in CLI output,
                  UTC in XML output)
CHECKPOINT COMPLETED - Yes/No or N/A
CHECKPOINT COMPLETION TIME - Time the checkpoint completed (local time in
                             CLI output, UTC in XML output)
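For illustration, these columns map onto the output of the geo-replication status command. The volume names, hostnames, and values below are hypothetical, and the `detail` variant's ENTRY, DATA, META, FAILURES, and CHECKPOINT columns are omitted for width:

```
# gluster volume geo-replication mastervol slavehost::slavevol status detail

MASTER NODE  MASTER VOL  MASTER BRICK  SLAVE USER  SLAVE                SLAVE NODE  STATUS   CRAWL STATUS     LAST_SYNCED
--------------------------------------------------------------------------------------------------------------------------------
node1        mastervol   /bricks/b1    root        slavehost::slavevol  slave1      Active   Changelog Crawl  2015-05-05 10:00:00
node2        mastervol   /bricks/b2    root        slavehost::slavevol  slave2      Passive  N/A              N/A
```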
XML output (element skeleton; closing tags and values omitted):

```xml
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<cliOutput>
  <geoRep>
      <volume>
        <name>
        <sessions>
          <session>
             <session_slave>
             <pair>
                <master_node>
                <master_brick>
                <slave_user>
                <slave/>
                <slave_node>
                <status>
                <crawl_status>
                <entry>
                <data>
                <meta>
                <failures>
                <checkpoint_completed>
                <master_node_uuid>
                <last_synced>
                <checkpoint_time>
                <checkpoint_completion_time>
```
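Because the XML element names are fixed, a thin client can pull per-pair fields with the standard library. A minimal sketch, assuming the structure above; the sample document here is hypothetical and abbreviated:

```python
# Sketch: extract per-pair status fields from geo-rep XML output.
import xml.etree.ElementTree as ET

sample = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<cliOutput>
  <geoRep>
    <volume>
      <name>mastervol</name>
      <sessions>
        <session>
          <pair>
            <master_node>node1</master_node>
            <status>Active</status>
            <last_synced>2015-05-05 04:30:00</last_synced>
          </pair>
        </session>
      </sessions>
    </volume>
  </geoRep>
</cliOutput>"""

root = ET.fromstring(sample)
for pair in root.iter("pair"):
    node = pair.findtext("master_node")
    status = pair.findtext("status")
    synced = pair.findtext("last_synced")  # UTC in the XML output
    print("%s %s %s" % (node, status, synced))
```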
BUG: 1212410
Change-Id: I944a6c3c67f1e6d6baf9670b474233bec8f61ea3
Signed-off-by: Aravinda VK <avishwan@redhat.com>
Reviewed-on: http://review.gluster.org/10121
Tested-by: NetBSD Build System
Reviewed-by: Kotresh HR <khiremat@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'geo-replication/syncdaemon/monitor.py')
-rw-r--r--  geo-replication/syncdaemon/monitor.py | 58

1 file changed, 19 insertions(+), 39 deletions(-)
```diff
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index 029726c7a5a..ba5c8e32514 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -22,10 +22,12 @@ from errno import EEXIST
 import re
 import random
 from gconf import gconf
-from syncdutils import update_file, select, waitpid
+from syncdutils import select, waitpid
 from syncdutils import set_term_handler, is_host_local, GsyncdError
 from syncdutils import escape, Thread, finalize, memoize
+from gsyncdstatus import GeorepStatus, set_monitor_status
+
 ParseError = XET.ParseError if hasattr(XET, 'ParseError') else SyntaxError
@@ -125,46 +127,22 @@ class Volinfo(object):
     def disperse_count(self):
         return int(self.get('disperseCount')[0].text)

+
 class Monitor(object):

     """class which spawns and manages gsyncd workers"""

     ST_INIT = 'Initializing...'
-    ST_STABLE = 'Stable'
-    ST_FAULTY = 'faulty'
+    ST_STARTED = 'Started'
+    ST_STABLE = 'Active'
+    ST_FAULTY = 'Faulty'
     ST_INCON = 'inconsistent'
     _ST_ORD = [ST_STABLE, ST_INIT, ST_FAULTY, ST_INCON]

     def __init__(self):
         self.lock = Lock()
         self.state = {}
-
-    def set_state(self, state, w=None):
-        """set the state that can be used by external agents
-           like glusterd for status reporting"""
-        computestate = lambda: self.state and self._ST_ORD[
-            max(self._ST_ORD.index(s) for s in self.state.values())]
-        if w:
-            self.lock.acquire()
-            old_state = computestate()
-            self.state[w] = state
-            state = computestate()
-            self.lock.release()
-            if state != old_state:
-                self.set_state(state)
-        else:
-            if getattr(gconf, 'state_file', None):
-                # If previous state is paused, suffix the
-                # new state with '(Paused)'
-                try:
-                    with open(gconf.state_file, "r") as f:
-                        content = f.read()
-                        if "paused" in content.lower():
-                            state = state + '(Paused)'
-                except IOError:
-                    pass
-                logging.info('new state: %s' % state)
-                update_file(gconf.state_file, lambda f: f.write(state + '\n'))
+        self.status = {}

     @staticmethod
     def terminate():
@@ -174,8 +152,7 @@ class Monitor(object):
         # give a chance to graceful exit
         os.kill(-os.getpid(), signal.SIGTERM)

-
-    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host):
+    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
         """the monitor loop

         Basic logic is a blantantly simple blunt heuristics:
@@ -194,8 +171,11 @@ class Monitor(object):
         blown worker blows up on EPIPE if the net goes down,
         due to the keep-alive thread)
         """
+        if not self.status.get(w[0], None):
+            self.status[w[0]] = GeorepStatus(gconf.state_file, w[0])

-        self.set_state(self.ST_INIT, w)
+        set_monitor_status(gconf.state_file, self.ST_STARTED)
+        self.status[w[0]].set_worker_status(self.ST_INIT)
         ret = 0
@@ -310,7 +290,7 @@ class Monitor(object):
                 nwait(apid) #wait for agent
                 ret = nwait(cpid)
             if ret is None:
-                self.set_state(self.ST_STABLE, w)
+                self.status[w[0]].set_worker_status(self.ST_STABLE)
                 #If worker dies, agent terminates on EOF.
                 #So lets wait for agent first.
                 nwait(apid)
@@ -320,12 +300,12 @@ class Monitor(object):
             else:
                 ret = exit_status(ret)
                 if ret in (0, 1):
-                    self.set_state(self.ST_FAULTY, w)
+                    self.status[w[0]].set_worker_status(self.ST_FAULTY)
             time.sleep(10)

-        self.set_state(self.ST_INCON, w)
+        self.status[w[0]].set_worker_status(self.ST_INCON)
         return ret

-    def multiplex(self, wspx, suuid, slave_vol, slave_host):
+    def multiplex(self, wspx, suuid, slave_vol, slave_host, master):
         argv = sys.argv[:]
         for o in ('-N', '--no-daemon', '--monitor'):
             while o in argv:
@@ -339,7 +319,7 @@ class Monitor(object):
         for wx in wspx:
             def wmon(w):
                 cpid, _ = self.monitor(w, argv, cpids, agents, slave_vol,
-                                       slave_host)
+                                       slave_host, master)
                 time.sleep(1)
                 self.lock.acquire()
                 for cpid in cpids:
@@ -401,7 +381,7 @@ def distribute(*resources):
                   for idx, brick in enumerate(mvol.bricks)
                   if is_host_local(brick['host'])]
     logging.info('worker specs: ' + repr(workerspex))
-    return workerspex, suuid, slave_vol, slave_host
+    return workerspex, suuid, slave_vol, slave_host, master

 def monitor(*resources):
```
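The new gsyncdstatus module itself is not part of this diff. As a rough sketch of the interface monitor.py now consumes (a GeorepStatus object per brick, keyed by `w[0]`, plus a module-level set_monitor_status), one might imagine something like the following; the on-disk layout and file naming here are assumptions, and the real module persists far more (crawl status, entry/data/meta counters, checkpoint data):

```python
# Hypothetical sketch of the gsyncdstatus interface used by monitor.py.
import os
import fcntl


def set_monitor_status(status_file, status):
    # Persist the session-level status (e.g. "Started") where glusterd
    # can read it; an exclusive lock guards against concurrent writers.
    with open(status_file, "a+") as f:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
        f.seek(0)
        f.truncate()
        f.write(status + "\n")


class GeorepStatus(object):
    """Per-brick status store; monitor.py keys it by brick path (w[0])."""

    def __init__(self, monitor_status_file, brick):
        # Assumed layout: one status file per brick, kept alongside the
        # monitor status file. The real module's format differs.
        base = os.path.dirname(monitor_status_file)
        name = "brick_%s.status" % brick.strip("/").replace("/", "_")
        self.filename = os.path.join(base, name)

    def set_worker_status(self, status):
        # Record this worker's state (Initializing..., Active, Faulty, ...).
        with open(self.filename, "w") as f:
            f.write(status + "\n")
```

Splitting state out per brick is what lets the CLI report Active/Passive/Faulty for each worker instead of the single aggregated state the removed set_state() computed.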
