| author | Aravinda VK <avishwan@redhat.com> | 2015-03-12 16:07:13 +0530 |
|---|---|---|
| committer | Vijay Bellur <vbellur@redhat.com> | 2015-05-05 02:15:24 -0700 |
| commit | 98b69412e92742e0638ef8bd76223671386f5a39 | |
| tree | 4d02c8989c50c7b219404900bc7beac327b19dca /geo-replication/syncdaemon/monitor.py | |
| parent | e02ac3c28241ff004d6cfbfc03975822146ce5dd | |
geo-rep: Status Enhancements
Discussion on gluster-devel:
http://www.gluster.org/pipermail/gluster-devel/2015-April/044301.html
MASTER NODE - Master Volume node
MASTER VOL - Master Volume name
MASTER BRICK - Master Volume brick
SLAVE USER - Slave user to which the Geo-rep session is established
SLAVE - <SLAVE_NODE>::<SLAVE_VOL> as used in the Geo-rep Create command
SLAVE NODE - Slave node to which the Master worker is connected
STATUS - Worker status (Created, Initializing, Active, Passive, Faulty,
         Paused, Stopped)
CRAWL STATUS - Crawl type (Hybrid Crawl, History Crawl, Changelog Crawl)
LAST_SYNCED - Last synced time (local time in CLI output, UTC in XML output)
ENTRY - Number of Entry operations pending (resets on worker restart)
DATA - Number of Data operations pending (resets on worker restart)
META - Number of Meta operations pending (resets on worker restart)
FAILURES - Number of failures
CHECKPOINT TIME - Time the checkpoint was set (local time in CLI output,
                  UTC in XML output)
CHECKPOINT COMPLETED - Yes/No or N/A
CHECKPOINT COMPLETION TIME - Time the checkpoint completed (local time in
                             CLI output, UTC in XML output)
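For illustration, these columns map onto the output of the geo-replication status command. The volume names, hostnames, and values below are hypothetical, and the `detail` variant's ENTRY, DATA, META, FAILURES, and CHECKPOINT columns are omitted for width:

```
# gluster volume geo-replication mastervol slavehost::slavevol status detail

MASTER NODE  MASTER VOL  MASTER BRICK  SLAVE USER  SLAVE                SLAVE NODE  STATUS   CRAWL STATUS     LAST_SYNCED
--------------------------------------------------------------------------------------------------------------------------------
node1        mastervol   /bricks/b1    root        slavehost::slavevol  slave1      Active   Changelog Crawl  2015-05-05 10:00:00
node2        mastervol   /bricks/b2    root        slavehost::slavevol  slave2      Passive  N/A              N/A
```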
XML output (element skeleton; closing tags and values omitted):

```xml
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<cliOutput>
  <geoRep>
      <volume>
        <name>
        <sessions>
          <session>
             <session_slave>
             <pair>
                <master_node>
                <master_brick>
                <slave_user>
                <slave/>
                <slave_node>
                <status>
                <crawl_status>
                <entry>
                <data>
                <meta>
                <failures>
                <checkpoint_completed>
                <master_node_uuid>
                <last_synced>
                <checkpoint_time>
                <checkpoint_completion_time>
```
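Because the XML element names are fixed, a thin client can pull per-pair fields with the standard library. A minimal sketch, assuming the structure above; the sample document here is hypothetical and abbreviated:

```python
# Sketch: extract per-pair status fields from geo-rep XML output.
import xml.etree.ElementTree as ET

sample = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<cliOutput>
  <geoRep>
    <volume>
      <name>mastervol</name>
      <sessions>
        <session>
          <pair>
            <master_node>node1</master_node>
            <status>Active</status>
            <last_synced>2015-05-05 04:30:00</last_synced>
          </pair>
        </session>
      </sessions>
    </volume>
  </geoRep>
</cliOutput>"""

root = ET.fromstring(sample)
for pair in root.iter("pair"):
    node = pair.findtext("master_node")
    status = pair.findtext("status")
    synced = pair.findtext("last_synced")  # UTC in the XML output
    print("%s %s %s" % (node, status, synced))
```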
BUG: 1212410
Change-Id: I944a6c3c67f1e6d6baf9670b474233bec8f61ea3
Signed-off-by: Aravinda VK <avishwan@redhat.com>
Reviewed-on: http://review.gluster.org/10121
Tested-by: NetBSD Build System
Reviewed-by: Kotresh HR <khiremat@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
Diffstat (limited to 'geo-replication/syncdaemon/monitor.py')
-rw-r--r--  geo-replication/syncdaemon/monitor.py | 58

1 file changed, 19 insertions(+), 39 deletions(-)
```diff
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index 029726c7a5a..ba5c8e32514 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -22,10 +22,12 @@ from errno import EEXIST
 import re
 import random
 from gconf import gconf
-from syncdutils import update_file, select, waitpid
+from syncdutils import select, waitpid
 from syncdutils import set_term_handler, is_host_local, GsyncdError
 from syncdutils import escape, Thread, finalize, memoize
+from gsyncdstatus import GeorepStatus, set_monitor_status
+
 ParseError = XET.ParseError if hasattr(XET, 'ParseError') else SyntaxError
@@ -125,46 +127,22 @@ class Volinfo(object):
     def disperse_count(self):
         return int(self.get('disperseCount')[0].text)

+
 class Monitor(object):

     """class which spawns and manages gsyncd workers"""

     ST_INIT = 'Initializing...'
-    ST_STABLE = 'Stable'
-    ST_FAULTY = 'faulty'
+    ST_STARTED = 'Started'
+    ST_STABLE = 'Active'
+    ST_FAULTY = 'Faulty'
     ST_INCON = 'inconsistent'
     _ST_ORD = [ST_STABLE, ST_INIT, ST_FAULTY, ST_INCON]

     def __init__(self):
         self.lock = Lock()
         self.state = {}
-
-    def set_state(self, state, w=None):
-        """set the state that can be used by external agents
-           like glusterd for status reporting"""
-        computestate = lambda: self.state and self._ST_ORD[
-            max(self._ST_ORD.index(s) for s in self.state.values())]
-        if w:
-            self.lock.acquire()
-            old_state = computestate()
-            self.state[w] = state
-            state = computestate()
-            self.lock.release()
-            if state != old_state:
-                self.set_state(state)
-        else:
-            if getattr(gconf, 'state_file', None):
-                # If previous state is paused, suffix the
-                # new state with '(Paused)'
-                try:
-                    with open(gconf.state_file, "r") as f:
-                        content = f.read()
-                        if "paused" in content.lower():
-                            state = state + '(Paused)'
-                except IOError:
-                    pass
-                logging.info('new state: %s' % state)
-                update_file(gconf.state_file, lambda f: f.write(state + '\n'))
+        self.status = {}

     @staticmethod
     def terminate():
@@ -174,8 +152,7 @@ class Monitor(object):
         # give a chance to graceful exit
         os.kill(-os.getpid(), signal.SIGTERM)

-
-    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host):
+    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
         """the monitor loop

         Basic logic is a blantantly simple blunt heuristics:
@@ -194,8 +171,11 @@ class Monitor(object):
         blown worker blows up on EPIPE if the net goes down,
         due to the keep-alive thread)
         """
+        if not self.status.get(w[0], None):
+            self.status[w[0]] = GeorepStatus(gconf.state_file, w[0])

-        self.set_state(self.ST_INIT, w)
+        set_monitor_status(gconf.state_file, self.ST_STARTED)
+        self.status[w[0]].set_worker_status(self.ST_INIT)
         ret = 0
@@ -310,7 +290,7 @@ class Monitor(object):
                 nwait(apid) #wait for agent
                 ret = nwait(cpid)
             if ret is None:
-                self.set_state(self.ST_STABLE, w)
+                self.status[w[0]].set_worker_status(self.ST_STABLE)
                 #If worker dies, agent terminates on EOF.
                 #So lets wait for agent first.
                 nwait(apid)
@@ -320,12 +300,12 @@ class Monitor(object):
             else:
                 ret = exit_status(ret)
                 if ret in (0, 1):
-                    self.set_state(self.ST_FAULTY, w)
+                    self.status[w[0]].set_worker_status(self.ST_FAULTY)
             time.sleep(10)

-        self.set_state(self.ST_INCON, w)
+        self.status[w[0]].set_worker_status(self.ST_INCON)
         return ret

-    def multiplex(self, wspx, suuid, slave_vol, slave_host):
+    def multiplex(self, wspx, suuid, slave_vol, slave_host, master):
         argv = sys.argv[:]
         for o in ('-N', '--no-daemon', '--monitor'):
             while o in argv:
@@ -339,7 +319,7 @@ class Monitor(object):
         for wx in wspx:
             def wmon(w):
                 cpid, _ = self.monitor(w, argv, cpids, agents, slave_vol,
-                                       slave_host)
+                                       slave_host, master)
                 time.sleep(1)
                 self.lock.acquire()
                 for cpid in cpids:
@@ -401,7 +381,7 @@ def distribute(*resources):
                   for idx, brick in enumerate(mvol.bricks)
                   if is_host_local(brick['host'])]
     logging.info('worker specs: ' + repr(workerspex))
-    return workerspex, suuid, slave_vol, slave_host
+    return workerspex, suuid, slave_vol, slave_host, master

 def monitor(*resources):
```
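The new gsyncdstatus module itself is not part of this diff. As a rough sketch of the interface monitor.py now consumes (a GeorepStatus object per brick, keyed by `w[0]`, plus a module-level set_monitor_status), one might imagine something like the following; the on-disk layout and file naming here are assumptions, and the real module persists far more (crawl status, entry/data/meta counters, checkpoint data):

```python
# Hypothetical sketch of the gsyncdstatus interface used by monitor.py.
import os
import fcntl


def set_monitor_status(status_file, status):
    # Persist the session-level status (e.g. "Started") where glusterd
    # can read it; an exclusive lock guards against concurrent writers.
    with open(status_file, "a+") as f:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
        f.seek(0)
        f.truncate()
        f.write(status + "\n")


class GeorepStatus(object):
    """Per-brick status store; monitor.py keys it by brick path (w[0])."""

    def __init__(self, monitor_status_file, brick):
        # Assumed layout: one status file per brick, kept alongside the
        # monitor status file. The real module's format differs.
        base = os.path.dirname(monitor_status_file)
        name = "brick_%s.status" % brick.strip("/").replace("/", "_")
        self.filename = os.path.join(base, name)

    def set_worker_status(self, status):
        # Record this worker's state (Initializing..., Active, Faulty, ...).
        with open(self.filename, "w") as f:
            f.write(status + "\n")
```

Splitting state out per brick is what lets the CLI report Active/Passive/Faulty for each worker instead of the single aggregated state the removed set_state() computed.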
