diff options
| field | value | date |
|---|---|---|
| author | Csaba Henk <csaba@lowlife.hu> | 2011-04-02 19:40:49 +0000 |
| committer | Vijay Bellur <vijay@dev.gluster.com> | 2011-04-04 08:02:27 -0700 |
| commit | 01b3dff29adee2041b0ef1b374eda8c88fb07678 (patch) | |
| tree | c8f6c7eabb962c97f3e88add716eda429e2c3567 | |
| parent | e77c35248e8ce796bc5b108c10013089a0c65bde (diff) | |
syncdaemon: add monitor mode to support autorestart
Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Vijay Bellur <vijay@dev.gluster.com>
BUG: 2537 (gsync autorestart)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | cli/src/cli-rpc-ops.c | 76 |
| -rw-r--r-- | xlators/features/marker/utils/syncdaemon/Makefile.am | 2 |
| -rw-r--r-- | xlators/features/marker/utils/syncdaemon/gsyncd.py | 36 |
| -rw-r--r-- | xlators/features/marker/utils/syncdaemon/master.py | 21 |
| -rw-r--r-- | xlators/features/marker/utils/syncdaemon/monitor.py | 54 |
| -rw-r--r-- | xlators/features/marker/utils/syncdaemon/syncdutils.py | 8 |
| -rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-op-sm.c | 18 |

7 files changed, 174 insertions, 41 deletions
| diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 6d47374a0c1..110962c6c29 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -2603,15 +2603,30 @@ out:  }  int -gf_cli3_1_gsync_get_pid_file (char *pidfolder, char *pidfile, char *master, char *slave, char *gl_workdir) +gf_cli3_1_gsync_get_param_file (char *prmfile, const char *ext, char *master, char *slave, char *gl_workdir)  {          FILE               *in = NULL;          char                buff[PATH_MAX] = {0, };          char                cmd[PATH_MAX] = {0, };          char               *ptr = NULL; -        char                buffer[PATH_MAX] = {0, }; +        char                pidfolder[PATH_MAX] = {0, }; +        char               *dotp = NULL;          int                 ret = 0; +        if (!(master && slave && gl_workdir)) { +                GF_ASSERT (!master && !slave && !gl_workdir); +                /* extension adjustment mode */ + +                dotp = strrchr (prmfile, '.'); +                if (!dotp++ || +                    /* overflow */ +                    dotp - prmfile + strlen (ext) + 1 > PATH_MAX) +                        return -1; + +                strcpy (dotp, ext); +                return 0; +        } +          snprintf (cmd, PATH_MAX, GSYNCD_PREFIX"/gsyncd --canonicalize-escape-url"                                       " %s %s", master, slave);          if (!(in = popen(cmd, "r"))) { @@ -2622,21 +2637,18 @@ gf_cli3_1_gsync_get_pid_file (char *pidfolder, char *pidfile, char *master, char          ptr = fgets(buff, sizeof(buff), in);          if (ptr) {                  buff[strlen(buff)-1]='\0'; //strip off \n -                snprintf (buffer, PATH_MAX, "%s/gsync/%s", gl_workdir, buff); -                strncpy (pidfolder, buffer, PATH_MAX); +                snprintf (pidfolder, PATH_MAX, "%s/gsync/%s", gl_workdir, buff);          } else {                  ret = -1;                  goto out;          }          memset (buff, 0, 
PATH_MAX); -        memset (buffer, 0, PATH_MAX);          ptr = fgets(buff, sizeof(buff), in);          if (ptr) {                  buff[strlen(buff)-1]='\0'; //strip off \n -                snprintf (buffer, PATH_MAX, "%s/%s.pid", pidfolder, buff); -                strncpy (pidfile, buffer, PATH_MAX); +                snprintf (prmfile, PATH_MAX, "%s/%s.pid", pidfolder, buff);          }   out: @@ -2684,19 +2696,19 @@ gf_cli3_1_start_gsync (char *master, char *slave, char *gl_workdir)          int32_t         ret     = -1;          int32_t         status  = 0;          char            cmd[PATH_MAX] = {0,}; -        char            pidfile[PATH_MAX] = {0,}; -        char            pidfolder[PATH_MAX] = {0,}; +        char            prmfile[PATH_MAX] = {0,}; +        char            *tslash = NULL; -        ret = gf_cli3_1_gsync_get_pid_file (pidfolder, pidfile, master, -                                               slave, gl_workdir); +        ret = gf_cli3_1_gsync_get_param_file (prmfile, "pid", master, +                                              slave, gl_workdir);          if (ret == -1) {                  ret = -1;                  gf_log ("", GF_LOG_WARNING, "failed to construct the " -                        "pidfile string"); +                        "prmfile string");                  goto out;          } -        ret = gf_cli3_1_gsync_status (master, slave, pidfile, &status); +        ret = gf_cli3_1_gsync_status (master, slave, prmfile, &status);          if ((ret == 0 && status == 0)) {                  gf_log ("", GF_LOG_WARNING, "gsync %s:%s"                          "already started", master, slave); @@ -2707,19 +2719,24 @@ gf_cli3_1_start_gsync (char *master, char *slave, char *gl_workdir)                  goto out;          } -        unlink (pidfile); +        unlink (prmfile); -        ret = mkdir (pidfolder, 0777); -        if (ret && (errno != EEXIST)) { -                gf_log ("", GF_LOG_DEBUG, "mkdir failed, errno: %d", -               
         errno); -                goto out; +        tslash = strrchr(prmfile, '/'); +        if (tslash) { +                *tslash = '\0'; +                ret = mkdir (prmfile, 0777); +                if (ret && (errno != EEXIST)) { +                        gf_log ("", GF_LOG_DEBUG, "mkdir failed, errno: %d", +                                errno); +                        goto out; +                } +                *tslash = '/';          }          memset (cmd, 0, sizeof (cmd));          ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd -c %s/%s %s %s"                                         " --config-set pid-file %s", gl_workdir, -                                       GSYNC_CONF, master, slave, pidfile); +                                       GSYNC_CONF, master, slave, prmfile);          if (ret <= 0) {                  ret = -1;                  gf_log ("", GF_LOG_WARNING, "failed to construct the  " @@ -2728,14 +2745,29 @@ gf_cli3_1_start_gsync (char *master, char *slave, char *gl_workdir)          }          ret = system (cmd); -        if (ret == -1) { +        if (ret) {                  gf_log ("", GF_LOG_WARNING, "failed to set the pid "                          "option for %s %s", master, slave);                  goto out;          } +        ret = gf_cli3_1_gsync_get_param_file (prmfile, "status", NULL, NULL, NULL); +        if (ret != -1) +                ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd -c %s/%s %s %s" +                                " --config-set state-file %s", gl_workdir, +                                GSYNC_CONF, master, slave, prmfile); +        if (ret >= PATH_MAX) +                ret = -1; +        if (ret != -1) +                ret = system (cmd) ? 
-1 : 0; +        if (ret == -1) { +                gf_log ("", GF_LOG_WARNING, "failed to set status file " +                        "for %s %s", master, slave); +                goto out; +        } +          memset (cmd, 0, sizeof (cmd)); -        ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd -c %s/%s %s %s" +        ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd --monitor -c %s/%s %s %s"                                         , gl_workdir, GSYNC_CONF, master, slave);          if (ret <= 0) {                  ret = -1; diff --git a/xlators/features/marker/utils/syncdaemon/Makefile.am b/xlators/features/marker/utils/syncdaemon/Makefile.am index 03ac9762541..c900fa93260 100644 --- a/xlators/features/marker/utils/syncdaemon/Makefile.am +++ b/xlators/features/marker/utils/syncdaemon/Makefile.am @@ -1,5 +1,5 @@  syncdaemondir = $(libexecdir)/glusterfs/python/syncdaemon -syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py +syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py monitor.py  CLEANFILES = diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py index a992005ecd3..fb2fe522bce 100644 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py @@ -17,6 +17,7 @@ from errno import EEXIST, ENOENT, EACCES, EAGAIN  from gconf import gconf  from configinterface import GConffile  import resource +from monitor import monitor  class GLogger(Logger): @@ -37,12 +38,11 @@ class GLogger(Logger):      @classmethod      def setup(cls, **kw): -        if kw.get('slave'): -            sls = "(slave)" -        else: -            sls = "" +        lbl = kw.get('label', "") +        if lbl: +            lbl = '(' + lbl + ')'          lprm = {'datefmt': "%Y-%m-%d %H:%M:%S", -                'format': 
"[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + sls + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"} +                'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + lbl + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"}          lprm.update(kw)          lvl = kw.get('level', logging.INFO)          lprm['level'] = lvl @@ -121,7 +121,7 @@ def startup(**kw):              lkw['stream'] = sys.stdout          else:              lkw['filename'] = kw['log_file'] -    GLogger.setup(slave=kw.get('slave'), **lkw) +    GLogger.setup(label=kw.get('label'), **lkw)  def finalize(*a):      if getattr(gconf, 'pid_file', None): @@ -178,7 +178,9 @@ def main_i():      rconf = {'go_daemon': 'should'}      def store_abs(opt, optstr, val, parser): -        setattr(parser.values, opt.dest, os.path.abspath(val)) +        if val: +            val = os.path.abspath(val) +        setattr(parser.values, opt.dest, val)      def store_local(opt, optstr, val, parser):          rconf[opt.dest] = val      def store_local_curry(val): @@ -190,8 +192,10 @@ def main_i():      op.add_option('--gluster-log-level',   metavar='LVL')      op.add_option('-p', '--pid-file',      metavar='PIDF',  type=str, action='callback', callback=store_abs)      op.add_option('-l', '--log-file',      metavar='LOGF',  type=str, action='callback', callback=store_abs) +    op.add_option('--state-file',          metavar='STATF', type=str, action='callback', callback=store_abs)      op.add_option('-L', '--log-level',     metavar='LVL')      op.add_option('-r', '--remote-gsyncd', metavar='CMD',   default=os.path.abspath(sys.argv[0])) +    op.add_option('--volume-id',           metavar='UUID')      op.add_option('-s', '--ssh-command',   metavar='CMD',   default='ssh')      op.add_option('--rsync-command',       metavar='CMD',   default='rsync')      op.add_option('--rsync-extra',         metavar='ARGS',  default='-sS', help=SUPPRESS_HELP) @@ -201,6 +205,7 @@ def main_i():      op.add_option('-c', 
'--config-file',   metavar='CONF',  type=str, action='callback', callback=store_local)      # duh. need to specify dest or value will be mapped to None :S +    op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True))      op.add_option('--listen', dest='listen', help=SUPPRESS_HELP,      action='callback', callback=store_local_curry(True))      op.add_option('-N', '--no-daemon', dest="go_daemon",    action='callback', callback=store_local_curry('dont'))      op.add_option('--debug', dest="go_daemon",              action='callback', callback=lambda *a: (store_local_curry('dont')(*a), @@ -277,6 +282,7 @@ def main_i():      gconf.__dict__.update(defaults.__dict__)      gcnf.update_to(gconf.__dict__)      gconf.__dict__.update(opts.__dict__) +    gconf.configinterface = gcnf      #normalize loglevel      lvl0 = gconf.log_level @@ -290,13 +296,25 @@ def main_i():          gconf.log_level = lvl2      go_daemon = rconf['go_daemon'] +    be_monitor = rconf.get('monitor') -    if isinstance(remote, resource.SSH) and go_daemon == 'should': +    if not be_monitor and isinstance(remote, resource.SSH) and \ +       go_daemon == 'should':          go_daemon = 'postconn'          log_file = None      else:          log_file = gconf.log_file -    startup(go_daemon=go_daemon, log_file=log_file, slave=(not remote)) +    if be_monitor: +        label = 'monitor' +    elif remote: +        #master +        label = '' +    else: +        label = 'slave' +    startup(go_daemon=go_daemon, log_file=log_file, label=label) + +    if be_monitor: +        return monitor()      logging.info("syncing: %s" % " -> ".join(peers))      if remote: diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py index 2df1470d5f7..87610f3879a 100644 --- a/xlators/features/marker/utils/syncdaemon/master.py +++ b/xlators/features/marker/utils/syncdaemon/master.py @@ -10,6 +10,7 @@ from errno import ENOENT, ENODATA 
 from threading import Thread, currentThread, Condition, Lock  from gconf import gconf +from syncdutils import FreeObject  URXTIME = (-1, 0) @@ -80,7 +81,8 @@ class GMaster(object):          # the authorative (foreign, native) volinfo pair          # which lets us deduce what to do when we refetch          # the volinfos from system -        self.volinfo_state = (None, None) +        uuid_preset = getattr(gconf, 'volume_id', None) +        self.volinfo_state = (uuid_preset and {'uuid': uuid_preset}, None)          # the actual volinfo we make use of          self.volinfo = None @@ -140,14 +142,16 @@ class GMaster(object):          # store the value below "boxed" to emulate proper closures          # (variables of the enclosing scope are available inner functions          # provided they are no reassigned; mutation is OK). -        relax_mismatch = [False] +        param = FreeObject(relax_mismatch = False, state_change = False)          def select_vi(vi0, vi):              if vi and (not vi0 or vi0['uuid'] == vi['uuid']): +                if not vi0 and not param.relax_mismatch: +                    param.state_change = True                  # valid new value found; for the rest, we are graceful about                  # uuid mismatch -                relax_mismatch[0] = True +                param.relax_mismatch = True                  return vi -            if vi0 and vi and vi0['uuid'] != vi['uuid'] and not relax_mismatch[0]: +            if vi0 and vi and vi0['uuid'] != vi['uuid'] and not param.relax_mismatch:                  # uuid mismatch for master candidate, bail out                  raise RuntimeError("aborting on uuid change from %s to %s" % \                                     (vi0['uuid'], vi['uuid'])) @@ -157,7 +161,7 @@ class GMaster(object):          srep = lambda vi: vi and vi['uuid'][0:8]          logging.debug('(%s, %s) << (%s, %s) -> (%s, %s)' % \                        tuple(srep(vi) for vi in volinfo_state + volinfo_sys + newstate)) -        
return newstate +        return newstate, param.state_change      def crawl(self, path='.', xtl=None):          if path == '.': @@ -166,11 +170,16 @@ class GMaster(object):              time.sleep(1)              self.start = time.time()              volinfo_sys = self.get_sys_volinfo() -            self.volinfo_state = self.volinfo_state_machine(self.volinfo_state, volinfo_sys) +            self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state, +                                                                          volinfo_sys)              if self.inter_master:                  self.volinfo = volinfo_sys[self.KFGN]              else:                  self.volinfo = volinfo_sys[self.KNAT] +            if state_change: +                logging.info('new master is %s', self.uuid) +                if self.inter_master: +                    gconf.configinterface.set('volume_id', self.uuid)              if self.volinfo:                  if self.volinfo['retval']:                      raise RuntimeError ("master is corrupt") diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py new file mode 100644 index 00000000000..3f327b6d04d --- /dev/null +++ b/xlators/features/marker/utils/syncdaemon/monitor.py @@ -0,0 +1,54 @@ +import os +import sys +import time +import logging +from gconf import gconf +from syncdutils import update_file + +class Monitor(object): + +    def __init__(self): +        self.state = None + +    def set_state(self, state): +        if state == self.state: +            return +        self.state = state +        logging.info('new state: %s' % state) +        if getattr(gconf, 'state_file', None): +            update_file(gconf.state_file, lambda f: f.write(state + '\n')) + +    def monitor(self): +        argv = sys.argv[:] +        for o in ('-N', '--no-daemon', '--monitor'): +            while o in argv: +                argv.remove(o) +        
argv.extend(('-N', '-p', '')) +        argv.insert(0, os.path.basename(sys.executable)) + +        self.set_state('starting...') +        ret = 0 +	def nwait(p, o=0): +            p2, r = os.waitpid(p, o) +            if not p2: +                return +            if os.WIFEXITED(r): +                return os.WEXITSTATUS(r) +            return 1 +        while ret in (0, 1): +            logging.info('-' * 60) +            logging.info('starting gsyncd worker') +            cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv) +            time.sleep(60) +            ret = nwait(cpid, os.WNOHANG) +            if not ret: +                self.set_state('OK') +                ret = nwait(cpid) +            elif ret in (0, 1): +                self.set_state('faulty') +            time.sleep(10) +        self.set_state('inconsistent') +        return ret + +def monitor(): +    return Monitor().monitor() diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py index 723ab8fb5fc..5c17d0579b0 100644 --- a/xlators/features/marker/utils/syncdaemon/syncdutils.py +++ b/xlators/features/marker/utils/syncdaemon/syncdutils.py @@ -40,3 +40,11 @@ def update_file(path, updater, merger = lambda f: True):          for fx in (fr, fw):              if fx:                  fx.close() + + +class FreeObject(object): +    """wildcard class for which any attribute can be set""" + +    def __init__(self, **kw): +        for k,v in kw.iteritems(): +            setattr(self, k, v) diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index f8e043e46d6..0ae4f93e359 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -3599,6 +3599,7 @@ stop_gsync (char *master, char *slave, char **op_errstr)          FILE            *file   = NULL;          char            pidfile[PATH_MAX] = {0,};          char            buf [1024] = {0,}; 
+        int             i       = 0;          ret = gsync_status (master, slave, &status);          if (ret == 0 && status == -1) { @@ -3632,14 +3633,25 @@ stop_gsync (char *master, char *slave, char **op_errstr)          ret = read (fileno(file), buf, 1024);          if (ret > 0) {                  pid = strtol (buf, NULL, 10); -                ret = kill (pid, SIGTERM); +                ret = kill (-pid, SIGTERM);                  if (ret) {                          gf_log ("", GF_LOG_WARNING,                                  "failed to stop gsyncd");                          goto out;                  } -                sleep (0.1); -                kill (pid, SIGTERM); +                for (i = 0; i < 20; i++) { +                        if (gsync_status (master, slave, &status) == -1 || +                            status == -1) { +                                /* monitor gsyncd is dead but worker may +                                 * still be alive, give some more time +                                 * before SIGKILL (hack) +                                 */ +                                sleep (0.05); +                                break; +                        } +                        sleep (0.05); +                } +                kill (-pid, SIGKILL);                  unlink (pidfile);          }          ret = 0; | 
