From 01b3dff29adee2041b0ef1b374eda8c88fb07678 Mon Sep 17 00:00:00 2001 From: Csaba Henk Date: Sat, 2 Apr 2011 19:40:49 +0000 Subject: syncdaemon: add monitor mode to support autorestart Signed-off-by: Csaba Henk Signed-off-by: Vijay Bellur BUG: 2537 (gsync autorestart) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537 --- cli/src/cli-rpc-ops.c | 76 +++++++++++++++------- .../features/marker/utils/syncdaemon/Makefile.am | 2 +- xlators/features/marker/utils/syncdaemon/gsyncd.py | 36 +++++++--- xlators/features/marker/utils/syncdaemon/master.py | 21 ++++-- .../features/marker/utils/syncdaemon/monitor.py | 54 +++++++++++++++ .../features/marker/utils/syncdaemon/syncdutils.py | 8 +++ xlators/mgmt/glusterd/src/glusterd-op-sm.c | 18 ++++- 7 files changed, 174 insertions(+), 41 deletions(-) create mode 100644 xlators/features/marker/utils/syncdaemon/monitor.py diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c index 6d47374a0c1..110962c6c29 100644 --- a/cli/src/cli-rpc-ops.c +++ b/cli/src/cli-rpc-ops.c @@ -2603,15 +2603,30 @@ out: } int -gf_cli3_1_gsync_get_pid_file (char *pidfolder, char *pidfile, char *master, char *slave, char *gl_workdir) +gf_cli3_1_gsync_get_param_file (char *prmfile, const char *ext, char *master, char *slave, char *gl_workdir) { FILE *in = NULL; char buff[PATH_MAX] = {0, }; char cmd[PATH_MAX] = {0, }; char *ptr = NULL; - char buffer[PATH_MAX] = {0, }; + char pidfolder[PATH_MAX] = {0, }; + char *dotp = NULL; int ret = 0; + if (!(master && slave && gl_workdir)) { + GF_ASSERT (!master && !slave && !gl_workdir); + /* extension adjustment mode */ + + dotp = strrchr (prmfile, '.'); + if (!dotp++ || + /* overflow */ + dotp - prmfile + strlen (ext) + 1 > PATH_MAX) + return -1; + + strcpy (dotp, ext); + return 0; + } + snprintf (cmd, PATH_MAX, GSYNCD_PREFIX"/gsyncd --canonicalize-escape-url" " %s %s", master, slave); if (!(in = popen(cmd, "r"))) { @@ -2622,21 +2637,18 @@ gf_cli3_1_gsync_get_pid_file (char *pidfolder, char *pidfile, char *master, char ptr = fgets(buff, sizeof(buff), in); if (ptr) { buff[strlen(buff)-1]='\0'; //strip off \n - snprintf (buffer, PATH_MAX, "%s/gsync/%s", gl_workdir, buff); - strncpy (pidfolder, buffer, PATH_MAX); + snprintf (pidfolder, PATH_MAX, "%s/gsync/%s", gl_workdir, buff); } else { ret = -1; goto out; } memset (buff, 0, PATH_MAX); - memset (buffer, 0, PATH_MAX); ptr = fgets(buff, sizeof(buff), in); if (ptr) { buff[strlen(buff)-1]='\0'; //strip off \n - snprintf (buffer, PATH_MAX, "%s/%s.pid", pidfolder, buff); - strncpy (pidfile, buffer, PATH_MAX); + snprintf (prmfile, PATH_MAX, "%s/%s.pid", pidfolder, buff); } out: @@ -2684,19 +2696,19 @@ gf_cli3_1_start_gsync (char *master, char *slave, char *gl_workdir) int32_t ret = -1; int32_t status = 0; char cmd[PATH_MAX] = {0,}; - char pidfile[PATH_MAX] = {0,}; - char pidfolder[PATH_MAX] = {0,}; + char prmfile[PATH_MAX] = {0,}; + char *tslash = NULL; - ret = gf_cli3_1_gsync_get_pid_file (pidfolder, pidfile, master, - slave, gl_workdir); + ret = gf_cli3_1_gsync_get_param_file (prmfile, "pid", master, + slave, gl_workdir); if (ret == -1) { ret = -1; gf_log ("", GF_LOG_WARNING, "failed to construct the " - "pidfile string"); + "prmfile string"); goto out; } - ret = gf_cli3_1_gsync_status (master, slave, pidfile, &status); + ret = gf_cli3_1_gsync_status (master, slave, prmfile, &status); if ((ret == 0 && status == 0)) { gf_log ("", GF_LOG_WARNING, "gsync %s:%s" "already started", master, slave); @@ -2707,19 +2719,24 @@ gf_cli3_1_start_gsync (char *master, char *slave, char *gl_workdir) goto out; } - unlink (pidfile); + unlink (prmfile); - ret = mkdir (pidfolder, 0777); - if (ret && (errno != EEXIST)) { - gf_log ("", GF_LOG_DEBUG, "mkdir failed, errno: %d", - errno); - goto out; + tslash = strrchr(prmfile, '/'); + if (tslash) { + *tslash = '\0'; + ret = mkdir (prmfile, 0777); + if (ret && (errno != EEXIST)) { + gf_log ("", GF_LOG_DEBUG, "mkdir failed, errno: %d", + errno); + goto out; + } + *tslash = '/'; } memset (cmd, 0, sizeof (cmd)); ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd -c %s/%s %s %s" " --config-set pid-file %s", gl_workdir, - GSYNC_CONF, master, slave, pidfile); + GSYNC_CONF, master, slave, prmfile); if (ret <= 0) { ret = -1; gf_log ("", GF_LOG_WARNING, "failed to construct the " @@ -2728,14 +2745,29 @@ gf_cli3_1_start_gsync (char *master, char *slave, char *gl_workdir) } ret = system (cmd); - if (ret == -1) { + if (ret) { gf_log ("", GF_LOG_WARNING, "failed to set the pid " "option for %s %s", master, slave); goto out; } + ret = gf_cli3_1_gsync_get_param_file (prmfile, "status", NULL, NULL, NULL); + if (ret != -1) + ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd -c %s/%s %s %s" + " --config-set state-file %s", gl_workdir, + GSYNC_CONF, master, slave, prmfile); + if (ret >= PATH_MAX) + ret = -1; + if (ret != -1) + ret = system (cmd) ? -1 : 0; + if (ret == -1) { + gf_log ("", GF_LOG_WARNING, "failed to set status file " + "for %s %s", master, slave); + goto out; + } + memset (cmd, 0, sizeof (cmd)); - ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd -c %s/%s %s %s" + ret = snprintf (cmd, PATH_MAX, GSYNCD_PREFIX "/gsyncd --monitor -c %s/%s %s %s" , gl_workdir, GSYNC_CONF, master, slave); if (ret <= 0) { ret = -1; diff --git a/xlators/features/marker/utils/syncdaemon/Makefile.am b/xlators/features/marker/utils/syncdaemon/Makefile.am index 03ac9762541..c900fa93260 100644 --- a/xlators/features/marker/utils/syncdaemon/Makefile.am +++ b/xlators/features/marker/utils/syncdaemon/Makefile.am @@ -1,5 +1,5 @@ syncdaemondir = $(libexecdir)/glusterfs/python/syncdaemon -syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py +syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py monitor.py CLEANFILES = diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py index a992005ecd3..fb2fe522bce 100644 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py @@ -17,6 +17,7 @@ from errno import EEXIST, ENOENT, EACCES, EAGAIN from gconf import gconf from configinterface import GConffile import resource +from monitor import monitor class GLogger(Logger): @@ -37,12 +38,11 @@ class GLogger(Logger): @classmethod def setup(cls, **kw): - if kw.get('slave'): - sls = "(slave)" - else: - sls = "" + lbl = kw.get('label', "") + if lbl: + lbl = '(' + lbl + ')' lprm = {'datefmt': "%Y-%m-%d %H:%M:%S", - 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + sls + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"} + 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + lbl + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"} lprm.update(kw) lvl = kw.get('level', logging.INFO) lprm['level'] = lvl @@ -121,7 +121,7 @@ def startup(**kw): lkw['stream'] = sys.stdout else: lkw['filename'] = kw['log_file'] - GLogger.setup(slave=kw.get('slave'), **lkw) + GLogger.setup(label=kw.get('label'), **lkw) def finalize(*a): if getattr(gconf, 'pid_file', None): @@ -178,7 +178,9 @@ def main_i(): rconf = {'go_daemon': 'should'} def store_abs(opt, optstr, val, parser): - setattr(parser.values, opt.dest, os.path.abspath(val)) + if val: + val = os.path.abspath(val) + setattr(parser.values, opt.dest, val) def store_local(opt, optstr, val, parser): rconf[opt.dest] = val def store_local_curry(val): @@ -190,8 +192,10 @@ def main_i(): op.add_option('--gluster-log-level', metavar='LVL') op.add_option('-p', '--pid-file', metavar='PIDF', type=str, action='callback', callback=store_abs) op.add_option('-l', '--log-file', metavar='LOGF', type=str, action='callback', callback=store_abs) + op.add_option('--state-file', metavar='STATF', type=str, action='callback', callback=store_abs) op.add_option('-L', '--log-level', metavar='LVL') op.add_option('-r', '--remote-gsyncd', metavar='CMD', default=os.path.abspath(sys.argv[0])) + op.add_option('--volume-id', metavar='UUID') op.add_option('-s', '--ssh-command', metavar='CMD', default='ssh') op.add_option('--rsync-command', metavar='CMD', default='rsync') op.add_option('--rsync-extra', metavar='ARGS', default='-sS', help=SUPPRESS_HELP) @@ -201,6 +205,7 @@ def main_i(): op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local) # duh. need to specify dest or value will be mapped to None :S + op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True)) op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True)) op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont')) op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a), @@ -277,6 +282,7 @@ def main_i(): gconf.__dict__.update(defaults.__dict__) gcnf.update_to(gconf.__dict__) gconf.__dict__.update(opts.__dict__) + gconf.configinterface = gcnf #normalize loglevel lvl0 = gconf.log_level @@ -290,13 +296,25 @@ def main_i(): gconf.log_level = lvl2 go_daemon = rconf['go_daemon'] + be_monitor = rconf.get('monitor') - if isinstance(remote, resource.SSH) and go_daemon == 'should': + if not be_monitor and isinstance(remote, resource.SSH) and \ + go_daemon == 'should': go_daemon = 'postconn' log_file = None else: log_file = gconf.log_file - startup(go_daemon=go_daemon, log_file=log_file, slave=(not remote)) + if be_monitor: + label = 'monitor' + elif remote: + #master + label = '' + else: + label = 'slave' + startup(go_daemon=go_daemon, log_file=log_file, label=label) + + if be_monitor: + return monitor() logging.info("syncing: %s" % " -> ".join(peers)) if remote: diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py index 2df1470d5f7..87610f3879a 100644 --- a/xlators/features/marker/utils/syncdaemon/master.py +++ b/xlators/features/marker/utils/syncdaemon/master.py @@ -10,6 +10,7 @@ from errno import ENOENT, ENODATA from threading import Thread, currentThread, Condition, Lock from gconf import gconf +from syncdutils import FreeObject URXTIME = (-1, 0) @@ -80,7 +81,8 @@ class GMaster(object): # the authorative (foreign, native) volinfo pair # which lets us deduce what to do when we refetch # the volinfos from system - self.volinfo_state = (None, None) + uuid_preset = getattr(gconf, 'volume_id', None) + self.volinfo_state = (uuid_preset and {'uuid': uuid_preset}, None) # the actual volinfo we make use of self.volinfo = None @@ -140,14 +142,16 @@ class GMaster(object): # store the value below "boxed" to emulate proper closures # (variables of the enclosing scope are available inner functions # provided they are no reassigned; mutation is OK). - relax_mismatch = [False] + param = FreeObject(relax_mismatch = False, state_change = False) def select_vi(vi0, vi): if vi and (not vi0 or vi0['uuid'] == vi['uuid']): + if not vi0 and not param.relax_mismatch: + param.state_change = True # valid new value found; for the rest, we are graceful about # uuid mismatch - relax_mismatch[0] = True + param.relax_mismatch = True return vi - if vi0 and vi and vi0['uuid'] != vi['uuid'] and not relax_mismatch[0]: + if vi0 and vi and vi0['uuid'] != vi['uuid'] and not param.relax_mismatch: # uuid mismatch for master candidate, bail out raise RuntimeError("aborting on uuid change from %s to %s" % \ (vi0['uuid'], vi['uuid'])) @@ -157,7 +161,7 @@ class GMaster(object): srep = lambda vi: vi and vi['uuid'][0:8] logging.debug('(%s, %s) << (%s, %s) -> (%s, %s)' % \ tuple(srep(vi) for vi in volinfo_state + volinfo_sys + newstate)) - return newstate + return newstate, param.state_change def crawl(self, path='.', xtl=None): if path == '.': @@ -166,11 +170,16 @@ class GMaster(object): time.sleep(1) self.start = time.time() volinfo_sys = self.get_sys_volinfo() - self.volinfo_state = self.volinfo_state_machine(self.volinfo_state, volinfo_sys) + self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state, + volinfo_sys) if self.inter_master: self.volinfo = volinfo_sys[self.KFGN] else: self.volinfo = volinfo_sys[self.KNAT] + if state_change: + logging.info('new master is %s', self.uuid) + if self.inter_master: + gconf.configinterface.set('volume_id', self.uuid) if self.volinfo: if self.volinfo['retval']: raise RuntimeError ("master is corrupt") diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py new file mode 100644 index 00000000000..3f327b6d04d --- /dev/null +++ b/xlators/features/marker/utils/syncdaemon/monitor.py @@ -0,0 +1,54 @@ +import os +import sys +import time +import logging +from gconf import gconf +from syncdutils import update_file + +class Monitor(object): + + def __init__(self): + self.state = None + + def set_state(self, state): + if state == self.state: + return + self.state = state + logging.info('new state: %s' % state) + if getattr(gconf, 'state_file', None): + update_file(gconf.state_file, lambda f: f.write(state + '\n')) + + def monitor(self): + argv = sys.argv[:] + for o in ('-N', '--no-daemon', '--monitor'): + while o in argv: + argv.remove(o) + argv.extend(('-N', '-p', '')) + argv.insert(0, os.path.basename(sys.executable)) + + self.set_state('starting...') + ret = 0 + def nwait(p, o=0): + p2, r = os.waitpid(p, o) + if not p2: + return + if os.WIFEXITED(r): + return os.WEXITSTATUS(r) + return 1 + while ret in (0, 1): + logging.info('-' * 60) + logging.info('starting gsyncd worker') + cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv) + time.sleep(60) + ret = nwait(cpid, os.WNOHANG) + if not ret: + self.set_state('OK') + ret = nwait(cpid) + elif ret in (0, 1): + self.set_state('faulty') + time.sleep(10) + self.set_state('inconsistent') + return ret + +def monitor(): + return Monitor().monitor() diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py index 723ab8fb5fc..5c17d0579b0 100644 --- a/xlators/features/marker/utils/syncdaemon/syncdutils.py +++ b/xlators/features/marker/utils/syncdaemon/syncdutils.py @@ -40,3 +40,11 @@ def update_file(path, updater, merger = lambda f: True): for fx in (fr, fw): if fx: fx.close() + + +class FreeObject(object): + """wildcard class for which any attribute can be set""" + + def __init__(self, **kw): + for k,v in kw.iteritems(): + setattr(self, k, v) diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index f8e043e46d6..0ae4f93e359 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -3599,6 +3599,7 @@ stop_gsync (char *master, char *slave, char **op_errstr) FILE *file = NULL; char pidfile[PATH_MAX] = {0,}; char buf [1024] = {0,}; + int i = 0; ret = gsync_status (master, slave, &status); if (ret == 0 && status == -1) { @@ -3632,14 +3633,25 @@ stop_gsync (char *master, char *slave, char **op_errstr) ret = read (fileno(file), buf, 1024); if (ret > 0) { pid = strtol (buf, NULL, 10); - ret = kill (pid, SIGTERM); + ret = kill (-pid, SIGTERM); if (ret) { gf_log ("", GF_LOG_WARNING, "failed to stop gsyncd"); goto out; } - sleep (0.1); - kill (pid, SIGTERM); + for (i = 0; i < 20; i++) { + if (gsync_status (master, slave, &status) == -1 || + status == -1) { + /* monitor gsyncd is dead but worker may + * still be alive, give some more time + * before SIGKILL (hack) + */ + sleep (0.05); + break; + } + sleep (0.05); + } + kill (-pid, SIGKILL); unlink (pidfile); } ret = 0; -- cgit