diff options
author | Csaba Henk <csaba@lowlife.hu> | 2011-04-02 19:40:49 +0000 |
---|---|---|
committer | Vijay Bellur <vijay@dev.gluster.com> | 2011-04-04 08:02:27 -0700 |
commit | 01b3dff29adee2041b0ef1b374eda8c88fb07678 (patch) | |
tree | c8f6c7eabb962c97f3e88add716eda429e2c3567 /xlators | |
parent | e77c35248e8ce796bc5b108c10013089a0c65bde (diff) |
syncdaemon: add monitor mode to support autorestart
Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Vijay Bellur <vijay@dev.gluster.com>
BUG: 2537 (gsync autorestart)
URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2537
Diffstat (limited to 'xlators')
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/Makefile.am | 2 |
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/gsyncd.py | 36 |
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/master.py | 21 |
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/monitor.py | 54 |
-rw-r--r-- | xlators/features/marker/utils/syncdaemon/syncdutils.py | 8 |
-rw-r--r-- | xlators/mgmt/glusterd/src/glusterd-op-sm.c | 18 |
6 files changed, 120 insertions, 19 deletions
diff --git a/xlators/features/marker/utils/syncdaemon/Makefile.am b/xlators/features/marker/utils/syncdaemon/Makefile.am index 03ac97625..c900fa932 100644 --- a/xlators/features/marker/utils/syncdaemon/Makefile.am +++ b/xlators/features/marker/utils/syncdaemon/Makefile.am @@ -1,5 +1,5 @@ syncdaemondir = $(libexecdir)/glusterfs/python/syncdaemon -syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py +syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py monitor.py CLEANFILES = diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py index a992005ec..fb2fe522b 100644 --- a/xlators/features/marker/utils/syncdaemon/gsyncd.py +++ b/xlators/features/marker/utils/syncdaemon/gsyncd.py @@ -17,6 +17,7 @@ from errno import EEXIST, ENOENT, EACCES, EAGAIN from gconf import gconf from configinterface import GConffile import resource +from monitor import monitor class GLogger(Logger): @@ -37,12 +38,11 @@ class GLogger(Logger): @classmethod def setup(cls, **kw): - if kw.get('slave'): - sls = "(slave)" - else: - sls = "" + lbl = kw.get('label', "") + if lbl: + lbl = '(' + lbl + ')' lprm = {'datefmt': "%Y-%m-%d %H:%M:%S", - 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + sls + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"} + 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + lbl + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"} lprm.update(kw) lvl = kw.get('level', logging.INFO) lprm['level'] = lvl @@ -121,7 +121,7 @@ def startup(**kw): lkw['stream'] = sys.stdout else: lkw['filename'] = kw['log_file'] - GLogger.setup(slave=kw.get('slave'), **lkw) + GLogger.setup(label=kw.get('label'), **lkw) def finalize(*a): if getattr(gconf, 'pid_file', None): @@ -178,7 +178,9 @@ def main_i(): rconf = {'go_daemon': 'should'} def store_abs(opt, optstr, val, parser): - 
setattr(parser.values, opt.dest, os.path.abspath(val)) + if val: + val = os.path.abspath(val) + setattr(parser.values, opt.dest, val) def store_local(opt, optstr, val, parser): rconf[opt.dest] = val def store_local_curry(val): @@ -190,8 +192,10 @@ def main_i(): op.add_option('--gluster-log-level', metavar='LVL') op.add_option('-p', '--pid-file', metavar='PIDF', type=str, action='callback', callback=store_abs) op.add_option('-l', '--log-file', metavar='LOGF', type=str, action='callback', callback=store_abs) + op.add_option('--state-file', metavar='STATF', type=str, action='callback', callback=store_abs) op.add_option('-L', '--log-level', metavar='LVL') op.add_option('-r', '--remote-gsyncd', metavar='CMD', default=os.path.abspath(sys.argv[0])) + op.add_option('--volume-id', metavar='UUID') op.add_option('-s', '--ssh-command', metavar='CMD', default='ssh') op.add_option('--rsync-command', metavar='CMD', default='rsync') op.add_option('--rsync-extra', metavar='ARGS', default='-sS', help=SUPPRESS_HELP) @@ -201,6 +205,7 @@ def main_i(): op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local) # duh. 
need to specify dest or value will be mapped to None :S + op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True)) op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True)) op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont')) op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a), @@ -277,6 +282,7 @@ def main_i(): gconf.__dict__.update(defaults.__dict__) gcnf.update_to(gconf.__dict__) gconf.__dict__.update(opts.__dict__) + gconf.configinterface = gcnf #normalize loglevel lvl0 = gconf.log_level @@ -290,13 +296,25 @@ def main_i(): gconf.log_level = lvl2 go_daemon = rconf['go_daemon'] + be_monitor = rconf.get('monitor') - if isinstance(remote, resource.SSH) and go_daemon == 'should': + if not be_monitor and isinstance(remote, resource.SSH) and \ + go_daemon == 'should': go_daemon = 'postconn' log_file = None else: log_file = gconf.log_file - startup(go_daemon=go_daemon, log_file=log_file, slave=(not remote)) + if be_monitor: + label = 'monitor' + elif remote: + #master + label = '' + else: + label = 'slave' + startup(go_daemon=go_daemon, log_file=log_file, label=label) + + if be_monitor: + return monitor() logging.info("syncing: %s" % " -> ".join(peers)) if remote: diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py index 2df1470d5..87610f387 100644 --- a/xlators/features/marker/utils/syncdaemon/master.py +++ b/xlators/features/marker/utils/syncdaemon/master.py @@ -10,6 +10,7 @@ from errno import ENOENT, ENODATA from threading import Thread, currentThread, Condition, Lock from gconf import gconf +from syncdutils import FreeObject URXTIME = (-1, 0) @@ -80,7 +81,8 @@ class GMaster(object): # the authorative (foreign, native) volinfo pair # which lets us deduce what to do when we refetch # the volinfos 
from system - self.volinfo_state = (None, None) + uuid_preset = getattr(gconf, 'volume_id', None) + self.volinfo_state = (uuid_preset and {'uuid': uuid_preset}, None) # the actual volinfo we make use of self.volinfo = None @@ -140,14 +142,16 @@ class GMaster(object): # store the value below "boxed" to emulate proper closures # (variables of the enclosing scope are available inner functions # provided they are no reassigned; mutation is OK). - relax_mismatch = [False] + param = FreeObject(relax_mismatch = False, state_change = False) def select_vi(vi0, vi): if vi and (not vi0 or vi0['uuid'] == vi['uuid']): + if not vi0 and not param.relax_mismatch: + param.state_change = True # valid new value found; for the rest, we are graceful about # uuid mismatch - relax_mismatch[0] = True + param.relax_mismatch = True return vi - if vi0 and vi and vi0['uuid'] != vi['uuid'] and not relax_mismatch[0]: + if vi0 and vi and vi0['uuid'] != vi['uuid'] and not param.relax_mismatch: # uuid mismatch for master candidate, bail out raise RuntimeError("aborting on uuid change from %s to %s" % \ (vi0['uuid'], vi['uuid'])) @@ -157,7 +161,7 @@ class GMaster(object): srep = lambda vi: vi and vi['uuid'][0:8] logging.debug('(%s, %s) << (%s, %s) -> (%s, %s)' % \ tuple(srep(vi) for vi in volinfo_state + volinfo_sys + newstate)) - return newstate + return newstate, param.state_change def crawl(self, path='.', xtl=None): if path == '.': @@ -166,11 +170,16 @@ class GMaster(object): time.sleep(1) self.start = time.time() volinfo_sys = self.get_sys_volinfo() - self.volinfo_state = self.volinfo_state_machine(self.volinfo_state, volinfo_sys) + self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state, + volinfo_sys) if self.inter_master: self.volinfo = volinfo_sys[self.KFGN] else: self.volinfo = volinfo_sys[self.KNAT] + if state_change: + logging.info('new master is %s', self.uuid) + if self.inter_master: + gconf.configinterface.set('volume_id', self.uuid) if self.volinfo: if 
self.volinfo['retval']: raise RuntimeError ("master is corrupt") diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py new file mode 100644 index 000000000..3f327b6d0 --- /dev/null +++ b/xlators/features/marker/utils/syncdaemon/monitor.py @@ -0,0 +1,54 @@ +import os +import sys +import time +import logging +from gconf import gconf +from syncdutils import update_file + +class Monitor(object): + + def __init__(self): + self.state = None + + def set_state(self, state): + if state == self.state: + return + self.state = state + logging.info('new state: %s' % state) + if getattr(gconf, 'state_file', None): + update_file(gconf.state_file, lambda f: f.write(state + '\n')) + + def monitor(self): + argv = sys.argv[:] + for o in ('-N', '--no-daemon', '--monitor'): + while o in argv: + argv.remove(o) + argv.extend(('-N', '-p', '')) + argv.insert(0, os.path.basename(sys.executable)) + + self.set_state('starting...') + ret = 0 + def nwait(p, o=0): + p2, r = os.waitpid(p, o) + if not p2: + return + if os.WIFEXITED(r): + return os.WEXITSTATUS(r) + return 1 + while ret in (0, 1): + logging.info('-' * 60) + logging.info('starting gsyncd worker') + cpid = os.spawnv(os.P_NOWAIT, sys.executable, argv) + time.sleep(60) + ret = nwait(cpid, os.WNOHANG) + if not ret: + self.set_state('OK') + ret = nwait(cpid) + elif ret in (0, 1): + self.set_state('faulty') + time.sleep(10) + self.set_state('inconsistent') + return ret + +def monitor(): + return Monitor().monitor() diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py index 723ab8fb5..5c17d0579 100644 --- a/xlators/features/marker/utils/syncdaemon/syncdutils.py +++ b/xlators/features/marker/utils/syncdaemon/syncdutils.py @@ -40,3 +40,11 @@ def update_file(path, updater, merger = lambda f: True): for fx in (fr, fw): if fx: fx.close() + + +class FreeObject(object): + """wildcard class for which any attribute 
can be set""" + + def __init__(self, **kw): + for k,v in kw.iteritems(): + setattr(self, k, v) diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c index f8e043e46..0ae4f93e3 100644 --- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c +++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c @@ -3599,6 +3599,7 @@ stop_gsync (char *master, char *slave, char **op_errstr) FILE *file = NULL; char pidfile[PATH_MAX] = {0,}; char buf [1024] = {0,}; + int i = 0; ret = gsync_status (master, slave, &status); if (ret == 0 && status == -1) { @@ -3632,14 +3633,25 @@ stop_gsync (char *master, char *slave, char **op_errstr) ret = read (fileno(file), buf, 1024); if (ret > 0) { pid = strtol (buf, NULL, 10); - ret = kill (pid, SIGTERM); + ret = kill (-pid, SIGTERM); if (ret) { gf_log ("", GF_LOG_WARNING, "failed to stop gsyncd"); goto out; } - sleep (0.1); - kill (pid, SIGTERM); + for (i = 0; i < 20; i++) { + if (gsync_status (master, slave, &status) == -1 || + status == -1) { + /* monitor gsyncd is dead but worker may + * still be alive, give some more time + * before SIGKILL (hack) + */ + sleep (0.05); + break; + } + sleep (0.05); + } + kill (-pid, SIGKILL); unlink (pidfile); } ret = 0; |