diff options
author | ShyamsundarR <srangana@redhat.com> | 2018-09-13 14:05:02 -0400 |
---|---|---|
committer | ShyamsundarR <srangana@redhat.com> | 2018-09-17 10:31:19 -0400 |
commit | c1314445cf008cf78a2157cb425bee836de5594c (patch) | |
tree | e603de84dbd8d5c1a7f6c328cb1eab9d2a9e47d9 /xlators/experimental/jbr-server | |
parent | afc9f3b8716e88410ba50a6ce8abbfa186ee7c46 (diff) |
core: remove experimental xlators and associated tests
experimental xlators removed from 5.0
Change-Id: I47219d8b95efc3d5875ec9224d1e79f8371e9f76
Updates: bz#1628620
Signed-off-by: ShyamsundarR <srangana@redhat.com>
Diffstat (limited to 'xlators/experimental/jbr-server')
-rw-r--r-- | xlators/experimental/jbr-server/Makefile.am | 3 | ||||
-rw-r--r-- | xlators/experimental/jbr-server/src/Makefile.am | 39 | ||||
-rw-r--r-- | xlators/experimental/jbr-server/src/all-templates.c | 542 | ||||
-rwxr-xr-x | xlators/experimental/jbr-server/src/gen-fops.py | 181 | ||||
-rw-r--r-- | xlators/experimental/jbr-server/src/jbr-internal.h | 118 | ||||
-rw-r--r-- | xlators/experimental/jbr-server/src/jbr.c | 1675 |
6 files changed, 0 insertions, 2558 deletions
diff --git a/xlators/experimental/jbr-server/Makefile.am b/xlators/experimental/jbr-server/Makefile.am deleted file mode 100644 index a985f42a877..00000000000 --- a/xlators/experimental/jbr-server/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = src - -CLEANFILES = diff --git a/xlators/experimental/jbr-server/src/Makefile.am b/xlators/experimental/jbr-server/src/Makefile.am deleted file mode 100644 index b3ceb2d9eda..00000000000 --- a/xlators/experimental/jbr-server/src/Makefile.am +++ /dev/null @@ -1,39 +0,0 @@ -if WITH_SERVER -xlator_LTLIBRARIES = jbr.la -endif -xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/experimental - -nodist_jbr_la_SOURCES = jbr-cg.c -CLEANFILES = $(nodist_jbr_la_SOURCES) - -jbr_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) -jbr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ - $(top_builddir)/api/src/libgfapi.la - -noinst_HEADERS = jbr-internal.h \ - $(top_srcdir)/xlators/lib/src/libxlator.h \ - $(top_srcdir)/xlators/experimental/fdl/src/fdl.h \ - $(top_srcdir)/glusterfsd/src/glusterfsd.h - -AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ - -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ - -I$(top_srcdir)/xlators/lib/src -I$(top_srcdir)/rpc/rpc-lib/src \ - -I$(top_srcdir)/xlators/experimental/fdl/src/ \ - -DSBIN_DIR=\"$(sbindir)\" -I$(top_srcdir)/api/src \ - -DJBR_SCRIPT_PREFIX=\"$(jbrdir)\" \ - -I$(top_srcdir)/xlators/experimental/jbr-client/src/ - -AM_CFLAGS = -Wall $(GF_CFLAGS) - -JBR_PREFIX = $(top_srcdir)/xlators/experimental/jbr-server/src -JBR_GEN_FOPS = $(JBR_PREFIX)/gen-fops.py -JBR_TEMPLATES = $(JBR_PREFIX)/all-templates.c -JBR_WRAPPER = $(JBR_PREFIX)/jbr.c -noinst_PYTHON = $(JBR_GEN_FOPS) -EXTRA_DIST = $(JBR_TEMPLATES) $(JBR_WRAPPER) - -jbr-cg.c: $(JBR_GEN_FOPS) $(JBR_TEMPLATES) $(JBR_WRAPPER) - $(PYTHON) $(JBR_GEN_FOPS) $(JBR_TEMPLATES) $(JBR_WRAPPER) > $@ - -uninstall-local: - rm -f $(DESTDIR)$(xlatordir)/jbr.so diff --git a/xlators/experimental/jbr-server/src/all-templates.c b/xlators/experimental/jbr-server/src/all-templates.c deleted file mode 100644 index 530c4187571..00000000000 --- a/xlators/experimental/jbr-server/src/all-templates.c +++ /dev/null @@ -1,542 +0,0 @@ -/* - * You can put anything here - it doesn't even have to be a comment - and it - * will be ignored until we reach the first template-name comment. - */ - - -/* template-name read-fop */ -int32_t -jbr_@NAME@ (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - jbr_private_t *priv = NULL; - gf_boolean_t in_recon = _gf_false; - int32_t op_errno = 0; - int32_t recon_term, recon_index; - - GF_VALIDATE_OR_GOTO ("jbr", this, err); - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, err); - GF_VALIDATE_OR_GOTO (this->name, frame, err); - - op_errno = EREMOTE; - - /* allow reads during reconciliation * - * TBD: allow "dirty" reads on non-leaders * - */ - if (xdata && - (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && - (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { - in_recon = _gf_true; - } - - if ((!priv->leader) && (in_recon == _gf_false)) { - goto err; - } - - STACK_WIND (frame, default_@NAME@_cbk, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, - @SHORT_ARGS@); - return 0; - -err: - STACK_UNWIND_STRICT (@NAME@, frame, -1, op_errno, - @ERROR_ARGS@); - return 0; -} - -/* template-name read-perform_local_op */ -/* No "perform_local_op" function needed for @NAME@ */ - -/* template-name read-dispatch */ -/* No "dispatch" function needed for @NAME@ */ - -/* template-name read-call_dispatch */ -/* No "call_dispatch" function needed for @NAME@ */ - -/* template-name read-fan-in */ -/* No "fan-in" function needed for @NAME@ */ - -/* template-name read-continue */ -/* No "continue" function needed for @NAME@ */ - -/* template-name read-complete */ -/* No "complete" function needed for @NAME@ */ - -/* template-name write-fop */ -int32_t -jbr_@NAME@ (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - jbr_local_t *local = NULL; - jbr_private_t *priv = NULL; - int32_t ret = -1; - int op_errno = ENOMEM; - - GF_VALIDATE_OR_GOTO ("jbr", this, err); - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, err); - GF_VALIDATE_OR_GOTO (this->name, frame, err); - -#if defined(JBR_CG_NEED_FD) - ret = jbr_leader_checks_and_init (frame, this, &op_errno, xdata, fd); -#else - ret = jbr_leader_checks_and_init (frame, this, &op_errno, xdata, NULL); -#endif - if (ret) - goto err; - - local = frame->local; - - /* - * If we let it through despite not being the leader, then we just want - * to pass it on down without all of the additional xattrs, queuing, and - * so on. However, jbr_*_complete does depend on the initialization - * immediately above this. - */ - if (!priv->leader) { - STACK_WIND (frame, jbr_@NAME@_complete, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, - @SHORT_ARGS@); - return 0; - } - - ret = jbr_initialize_xdata_set_attrs (this, &xdata); - if (ret) - goto err; - - local->xdata = dict_ref(xdata); - local->stub = fop_@NAME@_stub (frame, jbr_@NAME@_continue, - @SHORT_ARGS@); - if (!local->stub) { - goto err; - } - - /* - * Can be used to just call_dispatch or be customised per fop to * - * perform ops specific to that particular fop. * - */ - ret = jbr_@NAME@_perform_local_op (frame, this, &op_errno, - @SHORT_ARGS@); - if (ret) - goto err; - - return ret; -err: - if (local) { - if (local->stub) { - call_stub_destroy(local->stub); - } - if (local->qstub) { - call_stub_destroy(local->qstub); - } - if (local->fd) { - fd_unref(local->fd); - } - mem_put(local); - } - STACK_UNWIND_STRICT (@NAME@, frame, -1, op_errno, - @ERROR_ARGS@); - return 0; -} - -/* template-name write-perform_local_op */ -int32_t -jbr_@NAME@_perform_local_op (call_frame_t *frame, xlator_t *this, int *op_errno, - @LONG_ARGS@) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("jbr", this, out); - GF_VALIDATE_OR_GOTO (this->name, frame, out); - GF_VALIDATE_OR_GOTO (this->name, op_errno, out); - - ret = jbr_@NAME@_call_dispatch (frame, this, op_errno, - @SHORT_ARGS@); - -out: - return ret; -} - -/* template-name write-call_dispatch */ -int32_t -jbr_@NAME@_call_dispatch (call_frame_t *frame, xlator_t *this, int *op_errno, - @LONG_ARGS@) -{ - jbr_local_t *local = NULL; - jbr_private_t *priv = NULL; - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO ("jbr", this, out); - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); - GF_VALIDATE_OR_GOTO (this->name, frame, out); - local = frame->local; - GF_VALIDATE_OR_GOTO (this->name, local, out); - GF_VALIDATE_OR_GOTO (this->name, op_errno, out); - -#if defined(JBR_CG_QUEUE) - jbr_inode_ctx_t *ictx = jbr_get_inode_ctx(this, fd->inode); - if (!ictx) { - *op_errno = EIO; - goto out; - } - - LOCK(&ictx->lock); - if (ictx->active) { - gf_msg_debug (this->name, 0, - "queuing request due to conflict"); - /* - * TBD: enqueue only for real conflict - * - * Currently we just act like all writes are in - * conflict with one another. What we should really do - * is check the active/pending queues and defer only if - * there's a conflict there. - * - * It's important to check the pending queue because we - * might have an active request X which conflicts with - * a pending request Y, and this request Z might - * conflict with Y but not X. If we checked only the - * active queue then Z could jump ahead of Y, which - * would be incorrect. - */ - local->qstub = fop_@NAME@_stub (frame, - jbr_@NAME@_dispatch, - @SHORT_ARGS@); - if (!local->qstub) { - UNLOCK(&ictx->lock); - goto out; - } - list_add_tail(&local->qlinks, &ictx->pqueue); - ++(ictx->pending); - UNLOCK(&ictx->lock); - ret = 0; - goto out; - } else { - list_add_tail(&local->qlinks, &ictx->aqueue); - ++(ictx->active); - } - UNLOCK(&ictx->lock); -#endif - ret = jbr_@NAME@_dispatch (frame, this, @SHORT_ARGS@); - -out: - return ret; -} - -/* template-name write-dispatch */ -int32_t -jbr_@NAME@_dispatch (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - jbr_local_t *local = NULL; - jbr_private_t *priv = NULL; - int32_t ret = -1; - xlator_list_t *trav; - - GF_VALIDATE_OR_GOTO ("jbr", this, out); - priv = this->private; - GF_VALIDATE_OR_GOTO (this->name, priv, out); - GF_VALIDATE_OR_GOTO (this->name, frame, out); - local = frame->local; - GF_VALIDATE_OR_GOTO (this->name, local, out); - - /* - * TBD: unblock pending request(s) if we fail after this point but - * before we get to jbr_@NAME@_complete (where that code currently - * resides). - */ - - local->call_count = priv->n_children - 1; - for (trav = this->children->next; trav; trav = trav->next) { - STACK_WIND (frame, jbr_@NAME@_fan_in, - trav->xlator, trav->xlator->fops->@NAME@, - @SHORT_ARGS@); - } - - /* TBD: variable Issue count */ - ret = 0; -out: - return ret; -} - -/* template-name write-fan-in */ -int32_t -jbr_@NAME@_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - @LONG_ARGS@) -{ - jbr_local_t *local = NULL; - int32_t ret = -1; - uint8_t call_count; - - GF_VALIDATE_OR_GOTO ("jbr", this, out); - GF_VALIDATE_OR_GOTO (this->name, frame, out); - local = frame->local; - GF_VALIDATE_OR_GOTO (this->name, local, out); - - gf_msg_trace (this->name, 0, "op_ret = %d, op_errno = %d\n", - op_ret, op_errno); - - LOCK(&frame->lock); - call_count = --(local->call_count); - if (op_ret != -1) { - /* Increment the number of successful acks * - * received for the operation. * - */ - (local->successful_acks)++; - local->successful_op_ret = op_ret; - } - gf_msg_debug (this->name, 0, "succ_acks = %d, op_ret = %d, op_errno = %d\n", - op_ret, op_errno, local->successful_acks); - UNLOCK(&frame->lock); - - /* TBD: variable Completion count */ - if (call_count == 0) { - call_resume(local->stub); - } - - ret = 0; -out: - return ret; -} - -/* template-name write-continue */ -int32_t -jbr_@NAME@_continue (call_frame_t *frame, xlator_t *this, - @LONG_ARGS@) -{ - int32_t ret = -1; - gf_boolean_t result = _gf_false; - jbr_local_t *local = NULL; - jbr_local_t *new_local = NULL; - jbr_private_t *priv = NULL; - int32_t op_errno = 0; - - GF_VALIDATE_OR_GOTO ("jbr", this, out); - GF_VALIDATE_OR_GOTO (this->name, frame, out); - priv = this->private; - local = frame->local; - GF_VALIDATE_OR_GOTO (this->name, priv, out); - GF_VALIDATE_OR_GOTO (this->name, local, out); - - /* Perform quorum check to see if the leader needs * - * to perform the operation. If the operation will not * - * meet quorum irrespective of the leader's result * - * there is no point in the leader performing the fop * - */ - result = fop_quorum_check (this, (double)priv->n_children, - (double)local->successful_acks + 1); - if (result == _gf_false) { - gf_msg (this->name, GF_LOG_ERROR, EROFS, - J_MSG_QUORUM_NOT_MET, "Didn't receive enough acks " - "to meet quorum. Failing the operation without trying " - "it on the leader."); - -#if defined(JBR_CG_QUEUE) - /* - * In case of a fop failure, before unwinding need to * - * remove it from queue * - */ - ret = jbr_remove_from_queue (frame, this); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - J_MSG_GENERIC, "Failed to remove from queue."); - } -#endif - - /* - * In this case, the quorum is not met on the followers * - * So the operation will not be performed on the leader * - * and a rollback will be sent via GF_FOP_IPC to all the * - * followers, where this particular fop's term and index * - * numbers will be journaled, and later used to rollback * - */ - call_frame_t *new_frame; - - new_frame = copy_frame (frame); - - if (new_frame) { - new_local = mem_get0(this->local_pool); - if (new_local) { - INIT_LIST_HEAD(&new_local->qlinks); - ret = dict_set_int32 (local->xdata, - "rollback-fop", - GF_FOP_@UPNAME@); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - J_MSG_DICT_FLR, - "failed to set rollback-fop"); - } else { - new_local->xdata = dict_ref(local->xdata); - new_frame->local = new_local; - jbr_ipc_call_dispatch (new_frame, - this, &op_errno, - FDL_IPC_JBR_SERVER_ROLLBACK, - new_local->xdata); - } - } else { - gf_log (this->name, GF_LOG_WARNING, - "Could not create local for new_frame"); - } - } else { - gf_log (this->name, GF_LOG_WARNING, - "Could not send rollback ipc"); - } - - STACK_UNWIND_STRICT (@NAME@, frame, -1, EROFS, - @ERROR_ARGS@); - } else { - STACK_WIND (frame, jbr_@NAME@_complete, - FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, - @SHORT_ARGS@); - } - -out: - return 0; -} - -/* template-name write-complete */ -int32_t -jbr_@NAME@_complete (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - @LONG_ARGS@) -{ - int32_t ret = -1; - gf_boolean_t result = _gf_false; - jbr_private_t *priv = NULL; - jbr_local_t *local = NULL; - jbr_local_t *new_local = NULL; - - GF_VALIDATE_OR_GOTO ("jbr", this, err); - GF_VALIDATE_OR_GOTO (this->name, frame, err); - priv = this->private; - local = frame->local; - GF_VALIDATE_OR_GOTO (this->name, priv, err); - GF_VALIDATE_OR_GOTO (this->name, local, err); - - /* If the fop failed on the leader, then reduce one successful ack - * before calculating the fop quorum - */ - LOCK(&frame->lock); - if (op_ret == -1) - (local->successful_acks)--; - UNLOCK(&frame->lock); - -#if defined(JBR_CG_QUEUE) - ret = jbr_remove_from_queue (frame, this); - if (ret) - goto err; -#endif - -#if defined(JBR_CG_FSYNC) - jbr_mark_fd_dirty(this, local); -#endif - -#if defined(JBR_CG_NEED_FD) - fd_unref(local->fd); -#endif - - /* After the leader completes the fop, a quorum check is * - * performed, taking into account the outcome of the fop * - * on the leader. Irrespective of the fop being successful * - * or failing on the leader, the result of the quorum will * - * determine if the overall fop is successful or not. For * - * example, a fop might have succeeded on every node except * - * the leader, in which case as quorum is being met, the fop * - * will be treated as a successful fop, even though it failed * - * on the leader. On follower nodes, no quorum check should * - * be done, and the result is returned to the leader as is. * - */ - if (priv->leader) { - result = fop_quorum_check (this, (double)priv->n_children, - (double)local->successful_acks + 1); - if (result == _gf_false) { - op_ret = -1; - op_errno = EROFS; - gf_msg (this->name, GF_LOG_ERROR, EROFS, - J_MSG_QUORUM_NOT_MET, "Quorum is not met. " - "The operation has failed."); - /* - * In this case, the quorum is not met after the * - * operation is performed on the leader. Hence a * - * rollback will be sent via GF_FOP_IPC to the leader * - * where this particular fop's term and index numbers * - * will be journaled, and later used to rollback. * - * The same will be done on all the followers * - */ - call_frame_t *new_frame; - - new_frame = copy_frame (frame); - if (new_frame) { - new_local = mem_get0(this->local_pool); - if (new_local) { - INIT_LIST_HEAD(&new_local->qlinks); - gf_msg (this->name, GF_LOG_ERROR, 0, - J_MSG_DICT_FLR, "op = %d", - new_frame->op); - ret = dict_set_int32 (local->xdata, - "rollback-fop", - GF_FOP_@UPNAME@); - if (ret) { - gf_msg (this->name, - GF_LOG_ERROR, 0, - J_MSG_DICT_FLR, - "failed to set " - "rollback-fop"); - } else { - new_local->xdata = dict_ref (local->xdata); - new_frame->local = new_local; - /* - * Calling STACK_WIND instead * - * of jbr_ipc as it will not * - * unwind to the previous * - * translators like it will * - * in case of jbr_ipc. * - */ - STACK_WIND (new_frame, - jbr_ipc_complete, - FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ipc, - FDL_IPC_JBR_SERVER_ROLLBACK, - new_local->xdata); - } - } else { - gf_log (this->name, GF_LOG_WARNING, - "Could not create local " - "for new_frame"); - } - } else { - gf_log (this->name, GF_LOG_WARNING, - "Could not send rollback ipc"); - } - } else { -#if defined(JBR_CG_NEED_FD) - op_ret = local->successful_op_ret; -#else - op_ret = 0; -#endif - op_errno = 0; - gf_msg_debug (this->name, 0, - "Quorum has met. The operation has succeeded."); - } - } - - /* - * Unrefing the reference taken in jbr_@NAME@ () * - */ - dict_unref (local->xdata); - - STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, - @SHORT_ARGS@); - - - return 0; - -err: - STACK_UNWIND_STRICT (@NAME@, frame, -1, 0, - @SHORT_ARGS@); - - return 0; -} diff --git a/xlators/experimental/jbr-server/src/gen-fops.py b/xlators/experimental/jbr-server/src/gen-fops.py deleted file mode 100755 index 616782bba45..00000000000 --- a/xlators/experimental/jbr-server/src/gen-fops.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/python3 - -# This script generates the boilerplate versions of most fops and cbks in the -# server. This allows the details of leadership-status checking, sequencing -# between leader and followers (including fan-out), and basic error checking -# to be centralized one place, with per-operation code kept to a minimum. - -from __future__ import print_function -import os -import re -import string -import sys - -curdir = os.path.dirname(sys.argv[0]) -gendir = os.path.join(curdir, '../../../../libglusterfs/src') -sys.path.append(gendir) -from generator import ops, fop_subs, cbk_subs, generate - -# We really want the callback argument list, even when we're generating fop -# code, so we propagate here. -# TBD: this should probably be right in generate.py -for k, v in cbk_subs.items(): - fop_subs[k]['@ERROR_ARGS@'] = v['@ERROR_ARGS@'] - -# Stolen from old codegen.py -def load_templates (path): - templates = {} - tmpl_re = re.compile("/\* template-name (.*) \*/") - templates = {} - t_name = None - for line in open(path, "r").readlines(): - if not line: - break - m = tmpl_re.match(line) - if m: - if t_name: - templates[t_name] = ''.join(t_contents) - t_name = m.group(1).strip() - t_contents = [] - elif t_name: - t_contents.append(line) - if t_name: - templates[t_name] = ''.join(t_contents) - return templates - -# We need two types of templates. The first, for pure read operations, just -# needs to do a simple am-i-leader check (augmented to allow dirty reads). -# The second, for pure writes, needs to do fan-out to followers between those -# initial checks and local execution. There are other operations that don't -# fit neatly into either category - e.g. lock ops or fsync - so we'll just have -# to handle those manually. The table thus includes entries only for those we -# can categorize. The special cases, plus any new operations we've never even -# heard of, aren't in there. -# -# Various keywords can be used to define/undefine preprocessor symbols used -# in the templates, on a per-function basis. For example, if the keyword here -# is "fsync" (lowercase word or abbreviation) that will cause JBR_CG_FSYNC -# (prefix plus uppercase version) to be defined above all of the generated code -# for that fop. - -fop_table = { - "access": "read", - "create": "write", - "discard": "write", -# "entrylk": "read", - "fallocate": "write", -# "fentrylk": "read", - "fgetxattr": "read", -# "finodelk": "read", -# "flush": "read", - "fremovexattr": "write", - "fsetattr": "write", - "fsetxattr": "write", - "fstat": "read", -# "fsync": "read", -# "fsyncdir": "read", - "ftruncate": "write", - "fxattrop": "write", - "getxattr": "read", -# "inodelk": "read", - "link": "write", - "lk": "write,queue", -# "lookup": "read", - "mkdir": "write", - "mknod": "write", - "open": "write", - "opendir": "read", - "rchecksum": "read", - "readdir": "read", - "readdirp": "read", - "readlink": "read", - "readv": "read", - "removexattr": "write", - "rename": "write", - "rmdir": "write", - "setattr": "write", - "setxattr": "write", - "stat": "read", - "statfs": "read", - "symlink": "write", - "truncate": "write", - "unlink": "write", - "writev": "write,fsync,queue", - "xattrop": "write", - "ipc": "write", -} - -# Mention those fops in the selective_generate table, for which -# only a few common functions will be generated, and mention those -# functions. Rest of the functions can be customized -selective_generate = { - "lk": "fop,dispatch,call_dispatch", - "ipc": "dispatch,call_dispatch", -} - -# Stolen from gen_fdl.py -def gen_server (templates): - fops_done = [] - for name in fop_table.keys(): - info = fop_table[name].split(",") - kind = info[0] - flags = info[1:] - - # generate all functions for the fops in fop_table - # except for the ones in selective_generate for which - # generate only the functions mentioned in the - # selective_generate table - gen_funcs = "fop,complete,continue,fan-in,dispatch, \ - call_dispatch,perform_local_op" - if name in selective_generate: - gen_funcs = selective_generate[name].split(",") - - if ("fsync" in flags) or ("queue" in flags): - flags.append("need_fd") - for fname in flags: - print("#define JBR_CG_%s" % fname.upper()) - - if 'complete' in gen_funcs: - print(generate(templates[kind+"-complete"], - name, cbk_subs)) - - if 'continue' in gen_funcs: - print(generate(templates[kind+"-continue"], - name, fop_subs)) - - if 'fan-in' in gen_funcs: - print(generate(templates[kind+"-fan-in"], - name, cbk_subs)) - - if 'dispatch' in gen_funcs: - print(generate(templates[kind+"-dispatch"], - name, fop_subs)) - - if 'call_dispatch' in gen_funcs: - print(generate(templates[kind+"-call_dispatch"], - name, fop_subs)) - - if 'perform_local_op' in gen_funcs: - print(generate(templates[kind+"-perform_local_op"], - name, fop_subs)) - - if 'fop' in gen_funcs: - print(generate(templates[kind+"-fop"], name, fop_subs)) - - for fname in flags: - print("#undef JBR_CG_%s" % fname.upper()) - fops_done.append(name) - # Just for fun, emit the fops table too. - print("struct xlator_fops fops = {") - for x in fops_done: - print((" .%s = jbr_%s,"%(x, x))) - print("};") - -tmpl = load_templates(sys.argv[1]) -for l in open(sys.argv[2], 'r').readlines(): - if l.find('#pragma generate') != -1: - print("/* BEGIN GENERATED CODE - DO NOT MODIFY */") - gen_server(tmpl) - print("/* END GENERATED CODE */") - else: - print(l[:-1]) diff --git a/xlators/experimental/jbr-server/src/jbr-internal.h b/xlators/experimental/jbr-server/src/jbr-internal.h deleted file mode 100644 index f225e988a5f..00000000000 --- a/xlators/experimental/jbr-server/src/jbr-internal.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#include <sys/stat.h> -#include <sys/types.h> - -#define LEADER_XATTR "user.jbr.leader" -#define SECOND_CHILD(xl) (xl->children->next->xlator) -#define RECONCILER_PATH JBR_SCRIPT_PREFIX "/reconciler.py" -#define CHANGELOG_ENTRY_SIZE 128 - -enum { - gf_mt_jbr_private_t = gf_common_mt_end + 1, - gf_mt_jbr_fd_ctx_t, - gf_mt_jbr_inode_ctx_t, - gf_mt_jbr_dirty_t, - gf_mt_jbr_end -}; - -typedef enum jbr_recon_notify_ev_id_t { - JBR_RECON_SET_LEADER = 1, - JBR_RECON_ADD_CHILD = 2 -} jbr_recon_notify_ev_id_t; - -typedef struct _jbr_recon_notify_ev_s { - jbr_recon_notify_ev_id_t id; - uint32_t index; /* in case of add */ - struct list_head list; -} jbr_recon_notify_ev_t; - -typedef struct { - /* - * This is a hack to allow a non-leader to accept requests while the - * leader is down, and it only works for n=2. The way it works is that - * "config_leader" indicates the state from our options (via init or - * reconfigure) but "leader" is what the fop code actually looks at. If - * config_leader is true, then leader will *always* be true as well, - * giving that brick precedence. If config_leader is false, then - * leader will only be true if there is no connection to the other - * brick (tracked in jbr_notify). - * - * TBD: implement real leader election - */ - gf_boolean_t config_leader; - gf_boolean_t leader; - uint8_t up_children; - uint8_t n_children; - char *vol_file; - uint32_t current_term; - uint32_t kid_state; - gf_lock_t dirty_lock; - struct list_head dirty_fds; - uint32_t index; - gf_lock_t index_lock; - double quorum_pct; - int term_fd; - long term_total; - long term_read; - /* - * This is a super-duper hack, but it will do for now. The reason it's - * a hack is that we pass this to dict_set_static_bin, so we don't have - * to mess around with allocating and freeing it on every single IPC - * request, but it's totally not thread-safe. On the other hand, there - * should only be one reconciliation thread running and calling these - * functions at a time, so maybe that doesn't matter. - * - * TBD: re-evaluate how to manage this - */ - char term_buf[CHANGELOG_ENTRY_SIZE]; - gf_boolean_t child_up; /* To maintain the state of * - * the translator */ -} jbr_private_t; - -typedef struct { - call_stub_t *stub; - call_stub_t *qstub; - uint32_t call_count; - uint32_t successful_acks; - uint32_t successful_op_ret; - fd_t *fd; - struct list_head qlinks; - dict_t *xdata; -} jbr_local_t; - -/* - * This should match whatever changelog returns on the pre-op for us to pass - * when we're ready for our post-op. - */ -typedef uint32_t log_id_t; - -typedef struct { - struct list_head links; - log_id_t id; -} jbr_dirty_list_t; - -typedef struct { - fd_t *fd; - struct list_head dirty_list; - struct list_head fd_list; -} jbr_fd_ctx_t; - -typedef struct { - gf_lock_t lock; - uint32_t active; - struct list_head aqueue; - uint32_t pending; - struct list_head pqueue; -} jbr_inode_ctx_t; - -void -jbr_start_reconciler(xlator_t *this); diff --git a/xlators/experimental/jbr-server/src/jbr.c b/xlators/experimental/jbr-server/src/jbr.c deleted file mode 100644 index 0d42740e504..00000000000 --- a/xlators/experimental/jbr-server/src/jbr.c +++ /dev/null @@ -1,1675 +0,0 @@ -/* - Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com> - This file is part of GlusterFS. - - This file is licensed to you under your choice of the GNU Lesser - General Public License, version 3 or any later version (LGPLv3 or - later), or the GNU General Public License, version 2 (GPLv2), in all - cases as published by the Free Software Foundation. -*/ - -#ifndef _CONFIG_H -#define _CONFIG_H -#include "config.h" -#endif - -#include <fnmatch.h> -#include "call-stub.h" -#include "defaults.h" -#include "xlator.h" -#include "glfs.h" -#include "glfs-internal.h" -#include "run.h" -#include "common-utils.h" -#include "syncop.h" -#include "syscall.h" -#include "compat-errno.h" -#include "fdl.h" - -#include "jbr-internal.h" -#include "jbr-messages.h" - -#define JBR_FLUSH_INTERVAL 5 - -enum { - /* echo "cluster/jbr-server" | md5sum | cut -c 1-8 */ - JBR_SERVER_IPC_BASE = 0x0e2d66a5, - JBR_SERVER_TERM_RANGE, - JBR_SERVER_OPEN_TERM, - JBR_SERVER_NEXT_ENTRY -}; - -/* - * Need to declare jbr_lk_call_dispatch as jbr_lk_continue and * - * jbr_lk_perform_local_op call it, before code is generated. * - */ -int32_t -jbr_lk_call_dispatch(call_frame_t *frame, xlator_t *this, int *op_errno, - fd_t *fd, int32_t cmd, struct gf_flock *lock, - dict_t *xdata); - -int32_t -jbr_lk_dispatch(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *lock, dict_t *xdata); - -int32_t -jbr_ipc_call_dispatch(call_frame_t *frame, xlator_t *this, int *op_errno, - int32_t op, dict_t *xdata); - -int32_t -jbr_ipc_complete(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata); - -/* Used to check the quorum of acks received after the fop - * confirming the status of the fop on all the brick processes - * for this particular subvolume - */ -gf_boolean_t -fop_quorum_check(xlator_t *this, double n_children, double current_state) -{ - jbr_private_t *priv = NULL; - gf_boolean_t result = _gf_false; - double required = 0; - double current = 0; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - priv = this->private; - GF_VALIDATE_OR_GOTO(this->name, priv, out); - - required = n_children * priv->quorum_pct; - - /* - * Before performing the fop on the leader, we need to check, - * if there is any merit in performing the fop on the leader. - * In a case, where even a successful write on the leader, will - * not meet quorum, there is no point in trying the fop on the - * leader. - * When this function is called after the leader has tried - * performing the fop, this check will calculate quorum taking into - * account the status of the fop on the leader. If the leader's - * op_ret was -1, the complete function would account that by - * decrementing successful_acks by 1 - */ - - current = current_state * 100.0; - - if (current < required) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_QUORUM_NOT_MET, - "Quorum not met. quorum_pct = %f " - "Current State = %f, Required State = %f", - priv->quorum_pct, current, required); - } else - result = _gf_true; - -out: - return result; -} - -jbr_inode_ctx_t * -jbr_get_inode_ctx(xlator_t *this, inode_t *inode) -{ - uint64_t ctx_int = 0LL; - jbr_inode_ctx_t *ctx_ptr; - - if (__inode_ctx_get(inode, this, &ctx_int) == 0) { - ctx_ptr = (jbr_inode_ctx_t *)(long)ctx_int; - } else { - ctx_ptr = GF_CALLOC(1, sizeof(*ctx_ptr), gf_mt_jbr_inode_ctx_t); - if (ctx_ptr) { - ctx_int = (uint64_t)(long)ctx_ptr; - if (__inode_ctx_set(inode, this, &ctx_int) == 0) { - LOCK_INIT(&ctx_ptr->lock); - INIT_LIST_HEAD(&ctx_ptr->aqueue); - INIT_LIST_HEAD(&ctx_ptr->pqueue); - } else { - GF_FREE(ctx_ptr); - ctx_ptr = NULL; - } - } - } - - return ctx_ptr; -} - -jbr_fd_ctx_t * -jbr_get_fd_ctx(xlator_t *this, fd_t *fd) -{ - uint64_t ctx_int = 0LL; - jbr_fd_ctx_t *ctx_ptr; - - if (__fd_ctx_get(fd, this, &ctx_int) == 0) { - ctx_ptr = (jbr_fd_ctx_t *)(long)ctx_int; - } else { - ctx_ptr = GF_CALLOC(1, sizeof(*ctx_ptr), gf_mt_jbr_fd_ctx_t); - if (ctx_ptr) { - if (__fd_ctx_set(fd, this, (uint64_t)ctx_ptr) == 0) { - INIT_LIST_HEAD(&ctx_ptr->dirty_list); - INIT_LIST_HEAD(&ctx_ptr->fd_list); - } else { - GF_FREE(ctx_ptr); - ctx_ptr = NULL; - } - } - } - - return ctx_ptr; -} - -void -jbr_mark_fd_dirty(xlator_t *this, jbr_local_t *local) -{ - fd_t *fd = local->fd; - jbr_fd_ctx_t *ctx_ptr; - jbr_dirty_list_t *dirty; - jbr_private_t *priv = this->private; - - /* - * TBD: don't do any of this for O_SYNC/O_DIRECT writes. - * Unfortunately, that optimization requires that we distinguish - * between writev and other "write" calls, saving the original flags - * and checking them in the callback. Too much work for too little - * gain right now. - */ - - LOCK(&fd->lock); - ctx_ptr = jbr_get_fd_ctx(this, fd); - dirty = GF_CALLOC(1, sizeof(*dirty), gf_mt_jbr_dirty_t); - if (ctx_ptr && dirty) { - gf_msg_trace(this->name, 0, "marking fd %p as dirty (%p)", fd, dirty); - /* TBD: fill dirty->id from what changelog gave us */ - list_add_tail(&dirty->links, &ctx_ptr->dirty_list); - if (list_empty(&ctx_ptr->fd_list)) { - /* Add a ref so _release doesn't get called. */ - ctx_ptr->fd = fd_ref(fd); - LOCK(&priv->dirty_lock); - list_add_tail(&ctx_ptr->fd_list, &priv->dirty_fds); - UNLOCK(&priv->dirty_lock); - } - } else { - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, J_MSG_MEM_ERR, - "could not mark %p dirty", fd); - if (ctx_ptr) { - GF_FREE(ctx_ptr); - } - if (dirty) { - GF_FREE(dirty); - } - } - UNLOCK(&fd->lock); -} - -#define JBR_TERM_XATTR "trusted.jbr.term" -#define JBR_INDEX_XATTR "trusted.jbr.index" -#define JBR_REP_COUNT_XATTR "trusted.jbr.rep-count" -#define RECON_TERM_XATTR "trusted.jbr.recon-term" -#define RECON_INDEX_XATTR "trusted.jbr.recon-index" - -int32_t -jbr_leader_checks_and_init(call_frame_t *frame, xlator_t *this, int *op_errno, - dict_t *xdata, fd_t *fd) -{ - jbr_local_t *local = NULL; - jbr_private_t *priv = NULL; - int32_t ret = -1; - gf_boolean_t result = _gf_false; - int from_leader = _gf_false; - int from_recon = _gf_false; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - priv = this->private; - GF_VALIDATE_OR_GOTO(this->name, priv, out); - GF_VALIDATE_OR_GOTO(this->name, op_errno, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - - /* - * Our first goal here is to avoid "split brain surprise" for users who - * specify exactly 50% with two- or three-way replication. That means - * either a more-than check against half the total replicas or an - * at-least check against half of our peers (one less). Of the two, - * only an at-least check supports the intuitive use of 100% to mean - * all replicas must be present, because "more than 100%" will never - * succeed regardless of which count we use. This leaves us with a - * slightly non-traditional definition of quorum ("at least X% of peers - * not including ourselves") but one that's useful enough to be worth - * it. - * - * Note that n_children and up_children *do* include the local - * subvolume, so we need to subtract one in each case. - */ - if (priv->leader) { - result = fop_quorum_check(this, (double)(priv->n_children - 1), - (double)(priv->up_children - 1)); - - if (result == _gf_false) { - /* Emulate the AFR client-side-quorum behavior. */ - gf_msg(this->name, GF_LOG_ERROR, EROFS, J_MSG_QUORUM_NOT_MET, - "Sufficient number of " - "subvolumes are not up to meet quorum."); - *op_errno = EROFS; - goto out; - } - } else { - if (xdata) { - from_leader = !!dict_get(xdata, JBR_TERM_XATTR); - from_recon = !!dict_get(xdata, RECON_TERM_XATTR) && - !!dict_get(xdata, RECON_INDEX_XATTR); - } else { - from_leader = from_recon = _gf_false; - } - - /* follower/recon path * - * just send it to local node * - */ - if (!from_leader && !from_recon) { - *op_errno = EREMOTE; - goto out; - } - } - - local = mem_get0(this->local_pool); - if (!local) { - goto out; - } - - if (fd) - local->fd = fd_ref(fd); - else - local->fd = NULL; - - INIT_LIST_HEAD(&local->qlinks); - local->successful_acks = 0; - frame->local = local; - - ret = 0; -out: - return ret; -} - -int32_t -jbr_initialize_xdata_set_attrs(xlator_t *this, dict_t **xdata) -{ - jbr_private_t *priv = NULL; - int32_t ret = -1; - uint32_t ti = 0; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - priv = this->private; - GF_VALIDATE_OR_GOTO(this->name, priv, out); - GF_VALIDATE_OR_GOTO(this->name, xdata, out); - - if (!*xdata) { - *xdata = dict_new(); - if (!*xdata) { - gf_msg(this->name, GF_LOG_ERROR, ENOMEM, J_MSG_MEM_ERR, - "failed to allocate xdata"); - goto out; - } - } - - if (dict_set_int32(*xdata, JBR_TERM_XATTR, priv->current_term) != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_DICT_FLR, - "failed to set jbr-term"); - goto out; - } - - LOCK(&priv->index_lock); - ti = ++(priv->index); - UNLOCK(&priv->index_lock); - if (dict_set_int32(*xdata, JBR_INDEX_XATTR, ti) != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_DICT_FLR, - "failed to set index"); - goto out; - } - - ret = 0; -out: - return ret; -} - -int32_t -jbr_remove_from_queue(call_frame_t *frame, xlator_t *this) -{ - int32_t ret = -1; - jbr_inode_ctx_t *ictx = NULL; - jbr_local_t *local = NULL; - jbr_local_t *next = NULL; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - local = frame->local; - GF_VALIDATE_OR_GOTO(this->name, local, out); - - if (local->qlinks.next != &local->qlinks) { - list_del(&local->qlinks); - ictx = jbr_get_inode_ctx(this, local->fd->inode); - if (ictx) { - LOCK(&ictx->lock); - if (ictx->pending) { - /* - * TBD: dequeue *all* non-conflicting - * reqs - * - * With the stub implementation there - * can only be one request active at a - * time (zero here) so it's not an - * issue. In a real implementation - * there might still be other active - * requests to check against, and - * multiple pending requests that could - * continue. - */ - gf_msg_debug(this->name, 0, "unblocking next request"); - --(ictx->pending); - next = list_entry(ictx->pqueue.next, jbr_local_t, qlinks); - list_del(&next->qlinks); - list_add_tail(&next->qlinks, &ictx->aqueue); - call_resume(next->qstub); - } else { - --(ictx->active); - } - UNLOCK(&ictx->lock); - } - } - - ret = 0; - -out: - return ret; -} - -int32_t -jbr_lk_complete(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *flock, - dict_t *xdata) -{ - int32_t ret = -1; - jbr_private_t *priv = NULL; - jbr_local_t *local = NULL; - gf_boolean_t result = _gf_false; - - GF_VALIDATE_OR_GOTO("jbr", this, err); - priv = this->private; - GF_VALIDATE_OR_GOTO(this->name, priv, err); - GF_VALIDATE_OR_GOTO(this->name, frame, err); - local = frame->local; - GF_VALIDATE_OR_GOTO(this->name, local, err); - GF_VALIDATE_OR_GOTO(this->name, flock, err); - GF_VALIDATE_OR_GOTO(this->name, xdata, err); - - /* - * Remove from queue for unlock operation only * - * For lock operation, it will be done in fan-in * - */ - if (flock->l_type == F_UNLCK) { - ret = jbr_remove_from_queue(frame, this); - if (ret) - goto err; - } - - /* - * On a follower, unwind with the op_ret and op_errno. On a * - * leader, if the fop is a locking fop, and its a failure, * - * send fail, else call stub which will dispatch the fop to * - * the followers. * - * * - * If the fop is a unlocking fop, check quorum. If quorum * - * is met, then send success. Else Rollback on leader, * - * followed by followers, and then send -ve ack to client. * - */ - if (priv->leader) { - /* Increase the successful acks if it's a success. */ - LOCK(&frame->lock); - if (op_ret != -1) - (local->successful_acks)++; - UNLOCK(&frame->lock); - - if (flock->l_type == F_UNLCK) { - result = fop_quorum_check(this, (double)priv->n_children, - (double)local->successful_acks); - if (result == _gf_false) { - op_ret = -1; - op_errno = EROFS; - gf_msg(this->name, GF_LOG_ERROR, EROFS, J_MSG_QUORUM_NOT_MET, - "Quorum is not met. " - "The operation has failed."); - - /* TODO: PERFORM UNLOCK ROLLBACK ON LEADER * - * FOLLOWED BY FOLLOWERS. */ - } else { - op_ret = 0; - op_errno = 0; - } - - fd_unref(local->fd); - STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, flock, xdata); - } else { - if (op_ret == -1) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_LOCK_FAILURE, - "The lock operation failed on " - "the leader."); - - fd_unref(local->fd); - STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, flock, xdata); - } else { - if (!local->stub) { - goto err; - } - - call_resume(local->stub); - } - } - } else { - fd_unref(local->fd); - STACK_UNWIND_STRICT(lk, frame, op_ret, op_errno, flock, xdata); - } - - return 0; - -err: - if (local) { - if (local->stub) { - call_stub_destroy(local->stub); - } - if (local->qstub) { - call_stub_destroy(local->qstub); - } - if (local->fd) { - fd_unref(local->fd); - } - mem_put(local); - } - STACK_UNWIND_STRICT(lk, frame, -1, op_errno, flock, xdata); - return 0; -} - -int32_t -jbr_lk_fan_in(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct gf_flock *flock, dict_t *xdata) -{ - uint8_t call_count = -1; - int32_t ret = -1; - gf_boolean_t result = _gf_false; - jbr_local_t *local = NULL; - jbr_private_t *priv = NULL; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - priv = this->private; - local = frame->local; - GF_VALIDATE_OR_GOTO(this->name, priv, out); - GF_VALIDATE_OR_GOTO(this->name, local, out); - - gf_msg_trace(this->name, 0, "op_ret = %d, op_errno = %d\n", op_ret, - op_errno); - - LOCK(&frame->lock); - call_count = --(local->call_count); - if (op_ret != -1) { - /* Increment the number of successful acks * - * received for the operation. * - */ - (local->successful_acks)++; - local->successful_op_ret = op_ret; - } - gf_msg_debug(this->name, 0, "succ_acks = %d, op_ret = %d, op_errno = %d\n", - op_ret, op_errno, local->successful_acks); - UNLOCK(&frame->lock); - - if (call_count == 0) { - /* - * If the fop is a locking fop, then check quorum. If quorum * - * is met, send successful ack to the client. If quorum is * - * not met, then rollback locking on followers, followed by * - * rollback of locking on leader, and then sending -ve ack * - * to the client. * - * * - * If the fop is a unlocking fop, then call stub. * - */ - if (flock->l_type == F_UNLCK) { - call_resume(local->stub); - } else { - /* - * Remove from queue for locking fops, for unlocking * - * fops, it is taken care of in jbr_lk_complete * - */ - ret = jbr_remove_from_queue(frame, this); - if (ret) - goto out; - - fd_unref(local->fd); - - result = fop_quorum_check(this, (double)priv->n_children, - (double)local->successful_acks); - if (result == _gf_false) { - gf_msg(this->name, GF_LOG_ERROR, EROFS, J_MSG_QUORUM_NOT_MET, - "Didn't receive enough acks to meet " - "quorum. Failing the locking " - "operation and initiating rollback on " - "followers and the leader " - "respectively."); - - /* TODO: PERFORM ROLLBACK OF LOCKING ON - * FOLLOWERS, FOLLOWED BY ROLLBACK ON - * LEADER. - */ - - STACK_UNWIND_STRICT(lk, frame, -1, EROFS, flock, xdata); - } else { - STACK_UNWIND_STRICT(lk, frame, 0, 0, flock, xdata); - } - } - } - - ret = 0; -out: - return ret; -} - -/* - * Called from leader for locking fop, being written as a separate - * function so as to support queues. - */ -int32_t -jbr_perform_lk_on_leader(call_frame_t *frame, xlator_t *this, fd_t *fd, - int32_t cmd, struct gf_flock *flock, dict_t *xdata) -{ - int32_t ret = -1; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - GF_VALIDATE_OR_GOTO(this->name, flock, out); - GF_VALIDATE_OR_GOTO(this->name, fd, out); - - STACK_WIND(frame, jbr_lk_complete, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lk, fd, cmd, flock, xdata); - - ret = 0; -out: - return ret; -} - -int32_t -jbr_lk_perform_local_op(call_frame_t *frame, xlator_t *this, int *op_errno, - fd_t *fd, int32_t cmd, struct gf_flock *flock, - dict_t *xdata) -{ - int32_t ret = -1; - jbr_local_t *local = NULL; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - local = frame->local; - GF_VALIDATE_OR_GOTO(this->name, local, out); - GF_VALIDATE_OR_GOTO(this->name, fd, out); - GF_VALIDATE_OR_GOTO(this->name, op_errno, out); - GF_VALIDATE_OR_GOTO(this->name, flock, out); - - /* - * Check if the fop is a locking fop or unlocking fop, and - * handle it accordingly. If it is a locking fop, take the - * lock on leader first, and then send it to the followers. - * If it is a unlocking fop, unlock the followers first, - * and then on meeting quorum perform the unlock on the leader. - */ - if (flock->l_type == F_UNLCK) { - ret = jbr_lk_call_dispatch(frame, this, op_errno, fd, cmd, flock, - xdata); - if (ret) - goto out; - } else { - jbr_inode_ctx_t *ictx = jbr_get_inode_ctx(this, fd->inode); - - if (!ictx) { - *op_errno = EIO; - goto out; - } - - LOCK(&ictx->lock); - if (ictx->active) { - gf_msg_debug(this->name, 0, "queuing request due to conflict"); - - local->qstub = fop_lk_stub(frame, jbr_perform_lk_on_leader, fd, cmd, - flock, xdata); - if (!local->qstub) { - UNLOCK(&ictx->lock); - goto out; - } - list_add_tail(&local->qlinks, &ictx->pqueue); - ++(ictx->pending); - UNLOCK(&ictx->lock); - ret = 0; - goto out; - } else { - list_add_tail(&local->qlinks, &ictx->aqueue); - ++(ictx->active); - } - UNLOCK(&ictx->lock); - ret = jbr_perform_lk_on_leader(frame, this, fd, cmd, flock, xdata); - if (ret == -1) - goto out; - } - - ret = 0; -out: - return ret; -} - -int32_t -jbr_lk_continue(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, - struct gf_flock *flock, dict_t *xdata) -{ - int32_t ret = -1; - jbr_local_t *local = NULL; - jbr_private_t *priv = NULL; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - priv = this->private; - local = frame->local; - GF_VALIDATE_OR_GOTO(this->name, priv, out); - GF_VALIDATE_OR_GOTO(this->name, local, out); - GF_VALIDATE_OR_GOTO(this->name, flock, out); - GF_VALIDATE_OR_GOTO(this->name, fd, out); - GF_VALIDATE_OR_GOTO(this->name, xdata, out); - - /* - * If it's a locking fop, then call dispatch to followers * - * If it's a unlock fop, then perform the unlock operation * - */ - if (flock->l_type == F_UNLCK) { - STACK_WIND(frame, jbr_lk_complete, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->lk, fd, cmd, flock, xdata); - } else { - /* - * Directly call jbr_lk_dispatch instead of appending * - * in queue, which is done at jbr_lk_perform_local_op * - * for locking fops * - */ - ret = jbr_lk_dispatch(frame, this, fd, cmd, flock, xdata); - if (ret) { - STACK_UNWIND_STRICT(lk, frame, -1, 0, flock, xdata); - goto out; - } - } - - ret = 0; -out: - return ret; -} - -uint8_t -jbr_count_up_kids(jbr_private_t *priv) -{ - uint8_t retval = 0; - uint8_t i; - - for (i = 0; i < priv->n_children; ++i) { - if (priv->kid_state & (1 << i)) { - ++retval; - } - } - - return retval; -} - -/* - * The fsync machinery looks a lot like that for any write call, but there are - * some important differences that are easy to miss. First, we don't care - * about the xdata that shows whether the call came from a leader or - * reconciliation process. If we're the leader we fan out; if we're not we - * don't. Second, we don't wait for followers before we issue the local call. - * The code generation system could be updated to handle this, and still might - * if we need to implement other "almost identical" paths (e.g. for open), but - * a copy is more readable as long as it's just one. - */ - -int32_t -jbr_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, - int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, - dict_t *xdata) -{ - jbr_local_t *local = frame->local; - gf_boolean_t unwind; - - LOCK(&frame->lock); - unwind = !--(local->call_count); - UNLOCK(&frame->lock); - - if (unwind) { - STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - } - return 0; -} - -int32_t -jbr_fsync_local_cbk(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - jbr_dirty_list_t *dirty; - jbr_dirty_list_t *dtmp; - jbr_local_t *local = frame->local; - - list_for_each_entry_safe(dirty, dtmp, &local->qlinks, links) - { - gf_msg_trace(this->name, 0, "sending post-op on %p (%p)", local->fd, - dirty); - GF_FREE(dirty); - } - - return jbr_fsync_cbk(frame, cookie, this, op_ret, op_errno, prebuf, postbuf, - xdata); -} - -int32_t -jbr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, - dict_t *xdata) -{ - jbr_private_t *priv = this->private; - jbr_local_t *local; - uint64_t ctx_int = 0LL; - jbr_fd_ctx_t *ctx_ptr; - xlator_list_t *trav; - - local = mem_get0(this->local_pool); - if (!local) { - STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, NULL, NULL, xdata); - return 0; - } - INIT_LIST_HEAD(&local->qlinks); - frame->local = local; - - /* Move the dirty list from the fd to the fsync request. */ - LOCK(&fd->lock); - if (__fd_ctx_get(fd, this, &ctx_int) == 0) { - ctx_ptr = (jbr_fd_ctx_t *)(long)ctx_int; - list_splice_init(&ctx_ptr->dirty_list, &local->qlinks); - } - UNLOCK(&fd->lock); - - /* Issue the local call. */ - local->call_count = priv->leader ? priv->n_children : 1; - STACK_WIND(frame, jbr_fsync_local_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, fd, flags, xdata); - - /* Issue remote calls if we're the leader. */ - if (priv->leader) { - for (trav = this->children->next; trav; trav = trav->next) { - STACK_WIND(frame, jbr_fsync_cbk, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->fsync, fd, flags, xdata); - } - } - - return 0; -} - -int32_t -jbr_getxattr_special(call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *name, dict_t *xdata) -{ - dict_t *result; - jbr_private_t *priv = this->private; - - if (!priv->leader) { - STACK_UNWIND_STRICT(getxattr, frame, -1, EREMOTE, NULL, NULL); - return 0; - } - - if (!name || (strcmp(name, JBR_REP_COUNT_XATTR) != 0)) { - STACK_WIND_TAIL(frame, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); - return 0; - } - - result = dict_new(); - if (!result) { - goto dn_failed; - } - - priv->up_children = jbr_count_up_kids(this->private); - if (dict_set_uint32(result, JBR_REP_COUNT_XATTR, priv->up_children) != 0) { - goto dsu_failed; - } - - STACK_UNWIND_STRICT(getxattr, frame, 0, 0, result, NULL); - dict_unref(result); - return 0; - -dsu_failed: - dict_unref(result); -dn_failed: - STACK_UNWIND_STRICT(getxattr, frame, -1, ENOMEM, NULL, NULL); - return 0; -} - -void -jbr_flush_fd(xlator_t *this, jbr_fd_ctx_t *fd_ctx) -{ - jbr_dirty_list_t *dirty; - jbr_dirty_list_t *dtmp; - - list_for_each_entry_safe(dirty, dtmp, &fd_ctx->dirty_list, links) - { - gf_msg_trace(this->name, 0, "sending post-op on %p (%p)", fd_ctx->fd, - dirty); - GF_FREE(dirty); - } - - INIT_LIST_HEAD(&fd_ctx->dirty_list); -} - -void * -jbr_flush_thread(void *ctx) -{ - xlator_t *this = ctx; - jbr_private_t *priv = this->private; - struct list_head dirty_fds; - jbr_fd_ctx_t *fd_ctx; - jbr_fd_ctx_t *fd_tmp; - int ret; - - for (;;) { - /* - * We have to be very careful to avoid lock inversions here, so - * we can't just hold priv->dirty_lock while we take and - * release locks for each fd. Instead, we only hold dirty_lock - * at the beginning of each iteration, as we (effectively) make - * a copy of the current list head and then clear the original. - * This leads to four scenarios for adding the first entry to - * an fd and potentially putting it on the global list. - * - * (1) While we're asleep. No lock contention, it just gets - * added and will be processed on the next iteration. - * - * (2) After we've made a local copy, but before we've started - * processing that fd. The new entry will be added to the - * fd (under its lock), and we'll process it on the current - * iteration. - * - * (3) While we're processing the fd. They'll block on the fd - * lock, then see that the list is empty and put it on the - * global list. We'll process it here on the next - * iteration. - * - * (4) While we're working, but after we've processed that fd. - * Same as (1) as far as that fd is concerned. - */ - INIT_LIST_HEAD(&dirty_fds); - LOCK(&priv->dirty_lock); - list_splice_init(&priv->dirty_fds, &dirty_fds); - UNLOCK(&priv->dirty_lock); - - list_for_each_entry_safe(fd_ctx, fd_tmp, &dirty_fds, fd_list) - { - ret = syncop_fsync(FIRST_CHILD(this), fd_ctx->fd, 0, NULL, NULL); - if (ret) { - gf_msg(this->name, GF_LOG_WARNING, 0, J_MSG_SYS_CALL_FAILURE, - "failed to fsync %p (%d)", fd_ctx->fd, -ret); - } - - LOCK(&fd_ctx->fd->lock); - jbr_flush_fd(this, fd_ctx); - list_del_init(&fd_ctx->fd_list); - UNLOCK(&fd_ctx->fd->lock); - fd_unref(fd_ctx->fd); - } - - sleep(JBR_FLUSH_INTERVAL); - } - - return NULL; -} - -int32_t -jbr_get_changelog_dir(xlator_t *this, char **cl_dir_p) -{ - xlator_t *cl_xl; - - /* Find our changelog translator. */ - cl_xl = this; - while (cl_xl) { - if (strcmp(cl_xl->type, "features/changelog") == 0) { - break; - } - cl_xl = cl_xl->children->xlator; - } - if (!cl_xl) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_INIT_FAIL, - "failed to find changelog translator"); - return ENOENT; - } - - /* Find the actual changelog directory. */ - if (dict_get_str(cl_xl->options, "changelog-dir", cl_dir_p) != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_INIT_FAIL, - "failed to find changelog-dir for %s", cl_xl->name); - return ENODATA; - } - - return 0; -} - -void -jbr_get_terms(call_frame_t *frame, xlator_t *this) -{ - int32_t op_errno = 0; - char *cl_dir = NULL; - int32_t term_first = -1; - int32_t term_contig = -1; - int32_t term_last = -1; - int term_num = 0; - char *probe_str = NULL; - dict_t *my_xdata = NULL; - DIR *fp = NULL; - struct dirent *entry = NULL; - struct dirent scratch[2] = { - { - 0, - }, - }; - - op_errno = jbr_get_changelog_dir(this, &cl_dir); - if (op_errno) { - goto err; /* Error was already logged. */ - } - op_errno = ENODATA; /* Most common error after this. */ - - fp = sys_opendir(cl_dir); - if (!fp) { - op_errno = errno; - goto err; - } - - /* Find first and last terms. */ - for (;;) { - errno = 0; - entry = sys_readdir(fp, scratch); - if (!entry || errno != 0) { - if (errno != 0) { - op_errno = errno; - goto err; - } - break; - } - - if (fnmatch("TERM.*", entry->d_name, FNM_PATHNAME) != 0) { - continue; - } - /* +5 points to the character after the period */ - term_num = atoi(entry->d_name + 5); - gf_msg(this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, "%s => %d", - entry->d_name, term_num); - if (term_num < 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_INVALID, - "invalid term file name %s", entry->d_name); - op_errno = EINVAL; - goto err; - } - if ((term_first < 0) || (term_first > term_num)) { - term_first = term_num; - } - if ((term_last < 0) || (term_last < term_num)) { - term_last = term_num; - } - } - if ((term_first < 0) || (term_last < 0)) { - /* TBD: are we *sure* there should always be at least one? */ - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_NO_DATA, "no terms found"); - op_errno = EINVAL; - goto err; - } - - (void)sys_closedir(fp); - fp = NULL; - - /* - * Find term_contig, which is the earliest term for which there are - * no gaps between it and term_last. - */ - for (term_contig = term_last; term_contig > 0; --term_contig) { - if (gf_asprintf(&probe_str, "%s/TERM.%d", cl_dir, term_contig - 1) <= - 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, - "failed to format term %d", term_contig - 1); - goto err; - } - if (sys_access(probe_str, F_OK) != 0) { - GF_FREE(probe_str); - probe_str = NULL; - break; - } - GF_FREE(probe_str); - probe_str = NULL; - } - - gf_msg(this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, "found terms %d-%d (%d)", - term_first, term_last, term_contig); - - /* Return what we've found */ - my_xdata = dict_new(); - if (!my_xdata) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, - "failed to allocate reply dictionary"); - goto err; - } - if (dict_set_int32(my_xdata, "term-first", term_first) != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_DICT_FLR, - "failed to set term-first"); - goto err; - } - if (dict_set_int32(my_xdata, "term-contig", term_contig) != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_DICT_FLR, - "failed to set term-contig"); - goto err; - } - if (dict_set_int32(my_xdata, "term-last", term_last) != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_DICT_FLR, - "failed to set term-last"); - goto err; - } - - /* Finally! */ - STACK_UNWIND_STRICT(ipc, frame, 0, 0, my_xdata); - dict_unref(my_xdata); - return; - -err: - if (fp) { - (void)sys_closedir(fp); - } - if (my_xdata) { - dict_unref(my_xdata); - } - - if (probe_str) - GF_FREE(probe_str); - - STACK_UNWIND_STRICT(ipc, frame, -1, op_errno, NULL); -} - -long -get_entry_count(xlator_t *this, int fd) -{ - struct stat buf; - long min; /* last entry not known to be empty */ - long max; /* first entry known to be empty */ - long curr; - char entry[CHANGELOG_ENTRY_SIZE]; - - if (sys_fstat(fd, &buf) < 0) { - return -1; - } - - min = 0; - max = buf.st_size / CHANGELOG_ENTRY_SIZE; - - while ((min + 1) < max) { - curr = (min + max) / 2; - if (sys_lseek(fd, curr * CHANGELOG_ENTRY_SIZE, SEEK_SET) < 0) { - return -1; - } - if (sys_read(fd, entry, sizeof(entry)) != sizeof(entry)) { - return -1; - } - if ((entry[0] == '_') && (entry[1] == 'P')) { - min = curr; - } else { - max = curr; - } - } - - if (sys_lseek(fd, 0, SEEK_SET) < 0) { - gf_msg(this->name, GF_LOG_WARNING, 0, J_MSG_SYS_CALL_FAILURE, - "failed to reset offset"); - } - return max; -} - -void -jbr_open_term(call_frame_t *frame, xlator_t *this, dict_t *xdata) -{ - int32_t op_errno; - char *cl_dir; - char *term; - char *path = NULL; - jbr_private_t *priv = this->private; - - op_errno = jbr_get_changelog_dir(this, &cl_dir); - if (op_errno) { - goto err; - } - - if (dict_get_str(xdata, "term", &term) != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_NO_DATA, "missing term"); - op_errno = ENODATA; - goto err; - } - - if (gf_asprintf(&path, "%s/TERM.%s", cl_dir, term) < 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, - "failed to construct path"); - op_errno = ENOMEM; - goto err; - } - - if (priv->term_fd >= 0) { - sys_close(priv->term_fd); - } - priv->term_fd = open(path, O_RDONLY); - if (priv->term_fd < 0) { - op_errno = errno; - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_SYS_CALL_FAILURE, - "failed to open term file"); - goto err; - } - - priv->term_total = get_entry_count(this, priv->term_fd); - if (priv->term_total < 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_NO_DATA, - "failed to get entry count"); - sys_close(priv->term_fd); - priv->term_fd = -1; - op_errno = EIO; - goto err; - } - priv->term_read = 0; - - /* Success! */ - STACK_UNWIND_STRICT(ipc, frame, 0, 0, NULL); - GF_FREE(path); - return; - -err: - STACK_UNWIND_STRICT(ipc, frame, -1, op_errno, NULL); - GF_FREE(path); -} - -void -jbr_next_entry(call_frame_t *frame, xlator_t *this) -{ - int32_t op_errno = ENOMEM; - jbr_private_t *priv = this->private; - ssize_t nbytes; - dict_t *my_xdata; - - if (priv->term_fd < 0) { - op_errno = EBADFD; - goto err; - } - - if (priv->term_read >= priv->term_total) { - op_errno = ENODATA; - goto err; - } - - nbytes = sys_read(priv->term_fd, priv->term_buf, CHANGELOG_ENTRY_SIZE); - if (nbytes < CHANGELOG_ENTRY_SIZE) { - if (nbytes < 0) { - op_errno = errno; - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_SYS_CALL_FAILURE, - "error reading next entry: %s", strerror(errno)); - } else { - op_errno = EIO; - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_SYS_CALL_FAILURE, - "got %zd/%d bytes for next entry", nbytes, - CHANGELOG_ENTRY_SIZE); - } - goto err; - } - ++(priv->term_read); - - my_xdata = dict_new(); - if (!my_xdata) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, - "failed to allocate reply xdata"); - goto err; - } - - if (dict_set_static_bin(my_xdata, "data", priv->term_buf, - CHANGELOG_ENTRY_SIZE) != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_DICT_FLR, - "failed to assign reply xdata"); - goto err; - } - - STACK_UNWIND_STRICT(ipc, frame, 0, 0, my_xdata); - dict_unref(my_xdata); - return; - -err: - STACK_UNWIND_STRICT(ipc, frame, -1, op_errno, NULL); -} - -int32_t -jbr_ipc_fan_in(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - jbr_local_t *local = NULL; - int32_t ret = -1; - uint8_t call_count; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - local = frame->local; - GF_VALIDATE_OR_GOTO(this->name, local, out); - - gf_msg_trace(this->name, 0, "op_ret = %d, op_errno = %d\n", op_ret, - op_errno); - - LOCK(&frame->lock); - call_count = --(local->call_count); - UNLOCK(&frame->lock); - - if (call_count == 0) { -#if defined(JBR_CG_QUEUE) - ret = jbr_remove_from_queue(frame, this); - if (ret) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_GENERIC, - "Failed to remove from queue."); - } -#endif - /* - * Unrefing the reference taken in continue() or complete() * - */ - dict_unref(local->xdata); - STACK_DESTROY(frame->root); - } - - ret = 0; -out: - return ret; -} - -int32_t -jbr_ipc_complete(call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - jbr_local_t *local = NULL; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - GF_VALIDATE_OR_GOTO(this->name, frame, out); - local = frame->local; - GF_VALIDATE_OR_GOTO(this->name, local, out); - - jbr_ipc_call_dispatch(frame, this, &op_errno, FDL_IPC_JBR_SERVER_ROLLBACK, - local->xdata); -out: - return 0; -} - -int32_t -jbr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) -{ - switch (op) { - case JBR_SERVER_TERM_RANGE: - jbr_get_terms(frame, this); - break; - case JBR_SERVER_OPEN_TERM: - jbr_open_term(frame, this, xdata); - break; - case JBR_SERVER_NEXT_ENTRY: - jbr_next_entry(frame, this); - break; - case FDL_IPC_JBR_SERVER_ROLLBACK: - /* - * Just send the fop down to fdl. Need not * - * dispatch it to other bricks in the sub- * - * volume, as it will be done where the op * - * has failed. * - */ - default: - STACK_WIND_TAIL(frame, FIRST_CHILD(this), - FIRST_CHILD(this)->fops->ipc, op, xdata); - } - - return 0; -} - -#pragma generate - -int32_t -jbr_forget(xlator_t *this, inode_t *inode) -{ - uint64_t ctx = 0LL; - - if ((inode_ctx_del(inode, this, &ctx) == 0) && ctx) { - GF_FREE((void *)(long)ctx); - } - - return 0; -} - -int32_t -jbr_release(xlator_t *this, fd_t *fd) -{ - uint64_t ctx = 0LL; - - if ((fd_ctx_del(fd, this, &ctx) == 0) && ctx) { - GF_FREE((void *)(long)ctx); - } - - return 0; -} - -struct xlator_cbks cbks = { - .forget = jbr_forget, - .release = jbr_release, -}; - -int -jbr_reconfigure(xlator_t *this, dict_t *options) -{ - jbr_private_t *priv = this->private; - - GF_OPTION_RECONF("leader", priv->config_leader, options, bool, err); - GF_OPTION_RECONF("quorum-percent", priv->quorum_pct, options, percent, err); - gf_msg(this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, - "reconfigure called, config_leader = %d, quorum_pct = %.1f\n", - priv->leader, priv->quorum_pct); - - priv->leader = priv->config_leader; - - return 0; - -err: - return -1; -} - -int -jbr_get_child_index(xlator_t *this, xlator_t *kid) -{ - xlator_list_t *trav; - int retval = -1; - - for (trav = this->children; trav; trav = trav->next) { - ++retval; - if (trav->xlator == kid) { - return retval; - } - } - - return -1; -} - -/* - * Child notify handling is unreasonably FUBAR. Sometimes we'll get a - * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it. - * Other times we won't. Because it's effectively random (probably racy), we - * can't just maintain a count. We actually have to keep track of the state - * for each child separately, to filter out the bogus CHILD_DOWN events, and - * then generate counts on demand. - */ -int -jbr_notify(xlator_t *this, int event, void *data, ...) -{ - jbr_private_t *priv = this->private; - int index = -1; - int ret = -1; - gf_boolean_t result = _gf_false; - gf_boolean_t relevant = _gf_false; - - switch (event) { - case GF_EVENT_CHILD_UP: - index = jbr_get_child_index(this, data); - if (index >= 0) { - /* Check if the child was previously down - * and it's not a false CHILD_UP - */ - if (!(priv->kid_state & (1 << index))) { - relevant = _gf_true; - } - - priv->kid_state |= (1 << index); - priv->up_children = jbr_count_up_kids(priv); - gf_msg(this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, - "got CHILD_UP for %s, now %u kids", - ((xlator_t *)data)->name, priv->up_children); - if (!priv->config_leader && (priv->up_children > 1)) { - priv->leader = _gf_false; - } - - /* If it's not relevant, or we have already * - * sent CHILD_UP just break */ - if (!relevant || priv->child_up) - break; - - /* If it's not a leader, just send the notify up */ - if (!priv->leader) { - ret = default_notify(this, event, data); - if (!ret) - priv->child_up = _gf_true; - break; - } - - result = fop_quorum_check(this, (double)(priv->n_children - 1), - (double)(priv->up_children - 1)); - if (result == _gf_false) { - gf_msg(this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, - "Not enough children " - "are up to meet quorum. Waiting to " - "send CHILD_UP from leader"); - } else { - gf_msg(this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, - "Enough children are up " - "to meet quorum. Sending CHILD_UP " - "from leader"); - ret = default_notify(this, event, data); - if (!ret) - priv->child_up = _gf_true; - } - } - break; - case GF_EVENT_CHILD_DOWN: - index = jbr_get_child_index(this, data); - if (index >= 0) { - /* Check if the child was previously up - * and it's not a false CHILD_DOWN - */ - if (priv->kid_state & (1 << index)) { - relevant = _gf_true; - } - priv->kid_state &= ~(1 << index); - priv->up_children = jbr_count_up_kids(priv); - gf_msg(this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, - "got CHILD_DOWN for %s, now %u kids", - ((xlator_t *)data)->name, priv->up_children); - if (!priv->config_leader && (priv->up_children < 2) && - relevant) { - priv->leader = _gf_true; - } - - /* If it's not relevant, or we have already * - * sent CHILD_DOWN just break */ - if (!relevant || !priv->child_up) - break; - - /* If it's not a leader, just break coz we shouldn't * - * propagate the failure from the failure till it * - * itself goes down * - */ - if (!priv->leader) { - break; - } - - result = fop_quorum_check(this, (double)(priv->n_children - 1), - (double)(priv->up_children - 1)); - if (result == _gf_false) { - gf_msg(this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, - "Enough children are " - "to down to fail quorum. " - "Sending CHILD_DOWN from leader"); - ret = default_notify(this, event, data); - if (!ret) - priv->child_up = _gf_false; - } else { - gf_msg(this->name, GF_LOG_INFO, 0, J_MSG_GENERIC, - "Not enough children " - "are down to fail quorum. Waiting to " - "send CHILD_DOWN from leader"); - } - } - break; - default: - ret = default_notify(this, event, data); - } - - return ret; -} - -int32_t -mem_acct_init(xlator_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO("jbr", this, out); - - ret = xlator_mem_acct_init(this, gf_mt_jbr_end + 1); - - if (ret != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, - "Memory accounting init" - "failed"); - return ret; - } -out: - return ret; -} - -void -jbr_deallocate_priv(jbr_private_t *priv) -{ - if (!priv) { - return; - } - - GF_FREE(priv); -} - -int32_t -jbr_init(xlator_t *this) -{ - xlator_list_t *remote; - xlator_list_t *local; - jbr_private_t *priv = NULL; - xlator_list_t *trav; - pthread_t kid; - extern xlator_t global_xlator; - glusterfs_ctx_t *oldctx = global_xlator.ctx; - - /* - * Any fop that gets special treatment has to be patched in here, - * because the compiled-in table is produced by the code generator and - * only contains generated functions. Note that we have to go through - * this->fops because of some dynamic-linking strangeness; modifying - * the static table doesn't work. - */ - this->fops->getxattr = jbr_getxattr_special; - this->fops->fsync = jbr_fsync; - - local = this->children; - if (!local) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_NO_DATA, - "no local subvolume"); - goto err; - } - - remote = local->next; - if (!remote) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_NO_DATA, - "no remote subvolumes"); - goto err; - } - - this->local_pool = mem_pool_new(jbr_local_t, 128); - if (!this->local_pool) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, - "failed to create jbr_local_t pool"); - goto err; - } - - priv = GF_CALLOC(1, sizeof(*priv), gf_mt_jbr_private_t); - if (!priv) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_MEM_ERR, - "could not allocate priv"); - goto err; - } - - for (trav = this->children; trav; trav = trav->next) { - ++(priv->n_children); - } - - LOCK_INIT(&priv->dirty_lock); - LOCK_INIT(&priv->index_lock); - INIT_LIST_HEAD(&priv->dirty_fds); - priv->term_fd = -1; - - this->private = priv; - - GF_OPTION_INIT("leader", priv->config_leader, bool, err); - GF_OPTION_INIT("quorum-percent", priv->quorum_pct, percent, err); - - priv->leader = priv->config_leader; - priv->child_up = _gf_false; - - if (gf_thread_create(&kid, NULL, jbr_flush_thread, this, "jbrflush") != 0) { - gf_msg(this->name, GF_LOG_ERROR, 0, J_MSG_SYS_CALL_FAILURE, - "could not start flush thread"); - /* TBD: treat this as a fatal error? */ - } - - /* - * Calling glfs_new changes old->ctx, even if THIS still points - * to global_xlator. That causes problems later in the main - * thread, when gf_log_dump_graph tries to use the FILE after - * we've mucked with it and gets a segfault in __fprintf_chk. - * We can avoid all that by undoing the damage before we - * continue. - */ - global_xlator.ctx = oldctx; - - return 0; - -err: - jbr_deallocate_priv(priv); - return -1; -} - -void -jbr_fini(xlator_t *this) -{ - jbr_deallocate_priv(this->private); -} - -class_methods_t class_methods = { - .init = jbr_init, - .fini = jbr_fini, - .reconfigure = jbr_reconfigure, - .notify = jbr_notify, -}; - -struct volume_options options[] = { - {.key = {"leader"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "false", - .description = "Start in the leader role. This is only for " - "bootstrapping the code, and should go away when we " - "have real leader election."}, - {.key = {"vol-name"}, - .type = GF_OPTION_TYPE_STR, - .description = "volume name"}, - {.key = {"my-name"}, - .type = GF_OPTION_TYPE_STR, - .description = "brick name in form of host:/path"}, - {.key = {"etcd-servers"}, - .type = GF_OPTION_TYPE_STR, - .description = "list of comma separated etc servers"}, - {.key = {"subvol-uuid"}, - .type = GF_OPTION_TYPE_STR, - .description = "UUID for this JBR (sub)volume"}, - {.key = {"quorum-percent"}, - .type = GF_OPTION_TYPE_PERCENT, - .default_value = "50.0", - .description = "percentage of rep_count-1 that must be up"}, - {.key = {NULL}}, -}; |