From 3e50e09723e024cd451c5f48a153fef0fe4857c7 Mon Sep 17 00:00:00 2001 From: Avra Sengupta Date: Thu, 23 Jun 2016 12:15:22 +0530 Subject: jbr: Sending rollback from failed fop to fdl In case of a failed fop, the failure is detected by the leader in the jbr-server in two places. First during a quorum check of +ve responses when it receives responses from all the followers. At this point if the fop hasn't been successfully journaled at a quorum of followers (as in there is no merit in trying the fop in the leader as the quorum will never be met), then we fail the fop. Also if this quorum is met, then the fop is tried on the leader, and after the leader completes the fop a quorum check similar to the previous one is done again, this time including the leaders outcome. If quorum is not met, then we fail the fop. In both these cases, when the fop fails we send a -ve ack to the client. With this patch, now we will also send a rollback through a GF_FOP_IPC to all the followers(and also to the leader in the second case of failure). This rollback will contain the index and term number of the fop which failed. This will be recorded in the respective journals of the bricks and will be used to rollback the fop on that brick later. A subsequent write, and it's respective rollback would look something like the following in the journal. The trusted.jbr.term and trusted.jbr.index present in the dict of both the logs, relate them, and the presence of "rollback-fop" in the dict of IPC indicates that it is a rollback fop, and the value 13(stands for GF_FOP_WRITE) indicates what kind of rollback operation it is. === GF_FOP_WRITE fd = vector = <158 bytes> offset = 0 (0x0) flags = 32769 (0x8001) xdata = dict { trusted.jbr.term = 0 <2 bytes> trusted.jbr.index = 4 <2 bytes> } === GF_FOP_IPC xdata = dict { trusted.jbr.term = 0 <2 bytes> trusted.jbr.index = 4 <2 bytes> rollback-fop = 13 <3 bytes> } Change-Id: I70b6a143d20697153d58e2f719e34ecd1ed160a5 BUG: 1349385 Signed-off-by: Avra Sengupta Reviewed-on: http://review.gluster.org/14783 NetBSD-regression: NetBSD Build System CentOS-regression: Gluster Build System Reviewed-by: Jeff Darcy Smoke: Gluster Build System --- xlators/experimental/jbr-server/src/Makefile.am | 2 + .../experimental/jbr-server/src/all-templates.c | 124 ++++++++++++++++++++- xlators/experimental/jbr-server/src/gen-fops.py | 2 + xlators/experimental/jbr-server/src/jbr-internal.h | 1 + xlators/experimental/jbr-server/src/jbr.c | 81 +++++++++++++- 5 files changed, 201 insertions(+), 9 deletions(-) (limited to 'xlators/experimental/jbr-server/src') diff --git a/xlators/experimental/jbr-server/src/Makefile.am b/xlators/experimental/jbr-server/src/Makefile.am index 5d6209d709a..5dc0273b2f7 100644 --- a/xlators/experimental/jbr-server/src/Makefile.am +++ b/xlators/experimental/jbr-server/src/Makefile.am @@ -10,11 +10,13 @@ jbr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \ noinst_HEADERS = jbr-internal.h \ $(top_srcdir)/xlators/lib/src/libxlator.h \ + $(top_srcdir)/xlators/experimental/fdl/src/fdl.h \ $(top_srcdir)/glusterfsd/src/glusterfsd.h AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ -I$(top_srcdir)/xlators/lib/src -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/xlators/experimental/fdl/src/ \ -DSBIN_DIR=\"$(sbindir)\" -I$(top_srcdir)/api/src \ -DJBR_SCRIPT_PREFIX=\"$(jbrdir)\" \ -I$(top_srcdir)/xlators/experimental/jbr-client/src/ diff --git a/xlators/experimental/jbr-server/src/all-templates.c b/xlators/experimental/jbr-server/src/all-templates.c index 0fb96ac0436..9720442e63f 100644 --- a/xlators/experimental/jbr-server/src/all-templates.c +++ b/xlators/experimental/jbr-server/src/all-templates.c @@ -105,6 +105,7 @@ jbr_@NAME@ (call_frame_t *frame, xlator_t *this, if (ret) goto err; + local->xdata = dict_ref(xdata); local->stub = fop_@NAME@_stub (frame, jbr_@NAME@_continue, @SHORT_ARGS@); if (!local->stub) { @@ -248,7 +249,6 @@ jbr_@NAME@_dispatch (call_frame_t *frame, xlator_t *this, */ local->call_count = priv->n_children - 1; - local->successful_acks = 0; for (trav = this->children->next; trav; trav = trav->next) { STACK_WIND (frame, jbr_@NAME@_fan_in, trav->xlator, trav->xlator->fops->@NAME@, @@ -307,9 +307,12 @@ int32_t jbr_@NAME@_continue (call_frame_t *frame, xlator_t *this, @LONG_ARGS@) { - gf_boolean_t result = _gf_false; - jbr_local_t *local = NULL; - jbr_private_t *priv = NULL; + int32_t ret = -1; + gf_boolean_t result = _gf_false; + jbr_local_t *local = NULL; + jbr_local_t *new_local = NULL; + jbr_private_t *priv = NULL; + int32_t op_errno = 0; GF_VALIDATE_OR_GOTO ("jbr", this, out); GF_VALIDATE_OR_GOTO (this->name, frame, out); @@ -330,6 +333,58 @@ jbr_@NAME@_continue (call_frame_t *frame, xlator_t *this, J_MSG_QUORUM_NOT_MET, "Didn't receive enough acks " "to meet quorum. Failing the operation without trying " "it on the leader."); + +#if defined(JBR_CG_QUEUE) + /* + * In case of a fop failure, before unwinding need to * + * remove it from queue * + */ + ret = jbr_remove_from_queue (frame, this); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_GENERIC, "Failed to remove from queue."); + } +#endif + + /* + * In this case, the quorum is not met on the followers * + * So the operation will not be performed on the leader * + * and a rollback will be sent via GF_FOP_IPC to all the * + * followers, where this particular fop's term and index * + * numbers will be journaled, and later used to rollback * + */ + call_frame_t *new_frame; + + new_frame = copy_frame (frame); + + if (new_frame) { + new_local = mem_get0(this->local_pool); + if (new_local) { + INIT_LIST_HEAD(&new_local->qlinks); + ret = dict_set_int32 (local->xdata, + "rollback-fop", + GF_FOP_@UPNAME@); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, + "failed to set rollback-fop"); + } else { + new_local->xdata = dict_ref(local->xdata); + new_frame->local = new_local; + jbr_ipc_call_dispatch (new_frame, + this, &op_errno, + FDL_IPC_JBR_SERVER_ROLLBACK, + new_local->xdata); + } + } else { + gf_log (this->name, GF_LOG_WARNING, + "Could not create local for new_frame"); + } + } else { + gf_log (this->name, GF_LOG_WARNING, + "Could not send rollback ipc"); + } + STACK_UNWIND_STRICT (@NAME@, frame, -1, EROFS, @ERROR_ARGS@); } else { @@ -348,12 +403,11 @@ jbr_@NAME@_complete (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, @LONG_ARGS@) { -#if defined(JBR_CG_QUEUE) int32_t ret = -1; -#endif gf_boolean_t result = _gf_false; jbr_private_t *priv = NULL; jbr_local_t *local = NULL; + jbr_local_t *new_local = NULL; GF_VALIDATE_OR_GOTO ("jbr", this, err); GF_VALIDATE_OR_GOTO (this->name, frame, err); @@ -404,6 +458,59 @@ jbr_@NAME@_complete (call_frame_t *frame, void *cookie, xlator_t *this, gf_msg (this->name, GF_LOG_ERROR, EROFS, J_MSG_QUORUM_NOT_MET, "Quorum is not met. " "The operation has failed."); + /* + * In this case, the quorum is not met after the * + * operation is performed on the leader. Hence a * + * rollback will be sent via GF_FOP_IPC to the leader * + * where this particular fop's term and index numbers * + * will be journaled, and later used to rollback. * + * The same will be done on all the followers * + */ + call_frame_t *new_frame; + + new_frame = copy_frame (frame); + if (new_frame) { + new_local = mem_get0(this->local_pool); + if (new_local) { + INIT_LIST_HEAD(&new_local->qlinks); + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, "op = %d", + new_frame->op); + ret = dict_set_int32 (local->xdata, + "rollback-fop", + GF_FOP_@UPNAME@); + if (ret) { + gf_msg (this->name, + GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, + "failed to set " + "rollback-fop"); + } else { + new_local->xdata = dict_ref (local->xdata); + new_frame->local = new_local; + /* + * Calling STACK_WIND instead * + * of jbr_ipc as it will not * + * unwind to the previous * + * translators like it will * + * in case of jbr_ipc. * + */ + STACK_WIND (new_frame, + jbr_ipc_complete, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, + FDL_IPC_JBR_SERVER_ROLLBACK, + new_local->xdata); + } + } else { + gf_log (this->name, GF_LOG_WARNING, + "Could not create local " + "for new_frame"); + } + } else { + gf_log (this->name, GF_LOG_WARNING, + "Could not send rollback ipc"); + } } else { #if defined(JBR_CG_NEED_FD) op_ret = local->successful_op_ret; @@ -416,6 +523,11 @@ jbr_@NAME@_complete (call_frame_t *frame, void *cookie, xlator_t *this, } } + /* + * Unrefing the reference taken in jbr_@NAME@ () * + */ + dict_unref (local->xdata); + STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, @SHORT_ARGS@); diff --git a/xlators/experimental/jbr-server/src/gen-fops.py b/xlators/experimental/jbr-server/src/gen-fops.py index 8a2b47c5345..c4a5556a8fa 100755 --- a/xlators/experimental/jbr-server/src/gen-fops.py +++ b/xlators/experimental/jbr-server/src/gen-fops.py @@ -101,6 +101,7 @@ fop_table = { "unlink": "write", "writev": "write,fsync,queue", "xattrop": "write", + "ipc": "write", } # Mention those fops in the selective_generate table, for which @@ -108,6 +109,7 @@ fop_table = { # functions. Rest of the functions can be customized selective_generate = { "lk": "fop,dispatch,call_dispatch", + "ipc": "dispatch,call_dispatch", } # Stolen from gen_fdl.py diff --git a/xlators/experimental/jbr-server/src/jbr-internal.h b/xlators/experimental/jbr-server/src/jbr-internal.h index ab1dfc16de2..46a29910d1f 100644 --- a/xlators/experimental/jbr-server/src/jbr-internal.h +++ b/xlators/experimental/jbr-server/src/jbr-internal.h @@ -86,6 +86,7 @@ typedef struct { uint32_t successful_op_ret; fd_t *fd; struct list_head qlinks; + dict_t *xdata; } jbr_local_t; /* diff --git a/xlators/experimental/jbr-server/src/jbr.c b/xlators/experimental/jbr-server/src/jbr.c index 926b5b3c742..c3f0344df00 100644 --- a/xlators/experimental/jbr-server/src/jbr.c +++ b/xlators/experimental/jbr-server/src/jbr.c @@ -24,6 +24,7 @@ #include "syncop.h" #include "syscall.h" #include "compat-errno.h" +#include "fdl.h" #include "jbr-internal.h" #include "jbr-messages.h" @@ -52,6 +53,15 @@ jbr_lk_dispatch (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata); +int32_t +jbr_ipc_call_dispatch (call_frame_t *frame, xlator_t *this, int *op_errno, + int32_t op, dict_t *xdata); + +int32_t +jbr_ipc_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xdata); + /* Used to check the quorum of acks received after the fop * confirming the status of the fop on all the brick processes * for this particular subvolume @@ -277,6 +287,7 @@ jbr_leader_checks_and_init (call_frame_t *frame, xlator_t *this, int *op_errno, local->fd = NULL; INIT_LIST_HEAD(&local->qlinks); + local->successful_acks = 0; frame->local = local; ret = 0; @@ -718,8 +729,6 @@ out: return ret; } -#pragma generate - uint8_t jbr_count_up_kids (jbr_private_t *priv) { @@ -1285,6 +1294,65 @@ err: STACK_UNWIND_STRICT (ipc, frame, -1, op_errno, NULL); } +int32_t +jbr_ipc_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +{ + jbr_local_t *local = NULL; + int32_t ret = -1; + uint8_t call_count; + + GF_VALIDATE_OR_GOTO ("jbr", this, out); + GF_VALIDATE_OR_GOTO (this->name, frame, out); + local = frame->local; + GF_VALIDATE_OR_GOTO (this->name, local, out); + + gf_msg_trace (this->name, 0, "op_ret = %d, op_errno = %d\n", + op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + UNLOCK(&frame->lock); + + if (call_count == 0) { +#if defined(JBR_CG_QUEUE) + ret = jbr_remove_from_queue (frame, this); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_GENERIC, "Failed to remove from queue."); + } +#endif + /* + * Unrefing the reference taken in continue() or complete() * + */ + dict_unref (local->xdata); + STACK_DESTROY (frame->root); + } + + ret = 0; +out: + return ret; +} + +int32_t +jbr_ipc_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + jbr_local_t *local = NULL; + + GF_VALIDATE_OR_GOTO ("jbr", this, out); + GF_VALIDATE_OR_GOTO (this->name, frame, out); + local = frame->local; + GF_VALIDATE_OR_GOTO (this->name, local, out); + + jbr_ipc_call_dispatch (frame, + this, &op_errno, + FDL_IPC_JBR_SERVER_ROLLBACK, + local->xdata); +out: + return 0; +} int32_t jbr_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) @@ -1299,6 +1367,13 @@ jbr_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) case JBR_SERVER_NEXT_ENTRY: jbr_next_entry(frame, this); break; + case FDL_IPC_JBR_SERVER_ROLLBACK: + /* + * Just send the fop down to fdl. Need not * + * dispatch it to other bricks in the sub- * + * volume, as it will be done where the op * + * has failed. * + */ default: STACK_WIND_TAIL (frame, FIRST_CHILD(this), @@ -1309,6 +1384,7 @@ jbr_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) return 0; } +#pragma generate int32_t jbr_forget (xlator_t *this, inode_t *inode) @@ -1556,7 +1632,6 @@ jbr_init (xlator_t *this) */ this->fops->getxattr = jbr_getxattr_special; this->fops->fsync = jbr_fsync; - this->fops->ipc = jbr_ipc; local = this->children; if (!local) { -- cgit