diff options
Diffstat (limited to 'xlators/experimental/jbr-server/src/all-templates.c')
-rw-r--r-- | xlators/experimental/jbr-server/src/all-templates.c | 437 |
1 files changed, 437 insertions, 0 deletions
diff --git a/xlators/experimental/jbr-server/src/all-templates.c b/xlators/experimental/jbr-server/src/all-templates.c new file mode 100644 index 00000000000..9b9a3e0be5e --- /dev/null +++ b/xlators/experimental/jbr-server/src/all-templates.c @@ -0,0 +1,437 @@ +/* + * You can put anything here - it doesn't even have to be a comment - and it + * will be ignored until we reach the first template-name comment. + */ + + +/* template-name read-fop */ +int32_t +jbr_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + jbr_private_t *priv = this->private; + gf_boolean_t in_recon = _gf_false; + int32_t recon_term, recon_index; + + /* allow reads during reconciliation * + * TBD: allow "dirty" reads on non-leaders * + */ + if (xdata && + (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) && + (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) { + in_recon = _gf_true; + } + + if ((!priv->leader) && (in_recon == _gf_false)) { + goto err; + } + + STACK_WIND (frame, default_@NAME@_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + return 0; + +err: + STACK_UNWIND_STRICT (@NAME@, frame, -1, EREMOTE, + @ERROR_ARGS@); + return 0; +} + +/* template-name read-dispatch */ +/* No "dispatch" function needed for @NAME@ */ + +/* template-name read-fan-in */ +/* No "fan-in" function needed for @NAME@ */ + +/* template-name read-continue */ +/* No "continue" function needed for @NAME@ */ + +/* template-name read-complete */ +/* No "complete" function needed for @NAME@ */ + +/* template-name write-fop */ +int32_t +jbr_@NAME@ (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + jbr_local_t *local = NULL; + jbr_private_t *priv = this->private; + gf_boolean_t result = _gf_false; + int op_errno = ENOMEM; + int from_leader; + int from_recon; + uint32_t ti = 0; + + /* + * Our first goal here is to avoid "split brain surprise" for users who + * specify exactly 50% with two- or three-way replication. That means + * either a more-than check against half the total replicas or an + * at-least check against half of our peers (one less). Of the two, + * only an at-least check supports the intuitive use of 100% to mean + * all replicas must be present, because "more than 100%" will never + * succeed regardless of which count we use. This leaves us with a + * slightly non-traditional definition of quorum ("at least X% of peers + * not including ourselves") but one that's useful enough to be worth + * it. + * + * Note that n_children and up_children *do* include the local + * subvolume, so we need to subtract one in each case. + */ + if (priv->leader) { + result = fop_quorum_check (this, (double)(priv->n_children - 1), + (double)(priv->up_children - 1)); + + if (result == _gf_false) { + /* Emulate the AFR client-side-quorum behavior. */ + gf_msg (this->name, GF_LOG_ERROR, EROFS, + J_MSG_QUORUM_NOT_MET, "Sufficient number of " + "subvolumes are not up to meet quorum."); + op_errno = EROFS; + goto err; + } + } else { + if (xdata) { + from_leader = !!dict_get(xdata, JBR_TERM_XATTR); + from_recon = !!dict_get(xdata, RECON_TERM_XATTR) + && !!dict_get(xdata, RECON_INDEX_XATTR); + } else { + from_leader = from_recon = _gf_false; + } + + /* follower/recon path * + * just send it to local node * + */ + if (!from_leader && !from_recon) { + op_errno = EREMOTE; + goto err; + } + } + + local = mem_get0(this->local_pool); + if (!local) { + goto err; + } +#if defined(JBR_CG_NEED_FD) + local->fd = fd_ref(fd); +#else + local->fd = NULL; +#endif + INIT_LIST_HEAD(&local->qlinks); + frame->local = local; + + /* + * If we let it through despite not being the leader, then we just want + * to pass it on down without all of the additional xattrs, queuing, and + * so on. However, jbr_*_complete does depend on the initialization + * immediately above this. + */ + if (!priv->leader) { + STACK_WIND (frame, jbr_@NAME@_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + return 0; + } + + if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_msg (this->name, GF_LOG_ERROR, ENOMEM, + J_MSG_MEM_ERR, "failed to allocate xdata"); + goto err; + } + } + + if (dict_set_int32(xdata, JBR_TERM_XATTR, priv->current_term) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, "failed to set jbr-term"); + goto err; + } + + LOCK(&priv->index_lock); + ti = ++(priv->index); + UNLOCK(&priv->index_lock); + if (dict_set_int32(xdata, JBR_INDEX_XATTR, ti) != 0) { + gf_msg (this->name, GF_LOG_ERROR, 0, + J_MSG_DICT_FLR, "failed to set index"); + goto err; + } + + local->stub = fop_@NAME@_stub (frame, jbr_@NAME@_continue, + @SHORT_ARGS@); + if (!local->stub) { + goto err; + } + + +#if defined(JBR_CG_QUEUE) + jbr_inode_ctx_t *ictx = jbr_get_inode_ctx(this, fd->inode); + + if (!ictx) { + op_errno = EIO; + goto err; + } + LOCK(&ictx->lock); + if (ictx->active) { + gf_msg_debug (this->name, 0, + "queuing request due to conflict"); + /* + * TBD: enqueue only for real conflict + * + * Currently we just act like all writes are in + * conflict with one another. What we should really do + * is check the active/pending queues and defer only if + * there's a conflict there. + * + * It's important to check the pending queue because we + * might have an active request X which conflicts with + * a pending request Y, and this request Z might + * conflict with Y but not X. If we checked only the + * active queue then Z could jump ahead of Y, which + * would be incorrect. + */ + local->qstub = fop_@NAME@_stub (frame, + jbr_@NAME@_dispatch, + @SHORT_ARGS@); + if (!local->qstub) { + UNLOCK(&ictx->lock); + goto err; + } + list_add_tail(&local->qlinks, &ictx->pqueue); + ++(ictx->pending); + UNLOCK(&ictx->lock); + return 0; + } else { + list_add_tail(&local->qlinks, &ictx->aqueue); + ++(ictx->active); + } + UNLOCK(&ictx->lock); +#endif + + return jbr_@NAME@_dispatch (frame, this, @SHORT_ARGS@); + +err: + if (local) { + if (local->stub) { + call_stub_destroy(local->stub); + } + if (local->qstub) { + call_stub_destroy(local->qstub); + } + if (local->fd) { + fd_unref(local->fd); + } + mem_put(local); + } + STACK_UNWIND_STRICT (@NAME@, frame, -1, op_errno, + @ERROR_ARGS@); + return 0; +} + +/* template-name write-dispatch */ +int32_t +jbr_@NAME@_dispatch (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + jbr_local_t *local = frame->local; + jbr_private_t *priv = this->private; + xlator_list_t *trav; + + /* + * TBD: unblock pending request(s) if we fail after this point but + * before we get to jbr_@NAME@_complete (where that code currently + * resides). + */ + + local->call_count = priv->n_children - 1; + local->successful_acks = 0; + for (trav = this->children->next; trav; trav = trav->next) { + STACK_WIND (frame, jbr_@NAME@_fan_in, + trav->xlator, trav->xlator->fops->@NAME@, + @SHORT_ARGS@); + } + + /* TBD: variable Issue count */ + return 0; +} + +/* template-name write-fan-in */ +int32_t +jbr_@NAME@_fan_in (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + @LONG_ARGS@) +{ + jbr_local_t *local = frame->local; + uint8_t call_count; + + gf_msg_trace (this->name, 0, "op_ret = %d, op_errno = %d\n", + op_ret, op_errno); + + LOCK(&frame->lock); + call_count = --(local->call_count); + if (op_ret != -1) { + /* Increment the number of successful acks * + * received for the operation. * + */ + (local->successful_acks)++; + local->successful_op_ret = op_ret; + } + gf_msg_debug (this->name, 0, "succ_acks = %d, op_ret = %d, op_errno = %d\n", + op_ret, op_errno, local->successful_acks); + UNLOCK(&frame->lock); + + /* TBD: variable Completion count */ + if (call_count == 0) { + call_resume(local->stub); + } + + return 0; +} + +/* template-name write-continue */ +int32_t +jbr_@NAME@_continue (call_frame_t *frame, xlator_t *this, + @LONG_ARGS@) +{ + int32_t ret = -1; + gf_boolean_t result = _gf_false; + jbr_local_t *local = NULL; + jbr_private_t *priv = NULL; + + GF_VALIDATE_OR_GOTO ("jbr", this, out); + GF_VALIDATE_OR_GOTO (this->name, frame, out); + priv = this->private; + local = frame->local; + GF_VALIDATE_OR_GOTO (this->name, priv, out); + GF_VALIDATE_OR_GOTO (this->name, local, out); + + /* Perform quorum check to see if the leader needs * + * to perform the operation. If the operation will not * + * meet quorum irrespective of the leader's result * + * there is no point in the leader performing the fop * + */ + result = fop_quorum_check (this, (double)priv->n_children, + (double)local->successful_acks + 1); + if (result == _gf_false) { + gf_msg (this->name, GF_LOG_ERROR, EROFS, + J_MSG_QUORUM_NOT_MET, "Didn't receive enough acks " + "to meet quorum. Failing the operation without trying " + "it on the leader."); + STACK_UNWIND_STRICT (@NAME@, frame, -1, EROFS, + @ERROR_ARGS@); + } else { + STACK_WIND (frame, jbr_@NAME@_complete, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@, + @SHORT_ARGS@); + } + + ret = 0; +out: + return ret; +} + +/* template-name write-complete */ +int32_t +jbr_@NAME@_complete (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + @LONG_ARGS@) +{ + gf_boolean_t result = _gf_false; + jbr_private_t *priv = this->private; + + jbr_local_t *local = frame->local; + + /* If the fop failed on the leader, then reduce one succesful ack + * before calculating the fop quorum + */ + LOCK(&frame->lock); + if (op_ret == -1) + (local->successful_acks)--; + UNLOCK(&frame->lock); + +#if defined(JBR_CG_QUEUE) + jbr_inode_ctx_t *ictx; + jbr_local_t *next; + + if (local->qlinks.next != &local->qlinks) { + list_del(&local->qlinks); + ictx = jbr_get_inode_ctx(this, local->fd->inode); + if (ictx) { + LOCK(&ictx->lock); + if (ictx->pending) { + /* + * TBD: dequeue *all* non-conflicting + * reqs + * + * With the stub implementation there + * can only be one request active at a + * time (zero here) so it's not an + * issue. In a real implementation + * there might still be other active + * requests to check against, and + * multiple pending requests that could + * continue. + */ + gf_msg_debug (this->name, 0, + "unblocking next request"); + --(ictx->pending); + next = list_entry (ictx->pqueue.next, + jbr_local_t, qlinks); + list_del(&next->qlinks); + list_add_tail(&next->qlinks, + &ictx->aqueue); + call_resume(next->qstub); + } else { + --(ictx->active); + } + UNLOCK(&ictx->lock); + } + } +#endif + +#if defined(JBR_CG_FSYNC) + jbr_mark_fd_dirty(this, local); +#endif + +#if defined(JBR_CG_NEED_FD) + fd_unref(local->fd); +#endif + + /* After the leader completes the fop, a quorum check is * + * performed, taking into account the outcome of the fop * + * on the leader. Irrespective of the fop being successful * + * or failing on the leader, the result of the quorum will * + * determine if the overall fop is successful or not. For * + * example, a fop might have succeeded on every node except * + * the leader, in which case as quorum is being met, the fop * + * will be treated as a successful fop, even though it failed * + * on the leader. On follower nodes, no quorum check should * + * be done, and the result is returned to the leader as is. * + */ + if (priv->leader) { + result = fop_quorum_check (this, (double)priv->n_children, + (double)local->successful_acks + 1); + if (result == _gf_false) { + op_ret = -1; + op_errno = EROFS; + gf_msg (this->name, GF_LOG_ERROR, EROFS, + J_MSG_QUORUM_NOT_MET, "Quorum is not met. " + "The operation has failed."); + } else { +#if defined(JBR_CG_NEED_FD) + op_ret = local->successful_op_ret; +#else + op_ret = 0; +#endif + op_errno = 0; + gf_msg_debug (this->name, 0, + "Quorum has met. The operation has succeeded."); + } + } + + STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno, + @SHORT_ARGS@); + + + return 0; + +} |