diff options
| author | Pranith Kumar K <pkarampu@redhat.com> | 2014-04-21 03:13:58 +0530 | 
|---|---|---|
| committer | Niels de Vos <ndevos@redhat.com> | 2014-05-10 13:30:48 -0700 | 
| commit | 46bf591eb87d743baacd30a85b99fd47db4fb055 (patch) | |
| tree | 20ba994c07283e4d91d45f68ef844614da3045e3 | |
| parent | ba328340e878c3156418bb3443c35a7db02a0f4b (diff) | |
cluster/afr: Fix bugs in quorum implementation
- Have common place to perform quorum fop wind check
- Check if fop succeeded in a way that matches quorum
  to avoid marking changelog in split-brain.
Change-Id: I663072ece0e1de6e1ee9fccb03e1b6c968793bc5
BUG: 1066996
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/7513
Reviewed-by: Ravishankar N <ravishankar@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Niels de Vos <ndevos@redhat.com>
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 51 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-dir-write.c | 16 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 30 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-open.c | 4 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 142 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 3 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 6 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 17 | 
8 files changed, 166 insertions, 103 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 2a54a79ed4a..a31cc2cb015 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4034,6 +4034,8 @@ afr_notify (xlator_t *this, int32_t event,          int             up_child            = AFR_ALL_CHILDREN;          dict_t          *input              = NULL;          dict_t          *output             = NULL; +        gf_boolean_t    had_quorum          = _gf_false; +        gf_boolean_t    has_quorum          = _gf_false;          priv = this->private; @@ -4071,6 +4073,8 @@ afr_notify (xlator_t *this, int32_t event,                  goto out;          } +        had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, +                                                           this);          switch (event) {          case GF_EVENT_CHILD_UP:                  LOCK (&priv->lock); @@ -4163,6 +4167,16 @@ afr_notify (xlator_t *this, int32_t event,                  break;          } +        if (priv->quorum_count) { +                has_quorum = afr_has_quorum (priv->child_up, this); +                if (!had_quorum && has_quorum) +                        gf_log (this->name, GF_LOG_INFO, "Client-quorum" +                                " is met"); +                if (had_quorum && !has_quorum) +                        gf_log (this->name, GF_LOG_WARNING, +                                "Client-quorum is not met"); +        } +          /* have all subvolumes reported status once by now? */          have_heard_from_all = 1;          for (i = 0; i < priv->child_count; i++) { @@ -4546,43 +4560,6 @@ out:          return ret;  } -gf_boolean_t -afr_have_quorum (char *logname, afr_private_t *priv) -{ -        unsigned int        quorum = 0; - -        GF_VALIDATE_OR_GOTO(logname,priv,out); - -        quorum = priv->quorum_count; -        if (quorum != AFR_QUORUM_AUTO) { -                return (priv->up_count >= (priv->down_count + quorum)); -        } - -        quorum = priv->child_count / 2 + 1; -        if (priv->up_count >= (priv->down_count + quorum)) { -                return _gf_true; -        } - -        /* -         * Special case for even numbers of nodes: if we have exactly half -         * and that includes the first ("senior-most") node, then that counts -         * as quorum even if it wouldn't otherwise.  This supports e.g. N=2 -         * while preserving the critical property that there can only be one -         * such group. -         */ -        if ((priv->child_count % 2) == 0) { -                quorum = priv->child_count / 2; -                if (priv->up_count >= (priv->down_count + quorum)) { -                        if (priv->child_up[0]) { -                                return _gf_true; -                        } -                } -        } - -out: -        return _gf_false; -} -  void  afr_priv_destroy (afr_private_t *priv)  { diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 1943b719bb5..3cdec64536d 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -432,8 +432,6 @@ afr_create (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(create,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -637,8 +635,6 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,          priv = this->private; -        QUORUM_CHECK(mknod,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -842,8 +838,6 @@ afr_mkdir (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(mkdir,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -1046,8 +1040,6 @@ afr_link (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(link,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -1249,8 +1241,6 @@ afr_symlink (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(symlink,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -1464,8 +1454,6 @@ afr_rename (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(rename,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -1685,8 +1673,6 @@ afr_unlink (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(unlink,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -1889,8 +1875,6 @@ afr_rmdir (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(rmdir,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index d62847defa3..5561e9c36d7 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -548,8 +548,6 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,                  goto out;          } -        QUORUM_CHECK(writev,out); -          AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);          local = frame->local; @@ -730,8 +728,6 @@ afr_truncate (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(truncate,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -745,6 +741,7 @@ afr_truncate (call_frame_t *frame, xlator_t *this,          if (ret < 0)                  goto out; +        local->op = GF_FOP_TRUNCATE;          local->cont.truncate.offset  = offset;          local->transaction.fop    = afr_truncate_wind; @@ -974,8 +971,6 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,                  op_errno = EIO;                  goto out;          } -        QUORUM_CHECK(ftruncate,out); -          AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);          local = frame->local; @@ -1153,8 +1148,6 @@ afr_setattr (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(setattr,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -1168,6 +1161,7 @@ afr_setattr (call_frame_t *frame, xlator_t *this,          if (ret < 0)                  goto out; +        local->op = GF_FOP_SETATTR;          local->cont.setattr.in_buf = *buf;          local->cont.setattr.valid  = valid; @@ -1351,8 +1345,6 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,                  goto out;          } -        QUORUM_CHECK(fsetattr,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -1366,6 +1358,7 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,          if (ret < 0)                  goto out; +        local->op = GF_FOP_FSETATTR;          local->cont.fsetattr.in_buf = *buf;          local->cont.fsetattr.valid  = valid; @@ -1540,7 +1533,6 @@ afr_setxattr (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(setxattr,out);          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -1554,6 +1546,7 @@ afr_setxattr (call_frame_t *frame, xlator_t *this,          if (ret < 0)                  goto out; +        local->op = GF_FOP_SETXATTR;          local->cont.setxattr.dict  = dict_ref (dict);          local->cont.setxattr.flags = flags; @@ -1730,8 +1723,6 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,                  goto out;          } -        QUORUM_CHECK(fsetxattr,out); -          AFR_LOCAL_ALLOC_OR_GOTO (local, out);          ret = afr_local_init (local, priv, &op_errno); @@ -1747,6 +1738,7 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,          local->op_ret = -1; +        local->op = GF_FOP_FSETXATTR;          local->cont.fsetxattr.dict  = dict_ref (dict);          local->cont.fsetxattr.flags = flags; @@ -1922,8 +1914,6 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,          priv = this->private; -        QUORUM_CHECK(removexattr,out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  op_errno = ENOMEM; @@ -1937,6 +1927,7 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,          if (ret < 0)                  goto out; +        local->op = GF_FOP_REMOVEXATTR;          local->cont.removexattr.name = gf_strdup (name);          local->transaction.fop    = afr_removexattr_wind; @@ -2111,8 +2102,6 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this,                  goto out;          } -        QUORUM_CHECK(fremovexattr, out); -          transaction_frame = copy_frame (frame);          if (!transaction_frame) {                  goto out; @@ -2130,6 +2119,7 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this,          local->op_ret = -1; +        local->op = GF_FOP_FREMOVEXATTR;          local->cont.removexattr.name = gf_strdup (name);          local->transaction.fop    = afr_fremovexattr_wind; @@ -2354,8 +2344,6 @@ afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,                  op_errno = EIO;                  goto out;          } -        QUORUM_CHECK(fallocate,out); -          AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);          local = frame->local; @@ -2581,8 +2569,6 @@ afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,                  op_errno = EIO;                  goto out;          } -        QUORUM_CHECK(discard, out); -          AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);          local = frame->local; @@ -2825,8 +2811,6 @@ afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,                  op_errno = EIO;                  goto out;          } -        QUORUM_CHECK(zerofill, out); -          AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);          local = frame->local; diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index 643a5d692df..9404b6e847b 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -203,10 +203,6 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,          priv = this->private; -        if (flags & (O_CREAT|O_TRUNC)) { -                QUORUM_CHECK(open,out); -        } -          if (afr_is_split_brain (this, loc->inode)) {                  /* self-heal failed */                  gf_log (this->name, GF_LOG_WARNING, diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 20306e46924..91af75b503b 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -640,6 +640,131 @@ afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this)                                      local->transaction.type);  } +gf_boolean_t +afr_has_quorum (unsigned char *subvols, xlator_t *this) +{ +        unsigned int  quorum_count = 0; +        afr_private_t *priv  = NULL; +        unsigned int  up_children_count = 0; + +        priv = this->private; +        up_children_count = afr_up_children_count (subvols, priv->child_count); + +        if (priv->quorum_count == AFR_QUORUM_AUTO) { +                quorum_count = priv->child_count/2 + 1; +        } else { +                quorum_count = priv->quorum_count; +        } + +        if (priv->quorum_count == AFR_QUORUM_AUTO) { +        /* +         * Special case for even numbers of nodes in auto-quorum: +         * if we have exactly half children up +         * and that includes the first ("senior-most") node, then that counts +         * as quorum even if it wouldn't otherwise.  This supports e.g. N=2 +         * while preserving the critical property that there can only be one +         * such group. +         */ +                if ((priv->child_count % 2 == 0) && +                    (up_children_count == (priv->child_count/2))) +                        return subvols[0]; +        } + +        if (up_children_count >= quorum_count) +                return _gf_true; + +        return _gf_false; +} + +static gf_boolean_t +afr_has_fop_quorum (call_frame_t *frame) +{ +        xlator_t        *this = frame->this; +        afr_local_t     *local = frame->local; +        unsigned char *locked_nodes = NULL; + +        locked_nodes = afr_locked_nodes_get (local->transaction.type, +                                             &local->internal_lock); +        return afr_has_quorum (locked_nodes, this); +} + +static gf_boolean_t +afr_has_fop_cbk_quorum (call_frame_t *frame) +{ +        afr_local_t   *local   = frame->local; +        xlator_t      *this    = frame->this; +        afr_private_t *priv    = this->private; +        unsigned char *success = alloca(priv->child_count); +        int           i        = 0; +        int           j        = 0; + +        j = afr_index_for_transaction_type (local->transaction.type); +        for (i = 0; i < priv->child_count; i++) { +                if (local->pending[i][j]) +                        success[i] = 1; +                else +                        success[i] = 0; +        } + +        return afr_has_quorum (success, this); +} + +void +afr_handle_quorum (call_frame_t *frame) +{ +        afr_local_t   *local = NULL; +        afr_private_t *priv  = NULL; +        int           i      = 0; +        uuid_t        gfid   = {0}; + +        local = frame->local; +        priv  = frame->this->private; + +        if (priv->quorum_count == 0) +                return; + +        /* +         * Network split may happen just after the fops are unwound, so check +         * if the fop succeeded in a way it still follows quorum. If it doesn't, +         * mark the fop as failure, mark the changelogs so it reflects that +         * failure. +         * +         * Scenario: +         * There are 3 mounts on 3 machines(node1, node2, node3) all writing to +         * single file. Network split happened in a way that node1 can't see +         * node2, node3. Node2, node3 both of them can't see node1. Now at the +         * time of sending write all the bricks are up. Just after write fop is +         * wound on node1, network split happens. Node1 thinks write fop failed +         * on node2, node3 so marks pending changelog for those 2 extended +         * attributes on node1. Node2, node3 thinks writes failed on node1 so +         * they mark pending changelog for node1. When the network is stable +         * again the file already is in split-brain. These checks prevent +         * marking pending changelog on other subvolumes if the fop doesn't +         * succeed in a way it is still following quorum. So with this fix what +         * is happening is, node1 will have all pending changelog(FOOL) because +         * the write succeeded only on node1 but failed on node2, node3 so +         * instead of marking pending changelogs on node2, node3 it just treats +         * the fop as failure and goes into FOOL state. Where as node2, node3 +         * say they are sources and have pending changelog to node1 so there is +         * no split-brain with the fix. The problem is eliminated completely. +         */ +        if (afr_has_fop_cbk_quorum (frame)) +                return; + +        if (local->fd) +                uuid_copy (gfid, local->fd->inode->gfid); +        else +                loc_gfid (&local->loc, gfid); + +        gf_log (frame->this->name, GF_LOG_WARNING, "%s: Failing %s as quorum " +                "is not met", uuid_utoa (gfid), gf_fop_list[local->op]); +        for (i = 0; i < priv->child_count; i++) +                __mark_child_dead (local->pending, priv->child_count, i, +                                   local->transaction.type); +        local->op_ret = -1; +        local->op_errno = EROFS; +} +  int  afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)  { @@ -663,6 +788,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)          afr_data_handle_quota_errors (frame, this);          afr_dir_fop_handle_all_fop_failures (frame); +        afr_handle_quorum (frame);          if (local->fd)                  afr_transaction_rm_stale_children (frame, this, @@ -1320,12 +1446,24 @@ afr_lock (call_frame_t *frame, xlator_t *this)  int  afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)  { +        afr_local_t     *local = frame->local; +        afr_private_t   *priv  = this->private; + +        if (priv->quorum_count && !afr_has_fop_quorum (frame)) { +                local->op_ret = -1; +                local->op_errno = EROFS; +                local->internal_lock.lock_cbk = local->transaction.done; +                afr_unlock (frame, this); +                goto out; +        } +          if (__fop_changelog_needed (frame, this)) {                  afr_changelog_pre_op (frame, this);          } else {                  afr_transaction_perform_fop (frame, this);          } +out:          return 0;  } @@ -1833,9 +1971,7 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,                          child_index, local->transaction.type);  } - - -        static gf_boolean_t +static gf_boolean_t  afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)  {          uint64_t start1 = local1->transaction.start; diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index fa626fd0d6e..bcc40db99d3 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -48,4 +48,7 @@ afr_any_fops_failed (afr_local_t *local, afr_private_t *priv);  gf_boolean_t  afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); + +gf_boolean_t +afr_has_quorum (unsigned char *subvols, xlator_t *this);  #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index c724eb2ae42..c020cbccac5 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -497,9 +497,6 @@ struct xlator_fops fops = {          .finodelk    = afr_finodelk,          .entrylk     = afr_entrylk,          .fentrylk    = afr_fentrylk, -	.fallocate   = afr_fallocate, -	.discard     = afr_discard, -        .zerofill    = afr_zerofill,          /* inode read */          .access      = afr_access, @@ -520,6 +517,9 @@ struct xlator_fops fops = {          .fsetattr    = afr_fsetattr,          .removexattr = afr_removexattr,          .fremovexattr = afr_fremovexattr, +        .fallocate   = afr_fallocate, +        .discard     = afr_discard, +        .zerofill    = afr_zerofill,          /* dir read */          .opendir     = afr_opendir, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index e86300d0a22..59d08a1d0b7 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1136,9 +1136,6 @@ int  afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,                        int flags); -gf_boolean_t -afr_have_quorum (char *logname, afr_private_t *priv); -  void  afr_matrix_cleanup (int32_t **pending, unsigned int m); @@ -1167,20 +1164,6 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);   */  #define AFR_QUORUM_AUTO INT_MAX -/* - * Having this as a macro will make debugging a bit weirder, but does reduce - * the probability of functions handling this check inconsistently. - */ -#define QUORUM_CHECK(_func,_label) do {                                  \ -        if (priv->quorum_count && !afr_have_quorum(this->name,priv)) { \ -                gf_log(this->name,GF_LOG_WARNING,                        \ -                       "failing "#_func" due to lack of quorum");        \ -                op_errno = EROFS;                                        \ -                goto _label;                                             \ -        }                                                                \ -} while (0); - -  #define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO."  #define AFR_SBRAIN_CHECK_FD(fd, label) do {                              \  | 
