diff options
| -rw-r--r-- | doc/features/heal-info-and-split-brain-resolution.md | 11 | ||||
| -rw-r--r-- | libglusterfs/src/glusterfs.h | 1 | ||||
| -rw-r--r-- | tests/basic/afr/split-brain-resolution.t | 1 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 198 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 140 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-mem-types.h | 1 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 20 | 
8 files changed, 316 insertions, 58 deletions
diff --git a/doc/features/heal-info-and-split-brain-resolution.md b/doc/features/heal-info-and-split-brain-resolution.md index 6ca2be2f02f..7a6691db14e 100644 --- a/doc/features/heal-info-and-split-brain-resolution.md +++ b/doc/features/heal-info-and-split-brain-resolution.md @@ -426,6 +426,15 @@ Now performing cat operation on the file will again result in input/output error  cat: file1: Input/output error  ~~~ +Every time replica.split-brain-choice is set, the user can access the file for a limited timeout period. This timeout is configurable by the user, with a default value of 5 minutes. +### To set split-brain-choice timeout +A setfattr command from the mount allows the user to set this timeout, which is specified in minutes. +~~~ +# setfattr -n replica.split-brain-choice-timeout -v <timeout-in-minutes> <mount_point/file> +~~~ +This is a global timeout, i.e. it applies to all files for as long as the mount exists. So the timeout need not be set each time a file needs to be inspected, but for a new mount it has to be set again. This option also needs to be set again every time there is a client graph switch (_See note #3_).  + +### Resolving the split-brain  Once the choice for resolving split-brain is made, source brick is supposed to be set for the healing to be done.  This is done using the following command: @@ -446,3 +455,5 @@ NOTE:  ~~~  2) The above mentioned process for split-brain resolution from mount will not work on nfs mounts as it doesn't provide xattrs support. + +3) A client graph switch occurs when there is a change in the client-side translator graph, typically when new translators are added to the graph on the client side or during add-brick/remove-brick operations. 
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index cfa5b75bd04..14722ce5ec5 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -167,6 +167,7 @@  #define GF_AFR_HEAL_SBRAIN "glusterfs.heal-sbrain"  #define GF_AFR_SBRAIN_STATUS "replica.split-brain-status"  #define GF_AFR_SBRAIN_CHOICE "replica.split-brain-choice" +#define GF_AFR_SPB_CHOICE_TIMEOUT "replica.split-brain-choice-timeout"  #define GF_AFR_SBRAIN_RESOLVE "replica.split-brain-heal-finalize"  #define GF_GFIDLESS_LOOKUP "gfidless-lookup" diff --git a/tests/basic/afr/split-brain-resolution.t b/tests/basic/afr/split-brain-resolution.t index 03e51cf92b1..fa1342e2cd5 100644 --- a/tests/basic/afr/split-brain-resolution.t +++ b/tests/basic/afr/split-brain-resolution.t @@ -50,6 +50,7 @@ TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/data-split-brain  #Should now be able to read the contents of data-split-brain.txt  EXPECT "brick0_alive" cat $M0/data-split-brain.txt +TEST setfattr -n replica.split-brain-choice-timeout -v 10 $M0/  TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/data-split-brain.txt  #Should now be able to read the contents of data-split-brain.txt diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 8fbca0b6f42..46f726da734 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -413,6 +413,142 @@ out:  	return ret;  } +int +afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode) +{ +        afr_inode_ctx_t *ctx    = NULL; +        int              ret    = -1; + +        if (!inode) +                return ret; + +        LOCK(&inode->lock); +        { +                __afr_inode_ctx_get (this, inode, &ctx); +                if (!ctx) { +                        gf_log (this->name, GF_LOG_WARNING, "Failed to cancel" +                                " split-brain choice timer."); +                        goto out; +            
    } +                ctx->spb_choice = -1; +                if (ctx->timer) { +                        gf_timer_call_cancel (this->ctx, ctx->timer); +                        ctx->timer = NULL; +                } +                ret = 0; +        } +out: +        UNLOCK(&inode->lock); +        return ret; +} + +void +afr_set_split_brain_choice_cbk (void *data) +{ +        inode_t      *inode     = data; +        xlator_t     *this      = THIS; + +        afr_spb_choice_timeout_cancel (this, inode); +        inode_unref (inode); +        return; +} + + +int +afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque) +{ +        int     op_errno         = ENOMEM; +        afr_private_t *priv      = NULL; +        afr_inode_ctx_t *ctx     = NULL; +        inode_t *inode           = NULL; +        loc_t   *loc             = NULL; +        xlator_t *this           = NULL; +        afr_spbc_timeout_t *data = opaque; +        struct timespec delta    = {0, }; + +        if (ret) +                goto out; + +        frame = data->frame; +        loc = data->loc; +        this = frame->this; +        priv = this->private; + +        delta.tv_sec = priv->spb_choice_timeout; +        delta.tv_nsec = 0; + +        inode = loc->inode; +        if (!inode) +                goto out; + +        if (!(data->d_spb || data->m_spb)) { +                gf_log (this->name, GF_LOG_WARNING, "Cannot set " +                        "replica.split-brain-choice on %s. 
File is" +                        " not in data/metadata split-brain.", +                        uuid_utoa (loc->gfid)); +                ret = -1; +                op_errno = EINVAL; +                goto out; +        } + +        LOCK(&inode->lock); +        { +                ret = __afr_inode_ctx_get (this, inode, &ctx); +                if (ret) { +                        gf_log (this->name, GF_LOG_ERROR, "Failed to get" +                                "inode_ctx for %s", loc->name); +                        goto unlock; +                } + +                ctx->spb_choice = data->spb_child_index; + +                /* Possible changes in spb-choice : +                 *         -1 to valid    : ref and inject timer +                 * +                 *         valid to valid : cancel timer and inject new one +                 * +                 *         valid to -1    : cancel timer and unref +                 * +                 *         -1    to -1    : do not do anything +                 */ + +                /* ctx->timer is NULL iff previous value of +                 * ctx->spb_choice is -1 +                 */ +                if (ctx->timer) { +                        if (ctx->spb_choice == -1) { +                                gf_timer_call_cancel (this->ctx, ctx->timer); +                                ctx->timer = NULL; +                                inode_unref (inode); +                                goto unlock; +                        } +                        goto reset_timer; +                } else { +                        if (ctx->spb_choice == -1) +                                goto unlock; +                } + +                inode = inode_ref (loc->inode); +                goto set_timer; + +reset_timer: +                gf_timer_call_cancel (this->ctx, ctx->timer); +                ctx->timer = NULL; + +set_timer: +                ctx->timer = gf_timer_call_after (this->ctx, delta, +                                  
                afr_set_split_brain_choice_cbk, +                                                  inode); +        } +unlock: +        UNLOCK(&inode->lock); +        inode_invalidate (inode); +out: +        if (data) +                GF_FREE (data); +        AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); +        return 0; +}  int  afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, @@ -3589,6 +3725,7 @@ afr_forget (xlator_t *this, inode_t *inode)          uint64_t        ctx_int = 0;          afr_inode_ctx_t *ctx    = NULL; +        afr_spb_choice_timeout_cancel (this, inode);          inode_ctx_del (inode, this, &ctx_int);          if (!ctx_int)                  return 0; @@ -4552,10 +4689,10 @@ out:  }  int -afr_set_split_brain_status (call_frame_t *frame, xlator_t *this, -                            struct afr_reply *replies, -                            afr_transaction_type type, -                            gf_boolean_t *spb) +_afr_is_split_brain (call_frame_t *frame, xlator_t *this, +                         struct afr_reply *replies, +                         afr_transaction_type type, +                         gf_boolean_t *spb)  {          afr_private_t    *priv              = NULL;          uint64_t         *witness           = NULL; @@ -4584,6 +4721,37 @@ afr_set_split_brain_status (call_frame_t *frame, xlator_t *this,  }  int +afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, +                    uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb) +{ +        int    ret                          = -1; +        afr_private_t    *priv              = NULL; +        struct afr_reply *replies           = NULL; + +        priv = this->private; + +        replies = alloca0 (sizeof (*replies) * priv->child_count); + +        ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); +        if (ret) +                goto out; + +        ret = _afr_is_split_brain (frame, this, replies, +         
                           AFR_DATA_TRANSACTION, d_spb); +        if (ret) +                goto out; + +        ret = _afr_is_split_brain (frame, this, replies, +                                    AFR_METADATA_TRANSACTION, m_spb); +out: +        if (replies) { +                afr_replies_wipe (replies, priv->child_count); +                replies = NULL; +        } +        return ret; +} + +int  afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)  {          gf_boolean_t      d_spb             = _gf_false; @@ -4594,7 +4762,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)          char             *choices           = NULL;          char             *status            = NULL;          dict_t           *dict              = NULL; -        struct afr_reply *replies           = NULL;          inode_t          *inode             = NULL;          afr_private_t    *priv              = NULL;          xlator_t         **children         = NULL; @@ -4605,7 +4772,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)          inode = afr_inode_find (this, loc->gfid);          if (!inode)                  goto out; -        replies = alloca0 (sizeof (*replies) * priv->child_count);          /* Calculation for string length :          * (child_count X length of child-name) + strlen ("    Choices :") @@ -4615,23 +4781,9 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)          */          choices = alloca0 (priv->child_count * (256 + strlen ("-client-00,")) +                             strlen ("    Choices:")); -        ret = afr_selfheal_unlocked_discover (frame, inode, loc->gfid, replies); -        if (ret) { -                op_errno = -ret; -                ret = -1; -                goto out; -        } - -        ret = afr_set_split_brain_status (frame, this, replies, -                                          AFR_DATA_TRANSACTION, &d_spb); -        if 
(ret) { -                op_errno = -ret; -                ret = -1; -                goto out; -        } -        ret = afr_set_split_brain_status (frame, this, replies, -                                          AFR_METADATA_TRANSACTION, &m_spb); +        ret = afr_is_split_brain (frame, this, inode, loc->gfid, &d_spb, +                                  &m_spb);          if (ret) {                  op_errno = -ret;                  ret = -1; @@ -4678,8 +4830,6 @@ out:          AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);          if (dict)                 dict_unref (dict); -        if (replies) -                afr_replies_wipe (replies, priv->child_count);          if (inode)                  inode_unref (inode);          return ret; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index f9fde44e9e4..3db4010e997 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -979,12 +979,7 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,          int     ret                       = -1;          int     op_errno                  = EINVAL; -        local = AFR_FRAME_INIT (frame, op_errno); -        if (!local) -                goto out; - -        local->op = GF_FOP_SETXATTR; - +        local = frame->local;          local->xdata_req = dict_new ();          if (!local->xdata_req) { @@ -1005,7 +1000,21 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,                  ret = -1;                  goto out;          } +        /* set spb choice to -1 whether heal succeeds or not: +         * If heal succeeds : spb-choice should be set to -1 as +         *                    it is no longer valid; file is not +         *                    in split-brain anymore. 
+         * If heal doesn't succeed: +         *                    spb-choice should be set to -1 +         *                    otherwise reads will be served +         *                    from spb-choice which is misleading. +         */ +        ret = afr_inode_split_brain_choice_set (loc->inode, this, -1); +        if (ret) +                gf_log (this->name, GF_LOG_WARNING, "Failed to set" +                        "split-brain choice to -1");          afr_heal_splitbrain_file (frame, this, loc); +        ret = 0;  out:          if (ret < 0)                  AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); @@ -1013,28 +1022,6 @@ out:  }  int -afr_set_split_brain_choice (call_frame_t *frame, xlator_t *this, loc_t *loc, -                            int spb_choice) -{ -        int     ret       = -1; -        int     op_errno  = ENOMEM; -        afr_private_t *priv = NULL; - -        priv = this->private; - -        ret = afr_inode_split_brain_choice_set (loc->inode, this, spb_choice); -        if (ret) { -                gf_log (this->name, GF_LOG_ERROR, "Failed to set" -                        "split-brain choice as %s for %s", -                        priv->children[spb_choice]->name, -                        loc->name); -        } -        inode_invalidate (loc->inode); -        AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); -        return ret; -} - -int  afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len)  {          int             spb_child_index   = -1; @@ -1056,18 +1043,52 @@ afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len)  }  int +afr_can_set_split_brain_choice (void *opaque) +{ +        afr_spbc_timeout_t        *data         = opaque; +        call_frame_t              *frame        = NULL; +        xlator_t                  *this         = NULL; +        loc_t                     *loc          = NULL; +        int                        ret          = -1; + +        frame = 
data->frame; +        loc = data->loc; +        this = frame->this; + +        ret = afr_is_split_brain (frame, this, loc->inode, loc->gfid, +                                  &data->d_spb, &data->m_spb); + +        if (ret) +                gf_log (this->name, GF_LOG_ERROR, "Failed to determine if %s" +                        " is in split-brain. " +                        "Aborting split-brain-choice set.", +                        uuid_utoa (loc->gfid)); +        return ret; +} + +int  afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame,                                  loc_t *loc, dict_t *dict)  { -        int             len               = 0;          void           *value             = NULL; +        afr_private_t  *priv              = NULL; +        afr_local_t    *local             = NULL; +        afr_spbc_timeout_t *data          = NULL; +        int             len               = 0;          int             spb_child_index   = -1;          int             ret               = -1;          int             op_errno          = EINVAL; -        afr_private_t  *priv              = NULL;          priv = this->private; +        local = AFR_FRAME_INIT (frame, op_errno); +        if (!local) { +                ret = 1; +                goto out; +        } + +        local->op = GF_FOP_SETXATTR; +          ret =  dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value,                                       &len);          if (value) { @@ -1079,12 +1100,29 @@ afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame,                                  spb_child_index = -1;                          else {                                  ret = 1; +                                op_errno = EINVAL;                                  goto out;                          }                  } -                afr_set_split_brain_choice (frame, this, loc, -                                            spb_child_index); +                data = 
GF_CALLOC (1, sizeof (*data), gf_afr_mt_spbc_timeout_t); +                if (!data) { +                        ret = 1; +                        goto out; +                } +                data->spb_child_index = spb_child_index; +                data->frame = frame; +                data->loc = loc; +                ret = synctask_new (this->ctx->env, +                                    afr_can_set_split_brain_choice, +                                    afr_set_split_brain_choice, NULL, data); +                if (ret) { +                        gf_log (this->name, GF_LOG_ERROR, "Failed to create" +                                " synctask. Aborting split-brain choice set" +                                " for %s", loc->name); +                        ret = 1; +                        goto out; +                }                  ret = 0;                  goto out;          } @@ -1112,6 +1150,41 @@ out:  }  int +afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame, +                               dict_t *dict) +{ +        int             ret               = -1; +        int             op_errno          = 0; +        uint64_t        timeout           = 0; +        afr_private_t  *priv              = NULL; + +        priv = this->private; + +        ret = dict_get_uint64 (dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout); +        if (!ret) { +                priv->spb_choice_timeout = timeout * 60; +                AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); +        } + +        return ret; +} + +static int +afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc, +                          dict_t *dict) +{ +        int     ret     = -1; + +        ret = afr_handle_split_brain_commands (this, frame, loc, dict); +        if (ret == 0) +                goto out; + +        ret = afr_handle_spb_choice_timeout (this, frame, dict); +out: +        return ret; +} + +int  afr_setxattr (call_frame_t *frame, xlator_t *this, 
loc_t *loc, dict_t *dict,  	      int32_t flags, dict_t *xdata)  { @@ -1126,8 +1199,7 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,          GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,                                     op_errno, out); -        ret = afr_handle_split_brain_commands (this, frame, loc, dict); - +        ret = afr_handle_special_xattr (this, frame, loc, dict);          if (ret == 0)                  return 0; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 05df90cc0ee..a11063c1f25 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -43,6 +43,7 @@ enum gf_afr_mem_types_ {          gf_afr_mt_pos_data_t,  	gf_afr_mt_reply_t,  	gf_afr_mt_subvol_healer_t, +	gf_afr_mt_spbc_timeout_t,          gf_afr_mt_end  };  #endif diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 21575fed2de..26efe93de99 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -276,6 +276,8 @@ init (xlator_t *this)          GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out); +        priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT; +  	GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out);  	GF_OPTION_INIT ("metadata-splitbrain-forced-heal", diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 6cb708ffbd7..855d3a3680e 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -38,6 +38,7 @@  #define AFR_LOCKEE_COUNT_MAX    3  #define AFR_DOM_COUNT_MAX    3  #define AFR_NUM_CHANGE_LOGS            3 /*data + metadata + entry*/ +#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/  #define ARBITER_BRICK_INDEX 2 @@ -130,6 +131,7 @@ typedef struct _afr_private {  	void                   *pump_private;  	gf_boolean_t           use_afr_in_pump;          gf_boolean_t           consistent_metadata; +  
      uint64_t               spb_choice_timeout;  } afr_private_t; @@ -742,8 +744,17 @@ typedef struct _afr_local {  typedef struct _afr_inode_ctx {          uint64_t        read_subvol;          int             spb_choice; +        gf_timer_t      *timer;  } afr_inode_ctx_t; +typedef struct afr_spbc_timeout { +        call_frame_t *frame; +        gf_boolean_t d_spb; +        gf_boolean_t m_spb; +        loc_t        *loc; +        int          spb_child_index; +} afr_spbc_timeout_t; +  /* did a call fail due to a child failing? */  #define child_went_down(op_ret, op_errno) (((op_ret) < 0) &&            \                                             ((op_errno == ENOTCONN) ||   \ @@ -1046,4 +1057,13 @@ afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,                                    int *spb_choice);  int  afr_get_child_index_from_name (xlator_t *this, char *name); + +int +afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, +                    uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb); +int +afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode); + +int +afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque);  #endif /* __AFR_H__ */  | 
