diff options
-rw-r--r-- | doc/features/heal-info-and-split-brain-resolution.md | 11 | ||||
-rw-r--r-- | libglusterfs/src/glusterfs.h | 1 | ||||
-rw-r--r-- | tests/basic/afr/split-brain-resolution.t | 1 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 198 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 140 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-mem-types.h | 1 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.c | 2 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 20 |
8 files changed, 316 insertions, 58 deletions
diff --git a/doc/features/heal-info-and-split-brain-resolution.md b/doc/features/heal-info-and-split-brain-resolution.md index 6ca2be2f02f..7a6691db14e 100644 --- a/doc/features/heal-info-and-split-brain-resolution.md +++ b/doc/features/heal-info-and-split-brain-resolution.md @@ -426,6 +426,15 @@ Now performing cat operation on the file will again result in input/output error cat: file1: Input/output error ~~~ +Every time replica.split-brain-choice is set, the user can access the file for a limited timeout period. This timeout is configurable by the user, with a default value of 5 minutes. +### To set split-brain-choice timeout +A setfattr command from the mount allows the user to set this timeout, which is specified in minutes. +~~~ +# setfattr -n replica.split-brain-choice-timeout -v <timeout-in-minutes> <mount_point/file> +~~~ +This is a global timeout, i.e. it applies to all files for as long as the mount exists. The timeout therefore need not be set each time a file needs to be inspected, but on a new mount it has to be set again before first use. This option also needs to be set again every time there is a client graph switch (_See note #3_). + +### Resolving the split-brain Once the choice for resolving split-brain is made, source brick is supposed to be set for the healing to be done. This is done using the following command: @@ -446,3 +455,5 @@ NOTE: ~~~ 2) The above mentioned process for split-brain resolution from mount will not work on nfs mounts as it doesn't provide xattrs support. + +3) A client graph switch occurs when there is a change in the client-side translator graph, typically when new translators are added to the graph on the client side or during add-brick/remove-brick operations. 
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index cfa5b75bd04..14722ce5ec5 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -167,6 +167,7 @@ #define GF_AFR_HEAL_SBRAIN "glusterfs.heal-sbrain" #define GF_AFR_SBRAIN_STATUS "replica.split-brain-status" #define GF_AFR_SBRAIN_CHOICE "replica.split-brain-choice" +#define GF_AFR_SPB_CHOICE_TIMEOUT "replica.split-brain-choice-timeout" #define GF_AFR_SBRAIN_RESOLVE "replica.split-brain-heal-finalize" #define GF_GFIDLESS_LOOKUP "gfidless-lookup" diff --git a/tests/basic/afr/split-brain-resolution.t b/tests/basic/afr/split-brain-resolution.t index 03e51cf92b1..fa1342e2cd5 100644 --- a/tests/basic/afr/split-brain-resolution.t +++ b/tests/basic/afr/split-brain-resolution.t @@ -50,6 +50,7 @@ TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/data-split-brain #Should now be able to read the contents of data-split-brain.txt EXPECT "brick0_alive" cat $M0/data-split-brain.txt +TEST setfattr -n replica.split-brain-choice-timeout -v 10 $M0/ TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/data-split-brain.txt #Should now be able to read the contents of data-split-brain.txt diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 8fbca0b6f42..46f726da734 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -413,6 +413,142 @@ out: return ret; } +int +afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode) +{ + afr_inode_ctx_t *ctx = NULL; + int ret = -1; + + if (!inode) + return ret; + + LOCK(&inode->lock); + { + __afr_inode_ctx_get (this, inode, &ctx); + if (!ctx) { + gf_log (this->name, GF_LOG_WARNING, "Failed to cancel" + " split-brain choice timer."); + goto out; + } + ctx->spb_choice = -1; + if (ctx->timer) { + gf_timer_call_cancel (this->ctx, ctx->timer); + ctx->timer = NULL; + } + ret = 0; + } +out: + UNLOCK(&inode->lock); + return ret; +} + +void 
+afr_set_split_brain_choice_cbk (void *data) +{ + inode_t *inode = data; + xlator_t *this = THIS; + + afr_spb_choice_timeout_cancel (this, inode); + inode_unref (inode); + return; +} + + +int +afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque) +{ + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_inode_ctx_t *ctx = NULL; + inode_t *inode = NULL; + loc_t *loc = NULL; + xlator_t *this = NULL; + afr_spbc_timeout_t *data = opaque; + struct timespec delta = {0, }; + + if (ret) + goto out; + + frame = data->frame; + loc = data->loc; + this = frame->this; + priv = this->private; + + delta.tv_sec = priv->spb_choice_timeout; + delta.tv_nsec = 0; + + inode = loc->inode; + if (!inode) + goto out; + + if (!(data->d_spb || data->m_spb)) { + gf_log (this->name, GF_LOG_WARNING, "Cannot set " + "replica.split-brain-choice on %s. File is" + " not in data/metadata split-brain.", + uuid_utoa (loc->gfid)); + ret = -1; + op_errno = EINVAL; + goto out; + } + + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to get" + "inode_ctx for %s", loc->name); + goto unlock; + } + + ctx->spb_choice = data->spb_child_index; + + /* Possible changes in spb-choice : + * -1 to valid : ref and inject timer + * + * valid to valid : cancel timer and inject new one + * + * valid to -1 : cancel timer and unref + * + * -1 to -1 : do not do anything + */ + + /* ctx->timer is NULL iff previous value of + * ctx->spb_choice is -1 + */ + if (ctx->timer) { + if (ctx->spb_choice == -1) { + gf_timer_call_cancel (this->ctx, ctx->timer); + ctx->timer = NULL; + inode_unref (inode); + goto unlock; + } + goto reset_timer; + } else { + if (ctx->spb_choice == -1) + goto unlock; + } + + inode = inode_ref (loc->inode); + goto set_timer; + +reset_timer: + gf_timer_call_cancel (this->ctx, ctx->timer); + ctx->timer = NULL; + +set_timer: + ctx->timer = gf_timer_call_after (this->ctx, delta, + 
afr_set_split_brain_choice_cbk, + inode); + } +unlock: + UNLOCK(&inode->lock); + inode_invalidate (inode); +out: + if (data) + GF_FREE (data); + AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); + return 0; +} int afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, @@ -3589,6 +3725,7 @@ afr_forget (xlator_t *this, inode_t *inode) uint64_t ctx_int = 0; afr_inode_ctx_t *ctx = NULL; + afr_spb_choice_timeout_cancel (this, inode); inode_ctx_del (inode, this, &ctx_int); if (!ctx_int) return 0; @@ -4552,10 +4689,10 @@ out: } int -afr_set_split_brain_status (call_frame_t *frame, xlator_t *this, - struct afr_reply *replies, - afr_transaction_type type, - gf_boolean_t *spb) +_afr_is_split_brain (call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, + gf_boolean_t *spb) { afr_private_t *priv = NULL; uint64_t *witness = NULL; @@ -4584,6 +4721,37 @@ afr_set_split_brain_status (call_frame_t *frame, xlator_t *this, } int +afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb) +{ + int ret = -1; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + + priv = this->private; + + replies = alloca0 (sizeof (*replies) * priv->child_count); + + ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); + if (ret) + goto out; + + ret = _afr_is_split_brain (frame, this, replies, + AFR_DATA_TRANSACTION, d_spb); + if (ret) + goto out; + + ret = _afr_is_split_brain (frame, this, replies, + AFR_METADATA_TRANSACTION, m_spb); +out: + if (replies) { + afr_replies_wipe (replies, priv->child_count); + replies = NULL; + } + return ret; +} + +int afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc) { gf_boolean_t d_spb = _gf_false; @@ -4594,7 +4762,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc) char *choices = NULL; char *status = NULL; dict_t *dict = NULL; - struct afr_reply 
*replies = NULL; inode_t *inode = NULL; afr_private_t *priv = NULL; xlator_t **children = NULL; @@ -4605,7 +4772,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc) inode = afr_inode_find (this, loc->gfid); if (!inode) goto out; - replies = alloca0 (sizeof (*replies) * priv->child_count); /* Calculation for string length : * (child_count X length of child-name) + strlen (" Choices :") @@ -4615,23 +4781,9 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc) */ choices = alloca0 (priv->child_count * (256 + strlen ("-client-00,")) + strlen (" Choices:")); - ret = afr_selfheal_unlocked_discover (frame, inode, loc->gfid, replies); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } - - ret = afr_set_split_brain_status (frame, this, replies, - AFR_DATA_TRANSACTION, &d_spb); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } - ret = afr_set_split_brain_status (frame, this, replies, - AFR_METADATA_TRANSACTION, &m_spb); + ret = afr_is_split_brain (frame, this, inode, loc->gfid, &d_spb, + &m_spb); if (ret) { op_errno = -ret; ret = -1; @@ -4678,8 +4830,6 @@ out: AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); if (dict) dict_unref (dict); - if (replies) - afr_replies_wipe (replies, priv->child_count); if (inode) inode_unref (inode); return ret; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index f9fde44e9e4..3db4010e997 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -979,12 +979,7 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc, int ret = -1; int op_errno = EINVAL; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - local->op = GF_FOP_SETXATTR; - + local = frame->local; local->xdata_req = dict_new (); if (!local->xdata_req) { @@ -1005,7 +1000,21 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc, ret 
= -1; goto out; } + /* set spb choice to -1 whether heal succeeds or not: + * If heal succeeds : spb-choice should be set to -1 as + * it is no longer valid; file is not + * in split-brain anymore. + * If heal doesn't succeed: + * spb-choice should be set to -1 + * otherwise reads will be served + * from spb-choice which is misleading. + */ + ret = afr_inode_split_brain_choice_set (loc->inode, this, -1); + if (ret) + gf_log (this->name, GF_LOG_WARNING, "Failed to set" + "split-brain choice to -1"); afr_heal_splitbrain_file (frame, this, loc); + ret = 0; out: if (ret < 0) AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); @@ -1013,28 +1022,6 @@ out: } int -afr_set_split_brain_choice (call_frame_t *frame, xlator_t *this, loc_t *loc, - int spb_choice) -{ - int ret = -1; - int op_errno = ENOMEM; - afr_private_t *priv = NULL; - - priv = this->private; - - ret = afr_inode_split_brain_choice_set (loc->inode, this, spb_choice); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to set" - "split-brain choice as %s for %s", - priv->children[spb_choice]->name, - loc->name); - } - inode_invalidate (loc->inode); - AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); - return ret; -} - -int afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len) { int spb_child_index = -1; @@ -1056,18 +1043,52 @@ afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len) } int +afr_can_set_split_brain_choice (void *opaque) +{ + afr_spbc_timeout_t *data = opaque; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + int ret = -1; + + frame = data->frame; + loc = data->loc; + this = frame->this; + + ret = afr_is_split_brain (frame, this, loc->inode, loc->gfid, + &data->d_spb, &data->m_spb); + + if (ret) + gf_log (this->name, GF_LOG_ERROR, "Failed to determine if %s" + " is in split-brain. 
" + "Aborting split-brain-choice set.", + uuid_utoa (loc->gfid)); + return ret; +} + +int afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame, loc_t *loc, dict_t *dict) { - int len = 0; void *value = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_spbc_timeout_t *data = NULL; + int len = 0; int spb_child_index = -1; int ret = -1; int op_errno = EINVAL; - afr_private_t *priv = NULL; priv = this->private; + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) { + ret = 1; + goto out; + } + + local->op = GF_FOP_SETXATTR; + ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value, &len); if (value) { @@ -1079,12 +1100,29 @@ afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame, spb_child_index = -1; else { ret = 1; + op_errno = EINVAL; goto out; } } - afr_set_split_brain_choice (frame, this, loc, - spb_child_index); + data = GF_CALLOC (1, sizeof (*data), gf_afr_mt_spbc_timeout_t); + if (!data) { + ret = 1; + goto out; + } + data->spb_child_index = spb_child_index; + data->frame = frame; + data->loc = loc; + ret = synctask_new (this->ctx->env, + afr_can_set_split_brain_choice, + afr_set_split_brain_choice, NULL, data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to create" + " synctask. 
Aborting split-brain choice set" + " for %s", loc->name); + ret = 1; + goto out; + } ret = 0; goto out; } @@ -1112,6 +1150,41 @@ out: } int +afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame, + dict_t *dict) +{ + int ret = -1; + int op_errno = 0; + uint64_t timeout = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = dict_get_uint64 (dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout); + if (!ret) { + priv->spb_choice_timeout = timeout * 60; + AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); + } + + return ret; +} + +static int +afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) +{ + int ret = -1; + + ret = afr_handle_split_brain_commands (this, frame, loc, dict); + if (ret == 0) + goto out; + + ret = afr_handle_spb_choice_timeout (this, frame, dict); +out: + return ret; +} + +int afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata) { @@ -1126,8 +1199,7 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, op_errno, out); - ret = afr_handle_split_brain_commands (this, frame, loc, dict); - + ret = afr_handle_special_xattr (this, frame, loc, dict); if (ret == 0) return 0; diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 05df90cc0ee..a11063c1f25 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -43,6 +43,7 @@ enum gf_afr_mem_types_ { gf_afr_mt_pos_data_t, gf_afr_mt_reply_t, gf_afr_mt_subvol_healer_t, + gf_afr_mt_spbc_timeout_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 21575fed2de..26efe93de99 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -276,6 +276,8 @@ init (xlator_t *this) GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out); + 
priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT; + GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out); GF_OPTION_INIT ("metadata-splitbrain-forced-heal", diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 6cb708ffbd7..855d3a3680e 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -38,6 +38,7 @@ #define AFR_LOCKEE_COUNT_MAX 3 #define AFR_DOM_COUNT_MAX 3 #define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ +#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/ #define ARBITER_BRICK_INDEX 2 @@ -130,6 +131,7 @@ typedef struct _afr_private { void *pump_private; gf_boolean_t use_afr_in_pump; gf_boolean_t consistent_metadata; + uint64_t spb_choice_timeout; } afr_private_t; @@ -742,8 +744,17 @@ typedef struct _afr_local { typedef struct _afr_inode_ctx { uint64_t read_subvol; int spb_choice; + gf_timer_t *timer; } afr_inode_ctx_t; +typedef struct afr_spbc_timeout { + call_frame_t *frame; + gf_boolean_t d_spb; + gf_boolean_t m_spb; + loc_t *loc; + int spb_child_index; +} afr_spbc_timeout_t; + /* did a call fail due to a child failing? */ #define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ ((op_errno == ENOTCONN) || \ @@ -1046,4 +1057,13 @@ afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, int *spb_choice); int afr_get_child_index_from_name (xlator_t *this, char *name); + +int +afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb); +int +afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode); + +int +afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque); #endif /* __AFR_H__ */ |