summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnuradha <atalur@redhat.com>2015-04-30 15:31:13 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2015-05-07 01:18:19 -0700
commit6c578c03f0d44913d264494de5df004544c96271 (patch)
treeb35d92dcfa75c6d05ab2b13d06622dbf17975cc3
parentd68a2dbb3a4be89a4a45661310ae3f32542df20f (diff)
cluster/afr : Prevent inode-evict during split-brain resolution
1) Provided setfattr command to set timeout for split-brain choice. 2) If split-brain inspection/resolution is being done from the mount for a file, ref the inode when split-brain-choice is set. This inode will be unconditionally unref-ed after timeout seconds set by the user/default otherwise. 3) Updated the doc and testcase to reflect the changes. Change-Id: I15c9037dee28855f21e680e7e3632e1f48dba4e1 BUG: 1209104 Signed-off-by: Anuradha <atalur@redhat.com> Reviewed-on: http://review.gluster.org/10134 Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com> Reviewed-by: Ravishankar N <ravishankar@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com> Tested-by: NetBSD Build System Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
-rw-r--r--doc/features/heal-info-and-split-brain-resolution.md11
-rw-r--r--libglusterfs/src/glusterfs.h1
-rw-r--r--tests/basic/afr/split-brain-resolution.t1
-rw-r--r--xlators/cluster/afr/src/afr-common.c198
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c140
-rw-r--r--xlators/cluster/afr/src/afr-mem-types.h1
-rw-r--r--xlators/cluster/afr/src/afr.c2
-rw-r--r--xlators/cluster/afr/src/afr.h20
8 files changed, 316 insertions, 58 deletions
diff --git a/doc/features/heal-info-and-split-brain-resolution.md b/doc/features/heal-info-and-split-brain-resolution.md
index 6ca2be2f02f..7a6691db14e 100644
--- a/doc/features/heal-info-and-split-brain-resolution.md
+++ b/doc/features/heal-info-and-split-brain-resolution.md
@@ -426,6 +426,15 @@ Now performing cat operation on the file will again result in input/output error
cat: file1: Input/output error
~~~
+Each time replica.split-brain-choice is set on a file, the user can access that file for a limited timeout period. This timeout is configurable by the user and defaults to 5 minutes.
+### To set split-brain-choice timeout
+A setfattr command from the mount allows the user to set this timeout, which is specified in minutes.
+~~~
+# setfattr -n replica.split-brain-choice-timeout -v <timeout-in-minutes> <mount_point/file>
+~~~
+This is a global timeout, i.e. it applies to all files for as long as the mount exists. The timeout therefore need not be set each time a file is inspected, but it must be set again on every new mount. It also needs to be set again every time there is a client graph switch (_See note #3_).
+
+### Resolving the split-brain
Once the choice for resolving split-brain is made, source brick is supposed to be set for the healing to be done.
This is done using the following command:
@@ -446,3 +455,5 @@ NOTE:
~~~
2) The above mentioned process for split-brain resolution from mount will not work on nfs mounts as it doesn't provide xattrs support.
+
+3) Client graph switch occurs when there is a change in the client side translator graph; typically during addition of new translators to the graph on client side and add-brick/remove-brick operations.
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
index cfa5b75bd04..14722ce5ec5 100644
--- a/libglusterfs/src/glusterfs.h
+++ b/libglusterfs/src/glusterfs.h
@@ -167,6 +167,7 @@
#define GF_AFR_HEAL_SBRAIN "glusterfs.heal-sbrain"
#define GF_AFR_SBRAIN_STATUS "replica.split-brain-status"
#define GF_AFR_SBRAIN_CHOICE "replica.split-brain-choice"
+#define GF_AFR_SPB_CHOICE_TIMEOUT "replica.split-brain-choice-timeout"
#define GF_AFR_SBRAIN_RESOLVE "replica.split-brain-heal-finalize"
#define GF_GFIDLESS_LOOKUP "gfidless-lookup"
diff --git a/tests/basic/afr/split-brain-resolution.t b/tests/basic/afr/split-brain-resolution.t
index 03e51cf92b1..fa1342e2cd5 100644
--- a/tests/basic/afr/split-brain-resolution.t
+++ b/tests/basic/afr/split-brain-resolution.t
@@ -50,6 +50,7 @@ TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/data-split-brain
#Should now be able to read the contents of data-split-brain.txt
EXPECT "brick0_alive" cat $M0/data-split-brain.txt
+TEST setfattr -n replica.split-brain-choice-timeout -v 10 $M0/
TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/data-split-brain.txt
#Should now be able to read the contents of data-split-brain.txt
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 8fbca0b6f42..46f726da734 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -413,6 +413,142 @@ out:
return ret;
}
+/*
+ * Reset the split-brain choice stored in @inode's afr context to -1 and
+ * cancel the pending split-brain-choice timer, if one is armed.
+ *
+ * Returns 0 on success and -1 when @inode is NULL or no afr inode context
+ * could be obtained for it.
+ */
+int
+afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode)
+{
+        afr_inode_ctx_t *ictx   = NULL;
+        int              status = -1;
+
+        if (inode == NULL)
+                return status;
+
+        LOCK(&inode->lock);
+        {
+                __afr_inode_ctx_get (this, inode, &ictx);
+                if (ictx != NULL) {
+                        ictx->spb_choice = -1;
+                        if (ictx->timer != NULL) {
+                                gf_timer_call_cancel (this->ctx, ictx->timer);
+                                ictx->timer = NULL;
+                        }
+                        status = 0;
+                } else {
+                        gf_log (this->name, GF_LOG_WARNING, "Failed to cancel"
+                                " split-brain choice timer.");
+                }
+        }
+        UNLOCK(&inode->lock);
+        return status;
+}
+
+/*
+ * Timer callback for an expired split-brain choice. @data is the inode that
+ * was registered with the timer: its choice is reset via
+ * afr_spb_choice_timeout_cancel() and one inode reference is released.
+ */
+void
+afr_set_split_brain_choice_cbk (void *data)
+{
+        inode_t  *expired = data;
+        xlator_t *xl      = THIS;
+
+        afr_spb_choice_timeout_cancel (xl, expired);
+        inode_unref (expired);
+}
+
+
+/*
+ * Synctask completion callback that applies a replica.split-brain-choice
+ * request and unwinds the originating setxattr.
+ *
+ * @ret    : result of the synctask body (afr_can_set_split_brain_choice);
+ *           non-zero means the split-brain state could not be determined.
+ * @frame  : ignored on entry; the setxattr frame to unwind is taken from
+ *           @opaque instead.
+ * @opaque : afr_spbc_timeout_t describing the request. Ownership passes to
+ *           this function, which frees it.
+ *
+ * State transitions of the per-inode choice:
+ *   -1    -> valid : ref the inode and arm the expiry timer
+ *   valid -> valid : cancel the old timer and arm a new one (ref is reused)
+ *   valid -> -1    : cancel the timer and drop the ref
+ *   -1    -> -1    : nothing to do
+ * ctx->timer is NULL iff the previous value of ctx->spb_choice was -1.
+ *
+ * Always returns 0; success or failure is reported through the unwind.
+ */
+int
+afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque)
+{
+        int                 op_errno = ENOMEM;
+        afr_private_t      *priv     = NULL;
+        afr_inode_ctx_t    *ctx      = NULL;
+        inode_t            *inode    = NULL;
+        loc_t              *loc      = NULL;
+        xlator_t           *this     = NULL;
+        afr_spbc_timeout_t *data     = opaque;
+        struct timespec     delta    = {0, };
+        gf_boolean_t        drop_ref = _gf_false;
+
+        /* Load the real setxattr frame before any error exit: the frame
+         * argument is the one given to synctask_new(), which the caller
+         * passes as NULL, so it must never be the frame that is unwound.
+         */
+        frame = data->frame;
+        loc = data->loc;
+        this = frame->this;
+        priv = this->private;
+
+        if (ret) {
+                /* Callers in this file treat a negative ret from
+                 * afr_is_split_brain() as -errno; otherwise keep the
+                 * default errno.
+                 */
+                if (ret < 0)
+                        op_errno = -ret;
+                ret = -1;
+                goto out;
+        }
+
+        delta.tv_sec = priv->spb_choice_timeout;
+        delta.tv_nsec = 0;
+
+        inode = loc->inode;
+        if (!inode) {
+                /* Nothing can be recorded without an inode; report failure
+                 * rather than unwinding with success.
+                 */
+                ret = -1;
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (!(data->d_spb || data->m_spb)) {
+                gf_log (this->name, GF_LOG_WARNING, "Cannot set "
+                        "replica.split-brain-choice on %s. File is"
+                        " not in data/metadata split-brain.",
+                        uuid_utoa (loc->gfid));
+                ret = -1;
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        LOCK(&inode->lock);
+        {
+                ret = __afr_inode_ctx_get (this, inode, &ctx);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to get"
+                                " inode_ctx for %s", loc->name);
+                        ret = -1;
+                        goto unlock;
+                }
+
+                ctx->spb_choice = data->spb_child_index;
+
+                if (ctx->spb_choice == -1) {
+                        if (ctx->timer) {
+                                /* valid -> -1 */
+                                gf_timer_call_cancel (this->ctx, ctx->timer);
+                                ctx->timer = NULL;
+                                drop_ref = _gf_true;
+                        }
+                        /* -1 -> -1 needs no action */
+                        goto unlock;
+                }
+
+                if (ctx->timer) {
+                        /* valid -> valid: keep the ref taken when the
+                         * choice was first set, only replace the timer.
+                         */
+                        gf_timer_call_cancel (this->ctx, ctx->timer);
+                        ctx->timer = NULL;
+                } else {
+                        /* -1 -> valid: this ref is released by the timer
+                         * callback or by a later reset to -1.
+                         */
+                        inode_ref (inode);
+                }
+
+                ctx->timer = gf_timer_call_after (this->ctx, delta,
+                                                  afr_set_split_brain_choice_cbk,
+                                                  inode);
+                if (!ctx->timer) {
+                        /* Without a timer nobody would ever release the
+                         * ref or expire the choice; undo both so the
+                         * "timer is NULL iff choice is -1" rule holds.
+                         */
+                        ctx->spb_choice = -1;
+                        drop_ref = _gf_true;
+                        ret = -1;
+                        op_errno = ENOMEM;
+                }
+        }
+unlock:
+        UNLOCK(&inode->lock);
+        inode_invalidate (inode);
+        /* Released only after inode->lock has been dropped. */
+        if (drop_ref)
+                inode_unref (inode);
+out:
+        GF_FREE (data);
+        AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
+        return 0;
+}
int
afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused,
@@ -3589,6 +3725,7 @@ afr_forget (xlator_t *this, inode_t *inode)
uint64_t ctx_int = 0;
afr_inode_ctx_t *ctx = NULL;
+ afr_spb_choice_timeout_cancel (this, inode);
inode_ctx_del (inode, this, &ctx_int);
if (!ctx_int)
return 0;
@@ -4552,10 +4689,10 @@ out:
}
int
-afr_set_split_brain_status (call_frame_t *frame, xlator_t *this,
- struct afr_reply *replies,
- afr_transaction_type type,
- gf_boolean_t *spb)
+_afr_is_split_brain (call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type,
+ gf_boolean_t *spb)
{
afr_private_t *priv = NULL;
uint64_t *witness = NULL;
@@ -4584,6 +4721,37 @@ afr_set_split_brain_status (call_frame_t *frame, xlator_t *this,
}
+/*
+ * Run an unlocked discover on @inode/@gfid and evaluate the replies for
+ * data and metadata split-brain, storing the verdicts in @d_spb and @m_spb.
+ *
+ * Returns 0 on success, otherwise the non-zero value returned by the first
+ * failing step. The metadata check is skipped when an earlier step fails.
+ */
int
+afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
+                    uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb)
+{
+        afr_private_t    *priv    = this->private;
+        struct afr_reply *replies = NULL;
+        int               ret     = -1;
+
+        replies = alloca0 (sizeof (*replies) * priv->child_count);
+
+        ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+
+        if (!ret)
+                ret = _afr_is_split_brain (frame, this, replies,
+                                           AFR_DATA_TRANSACTION, d_spb);
+
+        if (!ret)
+                ret = _afr_is_split_brain (frame, this, replies,
+                                           AFR_METADATA_TRANSACTION, m_spb);
+
+        /* The replies are wiped on every path, success or failure. */
+        if (replies)
+                afr_replies_wipe (replies, priv->child_count);
+
+        return ret;
+}
+
+int
afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)
{
gf_boolean_t d_spb = _gf_false;
@@ -4594,7 +4762,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)
char *choices = NULL;
char *status = NULL;
dict_t *dict = NULL;
- struct afr_reply *replies = NULL;
inode_t *inode = NULL;
afr_private_t *priv = NULL;
xlator_t **children = NULL;
@@ -4605,7 +4772,6 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)
inode = afr_inode_find (this, loc->gfid);
if (!inode)
goto out;
- replies = alloca0 (sizeof (*replies) * priv->child_count);
/* Calculation for string length :
* (child_count X length of child-name) + strlen (" Choices :")
@@ -4615,23 +4781,9 @@ afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc)
*/
choices = alloca0 (priv->child_count * (256 + strlen ("-client-00,")) +
strlen (" Choices:"));
- ret = afr_selfheal_unlocked_discover (frame, inode, loc->gfid, replies);
- if (ret) {
- op_errno = -ret;
- ret = -1;
- goto out;
- }
-
- ret = afr_set_split_brain_status (frame, this, replies,
- AFR_DATA_TRANSACTION, &d_spb);
- if (ret) {
- op_errno = -ret;
- ret = -1;
- goto out;
- }
- ret = afr_set_split_brain_status (frame, this, replies,
- AFR_METADATA_TRANSACTION, &m_spb);
+ ret = afr_is_split_brain (frame, this, inode, loc->gfid, &d_spb,
+ &m_spb);
if (ret) {
op_errno = -ret;
ret = -1;
@@ -4678,8 +4830,6 @@ out:
AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
if (dict)
dict_unref (dict);
- if (replies)
- afr_replies_wipe (replies, priv->child_count);
if (inode)
inode_unref (inode);
return ret;
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index f9fde44e9e4..3db4010e997 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -979,12 +979,7 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,
int ret = -1;
int op_errno = EINVAL;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
-
- local->op = GF_FOP_SETXATTR;
-
+ local = frame->local;
local->xdata_req = dict_new ();
if (!local->xdata_req) {
@@ -1005,7 +1000,21 @@ afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,
ret = -1;
goto out;
}
+ /* set spb choice to -1 whether heal succeeds or not:
+ * If heal succeeds : spb-choice should be set to -1 as
+ * it is no longer valid; file is not
+ * in split-brain anymore.
+ * If heal doesn't succeed:
+ * spb-choice should be set to -1
+ * otherwise reads will be served
+ * from spb-choice which is misleading.
+ */
+ ret = afr_inode_split_brain_choice_set (loc->inode, this, -1);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "Failed to set"
+ "split-brain choice to -1");
afr_heal_splitbrain_file (frame, this, loc);
+ ret = 0;
out:
if (ret < 0)
AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
@@ -1013,28 +1022,6 @@ out:
}
int
-afr_set_split_brain_choice (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int spb_choice)
-{
- int ret = -1;
- int op_errno = ENOMEM;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- ret = afr_inode_split_brain_choice_set (loc->inode, this, spb_choice);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to set"
- "split-brain choice as %s for %s",
- priv->children[spb_choice]->name,
- loc->name);
- }
- inode_invalidate (loc->inode);
- AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
- return ret;
-}
-
-int
afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len)
{
int spb_child_index = -1;
@@ -1056,18 +1043,52 @@ afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len)
}
+/*
+ * Synctask body for a split-brain-choice request: determines whether the
+ * file named by the request's loc is in data and/or metadata split-brain
+ * and records the answers in the request's d_spb / m_spb fields.
+ *
+ * Returns the result of afr_is_split_brain(); a non-zero result is logged.
+ */
int
+afr_can_set_split_brain_choice (void *opaque)
+{
+        afr_spbc_timeout_t *req   = opaque;
+        call_frame_t       *frame = req->frame;
+        loc_t              *loc   = req->loc;
+        xlator_t           *this  = frame->this;
+        int                 ret   = -1;
+
+        ret = afr_is_split_brain (frame, this, loc->inode, loc->gfid,
+                                  &req->d_spb, &req->m_spb);
+        if (ret == 0)
+                return ret;
+
+        gf_log (this->name, GF_LOG_ERROR, "Failed to determine if %s"
+                " is in split-brain. "
+                "Aborting split-brain-choice set.",
+                uuid_utoa (loc->gfid));
+        return ret;
+}
+
+int
afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame,
loc_t *loc, dict_t *dict)
{
- int len = 0;
void *value = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_spbc_timeout_t *data = NULL;
+ int len = 0;
int spb_child_index = -1;
int ret = -1;
int op_errno = EINVAL;
- afr_private_t *priv = NULL;
priv = this->private;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local) {
+ ret = 1;
+ goto out;
+ }
+
+ local->op = GF_FOP_SETXATTR;
+
ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value,
&len);
if (value) {
@@ -1079,12 +1100,29 @@ afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame,
spb_child_index = -1;
else {
ret = 1;
+ op_errno = EINVAL;
goto out;
}
}
- afr_set_split_brain_choice (frame, this, loc,
- spb_child_index);
+ data = GF_CALLOC (1, sizeof (*data), gf_afr_mt_spbc_timeout_t);
+ if (!data) {
+ ret = 1;
+ goto out;
+ }
+ data->spb_child_index = spb_child_index;
+ data->frame = frame;
+ data->loc = loc;
+ ret = synctask_new (this->ctx->env,
+ afr_can_set_split_brain_choice,
+ afr_set_split_brain_choice, NULL, data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create"
+ " synctask. Aborting split-brain choice set"
+ " for %s", loc->name);
+ ret = 1;
+ goto out;
+ }
ret = 0;
goto out;
}
@@ -1112,6 +1150,41 @@ out:
}
+/*
+ * Handle a setxattr of replica.split-brain-choice-timeout.
+ *
+ * The value in @dict is read as a number of minutes and stored, converted
+ * to seconds, in the xlator's private spb_choice_timeout.
+ *
+ * Returns 0 when the key was present and @frame has been unwound (with
+ * success, or with EINVAL when the value is too large to convert to
+ * seconds). Returns the non-zero dict lookup result when the key is absent,
+ * in which case @frame is left untouched for the caller to handle.
+ */
int
+afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame,
+                               dict_t *dict)
+{
+        int            ret      = -1;
+        int            op_ret   = 0;
+        int            op_errno = 0;
+        uint64_t       timeout  = 0;
+        afr_private_t *priv     = NULL;
+
+        priv = this->private;
+
+        ret = dict_get_uint64 (dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout);
+        if (ret)
+                return ret;
+
+        if (timeout > UINT64_MAX / 60) {
+                /* minutes -> seconds would wrap around; refuse the value
+                 * instead of storing a meaningless timeout.
+                 */
+                gf_log (this->name, GF_LOG_WARNING, "Invalid value for "
+                        GF_AFR_SPB_CHOICE_TIMEOUT ": timeout too large");
+                op_ret = -1;
+                op_errno = EINVAL;
+        } else {
+                priv->spb_choice_timeout = timeout * 60;
+        }
+
+        AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
+        return 0;
+}
+
+/*
+ * Dispatch afr's virtual setxattr keys: the split-brain commands are tried
+ * first, then the split-brain-choice timeout. Returns 0 as soon as one
+ * handler reports 0, otherwise the result of the timeout handler.
+ */
+static int
+afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc,
+                          dict_t *dict)
+{
+        if (afr_handle_split_brain_commands (this, frame, loc, dict) == 0)
+                return 0;
+
+        return afr_handle_spb_choice_timeout (this, frame, dict);
+}
+
+int
afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
int32_t flags, dict_t *xdata)
{
@@ -1126,8 +1199,7 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
op_errno, out);
- ret = afr_handle_split_brain_commands (this, frame, loc, dict);
-
+ ret = afr_handle_special_xattr (this, frame, loc, dict);
if (ret == 0)
return 0;
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 05df90cc0ee..a11063c1f25 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -43,6 +43,7 @@ enum gf_afr_mem_types_ {
gf_afr_mt_pos_data_t,
gf_afr_mt_reply_t,
gf_afr_mt_subvol_healer_t,
+ gf_afr_mt_spbc_timeout_t,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 21575fed2de..26efe93de99 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -276,6 +276,8 @@ init (xlator_t *this)
GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out);
+ priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;
+
GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out);
GF_OPTION_INIT ("metadata-splitbrain-forced-heal",
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 6cb708ffbd7..855d3a3680e 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -38,6 +38,7 @@
#define AFR_LOCKEE_COUNT_MAX 3
#define AFR_DOM_COUNT_MAX 3
#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
+#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/
#define ARBITER_BRICK_INDEX 2
@@ -130,6 +131,7 @@ typedef struct _afr_private {
void *pump_private;
gf_boolean_t use_afr_in_pump;
gf_boolean_t consistent_metadata;
+ uint64_t spb_choice_timeout;
} afr_private_t;
@@ -742,8 +744,17 @@ typedef struct _afr_local {
typedef struct _afr_inode_ctx {
uint64_t read_subvol;
int spb_choice;
+ gf_timer_t *timer;
} afr_inode_ctx_t;
+typedef struct afr_spbc_timeout { /* request handed to the split-brain-choice synctask */
+ call_frame_t *frame; /* setxattr frame that is unwound once the request is processed */
+ gf_boolean_t d_spb; /* filled by afr_is_split_brain(): data split-brain verdict */
+ gf_boolean_t m_spb; /* filled by afr_is_split_brain(): metadata split-brain verdict */
+ loc_t *loc; /* location of the file the choice applies to */
+ int spb_child_index; /* child index to read from, or -1 to clear the choice */
+} afr_spbc_timeout_t;
+
/* did a call fail due to a child failing? */
#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
((op_errno == ENOTCONN) || \
@@ -1046,4 +1057,13 @@ afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
int *spb_choice);
int
afr_get_child_index_from_name (xlator_t *this, char *name);
+
+int
+afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb);
+int
+afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode);
+
+int
+afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque);
#endif /* __AFR_H__ */