summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ravishankar N <ravishankar@redhat.com> 2017-01-30 09:54:16 +0530
committer Pranith Kumar Karampuri <pkarampu@redhat.com> 2017-02-09 20:37:00 -0500
commit 0e03336a9362e5717e561f76b0c543e5a197b31b (patch)
tree 28be72abb87f19d28b82c9ac7cec61533a968d05
parent 55625293093d485623f3f3d98687cd1e2c594460 (diff)
afr: all children of AFR must be up to resolve split-brain
Problem:
The various split-brain resolution policies (favorite-child-policy based, CLI based and mount (get/setfattr) based) attempt to resolve split-brain even when not all bricks of the replica are up. This is a problem when, say, in a replica 3 volume, the only good copy is down and the other 2 bricks are up and blame each other (i.e. they appear to be in split-brain). We end up healing the file in such a case and allowing I/O on it.

Fix:
A decision on whether the file is in split-brain or not must be taken only if we are able to examine the afr xattrs of *all* bricks of a given replica.

Change-Id: Icddb1268b380005799990f5379ef957d84639ef9
BUG: 1417522
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: https://review.gluster.org/16476
Smoke: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
-rw-r--r--tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t66
-rw-r--r--xlators/cluster/afr/src/afr-common.c32
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c38
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h6
4 files changed, 127 insertions, 15 deletions
diff --git a/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
new file mode 100644
index 00000000000..4592ebf8d23
--- /dev/null
+++ b/tests/bugs/replicate/bug-1417522-block-split-brain-resolution.t
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Regression test for BUG 1417522: no split-brain resolution method
+# (favorite-child-policy, CLI, or mount xattrs) may heal a file while a
+# brick of the replica is down.
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+cleanup;
+
+# Replica-3 volume with the self-heal daemon and all client-side heals off,
+# so nothing heals the file behind the test's back.
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..2}
+TEST $CLI volume set $V0 self-heal-daemon off
+TEST $CLI volume set $V0 data-self-heal off
+TEST $CLI volume set $V0 entry-self-heal off
+TEST $CLI volume set $V0 metadata-self-heal off
+TEST $CLI volume start $V0
+
+TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
+TEST touch $M0/file
+
+# Write once with brick 1 down and once with brick 2 down. Brick 0 sees both
+# writes and is the only good copy; bricks 1 and 2 each miss one write.
+TEST kill_brick $V0 $H0 $B0/${V0}1
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=10
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+TEST kill_brick $V0 $H0 $B0/${V0}2
+TEST dd if=/dev/urandom of=$M0/file bs=1024 count=20
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2
+
+# Take the good copy down: the two remaining bricks blame each other, so
+# reads must fail. Record the good copy's checksum straight from the brick.
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST ! dd if=$M0/file of=/dev/null
+SOURCE_BRICK_MD5=$(md5sum $B0/${V0}0/file | cut -d' ' -f1)
+
+# Various fav-child policies must not heal the file when some bricks are down.
+TEST $CLI volume set $V0 favorite-child-policy size
+TEST ! dd if=$M0/file of=/dev/null
+TEST $CLI volume set $V0 favorite-child-policy ctime
+TEST ! dd if=$M0/file of=/dev/null
+TEST $CLI volume set $V0 favorite-child-policy mtime
+TEST ! dd if=$M0/file of=/dev/null
+TEST $CLI volume set $V0 favorite-child-policy majority
+TEST ! dd if=$M0/file of=/dev/null
+
+# CLI/mount based split-brain resolution must also not work.
+TEST ! $CLI volume heal $V0 split-brain bigger-file /file
+TEST ! $CLI volume heal $V0 split-brain mtime /file
+# Must target the existing /file; a non-existent path would fail for the
+# wrong reason and make this check pass vacuously.
+TEST ! $CLI volume heal $V0 split-brain source-brick $H0:$B0/${V0}2 /file
+
+TEST ! getfattr -n replica.split-brain-status $M0/file
+TEST ! setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/file
+
+# Bring all bricks back up and launch heal.
+TEST $CLI volume set $V0 self-heal-daemon on
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+# The heal runs asynchronously, so poll for completion instead of checking
+# once. NOTE(review): HEAL_TIMEOUT is expected to come from include.rc.
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+# With every brick up, the heal must have picked brick 0 as the source.
+B1_MD5=$(md5sum $B0/${V0}1/file | cut -d' ' -f1)
+B2_MD5=$(md5sum $B0/${V0}2/file | cut -d' ' -f1)
+TEST [ "$SOURCE_BRICK_MD5" == "$B1_MD5" ]
+TEST [ "$SOURCE_BRICK_MD5" == "$B2_MD5" ]
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+cleanup;
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 7f4db5c3778..ac834e90f4b 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -748,14 +748,17 @@ afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque)
gf_boolean_t timer_reset = _gf_false;
int old_spb_choice = -1;
- if (ret)
- goto out;
-
frame = data->frame;
loc = data->loc;
this = frame->this;
priv = this->private;
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
delta.tv_sec = priv->spb_choice_timeout;
delta.tv_nsec = 0;
@@ -5611,6 +5614,12 @@ afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
if (ret)
goto out;
+ if (!afr_can_decide_split_brain_source_sinks (replies,
+ priv->child_count)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
ret = _afr_is_split_brain (frame, this, replies,
AFR_DATA_TRANSACTION, d_spb);
if (ret)
@@ -5663,6 +5672,13 @@ afr_get_split_brain_status (void *opaque)
if (!inode)
goto out;
+ dict = dict_new ();
+ if (!dict) {
+ op_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
/* Calculation for string length :
* (child_count X length of child-name) + strlen (" Choices :")
* child-name consists of :
@@ -5676,13 +5692,9 @@ afr_get_split_brain_status (void *opaque)
&m_spb);
if (ret) {
op_errno = -ret;
- ret = -1;
- goto out;
- }
-
- dict = dict_new ();
- if (!dict) {
- op_errno = ENOMEM;
+ if (ret == -EAGAIN)
+ ret = dict_set_str (dict, GF_AFR_SBRAIN_STATUS,
+ SBRAIN_HEAL_NO_GO_MSG);
ret = -1;
goto out;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 17e15d760c6..a8d3febb21b 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -473,6 +473,19 @@ afr_dict_contains_heal_op (call_frame_t *frame)
return _gf_true;
}
+/*
+ * Tell whether @replies is complete enough to decide on split-brain.
+ *
+ * Scans the first @child_count entries of @replies. Returns _gf_false as
+ * soon as an entry has valid != 1 or a non-zero op_ret, and _gf_true only
+ * when every scanned entry passes both checks.
+ */
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks (struct afr_reply *replies,
+                                         int child_count)
+{
+        int child = 0;
+
+        while (child < child_count) {
+                /* A usable reply is both marked valid and successful. */
+                if (!(replies[child].valid == 1 &&
+                      replies[child].op_ret == 0)) {
+                        return _gf_false;
+                }
+                child++;
+        }
+
+        return _gf_true;
+}
+
int
afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
xlator_t *this, unsigned char *sources,
@@ -511,6 +524,14 @@ afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame,
}
xdata_rsp = local->xdata_rsp;
+ if (!afr_can_decide_split_brain_source_sinks (replies,
+ priv->child_count)) {
+ ret = dict_set_str (xdata_rsp, "sh-fail-msg",
+ SBRAIN_HEAL_NO_GO_MSG);
+ ret = -1;
+ goto out;
+ }
+
for (i = 0 ; i < priv->child_count; i++)
if (locked_on[i])
sources[i] = 1;
@@ -749,26 +770,35 @@ afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies,
int fav_child = -1;
priv = this->private;
+ if (!afr_can_decide_split_brain_source_sinks (replies,
+ priv->child_count)) {
+ return -1;
+ }
+
switch (priv->fav_child_policy) {
case AFR_FAV_CHILD_BY_SIZE:
fav_child = afr_sh_fav_by_size (this, replies, inode);
- if (policy_str && fav_child >= 0)
+ if (policy_str && fav_child >= 0) {
*policy_str = "SIZE";
+ }
break;
case AFR_FAV_CHILD_BY_CTIME:
fav_child = afr_sh_fav_by_ctime (this, replies, inode);
- if (policy_str && fav_child >= 0)
+ if (policy_str && fav_child >= 0) {
*policy_str = "CTIME";
+ }
break;
case AFR_FAV_CHILD_BY_MTIME:
fav_child = afr_sh_fav_by_mtime (this, replies, inode);
- if (policy_str && fav_child >= 0)
+ if (policy_str && fav_child >= 0) {
*policy_str = "MTIME";
+ }
break;
case AFR_FAV_CHILD_BY_MAJORITY:
fav_child = afr_sh_fav_by_majority (this, replies, inode);
- if (policy_str && fav_child >= 0)
+ if (policy_str && fav_child >= 0) {
*policy_str = "MAJORITY";
+ }
break;
case AFR_FAV_CHILD_NONE:
default:
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 500227abe24..a33905033cc 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -81,7 +81,8 @@
#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0)
-
+#define SBRAIN_HEAL_NO_GO_MSG "Failed to obtain replies from all bricks of "\
+ "the replica (are they up?). Cannot resolve split-brain."
int
afr_selfheal (xlator_t *this, uuid_t gfid);
@@ -220,6 +221,9 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
gf_boolean_t
afr_dict_contains_heal_op (call_frame_t *frame);
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks (struct afr_reply *replies,
+ int child_count);
int
afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
inode_t *inode,