summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2015-01-09 08:17:19 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2015-06-22 01:42:10 -0700
commit41805d75e47f2ce9f71d99e556885db008948fb2 (patch)
treeef34f292f342397f001f7abf77cfcc9079ac8dc0
parent79e4c7b2fad6db15863efb4e979525b1bd4862ea (diff)
afr: complete conservative merge even in case of gfid split-brain.
Problem: While performing conservative merge, we bail out of the merge if we encounter a file with mismatching gfid or type. What this means is all entries that come after the mismatching file (during the merge) never get healed, no matter how many index heals are done. Fix: Continue with the merging of rest of the entries even if a gfid/type mismatch is found, but ensure that post-op does not happen on the parent dir in such a case. Change-Id: I9bbfccc8906007daa53a0750ddd401dcf83943f8 BUG: 1180545 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Reviewed-on: http://review.gluster.org/9429 Reviewed-by: Anuradha Talur <atalur@redhat.com> Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
-rw-r--r--tests/bugs/replicate/bug-1180545.t48
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c20
2 files changed, 66 insertions, 2 deletions
diff --git a/tests/bugs/replicate/bug-1180545.t b/tests/bugs/replicate/bug-1180545.t
new file mode 100644
index 00000000000..748d5defa91
--- /dev/null
+++ b/tests/bugs/replicate/bug-1180545.t
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+#Create gfid split-brain of directory and check if conservative merge
+#completes successfully.
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+
+TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1}
+TEST $CLI volume set $V0 cluster.heal-timeout 60
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume start $V0
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
+
+#Create files with alternate brick down. One file has gfid mismatch.
+TEST mkdir $M0/DIR
+
+TEST kill_brick $V0 $H0 $B0/brick1
+TEST touch $M0/DIR/FILE
+TEST touch $M0/DIR/file{1..5}
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
+
+TEST kill_brick $V0 $H0 $B0/brick0
+TEST touch $M0/DIR/FILE
+TEST touch $M0/DIR/file{6..10}
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+
+#Trigger heal and verify number of entries in backend
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+EXPECT_WITHIN PROCESS_UP_TIMEOUT "Y" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+TEST $CLI volume heal $V0
+
+EXPECT_WITHIN $HEAL_TIMEOUT '2' count_sh_entries $B0/brick0
+EXPECT_WITHIN $HEAL_TIMEOUT '2' count_sh_entries $B0/brick1
+#Two entries for DIR and two for FILE
+EXPECT_WITHIN $HEAL_TIMEOUT "4" afr_get_pending_heal_count $V0
+TEST diff <(ls $B0/brick0/DIR) <(ls $B0/brick1/DIR)
+cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index b78bfa99f20..28e332db740 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -503,6 +503,7 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
call_frame_t *iter_frame = NULL;
xlator_t *subvol = NULL;
afr_private_t *priv = NULL;
+ gf_boolean_t mismatch = _gf_false;
priv = this->private;
subvol = priv->children[child];
@@ -532,6 +533,11 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
entry->d_name);
AFR_STACK_RESET (iter_frame);
+ if (ret == -1) {
+ /* gfid or type mismatch. */
+ mismatch = _gf_true;
+ ret = 0;
+ }
if (ret)
break;
}
@@ -542,6 +548,9 @@ afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
}
AFR_STACK_DESTROY (iter_frame);
+ if (mismatch == _gf_true)
+ /* undo pending will be skipped */
+ ret = -1;
return ret;
}
@@ -552,6 +561,7 @@ afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
{
int i = 0;
afr_private_t *priv = NULL;
+ gf_boolean_t mismatch = _gf_false;
int ret = 0;
priv = this->private;
@@ -563,14 +573,20 @@ afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (i != source && !healed_sinks[i])
continue;
ret = afr_selfheal_entry_do_subvol (frame, this, fd, i);
+ if (ret == -1) {
+ /* gfid or type mismatch. */
+ mismatch = _gf_true;
+ continue;
+ }
if (ret)
break;
}
+ if (mismatch == _gf_true)
+ /* undo pending will be skipped */
+ ret = -1;
return ret;
}
-
-
static int
__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
unsigned char *locked_on)