tests: fix spurious regression test failures

Change-Id: I752aeb8e25f43281d2f5cf33d0ff5aeae49687e7
BUG: 764966
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: http://review.gluster.org/4794
Reviewed-by: Anand Avati <avati@redhat.com>
Tested-by: Anand Avati <avati@redhat.com>
author: Jeff Darcy <jdarcy@redhat.com> 2013-04-08 15:32:49 -0400
committer: Anand Avati <avati@redhat.com> 2013-04-08 19:19:54 -0700
commit: 1ca50941d693f48e73723b12a1466a70dd272ea2 (patch)
tree: 470aa14da936cc13625f36e596081df1af598972 /tests/bugs/bug-873962.t
parent: 108475decfde8fd30818c37604ef8db837220624 (diff)
1 files changed, 21 insertions, 0 deletions
diff --git a/tests/bugs/bug-873962.t b/tests/bugs/bug-873962.t
index 6a85cee0cde..b245cc3dab5 100755
--- a/tests/bugs/bug-873962.t
+++ b/tests/bugs/bug-873962.t
@@ -13,6 +13,26 @@ TEST $CLI volume info;
 B0_hiphenated=`echo $B0 | tr '/' '-'`
 TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2}
 
+# If we allow self-heal to happen in the background, we'll get spurious
+# failures - especially at the point labeled "FAIL HERE" but
+# occasionally elsewhere.  This behavior is very timing-dependent.  It
+# doesn't show up in Jenkins, but it does on JD's and KP's machines, and
+# it got sharply worse because of an unrelated fsync change (6ae6f3d)
+# which changed timing.  Putting anything at the FAIL HERE marker tends
+# to make it go away most of the time on affected machines, even if the
+# "anything" is unrelated.
+#
+# What's going on is that the I/O on the first mountpoint is allowed to
+# complete even though self-heal is still in progress and the state on
+# disk does not reflect its result.  In fact, the state changes during
+# self-heal create the appearance of split brain when the second I/O
+# comes in, so that fails even though we haven't actually been in split
+# brain since the manual xattr operations.  By disallowing background
+# self-heal, we ensure that the second I/O can't happen before self-heal
+# is complete, because it has to follow the first I/O which now has to
+# follow self-heal.
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+
 #Make sure self-heal is not triggered when the bricks are re-started
 TEST $CLI volume set $V0 cluster.self-heal-daemon off
 TEST $CLI volume set $V0 performance.stat-prefetch off
@@ -53,6 +73,7 @@ TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0
 
 #The operations should do self-heal and give correct output
 EXPECT "2" cat $M0/a;
+# FAIL HERE - see comment about cluster.background-self-heal-count above.
 EXPECT "2" cat $M1/a;
 EXPECT "def" getfattr -n trusted.mdata --only-values $M0/b 2>/dev/null
 EXPECT "def" getfattr -n trusted.mdata --only-values $M1/b 2>/dev/null
author	Jeff Darcy <jdarcy@redhat.com>	2013-04-08 15:32:49 -0400
committer	Anand Avati <avati@redhat.com>	2013-04-08 19:19:54 -0700
commit	1ca50941d693f48e73723b12a1466a70dd272ea2 (patch)
tree	470aa14da936cc13625f36e596081df1af598972 /tests/bugs/bug-873962.t
parent	108475decfde8fd30818c37604ef8db837220624 (diff)