author    Kevin Vigor <kvigor@fb.com>    2017-01-04 10:22:08 -0800
committer Kevin Vigor <kvigor@fb.com>    2017-01-04 11:23:16 -0800
commit    1e69c7ef96a92754d4394fe2cff8767d3b1809fa (patch)
tree      cf0c8420c7a712927441a2574a1654275e27f8ef /tests/basic
parent    02f8b7300bc635dea9ae1fee6ef14c0d4725591a (diff)
Make halo prove tests less racy
Summary:

Halo prove tests were racy in a couple of ways.

First, they raced against the self-heal daemon (e.g. write to the volume with
two bricks up and then assert that only two bricks have the data file; but shd
will copy the file to the third brick sooner or later). Fix by disabling shd
in such tests.

Second, the tests rely on pings completing and setting the halo state as
expected, but do not check for this. If writing begins before the initial
pings complete, all bricks may be up and receive the data. Fix by adding an
explicit check for the halo child states.

Test Plan: prove tests/basic/halo*.t (prior to this changeset it would fail
within ~10 iterations on my devserver and almost always on CentOS regression;
now it runs overnight without failure on my devserver).

Reviewers:
Subscribers:
Tasks:
Blame Revision:

Change-Id: If6823540dd4e23a19cc495d5d0e8b0c6fde9a3bd
Signed-off-by: Kevin Vigor <kvigor@fb.com>
Reviewed-on: http://review.gluster.org/16325
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Shreyas Siravara <sshreyas@fb.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
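The explicit check relies on a halo_sum_child_states helper sourced from
tests/halo.rc, which is not included in this diff. As a rough sketch only
(the helper name and the "<up> <down>" output format come from the test
assertions below, but the body here is an assumption rather than the real
halo.rc implementation), such a helper could derive each child's most recent
halo state from the client log, the same log the tests already grep when
choosing a brick to kill:

# Sketch: print "<up_count> <down_count>" based on each child's latest
# logged halo state. Illustrative only, not the actual halo.rc code.
function halo_sum_child_states {
        local children=$1
        local up=0 down=0 line i
        for i in $(seq 0 $((children - 1))); do
                # Most recent "halo state" message logged for child $i.
                line=$(grep "halo state:" /var/log/glusterfs/$M0LOG | grep -w "Child $i" | tail -n1)
                if echo "$line" | grep -q "halo state: UP"; then
                        up=$((up + 1))
                elif echo "$line" | grep -q "halo state: DOWN"; then
                        down=$((down + 1))
                fi
        done
        echo "$up $down"
}

With a helper of this shape, EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3
simply polls until two children report UP and one reports DOWN (or gives up
after 10 seconds), so the writes cannot start before the initial pings have
settled the halo state.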
Diffstat (limited to 'tests/basic')
-rw-r--r--  tests/basic/halo-failover-disabled.t  10
-rw-r--r--  tests/basic/halo-failover-enabled.t   14
2 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
index 4cc66e38de6..f3655eaef3b 100644
--- a/tests/basic/halo-failover-disabled.t
+++ b/tests/basic/halo-failover-disabled.t
@@ -13,6 +13,7 @@
#
. $(dirname $0)/../include.rc
. $(dirname $0)/../volume.rc
+. $(dirname $0)/../halo.rc
cleanup;
@@ -33,7 +34,7 @@ TEST $CLI volume set $V0 cluster.heal-timeout 5
TEST $CLI volume set $V0 cluster.entry-self-heal on
TEST $CLI volume set $V0 cluster.data-self-heal on
TEST $CLI volume set $V0 cluster.metadata-self-heal on
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
TEST $CLI volume set $V0 cluster.eager-lock off
TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
@@ -46,7 +47,9 @@ TEST $CLI volume set $V0 network.ping-timeout 1000
TEST $CLI volume set $V0 cluster.choose-local off
TEST $CLI volume start $V0
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
-cd $M0
+
+# Make sure two children are up and one is down.
+EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3
# Write some data to the mount
TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
@@ -54,6 +57,9 @@ TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
UP_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX}
+# Make sure two children are down and one is up.
+EXPECT_WITHIN 10 "1 2" halo_sum_child_states 3
+
# Test that quorum should fail and the mount is RO, the reason here
# is that although there _is_ another brick running which _could_
# take the failed brick's place, it is not marked "up" so quorum
diff --git a/tests/basic/halo-failover-enabled.t b/tests/basic/halo-failover-enabled.t
index aa73589366f..2dddf9951fa 100644
--- a/tests/basic/halo-failover-enabled.t
+++ b/tests/basic/halo-failover-enabled.t
@@ -13,6 +13,7 @@
#
. $(dirname $0)/../include.rc
. $(dirname $0)/../volume.rc
+. $(dirname $0)/../halo.rc
cleanup;
@@ -31,7 +32,7 @@ TEST $CLI volume set $V0 cluster.heal-timeout 5
TEST $CLI volume set $V0 cluster.entry-self-heal on
TEST $CLI volume set $V0 cluster.data-self-heal on
TEST $CLI volume set $V0 cluster.metadata-self-heal on
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
TEST $CLI volume set $V0 cluster.eager-lock off
TEST $CLI volume set $V0 network.ping-timeout 20
TEST $CLI volume set $V0 cluster.choose-local off
@@ -41,8 +42,11 @@ TEST $CLI volume set $V0 nfs.log-level DEBUG
TEST $CLI volume start $V0
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+# Make sure two children are up and one is down.
+EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3
+
# Write some data to the mount
-dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
+TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
KILL_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
TEST [ -n "$KILL_IDX" ]
@@ -52,8 +56,12 @@ TEST [ -n "$KILL_IDX" ]
UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g"))
UP1_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[0]}/test 2>/dev/null)"
UP2_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[1]}/test 2>/dev/null)"
+VICTIM_HAS_TEST="$(ls $B0/${V0}${KILL_IDX}/test 2>/dev/null)"
+
+# The victim brick should have a copy of the file.
+TEST [ -n "$VICTIM_HAS_TEST" ]
-# Of the bricks which will remain standing, there is only a single
+# Of the bricks which will remain standing, there should be only one
# brick which has the file called test. If both have the first
# test file, the test is invalid as all the bricks are up and the
# halo-max-replicas is not being honored; e.g. bug exists.
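
Per the Test Plan above, the fix can be validated by looping the halo prove
tests until a failure appears; before this changeset a failure typically
showed up within ~10 iterations. The wrapper below is only a usage
illustration run from a GlusterFS source checkout, not part of the change:

# Run the halo tests repeatedly and stop at the first failing iteration.
i=0
while prove tests/basic/halo*.t; do
        i=$((i + 1))
        echo "passed iteration $i"
done
echo "failed on iteration $((i + 1))"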