author    Richard Wareing <rwareing@fb.com>    2015-06-10 21:39:11 -0700
committer Shreyas Siravara <sshreyas@fb.com>   2016-12-29 08:54:58 -0800
commit    0658050cc6bd2b3e5b9515a35055287ad59f3796 (patch)
tree      f9c76a81f2224d5fb30637777f0ed96ae1f08a3f /tests
parent    7a6ead5c03e9f62fe8726b141c94cc7d31a79c39 (diff)
Fix Halo tests in v3.6.3 of GlusterFS + minor SHD bug fix
Summary:
- SHD is now excluded from the max-replicas policy. We'd need to make an
  SHD-specific tunable for this to make tests reliably pass, and frankly it
  probably makes things more intuitive to have SHD excluded (i.e. SHD can
  always see everything).
- Updated the halo-failover-enabled test; I think it's a bit clearer now, and
  works reliably. halo.t is fixed after fixing the SHD max-replicas bug.

Test Plan:
- Run prove tests -> https://phabricator.fb.com/P19872728

Reviewers: dph, sshreyas

Reviewed By: sshreyas

FB-commit-id: e425e6651cd02691d36427831b6b8ca206d0f78f
Change-Id: I57855ef99628146c32de59af475b096bd91d6012
Signed-off-by: Kevin Vigor <kvigor@fb.com>
Reviewed-on: http://review.gluster.org/16305
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Shreyas Siravara <sshreyas@fb.com>
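The Test Plan above points at running the prove tests. As a rough reproduction sketch (assuming a built GlusterFS tree with its standard tests/ harness; the checkout path below is hypothetical), the two tests this change touches can be run individually:

# Hedged sketch: run the Halo tests affected by this change with prove.
# The checkout location is hypothetical; adjust to your own tree.
cd ~/src/glusterfs
prove -v tests/basic/halo.t
prove -v tests/basic/halo-failover-enabled.t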
Diffstat (limited to 'tests')
-rw-r--r--  tests/basic/halo-failover-enabled.t  46
-rw-r--r--  tests/include.rc                      2
2 files changed, 30 insertions(+), 18 deletions(-)
diff --git a/tests/basic/halo-failover-enabled.t b/tests/basic/halo-failover-enabled.t
index e897d076813..aa73589366f 100644
--- a/tests/basic/halo-failover-enabled.t
+++ b/tests/basic/halo-failover-enabled.t
@@ -24,6 +24,7 @@ TEST $CLI volume set $V0 cluster.shd-max-threads 1
TEST $CLI volume set $V0 cluster.halo-enabled True
TEST $CLI volume set $V0 cluster.halo-failover-enabled on
TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
TEST $CLI volume set $V0 cluster.quorum-type fixed
TEST $CLI volume set $V0 cluster.quorum-count 2
TEST $CLI volume set $V0 cluster.heal-timeout 5
@@ -34,36 +35,45 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon on
TEST $CLI volume set $V0 cluster.eager-lock off
TEST $CLI volume set $V0 network.ping-timeout 20
TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
+TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI volume set $V0 nfs.log-level DEBUG
TEST $CLI volume start $V0
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
-cd $M0
# Write some data to the mount
dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
-# Calulate the MD5s on the two up volumes.
-MD5_B0=$(md5sum $B0/${V0}0/test | cut -d' ' -f1)
-MD5_B1=$(md5sum $B0/${V0}1/test | cut -d' ' -f1)
+KILL_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
+TEST [ -n "$KILL_IDX" ]
+# NB: UP_CHILDREN is the set of children that should be up after we kill
+# the brick indicated by KILL_IDX, *not* the set of children which are
+# currently up!
+UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g"))
+UP1_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[0]}/test 2>/dev/null)"
+UP2_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[1]}/test 2>/dev/null)"
-# Verify they are the same
-TEST [ "$MD5_B0" == "$MD5_B1" ]
+# Of the bricks which will remain standing, there is only a single
+# brick which has the file called test. If both have the first
+# test file, the test is invalid as all the bricks are up and the
+# halo-max-replicas setting is not being honored, i.e. the bug exists.
+ONLY_ONE=$((([ -z "$UP2_HAS_TEST" ] || [ -z "$UP1_HAS_TEST" ]) &&
+ ([ -n "$UP2_HAS_TEST" ] || [ -n "$UP1_HAS_TEST" ])) && echo true)
+TEST [ "x$ONLY_ONE" == "xtrue" ]
-sleep 0.5
-# Kill the first brick, fail-over to 3rd
-TEST kill_brick $V0 $H0 $B0/${V0}0
+echo "Failing child ${KILL_IDX}..."
+TEST kill_brick $V0 $H0 $B0/${V0}${KILL_IDX}
# Test the mount is still RW (i.e. quorum works)
-TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
+TEST dd if=/dev/urandom of=$M0/test_failover bs=1M count=1 conv=fsync
# Calculate the MD5s
-MD5_B0=$(md5sum $B0/${V0}0/test_rw | cut -d' ' -f1)
-MD5_B1=$(md5sum $B0/${V0}1/test_rw | cut -d' ' -f1)
-MD5_B2=$(md5sum $B0/${V0}2/test_rw | cut -d' ' -f1)
+MD5_UP1=$(md5sum $B0/${V0}${UP_CHILDREN[0]}/test_failover | cut -d' ' -f1)
+MD5_UP2=$(md5sum $B0/${V0}${UP_CHILDREN[1]}/test_failover | cut -d' ' -f1)
-# Verify they are the same
-TEST [ x"$MD5_B1" == x"$MD5_B2" ]
-
-# Verify the failed brick has a different MD5
-TEST [ x"$MD5_B0" != x"$MD5_B1" ]
+# Verify the two up bricks have identical MD5s; if both are identical
+# then we must have successfully failed over to the brick which was
+# previously proven to be down (via the ONLY_ONE test).
+TEST [ "$MD5_UP1" == "$MD5_UP2" ]
cleanup
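For reference, the child-selection pipeline added above can be exercised on its own. A minimal sketch of what it extracts, using a made-up client-log line (only the "Child N" and "halo state: UP" tokens matter to the pipeline; the surrounding message text is hypothetical):

# Hedged sketch: feed a sample log line through the same pipeline the test
# uses to pick the brick to kill. The text around the tokens is illustrative.
sample_line='0-patchy-replicate-0: Child 1 halo state: UP'
KILL_IDX=$(echo "$sample_line" | grep "halo state: UP" | tail -n1 |
           grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
echo "$KILL_IDX"              # -> 1
# Dropping that index from "0 1 2" yields the children expected to stay up.
UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g"))
echo "${UP_CHILDREN[@]}"      # -> 0 2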
diff --git a/tests/include.rc b/tests/include.rc
index e13bbacc392..d1acbee5995 100644
--- a/tests/include.rc
+++ b/tests/include.rc
@@ -18,6 +18,8 @@ META_MNT=${META_MNT:=/var/run/gluster/shared_storage}; # Mount point of shared g
CC=cc
OSTYPE=$(uname -s)
+M0LOG=${M0LOG:="mnt-glusterfs-0.log"}; # Log file for 0th FUSE mount point
+
ENV_RC=$(dirname $0)/../env.rc
if [ ! -f $ENV_RC ]; then
ENV_RC=$(dirname $0)/../../env.rc
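With the M0LOG default added to include.rc, test scripts can reference the 0th FUSE mount's client log without hard-coding the file name. A minimal usage sketch (the log directory is assumed to be the default /var/log/glusterfs, matching the grep added to halo-failover-enabled.t above):

# Hedged sketch: source the shared test helpers, then search the 0th FUSE
# mount's client log the way the updated halo test does.
. $(dirname $0)/../include.rc       # defines M0LOG, M0, V0, ...
grep "halo state: UP" /var/log/glusterfs/$M0LOG | tail -n1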