From 0658050cc6bd2b3e5b9515a35055287ad59f3796 Mon Sep 17 00:00:00 2001
From: Richard Wareing <rwareing@fb.com>
Date: Wed, 10 Jun 2015 21:39:11 -0700
Subject: Fix Halo tests in v3.6.3 of GlusterFS + minor SHD bug fix

Summary:
- SHD is now excluded from the max-replicas policy.  Otherwise we'd
  need an SHD-specific tunable to make the tests pass reliably, and
  frankly it is probably more intuitive to have SHD excluded anyway
  (i.e. SHD can always see everything).
- Updated the halo-failover-enabled test; I think it's a bit clearer
  now, and it works reliably.  halo.t is fixed as a result of fixing
  the SHD max-replicas bug.

Test Plan: - Run prove tests -> https://phabricator.fb.com/P19872728

Reviewers: dph, sshreyas

Reviewed By: sshreyas

FB-commit-id: e425e6651cd02691d36427831b6b8ca206d0f78f

Change-Id: I57855ef99628146c32de59af475b096bd91d6012
Signed-off-by: Kevin Vigor <kvigor@fb.com>
Reviewed-on: http://review.gluster.org/16305
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Smoke: Gluster Build System <jenkins@build.gluster.org>
Reviewed-by: Shreyas Siravara <sshreyas@fb.com>
---
 tests/basic/halo-failover-enabled.t | 46 ++++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 18 deletions(-)

(limited to 'tests/basic')

diff --git a/tests/basic/halo-failover-enabled.t b/tests/basic/halo-failover-enabled.t
index e897d076813..aa73589366f 100644
--- a/tests/basic/halo-failover-enabled.t
+++ b/tests/basic/halo-failover-enabled.t
@@ -24,6 +24,7 @@ TEST $CLI volume set $V0 cluster.shd-max-threads 1
 TEST $CLI volume set $V0 cluster.halo-enabled True
 TEST $CLI volume set $V0 cluster.halo-failover-enabled on
 TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.halo-min-samples 1
 TEST $CLI volume set $V0 cluster.quorum-type fixed
 TEST $CLI volume set $V0 cluster.quorum-count 2
 TEST $CLI volume set $V0 cluster.heal-timeout 5
@@ -34,36 +35,45 @@ TEST $CLI volume set $V0 cluster.self-heal-daemon on
 TEST $CLI volume set $V0 cluster.eager-lock off
 TEST $CLI volume set $V0 network.ping-timeout 20
 TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
+TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI volume set $V0 nfs.log-level DEBUG
 TEST $CLI volume start $V0
 TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
-cd $M0
 
 # Write some data to the mount
 dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
 
-# Calulate the MD5s on the two up volumes.
-MD5_B0=$(md5sum $B0/${V0}0/test | cut -d' ' -f1)
-MD5_B1=$(md5sum $B0/${V0}1/test | cut -d' ' -f1)
+KILL_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
+TEST [ -n "$KILL_IDX" ]
+# NB: UP_CHILDREN is the set of children that should be up after we kill
+# the brick indicated by KILL_IDX, *not* the set of children which are
+# currently up!
+UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g"))
+UP1_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[0]}/test 2>/dev/null)"
+UP2_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[1]}/test 2>/dev/null)"
 
-# Verify they are the same
-TEST [ "$MD5_B0" == "$MD5_B1" ]
+# Of the bricks which will remain standing, exactly one should have the
+# file called test.  If both have it, the test is invalid: all bricks
+# are up and halo-max-replicas is not being honored, i.e. the bug exists.
+# NB: "$( (" keeps the space so it is not read as arithmetic "$((".
+ONLY_ONE=$( ( ([ -z "$UP2_HAS_TEST" ] || [ -z "$UP1_HAS_TEST" ]) &&
+  ([ -n "$UP2_HAS_TEST" ] || [ -n "$UP1_HAS_TEST" ]) ) && echo true)
+TEST [ "x$ONLY_ONE" == "xtrue" ]
 
-sleep 0.5
-# Kill the first brick, fail-over to 3rd
-TEST kill_brick $V0 $H0 $B0/${V0}0
+echo "Failing child ${KILL_IDX}..."
+TEST kill_brick $V0 $H0 $B0/${V0}${KILL_IDX}
 
 # Test the mount is still RW (i.e. quorum works)
-TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
+TEST dd if=/dev/urandom of=$M0/test_failover bs=1M count=1 conv=fsync
 
 # Calulate the MD5s
-MD5_B0=$(md5sum $B0/${V0}0/test_rw | cut -d' ' -f1)
-MD5_B1=$(md5sum $B0/${V0}1/test_rw | cut -d' ' -f1)
-MD5_B2=$(md5sum $B0/${V0}2/test_rw | cut -d' ' -f1)
+MD5_UP1=$(md5sum $B0/${V0}${UP_CHILDREN[0]}/test_failover | cut -d' ' -f1)
+MD5_UP2=$(md5sum $B0/${V0}${UP_CHILDREN[1]}/test_failover | cut -d' ' -f1)
 
-# Verify they are the same
-TEST [ x"$MD5_B1" == x"$MD5_B2" ]
-
-# Verify the failed brick has a different MD5
-TEST [ x"$MD5_B0" != x"$MD5_B1" ]
+# Verify the two up bricks have identical MD5s.  If they are identical
+# then we must have successfully failed over to the brick which was
+# previously proven to be down (via the ONLY_ONE test).
+TEST [ "$MD5_UP1" == "$MD5_UP2" ]
 
 cleanup
-- 
cgit