diff options
author | Kevin Vigor <kvigor@fb.com> | 2017-01-04 10:22:08 -0800 |
---|---|---|
committer | Kevin Vigor <kvigor@fb.com> | 2017-01-04 11:23:16 -0800 |
commit | 1e69c7ef96a92754d4394fe2cff8767d3b1809fa (patch) | |
tree | cf0c8420c7a712927441a2574a1654275e27f8ef /tests | |
parent | 02f8b7300bc635dea9ae1fee6ef14c0d4725591a (diff) |
Make halo prove tests less racy
Summary:
Halo prove tests were racy in a couple of ways. First, they raced
against the self-heal daemon (e.g. write to volume with two bricks
up and then assert that only two bricks have the data file; but shd will
properly copy the file to the third brick sooner or later). Fix by disabling
shd in such tests.
Second, tests rely on pings to complete and set halo state as
expected, but do not check for this. If writing begins before initial
pings complete, all bricks may be up and receive the data. Fix by adding
explicit check for halo child states.
Test Plan:
prove tests/basic/halo*.t
(prior to this changeset, would fail within ~10 iterations on my
devserver and almost always on centos regression. Now runs overnight
without failure on my devserver).
Reviewers:
Subscribers:
Tasks:
Blame Revision:
Change-Id: If6823540dd4e23a19cc495d5d0e8b0c6fde9a3bd
Signed-off-by: Kevin Vigor <kvigor@fb.com>
Reviewed-on: http://review.gluster.org/16325
CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Shreyas Siravara <sshreyas@fb.com>
Smoke: Gluster Build System <jenkins@build.gluster.org>
Diffstat (limited to 'tests')
-rw-r--r-- | tests/basic/halo-failover-disabled.t | 10 | ||||
-rw-r--r-- | tests/basic/halo-failover-enabled.t | 14 | ||||
-rw-r--r-- | tests/halo.rc | 52 |
3 files changed, 71 insertions, 5 deletions
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t index 4cc66e38de6..f3655eaef3b 100644 --- a/tests/basic/halo-failover-disabled.t +++ b/tests/basic/halo-failover-disabled.t @@ -13,6 +13,7 @@ # . $(dirname $0)/../include.rc . $(dirname $0)/../volume.rc +. $(dirname $0)/../halo.rc cleanup; @@ -33,7 +34,7 @@ TEST $CLI volume set $V0 cluster.heal-timeout 5 TEST $CLI volume set $V0 cluster.entry-self-heal on TEST $CLI volume set $V0 cluster.data-self-heal on TEST $CLI volume set $V0 cluster.metadata-self-heal on -TEST $CLI volume set $V0 cluster.self-heal-daemon on +TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.eager-lock off TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG @@ -46,7 +47,9 @@ TEST $CLI volume set $V0 network.ping-timeout 1000 TEST $CLI volume set $V0 cluster.choose-local off TEST $CLI volume start $V0 TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 -cd $M0 + +# Make sure two children are up and one is down. +EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3 # Write some data to the mount TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync @@ -54,6 +57,9 @@ TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync UP_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+") TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX} +# Make sure two children are down and one is up. 
+EXPECT_WITHIN 10 "1 2" halo_sum_child_states 3 + # Test that quorum should fail and the mount is RO, the reason here # is that although there _is_ another brick running which _could_ # take the failed bricks place, it is not marked "up" so quorum diff --git a/tests/basic/halo-failover-enabled.t b/tests/basic/halo-failover-enabled.t index aa73589366f..2dddf9951fa 100644 --- a/tests/basic/halo-failover-enabled.t +++ b/tests/basic/halo-failover-enabled.t @@ -13,6 +13,7 @@ # . $(dirname $0)/../include.rc . $(dirname $0)/../volume.rc +. $(dirname $0)/../halo.rc cleanup; @@ -31,7 +32,7 @@ TEST $CLI volume set $V0 cluster.heal-timeout 5 TEST $CLI volume set $V0 cluster.entry-self-heal on TEST $CLI volume set $V0 cluster.data-self-heal on TEST $CLI volume set $V0 cluster.metadata-self-heal on -TEST $CLI volume set $V0 cluster.self-heal-daemon on +TEST $CLI volume set $V0 cluster.self-heal-daemon off TEST $CLI volume set $V0 cluster.eager-lock off TEST $CLI volume set $V0 network.ping-timeout 20 TEST $CLI volume set $V0 cluster.choose-local off @@ -41,8 +42,11 @@ TEST $CLI volume set $V0 nfs.log-level DEBUG TEST $CLI volume start $V0 TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 +# Make sure two children are up and one is down. +EXPECT_WITHIN 10 "2 1" halo_sum_child_states 3 + # Write some data to the mount -dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync +TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync KILL_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+") TEST [ -n "$KILL_IDX" ] @@ -52,8 +56,12 @@ TEST [ -n "$KILL_IDX" ] UP_CHILDREN=($(echo "0 1 2" | sed "s/${KILL_IDX}//g")) UP1_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[0]}/test 2>/dev/null)" UP2_HAS_TEST="$(ls $B0/${V0}${UP_CHILDREN[1]}/test 2>/dev/null)" +VICTIM_HAS_TEST="$(ls $B0/${V0}${KILL_IDX}/test 2>/dev/null)" + +# The victim brick should have a copy of the file. 
+TEST [ -n "$VICTIM_HAS_TEST" ] -# Of the bricks which will remain standing, there is only a single +# Of the bricks which will remain standing, there should be only one # brick which has the file called test. If the both have the first # test file, the test is invalid as all the bricks are up and the # halo-max-replicas is not being honored; e.g. bug exists. diff --git a/tests/halo.rc b/tests/halo.rc new file mode 100644 index 00000000000..4cb7c81da85 --- /dev/null +++ b/tests/halo.rc @@ -0,0 +1,52 @@ +# Return the current Halo state of a given child (by index, i.e. 0 +# is first child). +function halo_child_state { + grep "Child $1 .*halo state: " /var/log/glusterfs/$M0LOG | + tail -n1 | sed 's/^.* halo state: //' | sed 's/ .*$//' +} + +# Return number of Halo children which are in a given state. +# First parameter is total # children. +# Second parameter is state to match (e.g. "UP"). +function halo_children_in_state { + local CHILD_COUNT=$1 + local SUM=0 + for CHILD in $(seq 0 $((CHILD_COUNT-1))); do + if [ x"$(halo_child_state $CHILD)" == x"$2" ]; then + SUM=$((SUM+1)) + fi + done + echo $SUM +} + +# Return number of up halo children, +# First parameter is total # children, +function halo_children_up { + echo $(halo_children_in_state $1 "UP") +} + +# Return number of down halo children, +# First parameter is total # children, +function halo_children_down { + echo $(halo_children_in_state $1 "DOWN") +} + +# Return number of up & down halo children. +# First parameter is total number of children. +function halo_sum_child_states { + local CHILD_COUNT=$1 + + local UP=0 + local DOWN=0 + + for CHILD in $(seq 0 $((CHILD_COUNT-1))); do + local STATE=$(halo_child_state $CHILD) + if [ x"$STATE" == x"UP" ]; then + UP=$((UP+1)) + elif [ x"$STATE" == x"DOWN" ]; then + DOWN=$((DOWN+1)) + fi + done + + echo "$UP $DOWN" +} |