summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tests/basic/halo-failover.t65
-rw-r--r--xlators/cluster/afr/src/afr-common.c8
2 files changed, 69 insertions, 4 deletions
diff --git a/tests/basic/halo-failover.t b/tests/basic/halo-failover.t
new file mode 100644
index 00000000000..220fa1f2207
--- /dev/null
+++ b/tests/basic/halo-failover.t
@@ -0,0 +1,65 @@
+#!/bin/bash
+#
+# Tests that fail-over works correctly for Halo Geo-replication
+#
+# 1. Create a volume @ 3x replication w/ halo + quorum enabled
+# 2. Write some data, background it & fail a brick
+# 3. The expected result is that the writes fail-over to the 3rd
+# brick immediately, and md5s will show they are equal once
+# the write completes.
+# 4. The mount should also be RW after the brick is killed as
+# quorum will be immediately restored by swapping in the
+# other brick.
+#
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-max-replicas 2
+TEST $CLI volume set $V0 cluster.quorum-type fixed
+TEST $CLI volume set $V0 cluster.quorum-count 2
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.entry-self-heal on
+TEST $CLI volume set $V0 cluster.data-self-heal on
+TEST $CLI volume set $V0 cluster.metadata-self-heal on
+TEST $CLI volume set $V0 cluster.self-heal-daemon on
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 network.ping-timeout 20
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume start $V0
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0
+cd $M0
+
+# Write some data to the mount
+dd if=/dev/urandom of=$M0/test bs=1k count=200 oflag=sync &> /dev/null &
+
+sleep 0.5
+# Kill the first brick, fail-over to 3rd
+TEST kill_brick $V0 $H0 $B0/${V0}0
+
+# Test the mount is still RW (i.e. quorum works)
+TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
+
+# Wait for the dd to finish
+wait
+sleep 3
+
+# Calculate the MD5s
+MD5_B0=$(md5sum $B0/${V0}0/test | cut -d' ' -f1)
+MD5_B1=$(md5sum $B0/${V0}1/test | cut -d' ' -f1)
+MD5_B2=$(md5sum $B0/${V0}2/test | cut -d' ' -f1)
+
+# Verify they are the same
+TEST [ "$MD5_B1" == "$MD5_B2" ]
+
+# Verify the failed brick has a different MD5
+TEST [ x"$MD5_B0" != x"$MD5_B1" ]
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index c2e95953a7f..ed2c6414718 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4211,8 +4211,8 @@ find_best_down_child (xlator_t *this)
priv = this->private;
for (i = 0; i < priv->child_count; i++) {
- if (priv->child_up[i] &&
- priv->child_latency[i] >= 0.0 &&
+ if (!priv->child_up[i] &&
+ priv->child_latency[i] >= 0 &&
priv->child_latency[i] < best_latency) {
best_child = i;
best_latency = priv->child_latency[i];
@@ -4237,7 +4237,7 @@ find_worst_up_child (xlator_t *this)
for (i = 0; i < priv->child_count; i++) {
if (priv->child_up[i] &&
- priv->child_latency[i] >= 0.0 &&
+ priv->child_latency[i] >= 0 &&
priv->child_latency[i] > worst_latency) {
worst_child = i;
worst_latency = priv->child_latency[i];
@@ -4510,7 +4510,7 @@ afr_notify (xlator_t *this, int32_t event,
gf_boolean_t had_quorum = _gf_false;
gf_boolean_t has_quorum = _gf_false;
int64_t halo_max_latency_msec = 0;
- int64_t child_latency_msec = 0;
+ int64_t child_latency_msec = -1;
child_xlator = (xlator_t *)data;
priv = this->private;