-rw-r--r--   tests/basic/halo-failover-disabled.t              |  10
-rw-r--r--   tests/basic/halo-hybrid.t                         |  70
-rw-r--r--   xlators/cluster/afr/src/afr-common.c              | 223
-rw-r--r--   xlators/cluster/afr/src/afr.c                     |  40
-rw-r--r--   xlators/cluster/afr/src/afr.h                     |  12
-rw-r--r--   xlators/mgmt/glusterd/src/glusterd-volume-set.c   |   5
6 files changed, 337 insertions, 23 deletions
diff --git a/tests/basic/halo-failover-disabled.t b/tests/basic/halo-failover-disabled.t
index 31a1d166404..4cc66e38de6 100644
--- a/tests/basic/halo-failover-disabled.t
+++ b/tests/basic/halo-failover-disabled.t
@@ -35,6 +35,10 @@ TEST $CLI volume set $V0 cluster.data-self-heal on
 TEST $CLI volume set $V0 cluster.metadata-self-heal on
 TEST $CLI volume set $V0 cluster.self-heal-daemon on
 TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 diagnostics.client-log-level DEBUG
+TEST $CLI volume set $V0 diagnostics.brick-log-level DEBUG
+TEST $CLI volume set $V0 nfs.log-level DEBUG
+
 # Use a large ping time here so the spare brick is not marked up
 # based on the ping time. The only way it can get marked up is
 # by being swapped in via the down event (which is what we are disabling).
@@ -47,8 +51,8 @@ cd $M0
 # Write some data to the mount
 TEST dd if=/dev/urandom of=$M0/test bs=1k count=200 conv=fsync
 
-# Kill the first brick, fail-over to 3rd
-TEST kill_brick $V0 $H0 $B0/${V0}0
+UP_IDX=$(cat /var/log/glusterfs/$M0LOG | grep "halo state: UP" | tail -n1 | grep -Eo "Child [0-9]+" | grep -Eo "[0-9]+")
+TEST kill_brick $V0 $H0 $B0/${V0}${UP_IDX}
 
 # Test that quorum should fail and the mount is RO, the reason here
 # is that although there _is_ another brick running which _could_
@@ -59,7 +63,7 @@ TEST kill_brick $V0 $H0 $B0/${V0}0
 TEST ! dd if=/dev/urandom of=$M0/test_rw bs=1M count=1 conv=fsync
 
 TEST $CLI volume start $V0 force
-sleep 2
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 $UP_IDX
 
 # Test that quorum should be restored and the file is writable
 TEST dd if=/dev/urandom of=$M0/test_rw bs=1M count=1
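A note on the new kill target above: rather than always killing brick 0, the test now scrapes the FUSE client log for whichever child halo most recently marked UP, and kills that one; this is also why the patch bumps the client log level to DEBUG. The sketch below restates that pipeline with comments. It assumes the test harness's convention that the client log for the $M0 mount lives at /var/log/glusterfs/$M0LOG, exactly as the test itself does.

    # Sketch: how UP_IDX is derived (assumes the harness's $M0LOG naming).
    LOG=/var/log/glusterfs/$M0LOG

    # dump_halo_states() emits DEBUG lines of the form
    #   "Child 2 halo state: UP (42 ms)"
    # so the child index of the most recent UP transition is:
    UP_IDX=$(grep "halo state: UP" "$LOG" |   # every halo UP transition
             tail -n1 |                       # keep only the most recent one
             grep -Eo "Child [0-9]+" |        # isolate "Child <idx>"
             grep -Eo "[0-9]+")               # keep just the index
    echo "most recently UP child: ${UP_IDX}"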
diff --git a/tests/basic/halo-hybrid.t b/tests/basic/halo-hybrid.t
new file mode 100644
index 00000000000..4574fdfe41e
--- /dev/null
+++ b/tests/basic/halo-hybrid.t
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Test for the Halo hybrid feature
+#
+# 1. Create volume w/ 3x replication w/ max-replicas = 2 for clients,
+#    heal daemon is off to start.
+# 2. Write some data
+# 3. Verify hybrid code chose children for lookups
+# 4. Verify hybrid code chose child for reads
+# 5. Verify hybrid code wrote synchronously to all replicas
+#
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+function found_fuse_log_msg {
+    local dir="$1"
+    local msg="$2"
+    local cnt=$(cat /var/log/glusterfs/$M0LOG | grep "$msg" | tail -n1 | wc -l)
+    if (( $cnt == 1 )); then
+        echo "Y"
+    else
+        echo "N"
+    fi
+}
+
+cleanup;
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 cluster.background-self-heal-count 0
+TEST $CLI volume set $V0 cluster.shd-max-threads 1
+TEST $CLI volume set $V0 cluster.halo-enabled True
+TEST $CLI volume set $V0 cluster.halo-hybrid-mode True
+TEST $CLI volume set $V0 cluster.heal-timeout 5
+TEST $CLI volume set $V0 cluster.self-heal-daemon off
+TEST $CLI volume set $V0 cluster.eager-lock off
+TEST $CLI volume set $V0 cluster.choose-local off
+TEST $CLI volume set $V0 diagnostics.client-log-level TRACE
+TEST $CLI volume start $V0
+
+# Start a synchronous mount
+TEST glusterfs --volfile-id=/$V0 \
+    --xlator-option *replicate*.halo-max-latency=9999 \
+    --volfile-server=$H0 $M0 \
+    --attribute-timeout=0 --entry-timeout=0
+sleep 2
+cd $M0
+
+TEST mkdir testdir
+TEST cd testdir
+for i in {1..5}
+do
+    dd if=/dev/urandom of=testfile$i bs=1M count=1 2>/dev/null
+done
+TEST ls -l
+
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "children for LOOKUPs"
+EXPECT_WITHIN "60" "Y" found_fuse_log_msg "Selected hybrid child"
+
+B0_CNT=$(ls $B0/${V0}0/testdir | wc -l)
+B1_CNT=$(ls $B0/${V0}1/testdir | wc -l)
+B2_CNT=$(ls $B0/${V0}2/testdir | wc -l)
+
+# Writes should be synchronous, all should have same
+# file count
+TEST "(($B0_CNT == 5 && $B1_CNT == 5 && $B2_CNT == 5))"
+
+cleanup
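Stripped of the self-heal and timing knobs, the configuration the new test exercises boils down to two settings: halo itself and the new hybrid flag (afr_halo_read_subvol() bails out unless both are enabled). A minimal sketch for an existing replica volume follows; the volume name myvol is illustrative, and the TRACE log level is only needed if you want to see the selection messages this patch logs.

    VOL=myvol    # hypothetical volume name; substitute your own

    # Both flags are required for hybrid behaviour.
    gluster volume set $VOL cluster.halo-enabled True
    gluster volume set $VOL cluster.halo-hybrid-mode True

    # Optional: surface the TRACE-level selection messages added by this
    # patch ("Selected hybrid child ...", "... children for LOOKUPs").
    gluster volume set $VOL diagnostics.client-log-level TRACE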
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index c4b6fd6a9b6..0c621271405 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -45,6 +45,10 @@
 
 #define CHILD_UP_STR "UP"
 #define CHILD_DOWN_STR "DOWN"
+#define CHILD_DISCONNECTED_STR "DOWN"
+
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *fastest_children);
 
 call_frame_t *
 afr_copy_frame (call_frame_t *base)
@@ -1371,21 +1375,75 @@ afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
                        sizeof(gfid_copy)) % child_count;
 }
 
+/*
+ * afr_halo_read_subvol
+ *
+ * Given an array representing the readable children, this function returns
+ * which one of the readable children meets the halo hybrid criteria.  In the
+ * event none are found, -1 is returned and another strategy will have to be
+ * used to figure out where the read should come from.
+ */
+int afr_halo_read_subvol (xlator_t *this, unsigned char *readable) {
+        afr_private_t *priv = NULL;
+        unsigned char *hybrid_children;
+        int32_t        hybrid_cnt = 0;
+        int            read_subvol = -1;
+        int            i = 0;
+
+        priv = this->private;
+
+        /* Halo inactive or hybrid mode disabled, bail.... */
+        if (!priv->halo_enabled || !priv->halo_hybrid_mode)
+                return -1;
+
+        /* AFR discovery edge case: if we are already pinned to a child
+         * which meets the latency threshold, go with that child for
+         * consistency purposes.
+         */
+        if (priv->read_child >= 0 && readable[priv->read_child] &&
+            priv->child_latency[priv->read_child] <=
+            AFR_HALO_HYBRID_LATENCY_MSEC) {
+                return priv->read_child;
+        }
+
+        hybrid_children = alloca0 (priv->child_count);
+        hybrid_cnt = find_hybrid_children (this, hybrid_children);
+        if (hybrid_cnt) {
+                for (i = 0; i < priv->child_count; i++) {
+                        if (readable[i] && hybrid_children[i]) {
+                                read_subvol = i;
+                                priv->read_child = read_subvol;
+                                gf_log (this->name, GF_LOG_TRACE,
+                                        "Selected hybrid child %d for reads",
+                                        i);
+                                break;
+                        }
+                }
+        }
+
+        return read_subvol;
+}
+
 int
 afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
                                   unsigned char *readable,
                                   afr_read_subvol_args_t *args)
 {
-        int            i = 0;
-        int            read_subvol = -1;
-        afr_private_t *priv = NULL;
+        int                     i           = 0;
+        int                     read_subvol = -1;
+        afr_private_t          *priv        = NULL;
         afr_read_subvol_args_t local_args = {0,};
 
-        priv = this->private;
+        priv = this->private;
+
+        /* Choose the lowest-latency child for reads */
+        read_subvol = afr_halo_read_subvol (this, readable);
+        if (read_subvol != -1)
+                return read_subvol;
 
-        /* first preference - explicitly specified or local subvolume */
-        if (priv->read_child >= 0 && readable[priv->read_child])
+        /* first preference - explicitly specified or local subvolume */
+        if (priv->read_child >= 0 && readable[priv->read_child])
                 return priv->read_child;
 
         if (inode_is_linked (inode)) {
@@ -1411,7 +1469,6 @@ afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
         return -1;
 }
 
-
 int
 afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
                                 unsigned char *readable, int *event_p,
@@ -2071,6 +2128,13 @@ afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                         priv->children[child_index]->name);
 
                 priv->read_child = child_index;
+        } else if (priv->halo_enabled) {
+                if (priv->read_child < 0) {
+                        priv->read_child = child_index;
+                } else if (priv->child_latency[child_index] <
+                           priv->child_latency[priv->read_child]) {
+                        priv->read_child = child_index;
+                }
         }
 out:
         STACK_DESTROY(frame->root);
@@ -2487,6 +2551,7 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
         afr_local_t   *local = NULL;
         afr_private_t *priv = NULL;
         int            call_count = 0;
+        unsigned char *hybrid_children = NULL;
 
         local = frame->local;
         priv = this->private;
@@ -2497,8 +2562,19 @@ afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
                 goto out;
         }
 
-        call_count = local->call_count = AFR_COUNT (local->child_up,
-                                                    priv->child_count);
+        hybrid_children = alloca0 (priv->child_count);
+        call_count = find_hybrid_children (this, hybrid_children);
+        if (call_count) {
+                for (i = 0; i < priv->child_count; i++)
+                        local->child_up[i] = hybrid_children[i];
+                gf_log (this->name, GF_LOG_TRACE, "Selected %d hybrid "
+                        "children for LOOKUPs", call_count);
+        } else {
+                hybrid_children = NULL;
+                call_count = AFR_COUNT (local->child_up, priv->child_count);
+        }
+
+        local->call_count = call_count;
 
         ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
                                             &local->loc);
@@ -2731,6 +2807,15 @@ afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
         afr_read_subvol_get (loc->parent, this, NULL, NULL, &event,
                              AFR_DATA_TRANSACTION, NULL);
 
+        /* So this is the "secret" to why "Hybrid" halo works.  Encoded in
+         * the cached inodes, we store what is effectively the "generational"
+         * state of the cluster along with a "packed" version of the extended
+         * attributes which determine which nodes are wise/fools.  We can
+         * consult these cached values to figure out whom we can trust.  In
+         * the event the state of our cluster changes and we can no longer
+         * trust the cached info, we "refresh" the inode (and hit all regions)
+         * to ensure we know which bricks we can safely read from.
+         */
         if (event != local->event_generation)
                 afr_inode_refresh (frame, this, loc->parent, NULL,
                                    afr_lookup_do);
@@ -4203,6 +4288,97 @@ __get_heard_from_all_status (xlator_t *this)
         return heard_from_all;
 }
 
+/*
+ * _afr_cmp_child
+ *
+ * Passed to qsort to order a list of children by their latency.
+ */
+static int
+_afr_cmp_child (const void *child1, const void *child2)
+{
+        struct afr_child *child11 = (struct afr_child *)child1;
+        struct afr_child *child22 = (struct afr_child *)child2;
+
+        if (child11->latency > child22->latency) {
+                return 1;
+        }
+        if (child11->latency == child22->latency) {
+                return 0;
+        }
+        return -1;
+}
+
+/*
+ * find_hybrid_children
+ *
+ * Given a char array representing our children (aka the bricks within our
+ * AFR "subvolume"), mark the children which are within
+ * halo_hybrid_read_max_latency_msec; if none fit this condition, pick the
+ * fastest two bricks.
+ *
+ * You might ask, why not just pick the quickest brick and be done with it?
+ * Well, being within our set is not sufficient to be chosen for the read;
+ * a child must also be marked "readable", so we still want to choose as
+ * many children as we can within our local region to ensure at least one
+ * of them is readable.
+ *
+ * To illustrate this, consider the case where only 1 of 2 in-region bricks
+ * received a sync from some other writer, and the 2nd brick, although
+ * faster, wasn't present for that write.  In this case we'll want to use
+ * the slower brick to service the read.
+ *
+ * In short, this function just tells the caller which children are hybrid
+ * candidates; it gives no signal as to their readability, nor should it,
+ * since readability is handled later in the various flows (e.g. by
+ * afr_halo_read_subvol).
+ */
+static int32_t
+find_hybrid_children (xlator_t *this, unsigned char *hybrid_children)
+{
+        int32_t           i = 0;
+        afr_private_t    *priv = NULL;
+        struct afr_child *sorted_list = NULL;
+        uint32_t          max_latency;
+        uint32_t          limit = AFR_HALO_HYBRID_CHILD_LIMIT;
+
+        priv = this->private;
+
+        if (!priv->halo_enabled || !priv->halo_hybrid_mode)
+                return 0;
+
+        if (limit > priv->child_count)
+                limit = priv->child_count;
+
+        max_latency = priv->halo_hybrid_read_max_latency_msec;
+
+        sorted_list = alloca (sizeof (struct afr_child) * priv->child_count);
+
+        /* Build the per-child list of current latencies */
+        for (i = 0; i < priv->child_count; i++) {
+                sorted_list[i].idx = i;
+                sorted_list[i].child_up = priv->child_up[i];
+                sorted_list[i].latency = priv->child_latency[i];
+        }
+
+        /* Sort the children by latency, fastest first */
+        qsort (sorted_list, priv->child_count, sizeof (struct afr_child),
+               _afr_cmp_child);
+
+        i = 0;
+        while (i < priv->child_count && sorted_list[i].latency <= max_latency)
+                hybrid_children[sorted_list[i++].idx] = 1;
+
+        /* Found some candidates */
+        if (i != 0)
+                return i;
+
+        /* If no candidates can be found meeting the max_latency threshold
+         * then find the best of those we have, up to our limit.
+         */
+        for (i = 0; i < limit; i++)
+                hybrid_children[sorted_list[i].idx] = 1;
+
+        return i;
+}
+
 int
 find_best_down_child (xlator_t *this)
 {
@@ -4260,11 +4436,20 @@ static void dump_halo_states (xlator_t *this)
 {
         priv = this->private;
 
         for (i = 0; i < priv->child_count; i++) {
-                gf_log (this->name, GF_LOG_DEBUG,
-                        "Child %d halo state: %s (%"PRIi64"ms)",
-                        i,
-                        priv->child_up[i] ? CHILD_UP_STR : CHILD_DOWN_STR,
-                        priv->child_latency[i]);
+                if (priv->child_latency[i] == AFR_CHILD_DOWN_LATENCY) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "Child %d halo state: %s (N/A)",
+                                i,
+                                priv->child_up[i] ? CHILD_UP_STR :
+                                CHILD_DOWN_STR);
+                } else {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "Child %d halo state: %s (%"PRIi64" ms)",
+                                i,
+                                priv->child_up[i] ? CHILD_UP_STR :
+                                CHILD_DOWN_STR,
+                                priv->child_latency[i]);
+                }
         }
 }
@@ -4513,11 +4698,11 @@ _afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
 
                 /*
                  * If this is an _actual_ CHILD_DOWN event, we
-                 * want to set the child_latency to < 0 to indicate
-                 * the child is really disconnected.
+                 * want to set the child_latency to AFR_CHILD_DOWN_LATENCY to
+                 * indicate the child is really disconnected.
                  */
-                if (child_latency_msec < 0) {
-                        priv->child_latency[idx] = child_latency_msec;
+                if (child_latency_msec == AFR_CHILD_DOWN_LATENCY) {
+                        priv->child_latency[idx] = AFR_CHILD_DOWN_LATENCY;
                 }
 
                 priv->child_up[idx] = 0;
@@ -4620,7 +4805,7 @@ afr_notify (xlator_t *this, int32_t event,
         gf_boolean_t    had_quorum = _gf_false;
         gf_boolean_t    has_quorum = _gf_false;
         int64_t         halo_max_latency_msec = 0;
-        int64_t         child_latency_msec = -1;
+        int64_t         child_latency_msec = AFR_CHILD_DOWN_LATENCY;
         gf_boolean_t    child_halo_enabled = _gf_false;
 
         child_xlator = (xlator_t *)data;
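The selection logic above only announces its decisions in the client log, so the quickest way to see what hybrid mode is doing on a live mount is to grep for the messages introduced in this file. A sketch follows; the log path is an assumption (FUSE client logs default to /var/log/glusterfs/<mount-path>.log), and the DEBUG/TRACE levels must be enabled as in the tests.

    # Hypothetical client log; the real name is derived from the mount point.
    CLIENT_LOG=/var/log/glusterfs/mnt-glusterfs.log

    # Per-child halo state and measured latency (DEBUG, dump_halo_states).
    grep "halo state:" "$CLIENT_LOG" | tail -n 6

    # Which children LOOKUPs were fanned out to (TRACE, afr_discover_do).
    grep "children for LOOKUPs" "$CLIENT_LOG" | tail -n 1

    # Which child is currently serving reads (TRACE, afr_halo_read_subvol).
    grep "Selected hybrid child" "$CLIENT_LOG" | tail -n 1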
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index d4dd8ff8815..ae9b28c7fb4 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -195,6 +195,14 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec,
                           options, uint32, out);
 
+        GF_OPTION_RECONF ("halo-hybrid-mode",
+                          priv->halo_hybrid_mode, options, bool,
+                          out);
+
+        GF_OPTION_RECONF ("halo-hybrid-read-max-latency",
+                          priv->halo_hybrid_read_max_latency_msec, options,
+                          uint32, out);
+
         GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas,
                           options, uint32, out);
 
@@ -424,6 +432,13 @@ init (xlator_t *this)
         GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
 
+        GF_OPTION_INIT ("halo-hybrid-mode",
+                        priv->halo_hybrid_mode, bool, out);
+
+        GF_OPTION_INIT ("halo-hybrid-read-max-latency",
+                        priv->halo_hybrid_read_max_latency_msec, uint32,
+                        out);
+
         GF_OPTION_INIT ("halo-enabled",
                         priv->halo_enabled, bool, out);
 
@@ -505,12 +520,14 @@ init (xlator_t *this)
                 goto out;
         }
 
-        for (i = 0; i < child_count; i++)
+        for (i = 0; i < child_count; i++) {
+                priv->child_latency[i] = 0.0;
                 priv->child_up[i] = -1; /* start with unknown state.
                                            this initialization needed
                                            for afr_notify() to work
                                            reliably */
+        }
 
         priv->children = GF_CALLOC (sizeof (xlator_t *),
                                     child_count, gf_afr_mt_xlator_t);
@@ -751,6 +768,27 @@ struct volume_options options[] = {
           .default_value = "5",
           .description = "Maximum latency for halo replication in msec."
        },
+       { .key = {"halo-hybrid-mode"},
+         .type = GF_OPTION_TYPE_BOOL,
+         .default_value = "off",
+         .description = "Enable hybrid sync mounts.  When enabled, halo "
+                        "will do write FOPs synchronously, and read FOPs "
+                        "will be serviced in-region if the inode is "
+                        "clean/consistent.  If no bricks can be found below "
+                        "halo-hybrid-read-max-latency then the best 2 shall "
+                        "be selected.  This option can be used in "
+                        "conjunction with all other halo options."
+       },
+       { .key = {"halo-hybrid-read-max-latency"},
+         .type = GF_OPTION_TYPE_INT,
+         .min = 1,
+         .max = 99999,
+         .default_value = "8",
+         .description = "Maximum latency hybrid mode will use to select "
+                        "children for read FOPs.  Don't tune this unless "
+                        "you really know what you are doing (i.e. you've "
+                        "read/understood the associated source code)."
+       },
        { .key = {"halo-max-replicas"},
          .type = GF_OPTION_TYPE_INT,
          .min = 1,
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index f66cdbbf56a..d09aa6852c8 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -30,6 +30,9 @@
 
 #define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
 #define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
+#define AFR_CHILD_DOWN_LATENCY       INT64_MAX /* Latency of down children */
+#define AFR_HALO_HYBRID_CHILD_LIMIT  2    /* Fallback: pick the best 2 bricks */
+#define AFR_HALO_HYBRID_LATENCY_MSEC 10.0 /* Examine bricks <= 10 msec */
 #define AFR_LOCKEE_COUNT_MAX    3
 #define AFR_DOM_COUNT_MAX       3
 #define AFR_NUM_CHANGE_LOGS            3 /*data + metadata + entry*/
@@ -65,6 +68,12 @@ struct afr_nfsd {
         uint32_t halo_max_latency_msec;
 };
 
+struct afr_child {
+        uint32_t      idx;
+        int64_t       latency;
+        unsigned char child_up;
+};
+
 typedef struct _afr_private {
         gf_lock_t lock;               /* to guard access to child_count, etc */
         unsigned int child_count;     /* total number of children */
@@ -149,7 +158,10 @@ typedef struct _afr_private {
         char                   *afr_dirty;
 
         gf_boolean_t            halo_enabled;
+        /* Halo geo-replication tunables */
         gf_boolean_t            halo_failover_enabled;
+        gf_boolean_t            halo_hybrid_mode;
+        uint32_t                halo_hybrid_read_max_latency_msec;
         uint32_t                halo_max_latency_msec;
         uint32_t                halo_max_replicas;
         uint32_t                halo_min_replicas;
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index f28294301f8..bcb8877c5bd 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -3038,6 +3038,11 @@ struct volopt_map_entry glusterd_volopt_map[] = {
          .op_version = 2,
          .flags      = OPT_FLAG_CLIENT_OPT
        },
+       { .key        = "cluster.halo-hybrid-mode",
+         .voltype    = "cluster/replicate",
+         .op_version = 2,
+         .flags      = OPT_FLAG_CLIENT_OPT
+       },
        { .key        = "cluster.halo-failover-enabled",
          .voltype    = "cluster/replicate",
          .op_version = 2,
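Finally, a usage note on how these options are surfaced: cluster.halo-hybrid-mode is registered with glusterd above, so it is settable with gluster volume set, while halo thresholds can also be overridden per client at mount time with --xlator-option, which is exactly how halo-hybrid.t neutralizes halo-max-latency. A sketch of such a mount follows; the host, volume, and mount-point names are illustrative, and whether overriding halo-hybrid-read-max-latency the same way is advisable is an assumption rather than something this patch exercises.

    SERVER=gfs-server1        # hypothetical volfile server
    VOL=myvol                 # hypothetical volume
    MNT=/mnt/$VOL             # hypothetical mount point

    # FUSE mount that overrides a replicate (AFR) option for this client
    # only -- the same mechanism the test uses for halo-max-latency.
    glusterfs --volfile-id=/$VOL \
              --volfile-server=$SERVER \
              --xlator-option "*replicate*.halo-max-latency=10" \
              $MNT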