Diffstat (limited to 'xlators/cluster')

 xlators/cluster/afr/src/afr-common.c     | 364
 xlators/cluster/afr/src/afr-mem-types.h  |   3
 xlators/cluster/afr/src/afr-self-heald.h |   1
 xlators/cluster/afr/src/afr.c            |  98
 xlators/cluster/afr/src/afr.h            |  14
 5 files changed, 422 insertions(+), 58 deletions(-)
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index ac834e90f4b..17943d7baae 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -2531,7 +2531,6 @@ unwind:
         return 0;
 }
 
-
 int
 afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
 {
@@ -3227,7 +3226,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
         }
         UNLOCK (&frame->lock);
 
-        call_count = afr_frame_return (frame);
+        call_count = afr_frame_return (frame);
 
         if (call_count == 0)
                 AFR_STACK_UNWIND (flush, frame, local->op_ret,
@@ -4655,20 +4654,292 @@ __get_heard_from_all_status (xlator_t *this)
         return heard_from_all;
 }
 
+static int
+find_best_down_child (xlator_t *this)
+{
+        afr_private_t *priv = NULL;
+        int i = -1;
+        int32_t best_child = -1;
+        int64_t best_latency = INT64_MAX;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (priv->child_up[i] &&
+                    priv->child_latency[i] >= 0 &&
+                    priv->child_latency[i] < best_latency) {
+                        best_child = i;
+                        best_latency = priv->child_latency[i];
+                }
+        }
+        if (best_child >= 0) {
+                gf_msg_debug (this->name, 0, "Found best down child (%d) "
+                              "@ %ld ms latency", best_child, best_latency);
+        }
+        return best_child;
+}
+
+int
+find_worst_up_child (xlator_t *this)
+{
+        afr_private_t *priv = NULL;
+        int i = -1;
+        int32_t worst_child = -1;
+        int64_t worst_latency = INT64_MIN;
+
+        priv = this->private;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (priv->child_up[i] &&
+                    priv->child_latency[i] >= 0 &&
+                    priv->child_latency[i] > worst_latency) {
+                        worst_child = i;
+                        worst_latency = priv->child_latency[i];
+                }
+        }
+        if (worst_child >= 0) {
+                gf_msg_debug (this->name, 0, "Found worst up child (%d)"
+                              " @ %ld ms latency", worst_child, worst_latency);
+        }
+        return worst_child;
+}
+
+void
+__afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator,
+        const int idx, int64_t halo_max_latency_msec, int32_t *event,
+        int64_t child_latency_msec)
+{
+        afr_private_t *priv = NULL;
+        int up_children = 0;
+
+        priv = this->private;
+
+        priv->child_latency[idx] = child_latency_msec;
+        gf_msg_debug (child_xlator->name, 0, "Client ping @ %ld ms",
+                      child_latency_msec);
+
+        up_children = __afr_get_up_children_count (priv);
+
+        if (child_latency_msec > halo_max_latency_msec &&
+            priv->child_up[idx] == 1 &&
+            up_children > priv->halo_min_replicas) {
+                if ((up_children - 1) <
+                    priv->halo_min_replicas) {
+                        gf_log (child_xlator->name, GF_LOG_INFO,
+                                "Overriding halo threshold, "
+                                "min replicas: %d",
+                                priv->halo_min_replicas);
+                } else {
+                        gf_log (child_xlator->name, GF_LOG_INFO,
+                                "Child latency (%ld ms) "
+                                "exceeds halo threshold (%ld), "
+                                "marking child down.",
+                                child_latency_msec,
+                                halo_max_latency_msec);
+                        *event = GF_EVENT_CHILD_DOWN;
+                }
+        } else if (child_latency_msec < halo_max_latency_msec &&
+                   priv->child_up[idx] == 0) {
+                if (up_children < priv->halo_max_replicas) {
+                        gf_log (child_xlator->name, GF_LOG_INFO,
+                                "Child latency (%ld ms) "
+                                "below halo threshold (%ld), "
+                                "marking child up.",
+                                child_latency_msec,
+                                halo_max_latency_msec);
+                        *event = GF_EVENT_CHILD_UP;
+                } else {
+                        gf_log (child_xlator->name, GF_LOG_INFO,
+                                "Not marking child %d up, "
+                                "max replicas (%d) reached.", idx,
+                                priv->halo_max_replicas);
+                }
+        }
+}
+
+void
+__afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator,
+        const int idx, int64_t halo_max_latency_msec,
+        int32_t *event, int32_t *call_psh, int32_t *up_child)
+{
+        afr_private_t *priv = NULL;
+        int up_children = 0;
+        int worst_up_child = -1;
+
+        priv = this->private;
+
+        /*
+         * This only really counts if the child was never up
+         * (value = -1) or had been down (value = 0). See
+         * comment at GF_EVENT_CHILD_DOWN for a more detailed
+         * explanation.
+         */
+        if (priv->child_up[idx] != 1) {
+                priv->event_generation++;
+        }
+        priv->child_up[idx] = 1;
+
+        *call_psh = 1;
+        *up_child = idx;
+        up_children = __afr_get_up_children_count (priv);
+
+        /*
+         * Handle the edge case where we exceed
+         * halo_min_replicas and we've got a child which is
+         * marked up as it was helping to satisfy the
+         * halo_min_replicas even though its latency exceeds
+         * halo_max_latency_msec.
+         */
+        if (up_children > priv->halo_min_replicas) {
+                worst_up_child = find_worst_up_child (this);
+                if (worst_up_child >= 0 &&
+                    priv->child_latency[worst_up_child] >
+                    halo_max_latency_msec) {
+                        gf_msg_debug (this->name, 0, "Marking child %d down, "
+                                      "doesn't meet halo threshold (%ld), and > "
+                                      "halo_min_replicas (%d)",
+                                      worst_up_child, halo_max_latency_msec,
+                                      priv->halo_min_replicas);
+                        priv->child_up[worst_up_child] = 0;
+                        up_children--;
+                }
+        }
+        if (up_children > priv->halo_max_replicas &&
+            !priv->shd.iamshd) {
+                worst_up_child = find_worst_up_child (this);
+                if (worst_up_child < 0) {
+                        worst_up_child = idx;
+                }
+                priv->child_up[worst_up_child] = 0;
+                up_children--;
+                gf_msg_debug (this->name, 0, "Marking child %d down, "
+                              "up_children (%d) > halo_max_replicas (%d)",
+                              worst_up_child, up_children,
+                              priv->halo_max_replicas);
+        }
+
+        if (up_children == 1) {
+                gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
+                        "Subvolume '%s' came back up; "
+                        "going online.",
+                        child_xlator->name);
+        } else {
+                *event = GF_EVENT_SOME_DESCENDENT_UP;
+        }
+
+        priv->last_event[idx] = *event;
+}
+
+void
+__afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator,
+        int idx, int64_t child_latency_msec, int32_t *event,
+        int32_t *call_psh, int32_t *up_child)
+{
+        afr_private_t *priv = NULL;
+        int i = 0;
+        int up_children = 0;
+        int down_children = 0;
+        int best_down_child = -1;
+
+        priv = this->private;
+
+        /*
+         * If a brick is down when we start, we'll get a
+         * CHILD_DOWN to indicate its initial state. There
+         * was never a CHILD_UP in this case, so if we
+         * increment "down_count" the difference between that
+         * and "up_count" will no longer be the number of
+         * children that are currently up. This has serious
+         * implications e.g. for quorum enforcement, so we
+         * don't increment these values unless the event
+         * represents an actual state transition between "up"
+         * (value = 1) and anything else.
+         */
+        if (priv->child_up[idx] == 1) {
+                priv->event_generation++;
+        }
+
+        /*
+         * If this is an _actual_ CHILD_DOWN event, we
+         * want to set the child_latency to < 0 to indicate
+         * the child is really disconnected.
+         */
+        if (child_latency_msec < 0) {
+                priv->child_latency[idx] = child_latency_msec;
+        }
+        priv->child_up[idx] = 0;
+
+        up_children = __afr_get_up_children_count (priv);
+        /*
+         * Handle the edge case where we need to find the
+         * next best child (to mark up) as marking this child
+         * down would cause us to fall below halo_min_replicas.
+         * We will also force the SHD to heal this child _now_
+         * as we want it to be up to date if we are going to
+         * begin using it synchronously.
+         */
+        if (up_children < priv->halo_min_replicas) {
+                best_down_child = find_best_down_child (this);
+                if (best_down_child >= 0) {
+                        gf_msg_debug (this->name, 0,
+                                      "Swapping out child %d for "
+                                      "child %d to satisfy halo_min_replicas (%d).",
+                                      idx, best_down_child,
+                                      priv->halo_min_replicas);
+                        priv->child_up[best_down_child] = 1;
+                        *call_psh = 1;
+                        *up_child = best_down_child;
+                }
+        }
+
+        for (i = 0; i < priv->child_count; i++)
+                if (priv->child_up[i] == 0)
+                        down_children++;
+        if (down_children == priv->child_count) {
+                gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN,
+                        "All subvolumes are down. Going "
+                        "offline until at least one of them "
+                        "comes back up.");
+        } else {
+                *event = GF_EVENT_SOME_DESCENDENT_DOWN;
+        }
+        priv->last_event[idx] = *event;
+}
+
+static int64_t
+afr_get_halo_latency (xlator_t *this)
+{
+        afr_private_t *priv = NULL;
+        int64_t halo_max_latency_msec = 0;
+
+        priv = this->private;
+
+        if (priv->shd.iamshd) {
+                halo_max_latency_msec = priv->shd.halo_max_latency_msec;
+        } else if (priv->nfsd.iamnfsd) {
+                halo_max_latency_msec =
+                        priv->nfsd.halo_max_latency_msec;
+        } else {
+                halo_max_latency_msec = priv->halo_max_latency_msec;
+        }
+        gf_msg_debug (this->name, 0, "Using halo latency %ld",
+                      halo_max_latency_msec);
+        return halo_max_latency_msec;
+}
+
+
 int32_t
 afr_notify (xlator_t *this, int32_t event,
             void *data, void *data2)
 {
         afr_private_t   *priv               = NULL;
+        xlator_t        *child_xlator       = NULL;
         int             i                   = -1;
-        int             up_children         = 0;
-        int             down_children       = 0;
         int             propagate           = 0;
         int             had_heard_from_all  = 0;
         int             have_heard_from_all = 0;
         int             idx                 = -1;
         int             ret                 = -1;
         int             call_psh            = 0;
+        int             up_child            = -1;
         dict_t          *input              = NULL;
         dict_t          *output             = NULL;
         gf_boolean_t    had_quorum          = _gf_false;
@@ -4677,6 +4948,10 @@ afr_notify (xlator_t *this, int32_t event,
         struct gf_upcall_cache_invalidation *up_ci = NULL;
         inode_table_t   *itable             = NULL;
         inode_t         *inode              = NULL;
+        int64_t         halo_max_latency_msec = 0;
+        int64_t         child_latency_msec  = -1;
+
+        child_xlator = (xlator_t *)data;
 
         priv = this->private;
 
@@ -4701,7 +4976,7 @@ afr_notify (xlator_t *this, int32_t event,
          * subsequent revalidate lookup happens on all the dht's subvolumes
          * which triggers afr self-heals if any.
          */
-        idx = find_child_index (this, data);
+        idx = find_child_index (this, child_xlator);
         if (idx < 0) {
                 gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
                         "Received child_up from invalid subvolume");
@@ -4710,6 +4985,30 @@ afr_notify (xlator_t *this, int32_t event,
 
         had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
                                                            this);
+        if (priv->halo_enabled) {
+                halo_max_latency_msec = afr_get_halo_latency (this);
+
+                if (event == GF_EVENT_CHILD_PING) {
+                        /* Calculates the child latency and sets event
+                         */
+                        child_latency_msec = (int64_t)(uintptr_t)data2;
+                        LOCK (&priv->lock);
+                        {
+                                __afr_handle_ping_event (this, child_xlator,
+                                        idx, halo_max_latency_msec, &event,
+                                        child_latency_msec);
+                        }
+                        UNLOCK (&priv->lock);
+                }
+        }
+
+        if (event == GF_EVENT_CHILD_PING) {
+                /* This is the only xlator that handles PING, no reason to
+                 * propagate.
+                 */
+                goto out;
+        }
+
         if (event == GF_EVENT_TRANSLATOR_OP) {
                 LOCK (&priv->lock);
                 {
@@ -4736,57 +5035,15 @@ afr_notify (xlator_t *this, int32_t event,
                 propagate = 1;
                 break;
         case GF_EVENT_CHILD_UP:
-                /*
-                 * This only really counts if the child was never up
-                 * (value = -1) or had been down (value = 0). See
-                 * comment at GF_EVENT_CHILD_DOWN for a more detailed
-                 * explanation.
-                 */
-                if (priv->child_up[idx] != 1) {
-                        priv->event_generation++;
-                }
-                priv->child_up[idx] = 1;
-
-                call_psh = 1;
-                up_children = __afr_get_up_children_count (priv);
-                if (up_children == 1) {
-                        gf_msg (this->name, GF_LOG_INFO, 0,
-                                AFR_MSG_SUBVOL_UP,
-                                "Subvolume '%s' came back up; "
-                                "going online.", ((xlator_t *)data)->name);
-                        gf_event (EVENT_AFR_SUBVOL_UP,
-                                  "subvol=%s", this->name);
-
-                } else {
-                        event = GF_EVENT_SOME_DESCENDENT_UP;
-                }
-
-                priv->last_event[idx] = event;
-
+                __afr_handle_child_up_event (this, child_xlator,
+                        idx, halo_max_latency_msec, &event, &call_psh,
+                        &up_child);
                 break;
 
         case GF_EVENT_CHILD_DOWN:
-                if (priv->child_up[idx] == 1) {
-                        priv->event_generation++;
-                }
-                priv->child_up[idx] = 0;
-
-                for (i = 0; i < priv->child_count; i++)
-                        if (priv->child_up[i] == 0)
-                                down_children++;
-                if (down_children == priv->child_count) {
-                        gf_msg (this->name, GF_LOG_ERROR, 0,
-                                AFR_MSG_SUBVOLS_DOWN,
-                                "All subvolumes are down. Going offline "
-                                "until atleast one of them comes back up.");
-                        gf_event (EVENT_AFR_SUBVOLS_DOWN,
-                                  "subvol=%s", this->name);
-                } else {
-                        event = GF_EVENT_SOME_DESCENDENT_DOWN;
-                }
-
-                priv->last_event[idx] = event;
-
+                __afr_handle_child_down_event (this, child_xlator, idx,
+                        child_latency_msec, &event, &call_psh,
+                        &up_child);
                 break;
 
         case GF_EVENT_CHILD_CONNECTING:
@@ -4839,7 +5096,6 @@
                    had come up, propagate CHILD_UP, but only this time
                  */
                 event = GF_EVENT_CHILD_DOWN;
-                up_children = __afr_get_up_children_count (priv);
                 for (i = 0; i < priv->child_count; i++) {
                         if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
                                 event = GF_EVENT_CHILD_UP;
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 7f7962013d7..c7d6261b110 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -46,7 +46,8 @@ enum gf_afr_mem_types_ {
         gf_afr_mt_spbc_timeout_t,
         gf_afr_mt_spb_status_t,
         gf_afr_mt_empty_brick_t,
-        gf_afr_mt_end
+        gf_afr_mt_child_latency_t,
+        gf_afr_mt_end
 };
 
 #endif
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index c6ac5ebfd1b..4ac1d32f58a 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -58,6 +58,7 @@ typedef struct {
         eh_t     **statistics;
         uint32_t max_threads;
         uint32_t wait_qlength;
+        uint32_t halo_max_latency_msec;
 } afr_self_heald_t;
 
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index ceaa034dbbb..17b34822c17 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -184,6 +184,27 @@ reconfigure (xlator_t *this, dict_t *options)
         GF_OPTION_RECONF ("data-self-heal-algorithm",
                           priv->data_self_heal_algorithm, options, str, out);
 
+        GF_OPTION_RECONF ("halo-enabled",
+                          priv->halo_enabled, options, bool,
+                          out);
+
+        GF_OPTION_RECONF ("halo-shd-max-latency",
+                          priv->shd.halo_max_latency_msec, options, uint32,
+                          out);
+
+        GF_OPTION_RECONF ("halo-nfsd-max-latency",
+                          priv->nfsd.halo_max_latency_msec, options, uint32,
+                          out);
+
+        GF_OPTION_RECONF ("halo-max-latency", priv->halo_max_latency_msec,
+                          options, uint32, out);
+
+        GF_OPTION_RECONF ("halo-max-replicas", priv->halo_max_replicas, options,
+                          uint32, out);
+
+        GF_OPTION_RECONF ("halo-min-replicas", priv->halo_min_replicas, options,
+                          uint32, out);
+
         GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
 
         GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
@@ -473,6 +494,24 @@ init (xlator_t *this)
         GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
 
+        GF_OPTION_INIT ("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+                        uint32, out);
+
+        GF_OPTION_INIT ("halo-max-latency", priv->halo_max_latency_msec,
+                        uint32, out);
+        GF_OPTION_INIT ("halo-max-replicas", priv->halo_max_replicas, uint32,
+                        out);
+        GF_OPTION_INIT ("halo-min-replicas", priv->halo_min_replicas, uint32,
+                        out);
+
+        GF_OPTION_INIT ("halo-enabled",
+                        priv->halo_enabled, bool, out);
+
+        GF_OPTION_INIT ("halo-nfsd-max-latency",
+                        priv->nfsd.halo_max_latency_msec, uint32, out);
+
+        GF_OPTION_INIT ("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);
+
         GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out);
 
         GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool,
@@ -528,7 +567,12 @@ init (xlator_t *this)
 
         priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
                                     gf_afr_mt_char);
-        if (!priv->child_up) {
+
+        priv->child_latency = GF_CALLOC (sizeof (*priv->child_latency),
+                                         child_count,
+                                         gf_afr_mt_child_latency_t);
+
+        if (!priv->child_up || !priv->child_latency) {
                 ret = -ENOMEM;
                 goto out;
         }
@@ -736,7 +780,50 @@ struct volume_options options[] = {
                          "jobs that can perform parallel heals in the "
                          "background."
        },
-        { .key = {"heal-wait-queue-length"},
+        { .key = {"halo-shd-max-latency"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 1,
+          .max = 99999,
+          .default_value = "99999",
+          .description = "Maximum latency for shd halo replication in msec."
+        },
+        { .key = {"halo-enabled"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "False",
+          .description = "Enable Halo (geo) replication mode."
+        },
+        { .key = {"halo-nfsd-max-latency"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 1,
+          .max = 99999,
+          .default_value = "5",
+          .description = "Maximum latency for nfsd halo replication in msec."
+        },
+        { .key = {"halo-max-latency"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 1,
+          .max = 99999,
+          .default_value = "5",
+          .description = "Maximum latency for halo replication in msec."
+        },
+        { .key = {"halo-max-replicas"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 1,
+          .max = 99999,
+          .default_value = "99999",
+          .description = "The maximum number of halo replicas; replicas"
+                         " beyond this value will be written asynchronously "
+                         "via the SHD."
+        },
+        { .key = {"halo-min-replicas"},
+          .type = GF_OPTION_TYPE_INT,
+          .min = 1,
+          .max = 99999,
+          .default_value = "2",
+          .description = "The minimum number of halo replicas, before adding "
+                         "out of region replicas."
+        },
+        { .key = {"heal-wait-queue-length"},
           .type = GF_OPTION_TYPE_INT,
           .min = 0,
           .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
@@ -876,6 +963,13 @@ struct volume_options options[] = {
                          "translator is running as part of self-heal-daemon "
                          "or not."
         },
+        { .key = {"iam-nfs-daemon"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "off",
+          .description = "This option differentiates if the replicate "
+                         "translator is running as part of an NFS daemon "
+                         "or not."
+ }, { .key = {"quorum-type"}, .type = GF_OPTION_TYPE_STR, .value = { "none", "auto", "fixed"}, diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 0535e7c7271..3be15175dc7 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -74,6 +74,11 @@ typedef enum { AFR_FAV_CHILD_POLICY_MAX, } afr_favorite_child_policy; +struct afr_nfsd { + gf_boolean_t iamnfsd; + uint32_t halo_max_latency_msec; +}; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -85,6 +90,7 @@ typedef struct _afr_private { inode_t *root_inode; unsigned char *child_up; + int64_t *child_latency; unsigned char *local; char **pending_key; @@ -155,8 +161,14 @@ typedef struct _afr_private { gf_boolean_t ensure_durability; char *sh_domain; char *afr_dirty; + gf_boolean_t halo_enabled; + + uint32_t halo_max_latency_msec; + uint32_t halo_max_replicas; + uint32_t halo_min_replicas; - afr_self_heald_t shd; + afr_self_heald_t shd; + struct afr_nfsd nfsd; gf_boolean_t consistent_metadata; uint64_t spb_choice_timeout; |