diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-common.c')
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 364 |
1 files changed, 310 insertions, 54 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index ac834e90f4b..17943d7baae 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -2531,7 +2531,6 @@ unwind: return 0; } - int afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) { @@ -3227,7 +3226,7 @@ afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + call_count = afr_frame_return (frame); if (call_count == 0) AFR_STACK_UNWIND (flush, frame, local->op_ret, @@ -4655,20 +4654,292 @@ __get_heard_from_all_status (xlator_t *this) return heard_from_all; } +static int +find_best_down_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t best_child = -1; + int64_t best_latency = INT64_MAX; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] < best_latency) { + best_child = i; + best_latency = priv->child_latency[i]; + } + } + if (best_child >= 0) { + gf_msg_debug (this->name, 0, "Found best down child (%d) " + "@ %ld ms latency", best_child, best_latency); + } + return best_child; +} + +int +find_worst_up_child (xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = -1; + int32_t worst_child = -1; + int64_t worst_latency = INT64_MIN; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && + priv->child_latency[i] >= 0 && + priv->child_latency[i] > worst_latency) { + worst_child = i; + worst_latency = priv->child_latency[i]; + } + } + if (worst_child >= 0) { + gf_msg_debug (this->name, 0, "Found worst up child (%d)" + " @ %ld ms latency", worst_child, worst_latency); + } + return worst_child; +} + +void +__afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t halo_max_latency_msec, int32_t *event, + int64_t child_latency_msec) +{ + afr_private_t *priv = NULL; + int up_children = 0; + + priv = this->private; + + priv->child_latency[idx] = child_latency_msec; + gf_msg_debug (child_xlator->name, 0, "Client ping @ %ld ms", + child_latency_msec); + + up_children = __afr_get_up_children_count (priv); + + if (child_latency_msec > halo_max_latency_msec && + priv->child_up[idx] == 1 && + up_children > priv->halo_min_replicas) { + if ((up_children - 1) < + priv->halo_min_replicas) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Overriding halo threshold, " + "min replicas: %d", + priv->halo_min_replicas); + } else { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%ld ms) " + "exceeds halo threshold (%ld), " + "marking child down.", + child_latency_msec, + halo_max_latency_msec); + *event = GF_EVENT_CHILD_DOWN; + } + } else if (child_latency_msec < halo_max_latency_msec && + priv->child_up[idx] == 0) { + if (up_children < priv->halo_max_replicas) { + gf_log (child_xlator->name, GF_LOG_INFO, + "Child latency (%ld ms) " + "below halo threshold (%ld), " + "marking child up.", + child_latency_msec, + halo_max_latency_msec); + *event = GF_EVENT_CHILD_UP; + } else { + gf_log (child_xlator->name, GF_LOG_INFO, + "Not marking child %d up, " + "max replicas (%d) reached.", idx, + priv->halo_max_replicas); + } + } +} + +void +__afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t halo_max_latency_msec, + int32_t *event, int32_t *call_psh, int32_t *up_child) +{ + afr_private_t *priv = NULL; + int up_children = 0; + int worst_up_child = -1; + + priv = this->private; + + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + priv->event_generation++; + } + priv->child_up[idx] = 1; + + *call_psh = 1; + *up_child = idx; + up_children = __afr_get_up_children_count (priv); + + /* + * Handle the edge case where we exceed + * halo_min_replicas and we've got a child which is + * marked up as it was helping to satisfy the + * halo_min_replicas even though it's latency exceeds + * halo_max_latency_msec. + */ + if (up_children > priv->halo_min_replicas) { + worst_up_child = find_worst_up_child (this); + if (worst_up_child >= 0 && + priv->child_latency[worst_up_child] > + halo_max_latency_msec) { + gf_msg_debug (this->name, 0, "Marking child %d down, " + "doesn't meet halo threshold (%ld), and > " + "halo_min_replicas (%d)", + worst_up_child, halo_max_latency_msec, + priv->halo_min_replicas); + priv->child_up[worst_up_child] = 0; + up_children--; + } + } + if (up_children > priv->halo_max_replicas && + !priv->shd.iamshd) { + worst_up_child = find_worst_up_child (this); + if (worst_up_child < 0) { + worst_up_child = idx; + } + priv->child_up[worst_up_child] = 0; + up_children--; + gf_msg_debug (this->name, 0, "Marking child %d down, " + "up_children (%d) > halo_max_replicas (%d)", + worst_up_child, up_children, priv->halo_max_replicas); + } + + if (up_children == 1) { + gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, + "Subvolume '%s' came back up; " + "going online.", + child_xlator->name); + } else { + *event = GF_EVENT_SOME_DESCENDENT_UP; + } + + priv->last_event[idx] = *event; +} + +void +__afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator, + int idx, int64_t child_latency_msec, int32_t *event, + int32_t *call_psh, int32_t *up_child) +{ + afr_private_t *priv = NULL; + int i = 0; + int up_children = 0; + int down_children = 0; + int best_down_child = -1; + + priv = this->private; + + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->event_generation++; + } + + /* + * If this is an _actual_ CHILD_DOWN event, we + * want to set the child_latency to < 0 to indicate + * the child is really disconnected. + */ + if (child_latency_msec < 0) { + priv->child_latency[idx] = child_latency_msec; + } + priv->child_up[idx] = 0; + + up_children = __afr_get_up_children_count (priv); + /* + * Handle the edge case where we need to find the + * next best child (to mark up) as marking this child + * down would cause us to fall below halo_min_replicas. + * We will also force the SHD to heal this child _now_ + * as we want it to be up to date if we are going to + * begin using it synchronously. + */ + if (up_children < priv->halo_min_replicas) { + best_down_child = find_best_down_child (this); + if (best_down_child >= 0) { + gf_msg_debug (this->name, 0, + "Swapping out child %d for " + "child %d to satisfy halo_min_replicas (%d).", + idx, best_down_child, priv->halo_min_replicas); + priv->child_up[best_down_child] = 1; + *call_psh = 1; + *up_child = best_down_child; + } + } + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN, + "All subvolumes are down. Going " + "offline until atleast one of them " + "comes back up."); + } else { + *event = GF_EVENT_SOME_DESCENDENT_DOWN; + } + priv->last_event[idx] = *event; +} + +static int64_t +afr_get_halo_latency (xlator_t *this) +{ + afr_private_t *priv = NULL; + int64_t halo_max_latency_msec = 0; + + priv = this->private; + + if (priv->shd.iamshd) { + halo_max_latency_msec = priv->shd.halo_max_latency_msec; + } else if (priv->nfsd.iamnfsd) { + halo_max_latency_msec = + priv->nfsd.halo_max_latency_msec; + } else { + halo_max_latency_msec = priv->halo_max_latency_msec; + } + gf_msg_debug (this->name, 0, "Using halo latency %ld", + halo_max_latency_msec); + return halo_max_latency_msec; +} + + int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2) { afr_private_t *priv = NULL; + xlator_t *child_xlator = NULL; int i = -1; - int up_children = 0; - int down_children = 0; int propagate = 0; int had_heard_from_all = 0; int have_heard_from_all = 0; int idx = -1; int ret = -1; int call_psh = 0; + int up_child = -1; dict_t *input = NULL; dict_t *output = NULL; gf_boolean_t had_quorum = _gf_false; @@ -4677,6 +4948,10 @@ afr_notify (xlator_t *this, int32_t event, struct gf_upcall_cache_invalidation *up_ci = NULL; inode_table_t *itable = NULL; inode_t *inode = NULL; + int64_t halo_max_latency_msec = 0; + int64_t child_latency_msec = -1; + + child_xlator = (xlator_t *)data; priv = this->private; @@ -4701,7 +4976,7 @@ afr_notify (xlator_t *this, int32_t event, * subsequent revalidate lookup happens on all the dht's subvolumes * which triggers afr self-heals if any. */ - idx = find_child_index (this, data); + idx = find_child_index (this, child_xlator); if (idx < 0) { gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, "Received child_up from invalid subvolume"); @@ -4710,6 +4985,30 @@ afr_notify (xlator_t *this, int32_t event, had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, this); + if (priv->halo_enabled) { + halo_max_latency_msec = afr_get_halo_latency (this); + + if (event == GF_EVENT_CHILD_PING) { + /* Calculates the child latency and sets event + */ + child_latency_msec = (int64_t)(uintptr_t)data2; + LOCK (&priv->lock); + { + __afr_handle_ping_event (this, child_xlator, + idx, halo_max_latency_msec, &event, + child_latency_msec); + } + UNLOCK (&priv->lock); + } + } + + if (event == GF_EVENT_CHILD_PING) { + /* This is the only xlator that handles PING, no reason to + * propagate. + */ + goto out; + } + if (event == GF_EVENT_TRANSLATOR_OP) { LOCK (&priv->lock); { @@ -4736,57 +5035,15 @@ afr_notify (xlator_t *this, int32_t event, propagate = 1; break; case GF_EVENT_CHILD_UP: - /* - * This only really counts if the child was never up - * (value = -1) or had been down (value = 0). See - * comment at GF_EVENT_CHILD_DOWN for a more detailed - * explanation. - */ - if (priv->child_up[idx] != 1) { - priv->event_generation++; - } - priv->child_up[idx] = 1; - - call_psh = 1; - up_children = __afr_get_up_children_count (priv); - if (up_children == 1) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_SUBVOL_UP, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); - gf_event (EVENT_AFR_SUBVOL_UP, - "subvol=%s", this->name); - - } else { - event = GF_EVENT_SOME_DESCENDENT_UP; - } - - priv->last_event[idx] = event; - + __afr_handle_child_up_event (this, child_xlator, + idx, halo_max_latency_msec, &event, &call_psh, + &up_child); break; case GF_EVENT_CHILD_DOWN: - if (priv->child_up[idx] == 1) { - priv->event_generation++; - } - priv->child_up[idx] = 0; - - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 0) - down_children++; - if (down_children == priv->child_count) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SUBVOLS_DOWN, - "All subvolumes are down. Going offline " - "until atleast one of them comes back up."); - gf_event (EVENT_AFR_SUBVOLS_DOWN, - "subvol=%s", this->name); - } else { - event = GF_EVENT_SOME_DESCENDENT_DOWN; - } - - priv->last_event[idx] = event; - + __afr_handle_child_down_event (this, child_xlator, idx, + child_latency_msec, &event, &call_psh, + &up_child); break; case GF_EVENT_CHILD_CONNECTING: @@ -4839,7 +5096,6 @@ afr_notify (xlator_t *this, int32_t event, had come up, propagate CHILD_UP, but only this time */ event = GF_EVENT_CHILD_DOWN; - up_children = __afr_get_up_children_count (priv); for (i = 0; i < priv->child_count; i++) { if (priv->last_event[i] == GF_EVENT_CHILD_UP) { event = GF_EVENT_CHILD_UP; |