From 3c35329feb4dd479c9e4856ee27fa4b12c708db2 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Wed, 23 Dec 2015 13:49:14 +0530 Subject: afr: propagate child up event after timeout Problem: During mount, afr waits for response from all its children before notifying the parent xlator. In a 1x2 replica volume , if one of the nodes is down, the mount will hang for more than a minute until child down is received from the client xlator for that node. Fix: When parent up is received by afr, start a 10 second timer. In the timer call back, if we receive a successful child up from atleast one brick, propagate the event to the parent xlator. Change-Id: I31e57c8802c1a03a4a5d581ee4ab82f3a9c8799d BUG: 1054694 Signed-off-by: Ravishankar N Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.org/11113 NetBSD-regression: NetBSD Build System Smoke: Gluster Build System CentOS-regression: Gluster Build System --- xlators/cluster/afr/src/afr-common.c | 280 ++++++++++++++++++++++----------- xlators/cluster/afr/src/afr-messages.h | 10 +- xlators/cluster/afr/src/afr.c | 6 + xlators/cluster/afr/src/afr.h | 3 +- 4 files changed, 200 insertions(+), 99 deletions(-) (limited to 'xlators/cluster/afr') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 32dc91ca16e..fda9785bdda 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -3917,6 +3917,117 @@ find_child_index (xlator_t *this, xlator_t *child) return i; } +static int +__afr_get_up_children_count (afr_private_t *priv) +{ + int up_children = 0; + int i = 0; + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + + return up_children; +} + +glusterfs_event_t +__afr_transform_event_from_state (afr_private_t *priv) +{ + int i = 0; + int up_children = 0; + + if (AFR_COUNT (priv->last_event, priv->child_count) == + priv->child_count) + /* have_heard_from_all. Let afr_notify() do the propagation. */ + return GF_EVENT_MAXVAL; + + up_children = __afr_get_up_children_count (priv); + if (up_children) { + /* We received at least one child up and there are pending + * notifications from some children. Treat these children as + * having sent a GF_EVENT_CHILD_DOWN. i.e. set the event as + * GF_EVENT_CHILD_MODIFIED, as done in afr_notify() */ + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i]) + continue; + priv->last_event[i] = GF_EVENT_CHILD_MODIFIED; + priv->child_up[i] = 0; + } + return GF_EVENT_CHILD_UP; + } else { + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i]) + continue; + priv->last_event[i] = GF_EVENT_SOME_CHILD_DOWN; + priv->child_up[i] = 0; + } + return GF_EVENT_CHILD_DOWN; + } + + return GF_EVENT_MAXVAL; +} + +static void +afr_notify_cbk (void *data) +{ + xlator_t *this = data; + afr_private_t *priv = this->private; + glusterfs_event_t event = GF_EVENT_MAXVAL; + gf_boolean_t propagate = _gf_false; + + LOCK (&priv->lock); + { + if (!priv->timer) { + /* + * Either child_up/child_down is already sent to parent. + * This is a spurious wake up. + */ + goto unlock; + } + priv->timer = NULL; + event = __afr_transform_event_from_state (priv); + if (event != GF_EVENT_MAXVAL) + propagate = _gf_true; + } +unlock: + UNLOCK (&priv->lock); + if (propagate) + default_notify (this, event, NULL); +} + +static void +__afr_launch_notify_timer (xlator_t *this, afr_private_t *priv) +{ + + struct timespec delay = {0, }; + + gf_msg_debug (this->name, 0, "Initiating child-down timer"); + delay.tv_sec = 10; + delay.tv_nsec = 0; + priv->timer = gf_timer_call_after (this->ctx, delay, + afr_notify_cbk, this); + if (priv->timer == NULL) { + gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL, + "Cannot create timer for delayed initialization"); + } +} + +int +__get_heard_from_all_status (xlator_t *this) +{ + afr_private_t *priv = this->private; + int heard_from_all = 1; + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) { + heard_from_all = 0; + break; + } + } + return heard_from_all; +} + int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2) @@ -3949,12 +4060,6 @@ afr_notify (xlator_t *this, int32_t event, */ priv->did_discovery = _gf_false; - had_heard_from_all = 1; - for (i = 0; i < priv->child_count; i++) { - if (!priv->last_event[i]) { - had_heard_from_all = 0; - } - } /* parent xlators dont need to know about every child_up, child_down * because of afr ha. If all subvolumes go down, child_down has @@ -3974,10 +4079,32 @@ afr_notify (xlator_t *this, int32_t event, had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, this); - switch (event) { - case GF_EVENT_CHILD_UP: + if (event == GF_EVENT_TRANSLATOR_OP) { LOCK (&priv->lock); { + had_heard_from_all = __get_heard_from_all_status (this); + } + UNLOCK (&priv->lock); + + if (!had_heard_from_all) { + ret = -1; + } else { + input = data; + output = data2; + ret = afr_xl_op (this, input, output); + } + goto out; + } + + LOCK (&priv->lock); + { + had_heard_from_all = __get_heard_from_all_status (this); + switch (event) { + case GF_EVENT_PARENT_UP: + __afr_launch_notify_timer (this, priv); + propagate = 1; + break; + case GF_EVENT_CHILD_UP: /* * This only really counts if the child was never up * (value = -1) or had been down (value = 0). See @@ -3985,48 +4112,28 @@ afr_notify (xlator_t *this, int32_t event, * explanation. */ if (priv->child_up[idx] != 1) { - priv->up_count++; - priv->event_generation++; + priv->event_generation++; } priv->child_up[idx] = 1; call_psh = 1; - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 1) - up_children++; + up_children = __afr_get_up_children_count (priv); if (up_children == 1) { gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); + "going online.", ((xlator_t *)data)->name); } else { event = GF_EVENT_CHILD_MODIFIED; } priv->last_event[idx] = event; - } - UNLOCK (&priv->lock); - break; + break; - case GF_EVENT_CHILD_DOWN: - LOCK (&priv->lock); - { - /* - * If a brick is down when we start, we'll get a - * CHILD_DOWN to indicate its initial state. There - * was never a CHILD_UP in this case, so if we - * increment "down_count" the difference between than - * and "up_count" will no longer be the number of - * children that are currently up. This has serious - * implications e.g. for quorum enforcement, so we - * don't increment these values unless the event - * represents an actual state transition between "up" - * (value = 1) and anything else. - */ + case GF_EVENT_CHILD_DOWN: if (priv->child_up[idx] == 1) { - priv->down_count++; - priv->event_generation++; + priv->event_generation++; } priv->child_up[idx] = 0; @@ -4036,42 +4143,56 @@ afr_notify (xlator_t *this, int32_t event, if (down_children == priv->child_count) { gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_ALL_SUBVOLS_DOWN, - "All subvolumes are down. Going offline " - "until atleast one of them comes back up."); + "All subvolumes are down. Going offline " + "until atleast one of them comes back up."); } else { event = GF_EVENT_SOME_CHILD_DOWN; } priv->last_event[idx] = event; - } - UNLOCK (&priv->lock); - break; + break; - case GF_EVENT_CHILD_CONNECTING: - LOCK (&priv->lock); - { + case GF_EVENT_CHILD_CONNECTING: priv->last_event[idx] = event; - } - UNLOCK (&priv->lock); - break; + break; - case GF_EVENT_TRANSLATOR_OP: - input = data; - output = data2; - if (!had_heard_from_all) { - ret = -1; - goto out; + case GF_EVENT_SOME_CHILD_DOWN: + priv->last_event[idx] = event; + break; + + default: + propagate = 1; + break; } - ret = afr_xl_op (this, input, output); - goto out; - break; + have_heard_from_all = __get_heard_from_all_status (this); + if (!had_heard_from_all && have_heard_from_all) { + if (priv->timer) { + gf_timer_call_cancel (this->ctx, priv->timer); + priv->timer = NULL; + } + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; + up_children = __afr_get_up_children_count (priv); + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; + break; + } - default: - propagate = 1; - break; + if (priv->last_event[i] == + GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ + } + } + } } + UNLOCK (&priv->lock); if (priv->quorum_count) { has_quorum = afr_has_quorum (priv->child_up, this); @@ -4084,44 +4205,11 @@ afr_notify (xlator_t *this, int32_t event, "Client-quorum is not met"); } - /* have all subvolumes reported status once by now? */ - have_heard_from_all = 1; - for (i = 0; i < priv->child_count; i++) { - if (!priv->last_event[i]) - have_heard_from_all = 0; - } - /* if all subvols have reported status, no need to hide anything or wait for anything else. Just propagate blindly */ if (have_heard_from_all) propagate = 1; - if (!had_heard_from_all && have_heard_from_all) { - /* This is the first event which completes aggregation - of events from all subvolumes. If at least one subvol - had come up, propagate CHILD_UP, but only this time - */ - event = GF_EVENT_CHILD_DOWN; - - LOCK (&priv->lock); - { - up_children = AFR_COUNT (priv->child_up, priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (priv->last_event[i] == GF_EVENT_CHILD_UP) { - event = GF_EVENT_CHILD_UP; - break; - } - - if (priv->last_event[i] == - GF_EVENT_CHILD_CONNECTING) { - event = GF_EVENT_CHILD_CONNECTING; - /* continue to check other events for CHILD_UP */ - } - } - } - UNLOCK (&priv->lock); - } - ret = 0; if (propagate) ret = default_notify (this, event, data); @@ -4366,18 +4454,18 @@ gf_boolean_t afr_have_quorum (char *logname, afr_private_t *priv) { unsigned int quorum = 0; + unsigned int up_children = 0; GF_VALIDATE_OR_GOTO(logname,priv,out); + up_children = __afr_get_up_children_count (priv); quorum = priv->quorum_count; - if (quorum != AFR_QUORUM_AUTO) { - return (priv->up_count >= (priv->down_count + quorum)); - } + if (quorum != AFR_QUORUM_AUTO) + return up_children >= quorum; quorum = priv->child_count / 2 + 1; - if (priv->up_count >= (priv->down_count + quorum)) { + if (up_children >= quorum) return _gf_true; - } /* * Special case for even numbers of nodes: if we have exactly half @@ -4388,7 +4476,7 @@ afr_have_quorum (char *logname, afr_private_t *priv) */ if ((priv->child_count % 2) == 0) { quorum = priv->child_count / 2; - if (priv->up_count >= (priv->down_count + quorum)) { + if (up_children >= quorum) { if (priv->child_up[0]) { return _gf_true; } diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index 37736f5afa3..fac37b8c34b 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -40,7 +40,7 @@ */ #define GLFS_COMP_BASE_AFR GLFS_MSGID_COMP_AFR -#define GLFS_NUM_MESSAGES 40 +#define GLFS_NUM_MESSAGES 41 #define GLFS_MSGID_END (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1) #define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages" @@ -353,5 +353,13 @@ */ #define AFR_MSG_NO_CHANGELOG (GLFS_COMP_BASE_AFR + 40) +/*! + * @messageid 108041 + * @diagnosis Unable to create timer thread for delayed initialization. + * @recommendedaction Possibly check process's log file for messages from + * timer infra. +*/ +#define AFR_MSG_TIMER_CREATE_FAIL (GLFS_COMP_BASE_AFR + 41) + #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 1b172d50e60..0930a081965 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -510,6 +510,12 @@ fini (xlator_t *this) afr_private_t *priv = NULL; priv = this->private; + LOCK (&priv->lock); + if (priv->timer != NULL) { + gf_timer_call_cancel(this->ctx, priv->timer); + priv->timer = NULL; + } + UNLOCK (&priv->lock); this->private = NULL; afr_priv_destroy (priv); //if (this->itable);//I dont see any destroy func diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index ef0d9f74e6e..0a872a98284 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -99,8 +99,7 @@ typedef struct _afr_private { unsigned int wait_count; /* # of servers to wait for success */ - uint64_t up_count; /* number of CHILD_UPs we have seen */ - uint64_t down_count; /* number of CHILD_DOWNs we have seen */ + gf_timer_t *timer; /* launched when parent up is received */ gf_boolean_t optimistic_change_log; gf_boolean_t eager_lock; -- cgit