summaryrefslogtreecommitdiffstats
path: root/xlators/cluster
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2015-12-23 13:49:14 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2016-04-27 00:35:19 -0700
commit3c35329feb4dd479c9e4856ee27fa4b12c708db2 (patch)
tree99ffea033b1982088344aaf584f8cd193f385aff /xlators/cluster
parent41fef03eb929a08588a4f4ebf4638096a5918a62 (diff)
afr: propagate child up event after timeout
Problem: During mount, afr waits for response from all its children before notifying the parent xlator. In a 1x2 replica volume , if one of the nodes is down, the mount will hang for more than a minute until child down is received from the client xlator for that node. Fix: When parent up is received by afr, start a 10 second timer. In the timer call back, if we receive a successful child up from atleast one brick, propagate the event to the parent xlator. Change-Id: I31e57c8802c1a03a4a5d581ee4ab82f3a9c8799d BUG: 1054694 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> Reviewed-on: http://review.gluster.org/11113 NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org> Smoke: Gluster Build System <jenkins@build.gluster.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r--xlators/cluster/afr/src/afr-common.c280
-rw-r--r--xlators/cluster/afr/src/afr-messages.h10
-rw-r--r--xlators/cluster/afr/src/afr.c6
-rw-r--r--xlators/cluster/afr/src/afr.h3
4 files changed, 200 insertions, 99 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 32dc91ca16e..fda9785bdda 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -3917,6 +3917,117 @@ find_child_index (xlator_t *this, xlator_t *child)
return i;
}
+static int
+__afr_get_up_children_count (afr_private_t *priv)
+{
+ int up_children = 0;
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 1)
+ up_children++;
+
+ return up_children;
+}
+
+glusterfs_event_t
+__afr_transform_event_from_state (afr_private_t *priv)
+{
+ int i = 0;
+ int up_children = 0;
+
+ if (AFR_COUNT (priv->last_event, priv->child_count) ==
+ priv->child_count)
+ /* have_heard_from_all. Let afr_notify() do the propagation. */
+ return GF_EVENT_MAXVAL;
+
+ up_children = __afr_get_up_children_count (priv);
+ if (up_children) {
+ /* We received at least one child up and there are pending
+ * notifications from some children. Treat these children as
+ * having sent a GF_EVENT_CHILD_DOWN. i.e. set the event as
+ * GF_EVENT_CHILD_MODIFIED, as done in afr_notify() */
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i])
+ continue;
+ priv->last_event[i] = GF_EVENT_CHILD_MODIFIED;
+ priv->child_up[i] = 0;
+ }
+ return GF_EVENT_CHILD_UP;
+ } else {
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i])
+ continue;
+ priv->last_event[i] = GF_EVENT_SOME_CHILD_DOWN;
+ priv->child_up[i] = 0;
+ }
+ return GF_EVENT_CHILD_DOWN;
+ }
+
+ return GF_EVENT_MAXVAL;
+}
+
+static void
+afr_notify_cbk (void *data)
+{
+ xlator_t *this = data;
+ afr_private_t *priv = this->private;
+ glusterfs_event_t event = GF_EVENT_MAXVAL;
+ gf_boolean_t propagate = _gf_false;
+
+ LOCK (&priv->lock);
+ {
+ if (!priv->timer) {
+ /*
+ * Either child_up/child_down is already sent to parent.
+ * This is a spurious wake up.
+ */
+ goto unlock;
+ }
+ priv->timer = NULL;
+ event = __afr_transform_event_from_state (priv);
+ if (event != GF_EVENT_MAXVAL)
+ propagate = _gf_true;
+ }
+unlock:
+ UNLOCK (&priv->lock);
+ if (propagate)
+ default_notify (this, event, NULL);
+}
+
+static void
+__afr_launch_notify_timer (xlator_t *this, afr_private_t *priv)
+{
+
+ struct timespec delay = {0, };
+
+ gf_msg_debug (this->name, 0, "Initiating child-down timer");
+ delay.tv_sec = 10;
+ delay.tv_nsec = 0;
+ priv->timer = gf_timer_call_after (this->ctx, delay,
+ afr_notify_cbk, this);
+ if (priv->timer == NULL) {
+ gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL,
+ "Cannot create timer for delayed initialization");
+ }
+}
+
+int
+__get_heard_from_all_status (xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ int heard_from_all = 1;
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->last_event[i]) {
+ heard_from_all = 0;
+ break;
+ }
+ }
+ return heard_from_all;
+}
+
int32_t
afr_notify (xlator_t *this, int32_t event,
void *data, void *data2)
@@ -3949,12 +4060,6 @@ afr_notify (xlator_t *this, int32_t event,
*/
priv->did_discovery = _gf_false;
- had_heard_from_all = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->last_event[i]) {
- had_heard_from_all = 0;
- }
- }
/* parent xlators dont need to know about every child_up, child_down
* because of afr ha. If all subvolumes go down, child_down has
@@ -3974,10 +4079,32 @@ afr_notify (xlator_t *this, int32_t event,
had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
this);
- switch (event) {
- case GF_EVENT_CHILD_UP:
+ if (event == GF_EVENT_TRANSLATOR_OP) {
LOCK (&priv->lock);
{
+ had_heard_from_all = __get_heard_from_all_status (this);
+ }
+ UNLOCK (&priv->lock);
+
+ if (!had_heard_from_all) {
+ ret = -1;
+ } else {
+ input = data;
+ output = data2;
+ ret = afr_xl_op (this, input, output);
+ }
+ goto out;
+ }
+
+ LOCK (&priv->lock);
+ {
+ had_heard_from_all = __get_heard_from_all_status (this);
+ switch (event) {
+ case GF_EVENT_PARENT_UP:
+ __afr_launch_notify_timer (this, priv);
+ propagate = 1;
+ break;
+ case GF_EVENT_CHILD_UP:
/*
* This only really counts if the child was never up
* (value = -1) or had been down (value = 0). See
@@ -3985,48 +4112,28 @@ afr_notify (xlator_t *this, int32_t event,
* explanation.
*/
if (priv->child_up[idx] != 1) {
- priv->up_count++;
- priv->event_generation++;
+ priv->event_generation++;
}
priv->child_up[idx] = 1;
call_psh = 1;
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i] == 1)
- up_children++;
+ up_children = __afr_get_up_children_count (priv);
if (up_children == 1) {
gf_msg (this->name, GF_LOG_INFO, 0,
AFR_MSG_SUBVOL_UP,
"Subvolume '%s' came back up; "
- "going online.", ((xlator_t *)data)->name);
+ "going online.", ((xlator_t *)data)->name);
} else {
event = GF_EVENT_CHILD_MODIFIED;
}
priv->last_event[idx] = event;
- }
- UNLOCK (&priv->lock);
- break;
+ break;
- case GF_EVENT_CHILD_DOWN:
- LOCK (&priv->lock);
- {
- /*
- * If a brick is down when we start, we'll get a
- * CHILD_DOWN to indicate its initial state. There
- * was never a CHILD_UP in this case, so if we
- * increment "down_count" the difference between than
- * and "up_count" will no longer be the number of
- * children that are currently up. This has serious
- * implications e.g. for quorum enforcement, so we
- * don't increment these values unless the event
- * represents an actual state transition between "up"
- * (value = 1) and anything else.
- */
+ case GF_EVENT_CHILD_DOWN:
if (priv->child_up[idx] == 1) {
- priv->down_count++;
- priv->event_generation++;
+ priv->event_generation++;
}
priv->child_up[idx] = 0;
@@ -4036,42 +4143,56 @@ afr_notify (xlator_t *this, int32_t event,
if (down_children == priv->child_count) {
gf_msg (this->name, GF_LOG_ERROR, 0,
AFR_MSG_ALL_SUBVOLS_DOWN,
- "All subvolumes are down. Going offline "
- "until atleast one of them comes back up.");
+ "All subvolumes are down. Going offline "
+ "until atleast one of them comes back up.");
} else {
event = GF_EVENT_SOME_CHILD_DOWN;
}
priv->last_event[idx] = event;
- }
- UNLOCK (&priv->lock);
- break;
+ break;
- case GF_EVENT_CHILD_CONNECTING:
- LOCK (&priv->lock);
- {
+ case GF_EVENT_CHILD_CONNECTING:
priv->last_event[idx] = event;
- }
- UNLOCK (&priv->lock);
- break;
+ break;
- case GF_EVENT_TRANSLATOR_OP:
- input = data;
- output = data2;
- if (!had_heard_from_all) {
- ret = -1;
- goto out;
+ case GF_EVENT_SOME_CHILD_DOWN:
+ priv->last_event[idx] = event;
+ break;
+
+ default:
+ propagate = 1;
+ break;
}
- ret = afr_xl_op (this, input, output);
- goto out;
- break;
+ have_heard_from_all = __get_heard_from_all_status (this);
+ if (!had_heard_from_all && have_heard_from_all) {
+ if (priv->timer) {
+ gf_timer_call_cancel (this->ctx, priv->timer);
+ priv->timer = NULL;
+ }
+ /* This is the first event which completes aggregation
+ of events from all subvolumes. If at least one subvol
+ had come up, propagate CHILD_UP, but only this time
+ */
+ event = GF_EVENT_CHILD_DOWN;
+ up_children = __afr_get_up_children_count (priv);
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
+ event = GF_EVENT_CHILD_UP;
+ break;
+ }
- default:
- propagate = 1;
- break;
+ if (priv->last_event[i] ==
+ GF_EVENT_CHILD_CONNECTING) {
+ event = GF_EVENT_CHILD_CONNECTING;
+ /* continue to check other events for CHILD_UP */
+ }
+ }
+ }
}
+ UNLOCK (&priv->lock);
if (priv->quorum_count) {
has_quorum = afr_has_quorum (priv->child_up, this);
@@ -4084,44 +4205,11 @@ afr_notify (xlator_t *this, int32_t event,
"Client-quorum is not met");
}
- /* have all subvolumes reported status once by now? */
- have_heard_from_all = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->last_event[i])
- have_heard_from_all = 0;
- }
-
/* if all subvols have reported status, no need to hide anything
or wait for anything else. Just propagate blindly */
if (have_heard_from_all)
propagate = 1;
- if (!had_heard_from_all && have_heard_from_all) {
- /* This is the first event which completes aggregation
- of events from all subvolumes. If at least one subvol
- had come up, propagate CHILD_UP, but only this time
- */
- event = GF_EVENT_CHILD_DOWN;
-
- LOCK (&priv->lock);
- {
- up_children = AFR_COUNT (priv->child_up, priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
- event = GF_EVENT_CHILD_UP;
- break;
- }
-
- if (priv->last_event[i] ==
- GF_EVENT_CHILD_CONNECTING) {
- event = GF_EVENT_CHILD_CONNECTING;
- /* continue to check other events for CHILD_UP */
- }
- }
- }
- UNLOCK (&priv->lock);
- }
-
ret = 0;
if (propagate)
ret = default_notify (this, event, data);
@@ -4366,18 +4454,18 @@ gf_boolean_t
afr_have_quorum (char *logname, afr_private_t *priv)
{
unsigned int quorum = 0;
+ unsigned int up_children = 0;
GF_VALIDATE_OR_GOTO(logname,priv,out);
+ up_children = __afr_get_up_children_count (priv);
quorum = priv->quorum_count;
- if (quorum != AFR_QUORUM_AUTO) {
- return (priv->up_count >= (priv->down_count + quorum));
- }
+ if (quorum != AFR_QUORUM_AUTO)
+ return up_children >= quorum;
quorum = priv->child_count / 2 + 1;
- if (priv->up_count >= (priv->down_count + quorum)) {
+ if (up_children >= quorum)
return _gf_true;
- }
/*
* Special case for even numbers of nodes: if we have exactly half
@@ -4388,7 +4476,7 @@ afr_have_quorum (char *logname, afr_private_t *priv)
*/
if ((priv->child_count % 2) == 0) {
quorum = priv->child_count / 2;
- if (priv->up_count >= (priv->down_count + quorum)) {
+ if (up_children >= quorum) {
if (priv->child_up[0]) {
return _gf_true;
}
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index 37736f5afa3..fac37b8c34b 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -40,7 +40,7 @@
*/
#define GLFS_COMP_BASE_AFR GLFS_MSGID_COMP_AFR
-#define GLFS_NUM_MESSAGES 40
+#define GLFS_NUM_MESSAGES 41
#define GLFS_MSGID_END (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
#define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
@@ -353,5 +353,13 @@
*/
#define AFR_MSG_NO_CHANGELOG (GLFS_COMP_BASE_AFR + 40)
+/*!
+ * @messageid 108041
+ * @diagnosis Unable to create timer thread for delayed initialization.
+ * @recommendedaction Possibly check process's log file for messages from
+ * timer infra.
+*/
+#define AFR_MSG_TIMER_CREATE_FAIL (GLFS_COMP_BASE_AFR + 41)
+
#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
#endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 1b172d50e60..0930a081965 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -510,6 +510,12 @@ fini (xlator_t *this)
afr_private_t *priv = NULL;
priv = this->private;
+ LOCK (&priv->lock);
+ if (priv->timer != NULL) {
+ gf_timer_call_cancel(this->ctx, priv->timer);
+ priv->timer = NULL;
+ }
+ UNLOCK (&priv->lock);
this->private = NULL;
afr_priv_destroy (priv);
//if (this->itable);//I dont see any destroy func
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index ef0d9f74e6e..0a872a98284 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -99,8 +99,7 @@ typedef struct _afr_private {
unsigned int wait_count; /* # of servers to wait for success */
- uint64_t up_count; /* number of CHILD_UPs we have seen */
- uint64_t down_count; /* number of CHILD_DOWNs we have seen */
+ gf_timer_t *timer; /* launched when parent up is received */
gf_boolean_t optimistic_change_log;
gf_boolean_t eager_lock;