From bc7d07d8d5eab29741c7e28b7dcb38ce66c101cb Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Tue, 31 May 2011 01:55:57 +0000 Subject: cluster/afr: Send the first child up/down after all its children notify Signed-off-by: Pranith Kumar K Signed-off-by: Anand Avati BUG: 2870 (Inconsistent xattr values when creating bricks) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2870 --- xlators/cluster/afr/src/afr-common.c | 161 +++++++++++++++++++++++------------ xlators/cluster/afr/src/afr.c | 7 ++ xlators/cluster/afr/src/afr.h | 1 + 3 files changed, 115 insertions(+), 54 deletions(-) diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index f000aaf9217..c8b1ea96011 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -2470,92 +2470,145 @@ int32_t afr_notify (xlator_t *this, int32_t event, void *data, ...) { - afr_private_t * priv = NULL; - unsigned char * child_up = NULL; - int i = -1; - int up_children = 0; - int down_children = 0; + afr_private_t *priv = NULL; + int i = -1; + int up_children = 0; + int down_children = 0; + int propagate = 0; + + int had_heard_from_all = 0; + int have_heard_from_all = 0; + int idx = -1; + int ret = -1; priv = this->private; if (!priv) return 0; - child_up = priv->child_up; + had_heard_from_all = 1; + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) { + had_heard_from_all = 0; + } + } + + /* parent xlators dont need to know about every child_up, child_down + * because of afr ha. If all subvolumes go down, child_down has + * to be triggered. In that state when 1 subvolume comes up child_up + * needs to be triggered. dht optimises revalidate lookup by sending + * it only to one of its subvolumes. When child up/down happens + * for afr's subvolumes dht should be notified by child_modified. The + * subsequent revalidate lookup happens on all the dht's subvolumes + * which triggers afr self-heals if any. + */ + idx = find_child_index (this, data); + if (idx < 0) { + gf_log (this->name, GF_LOG_ERROR, "Received child_up " + "from invalid subvolume"); + goto out; + } switch (event) { case GF_EVENT_CHILD_UP: - i = find_child_index (this, data); - - /* temporarily - afr_attempt_lock_recovery (this, i); - */ - - child_up[i] = 1; - LOCK (&priv->lock); { + priv->child_up[idx] = 1; priv->up_count++; + + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; + if (up_children == 1) { + gf_log (this->name, GF_LOG_INFO, + "Subvolume '%s' came back up; " + "going online.", ((xlator_t *)data)->name); + } else { + event = GF_EVENT_CHILD_MODIFIED; + } + + priv->last_event[idx] = event; } UNLOCK (&priv->lock); - /* - if all the children were down, and one child came up, - send notify to parent - */ + break; - for (i = 0; i < priv->child_count; i++) - if (child_up[i] == 1) - up_children++; + case GF_EVENT_CHILD_DOWN: + LOCK (&priv->lock); + { + priv->child_up[idx] = 0; + priv->down_count++; - if (up_children == 1) { - gf_log (this->name, GF_LOG_INFO, - "Subvolume '%s' came back up; " - "going online.", ((xlator_t *)data)->name); + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_log (this->name, GF_LOG_ERROR, + "All subvolumes are down. Going offline " + "until atleast one of them comes back up."); + } else { + event = GF_EVENT_CHILD_MODIFIED; + } - default_notify (this, event, data); - } else { - default_notify (this, GF_EVENT_CHILD_MODIFIED, data); + priv->last_event[idx] = event; } + UNLOCK (&priv->lock); break; - case GF_EVENT_CHILD_DOWN: - i = find_child_index (this, data); - - child_up[i] = 0; - + case GF_EVENT_CHILD_CONNECTING: LOCK (&priv->lock); { - priv->down_count++; + priv->last_event[idx] = event; } UNLOCK (&priv->lock); + break; + default: + propagate = 1; + break; + } - /* - if all children are down, and this was the last to go down, - send notify to parent - */ - - for (i = 0; i < priv->child_count; i++) - if (child_up[i] == 0) - down_children++; + /* have all subvolumes reported status once by now? */ + have_heard_from_all = 1; + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) + have_heard_from_all = 0; + } - if (down_children == priv->child_count) { - gf_log (this->name, GF_LOG_ERROR, - "All subvolumes are down. Going offline " - "until atleast one of them comes back up."); + /* if all subvols have reported status, no need to hide anything + or wait for anything else. Just propagate blindly */ + if (have_heard_from_all) + propagate = 1; - default_notify (this, event, data); - } else { - default_notify (this, GF_EVENT_CHILD_MODIFIED, data); - } + if (!had_heard_from_all && have_heard_from_all) { + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; - break; + LOCK (&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; + break; + } - default: - default_notify (this, event, data); + if (priv->last_event[i] == + GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ + } + } + } + UNLOCK (&priv->lock); } - return 0; + ret = 0; + if (propagate) + ret = default_notify (this, event, data); +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 79753c91b09..35dad50072d 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -706,6 +706,13 @@ init (xlator_t *this) i++; } + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), + gf_afr_mt_int32_t); + if (!priv->last_event) { + ret = -ENOMEM; + goto out; + } + LOCK_INIT (&priv->root_inode_lk); priv->first_lookup = 1; priv->root_inode = NULL; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index f1b0efbd22f..b806a524320 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -93,6 +93,7 @@ typedef struct _afr_private { gf_boolean_t optimistic_change_log; char vol_uuid[UUID_SIZE + 1]; + int32_t *last_event; } afr_private_t; typedef struct { -- cgit