From 8543529f381d16349662269342d55ab67a1a4582 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Mon, 26 Dec 2011 10:10:13 +0530 Subject: cluster/afr: Handle split-brain/all-fool xattrs for directory In case of split-brain/all-fool xattrs perform conservative merge. Don't treat ignorant subvol as fool. Change-Id: I6ddf89949cd5793c2abbead7c47f091e8461f1d4 BUG: 765528 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.com/2521 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/afr/src/afr-self-heal-common.c | 245 +++++++++++------------ xlators/cluster/afr/src/afr-self-heal-common.h | 12 +- xlators/cluster/afr/src/afr-self-heal-data.c | 11 +- xlators/cluster/afr/src/afr-self-heal-entry.c | 24 ++- xlators/cluster/afr/src/afr-self-heal-metadata.c | 2 +- xlators/cluster/afr/src/afr.h | 5 + 6 files changed, 152 insertions(+), 147 deletions(-) diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 44bced74cc6..0558fafaae5 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -180,6 +180,7 @@ afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, int afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, dict_t *xattr[], afr_transaction_type type, size_t child_count) { @@ -190,12 +191,6 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, int i = 0; int j = 0; int k = 0; - unsigned char *ignorant_subvols = NULL; - - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, - gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; afr_init_pending_matrix (pending_matrix, child_count); @@ -213,7 +208,8 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, * subvolume. */ - ignorant_subvols[i] = 1; + if (ignorant_subvols) + ignorant_subvols[i] = 1; continue; } @@ -224,19 +220,14 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, } } - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - child_count); - GF_FREE (ignorant_subvols); -out: return ret; } typedef enum { + AFR_NODE_INVALID, AFR_NODE_INNOCENT, AFR_NODE_FOOL, AFR_NODE_WISE, - AFR_NODE_INVALID = -1, } afr_node_type; typedef struct { @@ -490,23 +481,18 @@ out: int afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *valid_children, int child_count, - uint32_t uid) + int32_t *success_children, + unsigned int child_count, uint32_t uid) { int i = 0; int nsources = 0; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (sources); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; + if (-1 == success_children[i]) + break; - child = valid_children[i]; + child = success_children[i]; if (uid == bufs[child].ia_uid) { sources[child] = 1; nsources++; @@ -516,21 +502,17 @@ afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, } int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, - int child_count) +afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, + unsigned int child_count) { int i = 0; int smallest = -1; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; - child = valid_children[i]; + if (-1 == success_children[i]) + break; + child = success_children[i]; if ((smallest == -1) || (bufs[child].ia_uid < bufs[smallest].ia_uid)) { smallest = child; @@ -540,20 +522,20 @@ afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, } static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children, +afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, int child_count, int32_t *sources) { int nsources = 0; int smallest = 0; - smallest = afr_get_child_with_lowest_uid (bufs, valid_children, + smallest = afr_get_child_with_lowest_uid (bufs, success_children, child_count); if (smallest < 0) { nsources = -1; goto out; } nsources = afr_mark_child_as_source_by_uid (sources, bufs, - valid_children, child_count, + success_children, child_count, bufs[smallest].ia_uid); out: return nsources; @@ -583,12 +565,10 @@ afr_get_character_str (afr_node_type type) afr_node_type afr_find_child_character_type (int32_t *pending_row, int32_t child, - int32_t child_count, const char *xlator_name) + unsigned int child_count) { afr_node_type type = AFR_NODE_INVALID; - GF_ASSERT (pending_row); - GF_ASSERT (child_count > 0); GF_ASSERT ((child >= 0) && (child < child_count)); if (afr_sh_is_innocent (pending_row, child_count)) @@ -597,44 +577,85 @@ afr_find_child_character_type (int32_t *pending_row, int32_t child, type = AFR_NODE_FOOL; else if (afr_sh_is_wise (pending_row, child, child_count)) type = AFR_NODE_WISE; - else - GF_ASSERT (0); - - gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s", - child, afr_get_character_str (type)); return type; } int afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type) + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant) { afr_private_t *priv = NULL; afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; int nsources = -1; + unsigned char *ignorant_subvols = NULL; + unsigned int child_count = 0; priv = this->private; + child_count = priv->child_count; if (afr_get_children_count (success_children, priv->child_count) == 0) goto out; + if (!ignore_ignorant) { + ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), + child_count, gf_afr_mt_char); + if (NULL == ignorant_subvols) + goto out; + } + afr_build_pending_matrix (priv->pending_key, pending_matrix, - xattr, type, priv->child_count); + ignorant_subvols, xattr, type, + priv->child_count); + if (!ignore_ignorant) + afr_mark_ignorant_subvols_as_pending (pending_matrix, + ignorant_subvols, + priv->child_count); sh_type = afr_self_heal_type_for_transaction (type); if (AFR_SELF_HEAL_INVALID == sh_type) goto out; afr_sh_print_pending_matrix (pending_matrix, this); - nsources = afr_mark_sources (sources, pending_matrix, bufs, - priv->child_count, sh_type, - success_children, this->name); + nsources = afr_mark_sources (this, sources, pending_matrix, bufs, + sh_type, success_children, subvol_status); out: + GF_FREE (ignorant_subvols); return nsources; } +void +afr_find_character_types (afr_node_character *characters, + int32_t **pending_matrix, int32_t *success_children, + unsigned int child_count) +{ + afr_node_type type = AFR_NODE_INVALID; + int child = 0; + int i = 0; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + type = afr_find_child_character_type (pending_matrix[child], + child, child_count); + characters[child].type = type; + } +} + +void +afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, + unsigned int child_count) +{ + int i = 0; + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + sources[success_children[i]] = 1; + } +} /** * mark_sources: Mark all 'source' nodes and return number of source * nodes found @@ -660,17 +681,18 @@ out: */ int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name) +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status) { /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; - int i = 0; - int nsources = -1; - xlator_t *this = NULL; + int nsources = -1; + unsigned int child_count = 0; + afr_private_t *priv = NULL; + priv = this->private; + child_count = priv->child_count; characters = GF_CALLOC (sizeof (afr_node_character), child_count, gf_afr_mt_afr_node_character); if (!characters) @@ -679,26 +701,14 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, this = THIS; /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; - } - + memset (sources, 0, sizeof (*sources) * child_count); nsources = 0; - for (i = 0; i < child_count; i++) { - characters[i].type = - afr_find_child_character_type (pending_matrix[i], i, - child_count, - xlator_name); - if (AFR_NODE_INVALID == characters[i].type) - gf_log (xlator_name, GF_LOG_WARNING, - "child %d had invalid xattrs", i); - } - - if ((type == AFR_SELF_HEAL_METADATA) - && afr_sh_all_nodes_innocent (characters, child_count)) { - - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - valid_children, + afr_find_character_types (characters, pending_matrix, success_children, + child_count); + if (afr_sh_all_nodes_innocent (characters, child_count)) { + if (type == AFR_SELF_HEAL_METADATA) + nsources = afr_sh_mark_lowest_uid_as_source (bufs, + success_children, child_count, sources); goto out; @@ -708,17 +718,17 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, afr_sh_compute_wisdom (pending_matrix, characters, child_count); if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - gf_log (this->name, GF_LOG_INFO, - "split-brain possible, no source detected"); + if (subvol_status) + *subvol_status |= SPLIT_BRAIN; nsources = -1; - } else { nsources = afr_sh_mark_wisest_as_sources (sources, characters, child_count); } } else { + if (subvol_status) + *subvol_status |= ALL_FOOLS; nsources = afr_mark_biggest_of_fools_as_source (sources, pending_matrix, characters, @@ -726,14 +736,10 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, } out: - if (nsources == 0) { - for (i = 0; i < child_count; i++) { - if (valid_children[i] != -1) - sources[valid_children[i]] = 1; - } - } - if (characters) - GF_FREE (characters); + if (nsources == 0) + afr_mark_success_children_sources (sources, success_children, + child_count); + GF_FREE (characters); gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); return nsources; @@ -744,45 +750,14 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = 0; - int i = 0; - int j = 0; - int k = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } - } - - for (i = 0; i < child_count; i++) { - if (pending_raw) - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); - if (ret < 0) - gf_log (THIS->name, GF_LOG_DEBUG, - "Unable to get dict value."); - if (!success[j]) - continue; - - k = afr_index_for_transaction_type (type); - - if (pending_raw != NULL) { - memcpy (pending, pending_raw, sizeof(pending)); - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; - } + int i = 0; + int j = 0; - } - } + afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, + xattr, type, priv->child_count); + for (i = 0; i < priv->child_count; i++) + for (j = 0; j < priv->child_count; j++) + delta_matrix[i][j] = -delta_matrix[i][j]; } @@ -1262,7 +1237,8 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - afr_transaction_type_get (ia_type)); + afr_transaction_type_get (ia_type), + NULL, _gf_false); if (nsources < 0) { gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," " in missing entry self-heal, continuing with the rest" @@ -1699,6 +1675,7 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, int enoent_count = 0; int nsources = 0; int source = -1; + int32_t subvol_status = 0; local = frame->local; sh = &local->self_heal; @@ -1728,11 +1705,19 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources < 0) { - gf_log (this->name, GF_LOG_ERROR, "No sources for dir of %s," - " in missing entry self-heal, aborting self-heal", - local->loc.path); + AFR_ENTRY_TRANSACTION, &subvol_status, + _gf_true); + if ((subvol_status & ALL_FOOLS) || + (subvol_status & SPLIT_BRAIN)) { + gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " + "merge", sh->parent_loc.path); + afr_mark_success_children_sources (sh->sources, + sh->success_children, + priv->child_count); + } else if (nsources < 0) { + gf_log (this->name, GF_LOG_ERROR, "No sources for dir " + "of %s, in missing entry self-heal, aborting " + "self-heal", local->loc.path); goto out; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 77c3375cc6a..42730a852e7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -48,6 +48,7 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); int afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, dict_t *xattr[], afr_transaction_type type, size_t child_count); @@ -57,9 +58,9 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int child_count, afr_transaction_type type); int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name); +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status); int afr_sh_delta_to_xattr (afr_private_t *priv, @@ -77,9 +78,10 @@ afr_self_heal_type afr_self_heal_type_for_transaction (afr_transaction_type type); int -afr_build_sources (xlator_t *xlator, dict_t **xattr, struct iatt *bufs, +afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type); + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant); void afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count); diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 431cef492c7..83920c081b7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -662,7 +662,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) frame->root->lk_owner); nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_DATA_TRANSACTION); + AFR_DATA_TRANSACTION, NULL, _gf_false); if (nsources == 0) { gf_log (this->name, GF_LOG_DEBUG, "No self-heal needed for %s", @@ -806,6 +806,7 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, int32_t nsources = 0; int32_t prev_read_child = -1; int32_t config_read_child = -1; + int32_t subvol_status = 0; priv = this->private; bufs = local->cont.lookup.bufs; @@ -819,7 +820,11 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, memset (sources, 0, sizeof (*sources) * priv->child_count); nsources = afr_build_sources (this, xattr, bufs, pending_matrix, - sources, success_children, txn_type); + sources, success_children, txn_type, + &subvol_status, _gf_false); + if (subvol_status & SPLIT_BRAIN) + gf_log (this->name, GF_LOG_WARNING, "%s: Possible split-brain", + local->loc.path); if (nsources < 0) goto out; @@ -991,7 +996,7 @@ afr_post_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie, (void) afr_build_sources (this, sh->xattr, NULL, sh->pending_matrix, sh->sources, sh->success_children, - AFR_DATA_TRANSACTION); + AFR_DATA_TRANSACTION, NULL, _gf_false); ret = afr_sh_inode_set_read_ctx (sh, this); if (ret) afr_sh_data_fail (frame, this); diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 3359029c3ac..ba29656e2cd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -1598,7 +1598,7 @@ afr_sh_need_recreate (afr_self_heal_t *impunge_sh, int *sources, GF_ASSERT (sources); success_children = impunge_sh->success_children; - if (sources[child] || (child == impunge_sh->active_source)) { + if (child == impunge_sh->active_source) { GF_ASSERT (afr_is_child_present (success_children, child_count, child)); goto out; @@ -2115,8 +2115,8 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; int source = 0; - - int nsources = 0; + int nsources = 0; + int32_t subvol_status = 0; local = frame->local; sh = &local->self_heal; @@ -2137,23 +2137,31 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources == 0) { + AFR_ENTRY_TRANSACTION, &subvol_status, + _gf_true); + if ((subvol_status & ALL_FOOLS) || + (subvol_status & SPLIT_BRAIN)) { + gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " + "merge", local->loc.path); + source = -1; + memset (sh->sources, 0, + sizeof (*sh->sources) * priv->child_count); + } else if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, "No self-heal needed for %s", local->loc.path); afr_sh_entry_finish (frame, this); return; + } else { + source = afr_sh_select_source (sh->sources, priv->child_count); } - source = afr_sh_select_source (sh->sources, priv->child_count); - sh->source = source; afr_reset_children (sh->fresh_children, priv->child_count); afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); + sh->fresh_children, priv->child_count); if (sh->source >= 0) afr_inode_set_read_ctx (this, sh->inode, sh->source, sh->fresh_children); diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 072ed74f8bf..992e9d88c3f 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -478,7 +478,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_METADATA_TRANSACTION); + AFR_METADATA_TRANSACTION, NULL, _gf_false); if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, "No self-heal needed for %s", diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 4aea44c4275..37a13d5de2a 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -57,6 +57,11 @@ typedef enum { AFR_POS_REMOTE } afr_child_pos_t; +typedef enum { + SPLIT_BRAIN = 1, + ALL_FOOLS = 2 +} afr_subvol_status_t; + typedef enum { AFR_INODE_SET_READ_CTX = 1, AFR_INODE_RM_STALE_CHILDREN, -- cgit