From 8543529f381d16349662269342d55ab67a1a4582 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Mon, 26 Dec 2011 10:10:13 +0530 Subject: cluster/afr: Handle split-brain/all-fool xattrs for directory In case of split-brain/all-fool xattrs perform conservative merge. Don't treat ignorant subvol as fool. Change-Id: I6ddf89949cd5793c2abbead7c47f091e8461f1d4 BUG: 765528 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.com/2521 Tested-by: Gluster Build System Reviewed-by: Vijay Bellur --- xlators/cluster/afr/src/afr-self-heal-common.c | 245 ++++++++++++------------- 1 file changed, 115 insertions(+), 130 deletions(-) (limited to 'xlators/cluster/afr/src/afr-self-heal-common.c') diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 44bced74c..0558fafaa 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -180,6 +180,7 @@ afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, int afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, dict_t *xattr[], afr_transaction_type type, size_t child_count) { @@ -190,12 +191,6 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, int i = 0; int j = 0; int k = 0; - unsigned char *ignorant_subvols = NULL; - - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, - gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; afr_init_pending_matrix (pending_matrix, child_count); @@ -213,7 +208,8 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, * subvolume. */ - ignorant_subvols[i] = 1; + if (ignorant_subvols) + ignorant_subvols[i] = 1; continue; } @@ -224,19 +220,14 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, } } - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - child_count); - GF_FREE (ignorant_subvols); -out: return ret; } typedef enum { + AFR_NODE_INVALID, AFR_NODE_INNOCENT, AFR_NODE_FOOL, AFR_NODE_WISE, - AFR_NODE_INVALID = -1, } afr_node_type; typedef struct { @@ -490,23 +481,18 @@ out: int afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *valid_children, int child_count, - uint32_t uid) + int32_t *success_children, + unsigned int child_count, uint32_t uid) { int i = 0; int nsources = 0; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (sources); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; + if (-1 == success_children[i]) + break; - child = valid_children[i]; + child = success_children[i]; if (uid == bufs[child].ia_uid) { sources[child] = 1; nsources++; @@ -516,21 +502,17 @@ afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, } int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, - int child_count) +afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, + unsigned int child_count) { int i = 0; int smallest = -1; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; - child = valid_children[i]; + if (-1 == success_children[i]) + break; + child = success_children[i]; if ((smallest == -1) || (bufs[child].ia_uid < bufs[smallest].ia_uid)) { smallest = child; @@ -540,20 +522,20 @@ afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, } static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children, +afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, int child_count, int32_t *sources) { int nsources = 0; int smallest = 0; - smallest = afr_get_child_with_lowest_uid (bufs, valid_children, + smallest = afr_get_child_with_lowest_uid (bufs, success_children, child_count); if (smallest < 0) { nsources = -1; goto out; } nsources = afr_mark_child_as_source_by_uid (sources, bufs, - valid_children, child_count, + success_children, child_count, bufs[smallest].ia_uid); out: return nsources; @@ -583,12 +565,10 @@ afr_get_character_str (afr_node_type type) afr_node_type afr_find_child_character_type (int32_t *pending_row, int32_t child, - int32_t child_count, const char *xlator_name) + unsigned int child_count) { afr_node_type type = AFR_NODE_INVALID; - GF_ASSERT (pending_row); - GF_ASSERT (child_count > 0); GF_ASSERT ((child >= 0) && (child < child_count)); if (afr_sh_is_innocent (pending_row, child_count)) @@ -597,44 +577,85 @@ afr_find_child_character_type (int32_t *pending_row, int32_t child, type = AFR_NODE_FOOL; else if (afr_sh_is_wise (pending_row, child, child_count)) type = AFR_NODE_WISE; - else - GF_ASSERT (0); - - gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s", - child, afr_get_character_str (type)); return type; } int afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type) + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant) { afr_private_t *priv = NULL; afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; int nsources = -1; + unsigned char *ignorant_subvols = NULL; + unsigned int child_count = 0; priv = this->private; + child_count = priv->child_count; if (afr_get_children_count (success_children, priv->child_count) == 0) goto out; + if (!ignore_ignorant) { + ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), + child_count, gf_afr_mt_char); + if (NULL == ignorant_subvols) + goto out; + } + afr_build_pending_matrix (priv->pending_key, pending_matrix, - xattr, type, priv->child_count); + ignorant_subvols, xattr, type, + priv->child_count); + if (!ignore_ignorant) + afr_mark_ignorant_subvols_as_pending (pending_matrix, + ignorant_subvols, + priv->child_count); sh_type = afr_self_heal_type_for_transaction (type); if (AFR_SELF_HEAL_INVALID == sh_type) goto out; afr_sh_print_pending_matrix (pending_matrix, this); - nsources = afr_mark_sources (sources, pending_matrix, bufs, - priv->child_count, sh_type, - success_children, this->name); + nsources = afr_mark_sources (this, sources, pending_matrix, bufs, + sh_type, success_children, subvol_status); out: + GF_FREE (ignorant_subvols); return nsources; } +void +afr_find_character_types (afr_node_character *characters, + int32_t **pending_matrix, int32_t *success_children, + unsigned int child_count) +{ + afr_node_type type = AFR_NODE_INVALID; + int child = 0; + int i = 0; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + type = afr_find_child_character_type (pending_matrix[child], + child, child_count); + characters[child].type = type; + } +} + +void +afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, + unsigned int child_count) +{ + int i = 0; + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + sources[success_children[i]] = 1; + } +} /** * mark_sources: Mark all 'source' nodes and return number of source * nodes found @@ -660,17 +681,18 @@ out: */ int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name) +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status) { /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; - int i = 0; - int nsources = -1; - xlator_t *this = NULL; + int nsources = -1; + unsigned int child_count = 0; + afr_private_t *priv = NULL; + priv = this->private; + child_count = priv->child_count; characters = GF_CALLOC (sizeof (afr_node_character), child_count, gf_afr_mt_afr_node_character); if (!characters) @@ -679,26 +701,14 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, this = THIS; /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; - } - + memset (sources, 0, sizeof (*sources) * child_count); nsources = 0; - for (i = 0; i < child_count; i++) { - characters[i].type = - afr_find_child_character_type (pending_matrix[i], i, - child_count, - xlator_name); - if (AFR_NODE_INVALID == characters[i].type) - gf_log (xlator_name, GF_LOG_WARNING, - "child %d had invalid xattrs", i); - } - - if ((type == AFR_SELF_HEAL_METADATA) - && afr_sh_all_nodes_innocent (characters, child_count)) { - - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - valid_children, + afr_find_character_types (characters, pending_matrix, success_children, + child_count); + if (afr_sh_all_nodes_innocent (characters, child_count)) { + if (type == AFR_SELF_HEAL_METADATA) + nsources = afr_sh_mark_lowest_uid_as_source (bufs, + success_children, child_count, sources); goto out; @@ -708,17 +718,17 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, afr_sh_compute_wisdom (pending_matrix, characters, child_count); if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - gf_log (this->name, GF_LOG_INFO, - "split-brain possible, no source detected"); + if (subvol_status) + *subvol_status |= SPLIT_BRAIN; nsources = -1; - } else { nsources = afr_sh_mark_wisest_as_sources (sources, characters, child_count); } } else { + if (subvol_status) + *subvol_status |= ALL_FOOLS; nsources = afr_mark_biggest_of_fools_as_source (sources, pending_matrix, characters, @@ -726,14 +736,10 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, } out: - if (nsources == 0) { - for (i = 0; i < child_count; i++) { - if (valid_children[i] != -1) - sources[valid_children[i]] = 1; - } - } - if (characters) - GF_FREE (characters); + if (nsources == 0) + afr_mark_success_children_sources (sources, success_children, + child_count); + GF_FREE (characters); gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); return nsources; @@ -744,45 +750,14 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = 0; - int i = 0; - int j = 0; - int k = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } - } - - for (i = 0; i < child_count; i++) { - if (pending_raw) - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); - if (ret < 0) - gf_log (THIS->name, GF_LOG_DEBUG, - "Unable to get dict value."); - if (!success[j]) - continue; - - k = afr_index_for_transaction_type (type); - - if (pending_raw != NULL) { - memcpy (pending, pending_raw, sizeof(pending)); - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; - } + int i = 0; + int j = 0; - } - } + afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, + xattr, type, priv->child_count); + for (i = 0; i < priv->child_count; i++) + for (j = 0; j < priv->child_count; j++) + delta_matrix[i][j] = -delta_matrix[i][j]; } @@ -1262,7 +1237,8 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - afr_transaction_type_get (ia_type)); + afr_transaction_type_get (ia_type), + NULL, _gf_false); if (nsources < 0) { gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," " in missing entry self-heal, continuing with the rest" @@ -1699,6 +1675,7 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, int enoent_count = 0; int nsources = 0; int source = -1; + int32_t subvol_status = 0; local = frame->local; sh = &local->self_heal; @@ -1728,11 +1705,19 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources < 0) { - gf_log (this->name, GF_LOG_ERROR, "No sources for dir of %s," - " in missing entry self-heal, aborting self-heal", - local->loc.path); + AFR_ENTRY_TRANSACTION, &subvol_status, + _gf_true); + if ((subvol_status & ALL_FOOLS) || + (subvol_status & SPLIT_BRAIN)) { + gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " + "merge", sh->parent_loc.path); + afr_mark_success_children_sources (sh->sources, + sh->success_children, + priv->child_count); + } else if (nsources < 0) { + gf_log (this->name, GF_LOG_ERROR, "No sources for dir " + "of %s, in missing entry self-heal, aborting " + "self-heal", local->loc.path); goto out; } -- cgit