diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-self-heal-common.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 1968 |
1 files changed, 1296 insertions, 672 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 169d549d9..ef92b4205 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1,20 +1,11 @@ /* - Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com> + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> This file is part of GlusterFS. - GlusterFS is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 3 of the License, - or (at your option) any later version. - - GlusterFS is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see - <http://www.gnu.org/licenses/>. + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. */ #include "glusterfs.h" @@ -27,6 +18,52 @@ #include "afr-self-heal.h" #include "pump.h" +#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + +#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ + AFR_SELF_HEAL_FAILED == status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + + +void +afr_sh_reset (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + memset (sh->child_errno, 0, + sizeof (*sh->child_errno) * priv->child_count); + memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count); + memset (sh->parentbufs, 0, + sizeof (*sh->parentbufs) * priv->child_count); + memset (sh->success, 0, sizeof (*sh->success) * priv->child_count); + memset (sh->locked_nodes, 0, + sizeof (*sh->locked_nodes) * priv->child_count); + sh->active_sinks = 0; + + afr_reset_xattr (sh->xattr, priv->child_count); +} + //Intersection[child]=1 if child is part of intersection void afr_children_intersection_get (int32_t *set1, int32_t *set2, @@ -81,21 +118,6 @@ afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this) sh->active_sinks = active_sinks; } -/** - * sink_count - return number of sinks in sources array - */ - -int -afr_sh_sink_count (int sources[], int child_count) -{ - int i = 0; - int sinks = 0; - for (i = 0; i < child_count; i++) - if (!sources[i]) - sinks++; - return sinks; -} - int afr_sh_source_count (int sources[], int child_count) { @@ -112,8 +134,8 @@ void afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) { sh->op_ret = -1; - if (afr_error_more_important (sh->op_errno, op_errno)) - sh->op_errno = op_errno; + sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, + _gf_false); } void @@ -135,13 +157,85 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); } sprintf (ptr, "]"); - gf_log (this->name, GF_LOG_DEBUG, - "pending_matrix: %s", buf); + gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf); } GF_FREE (buf); } +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +{ + afr_private_t * priv = this->private; + char *buf = NULL; + char *ptr = NULL; + int i = 0; + int j = 0; + int child_count = priv->child_count; + char *matrix_begin = "[ [ "; + char *matrix_end = "] ]"; + char *seperator = "] [ "; + int pending_entry_strlen = 12; //Including space after entry + int matrix_begin_strlen = 0; + int matrix_end_strlen = 0; + int seperator_strlen = 0; + int string_length = 0; + char *msg = "- Pending matrix: "; + + /* + * for a list of lists of [ [ a b ] [ c d ] ] + * */ + + matrix_begin_strlen = strlen (matrix_begin); + matrix_end_strlen = strlen (matrix_end); + seperator_strlen = strlen (seperator); + string_length = matrix_begin_strlen + matrix_end_strlen + + (child_count -1) * seperator_strlen + + (child_count * child_count * pending_entry_strlen); + + buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); + if (!buf) + goto out; + + ptr = buf; + ptr += sprintf (ptr, "%s", msg); + ptr += sprintf (ptr, "%s", matrix_begin); + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + if (i < priv->child_count -1) + ptr += sprintf (ptr, "%s", seperator); + } + + ptr += sprintf (ptr, "%s", matrix_end); + +out: + return buf; +} + +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc) +{ + char *buf = NULL; + char *free_ptr = NULL; + + buf = afr_get_pending_matrix_str (pending_matrix, this); + if (buf) + free_ptr = buf; + else + buf = ""; + + + gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" + " (possible split-brain). Please delete the file from all but " + "the preferred subvolume.%s", loc, buf); + GF_FREE (free_ptr); + return; +} + + void afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) { @@ -180,6 +274,7 @@ afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix, int afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, + unsigned char *ignorant_subvols, dict_t *xattr[], afr_transaction_type type, size_t child_count) { @@ -190,12 +285,6 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, int i = 0; int j = 0; int k = 0; - unsigned char *ignorant_subvols = NULL; - - ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), child_count, - gf_afr_mt_char); - if (NULL == ignorant_subvols) - goto out; afr_init_pending_matrix (pending_matrix, child_count); @@ -213,7 +302,8 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, * subvolume. */ - ignorant_subvols[i] = 1; + if (ignorant_subvols) + ignorant_subvols[i] = 1; continue; } @@ -224,19 +314,14 @@ afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, } } - afr_mark_ignorant_subvols_as_pending (pending_matrix, - ignorant_subvols, - child_count); - GF_FREE (ignorant_subvols); -out: return ret; } typedef enum { + AFR_NODE_INVALID, AFR_NODE_INNOCENT, AFR_NODE_FOOL, AFR_NODE_WISE, - AFR_NODE_INVALID = -1, } afr_node_type; typedef struct { @@ -316,7 +401,7 @@ afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) * It is 1 if no other wise node accuses it. * Only wise nodes with wisdom 1 are sources. * - * If no nodes with wisdom 1 exist, a split-brain has occured. + * If no nodes with wisdom 1 exist, a split-brain has occurred. */ static void @@ -416,6 +501,8 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, { int i = 0; int biggest_witness = -1; + int biggest_witness_idx = -1; + int biggest_witness_cnt = -1; GF_ASSERT (witnesses); GF_ASSERT (characters); @@ -425,10 +512,21 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, if (characters[i].type != AFR_NODE_FOOL) continue; - if (biggest_witness < witnesses[i]) + if (biggest_witness < witnesses[i]) { biggest_witness = witnesses[i]; + biggest_witness_idx = i; + biggest_witness_cnt = 1; + continue; + } + + if (biggest_witness == witnesses[i]) + biggest_witness_cnt++; } - return biggest_witness; + + if (biggest_witness_cnt != 1) + return -1; + + return biggest_witness_idx; } int @@ -456,10 +554,84 @@ afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, return nsources; } + +int +afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) +{ + if (idx >= 0 && idx < child_count) { + sources[idx] = 1; + return 1; + } + return 0; +} + + +static int +afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_size = 0; + uint64_t min_size = 0; + int num_children = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_size > max_size) { + max_size = bufs[child].ia_size; + idx = child; + } + + if ((num_children == 0) || (bufs[child].ia_size < min_size)) { + min_size = bufs[child].ia_size; + } + + num_children++; + } + + /* If sizes are same for all of them, finding sources will have to + * happen with pending changelog. So return -1 + */ + if ((num_children > 1) && (min_size == max_size)) + return -1; + return idx; +} + + +static int +afr_find_newest_file (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_ctime = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_ctime > max_ctime) { + max_ctime = bufs[child].ia_ctime; + idx = child; + } + } + + return idx; +} + + static int afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, afr_node_character *characters, - int child_count) + int32_t *success_children, + int child_count, struct iatt *bufs) { int32_t biggest_witness = 0; int nsources = 0; @@ -467,6 +639,11 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, GF_ASSERT (child_count > 0); + biggest_witness = afr_find_largest_file_size (bufs, success_children, + child_count); + if (biggest_witness != -1) + goto found; + witnesses = GF_CALLOC (child_count, sizeof (*witnesses), gf_afr_mt_int32_t); if (NULL == witnesses) { @@ -479,34 +656,34 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, biggest_witness = afr_find_biggest_witness_among_fools (witnesses, characters, child_count); - nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, - characters, child_count, - biggest_witness); + if (biggest_witness != -1) + goto found; + + biggest_witness = afr_find_newest_file (bufs, success_children, + child_count); + +found: + nsources = afr_mark_fool_as_source_by_idx (sources, child_count, + biggest_witness); out: - if (witnesses) - GF_FREE (witnesses); + GF_FREE (witnesses); return nsources; } int afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, - int32_t *valid_children, int child_count, - uint32_t uid) + int32_t *success_children, + unsigned int child_count, uint32_t uid) { int i = 0; int nsources = 0; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (sources); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; + if (-1 == success_children[i]) + break; - child = valid_children[i]; + child = success_children[i]; if (uid == bufs[child].ia_uid) { sources[child] = 1; nsources++; @@ -516,21 +693,17 @@ afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs, } int -afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, - int child_count) +afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children, + unsigned int child_count) { int i = 0; int smallest = -1; int child = 0; - GF_ASSERT (bufs); - GF_ASSERT (valid_children); - GF_ASSERT (child_count > 0); - for (i = 0; i < child_count; i++) { - if (-1 == valid_children[i]) - continue; - child = valid_children[i]; + if (-1 == success_children[i]) + break; + child = success_children[i]; if ((smallest == -1) || (bufs[child].ia_uid < bufs[smallest].ia_uid)) { smallest = child; @@ -540,25 +713,97 @@ afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *valid_children, } static int -afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *valid_children, +afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children, int child_count, int32_t *sources) { int nsources = 0; int smallest = 0; - smallest = afr_get_child_with_lowest_uid (bufs, valid_children, + smallest = afr_get_child_with_lowest_uid (bufs, success_children, child_count); if (smallest < 0) { nsources = -1; goto out; } nsources = afr_mark_child_as_source_by_uid (sources, bufs, - valid_children, child_count, + success_children, child_count, bufs[smallest].ia_uid); out: return nsources; } +int +afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children, + struct iatt *bufs) +{ + afr_private_t *priv = NULL; + int i = 0; + int child = -1; + int read_child = -1; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (read_child < 0) + read_child = child; + else if (bufs[read_child].ia_size < bufs[child].ia_size) + read_child = child; + } + return read_child; +} + +int +afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children, + int child_count, int32_t *sources) +{ + int nsources = 0; + int i = 0; + int child = 0; + gf_boolean_t sink_exists = _gf_false; + gf_boolean_t source_exists = _gf_false; + int source = -1; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (!bufs[child].ia_size) { + sink_exists = _gf_true; + continue; + } + if (!source_exists) { + source_exists = _gf_true; + source = child; + continue; + } + if (bufs[source].ia_size != bufs[child].ia_size) { + nsources = -1; + goto out; + } + } + if (!source_exists && !sink_exists) { + nsources = -1; + goto out; + } + + if (!source_exists || !sink_exists) + goto out; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child < 0) + break; + if (bufs[child].ia_size) { + sources[child] = 1; + nsources++; + } + } +out: + return nsources; +} + char * afr_get_character_str (afr_node_type type) { @@ -583,12 +828,10 @@ afr_get_character_str (afr_node_type type) afr_node_type afr_find_child_character_type (int32_t *pending_row, int32_t child, - int32_t child_count, const char *xlator_name) + unsigned int child_count) { afr_node_type type = AFR_NODE_INVALID; - GF_ASSERT (pending_row); - GF_ASSERT (child_count > 0); GF_ASSERT ((child >= 0) && (child < child_count)); if (afr_sh_is_innocent (pending_row, child_count)) @@ -597,44 +840,85 @@ afr_find_child_character_type (int32_t *pending_row, int32_t child, type = AFR_NODE_FOOL; else if (afr_sh_is_wise (pending_row, child, child_count)) type = AFR_NODE_WISE; - else - GF_ASSERT (0); - - gf_log (xlator_name, GF_LOG_DEBUG, "child %d character %s", - child, afr_get_character_str (type)); return type; } int afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs, int32_t **pending_matrix, int32_t *sources, - int32_t *success_children, afr_transaction_type type) + int32_t *success_children, afr_transaction_type type, + int32_t *subvol_status, gf_boolean_t ignore_ignorant) { afr_private_t *priv = NULL; afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; int nsources = -1; + unsigned char *ignorant_subvols = NULL; + unsigned int child_count = 0; priv = this->private; + child_count = priv->child_count; if (afr_get_children_count (success_children, priv->child_count) == 0) goto out; + if (!ignore_ignorant) { + ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols), + child_count, gf_afr_mt_char); + if (NULL == ignorant_subvols) + goto out; + } + afr_build_pending_matrix (priv->pending_key, pending_matrix, - xattr, type, priv->child_count); + ignorant_subvols, xattr, type, + priv->child_count); + if (!ignore_ignorant) + afr_mark_ignorant_subvols_as_pending (pending_matrix, + ignorant_subvols, + priv->child_count); sh_type = afr_self_heal_type_for_transaction (type); if (AFR_SELF_HEAL_INVALID == sh_type) goto out; afr_sh_print_pending_matrix (pending_matrix, this); - nsources = afr_mark_sources (sources, pending_matrix, bufs, - priv->child_count, sh_type, - success_children, this->name); + nsources = afr_mark_sources (this, sources, pending_matrix, bufs, + sh_type, success_children, subvol_status); out: + GF_FREE (ignorant_subvols); return nsources; } +void +afr_find_character_types (afr_node_character *characters, + int32_t **pending_matrix, int32_t *success_children, + unsigned int child_count) +{ + afr_node_type type = AFR_NODE_INVALID; + int child = 0; + int i = 0; + + for (i = 0; i < child_count; i++) { + child = success_children[i]; + if (child == -1) + break; + type = afr_find_child_character_type (pending_matrix[child], + child, child_count); + characters[child].type = type; + } +} + +void +afr_mark_success_children_sources (int32_t *sources, int32_t *success_children, + unsigned int child_count) +{ + int i = 0; + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + sources[success_children[i]] = 1; + } +} /** * mark_sources: Mark all 'source' nodes and return number of source * nodes found @@ -660,17 +944,18 @@ out: */ int -afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, - int32_t child_count, afr_self_heal_type type, - int32_t *valid_children, const char *xlator_name) +afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, + struct iatt *bufs, afr_self_heal_type type, + int32_t *success_children, int32_t *subvol_status) { /* stores the 'characters' (innocent, fool, wise) of the nodes */ - afr_node_character *characters = NULL; - int i = 0; - int nsources = -1; - xlator_t *this = NULL; + int nsources = -1; + unsigned int child_count = 0; + afr_private_t *priv = NULL; + priv = this->private; + child_count = priv->child_count; characters = GF_CALLOC (sizeof (afr_node_character), child_count, gf_afr_mt_afr_node_character); if (!characters) @@ -679,28 +964,29 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, this = THIS; /* start clean */ - for (i = 0; i < child_count; i++) { - sources[i] = 0; - } - + memset (sources, 0, sizeof (*sources) * child_count); nsources = 0; - for (i = 0; i < child_count; i++) { - characters[i].type = - afr_find_child_character_type (pending_matrix[i], i, - child_count, - xlator_name); - if (AFR_NODE_INVALID == characters[i].type) - gf_log (xlator_name, GF_LOG_WARNING, - "child %d had invalid xattrs", i); - } - - if ((type == AFR_SELF_HEAL_METADATA) - && afr_sh_all_nodes_innocent (characters, child_count)) { - - nsources = afr_sh_mark_lowest_uid_as_source (bufs, - valid_children, + afr_find_character_types (characters, pending_matrix, success_children, + child_count); + if (afr_sh_all_nodes_innocent (characters, child_count)) { + switch (type) { + case AFR_SELF_HEAL_METADATA: + nsources = afr_sh_mark_lowest_uid_as_source (bufs, + success_children, + child_count, + sources); + break; + case AFR_SELF_HEAL_DATA: + nsources = afr_sh_mark_zero_size_file_as_sink (bufs, + success_children, child_count, sources); + if ((nsources < 0) && subvol_status) + *subvol_status |= SPLIT_BRAIN; + break; + default: + break; + } goto out; } @@ -708,32 +994,29 @@ afr_mark_sources (int32_t *sources, int32_t **pending_matrix, struct iatt *bufs, afr_sh_compute_wisdom (pending_matrix, characters, child_count); if (afr_sh_wise_nodes_conflict (characters, child_count)) { - /* split-brain */ - gf_log (this->name, GF_LOG_INFO, - "split-brain possible, no source detected"); + if (subvol_status) + *subvol_status |= SPLIT_BRAIN; nsources = -1; - } else { nsources = afr_sh_mark_wisest_as_sources (sources, characters, child_count); } } else { + if (subvol_status) + *subvol_status |= ALL_FOOLS; nsources = afr_mark_biggest_of_fools_as_source (sources, pending_matrix, characters, - child_count); + success_children, + child_count, bufs); } out: - if (nsources == 0) { - for (i = 0; i < child_count; i++) { - if (valid_children[i] != -1) - sources[valid_children[i]] = 1; - } - } - if (characters) - GF_FREE (characters); + if (nsources == 0) + afr_mark_success_children_sources (sources, success_children, + child_count); + GF_FREE (characters); gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources); return nsources; @@ -744,81 +1027,108 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type) { - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - int ret = 0; - int i = 0; - int j = 0; - int k = 0; - - /* start clean */ - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - delta_matrix[i][j] = 0; - } - } - - for (i = 0; i < child_count; i++) { - if (pending_raw) - pending_raw = NULL; - - for (j = 0; j < child_count; j++) { - ret = dict_get_ptr (xattr[i], priv->pending_key[j], - &pending_raw); - if (ret < 0) - gf_log (THIS->name, GF_LOG_DEBUG, - "Unable to get dict value."); - if (!success[j]) - continue; + int tgt = 0; + int src = 0; + int value = 0; - k = afr_index_for_transaction_type (type); + afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, + xattr, type, priv->child_count); - if (pending_raw != NULL) { - memcpy (pending, pending_raw, sizeof(pending)); - delta_matrix[i][j] = -(ntoh32 (pending[k])); - } else { - delta_matrix[i][j] = 0; + /* + * The algorithm here has two parts. First, for each subvol indexed + * as tgt, we try to figure out what count everyone should have for it. + * If the self-heal succeeded, that's easy; the value is zero. + * Otherwise, the value is the maximum of the succeeding nodes' counts. + * Once we know the value, we loop through (possibly for a second time) + * setting each count to the difference so that when we're done all + * succeeding nodes will have the same count for tgt. + */ + for (tgt = 0; tgt < priv->child_count; ++tgt) { + value = 0; + if (!success[tgt]) { + /* Find the maximum. */ + for (src = 0; src < priv->child_count; ++src) { + if (!success[src]) { + continue; + } + if (delta_matrix[src][tgt] > value) { + value = delta_matrix[src][tgt]; + } + } + } + /* Force everyone who succeeded to the chosen value. */ + for (src = 0; src < priv->child_count; ++src) { + if (success[src]) { + delta_matrix[src][tgt] = value + - delta_matrix[src][tgt]; + } + else { + delta_matrix[src][tgt] = 0; } - } } } int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this, int32_t *delta_matrix[], dict_t *xattr[], int child_count, afr_transaction_type type) { - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; + int i = 0; + int j = 0; + int k = 0; + int ret = 0; + int32_t *pending = NULL; + int32_t *local_pending = NULL; + afr_private_t *priv = NULL; + priv = this->private; for (i = 0; i < child_count; i++) { if (!xattr[i]) continue; + local_pending = NULL; for (j = 0; j < child_count; j++) { pending = GF_CALLOC (sizeof (int32_t), 3, gf_afr_mt_int32_t); - if (!pending) + if (!pending) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate pending entry " + "for %s[%d] on %s", + priv->pending_key[j], type, + priv->children[i]->name); continue; + } /* 3 = data+metadata+entry */ k = afr_index_for_transaction_type (type); pending[k] = hton32 (delta_matrix[i][j]); + if (j == i) { + local_pending = pending; + continue; + } ret = dict_set_bin (xattr[i], priv->pending_key[j], pending, - 3 * sizeof (int32_t)); - if (ret < 0) - gf_log (THIS->name, GF_LOG_WARNING, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "Unable to set dict value."); + GF_FREE (pending); + } + } + if (local_pending) { + ret = dict_set_bin (xattr[i], priv->pending_key[i], + local_pending, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value."); + GF_FREE (local_pending); + } } } return 0; @@ -826,146 +1136,23 @@ afr_sh_delta_to_xattr (afr_private_t *priv, int -afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; -} - - -int -afr_sh_has_data_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; -} - - -int -afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this) -{ - /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */ - int32_t pending[3] = {0,}; - void *pending_raw = NULL; - afr_private_t *priv = NULL; - int ret = -1; - int i = 0; - int j = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xattr, priv->pending_key[i], - &pending_raw); - - if (ret != 0) - return 0; - - memcpy (pending, pending_raw, sizeof(pending)); - j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION); - - if (pending[j]) - return 1; - } - - return 0; -} - - -/** - * is_matrix_zero - return true if pending matrix is all zeroes - */ - -int -afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) -{ - int i = 0; - int j = 0; - - for (i = 0; i < child_count; i++) - for (j = 0; j < child_count; j++) - if (pending_matrix[i][j]) - return 0; - return 1; -} - - -int afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; - afr_private_t *priv = NULL; - int i = 0; local = frame->local; sh = &local->self_heal; - priv = this->private; -// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); - memset (sh->buf, 0, sizeof (struct iatt) * priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - sh->locked_nodes[i] = 0; - } + afr_sh_reset (frame, this); - for (i = 0; i < priv->child_count; i++) { - if (sh->xattr[i]) - dict_unref (sh->xattr[i]); - sh->xattr[i] = NULL; - } - - if (local->govinda_gOvinda || sh->op_failed) { - gf_log (this->name, GF_LOG_INFO, + if (local->unhealable) { + gf_log (this->name, GF_LOG_DEBUG, "split brain found, aborting selfheal of %s", local->loc.path); - sh->op_failed = 1; + } + + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh->completion_cbk (frame, this); } else { gf_log (this->name, GF_LOG_TRACE, @@ -993,6 +1180,37 @@ afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) return 0; } +int +afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count) +{ + int ret = -ENOMEM; + sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf), + gf_afr_mt_iatt); + if (!sh->buf) + goto out; + sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs), + gf_afr_mt_iatt); + if (!sh->parentbufs) + goto out; + sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno), + gf_afr_mt_int); + if (!sh->child_errno) + goto out; + sh->success_children = afr_children_create (child_count); + if (!sh->success_children) + goto out; + sh->fresh_children = afr_children_create (child_count); + if (!sh->fresh_children) + goto out; + sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr), + gf_afr_mt_dict_t); + if (!sh->xattr) + goto out; + ret = 0; +out: + return ret; +} + void afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, xlator_t *this, @@ -1020,7 +1238,7 @@ afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie, sh->success_count++; sh->xattr[child_index] = dict_ref (xattr); } else { - gf_log (this->name, GF_LOG_ERROR, "path %s on subvolume" + gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume" " %s => -1 (%s)", loc->path, priv->children[child_index]->name, strerror (op_errno)); @@ -1049,64 +1267,140 @@ afr_valid_ia_type (ia_type_t ia_type) return _gf_false; } +int +afr_impunge_frame_create (call_frame_t *frame, xlator_t *this, + int active_source, call_frame_t **impunge_frame) +{ + afr_local_t *local = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int32_t op_errno = 0; + afr_private_t *priv = NULL; + int ret = 0; + call_frame_t *new_frame = NULL; + + op_errno = ENOMEM; + priv = this->private; + new_frame = copy_frame (frame); + if (!new_frame) { + goto out; + } + + AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out); + + local = frame->local; + new_frame->local = impunge_local; + impunge_sh = &impunge_local->self_heal; + impunge_sh->sh_frame = frame; + impunge_sh->active_source = active_source; + impunge_local->child_up = memdup (local->child_up, + sizeof (*local->child_up) * + priv->child_count); + if (!impunge_local->child_up) + goto out; + + impunge_local->pending = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!impunge_local->pending) + goto out; + + ret = afr_sh_common_create (impunge_sh, priv->child_count); + if (ret) { + op_errno = -ret; + goto out; + } + op_errno = 0; + *impunge_frame = new_frame; +out: + if (op_errno && new_frame) + AFR_STACK_DESTROY (new_frame); + return -op_errno; +} + void -afr_sh_call_entry_impunge_recreate (call_frame_t *frame, xlator_t *this, - int child_index, struct iatt *buf, - struct iatt *postparent, - afr_impunge_done_cbk_t impunge_done) +afr_sh_missing_entry_call_impunge_recreate (call_frame_t *frame, xlator_t *this, + struct iatt *buf, + struct iatt *postparent, + afr_impunge_done_cbk_t impunge_done) { call_frame_t *impunge_frame = NULL; afr_local_t *local = NULL; afr_local_t *impunge_local = NULL; afr_self_heal_t *sh = NULL; afr_self_heal_t *impunge_sh = NULL; + int ret = 0; + unsigned int enoent_count = 0; + afr_private_t *priv = NULL; + int i = 0; int32_t op_errno = 0; - impunge_frame = copy_frame (frame); - if (!impunge_frame) { - op_errno = ENOMEM; + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + enoent_count = afr_errno_count (NULL, sh->child_errno, + priv->child_count, ENOENT); + if (!enoent_count) { + gf_log (this->name, GF_LOG_INFO, + "no missing files - %s. proceeding to metadata check", + local->loc.path); goto out; } - - ALLOC_OR_GOTO (impunge_local, afr_local_t, out); - - local = frame->local; - sh = &local->self_heal; - impunge_frame->local = impunge_local; - impunge_sh = &impunge_local->self_heal; - impunge_sh->sh_frame = frame; - impunge_sh->active_source = sh->source; - impunge_sh->impunging_entry_mode = st_mode_from_ia (buf->ia_prot, - buf->ia_type); - impunge_sh->impunge_ret_child = child_index; - loc_copy (&impunge_local->loc, &local->loc); sh->impunge_done = impunge_done; - impunge_local->call_count = 1; - afr_sh_entry_impunge_create (impunge_frame, this, child_index, buf, - postparent); + ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame); + if (ret) + goto out; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + loc_copy (&impunge_local->loc, &local->loc); + ret = afr_build_parent_loc (&impunge_sh->parent_loc, + &impunge_local->loc, &op_errno); + if (ret) { + ret = -op_errno; + goto out; + } + impunge_local->call_count = enoent_count; + impunge_sh->entrybuf = sh->buf[sh->source]; + impunge_sh->parentbuf = sh->parentbufs[sh->source]; + for (i = 0; i < priv->child_count; i++) { + if (!impunge_local->child_up[i]) { + impunge_sh->child_errno[i] = ENOTCONN; + continue; + } + if (sh->child_errno[i] != ENOENT) { + impunge_sh->child_errno[i] = EEXIST; + continue; + } + } + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] != ENOENT) + continue; + afr_sh_entry_impunge_create (impunge_frame, this, i); + enoent_count--; + } + GF_ASSERT (!enoent_count); return; out: - gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, reason: %s", - local->loc.path, strerror (op_errno)); - impunge_done (frame, this, child_index, -1, op_errno); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " + "reason: %s", local->loc.path, strerror (-ret)); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } + afr_sh_missing_entries_finish (frame, this); } int -afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, int child, +afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno) { - int call_count = 0; afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; - - if (op_ret == -1) - gf_log (this->name, GF_LOG_ERROR, - "create entry %s failed, on child %d reason, %s", - local->loc.path, child, strerror (op_errno)); - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_missing_entries_finish (frame, this); + sh = &local->self_heal; + if (op_ret < 0) + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); return 0; } @@ -1116,26 +1410,11 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; int type = 0; - afr_private_t *priv = NULL; - int enoent_count = 0; - int i = 0; struct iatt *buf = NULL; struct iatt *postparent = NULL; local = frame->local; sh = &local->self_heal; - priv = this->private; - - enoent_count = afr_errno_count (NULL, sh->child_errno, - priv->child_count, ENOENT); - if (enoent_count == 0) { - gf_log (this->name, GF_LOG_INFO, - "no missing files - %s. proceeding to metadata check", - local->loc.path); - /* proceed to next step - metadata self-heal */ - afr_sh_missing_entries_finish (frame, this); - return 0; - } buf = &sh->buf[sh->source]; postparent = &sh->parentbufs[sh->source]; @@ -1144,72 +1423,80 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this) if (!afr_valid_ia_type (type)) { gf_log (this->name, GF_LOG_ERROR, "%s: unknown file type: 0%o", local->loc.path, type); - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); afr_sh_missing_entries_finish (frame, this); goto out; } - local->call_count = enoent_count; - for (i = 0; i < priv->child_count; i++) { - //If !child_up errno will be zero - if (sh->child_errno[i] != ENOENT) - continue; - afr_sh_call_entry_impunge_recreate (frame, this, i, + afr_sh_missing_entry_call_impunge_recreate (frame, this, buf, postparent, afr_sh_create_entry_cbk); - enoent_count--; - } - GF_ASSERT (enoent_count == 0); out: return 0; } void -afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this) +afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int32_t op_errno = 0; ia_type_t ia_type = IA_INVAL; int32_t nsources = 0; + loc_t *loc = NULL; + int32_t subvol_status = 0; + afr_transaction_type txn_type = AFR_DATA_TRANSACTION; + gf_boolean_t split_brain = _gf_false; + int read_child = -1; local = frame->local; sh = &local->self_heal; priv = this->private; + loc = &local->loc; - if (afr_get_children_count (sh->success_children, - priv->child_count) == 0) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); - goto out; - } - - if (afr_gfid_missing_count (this->name, sh->success_children, - sh->buf, priv->child_count, - local->loc.path) || - afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, local->loc.path, - this->name)) { - //this can happen if finding the fresh parent dir failed - local->govinda_gOvinda = 1; - sh->op_failed = 1; - op_errno = EIO; + if (op_ret < 0) { + if (op_errno == EIO) { + afr_set_local_for_unhealable (local); + } + // EIO can happen if finding the fresh parent dir failed goto out; } //now No chance for the ia_type to conflict ia_type = sh->buf[sh->success_children[0]].ia_type; + txn_type = afr_transaction_type_get (ia_type); nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, - sh->success_children, - afr_transaction_type_get (ia_type)); + sh->success_children, txn_type, + &subvol_status, _gf_false); if (nsources < 0) { gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," " in missing entry self-heal, continuing with the rest" " of the self-heals", local->loc.path); - op_errno = EIO; - goto out; + if (subvol_status & SPLIT_BRAIN) { + split_brain = _gf_true; + switch (txn_type) { + case AFR_DATA_TRANSACTION: + nsources = 1; + sh->sources[sh->success_children[0]] = 1; + break; + case AFR_ENTRY_TRANSACTION: + read_child = afr_get_no_xattr_dir_read_child + (this, + sh->success_children, + sh->buf); + sh->sources[read_child] = 1; + nsources = 1; + break; + default: + op_errno = EIO; + goto out; + } + } else { + op_errno = EIO; + goto out; + } } afr_get_fresh_children (sh->success_children, sh->sources, @@ -1224,34 +1511,77 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this) if (sh->gfid_sh_success_cbk) sh->gfid_sh_success_cbk (frame, this); sh->type = sh->buf[sh->source].ia_type; - sh_missing_entries_create (frame, this); + if (uuid_is_null (loc->inode->gfid)) + uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid); + if (split_brain) { + afr_sh_missing_entries_finish (frame, this); + } else { + sh_missing_entries_create (frame, this); + } return; out: + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; } static int -afr_sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) +afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xattr, + struct iatt *postparent) { int call_count = 0; afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; local = frame->local; + sh = &local->self_heal; + priv = this->private; afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, op_errno, inode, buf, xattr, - postparent, &local->loc); + postparent, &sh->lookup_loc); call_count = afr_frame_return (frame); - if (call_count == 0) - afr_sh_missing_entries_lookup_done (frame, this); + if (call_count) + goto out; + op_ret = -1; + if (!sh->success_count) { + op_errno = afr_resultant_errno_get (NULL, sh->child_errno, + priv->child_count); + gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, " + "reason %s", sh->lookup_loc.path, + strerror (op_errno)); + goto done; + } + if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) && + (afr_conflicting_iattrs (sh->buf, sh->success_children, + priv->child_count, + sh->lookup_loc.path, this->name))) { + op_errno = EIO; + gf_log (this->name, GF_LOG_ERROR, "Conflicting entries " + "for %s", sh->lookup_loc.path); + goto done; + } + + if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) && + (afr_gfid_missing_count (this->name, sh->success_children, + sh->buf, priv->child_count, + sh->lookup_loc.path))) { + op_errno = ENODATA; + gf_log (this->name, GF_LOG_ERROR, "Missing Gfids " + "for %s", sh->lookup_loc.path); + goto done; + } + op_ret = 0; + +done: + sh->lookup_done (frame, this, op_ret, op_errno); +out: return 0; } @@ -1274,7 +1604,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, LOCK (&frame->lock); { afr_sh_set_error (sh, EIO); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } UNLOCK (&frame->lock); } @@ -1287,6 +1617,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, void afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, int child_index, struct iatt *buf, + struct iatt *parentbuf, afr_expunge_done_cbk_t expunge_done) { call_frame_t *expunge_frame = NULL; @@ -1295,13 +1626,14 @@ afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, afr_self_heal_t *sh = NULL; afr_self_heal_t *expunge_sh = NULL; int32_t op_errno = 0; + int ret = 0; expunge_frame = copy_frame (frame); if (!expunge_frame) { goto out; } - ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out); local = frame->local; sh = &local->self_heal; @@ -1309,8 +1641,15 @@ afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this, expunge_sh = &expunge_local->self_heal; expunge_sh->sh_frame = frame; loc_copy (&expunge_local->loc, &local->loc); + ret = afr_build_parent_loc (&expunge_sh->parent_loc, + &expunge_local->loc, &op_errno); + if (ret) { + ret = -op_errno; + goto out; + } sh->expunge_done = expunge_done; - afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf); + afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf, + parentbuf); return; out: gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s", @@ -1347,15 +1686,18 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_missing_entries_finish (frame, this); } else { if (afr_gfid_missing_count (this->name, sh->fresh_children, sh->buf, priv->child_count, local->loc.path)) { afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_cbk, - _gf_true); + afr_sh_missing_entries_lookup_done, + sh->sh_gfid_req, + AFR_LOOKUP_FAIL_CONFLICTS| + AFR_LOOKUP_FAIL_MISSING_GFIDS, + NULL); } else { //No need to set gfid so goto missing entries lookup done //Behave as if you have done the lookup @@ -1366,7 +1708,7 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) afr_children_copy (sh->success_children, sh->fresh_children, priv->child_count); - afr_sh_missing_entries_lookup_done (frame, this); + afr_sh_missing_entries_lookup_done (frame, this, 0, 0); } } return 0; @@ -1437,9 +1779,10 @@ afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, if (!purge_condition (local, priv, i)) continue; gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " - "on %d", local->loc.path, i); + "on %s", local->loc.path, priv->children[i]->name); afr_sh_call_entry_expunge_remove (frame, this, (long) i, &sh->buf[i], + &sh->parentbufs[i], afr_sh_remove_entry_cbk); } out: @@ -1521,35 +1864,34 @@ afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs, } void -afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this) +afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh, + unsigned int child_count) +{ + afr_children_intersection_get (sh->success_children, + sh->fresh_parent_dirs, + sh->sources, child_count); + afr_get_fresh_children (sh->success_children, sh->sources, + sh->fresh_children, child_count); + memset (sh->sources, 0, sizeof (*sh->sources) * child_count); +} + +void +afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; int32_t fresh_child_enoents = 0; int32_t fresh_parent_count = 0; - int32_t op_errno = 0; local = frame->local; sh = &local->self_heal; priv = this->private; - if (afr_get_children_count (sh->success_children, - priv->child_count) == 0) { - op_errno = afr_resultant_errno_get (NULL, sh->child_errno, - priv->child_count); + if (op_ret < 0) goto fail; - } - - //make intersection of (success_children & fresh_parent_dirs) fresh_children - //the other success_children will be added to it if they are not stale - afr_children_intersection_get (sh->success_children, - sh->fresh_parent_dirs, - sh->sources, priv->child_count); - afr_get_fresh_children (sh->success_children, sh->sources, - sh->fresh_children, priv->child_count); - memset (sh->sources, 0, sizeof (*sh->sources) * priv->child_count); - + afr_get_children_of_fresh_parent_dirs (sh, priv->child_count); fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs, priv->child_count); //we need the enoent count of the subvols present in fresh_parent_dirs @@ -1557,10 +1899,8 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this) sh->child_errno, priv->child_count, ENOENT); if (fresh_child_enoents == fresh_parent_count) { - gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s", - local->loc.path); afr_sh_set_error (sh, ENOENT); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_purge_entry (frame, this); } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, priv->child_count, local->loc.path, @@ -1574,42 +1914,22 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this) afr_sh_purge_stale_entry (frame, this); } else { op_errno = EIO; - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); goto fail; } return; fail: + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; } -static int -afr_sh_children_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, dict_t *xattr, - struct iatt *postparent) -{ - int call_count = 0; - afr_local_t *local = NULL; - - local = frame->local; - - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &local->loc); - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_children_lookup_done (frame, this); - - return 0; -} - -static int -afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this) +static void +afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, + int32_t op_ret, int32_t op_errno) { afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; @@ -1617,54 +1937,42 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this) int enoent_count = 0; int nsources = 0; int source = -1; + int32_t subvol_status = 0; local = frame->local; sh = &local->self_heal; priv = this->private; - /* If We can't find a fresh parent directory here, - * we wont know which subvol is correct without finding a parent dir - * upwards which has correct xattrs, for that we may have to - * do lookups till root, we dont wanna do that, - * instead make sure that if there are conflicting gfid - * parent dirs, self-heal thus lookup is failed with EIO. - * if there are missing entries we dont know whether to delete or - * create so fail with EIO, - * If there are conflicting xattr fail with EIO. - */ - if (afr_get_children_count (sh->success_children, - priv->child_count) == 0) { - gf_log (this->name, GF_LOG_ERROR, "Parent dir lookup failed " - "for %s, in missing entry self-heal, continuing with " - "the rest of the self-heals", local->loc.path); + if (op_ret < 0) goto out; - } - enoent_count = afr_errno_count (NULL, sh->child_errno, priv->child_count, ENOENT); if (enoent_count > 0) { gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); - goto out; - } - - if (afr_conflicting_iattrs (sh->buf, sh->success_children, - priv->child_count, sh->parent_loc.path, - this->name)) { - gf_log (this->name, GF_LOG_INFO, "conflicting stat info for " - "parent dirs of %s", local->loc.path); - goto out; + " in missing entry self-heal, aborting missing-entry " + "self-heal", + local->loc.path); + afr_sh_missing_entries_finish (frame, this); + return; } nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, - AFR_ENTRY_TRANSACTION); - if (nsources < 0) { - gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s," - " in missing entry self-heal, continuing with the rest" - " of the self-heals", local->loc.path); + AFR_ENTRY_TRANSACTION, &subvol_status, + _gf_true); + if ((subvol_status & ALL_FOOLS) || + (subvol_status & SPLIT_BRAIN)) { + gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative " + "merge", sh->parent_loc.path); + afr_mark_success_children_sources (sh->sources, + sh->success_children, + priv->child_count); + } else if (nsources < 0) { + gf_log (this->name, GF_LOG_ERROR, "No sources for dir " + "of %s, in missing entry self-heal, aborting " + "self-heal", local->loc.path); + op_errno = EIO; goto out; } @@ -1672,44 +1980,21 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this) if (source == -1) { GF_ASSERT (0); gf_log (this->name, GF_LOG_DEBUG, "No active sources found."); + op_errno = EIO; goto out; } afr_get_fresh_children (sh->success_children, sh->sources, sh->fresh_parent_dirs, priv->child_count); afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_children_lookup_cbk, _gf_false); - return 0; + afr_sh_children_lookup_done, NULL, 0, + NULL); + return; out: - afr_sh_set_error (sh, EIO); - sh->op_failed = 1; - afr_sh_missing_entries_finish (frame, this); - return 0; -} - -int -afr_sh_conflicting_entry_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, - int32_t op_ret, int32_t op_errno, - inode_t *inode, struct iatt *buf, - dict_t *xattr, struct iatt *postparent) -{ - int call_count = 0; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - - afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret, - op_errno, inode, buf, xattr, - postparent, &sh->parent_loc); - call_count = afr_frame_return (frame); - - if (call_count == 0) - afr_sh_find_fresh_parents (frame, this); - - return 0; + afr_sh_set_error (sh, op_errno); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); + return; } void @@ -1727,6 +2012,7 @@ afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) afr_reset_children (sh->success_children, child_count); afr_reset_children (sh->fresh_children, child_count); afr_reset_xattr (sh->xattr, child_count); + loc_wipe (&sh->lookup_loc); } /* afr self-heal state will be lost if this call is made @@ -1734,7 +2020,8 @@ afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count) */ int afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, - afr_lookup_cbk_t lookup_cbk, gf_boolean_t set_gfid) + afr_lookup_done_cbk_t lookup_done , uuid_t gfid, + int32_t flags, dict_t *xdata) { afr_local_t *local = NULL; int i = 0; @@ -1755,16 +2042,19 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, if (xattr_req) { afr_xattr_req_prepare (this, xattr_req, loc->path); - if (set_gfid) { + if (gfid) { gf_log (this->name, GF_LOG_DEBUG, "looking up %s with gfid: %s", - loc->path, uuid_utoa (sh->sh_gfid_req)); - GF_ASSERT (!uuid_is_null (sh->sh_gfid_req)); - afr_set_dict_gfid (xattr_req, sh->sh_gfid_req); + loc->path, uuid_utoa (gfid)); + GF_ASSERT (!uuid_is_null (gfid)); + afr_set_dict_gfid (xattr_req, gfid); } } afr_sh_common_reset (sh, priv->child_count); + sh->lookup_done = lookup_done; + loc_copy (&sh->lookup_loc, loc); + sh->lookup_flags = flags; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { gf_log (this->name, GF_LOG_DEBUG, @@ -1772,7 +2062,7 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, loc->path, priv->children[i]->name); STACK_WIND_COOKIE (frame, - lookup_cbk, + afr_sh_common_lookup_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->lookup, @@ -1792,7 +2082,8 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, int -afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) +afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, + xlator_t *this) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; @@ -1805,38 +2096,16 @@ afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "Non blocking entrylks failed."); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_done (frame, this); } else { gf_log (this->name, GF_LOG_DEBUG, "Non blocking entrylks done. Proceeding to FOP"); afr_sh_common_lookup (frame, this, &sh->parent_loc, - afr_sh_conflicting_entry_lookup_cbk, - _gf_false); - } - - return 0; -} - -int -afr_sh_post_nb_entrylk_gfid_sh_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_sh_missing_entries_done (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_cbk, - _gf_true); + afr_sh_find_fresh_parents, + NULL, AFR_LOOKUP_FAIL_CONFLICTS, + NULL); } return 0; @@ -1848,7 +2117,9 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; + afr_private_t *priv = NULL; + priv = this->private; local = frame->local; int_lock = &local->internal_lock; @@ -1860,7 +2131,12 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, int_lock->lk_basename = base_name; int_lock->lk_loc = loc; int_lock->lock_cbk = lock_cbk; + int_lock->domain = this->name; + int_lock->lockee_count = 0; + afr_init_entry_lockee (&int_lock->lockee[0], local, loc, + base_name, priv->child_count); + int_lock->lockee_count++; afr_nonblocking_entrylk (frame, this); return 0; @@ -1872,6 +2148,9 @@ afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; + afr_internal_lock_t *int_lock = NULL; + int ret = -1; + int32_t op_errno = 0; local = frame->local; sh = &local->self_heal; @@ -1880,43 +2159,52 @@ afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this, "attempting to recreate missing entries for path=%s", local->loc.path); - GF_ASSERT (local->loc.parent); - afr_build_parent_loc (&sh->parent_loc, &local->loc); + ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno); + if (ret) + goto out; afr_sh_entrylk (frame, this, &sh->parent_loc, NULL, lock_cbk); return 0; -} - -static int -afr_self_heal_conflicting_entries (call_frame_t *frame, xlator_t *this) -{ - afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_conflicting_sh_cbk); +out: + int_lock = &local->internal_lock; + int_lock->lock_op_ret = -1; + lock_cbk (frame, this); return 0; } static int -afr_self_heal_gfids (call_frame_t *frame, xlator_t *this) +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_gfid_sh_cbk); + afr_sh_post_nb_entrylk_missing_entry_sh_cbk); return 0; } -afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) +afr_local_t* +afr_self_heal_local_init (afr_local_t *l, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; + afr_private_t *priv = NULL; + afr_local_t *lc = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *shc = NULL; + int ret = 0; priv = this->private; sh = &l->self_heal; - lc = GF_CALLOC (1, sizeof (afr_local_t), - gf_afr_mt_afr_local_t); + lc = mem_get0 (this->local_pool); if (!lc) goto out; @@ -1924,22 +2212,32 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) shc->unwind = sh->unwind; shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk; - shc->need_missing_entry_self_heal = sh->need_missing_entry_self_heal; - shc->need_gfid_self_heal = sh->need_gfid_self_heal; - shc->need_data_self_heal = sh->need_data_self_heal; - shc->need_metadata_self_heal = sh->need_metadata_self_heal; - shc->need_entry_self_heal = sh->need_entry_self_heal; + shc->do_missing_entry_self_heal = sh->do_missing_entry_self_heal; + shc->do_gfid_self_heal = sh->do_gfid_self_heal; + shc->do_data_self_heal = sh->do_data_self_heal; + shc->do_metadata_self_heal = sh->do_metadata_self_heal; + shc->do_entry_self_heal = sh->do_entry_self_heal; + shc->force_confirm_spb = sh->force_confirm_spb; shc->forced_merge = sh->forced_merge; - shc->data_lock_held = sh->data_lock_held; shc->background = sh->background; shc->type = sh->type; + shc->data_sh_info = ""; + shc->metadata_sh_info = ""; uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); + if (l->loc.path) { + ret = loc_copy (&lc->loc, &l->loc); + if (ret < 0) + goto out; + } lc->child_up = memdup (l->child_up, sizeof (*lc->child_up) * priv->child_count); + if (!lc->child_up) { + ret = -1; + goto out; + } + if (l->xattr_req) lc->xattr_req = dict_ref (l->xattr_req); @@ -1947,40 +2245,25 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); if (l->cont.lookup.xattr) lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.entry_locked_nodes) - lc->internal_lock.entry_locked_nodes = - memdup (l->internal_lock.entry_locked_nodes, - sizeof (*lc->internal_lock.entry_locked_nodes) * priv->child_count); - else - lc->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - sizeof (*lc->internal_lock.locked_nodes) * priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + lc->internal_lock.locked_nodes = + GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), + priv->child_count, gf_afr_mt_char); + if (!lc->internal_lock.locked_nodes) { + ret = -1; + goto out; + } + + ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret) + goto out; out: + if (ret) { + afr_local_cleanup (lc, this); + lc = NULL; + } return lc; } @@ -1990,32 +2273,39 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) afr_private_t * priv = NULL; afr_local_t * local = NULL; afr_self_heal_t * sh = NULL; + afr_local_t * orig_frame_local = NULL; + afr_self_heal_t * orig_frame_sh = NULL; char sh_type_str[256] = {0,}; - gf_boolean_t split_brain = _gf_false; + gf_loglevel_t loglevel = 0; priv = this->private; local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda) - split_brain = _gf_true; - - afr_set_split_brain (this, sh->inode, split_brain); + if (local->unhealable) { + afr_set_split_brain (this, sh->inode, SPB, SPB); + } afr_self_heal_type_str_get (sh, sh_type_str, sizeof(sh_type_str)); - if (sh->op_failed) { - gf_log (this->name, GF_LOG_ERROR, "background %s self-heal " - "failed on %s", sh_type_str, local->loc.path); + if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { + loglevel = GF_LOG_ERROR; + } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { + loglevel = GF_LOG_INFO; } else { - gf_log (this->name, GF_LOG_INFO, "background %s self-heal " - "completed on %s", sh_type_str, local->loc.path); + loglevel = GF_LOG_DEBUG; } + afr_log_self_heal_completion_status (local, loglevel); + FRAME_SU_UNDO (bgsh_frame, afr_local_t); if (!sh->unwound && sh->unwind) { - sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno); + orig_frame_local = sh->orig_frame->local; + orig_frame_sh = &orig_frame_local->self_heal; + orig_frame_sh->actual_sh_started = _gf_true; + sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, + is_self_heal_failed (sh, AFR_CHECK_ALL)); } if (sh->background) { @@ -2037,102 +2327,125 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; afr_private_t *priv = NULL; - int i = 0; - - call_frame_t *sh_frame = NULL; - afr_local_t *sh_local = NULL; + int32_t op_errno = 0; + int ret = 0; + afr_self_heal_t *orig_sh = NULL; + call_frame_t *sh_frame = NULL; + afr_local_t *sh_local = NULL; + loc_t *loc = NULL; local = frame->local; + orig_sh = &local->self_heal; priv = this->private; GF_ASSERT (local->loc.path); - if (local->self_heal.background) { - LOCK (&priv->lock); - { - if (priv->background_self_heals_started - < priv->background_self_heal_count) { - priv->background_self_heals_started++; - - - } else { - local->self_heal.background = _gf_false; - } - } - UNLOCK (&priv->lock); - } - gf_log (this->name, GF_LOG_TRACE, "performing self heal on %s (metadata=%d data=%d entry=%d)", local->loc.path, - local->self_heal.need_metadata_self_heal, - local->self_heal.need_data_self_heal, - local->self_heal.need_entry_self_heal); + local->self_heal.do_metadata_self_heal, + local->self_heal.do_data_self_heal, + local->self_heal.do_entry_self_heal); + op_errno = ENOMEM; sh_frame = copy_frame (frame); - afr_set_lk_owner (sh_frame, this); + if (!sh_frame) + goto out; + afr_set_lk_owner (sh_frame, this, sh_frame->root); + afr_set_low_priority (sh_frame); - sh_local = afr_local_copy (local, this); + sh_local = afr_self_heal_local_init (local, this); + if (!sh_local) + goto out; sh_frame->local = sh_local; sh = &sh_local->self_heal; sh->inode = inode_ref (inode); - sh->orig_frame = frame; sh->completion_cbk = afr_self_heal_completion_cbk; - sh->buf = GF_CALLOC (priv->child_count, sizeof (struct iatt), - gf_afr_mt_iatt); - sh->parentbufs = GF_CALLOC (priv->child_count, sizeof (struct iatt), - gf_afr_mt_iatt); - sh->child_errno = GF_CALLOC (priv->child_count, sizeof (int), - gf_afr_mt_int); sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success), gf_afr_mt_char); - sh->xattr = GF_CALLOC (priv->child_count, sizeof (dict_t *), - gf_afr_mt_dict_t); + if (!sh->success) + goto out; sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count, gf_afr_mt_int); + if (!sh->sources) + goto out; sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes), priv->child_count, gf_afr_mt_int); + if (!sh->locked_nodes) + goto out; - sh->pending_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); + sh->pending_matrix = afr_matrix_create (priv->child_count, + priv->child_count); + if (!sh->pending_matrix) + goto out; - for (i = 0; i < priv->child_count; i++) { - sh->pending_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); + sh->delta_matrix = afr_matrix_create (priv->child_count, + priv->child_count); + if (!sh->delta_matrix) + goto out; + + sh->fresh_parent_dirs = afr_children_create (priv->child_count); + if (!sh->fresh_parent_dirs) + goto out; + ret = afr_sh_common_create (sh, priv->child_count); + if (ret) { + op_errno = -ret; + goto out; } - sh->delta_matrix = GF_CALLOC (sizeof (int32_t *), priv->child_count, - gf_afr_mt_int32_t); - for (i = 0; i < priv->child_count; i++) { - sh->delta_matrix[i] = GF_CALLOC (sizeof (int32_t), - priv->child_count, - gf_afr_mt_int32_t); + if (local->self_heal.background) { + LOCK (&priv->lock); + { + if (priv->background_self_heals_started + < priv->background_self_heal_count) { + priv->background_self_heals_started++; + + + } else { + local->self_heal.background = _gf_false; + sh->background = _gf_false; + } + } + UNLOCK (&priv->lock); + } + + if (!local->loc.parent) { + sh->do_missing_entry_self_heal = _gf_false; + sh->do_gfid_self_heal = _gf_false; } - sh->success_children = afr_children_create (priv->child_count); - sh->fresh_children = afr_children_create (priv->child_count); - sh->fresh_parent_dirs = afr_children_create (priv->child_count); + sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; FRAME_SU_DO (sh_frame, afr_local_t); - if (sh->need_missing_entry_self_heal) { - afr_self_heal_conflicting_entries (sh_frame, this); - } else if (sh->need_gfid_self_heal) { - GF_ASSERT (!uuid_is_null (sh->sh_gfid_req)); - afr_self_heal_gfids (sh_frame, this); + if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { + afr_self_heal_missing_entries (sh_frame, this); } else { + loc = &sh_local->loc; + if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { + if (!uuid_is_null (inode->gfid)) + GF_ASSERT (!uuid_compare (inode->gfid, + sh->sh_gfid_req)); + uuid_copy (loc->gfid, sh->sh_gfid_req); + } gf_log (this->name, GF_LOG_TRACE, "proceeding to metadata check on %s", local->loc.path); afr_sh_missing_entries_done (sh_frame, this); } + op_errno = 0; +out: + if (op_errno) { + orig_sh->unwind (frame, this, -1, op_errno, 1); + if (sh_frame) + AFR_STACK_DESTROY (sh_frame); + } return 0; } @@ -2143,24 +2456,24 @@ afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str, GF_ASSERT (str && (size > strlen (" missing-entry gfid " "meta-data data entry"))); - if (self_heal_p->need_metadata_self_heal) { + if (self_heal_p->do_metadata_self_heal) { snprintf (str, size, " meta-data"); } - if (self_heal_p->need_data_self_heal) { + if (self_heal_p->do_data_self_heal) { snprintf (str + strlen(str), size - strlen(str), " data"); } - if (self_heal_p->need_entry_self_heal) { + if (self_heal_p->do_entry_self_heal) { snprintf (str + strlen(str), size - strlen(str), " entry"); } - if (self_heal_p->need_missing_entry_self_heal) { + if (self_heal_p->do_missing_entry_self_heal) { snprintf (str + strlen(str), size - strlen(str), " missing-entry"); } - if (self_heal_p->need_gfid_self_heal) { + if (self_heal_p->do_gfid_self_heal) { snprintf (str + strlen(str), size - strlen(str), " gfid"); } } @@ -2186,3 +2499,314 @@ afr_self_heal_type_for_transaction (afr_transaction_type type) } return sh_type; } + +int +afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + uuid_t pargfid = {0}; + + if (!child) + goto out; + + if (!uuid_is_null (parent->inode->gfid)) + uuid_copy (pargfid, parent->inode->gfid); + else if (!uuid_is_null (parent->gfid)) + uuid_copy (pargfid, parent->gfid); + + if (uuid_is_null (pargfid)) + goto out; + + if (strcmp (parent->path, "/") == 0) + ret = gf_asprintf ((char **)&child->path, "/%s", name); + else + ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path, + name); + + if (-1 == ret) { + gf_log (this->name, GF_LOG_ERROR, + "asprintf failed while setting child path"); + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + uuid_copy (child->pargfid, pargfid); + + if (!child->inode) { + ret = -1; + goto out; + } + + ret = 0; +out: + if ((ret == -1) && child) + loc_wipe (child); + + return ret; +} + +int +afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, + afr_transaction_type type, afr_fxattrop_cbk_t cbk, + int (*finish)(call_frame_t *frame, xlator_t *this)) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + int ret = -1; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix, + sh->success, priv->child_count, type); + + erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count, + gf_afr_mt_dict_t); + if (!erase_xattr) + goto out; + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + erase_xattr[i] = dict_new (); + if (!erase_xattr[i]) + goto out; + } + } + + afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, + priv->child_count, type); + + gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s", + lkowner_utoa (&frame->root->lk_owner)); + afr_sh_print_pending_matrix (sh->delta_matrix, this); + local->call_count = call_count; + if (call_count == 0) { + ret = 0; + finish (frame, this); + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction + STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + sh->healing_fd, + GF_XATTROP_ADD_ARRAY, erase_xattr[i], + NULL); + } else { + STACK_WIND_COOKIE (frame, cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i], + NULL); + } + } + + ret = 0; +out: + if (erase_xattr) { + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + } + + GF_FREE (erase_xattr); + + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + finish (frame, this); + } + + return 0; +} + +void +afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) +{ + xlator_t *this = NULL; + afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); + afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" + "Structure"); + goto out; + } + + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + sh_status->gfid_or_missing_entry_self_heal = status; + break; + case AFR_SELF_HEAL_METADATA: + sh_status->metadata_self_heal = status; + break; + case AFR_SELF_HEAL_DATA: + sh_status->data_self_heal = status; + break; + case AFR_SELF_HEAL_ENTRY: + sh_status->entry_self_heal = status; + break; + case AFR_SELF_HEAL_INVALID: + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" + "self heal type in action"); + break; + } +out: + return; +} + +void +afr_set_local_for_unhealable (afr_local_t *local) +{ + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + local->unhealable = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +} + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) +{ + afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; + afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; + afr_self_heal_status status = AFR_SELF_HEAL_FAILED; + xlator_t *this = NULL; + int sh_failed = 0; + + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " + "structure"); + sh_failed = 1; + goto out; + } + + if (type == AFR_CHECK_ALL) { + if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) + sh_failed = 1; + } else if (type == AFR_CHECK_SPECIFIC) { + sh_type_in_action = sh->sh_type_in_action; + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + status = sh_status.gfid_or_missing_entry_self_heal; + break; + case AFR_SELF_HEAL_METADATA: + status = sh_status.metadata_self_heal; + break; + case AFR_SELF_HEAL_ENTRY: + status = sh_status.entry_self_heal; + break; + case AFR_SELF_HEAL_DATA: + status = sh_status.data_self_heal; + break; + case AFR_SELF_HEAL_INVALID: + status = AFR_SELF_HEAL_NOT_ATTEMPTED; + break; + } + if (status == AFR_SELF_HEAL_FAILED) + sh_failed = 1; + + } + +out: + return sh_failed; +} + +char * +get_sh_completion_status (afr_self_heal_status status) +{ + + char *not_attempted = " is not attempted"; + char *failed = " failed"; + char *started = " is started"; + char *sync_begin = " is successfully completed"; + char *result = " has unknown status"; + + switch (status) + { + case AFR_SELF_HEAL_NOT_ATTEMPTED: + result = not_attempted; + break; + case AFR_SELF_HEAL_FAILED: + result = failed; + break; + case AFR_SELF_HEAL_STARTED: + result = started; + break; + case AFR_SELF_HEAL_SYNC_BEGIN: + result = sync_begin; + break; + } + + return result; + +} + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) +{ + + char sh_log[4096] = {0}; + afr_self_heal_t *sh = &local->self_heal; + afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; + xlator_t *this = NULL; + size_t off = 0; + int data_sh = 0; + int metadata_sh = 0; + int print_log = 0; + + this = THIS; + + ADD_FMT_STRING (sh_log, off, "gfid or missing entry", + all_status.gfid_or_missing_entry_self_heal, print_log); + ADD_FMT_STRING_SYNC (sh_log, off, "metadata", + all_status.metadata_self_heal, print_log); + if (sh->background) { + ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", + all_status.data_self_heal, print_log); + } else { + ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", + all_status.data_self_heal, print_log); + } + ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, + print_log); + + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && + strcmp (sh->data_sh_info, "") && sh->data_sh_info ) + data_sh = 1; + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && + strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) + metadata_sh = 1; + + if (!print_log) + return; + + gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, + ((data_sh == 1) ? sh->data_sh_info : ""), + ((metadata_sh == 1) ? sh->metadata_sh_info : ""), + local->loc.path); +} |
