diff options
author | Vikas Gorur <vikas@gluster.com> | 2009-03-24 05:44:59 -0700 |
---|---|---|
committer | Anand V. Avati <avati@amp.gluster.com> | 2009-03-24 21:53:19 +0530 |
commit | 6e8017479fd9997ae47e7c8cbb74247d7e2b4fd0 (patch) | |
tree | dbd48b0cd4c089db017f1f09ae04abb6a8f53d89 /xlators/cluster | |
parent | 8f590ad883c170827cc06dcbcb17e5aaa1788996 (diff) |
Made self heal logic more precise.
Discard earlier patch sent for the same error. This patch fixes
it more comprehensively.
This solves the spurious split-brain seen by many users.
Signed-off-by: Anand V. Avati <avati@amp.gluster.com>
Diffstat (limited to 'xlators/cluster')
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 255 |
1 files changed, 233 insertions, 22 deletions
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index aea31a748f5..04530ccb0a0 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -211,43 +211,254 @@ afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], /** * mark_sources: Mark all 'source' nodes and return number of source * nodes found + * + * A node (a row in the pending matrix) belongs to one of + * three categories: + * + * M is the pending matrix. + * + * 'innocent' - M[i] is all zeroes + * 'fool' - M[i] has i'th element = 1 (self-reference) + * 'wise' - M[i] has i'th element = 0, others are 1 or 0. + * + * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is + * needed. + * + * A 'wise' node can be a source. If two 'wise' nodes conflict, it is + * a split-brain. If one wise node refers to the other but the other doesn't + * refer back, the referrer is a source. + * + * All fools are sinks, unless there are no 'wise' nodes. In that case, + * one of the fools is made a source. */ +typedef enum { + AFR_NODE_INNOCENT, + AFR_NODE_FOOL, + AFR_NODE_WISE +} afr_node_type; + +typedef struct { + afr_node_type type; + int wisdom; +} afr_node_character; + + +static int +afr_sh_is_innocent (int32_t *array, int child_count) +{ + int i = 0; + int ret = 1; /* innocent until proven guilty */ + + for (i = 0; i < child_count; i++) { + if (array[i]) { + ret = 0; + break; + } + } + + return ret; +} + + +static int +afr_sh_is_fool (int32_t *array, int i, int child_count) +{ + return array[i]; /* fool if accuses itself */ +} + + +static int +afr_sh_is_wise (int32_t *array, int i, int child_count) +{ + return !array[i]; /* wise if does not accuse itself */ +} + + +static int +afr_sh_all_nodes_innocent (afr_node_character *characters, + int child_count) +{ + int i = 0; + int ret = 1; + + for (i = 0; i < child_count; i++) { + if (characters[i].type != AFR_NODE_INNOCENT) { + ret = 0; + break; + } + } + + return ret; +} + + +static int +afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count) +{ + int i = 0; + int ret = 0; + + for (i = 0; i < child_count; i++) { + if (characters[i].type == AFR_NODE_WISE) { + ret = 1; + break; + } + } + + return ret; +} + + +/* + * The 'wisdom' of a wise node is 0 if any other wise node accuses to it. + * It is 1 if no other wise node accuses it. + * Only wise nodes with wisdom 1 are sources. + * + * If no nodes with wisdom 1 exist, a split-brain has occured. + */ + +static void +afr_sh_compute_wisdom (int32_t *pending_matrix[], + afr_node_character characters[], int child_count) +{ + int i = 0; + int j = 0; + + for (i = 0; i < child_count; i++) { + if (characters[i].type == AFR_NODE_WISE) { + characters[i].wisdom = 1; + + for (j = 0; j < child_count; j++) { + if ((characters[j].type == AFR_NODE_WISE) + && pending_matrix[j][i]) { + + characters[i].wisdom = 0; + } + } + } + } +} + + +static int +afr_sh_wise_nodes_conflict (afr_node_character *characters, + int child_count) +{ + int i = 0; + int ret = 1; + + for (i = 0; i < child_count; i++) { + if ((characters[i].type == AFR_NODE_WISE) + && characters[i].wisdom == 1) { + + /* There is atleast one bona-fide wise node */ + ret = 0; + break; + } + } + + return ret; +} + + +static int +afr_sh_mark_wisest_as_sources (int sources[], + afr_node_character *characters, + int child_count) +{ + int nsources = 0; + + int i = 0; + + for (i = 0; i < child_count; i++) { + if (characters[i].wisdom == 1) { + sources[i] = 1; + nsources++; + } + } + + return nsources; +} + + +static int +afr_sh_mark_a_fool_as_source (int sources[], afr_node_character *characters, + int child_count) +{ + int i = 0; + + int nsources = 0; + + for (i = 0; i < child_count; i++) { + if (characters[i].type == AFR_NODE_FOOL) { + sources[i] = 1; + nsources++; + break; + } + } + + return nsources; +} + + int afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count) { int i = 0; - int j = 0; int nsources = 0; + /* stores the 'characters' (innocent, fool, wise) of the nodes */ + afr_node_character * + characters = CALLOC (sizeof (afr_node_character), + child_count); /* start clean */ for (i = 0; i < child_count; i++) { sources[i] = 0; } + + for (i = 0; i < child_count; i++) { + if (afr_sh_is_innocent (pending_matrix[i], child_count)) { + characters[i].type = AFR_NODE_INNOCENT; + + } else if (afr_sh_is_fool (pending_matrix[i], i, child_count)) { + characters[i].type = AFR_NODE_FOOL; + + } else if (afr_sh_is_wise (pending_matrix[i], i, child_count)) { + characters[i].type = AFR_NODE_WISE; + + } else { + gf_log ("[module:afr]", GF_LOG_ERROR, + "node %d is diabolical! " + "(This message should never appear." + " Please file a bug report.)", i); + } + } + + if (afr_sh_all_nodes_innocent (characters, child_count)) { + /* no self-heal needed */ + goto out; + + } else if (afr_sh_wise_nodes_exist (characters, child_count)) { + afr_sh_compute_wisdom (pending_matrix, characters, child_count); + + if (afr_sh_wise_nodes_conflict (characters, child_count)) { + /* split-brain */ + goto out; + + } else { + nsources = afr_sh_mark_wisest_as_sources (sources, + characters, + child_count); + } + } else { + nsources = afr_sh_mark_a_fool_as_source (sources, characters, + child_count); + } - /* - Let's 'normalize' the pending matrix first, - by disregarding all pending entries that refer - to themselves - */ - for (i = 0; i < child_count; i++) { - pending_matrix[i][i] = 0; - } - - for (i = 0; i < child_count; i++) { - for (j = 0; j < child_count; j++) { - if (pending_matrix[j][i]) - break; - } - - if (j == child_count) { - nsources++; - sources[i] = 1; - } - } - +out: return nsources; } |