From 8beaf169e39b262416e2274a028292379d39b310 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Fri, 9 Jan 2015 14:43:22 +0000 Subject: cluster/afr: split-brain resolution CLI Extend the AFR heal command to include automated split-brain resolution. This patch [3/3] is the final patch for afr automated split-brain resolution implementation. "gluster volume heal [full | statistics [heal-count [replica ]] |info [healed | heal-failed | split-brain]| split-brain {bigger-file |source-brick []}]" The new additions being: 1.gluster volume heal split-brain bigger-file Locates the replica containing the FILE, selects bigger-file as source and completes heal. 2.gluster volume heal split-brain source-brick Selects present in as source and completes heal. 3.gluster volume heal split-brain Selects all split-brained files in as source and completes heal. Note: can be either the full file name as seen from the root of the volume (or) the gfid-string representation of the file, which sometimes gets displayed in the heal info command's output. Entry/gfid split-brain resolution is not supported. Example can be found in the test case. Change-Id: I4649733922d406f14f28ee9033a5cb627b9538b3 BUG: 1136769 Signed-off-by: Ravishankar N Reviewed-on: http://review.gluster.org/9377 Reviewed-by: Pranith Kumar Karampuri Tested-by: Pranith Kumar Karampuri Tested-by: Gluster Build System --- xlators/cluster/afr/src/afr-self-heal-common.c | 191 ++++++++++++++++++++++++- 1 file changed, 190 insertions(+), 1 deletion(-) (limited to 'xlators/cluster/afr/src/afr-self-heal-common.c') diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 6198d4cf72c..e9d853c4ecd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -17,7 +17,7 @@ #include "afr.h" #include "afr-self-heal.h" #include "byte-order.h" - +#include "protocol-common.h" int afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -287,6 +287,39 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, return 0; } +/* + * If by chance there are multiple sources with differing sizes, select + * the largest file as the source. + * + * This can happen if data was directly modified in the backend or for snapshots + */ +void +afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; + + /* Find source with biggest file size */ + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + } + } + + /* Mark sources with less size as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size > replies[i].poststat.ia_size) + sources[i] = 0; + } +} + void afr_mark_active_sinks (xlator_t *this, unsigned char *sources, unsigned char *locked_on, unsigned char *sinks) @@ -303,6 +336,154 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources, } } +gf_boolean_t +afr_dict_contains_heal_op (call_frame_t *frame) +{ + afr_local_t *local = NULL; + dict_t *xdata_req = NULL; + int ret = 0; + int heal_op = -1; + + local = frame->local; + xdata_req = local->xdata_req; + ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); + if (ret) + return _gf_false; + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) + return _gf_true; + } + ret = dict_set_str (local->xdata_rsp, "sh-fail-msg", + "File not in split-brain"); + + return _gf_true; +} + +/* Return a source depending on the type of heal_op, and set sources[source], + * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so + * only if the following condition is met: + * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1)) + * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and + * sinks[node] are 1. This should be the case if the file is in split-brain. + */ +int +afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, + unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int ret = 0; + int heal_op = -1; + int i = 0; + char *name = NULL; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + if (sources[i] || !sinks[i] || !healed_sinks[i]) { + ret = -1; + goto out; + } + } + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + ret = -1; + goto out; + } + } + xdata_rsp = local->xdata_rsp; + + switch (heal_op) { + case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "Use source-brick option to" + " heal metadata split-brain"); + if (!ret) + ret = -1; + goto out; + } + for (i = 0 ; i < priv->child_count; i++) + if (locked_on[i]) + sources[i] = 1; + afr_mark_largest_file_as_source (this, sources, replies); + if (AFR_COUNT (sources, priv->child_count) != 1) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "No bigger file"); + if (!ret) + ret = -1; + goto out; + } + for (i = 0 ; i < priv->child_count; i++) + if (sources[i]) + source = i; + sinks[source] = 0; + healed_sinks[source] = 0; + break; + case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str (xdata_req, "child-name", &name); + if (ret) + goto out; + source = afr_get_child_index_from_name (this, name); + if (source < 0) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "Invalid brick name"); + if (!ret) + ret = -1; + goto out; + } + if (locked_on[source] != 1) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "Brick is not up"); + if (!ret) + ret = -1; + goto out; + } + sources[source] = 1; + sinks[source] = 0; + healed_sinks[source] = 0; + break; + default: + ret = -1; + goto out; + } + ret = source; +out: + return ret; + +} + +int +afr_get_child_index_from_name (xlator_t *this, char *name) +{ + afr_private_t *priv = this->private; + int index = -1; + + for (index = 0; index < priv->child_count; index++) { + if (!strcmp (priv->children[index]->name, name)) + goto out; + } + index = -1; +out: + return index; +} + + gf_boolean_t afr_does_witness_exist (xlator_t *this, uint64_t *witness) { @@ -427,6 +608,14 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, } } + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + } + /* In afr-v1 if a file is self-accused but didn't have any pending * operations on others then it is similar to 'dirty' in afr-v2. * Consider such cases as witness. -- cgit