From 8beaf169e39b262416e2274a028292379d39b310 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Fri, 9 Jan 2015 14:43:22 +0000 Subject: cluster/afr: split-brain resolution CLI Extend the AFR heal command to include automated split-brain resolution. This patch [3/3] is the final patch for afr automated split-brain resolution implementation. "gluster volume heal <VOLNAME> [full | statistics [heal-count [replica <HOSTNAME:BRICKNAME>]] | info [healed | heal-failed | split-brain] | split-brain {bigger-file <FILE> | source-brick <HOSTNAME:BRICKNAME> [<FILE>]}]" The new additions being: 1. gluster volume heal <VOLNAME> split-brain bigger-file <FILE> Locates the replica containing the FILE, selects the bigger file as source and completes heal. 2. gluster volume heal <VOLNAME> split-brain source-brick <HOSTNAME:BRICKNAME> <FILE> Selects <FILE> present in <HOSTNAME:BRICKNAME> as source and completes heal. 3. gluster volume heal <VOLNAME> split-brain source-brick <HOSTNAME:BRICKNAME> Selects all split-brained files in <HOSTNAME:BRICKNAME> as source and completes heal. Note: <FILE> can be either the full file name as seen from the root of the volume (or) the gfid-string representation of the file, which sometimes gets displayed in the heal info command's output. Entry/gfid split-brain resolution is not supported. Example can be found in the test case. 
Change-Id: I4649733922d406f14f28ee9033a5cb627b9538b3 BUG: 1136769 Signed-off-by: Ravishankar N Reviewed-on: http://review.gluster.org/9377 Reviewed-by: Pranith Kumar Karampuri Tested-by: Pranith Kumar Karampuri Tested-by: Gluster Build System --- xlators/cluster/afr/src/afr-common.c | 76 +++++++++ xlators/cluster/afr/src/afr-inode-read.c | 5 + xlators/cluster/afr/src/afr-self-heal-common.c | 191 ++++++++++++++++++++++- xlators/cluster/afr/src/afr-self-heal-data.c | 62 +++----- xlators/cluster/afr/src/afr-self-heal-metadata.c | 34 ++-- xlators/cluster/afr/src/afr-self-heal.h | 18 +++ xlators/cluster/afr/src/afr.h | 4 + xlators/cluster/dht/src/dht-common.c | 12 +- 8 files changed, 343 insertions(+), 59 deletions(-) (limited to 'xlators/cluster') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index f39db802588..e6d45add4e8 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4471,5 +4471,81 @@ out: AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); if (dict) dict_unref (dict); + if (inode) { + inode_forget (inode, 1); + inode_unref (inode); + } + return ret; +} + +int32_t +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + dict_t *dict = NULL; + afr_local_t *local = NULL; + inode_t *inode = NULL; + int entry_ret = 0, metadata_ret = 0, data_ret = 0; + int ret = 0, op_errno = 0; + + local = frame->local; + dict = dict_new (); + if (!dict) { + op_errno = ENOMEM; + ret = -1; + goto out; + } + + ret = afr_selfheal_unlocked_inspect (frame, this, loc->gfid, &inode, + &data_selfheal, + &metadata_selfheal, + &entry_selfheal); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + + if (!data_selfheal && !metadata_selfheal && !entry_selfheal) { + ret = dict_set_str (dict, "sh-fail-msg", + "File not in split-brain"); + 
if (ret) + gf_log (this->name, GF_LOG_WARNING, + "Failed to set sh-fail-msg in dict"); + ret = 0; + goto out; + } + + if (data_selfheal) + data_ret = afr_selfheal_data (frame, this, inode); + + if (metadata_selfheal) + metadata_ret = afr_selfheal_metadata (frame, this, inode); + + if (entry_selfheal) + entry_ret = afr_selfheal_entry (frame, this, inode); + + ret = (data_ret | metadata_ret | entry_ret); + + if (local->xdata_rsp) { + /* 'sh-fail-msg' has been set in the dict during self-heal.*/ + dict_copy (local->xdata_rsp, dict); + ret = 0; + } else if (ret) { + /*Some other error during self-heal. Just propagate it.*/ + op_errno = -ret; + ret = -1; + } + +out: + AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); + if (dict) + dict_unref(dict); + if (inode) { + inode_forget (inode, 1); + inode_unref (inode); + } return ret; } diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index e64070e1bcd..78dd65f30e7 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -1380,6 +1380,11 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, return 0; } + if (!strcmp (name, GF_AFR_HEAL_SBRAIN)) { + afr_heal_splitbrain_file (frame, this, loc); + return 0; + } + /* * if we are doing getxattr with pathinfo as the key then we * collect information from all childs diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 6198d4cf72c..e9d853c4ecd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -17,7 +17,7 @@ #include "afr.h" #include "afr-self-heal.h" #include "byte-order.h" - +#include "protocol-common.h" int afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -287,6 +287,39 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, return 0; } +/* + * If by chance there are multiple sources with differing 
sizes, select + * the largest file as the source. + * + * This can happen if data was directly modified in the backend or for snapshots + */ +void +afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; + + /* Find source with biggest file size */ + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; + } + } + + /* Mark sources with less size as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size > replies[i].poststat.ia_size) + sources[i] = 0; + } +} + void afr_mark_active_sinks (xlator_t *this, unsigned char *sources, unsigned char *locked_on, unsigned char *sinks) @@ -303,6 +336,154 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources, } } +gf_boolean_t +afr_dict_contains_heal_op (call_frame_t *frame) +{ + afr_local_t *local = NULL; + dict_t *xdata_req = NULL; + int ret = 0; + int heal_op = -1; + + local = frame->local; + xdata_req = local->xdata_req; + ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); + if (ret) + return _gf_false; + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) + return _gf_true; + } + ret = dict_set_str (local->xdata_rsp, "sh-fail-msg", + "File not in split-brain"); + + return _gf_true; +} + +/* Return a source depending on the type of heal_op, and set sources[source], + * sinks[source] and healed_sinks[source] to 1, 0 and 0 respectively. Do so + * only if the following condition is met: + * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1)) + * i.e. for each locked node, sources[node] is 0; healed_sinks[node] and + * sinks[node] are 1. This should be the case if the file is in split-brain. 
+ */ +int +afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, + unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int ret = 0; + int heal_op = -1; + int i = 0; + char *name = NULL; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + if (sources[i] || !sinks[i] || !healed_sinks[i]) { + ret = -1; + goto out; + } + } + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + ret = -1; + goto out; + } + } + xdata_rsp = local->xdata_rsp; + + switch (heal_op) { + case GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "Use source-brick option to" + " heal metadata split-brain"); + if (!ret) + ret = -1; + goto out; + } + for (i = 0 ; i < priv->child_count; i++) + if (locked_on[i]) + sources[i] = 1; + afr_mark_largest_file_as_source (this, sources, replies); + if (AFR_COUNT (sources, priv->child_count) != 1) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "No bigger file"); + if (!ret) + ret = -1; + goto out; + } + for (i = 0 ; i < priv->child_count; i++) + if (sources[i]) + source = i; + sinks[source] = 0; + healed_sinks[source] = 0; + break; + case GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK: + ret = dict_get_str (xdata_req, "child-name", &name); + if (ret) + goto out; + source = afr_get_child_index_from_name (this, name); + if (source < 0) { + ret = dict_set_str (xdata_rsp, "sh-fail-msg", + "Invalid brick name"); + if (!ret) + ret = -1; + goto out; + } + if (locked_on[source] != 1) { + ret = dict_set_str (xdata_rsp, 
"sh-fail-msg", + "Brick is not up"); + if (!ret) + ret = -1; + goto out; + } + sources[source] = 1; + sinks[source] = 0; + healed_sinks[source] = 0; + break; + default: + ret = -1; + goto out; + } + ret = source; +out: + return ret; + +} + +int +afr_get_child_index_from_name (xlator_t *this, char *name) +{ + afr_private_t *priv = this->private; + int index = -1; + + for (index = 0; index < priv->child_count; index++) { + if (!strcmp (priv->children[index]->name, name)) + goto out; + } + index = -1; +out: + return index; +} + + gf_boolean_t afr_does_witness_exist (xlator_t *this, uint64_t *witness) { @@ -427,6 +608,14 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, } } + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + sinks[i] = 1; + } + } + /* In afr-v1 if a file is self-accused but didn't have any pending * operations on others then it is similar to 'dirty' in afr-v2. * Consider such cases as witness. diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index a434b9e6ba1..45a099cec86 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -17,6 +17,7 @@ #include "afr.h" #include "afr-self-heal.h" #include "byte-order.h" +#include "protocol-common.h" enum { AFR_SELFHEAL_DATA_FULL = 0, @@ -426,41 +427,6 @@ afr_does_size_mismatch (xlator_t *this, unsigned char *sources, return _gf_false; } -/* - * If by chance there are multiple sources with differing sizes, select - * the largest file as the source. 
- * - * This can happen if data was directly modified in the backend or for snapshots - */ - -static void -afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, - struct afr_reply *replies) -{ - int i = 0; - afr_private_t *priv = NULL; - uint64_t size = 0; - - /* Find source with biggest file size */ - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (size <= replies[i].poststat.ia_size) { - size = replies[i].poststat.ia_size; - } - } - - /* Mark sources with less size as not source */ - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (size > replies[i].poststat.ia_size) - sources[i] = 0; - } - - return; -} static void afr_mark_biggest_witness_as_source (xlator_t *this, unsigned char *sources, @@ -518,7 +484,9 @@ afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources, } static int -__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, +__afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this, + unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, struct afr_reply *replies, @@ -528,7 +496,6 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, afr_private_t *priv = NULL; int source = -1; int sources_count = 0; - priv = this->private; sources_count = AFR_COUNT (sources, priv->child_count); @@ -536,9 +503,21 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources, if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0) || !sources_count) { /* split brain */ - return -EIO; + source = afr_mark_split_brain_source_sinks (frame, this, + sources, sinks, + healed_sinks, + locked_on, replies, + AFR_DATA_TRANSACTION); + if (source < 0) + return -EIO; + return source; } + /* No split brain at this point. 
If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + /* If there are no witnesses/size-mismatches on sources we are done*/ if (!afr_does_size_mismatch (this, sources, replies) && !afr_has_source_witnesses (this, sources, witness)) @@ -605,9 +584,10 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, */ AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count); - source = __afr_selfheal_data_finalize_source (this, sources, - healed_sinks, locked_on, - replies, witness); + source = __afr_selfheal_data_finalize_source (frame, this, sources, + sinks, healed_sinks, + locked_on, replies, + witness); if (source < 0) return -EIO; diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 0518c1821e3..05d9f2b4917 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -17,6 +17,7 @@ #include "afr.h" #include "afr-self-heal.h" #include "byte-order.h" +#include "protocol-common.h" #define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE) @@ -199,6 +200,7 @@ out: static int __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, struct afr_reply *replies) @@ -208,13 +210,26 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, struct iatt first = {0, }; int source = -1; int sources_count = 0; + dict_t *xdata_req = NULL; + afr_local_t *local = NULL; priv = this->private; + local = frame->local; + xdata_req = local->xdata_req; sources_count = AFR_COUNT (sources, priv->child_count); if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0) || !sources_count) { + + source = afr_mark_split_brain_source_sinks (frame, this, + sources, sinks, + healed_sinks, + locked_on, replies, + 
AFR_METADATA_TRANSACTION); + if (source >= 0) + return source; + /* If this is a directory mtime/ctime only split brain use the most recent */ source = afr_dirtime_splitbrain_source (frame, this, @@ -224,17 +239,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, "split brain on %s", uuid_utoa (replies[source].poststat.ia_gfid)); sources[source] = 1; - - for (i = 0; i < priv->child_count; i++) { - if (i == source) - continue; - - if (!locked_on[i]) - continue; - - healed_sinks[i] = 1; - } - + healed_sinks[source] = 0; return source; } @@ -253,6 +258,11 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this, } } + /* No split brain at this point. If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + for (i = 0; i < priv->child_count; i++) { if (!sources[i]) continue; @@ -352,7 +362,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i } source = __afr_selfheal_metadata_finalize_source (frame, this, sources, - healed_sinks, + sinks, healed_sinks, locked_on, replies); if (source < 0) diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 50cff91ccb3..74cc9608cf6 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -193,9 +193,27 @@ afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type, int source, unsigned char *healed_sinks); void +afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, + struct afr_reply *replies); +void afr_mark_active_sinks (xlator_t *this, unsigned char *sources, unsigned char *locked_on, unsigned char *sinks); +gf_boolean_t +afr_dict_contains_heal_op (call_frame_t *frame); + +int +afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, + unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct 
afr_reply *replies, + afr_transaction_type type); + +int +afr_get_child_index_from_name (xlator_t *this, char *name); + gf_boolean_t afr_does_witness_exist (xlator_t *this, uint64_t *witness); diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 4fdc5f774cc..09821b724fe 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -1021,4 +1021,8 @@ afr_is_xattr_ignorable (char *key); int afr_get_heal_info (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); + +int +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc); + #endif /* __AFR_H__ */ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 82b527e9141..866e3faf629 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -2636,8 +2636,10 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, this_call_cnt = dht_frame_return (frame); - if (!xattr || (op_ret == -1)) + if (!xattr || (op_ret == -1)) { + local->op_ret = op_ret; goto out; + } if (dict_get (xattr, conf->xattr_name)) { dict_del (xattr, conf->xattr_name); @@ -2808,7 +2810,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, subvol = layout->list[i].xlator; STACK_WIND (frame, dht_vgetxattr_dir_cbk, subvol, subvol->fops->getxattr, - loc, key, NULL); + loc, key, xdata); } return 0; } @@ -2821,7 +2823,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, local->call_cnt = 1; STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol, - cached_subvol->fops->getxattr, loc, key, NULL); + cached_subvol->fops->getxattr, loc, key, xdata); return 0; } @@ -2854,7 +2856,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (hashed_subvol) { STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol, hashed_subvol->fops->getxattr, loc, - GF_XATTR_PATHINFO_KEY, NULL); + GF_XATTR_PATHINFO_KEY, xdata); return 0; } op_errno = ENODATA; @@ -2933,7 +2935,7 @@ dht_getxattr (call_frame_t 
*frame, xlator_t *this, subvol = layout->list[i].xlator; STACK_WIND (frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, - loc, key, NULL); + loc, key, xdata); } return 0; -- cgit