summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2015-01-09 14:43:22 +0000
committerPranith Kumar Karampuri <pkarampu@redhat.com>2015-01-15 01:28:37 -0800
commit8beaf169e39b262416e2274a028292379d39b310 (patch)
treee5cfd6da9af293ba7625c057914583a03bbeadab /xlators
parent6da85222e5e49bcb15c4c8998f26c8dffb6a5b34 (diff)
cluster/afr: split-brain resolution CLI
Extend the AFR heal command to include automated split-brain resolution. This patch [3/3] is the final patch for the AFR automated split-brain resolution implementation. The full syntax is now: "gluster volume heal <VOLNAME> [full | statistics [heal-count [replica <HOSTNAME:BRICKNAME>]] |info [healed | heal-failed | split-brain]| split-brain {bigger-file <FILE> |source-brick <HOSTNAME:BRICKNAME> [<FILE>]}]" The new additions are: 1. gluster volume heal <VOLNAME> split-brain bigger-file <FILE> Locates the replica containing the FILE, selects the bigger file as the source and completes the heal. 2. gluster volume heal <VOLNAME> split-brain source-brick <HOSTNAME:BRICKNAME> <FILE> Selects <FILE> present in <HOSTNAME:BRICKNAME> as the source and completes the heal. 3. gluster volume heal <VOLNAME> split-brain source-brick <HOSTNAME:BRICKNAME> Selects all split-brained files in <HOSTNAME:BRICKNAME> as the source and completes the heal. Note: <FILE> can be either the full file name as seen from the root of the volume or the gfid-string representation of the file, which is sometimes displayed in the output of the heal info command. Entry/gfid split-brain resolution is not supported. An example can be found in the test case. Change-Id: I4649733922d406f14f28ee9033a5cb627b9538b3 BUG: 1136769 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Reviewed-on: http://review.gluster.org/9377 Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Tested-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Tested-by: Gluster Build System <jenkins@build.gluster.com>
Diffstat (limited to 'xlators')
-rw-r--r--xlators/cluster/afr/src/afr-common.c76
-rw-r--r--xlators/cluster/afr/src/afr-inode-read.c5
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c191
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c62
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c34
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h18
-rw-r--r--xlators/cluster/afr/src/afr.h4
-rw-r--r--xlators/cluster/dht/src/dht-common.c12
8 files changed, 343 insertions, 59 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index f39db802588..e6d45add4e8 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4471,5 +4471,81 @@ out:
AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
if (dict)
dict_unref (dict);
+ if (inode) {
+ inode_forget (inode, 1);
+ inode_unref (inode);
+ }
+ return ret;
+}
+
+/*
+ * Serve the GF_AFR_HEAL_SBRAIN getxattr request: inspect the file identified
+ * by loc->gfid and run data, metadata and entry self-heal for whichever of
+ * them afr_selfheal_unlocked_inspect() reports as needed.
+ *
+ * The getxattr frame is always unwound from here, with a freshly allocated
+ * dict as the response:
+ *   - nothing to heal: the dict carries "sh-fail-msg" = "File not in
+ *     split-brain" and the call unwinds with 0;
+ *   - self-heal left a response dict in local->xdata_rsp: its contents are
+ *     copied into the reply and the call unwinds with 0;
+ *   - otherwise the first self-heal failure is reported as -1 / op_errno.
+ *
+ * Returns the same value that was passed as op_ret to the unwind.
+ */
+int32_t
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        gf_boolean_t    data_selfheal     = _gf_false;
+        gf_boolean_t    metadata_selfheal = _gf_false;
+        gf_boolean_t    entry_selfheal    = _gf_false;
+        dict_t         *dict              = NULL;
+        afr_local_t    *local             = NULL;
+        inode_t        *inode             = NULL;
+        int             entry_ret         = 0;
+        int             metadata_ret      = 0;
+        int             data_ret          = 0;
+        int             ret               = 0;
+        int             op_errno          = 0;
+
+        local = frame->local;
+        dict = dict_new ();
+        if (!dict) {
+                op_errno = ENOMEM;
+                ret = -1;
+                goto out;
+        }
+
+        ret = afr_selfheal_unlocked_inspect (frame, this, loc->gfid, &inode,
+                                             &data_selfheal,
+                                             &metadata_selfheal,
+                                             &entry_selfheal);
+        if (ret) {
+                op_errno = -ret;
+                ret = -1;
+                goto out;
+        }
+
+        if (!data_selfheal && !metadata_selfheal && !entry_selfheal) {
+                ret = dict_set_str (dict, "sh-fail-msg",
+                                    "File not in split-brain");
+                if (ret)
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "Failed to set sh-fail-msg in dict");
+                /* Not an error for the caller: the message is the answer. */
+                ret = 0;
+                goto out;
+        }
+
+        if (data_selfheal)
+                data_ret = afr_selfheal_data (frame, this, inode);
+
+        if (metadata_selfheal)
+                metadata_ret = afr_selfheal_metadata (frame, this, inode);
+
+        if (entry_selfheal)
+                entry_ret = afr_selfheal_entry (frame, this, inode);
+
+        /* Report the first failure. The results must not be combined with a
+         * bitwise OR: the bits of two different negative errno values merge
+         * into a value that matches neither, and negating it below would
+         * hand an unrelated errno to the caller.
+         * NOTE(review): assumes the self-heal helpers return 0 or a negative
+         * errno, which is what the -ret conversion below relies on. */
+        if (data_ret)
+                ret = data_ret;
+        else if (metadata_ret)
+                ret = metadata_ret;
+        else
+                ret = entry_ret;
+
+        if (local->xdata_rsp) {
+                /* 'sh-fail-msg' has been set in the dict during self-heal.*/
+                dict_copy (local->xdata_rsp, dict);
+                ret = 0;
+        } else if (ret) {
+                /*Some other error during self-heal. Just propagate it.*/
+                op_errno = -ret;
+                ret = -1;
+        }
+
+out:
+        AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
+        if (dict)
+                dict_unref(dict);
+        if (inode) {
+                inode_forget (inode, 1);
+                inode_unref (inode);
+        }
        return ret;
}
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index e64070e1bcd..78dd65f30e7 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -1380,6 +1380,11 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
+ if (!strcmp (name, GF_AFR_HEAL_SBRAIN)) {
+ afr_heal_splitbrain_file (frame, this, loc);
+ return 0;
+ }
+
/*
* if we are doing getxattr with pathinfo as the key then we
* collect information from all childs
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 6198d4cf72c..e9d853c4ecd 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -17,7 +17,7 @@
#include "afr.h"
#include "afr-self-heal.h"
#include "byte-order.h"
-
+#include "protocol-common.h"
int
afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -287,6 +287,39 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
return 0;
}
+/*
+ * Keep only the largest file(s) among the current sources.
+ *
+ * Sources may disagree on size, e.g. when data was modified directly in the
+ * backend or for snapshots. Every entry of sources[] whose reply reports a
+ * size below the maximum seen among the sources is cleared; entries that tie
+ * for the maximum are all left set.
+ */
+void
+afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources,
+                                 struct afr_reply *replies)
+{
+        afr_private_t *priv     = this->private;
+        uint64_t       max_size = 0;
+        int            child    = 0;
+
+        /* Pass 1: largest size reported by any source. */
+        for (child = 0; child < priv->child_count; child++) {
+                if (sources[child] &&
+                    replies[child].poststat.ia_size > max_size)
+                        max_size = replies[child].poststat.ia_size;
+        }
+
+        /* Pass 2: demote every source that is smaller than that. */
+        for (child = 0; child < priv->child_count; child++) {
+                if (sources[child] &&
+                    replies[child].poststat.ia_size < max_size)
+                        sources[child] = 0;
+        }
+}
+
void
afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
unsigned char *locked_on, unsigned char *sinks)
@@ -304,6 +337,154 @@ afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
}
+/*
+ * Tell whether the request dict of this frame (local->xdata_req) carries a
+ * "heal-op" key.
+ *
+ * When it does, "sh-fail-msg" = "File not in split-brain" is also stored in
+ * local->xdata_rsp, creating that dict if it does not exist yet. Failing to
+ * create or fill the response dict does not change the return value.
+ *
+ * Returns _gf_true if "heal-op" is present, _gf_false otherwise.
+ */
gf_boolean_t
+afr_dict_contains_heal_op (call_frame_t *frame)
+{
+        afr_local_t *local   = NULL;
+        dict_t      *xdata_req = NULL;
+        int          ret     = 0;
+        int          heal_op = -1;
+
+        local = frame->local;
+        xdata_req = local->xdata_req;
+        ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
+        if (ret)
+                return _gf_false;
+        if (local->xdata_rsp == NULL) {
+                local->xdata_rsp = dict_new();
+                if (!local->xdata_rsp)
+                        return _gf_true;
+        }
+        ret = dict_set_str (local->xdata_rsp, "sh-fail-msg",
+                            "File not in split-brain");
+        /* Do not drop the failure silently: without the message the caller
+         * has nothing to show the user for this aborted heal. */
+        if (ret)
+                gf_log (frame->this->name, GF_LOG_WARNING,
+                        "Failed to set sh-fail-msg in dict");
+
+        return _gf_true;
+}
+
+/* Store msg under "sh-fail-msg" in rsp. The result is never 0: it is the
+ * dict_set_str() error when storing failed, and -1 otherwise. */
+static int
+afr_sbrain_set_fail_msg (dict_t *rsp, char *msg)
+{
+        int ret = dict_set_str (rsp, "sh-fail-msg", msg);
+
+        return ret ? ret : -1;
+}
+
+/* Pick the heal source requested through the "heal-op" key of the request
+ * dict and return its child index. On success sources[source] is set to 1
+ * and sinks[source] / healed_sinks[source] are cleared.
+ *
+ * A source is picked only if the file looks split-brained, i.e.
+ * ∀i((i ∈ locked_on[] ∧ i=1)==>(sources[i]=0 ∧ sinks[i]=1 ∧ healed_sinks[i]=1))
+ * - every locked node is a sink and a healed sink, and none is a source.
+ *
+ * Supported operations:
+ *   GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE - the single largest file among
+ *       the locked nodes; refused for metadata transactions and when no
+ *       single file is the largest.
+ *   GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK - the brick named by "child-name" in
+ *       the request dict; refused when the name is unknown or the brick is
+ *       not locked.
+ *
+ * Refusals leave an explanation under "sh-fail-msg" in local->xdata_rsp.
+ * Returns the source index, or a non-zero error value when no source was
+ * picked.
+ */
+int
+afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+                                   unsigned char *sources,
+                                   unsigned char *sinks,
+                                   unsigned char *healed_sinks,
+                                   unsigned char *locked_on,
+                                   struct afr_reply *replies,
+                                   afr_transaction_type type)
+{
+        afr_local_t   *local   = frame->local;
+        afr_private_t *priv    = this->private;
+        dict_t        *req     = local->xdata_req;
+        dict_t        *rsp     = NULL;
+        char          *brick   = NULL;
+        int            heal_op = -1;
+        int            chosen  = -1;
+        int            child   = 0;
+        int            ret     = 0;
+
+        /* Without a heal-op this is not a CLI-driven resolution. */
+        ret = dict_get_int32 (req, "heal-op", &heal_op);
+        if (ret)
+                return ret;
+
+        /* Every locked node must be a pure sink, see the header comment. */
+        for (child = 0; child < priv->child_count; child++) {
+                if (!locked_on[child])
+                        continue;
+                if (sources[child] || !sinks[child] || !healed_sinks[child])
+                        return -1;
+        }
+
+        if (local->xdata_rsp == NULL) {
+                local->xdata_rsp = dict_new();
+                if (local->xdata_rsp == NULL)
+                        return -1;
+        }
+        rsp = local->xdata_rsp;
+
+        if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BIGGER_FILE) {
+                if (type == AFR_METADATA_TRANSACTION)
+                        return afr_sbrain_set_fail_msg (rsp,
+                                                "Use source-brick option to"
+                                                " heal metadata split-brain");
+
+                /* Treat every locked node as a candidate, then keep only
+                 * the largest file(s). */
+                for (child = 0; child < priv->child_count; child++) {
+                        if (locked_on[child])
+                                sources[child] = 1;
+                }
+                afr_mark_largest_file_as_source (this, sources, replies);
+                if (AFR_COUNT (sources, priv->child_count) != 1)
+                        return afr_sbrain_set_fail_msg (rsp,
+                                                        "No bigger file");
+
+                for (child = 0; child < priv->child_count; child++) {
+                        if (sources[child])
+                                chosen = child;
+                }
+        } else if (heal_op == GF_AFR_OP_SBRAIN_HEAL_FROM_BRICK) {
+                ret = dict_get_str (req, "child-name", &brick);
+                if (ret)
+                        return ret;
+
+                chosen = afr_get_child_index_from_name (this, brick);
+                if (chosen < 0)
+                        return afr_sbrain_set_fail_msg (rsp,
+                                                        "Invalid brick name");
+                if (locked_on[chosen] != 1)
+                        return afr_sbrain_set_fail_msg (rsp,
+                                                        "Brick is not up");
+
+                sources[chosen] = 1;
+        } else {
+                return -1;
+        }
+
+        /* The chosen source must not be healed onto itself. */
+        sinks[chosen] = 0;
+        healed_sinks[chosen] = 0;
+
+        return chosen;
+}
+
+/* Look up the child whose xlator name equals name.
+ * Returns its index in priv->children[], or -1 when no child matches. */
+int
+afr_get_child_index_from_name (xlator_t *this, char *name)
+{
+        afr_private_t *priv  = this->private;
+        int            child = 0;
+
+        for (child = 0; child < priv->child_count; child++) {
+                if (strcmp (priv->children[child]->name, name) == 0)
+                        return child;
+        }
+
+        return -1;
+}
+
+
+gf_boolean_t
afr_does_witness_exist (xlator_t *this, uint64_t *witness)
{
int i = 0;
@@ -427,6 +608,14 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
}
}
+ /* If no sources, all locked nodes are sinks - split brain */
+ if (AFR_COUNT (sources, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ sinks[i] = 1;
+ }
+ }
+
/* In afr-v1 if a file is self-accused but didn't have any pending
* operations on others then it is similar to 'dirty' in afr-v2.
* Consider such cases as witness.
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index a434b9e6ba1..45a099cec86 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -17,6 +17,7 @@
#include "afr.h"
#include "afr-self-heal.h"
#include "byte-order.h"
+#include "protocol-common.h"
enum {
AFR_SELFHEAL_DATA_FULL = 0,
@@ -426,41 +427,6 @@ afr_does_size_mismatch (xlator_t *this, unsigned char *sources,
return _gf_false;
}
-/*
- * If by chance there are multiple sources with differing sizes, select
- * the largest file as the source.
- *
- * This can happen if data was directly modified in the backend or for snapshots
- */
-
-static void
-afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources,
- struct afr_reply *replies)
-{
- int i = 0;
- afr_private_t *priv = NULL;
- uint64_t size = 0;
-
- /* Find source with biggest file size */
- priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (size <= replies[i].poststat.ia_size) {
- size = replies[i].poststat.ia_size;
- }
- }
-
- /* Mark sources with less size as not source */
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (size > replies[i].poststat.ia_size)
- sources[i] = 0;
- }
-
- return;
-}
static void
afr_mark_biggest_witness_as_source (xlator_t *this, unsigned char *sources,
@@ -518,7 +484,9 @@ afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources,
}
static int
-__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
+__afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
+ unsigned char *sources,
+ unsigned char *sinks,
unsigned char *healed_sinks,
unsigned char *locked_on,
struct afr_reply *replies,
@@ -528,7 +496,6 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
afr_private_t *priv = NULL;
int source = -1;
int sources_count = 0;
-
priv = this->private;
sources_count = AFR_COUNT (sources, priv->child_count);
@@ -536,9 +503,21 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
|| !sources_count) {
/* split brain */
- return -EIO;
+ source = afr_mark_split_brain_source_sinks (frame, this,
+ sources, sinks,
+ healed_sinks,
+ locked_on, replies,
+ AFR_DATA_TRANSACTION);
+ if (source < 0)
+ return -EIO;
+ return source;
}
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
/* If there are no witnesses/size-mismatches on sources we are done*/
if (!afr_does_size_mismatch (this, sources, replies) &&
!afr_has_source_witnesses (this, sources, witness))
@@ -605,9 +584,10 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
*/
AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
- source = __afr_selfheal_data_finalize_source (this, sources,
- healed_sinks, locked_on,
- replies, witness);
+ source = __afr_selfheal_data_finalize_source (frame, this, sources,
+ sinks, healed_sinks,
+ locked_on, replies,
+ witness);
if (source < 0)
return -EIO;
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 0518c1821e3..05d9f2b4917 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -17,6 +17,7 @@
#include "afr.h"
#include "afr-self-heal.h"
#include "byte-order.h"
+#include "protocol-common.h"
#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE)
@@ -199,6 +200,7 @@ out:
static int
__afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
unsigned char *sources,
+ unsigned char *sinks,
unsigned char *healed_sinks,
unsigned char *locked_on,
struct afr_reply *replies)
@@ -208,13 +210,26 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
struct iatt first = {0, };
int source = -1;
int sources_count = 0;
+ dict_t *xdata_req = NULL;
+ afr_local_t *local = NULL;
priv = this->private;
+ local = frame->local;
+ xdata_req = local->xdata_req;
sources_count = AFR_COUNT (sources, priv->child_count);
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
|| !sources_count) {
+
+ source = afr_mark_split_brain_source_sinks (frame, this,
+ sources, sinks,
+ healed_sinks,
+ locked_on, replies,
+ AFR_METADATA_TRANSACTION);
+ if (source >= 0)
+ return source;
+
/* If this is a directory mtime/ctime only split brain
use the most recent */
source = afr_dirtime_splitbrain_source (frame, this,
@@ -224,17 +239,7 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
"split brain on %s",
uuid_utoa (replies[source].poststat.ia_gfid));
sources[source] = 1;
-
- for (i = 0; i < priv->child_count; i++) {
- if (i == source)
- continue;
-
- if (!locked_on[i])
- continue;
-
- healed_sinks[i] = 1;
- }
-
+ healed_sinks[source] = 0;
return source;
}
@@ -253,6 +258,11 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
}
}
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
for (i = 0; i < priv->child_count; i++) {
if (!sources[i])
continue;
@@ -352,7 +362,7 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
}
source = __afr_selfheal_metadata_finalize_source (frame, this, sources,
- healed_sinks,
+ sinks, healed_sinks,
locked_on, replies);
if (source < 0)
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 50cff91ccb3..74cc9608cf6 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -193,10 +193,28 @@ afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,
int source, unsigned char *healed_sinks);
void
+afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies);
+void
afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
unsigned char *locked_on, unsigned char *sinks);
gf_boolean_t
+afr_dict_contains_heal_op (call_frame_t *frame);
+
+int
+afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
+ unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type);
+
+int
+afr_get_child_index_from_name (xlator_t *this, char *name);
+
+gf_boolean_t
afr_does_witness_exist (xlator_t *this, uint64_t *witness);
int
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 4fdc5f774cc..09821b724fe 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -1021,4 +1021,8 @@ afr_is_xattr_ignorable (char *key);
int
afr_get_heal_info (call_frame_t *frame, xlator_t *this, loc_t *loc,
dict_t *xdata);
+
+int
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc);
+
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 82b527e9141..866e3faf629 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -2636,8 +2636,10 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
this_call_cnt = dht_frame_return (frame);
- if (!xattr || (op_ret == -1))
+ if (!xattr || (op_ret == -1)) {
+ local->op_ret = op_ret;
goto out;
+ }
if (dict_get (xattr, conf->xattr_name)) {
dict_del (xattr, conf->xattr_name);
@@ -2808,7 +2810,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
subvol = layout->list[i].xlator;
STACK_WIND (frame, dht_vgetxattr_dir_cbk,
subvol, subvol->fops->getxattr,
- loc, key, NULL);
+ loc, key, xdata);
}
return 0;
}
@@ -2821,7 +2823,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
local->call_cnt = 1;
STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol,
- cached_subvol->fops->getxattr, loc, key, NULL);
+ cached_subvol->fops->getxattr, loc, key, xdata);
return 0;
}
@@ -2854,7 +2856,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
if (hashed_subvol) {
STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol,
hashed_subvol->fops->getxattr, loc,
- GF_XATTR_PATHINFO_KEY, NULL);
+ GF_XATTR_PATHINFO_KEY, xdata);
return 0;
}
op_errno = ENODATA;
@@ -2933,7 +2935,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
subvol = layout->list[i].xlator;
STACK_WIND (frame, dht_getxattr_cbk,
subvol, subvol->fops->getxattr,
- loc, key, NULL);
+ loc, key, xdata);
}
return 0;