diff options
-rw-r--r-- | libglusterfs/src/glusterfs.h | 4 | ||||
-rw-r--r-- | tests/basic/afr/split-brain-resolution.t | 86 | ||||
-rw-r--r-- | tests/volume.rc | 6 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 184 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 146 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 7 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 18 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 14 |
8 files changed, 431 insertions, 34 deletions
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h index 9c9d9648a92..095ca2a1388 100644 --- a/libglusterfs/src/glusterfs.h +++ b/libglusterfs/src/glusterfs.h @@ -141,7 +141,9 @@ #define GF_AFR_HEAL_INFO "glusterfs.heal-info" #define GF_AFR_HEAL_SBRAIN "glusterfs.heal-sbrain" -#define GF_AFR_SBRAIN_STATUS "afr.split-brain-status" +#define GF_AFR_SBRAIN_STATUS "replica.split-brain-status" +#define GF_AFR_SBRAIN_CHOICE "replica.split-brain-choice" +#define GF_AFR_SBRAIN_RESOLVE "replica.split-brain-heal-finalize" #define GF_GFIDLESS_LOOKUP "gfidless-lookup" /* replace-brick and pump related internal xattrs */ diff --git a/tests/basic/afr/split-brain-resolution.t b/tests/basic/afr/split-brain-resolution.t new file mode 100644 index 00000000000..feb527a71b3 --- /dev/null +++ b/tests/basic/afr/split-brain-resolution.t @@ -0,0 +1,86 @@ +#!/bin/bash +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +cleanup; + +function get_split_brain_status { + local path=$1 + echo `getfattr -n replica.split-brain-status $path` | cut -f2 -d"=" | sed -e 's/^"//' -e 's/"$//' +} + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +TEST $CLI volume start $V0 + +#Disable self-heal-daemon +TEST $CLI volume set $V0 cluster.self-heal-daemon off + +TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0; + +TEST `echo "some-data" > $M0/data-split-brain.txt` +TEST `echo "some-data" > $M0/metadata-split-brain.txt` + +#Create data and metadata split-brain +TEST kill_brick $V0 $H0 $B0/${V0}0 + +TEST `echo "brick1_alive" > $M0/data-split-brain.txt` +TEST setfattr -n user.test -v brick1 $M0/metadata-split-brain.txt + +TEST $CLI volume start $V0 force +TEST kill_brick $V0 $H0 $B0/${V0}1 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 + +TEST `echo "brick0_alive" > $M0/data-split-brain.txt` +TEST setfattr -n user.test -v brick0 $M0/metadata-split-brain.txt + +TEST $CLI volume start $V0 force +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 +EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 + +EXPECT 4 afr_get_pending_heal_count $V0 + +TEST ! cat $M0/data-split-brain.txt +TEST ! getxattr -n user.test $M0/metadata-split-brain.txt + +#Inspect file in data-split-brain +EXPECT "data-split-brain:yes metadata-split-brain:no Choices:patchy-client-0,patchy-client-1" get_split_brain_status $M0/data-split-brain.txt +TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/data-split-brain.txt + +#Should now be able to read the contents of data-split-brain.txt +EXPECT "brick0_alive" cat $M0/data-split-brain.txt + +TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/data-split-brain.txt + +#Should now be able to read the contents of data-split-brain.txt +EXPECT "brick1_alive" cat $M0/data-split-brain.txt + +#Inspect the file in metadata-split-brain +EXPECT "data-split-brain:no metadata-split-brain:yes Choices:patchy-client-0,patchy-client-1" get_split_brain_status $M0/metadata-split-brain.txt +TEST setfattr -n replica.split-brain-choice -v $V0-client-0 $M0/metadata-split-brain.txt + +EXPECT "brick0" get_text_xattr user.test $M0/metadata-split-brain.txt + +TEST setfattr -n replica.split-brain-choice -v $V0-client-1 $M0/metadata-split-brain.txt +EXPECT "brick1" get_text_xattr user.test $M0/metadata-split-brain.txt + +#Check that setting split-brain-choice to "none" results in EIO again +TEST setfattr -n replica.split-brain-choice -v none $M0/metadata-split-brain.txt +TEST setfattr -n replica.split-brain-choice -v none $M0/data-split-brain.txt +TEST ! getxattr -n user.test $M0/metadata-split-brain.txt +TEST ! cat $M0/data-split-brain.txt + +#Negative test cases should fail +TEST ! setfattr -n replica.split-brain-choice -v $V0-client-4 $M0/data-split-brain.txt +TEST ! setfattr -n replica.split-brain-heal-finalize -v $V0-client-4 $M0/metadata-split-brain.txt + +#Heal the files +TEST setfattr -n replica.split-brain-heal-finalize -v $V0-client-0 $M0/metadata-split-brain.txt +TEST setfattr -n replica.split-brain-heal-finalize -v $V0-client-1 $M0/data-split-brain.txt + +EXPECT "brick0" get_text_xattr user.test $M0/metadata-split-brain.txt +EXPECT "brick1_alive" cat $M0/data-split-brain.txt + +EXPECT 0 afr_get_pending_heal_count $V0 + +cleanup; diff --git a/tests/volume.rc b/tests/volume.rc index 6abf68dc75c..1276dccdbae 100644 --- a/tests/volume.rc +++ b/tests/volume.rc @@ -242,6 +242,12 @@ function gf_gfid_xattr_to_str { echo "${xval:2:8}-${xval:10:4}-${xval:14:4}-${xval:18:4}-${xval:22:12}" } +function get_text_xattr { + local key=$1 + local path=$2 + getfattr -d -m. -e text $path 2>/dev/null | grep $key | cut -f2 -d'=' +} + function gf_check_file_opened_in_brick { vol=$1 host=$2 diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index f7cc202d4d1..0af46993a34 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -67,6 +67,37 @@ afr_copy_frame (call_frame_t *base) return frame; } +int +__afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx) +{ + uint64_t ctx_int = 0; + int ret = -1; + afr_inode_ctx_t *tmp_ctx = NULL; + + ret = __inode_ctx_get (inode, this, &ctx_int); + if (ret) { + tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), + gf_afr_mt_inode_ctx_t); + if (!tmp_ctx) + goto out; + + ctx_int = (long) tmp_ctx; + ret = __inode_ctx_set (inode, this, &ctx_int); + if (ret) { + GF_FREE (tmp_ctx); + goto out; + } + tmp_ctx->spb_choice = -1; + tmp_ctx->read_subvol = 0; + } else { + tmp_ctx = (afr_inode_ctx_t *) ctx_int; + } + + *ctx = tmp_ctx; + ret = 0; +out: + return ret; +} /* * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS: * @@ -109,13 +140,16 @@ __afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, uint32_t event = 0; uint64_t val = 0; int i = 0; + afr_inode_ctx_t *ctx = NULL; priv = this->private; - ret = __inode_ctx_get (inode, this, &val); + ret = __afr_inode_ctx_get (this, inode, &ctx); if (ret < 0) return ret; + val = ctx->read_subvol; + metadatamap = (val & 0x000000000000ffff); datamap = (val & 0x00000000ffff0000) >> 16; event = (val & 0xffffffff00000000) >> 32; @@ -143,9 +177,15 @@ __afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, uint16_t metadatamap = 0; uint64_t val = 0; int i = 0; + int ret = -1; + afr_inode_ctx_t *ctx = NULL; priv = this->private; + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { if (data[i]) datamap |= (1 << i); @@ -157,9 +197,12 @@ __afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, (((uint64_t) datamap) << 16) | (((uint64_t) event) << 32); - return __inode_ctx_set (inode, this, &val); -} + ctx->read_subvol = val; + ret = 0; +out: + return ret; +} int __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) @@ -169,9 +212,13 @@ __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) uint16_t metadatamap = 0; uint32_t event = 0; uint64_t val = 0; + afr_inode_ctx_t *ctx = NULL; + + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret) + return ret; - ret = __inode_ctx_get (inode, this, &val); - (void) ret; + val = ctx->read_subvol; metadatamap = (val & 0x000000000000ffff) >> 0; datamap = (val & 0x00000000ffff0000) >> 16; @@ -181,7 +228,9 @@ __afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this) (((uint64_t) datamap) << 16) | (((uint64_t) event) << 32); - return __inode_ctx_set (inode, this, &val); + ctx->read_subvol = val; + + return ret; } @@ -205,6 +254,20 @@ __afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, return ret; } +int +__afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, + int *spb_choice) +{ + afr_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret < 0) + return ret; + + *spb_choice = ctx->spb_choice; + return 0; +} int __afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, @@ -224,6 +287,23 @@ __afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data return ret; } +int +__afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this, + int spb_choice) +{ + afr_inode_ctx_t *ctx = NULL; + int ret = -1; + + ret = __afr_inode_ctx_get (this, inode, &ctx); + if (ret) + goto out; + + ctx->spb_choice = spb_choice; + + ret = 0; +out: + return ret; +} int __afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) @@ -258,6 +338,22 @@ afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, return ret; } +int +afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, + int *spb_choice) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_split_brain_choice_get (inode, this, + spb_choice); + } + UNLOCK(&inode->lock); + + return ret; +} + int afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, @@ -275,6 +371,22 @@ afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, return ret; } +int +afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this, + int spb_choice) +{ + int ret = -1; + + LOCK(&inode->lock); + { + ret = __afr_inode_split_brain_choice_set (inode, this, + spb_choice); + } + UNLOCK(&inode->lock); + + return ret; +} + int afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this) @@ -1220,6 +1332,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) gf_boolean_t locked_entry = _gf_false; gf_boolean_t can_interpret = _gf_true; inode_t *parent = NULL; + int spb_choice = -1; priv = this->private; local = frame->local; @@ -1232,6 +1345,8 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) afr_inode_read_subvol_get (parent, this, readable, NULL, &event); + afr_inode_split_brain_choice_get (local->inode, this, + &spb_choice); /* First, check if we have a gfid-change from somewhere, If so, propagate that so that a fresh lookup can be issued @@ -1321,18 +1436,24 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) } } else { cant_interpret: - if (read_subvol == -1) - dict_del (replies[0].xdata, GF_CONTENT_KEY); - else - dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); + if (read_subvol == -1) { + if (spb_choice >= 0) + read_subvol = spb_choice; + else + read_subvol = 0; + } + dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); } afr_handle_quota_size (frame, this); unwind: - if (read_subvol == -1) - read_subvol = 0; - + if (read_subvol == -1) { + if (spb_choice >= 0) + read_subvol = spb_choice; + else + read_subvol = 0; + } par_read_subvol = afr_get_parent_read_subvol (this, parent, replies, readable); @@ -1741,8 +1862,12 @@ afr_discover_done (call_frame_t *frame, xlator_t *this) } unwind: - if (read_subvol == -1) - read_subvol = 0; + if (read_subvol == -1) { + afr_inode_split_brain_choice_get (local->inode, this, + &read_subvol); + if (read_subvol == -1) + read_subvol = 0; + } AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->replies[read_subvol].poststat, @@ -3468,6 +3593,15 @@ out: int afr_forget (xlator_t *this, inode_t *inode) { + uint64_t ctx_int = 0; + afr_inode_ctx_t *ctx = NULL; + + inode_ctx_del (inode, this, &ctx_int); + if (!ctx_int) + return 0; + + ctx = (afr_inode_ctx_t *)ctx_int; + GF_FREE (ctx); return 0; } @@ -4594,8 +4728,26 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) } out: - AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); + if (local->op == GF_FOP_GETXATTR) + AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); + else if (local->op == GF_FOP_SETXATTR) + AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); if (dict) dict_unref(dict); return ret; } + +int +afr_get_child_index_from_name (xlator_t *this, char *name) +{ + afr_private_t *priv = this->private; + int index = -1; + + for (index = 0; index < priv->child_count; index++) { + if (!strcmp (priv->children[index]->name, name)) + goto out; + } + index = -1; +out: + return index; +} diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 0c96d069ae5..776933892ff 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -34,8 +34,8 @@ #include "common-utils.h" #include "compat-errno.h" #include "compat.h" +#include "protocol-common.h" -#include "afr.h" #include "afr-transaction.h" @@ -961,6 +961,145 @@ afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) return 0; } +int +afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc, + char *data) +{ + afr_local_t *local = NULL; + int ret = -1; + int op_errno = EINVAL; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_SETXATTR; + + local->xdata_req = dict_new (); + + if (!local->xdata_req) { + op_errno = ENOMEM; + goto out; + } + + ret = dict_set_int32 (local->xdata_req, "heal-op", + GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = dict_set_str (local->xdata_req, "child-name", data); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + afr_heal_splitbrain_file (frame, this, loc); +out: + if (ret < 0) + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + return 0; +} + +int +afr_set_split_brain_choice (call_frame_t *frame, xlator_t *this, loc_t *loc, + int spb_choice) +{ + int ret = -1; + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = afr_inode_split_brain_choice_set (loc->inode, this, spb_choice); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Failed to set" + "split-brain choice as %s for %s", + priv->children[spb_choice]->name, + loc->name); + } + inode_invalidate (loc->inode); + AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); + return ret; +} + +int +afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len) +{ + int spb_child_index = -1; + char *spb_child_str = NULL; + + spb_child_str = alloca0 (len + 1); + memcpy (spb_child_str, value, len); + + if (!strcmp (spb_child_str, "none")) + return -2; + + spb_child_index = afr_get_child_index_from_name (this, + spb_child_str); + if (spb_child_index < 0) { + gf_log (this->name, GF_LOG_ERROR, "Invalid subvol: %s", + spb_child_str); + } + return spb_child_index; +} + +int +afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame, + loc_t *loc, dict_t *dict) +{ + int len = 0; + void *value = NULL; + int spb_child_index = -1; + int ret = -1; + int op_errno = EINVAL; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value, + &len); + if (value) { + spb_child_index = afr_get_split_brain_child_index (this, value, + len); + if (spb_child_index < 0) { + /* Case where value was "none" */ + if (spb_child_index == -2) + spb_child_index = -1; + else { + ret = 1; + goto out; + } + } + + afr_set_split_brain_choice (frame, this, loc, + spb_child_index); + ret = 0; + goto out; + } + + ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_RESOLVE, &value, &len); + if (value) { + spb_child_index = afr_get_split_brain_child_index (this, value, + len); + if (spb_child_index < 0) { + ret = 1; + goto out; + } + + afr_split_brain_resolve_do (frame, this, loc, + priv->children[spb_child_index]->name); + ret = 0; + } +out: + /* key was correct but value was invalid when ret == 1 */ + if (ret == 1) { + AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + ret = 0; + } + return ret; +} int afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, @@ -977,6 +1116,11 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, op_errno, out); + ret = afr_handle_split_brain_commands (this, frame, loc, dict); + + if (ret == 0) + return 0; + transaction_frame = copy_frame (frame); if (!transaction_frame) goto out; diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index ec67a20e624..eaa73d9be20 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -56,6 +56,7 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) int event_generation = 0; inode_t *inode = NULL; int ret = -1; + int spb_choice = -1; local = frame->local; inode = local->inode; @@ -96,6 +97,12 @@ afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) local->read_attempted[read_subvol] = 1; readfn: + if (read_subvol == -1) { + ret = afr_inode_split_brain_choice_get (inode, this, + &spb_choice); + if ((ret == 0) && spb_choice >= 0) + read_subvol = spb_choice; + } local->readfn (frame, this, read_subvol); return 0; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 2441f413f3e..21b4c4414d9 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -389,9 +389,11 @@ afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, local = frame->local; priv = this->private; xdata_req = local->xdata_req; + ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); if (ret) goto out; + for (i = 0; i < priv->child_count; i++) { if (locked_on[i]) if (sources[i] || !sinks[i] || !healed_sinks[i]) { @@ -468,22 +470,6 @@ out: } -int -afr_get_child_index_from_name (xlator_t *this, char *name) -{ - afr_private_t *priv = this->private; - int index = -1; - - for (index = 0; index < priv->child_count; index++) { - if (!strcmp (priv->children[index]->name, name)) - goto out; - } - index = -1; -out: - return index; -} - - gf_boolean_t afr_does_witness_exist (xlator_t *this, uint64_t *witness) { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index d7d15c69845..0885b582d77 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -733,6 +733,11 @@ typedef struct _afr_local { } afr_local_t; +typedef struct _afr_inode_ctx { + uint64_t read_subvol; + int spb_choice; +} afr_inode_ctx_t; + /* did a call fail due to a child failing? */ #define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ ((op_errno == ENOTCONN) || \ @@ -1026,4 +1031,13 @@ afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc); int afr_get_split_brain_status (call_frame_t *frame, xlator_t *this, loc_t *loc); + +int +afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this, + int spb_choice); +int +afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, + int *spb_choice); +int +afr_get_child_index_from_name (xlator_t *this, char *name); #endif /* __AFR_H__ */ |