From cc9701d4f533bf7337d2925424e2498ab64fdf13 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Fri, 16 Nov 2012 07:08:30 +0530 Subject: cluster/afr: Remember type of split-brain in inode-ctx Along with this change, fixed the race of setting the split-brain status in inode-ctx after unwinding the fop from self-heal in case of back-ground self-heal. Change-Id: Ifc829300df485f50f139443802e8b6dc7038b4ad BUG: 873962 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.org/4198 Tested-by: Gluster Build System Reviewed-by: Anand Avati --- tests/bugs/bug-873962.t | 85 ++++++++++++ xlators/cluster/afr/src/afr-common.c | 169 +++++++++++------------ xlators/cluster/afr/src/afr-self-heal-common.c | 7 +- xlators/cluster/afr/src/afr-self-heal-data.c | 12 +- xlators/cluster/afr/src/afr-self-heal-metadata.c | 28 ++-- xlators/cluster/afr/src/afr.h | 15 +- 6 files changed, 199 insertions(+), 117 deletions(-) create mode 100755 tests/bugs/bug-873962.t diff --git a/tests/bugs/bug-873962.t b/tests/bugs/bug-873962.t new file mode 100755 index 000000000..c18c71f60 --- /dev/null +++ b/tests/bugs/bug-873962.t @@ -0,0 +1,85 @@ +#!/bin/bash + +#AFR TEST-IDENTIFIER SPLIT-BRAIN +. $(dirname $0)/../include.rc + +cleanup; + +TEST glusterd +TEST pidof glusterd +TEST $CLI volume info; + +B0_hiphenated=`echo $B0 | tr '/' '-'` +TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2} + +#Make sure self-heal is not triggered when the bricks are re-started +TEST $CLI volume set $V0 cluster.self-heal-daemon off +TEST $CLI volume set $V0 performance.stat-prefetch off +TEST $CLI volume start $V0 +TEST glusterfs -s $H0 --volfile-id=$V0 $M0 +TEST glusterfs -s $H0 --volfile-id=$V0 $M1 +TEST cd $M0 +TEST touch a +TEST touch b +TEST touch c +TEST touch d +echo "1" > b +echo "1" > d +kill -9 `cat /var/lib/glusterd/vols/$V0/run/$H0$B0_hiphenated-${V0}2.pid` +echo "1" > a +echo "1" > c +TEST setfattr -n trusted.mdata -v abc b +TEST setfattr -n trusted.mdata -v abc d +TEST $CLI volume start $V0 force +sleep 5 +kill -9 `cat /var/lib/glusterd/vols/$V0/run/$H0$B0_hiphenated-${V0}1.pid` +echo "2" > a +echo "2" > c +TEST setfattr -n trusted.mdata -v def b +TEST setfattr -n trusted.mdata -v def d +TEST $CLI volume start $V0 force +sleep 10 + +#Files are in split-brain, so open should fail +TEST ! cat a; +TEST ! cat $M1/a; +TEST ! cat b; +TEST ! cat $M1/b; + +#Reset split-brain status +TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/a; +TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/b; + +sleep 1 +#The operations should do self-heal and give correct output +EXPECT "2" cat a; +EXPECT "2" cat $M1/a; +EXPECT "def" getfattr -n trusted.mdata --only-values b 2>/dev/null +EXPECT "def" getfattr -n trusted.mdata --only-values $M1/b 2>/dev/null + +TEST $CLI volume set $V0 cluster.data-self-heal off +TEST $CLI volume set $V0 cluster.metadata-self-heal off +sleep 5 #Wait for vol file to be updated + +sleep 1 +#Files are in split-brain, so open should fail +TEST ! cat c +TEST ! cat $M1/c +TEST ! cat d +TEST ! cat $M1/d + +TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/c +TEST setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000000000000 $B0/${V0}1/d + +sleep 1 +#The operations should NOT do self-heal but give correct output +EXPECT "2" cat c +EXPECT "2" cat $M1/c +EXPECT "1" cat d +EXPECT "1" cat $M1/d + +#Check that the self-heal is not triggered. +EXPECT "1" cat $B0/${V0}1/c +EXPECT "abc" getfattr -n trusted.mdata --only-values $B0/${V0}1/d 2>/dev/null +cd +cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index c38e52725..ee96944f8 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -49,8 +49,7 @@ #include "afr-self-heald.h" #include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL +#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL #define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL int @@ -203,59 +202,86 @@ out: return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count) +void +afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) { - int ret = -1; - afr_inode_ctx_t *ctx = NULL; - size_t size = 0; + if (!ctx) + return; + GF_FREE (ctx->fresh_children); + GF_FREE (ctx); +} - GF_ASSERT (child_count > 0); +afr_inode_ctx_t* +__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + int ret = 0; + uint64_t ctx_addr = 0; + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; - if (!addr) { - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto out; - size = sizeof (*ctx->fresh_children); - ctx->fresh_children = GF_CALLOC (child_count, size, - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto out; - } else { - ctx = (afr_inode_ctx_t*) (long) addr; + priv = this->private; + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) { + ctx = (afr_inode_ctx_t*) (long) ctx_addr; + goto out; } - ret = 0; + ctx = GF_CALLOC (1, sizeof (*ctx), + gf_afr_mt_inode_ctx_t); + if (!ctx) + goto fail; + ctx->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*ctx->fresh_children), + gf_afr_mt_int32_t); + if (!ctx->fresh_children) + goto fail; + ret = __inode_ctx_put (inode, this, (uint64_t)ctx); + if (ret) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " + "set the inode ctx (%s)", + uuid_utoa (inode->gfid)); + goto fail; + } + out: - if (ret && ctx) { - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); - ctx = NULL; + return ctx; + +fail: + afr_inode_ctx_destroy (ctx); + return NULL; +} + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); } + UNLOCK (&inode->lock); return ctx; } void -afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t *ctx = NULL; afr_private_t *priv = NULL; int i = 0; - uint64_t ctx_addr = 0; int32_t read_child = -1; int32_t *fresh_children = NULL; priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - goto unlock; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -274,12 +300,6 @@ afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) params->u.value = _gf_true; break; - case AFR_INODE_GET_SPLIT_BRAIN: - params->u.value = _gf_false; - if (ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK) - params->u.value = _gf_true; - ; - break; default: GF_ASSERT (0); break; @@ -292,11 +312,16 @@ unlock: gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; + gf_boolean_t spb = _gf_false; - params.op = AFR_INODE_GET_SPLIT_BRAIN; - afr_inode_get_ctx (this, inode, ¶ms); - return params.u.value; + ctx = afr_inode_ctx_get (inode, this); + if (!ctx) + goto out; + if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) + spb = _gf_true; +out: + return spb; } gf_boolean_t @@ -305,11 +330,10 @@ afr_is_opendir_done (xlator_t *this, inode_t *inode) afr_inode_params_t params = {0}; params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx (this, inode, ¶ms); + afr_inode_get_ctx_params (this, inode, ¶ms); return params.u.value; } - int32_t afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) { @@ -317,7 +341,7 @@ afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) params.op = AFR_INODE_GET_READ_CTX; params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx (this, inode, ¶ms); + afr_inode_get_ctx_params (this, inode, ¶ms); return params.u.read_ctx.read_child; } @@ -379,31 +403,14 @@ afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) } void -afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; - - if (set) { - remaining_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - ctx->masks = remaining_mask | mask; - } else { - ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - } -} - -void -afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t *ctx = NULL; afr_private_t *priv = NULL; - uint64_t ctx_addr = 0; - gf_boolean_t set = _gf_false; int32_t read_child = -1; int32_t *fresh_children = NULL; int32_t *stale_children = NULL; @@ -411,10 +418,7 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -434,33 +438,26 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) case AFR_INODE_SET_OPENDIR_DONE: afr_inode_ctx_set_opendir_done (ctx); break; - case AFR_INODE_SET_SPLIT_BRAIN: - set = params->u.value; - afr_inode_ctx_set_splitbrain (ctx, set); - break; default: GF_ASSERT (0); break; } - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } } unlock: UNLOCK (&inode->lock); } void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; - params.op = AFR_INODE_SET_SPLIT_BRAIN; - params.u.value = set; - afr_inode_set_ctx (this, inode, ¶ms); + ctx = afr_inode_ctx_get (inode, this); + if (mdata_spb != DONT_KNOW) + ctx->mdata_spb = mdata_spb; + if (data_spb != DONT_KNOW) + ctx->data_spb = data_spb; } void @@ -469,7 +466,7 @@ afr_set_opendir_done (xlator_t *this, inode_t *inode) afr_inode_params_t params = {0}; params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx (this, inode, ¶ms); + afr_inode_set_ctx_params (this, inode, ¶ms); } void @@ -488,7 +485,7 @@ afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, params.op = AFR_INODE_SET_READ_CTX; params.u.read_ctx.read_child = read_child; params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx (this, inode, ¶ms); + afr_inode_set_ctx_params (this, inode, ¶ms); } void @@ -501,7 +498,7 @@ afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, params.op = AFR_INODE_RM_STALE_CHILDREN; params.u.read_ctx.children = stale_children; - afr_inode_set_ctx (this, inode, ¶ms); + afr_inode_set_ctx_params (this, inode, ¶ms); } gf_boolean_t diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 07603c5b2..a0dfa59d9 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -2067,19 +2067,16 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) afr_local_t * orig_frame_local = NULL; afr_self_heal_t * orig_frame_sh = NULL; char sh_type_str[256] = {0,}; - gf_boolean_t split_brain = _gf_false; priv = this->private; local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda || sh->mdata_spb || sh->data_spb) { - split_brain = _gf_true; + if (local->govinda_gOvinda) { + afr_set_split_brain (this, sh->inode, SPB, SPB); sh->op_failed = 1; } - afr_set_split_brain (this, sh->inode, split_brain); - afr_self_heal_type_str_get (sh, sh_type_str, sizeof(sh_type_str)); if (sh->op_failed) { diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index c90ed7c1a..969443687 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -716,12 +716,13 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) "split-brain). Please delete the file from all but " "the preferred subvolume.", local->loc.path); - sh->data_spb = _gf_true; + afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); + afr_sh_data_fail (frame, this); return 0; } - sh->data_spb = _gf_false; + afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); ret = afr_sh_inode_set_read_ctx (sh, this); if (ret) { gf_log (this->name, GF_LOG_DEBUG, @@ -1401,13 +1402,6 @@ afr_self_heal_data (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - /* Self-heal completion cbk changes inode split-brain status based on - * govinda_gOvinda, mdata_spb, data_spb value. - * Initialize data_spb with current split-brain status. - * If for some reason self-heal fails(locking phase etc), it makes sure - * we retain the split-brain status before this self-heal started. - */ - sh->data_spb = afr_is_split_brain (this, sh->inode); if (afr_can_start_data_self_heal (sh, priv)) { if (IA_ISREG (sh->type)) { afr_sh_data_open (frame, this); diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 647ee47a1..cf8a33add 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -88,6 +88,19 @@ afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) return 0; } +int +afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->op_failed = 1; + afr_sh_metadata_finish (frame, this); + return 0; +} int afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, @@ -397,13 +410,12 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, "(possible split-brain). Please fix the file on " "all backend volumes", local->loc.path); - sh->mdata_spb = _gf_true; - - afr_sh_metadata_finish (frame, this); + afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW); + afr_sh_metadata_fail (frame, this); goto out; } - sh->mdata_spb = _gf_false; + afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW); if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, "No self-heal needed for %s", @@ -529,14 +541,6 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - /* Self-heal completion cbk changes inode split-brain status based on - * govinda_gOvinda, mdata_spb, data_spb value. - * Initialize mdata_spb with current split-brain status. - * If for some reason self-heal fails(locking phase etc), it makes sure - * we retain the split-brain status before this self-heal started. - */ - sh->mdata_spb = afr_is_split_brain (this, sh->inode); - if (afr_can_start_metadata_self_heal (sh, priv)) { afr_sh_metadata_lock (frame, this); } else { diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index da9cc67c6..475e5dda4 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -58,10 +58,8 @@ typedef enum { AFR_INODE_SET_READ_CTX = 1, AFR_INODE_RM_STALE_CHILDREN, AFR_INODE_SET_OPENDIR_DONE, - AFR_INODE_SET_SPLIT_BRAIN, AFR_INODE_GET_READ_CTX, AFR_INODE_GET_OPENDIR_DONE, - AFR_INODE_GET_SPLIT_BRAIN, } afr_inode_op_t; typedef struct afr_inode_params_ { @@ -75,9 +73,17 @@ typedef struct afr_inode_params_ { } u; } afr_inode_params_t; +typedef enum afr_spb_state { + DONT_KNOW, + SPB, + NO_SPB +} afr_spb_state_t; + typedef struct afr_inode_ctx_ { uint64_t masks; int32_t *fresh_children;//increasing order of latency + afr_spb_state_t mdata_spb; + afr_spb_state_t data_spb; } afr_inode_ctx_t; typedef enum { @@ -272,8 +278,6 @@ typedef struct { int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); - gf_boolean_t mdata_spb; - gf_boolean_t data_spb; call_frame_t *sh_frame; } afr_self_heal_t; @@ -833,7 +837,8 @@ gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode); void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set); +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb); int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, -- cgit