From f153c835807ac31006ba690b1deb47b20b51bc83 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Wed, 27 Jun 2012 16:42:35 +0530 Subject: cluster/afr: Modified split-brain handling RCA The bug is observed because the decision to mark a file in split-brain is taken outside appropriate locks. Lookup gathers xattrs outside any lock. The xattrs being in split-brain in lookup should only be taken as a hint. Appropriate inodelks should be taken before confirming a split-brain. Self-heal confirms this at the moment. If data/metadata self-heal is turned off, inspecting of xattrs could not be performed so split-brain behavior does not work correctly if the self-heal options are turned off. Fix Self-heals are launched to inspect xattrs even when the data/metadata self-heal options are turned off. The decision to heal data/metadata after the xattrs are inspected is based on whether the options are turned on/off. So decision to set/reset split-brain flag is taken inside appropriate locks. Testcases: tests 33-36 in https://github.com/pranithk/gluster-tests/blob/master/afr/self-heal.sh Change-Id: Ia8aeab08208b50c06609ad35a9d72f3d553ee343 BUG: 833727 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.com/3626 Tested-by: Gluster Build System Reviewed-by: Anand Avati --- xlators/cluster/afr/src/afr-common.c | 81 +++++++++++++++++++----- xlators/cluster/afr/src/afr-self-heal-common.c | 3 +- xlators/cluster/afr/src/afr-self-heal-data.c | 63 ++++++++++-------- xlators/cluster/afr/src/afr-self-heal-metadata.c | 50 +++++++++++---- xlators/cluster/afr/src/afr.h | 6 ++ 5 files changed, 146 insertions(+), 57 deletions(-) diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index b6e160743..70a9cd354 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -908,6 +908,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->cont.lookup.success_children); GF_FREE (local->cont.lookup.sources); + afr_matrix_cleanup (local->cont.lookup.pending_matrix, + priv->child_count); } { /* getxattr */ @@ -1181,6 +1183,36 @@ afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this, } } +void +afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this) +{ + int32_t *sources = NULL; + afr_private_t *priv = NULL; + int32_t subvol_status = 0; + int32_t *success_children = NULL; + dict_t **xattrs = NULL; + struct iatt *bufs = NULL; + int32_t **pending_matrix = NULL; + + priv = this->private; + + sources = GF_CALLOC (priv->child_count, sizeof (*sources), + gf_afr_mt_int32_t); + if (NULL == sources) + goto out; + success_children = local->cont.lookup.success_children; + xattrs = local->cont.lookup.xattrs; + bufs = local->cont.lookup.bufs; + pending_matrix = local->cont.lookup.pending_matrix; + afr_build_sources (this, xattrs, bufs, pending_matrix, + sources, success_children, AFR_METADATA_TRANSACTION, + &subvol_status, _gf_false); + if (subvol_status & SPLIT_BRAIN) + local->cont.lookup.possible_spb = _gf_true; +out: + GF_FREE (sources); +} + static void afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, struct iatt *buf, struct iatt *lookup_buf) @@ -1214,8 +1246,26 @@ afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, } static void -afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this, - gf_boolean_t split_brain) +afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this) +{ + gf_boolean_t split_brain = _gf_false; + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + split_brain = afr_is_split_brain (this, local->cont.lookup.inode); + split_brain = split_brain || local->cont.lookup.possible_spb; + if ((local->success_count > 0) && split_brain && + IA_ISREG (local->cont.lookup.inode->ia_type)) { + sh->force_confirm_spb = _gf_true; + gf_log (this->name, GF_LOG_WARNING, + "split brain detected during lookup of %s.", + local->loc.path); + } +} + +static void +afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this) { GF_ASSERT (local); GF_ASSERT (this); @@ -1233,15 +1283,6 @@ afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this, goto out; } - if ((local->success_count > 0) && split_brain && - IA_ISREG (local->cont.lookup.inode->ia_type)) { - local->self_heal.do_data_self_heal = _gf_true; - local->self_heal.do_gfid_self_heal = _gf_true; - gf_log (this->name, GF_LOG_WARNING, - "split brain detected during lookup of %s.", - local->loc.path); - } - out: return; } @@ -1252,6 +1293,8 @@ afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv) GF_ASSERT (sh); GF_ASSERT (priv); + if (sh->force_confirm_spb) + return _gf_true; return (sh->do_gfid_self_heal || sh->do_missing_entry_self_heal || (afr_data_self_heal_enabled (priv->data_self_heal) && @@ -1516,13 +1559,11 @@ afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) int32_t child1 = -1; int32_t child2 = -1; afr_self_heal_t *sh = NULL; - gf_boolean_t split_brain = _gf_false; priv = this->private; sh = &local->self_heal; - split_brain = afr_is_split_brain (this, local->cont.lookup.inode); - afr_detect_self_heal_by_lookup_status (local, this, split_brain); + afr_detect_self_heal_by_lookup_status (local, this); if (afr_lookup_gfid_missing_count (local, this)) local->self_heal.do_gfid_self_heal = _gf_true; @@ -1549,9 +1590,11 @@ afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this) afr_lookup_set_self_heal_params_by_xattr (local, this, xattr[child1]); } - if (afr_open_only_data_self_heal (priv->data_self_heal) - && !split_brain) + if (afr_open_only_data_self_heal (priv->data_self_heal)) sh->do_data_self_heal = _gf_false; + if (sh->do_metadata_self_heal) + afr_lookup_check_set_metadata_split_brain (local, this); + afr_detect_self_heal_by_split_brain_status (local, this); } int @@ -2143,6 +2186,7 @@ afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) struct iatt *iatts = NULL; int32_t *success_children = NULL; int32_t *sources = NULL; + int32_t **pending_matrix = NULL; GF_ASSERT (local); local->cont.lookup.xattrs = GF_CALLOC (child_count, @@ -2175,6 +2219,11 @@ afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) goto out; local->cont.lookup.sources = sources; + pending_matrix = afr_matrix_create (child_count, child_count); + if (NULL == pending_matrix) + goto out; + local->cont.lookup.pending_matrix = pending_matrix; + ret = 0; out: return ret; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index f082685e2..48fa31d86 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -2095,6 +2095,7 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) shc->do_data_self_heal = sh->do_data_self_heal; shc->do_metadata_self_heal = sh->do_metadata_self_heal; shc->do_entry_self_heal = sh->do_entry_self_heal; + shc->force_confirm_spb = sh->force_confirm_spb; shc->forced_merge = sh->forced_merge; shc->background = sh->background; shc->type = sh->type; @@ -2162,7 +2163,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda) + if (local->govinda_gOvinda || sh->mdata_spb || sh->data_spb) split_brain = _gf_true; afr_set_split_brain (this, sh->inode, split_brain); diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 1672b4f28..6619c1353 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -688,16 +688,6 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, AFR_DATA_TRANSACTION, NULL, _gf_true); - if ((nsources == 0) && !sh->sync_done) { - gf_log (this->name, GF_LOG_DEBUG, - "No self-heal needed for %s", - local->loc.path); - - local->govinda_gOvinda = 0; - afr_sh_data_finish (frame, this); - return 0; - } - if ((nsources == -1) && (priv->favorite_child != -1) && (sh->child_errno[priv->favorite_child] == 0)) { @@ -720,12 +710,12 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) "split-brain). Please delete the file from all but " "the preferred subvolume.", local->loc.path); - local->govinda_gOvinda = 1; + sh->data_spb = _gf_true; afr_sh_data_fail (frame, this); return 0; } - local->govinda_gOvinda = 0; + sh->data_spb = _gf_false; ret = afr_sh_inode_set_read_ctx (sh, this); if (ret) { gf_log (this->name, GF_LOG_DEBUG, @@ -738,7 +728,20 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) if (sh->sync_done) { afr_sh_data_setattr (frame, this); } else { - afr_sh_data_fix (frame, this); + if (nsources == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "No self-heal needed for %s", + local->loc.path); + + afr_sh_data_finish (frame, this); + return 0; + } + + if (sh->do_data_self_heal && + afr_data_self_heal_enabled (priv->data_self_heal)) + afr_sh_data_fix (frame, this); + else + afr_sh_data_finish (frame, this); } return 0; } @@ -764,11 +767,7 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, bufs = local->cont.lookup.bufs; success_children = local->cont.lookup.success_children; - pending_matrix = afr_matrix_create (priv->child_count, - priv->child_count); - if (NULL == pending_matrix) - goto out; - + pending_matrix = local->cont.lookup.pending_matrix; sources = local->cont.lookup.sources; memset (sources, 0, sizeof (*sources) * priv->child_count); @@ -780,9 +779,7 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, local->loc.path); switch (txn_type) { case AFR_DATA_TRANSACTION: - afr_set_split_brain (this, - local->cont.lookup.inode, - _gf_true); + local->cont.lookup.possible_spb = _gf_true; nsources = 1; sources[success_children[0]] = 1; break; @@ -809,7 +806,6 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, sources, priv->hash_mode, gfid); out: - afr_matrix_cleanup (pending_matrix, priv->child_count); gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d", read_child); return read_child; @@ -1351,6 +1347,17 @@ afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this) return 0; } +gf_boolean_t +afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) +{ + if (sh->force_confirm_spb) + return _gf_true; + if (sh->do_data_self_heal && + afr_data_self_heal_enabled (priv->data_self_heal)) + return _gf_true; + return _gf_false; +} + int afr_self_heal_data (call_frame_t *frame, xlator_t *this) { @@ -1361,10 +1368,14 @@ afr_self_heal_data (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - local->govinda_gOvinda = afr_is_split_brain (this, sh->inode); - - if (sh->do_data_self_heal && - afr_data_self_heal_enabled (priv->data_self_heal)) { + /* Self-heal completion cbk changes inode split-brain status based on + * govinda_gOvinda, mdata_spb, data_spb value. + * Initialize data_spb with current split-brain status. + * If for some reason self-heal fails(locking phase etc), it makes sure + * we retain the split-brain status before this self-heal started. + */ + sh->data_spb = afr_is_split_brain (this, sh->inode); + if (afr_can_start_data_self_heal (sh, priv)) { if (IA_ISREG (sh->type)) { afr_sh_data_open (frame, this); } else { diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 147ab98d4..79ce07d00 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -50,7 +50,7 @@ afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; afr_sh_reset (frame, this); - if (local->govinda_gOvinda) { + if (sh->mdata_spb) { gf_log (this->name, GF_LOG_INFO, "split-brain detected, aborting selfheal of %s", local->loc.path); @@ -383,15 +383,6 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, sh->pending_matrix, sh->sources, sh->success_children, AFR_METADATA_TRANSACTION, NULL, _gf_false); - if (nsources == 0) { - gf_log (this->name, GF_LOG_TRACE, - "No self-heal needed for %s", - local->loc.path); - - afr_sh_metadata_finish (frame, this); - goto out; - } - if ((nsources == -1) && (priv->favorite_child != -1) && (sh->child_errno[priv->favorite_child] == 0)) { @@ -413,7 +404,17 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, "(possible split-brain). Please fix the file on " "all backend volumes", local->loc.path); - local->govinda_gOvinda = 1; + sh->mdata_spb = _gf_true; + + afr_sh_metadata_finish (frame, this); + goto out; + } + + sh->mdata_spb = _gf_false; + if (nsources == 0) { + gf_log (this->name, GF_LOG_TRACE, + "No self-heal needed for %s", + local->loc.path); afr_sh_metadata_finish (frame, this); goto out; @@ -452,7 +453,10 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, sh->fresh_children); } - afr_sh_metadata_sync_prepare (frame, this); + if (sh->do_metadata_self_heal && priv->metadata_self_heal) + afr_sh_metadata_sync_prepare (frame, this); + else + afr_sh_metadata_finish (frame, this); out: return; } @@ -512,17 +516,35 @@ afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) return 0; } +gf_boolean_t +afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv) +{ + if (sh->force_confirm_spb) + return _gf_true; + if (sh->do_metadata_self_heal && priv->metadata_self_heal) + return _gf_true; + return _gf_false; +} int afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = this->private; - + afr_self_heal_t *sh = &local->self_heal; local = frame->local; + sh = &local->self_heal; + + /* Self-heal completion cbk changes inode split-brain status based on + * govinda_gOvinda, mdata_spb, data_spb value. + * Initialize mdata_spb with current split-brain status. + * If for some reason self-heal fails(locking phase etc), it makes sure + * we retain the split-brain status before this self-heal started. + */ + sh->mdata_spb = afr_is_split_brain (this, sh->inode); - if (local->self_heal.do_metadata_self_heal && priv->metadata_self_heal) { + if (afr_can_start_metadata_self_heal (sh, priv)) { afr_sh_metadata_lock (frame, this); } else { afr_sh_metadata_done (frame, this); diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index abb955334..5ed478c4a 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -170,6 +170,8 @@ typedef struct { gf_boolean_t do_entry_self_heal; gf_boolean_t do_gfid_self_heal; gf_boolean_t do_missing_entry_self_heal; + gf_boolean_t force_confirm_spb; /* Check for split-brains even when + self-heal is turned off */ gf_boolean_t forced_merge; /* Is this a self-heal triggered to forcibly merge the directories? */ @@ -266,6 +268,8 @@ typedef struct { int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); + gf_boolean_t mdata_spb; + gf_boolean_t data_spb; call_frame_t *sh_frame; } afr_self_heal_t; @@ -444,7 +448,9 @@ typedef struct _afr_local { int32_t read_child; int32_t *sources; int32_t *success_children; + int32_t **pending_matrix; gf_boolean_t fresh_lookup; + gf_boolean_t possible_spb; } lookup; struct { -- cgit