From fa90243e20eeef91eda5a5bb249bed05066852e3 Mon Sep 17 00:00:00 2001 From: Pranith Kumar K Date: Sat, 14 Apr 2012 14:33:53 +0530 Subject: cluster/afr: Enforce order in pre/post op The xattrop order in pre/post op on all the subvols is client-0, client-1... client-n where n is (replica-count - 1). This order can lead to invalid split-brains if the brick dies in the middle of xattrops. Example: transaction completed pre-op, so on all the subvolumes xattrs have '1' changelog. Now post-op is sent to both the subvols. On subvol-0 change-log of client-0 is decremented to 0, before decrementing change-log of client-1 to 0 the brick dies. This change-log status on subvol-0 gives the meaning that a change is done on subvol-0 successfully but on subvol-1 it failed. Which is not what happened. Changes done when the subvol-0 was down will lead to pending change-log on subvol-1 for subvol-0. Which is correct. When the subvol-0 is brought back up, the change-log will be in split-brain state even when it is not a legitimate split-brain. If the brick dies in the middle of xattrops it should remain fool. Pre-op should perform xattrop of the local change-log first and post-op should perform xattrop of the local change-log last. In case of optimistic changelogs txn_changelog should be done last on local if it succeeds, first if it fails. Change-Id: Ib6eeb20cdc49b0b1fd2f454f25a9c8e08388c6e7 BUG: 765194 Signed-off-by: Pranith Kumar K Reviewed-on: http://review.gluster.com/3226 Tested-by: Gluster Build System Reviewed-by: Anand Avati --- xlators/cluster/afr/src/afr-common.c | 13 +- xlators/cluster/afr/src/afr-self-heal-common.c | 38 ++++-- xlators/cluster/afr/src/afr-self-heal-common.h | 2 +- xlators/cluster/afr/src/afr-self-heal-data.c | 2 +- xlators/cluster/afr/src/afr-self-heal-entry.c | 5 +- xlators/cluster/afr/src/afr-self-heal-metadata.c | 2 +- xlators/cluster/afr/src/afr-transaction.c | 158 ++++++++++++----------- xlators/cluster/afr/src/afr-transaction.h | 8 +- xlators/cluster/afr/src/afr.h | 7 +- 9 files changed, 135 insertions(+), 100 deletions(-) diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 171137b91..1d1ac0881 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -798,6 +798,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) priv = this->private; afr_matrix_cleanup (local->pending, priv->child_count); + afr_matrix_cleanup (local->transaction.txn_changelog, + priv->child_count); if (local->internal_lock.locked_nodes) GF_FREE (local->internal_lock.locked_nodes); @@ -813,7 +815,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->transaction.pre_op); - GF_FREE (local->transaction.child_errno); GF_FREE (local->transaction.eager_lock); GF_FREE (local->transaction.basename); @@ -3924,12 +3925,10 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->pending) goto out; - local->transaction.child_errno = - GF_CALLOC (sizeof (*local->transaction.child_errno), - priv->child_count, - gf_afr_mt_int32_t); - local->transaction.erase_pending = 1; - + local->transaction.txn_changelog = afr_matrix_create (priv->child_count, + AFR_NUM_CHANGE_LOGS); + if (!local->transaction.txn_changelog) + goto out; ret = 0; out: return ret; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index e0af66952..5d0db8f63 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -847,20 +847,24 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this, int32_t *delta_matrix[], dict_t *xattr[], int child_count, afr_transaction_type type) { - int i = 0; - int j = 0; - int k = 0; - int ret = 0; - int32_t *pending = NULL; + int i = 0; + int j = 0; + int k = 0; + int ret = 0; + int32_t *pending = NULL; + int32_t *local_pending = NULL; + afr_private_t *priv = NULL; + priv = this->private; for (i = 0; i < child_count; i++) { if (!xattr[i]) continue; + local_pending = NULL; for (j = 0; j < child_count; j++) { pending = GF_CALLOC (sizeof (int32_t), 3, gf_afr_mt_int32_t); @@ -873,12 +877,28 @@ afr_sh_delta_to_xattr (afr_private_t *priv, pending[k] = hton32 (delta_matrix[i][j]); + if (j == i) { + local_pending = pending; + continue; + } ret = dict_set_bin (xattr[i], priv->pending_key[j], pending, - 3 * sizeof (int32_t)); - if (ret < 0) - gf_log (THIS->name, GF_LOG_WARNING, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, "Unable to set dict value."); + GF_FREE (pending); + } + } + if (local_pending) { + ret = dict_set_bin (xattr[i], priv->pending_key[i], + local_pending, + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "Unable to set dict value."); + GF_FREE (local_pending); + } } } return 0; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index d104593ac..cea554670 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -52,7 +52,7 @@ afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, int32_t *success_children, int32_t *subvol_status); int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this, int32_t *delta_matrix[], dict_t *xattr[], int child_count, afr_transaction_type type); diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 78798cba1..a5cbc5f04 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -421,7 +421,7 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) } } - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, + afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, priv->child_count, AFR_DATA_TRANSACTION); GF_ASSERT (call_count); diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 9fa128e42..d528f9842 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -190,7 +190,7 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) if (call_count == 0) need_unwind = 1; - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, + afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, priv->child_count, AFR_ENTRY_TRANSACTION); local->call_count = call_count; @@ -1125,7 +1125,8 @@ afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame, goto out; } - afr_set_pending_dict (priv, xattr, impunge_local->pending); + afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, + LOCAL_LAST); STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk, (void *) (long) active_src, diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 79718315c..1bb22a09b 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -166,7 +166,7 @@ afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) } } - afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, + afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr, priv->child_count, AFR_METADATA_TRANSACTION); local->call_count = call_count; diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index e0729fbf6..1d744d4eb 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -23,7 +23,6 @@ of RENAME */ #define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ - afr_fd_ctx_t * __afr_fd_ctx_get (fd_t *fd, xlator_t *this) { @@ -267,63 +266,41 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this) } int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending) +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, + int child, afr_xattrop_type_t op) { int i = 0; int ret = 0; + if (op == LOCAL_FIRST) { + ret = dict_set_static_bin (xattr, priv->pending_key[child], + pending[child], + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret) + goto out; + } for (i = 0; i < priv->child_count; i++) { + if (i == child) + continue; ret = dict_set_static_bin (xattr, priv->pending_key[i], pending[i], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); /* 3 = data+metadata+entry */ if (ret < 0) goto out; } - -out: - return ret; -} - - -static int -afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - afr_transaction_type type) -{ - int i = 0; - int ret = 0; - int *arr = NULL; - int index = 0; - size_t pending_xattr_size = 3 * sizeof (int32_t); - /* 3 = data+metadata+entry */ - - index = afr_index_for_transaction_type (type); - - for (i = 0; i < priv->child_count; i++) { - arr = GF_CALLOC (1, pending_xattr_size, - gf_afr_mt_char); - if (!arr) { - ret = -1; - goto out; - } - - memcpy (arr, pending[i], pending_xattr_size); - - arr[index] = hton32 (ntoh32(arr[index]) + 1); - - ret = dict_set_bin (xattr, priv->pending_key[i], - arr, pending_xattr_size); - - if (ret < 0) + if (op == LOCAL_LAST) { + ret = dict_set_static_bin (xattr, priv->pending_key[child], + pending[child], + AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); + if (ret) goto out; } - out: return ret; } - int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) { @@ -491,12 +468,65 @@ afr_changelog_post_op_call_count (afr_transaction_type type, return call_count; } +void +afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) +{ + int i = 0; + int index = 0; + int32_t postop = 0; + int32_t preop = 1; + int32_t **txn_changelog = NULL; + + txn_changelog = local->transaction.txn_changelog; + index = afr_index_for_transaction_type (local->transaction.type); + for (i = 0; i < priv->child_count; i++) { + postop = ntoh32 (local->pending[i][index]); + txn_changelog[i][index] = hton32 (postop + preop); + } +} + +afr_xattrop_type_t +afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, + afr_transaction_type type) +{ + int index = 0; + afr_xattrop_type_t op = LOCAL_LAST; + + index = afr_index_for_transaction_type (type); + if (optimized && !pending[child][index]) + op = LOCAL_FIRST; + return op; +} + +void +afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, + int optimized, int child) +{ + int32_t **txn_changelog = NULL; + int32_t **changelog = NULL; + afr_private_t *priv = NULL; + int ret = 0; + afr_xattrop_type_t op = LOCAL_LAST; + + priv = this->private; + txn_changelog = local->transaction.txn_changelog; + op = afr_get_postop_xattrop_type (local->pending, optimized, child, + local->transaction.type); + if (optimized) + changelog = txn_changelog; + else + changelog = local->pending; + ret = afr_set_pending_dict (priv, xattr, changelog, child, op); + if (ret < 0) + gf_log (this->name, GF_LOG_INFO, + "failed to set pending entry"); +} + int afr_changelog_post_op (call_frame_t *frame, xlator_t *this) { afr_private_t * priv = this->private; afr_internal_lock_t *int_lock = NULL; - int ret = 0; int i = 0; int call_count = 0; @@ -506,7 +536,6 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) int piggyback = 0; int index = 0; int nothing_failed = 1; - int32_t changelog = 0; local = frame->local; int_lock = &local->internal_lock; @@ -550,26 +579,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) } } - index = afr_index_for_transaction_type (local->transaction.type); - if (local->optimistic_change_log && - local->transaction.type != AFR_DATA_TRANSACTION) { - /* if nothing_failed, then local->pending[..] == {0 .. 0} */ - for (i = 0; i < priv->child_count; i++) { - changelog = ntoh32 (local->pending[i][index]); - local->pending[i][index] = hton32 (changelog + 1); - } - } + afr_compute_txn_changelog (local , priv); for (i = 0; i < priv->child_count; i++) { if (!local->transaction.pre_op[i]) continue; - ret = afr_set_pending_dict (priv, xattr[i], local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); - + if (local->transaction.type != AFR_DATA_TRANSACTION) + afr_set_postop_dict (local, this, xattr[i], + local->optimistic_change_log, i); switch (local->transaction.type) { case AFR_DATA_TRANSACTION: { @@ -593,10 +611,8 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) } UNLOCK (&local->fd->lock); - if (piggyback && !nothing_failed) - ret = afr_set_piggyback_dict (priv, xattr[i], - local->pending, - local->transaction.type); + afr_set_postop_dict (local, this, xattr[i], + piggyback, i); if (nothing_failed && piggyback) { afr_changelog_post_op_cbk (frame, (void *)(long)i, @@ -616,7 +632,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) break; case AFR_METADATA_TRANSACTION: { - if (nothing_failed) { + if (nothing_failed && local->optimistic_change_log) { afr_changelog_post_op_cbk (frame, (void *)(long)i, this, 1, 0, xattr[i], NULL); @@ -642,7 +658,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) case AFR_ENTRY_RENAME_TRANSACTION: { - if (nothing_failed) { + if (nothing_failed && local->optimistic_change_log) { afr_changelog_post_op_cbk (frame, (void *)(long)i, this, 1, 0, xattr[i], NULL); @@ -666,17 +682,14 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) value */ - ret = afr_set_pending_dict (priv, xattr[i], local->pending); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + afr_set_postop_dict (local, this, xattr[i], + local->optimistic_change_log, i); /* fall through */ case AFR_ENTRY_TRANSACTION: { - if (nothing_failed) { + if (nothing_failed && local->optimistic_change_log) { afr_changelog_post_op_cbk (frame, (void *)(long)i, this, 1, 0, xattr[i], NULL); @@ -820,7 +833,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (!locked_nodes[i]) continue; - ret = afr_set_pending_dict (priv, xattr[i], local->pending); + ret = afr_set_pending_dict (priv, xattr[i], local->pending, + i, LOCAL_FIRST); if (ret < 0) gf_log (this->name, GF_LOG_INFO, @@ -929,7 +943,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) value */ - ret = afr_set_pending_dict (priv, xattr[i], local->pending); + ret = afr_set_pending_dict (priv, xattr[i], local->pending, + i, LOCAL_FIRST); if (ret < 0) gf_log (this->name, GF_LOG_INFO, @@ -1268,7 +1283,6 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index child_index, local->transaction.type); } - int afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) { diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 965ee6737..d19d16fb3 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -11,6 +11,11 @@ #ifndef __TRANSACTION_H__ #define __TRANSACTION_H__ +typedef enum { + LOCAL_FIRST = 1, + LOCAL_LAST = 2 +} afr_xattrop_type_t; + void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index); @@ -24,5 +29,6 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) afr_fd_ctx_t * afr_fd_ctx_get (fd_t *fd, xlator_t *this); int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, + int child, afr_xattrop_type_t op); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index dfad29de1..27cb83a57 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -670,12 +670,7 @@ typedef struct _afr_local { afr_transaction_type type; - int success_count; - int erase_pending; - int failure_count; - - int last_tried; - int32_t *child_errno; + int32_t **txn_changelog;//changelog after pre+post ops unsigned char *pre_op; call_frame_t *main_frame; -- cgit