diff options
| author | Pranith Kumar K <pranithk@gluster.com> | 2012-04-14 14:33:53 +0530 | 
|---|---|---|
| committer | Anand Avati <avati@redhat.com> | 2012-05-18 20:30:30 -0700 | 
| commit | fa90243e20eeef91eda5a5bb249bed05066852e3 (patch) | |
| tree | 7add1693e67dc25e69b8c1f8ed6d6e096c84461f | |
| parent | 24c1cbf4f7afd54a506a8265de9d22ce2b2e670f (diff) | |
cluster/afr: Enforce order in pre/post op
The xattrop order in pre/post op on all the subvols
is client-0, client-1... client-n where n is (replica-count - 1).
This order can lead to invalid split-brains if the brick
dies in the middle of xattrops.
Example: transaction completed pre-op, so on all the subvolumes
xattrs have '1' changelog. Now post-op is sent to both the subvols.
On subvol-0 change-log of client-0 is decremented to 0, before
decrementing change-log of client-1 to 0 the brick dies.
This change-log status on subvol-0 gives the meaning that a
change is done on subvol-0 successfully but on subvol-1 it failed.
Which is not what happened.
Changes done when the subvol-0 was down will lead to pending
change-log on subvol-1 for subvol-0. Which is correct.
When the subvol-0 is brought back up, the change-log will be in
split-brain state even when it is not a legitimate split-brain.
If the brick dies in the middle of xattrops it should remain fool.
Pre-op should perform xattrop of the local change-log first and
post-op should perform xattrop of the local change-log last.
In case of optimistic changelogs txn_changelog should be done
last on local if it succeeds, first if it fails.
Change-Id: Ib6eeb20cdc49b0b1fd2f454f25a9c8e08388c6e7
BUG: 765194
Signed-off-by: Pranith Kumar K <pranithk@gluster.com>
Reviewed-on: http://review.gluster.com/3226
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 13 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 38 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.h | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-entry.c | 5 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 158 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 8 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 7 | 
9 files changed, 135 insertions, 100 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 171137b910d..1d1ac088178 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -798,6 +798,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)          priv = this->private;          afr_matrix_cleanup (local->pending, priv->child_count); +        afr_matrix_cleanup (local->transaction.txn_changelog, +                            priv->child_count);          if (local->internal_lock.locked_nodes)                  GF_FREE (local->internal_lock.locked_nodes); @@ -813,7 +815,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)          GF_FREE (local->transaction.pre_op); -        GF_FREE (local->transaction.child_errno);          GF_FREE (local->transaction.eager_lock);          GF_FREE (local->transaction.basename); @@ -3924,12 +3925,10 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)          if (!local->pending)                  goto out; -        local->transaction.child_errno = -                GF_CALLOC (sizeof (*local->transaction.child_errno), -                           priv->child_count, -                           gf_afr_mt_int32_t); -        local->transaction.erase_pending = 1; - +        local->transaction.txn_changelog = afr_matrix_create (priv->child_count, +                                                           AFR_NUM_CHANGE_LOGS); +        if (!local->transaction.txn_changelog) +                goto out;          ret = 0;  out:          return ret; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index e0af66952ac..5d0db8f637d 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -847,20 +847,24 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,  int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this,                         int32_t *delta_matrix[], dict_t *xattr[],                         int child_count, afr_transaction_type type)  { -        int      i       = 0; -        int      j       = 0; -        int      k       = 0; -        int      ret     = 0; -        int32_t *pending = NULL; +        int              i       = 0; +        int              j       = 0; +        int              k       = 0; +        int              ret     = 0; +        int32_t         *pending = NULL; +        int32_t         *local_pending = NULL; +        afr_private_t   *priv = NULL; +        priv = this->private;          for (i = 0; i < child_count; i++) {                  if (!xattr[i])                          continue; +                local_pending = NULL;                  for (j = 0; j < child_count; j++) {                          pending = GF_CALLOC (sizeof (int32_t), 3,                                               gf_afr_mt_int32_t); @@ -873,12 +877,28 @@ afr_sh_delta_to_xattr (afr_private_t *priv,                          pending[k] = hton32 (delta_matrix[i][j]); +                        if (j == i) { +                                local_pending = pending; +                                continue; +                        }                          ret = dict_set_bin (xattr[i], priv->pending_key[j],                                              pending, -                                            3 * sizeof (int32_t)); -                        if (ret < 0) -                                gf_log (THIS->name, GF_LOG_WARNING, +                                        AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                        if (ret < 0) { +                                gf_log (this->name, GF_LOG_WARNING,                                          "Unable to set dict value."); +                                GF_FREE (pending); +                        } +                } +                if (local_pending) { +                        ret = dict_set_bin (xattr[i], priv->pending_key[i], +                                            local_pending, +                                        AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                        if (ret < 0) { +                                gf_log (this->name, GF_LOG_WARNING, +                                        "Unable to set dict value."); +                                GF_FREE (local_pending); +                        }                  }          }          return 0; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index d104593ac70..cea554670b5 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -52,7 +52,7 @@ afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix,                    int32_t *success_children, int32_t *subvol_status);  int -afr_sh_delta_to_xattr (afr_private_t *priv, +afr_sh_delta_to_xattr (xlator_t *this,                         int32_t *delta_matrix[], dict_t *xattr[],  		       int child_count, afr_transaction_type type); diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 78798cba146..a5cbc5f0478 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -421,7 +421,7 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)                  }          } -        afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, +        afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr,                                 priv->child_count, AFR_DATA_TRANSACTION);          GF_ASSERT (call_count); diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 9fa128e4234..d528f9842a2 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -190,7 +190,7 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)          if (call_count == 0)                  need_unwind = 1; -        afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, +        afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr,                                 priv->child_count, AFR_ENTRY_TRANSACTION);          local->call_count = call_count; @@ -1125,7 +1125,8 @@ afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame,                  goto out;          } -        afr_set_pending_dict (priv, xattr, impunge_local->pending); +        afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src, +                              LOCAL_LAST);          STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk,                             (void *) (long) active_src, diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 79718315cbf..1bb22a09b79 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -166,7 +166,7 @@ afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)                  }          } -        afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr, +        afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr,                                 priv->child_count, AFR_METADATA_TRANSACTION);          local->call_count = call_count; diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index e0729fbf6dc..1d744d4eb2e 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -23,7 +23,6 @@                                        of RENAME */  #define LOCKED_LOWER    0x2        /* for lower_path of RENAME */ -  afr_fd_ctx_t *  __afr_fd_ctx_get (fd_t *fd, xlator_t *this)  { @@ -267,63 +266,41 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this)  }  int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending) +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, +                      int child, afr_xattrop_type_t op)  {          int i = 0;          int ret = 0; +        if (op == LOCAL_FIRST) { +                ret = dict_set_static_bin (xattr, priv->pending_key[child], +                                           pending[child], +                                   AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                if (ret) +                        goto out; +        }          for (i = 0; i < priv->child_count; i++) { +                if (i == child) +                        continue;                  ret = dict_set_static_bin (xattr, priv->pending_key[i],                                             pending[i], -                                AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                                   AFR_NUM_CHANGE_LOGS * sizeof (int32_t));                  /* 3 = data+metadata+entry */                  if (ret < 0)                          goto out;          } - -out: -        return ret; -} - - -static int -afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, -                        afr_transaction_type type) -{ -        int i = 0; -        int ret = 0; -        int *arr = NULL; -        int index = 0; -        size_t pending_xattr_size = 3 * sizeof (int32_t); -        /* 3 = data+metadata+entry */ - -        index = afr_index_for_transaction_type (type); - -        for (i = 0; i < priv->child_count; i++) { -                arr = GF_CALLOC (1, pending_xattr_size, -                                 gf_afr_mt_char); -                if (!arr) { -                        ret = -1; -                        goto out; -                } - -                memcpy (arr, pending[i], pending_xattr_size); - -                arr[index] = hton32 (ntoh32(arr[index]) + 1); - -                ret = dict_set_bin (xattr, priv->pending_key[i], -                                    arr, pending_xattr_size); - -                if (ret < 0) +        if (op == LOCAL_LAST) { +                ret = dict_set_static_bin (xattr, priv->pending_key[child], +                                           pending[child], +                                   AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); +                if (ret)                          goto out;          } -  out:          return ret;  } -  int  afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)  { @@ -491,12 +468,65 @@ afr_changelog_post_op_call_count (afr_transaction_type type,          return call_count;  } +void +afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) +{ +        int             i = 0; +        int             index = 0; +        int32_t         postop = 0; +        int32_t         preop = 1; +        int32_t         **txn_changelog = NULL; + +        txn_changelog = local->transaction.txn_changelog; +        index = afr_index_for_transaction_type (local->transaction.type); +        for (i = 0; i < priv->child_count; i++) { +                postop = ntoh32 (local->pending[i][index]); +                txn_changelog[i][index] = hton32 (postop + preop); +        } +} + +afr_xattrop_type_t +afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, +                             afr_transaction_type type) +{ +        int                     index = 0; +        afr_xattrop_type_t      op = LOCAL_LAST; + +        index = afr_index_for_transaction_type (type); +        if (optimized && !pending[child][index]) +                op = LOCAL_FIRST; +        return op; +} + +void +afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, +                     int optimized, int child) +{ +        int32_t                 **txn_changelog = NULL; +        int32_t                 **changelog = NULL; +        afr_private_t           *priv = NULL; +        int                     ret = 0; +        afr_xattrop_type_t      op = LOCAL_LAST; + +        priv = this->private; +        txn_changelog = local->transaction.txn_changelog; +        op = afr_get_postop_xattrop_type (local->pending, optimized, child, +                                          local->transaction.type); +        if (optimized) +                changelog = txn_changelog; +        else +                changelog = local->pending; +        ret = afr_set_pending_dict (priv, xattr, changelog, child, op); +        if (ret < 0) +                gf_log (this->name, GF_LOG_INFO, +                        "failed to set pending entry"); +} +  int  afr_changelog_post_op (call_frame_t *frame, xlator_t *this)  {          afr_private_t * priv = this->private;          afr_internal_lock_t *int_lock = NULL; -        int ret        = 0;          int i          = 0;          int call_count = 0; @@ -506,7 +536,6 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)          int            piggyback = 0;          int            index = 0;          int            nothing_failed = 1; -        int32_t        changelog = 0;          local    = frame->local;          int_lock = &local->internal_lock; @@ -550,26 +579,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)                  }          } -        index = afr_index_for_transaction_type (local->transaction.type); -        if (local->optimistic_change_log && -            local->transaction.type != AFR_DATA_TRANSACTION) { -                /* if nothing_failed, then local->pending[..] == {0 .. 0} */ -                for (i = 0; i < priv->child_count; i++) { -                        changelog = ntoh32 (local->pending[i][index]); -                        local->pending[i][index] = hton32 (changelog + 1); -                } -        } +        afr_compute_txn_changelog (local , priv);          for (i = 0; i < priv->child_count; i++) {                  if (!local->transaction.pre_op[i])                          continue; -                ret = afr_set_pending_dict (priv, xattr[i], local->pending); - -                if (ret < 0) -                        gf_log (this->name, GF_LOG_INFO, -                                "failed to set pending entry"); - +                if (local->transaction.type != AFR_DATA_TRANSACTION) +                        afr_set_postop_dict (local, this, xattr[i], +                                             local->optimistic_change_log, i);                  switch (local->transaction.type) {                  case AFR_DATA_TRANSACTION:                  { @@ -593,10 +611,8 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)                          }                          UNLOCK (&local->fd->lock); -                        if (piggyback && !nothing_failed) -                                ret = afr_set_piggyback_dict (priv, xattr[i], -                                                              local->pending, -                                                              local->transaction.type); +                        afr_set_postop_dict (local, this, xattr[i], +                                             piggyback, i);                          if (nothing_failed && piggyback) {                                  afr_changelog_post_op_cbk (frame, (void *)(long)i, @@ -616,7 +632,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)                  break;                  case AFR_METADATA_TRANSACTION:                  { -                        if (nothing_failed) { +                        if (nothing_failed && local->optimistic_change_log) {                                  afr_changelog_post_op_cbk (frame, (void *)(long)i,                                                             this, 1, 0, xattr[i],                                                             NULL); @@ -642,7 +658,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)                  case AFR_ENTRY_RENAME_TRANSACTION:                  { -                        if (nothing_failed) { +                        if (nothing_failed && local->optimistic_change_log) {                                  afr_changelog_post_op_cbk (frame, (void *)(long)i,                                                             this, 1, 0, xattr[i],                                                             NULL); @@ -666,17 +682,14 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)                    value                  */ -                ret = afr_set_pending_dict (priv, xattr[i], local->pending); - -                if (ret < 0) -                        gf_log (this->name, GF_LOG_INFO, -                                "failed to set pending entry"); +                afr_set_postop_dict (local, this, xattr[i], +                                     local->optimistic_change_log, i);                  /* fall through */                  case AFR_ENTRY_TRANSACTION:                  { -                        if (nothing_failed) { +                        if (nothing_failed && local->optimistic_change_log) {                                  afr_changelog_post_op_cbk (frame, (void *)(long)i,                                                             this, 1, 0, xattr[i],                                                             NULL); @@ -820,7 +833,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)          for (i = 0; i < priv->child_count; i++) {                  if (!locked_nodes[i])                          continue; -                ret = afr_set_pending_dict (priv, xattr[i], local->pending); +                ret = afr_set_pending_dict (priv, xattr[i], local->pending, +                                            i, LOCAL_FIRST);                  if (ret < 0)                          gf_log (this->name, GF_LOG_INFO, @@ -929,7 +943,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)                    value                  */ -                ret = afr_set_pending_dict (priv, xattr[i], local->pending); +                ret = afr_set_pending_dict (priv, xattr[i], local->pending, +                                            i, LOCAL_FIRST);                  if (ret < 0)                          gf_log (this->name, GF_LOG_INFO, @@ -1268,7 +1283,6 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index                             child_index, local->transaction.type);  } -  int  afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)  { diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 965ee67374c..d19d16fb39e 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -11,6 +11,11 @@  #ifndef __TRANSACTION_H__  #define __TRANSACTION_H__ +typedef enum { +        LOCAL_FIRST = 1, +        LOCAL_LAST = 2 +} afr_xattrop_type_t; +  void  afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,  			    int child_index); @@ -24,5 +29,6 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)  afr_fd_ctx_t *  afr_fd_ctx_get (fd_t *fd, xlator_t *this);  int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); +afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, +                      int child, afr_xattrop_type_t op);  #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index dfad29de117..27cb83a5791 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -670,12 +670,7 @@ typedef struct _afr_local {                  afr_transaction_type type; -                int success_count; -                int erase_pending; -                int failure_count; - -                int last_tried; -                int32_t *child_errno; +                int32_t         **txn_changelog;//changelog after pre+post ops                  unsigned char   *pre_op;                  call_frame_t *main_frame;  | 
