summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr-transaction.c
diff options
context:
space:
mode:
authorPranith Kumar K <pranithk@gluster.com>2012-04-14 14:33:53 +0530
committerAnand Avati <avati@redhat.com>2012-05-18 20:30:30 -0700
commitfa90243e20eeef91eda5a5bb249bed05066852e3 (patch)
tree7add1693e67dc25e69b8c1f8ed6d6e096c84461f /xlators/cluster/afr/src/afr-transaction.c
parent24c1cbf4f7afd54a506a8265de9d22ce2b2e670f (diff)
cluster/afr: Enforce order in pre/post op
The xattrop order in pre/post op on all the subvols is client-0, client-1... client-n where n is (replica-count - 1). This order can lead to invalid split-brains if the brick dies in the middle of xattrops. Example: transaction completed pre-op, so on all the subvolumes xattrs have '1' changelog. Now post-op is sent to both the subvols. On subvol-0 change-log of client-0 is decremented to 0, before decrementing change-log of client-1 to 0 the brick dies. This change-log status on subvol-0 gives the meaning that a change is done on subvol-0 successfully but on subvol-1 it failed. Which is not what happened. Changes done when the subvol-0 was down will lead to pending change-log on subvol-1 for subvol-0. Which is correct. When the subvol-0 is brought back up, the change-log will be in split-brain state even when it is not a legitimate split-brain. If the brick dies in the middle of xattrops it should remain fool. Pre-op should perform xattrop of the local change-log first and post-op should perform xattrop of the local change-log last. In case of optimistic changelogs txn_changelog should be done last on local if it succeeds, first if it fails. Change-Id: Ib6eeb20cdc49b0b1fd2f454f25a9c8e08388c6e7 BUG: 765194 Signed-off-by: Pranith Kumar K <pranithk@gluster.com> Reviewed-on: http://review.gluster.com/3226 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Anand Avati <avati@redhat.com>
Diffstat (limited to 'xlators/cluster/afr/src/afr-transaction.c')
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c158
1 files changed, 86 insertions, 72 deletions
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index e0729fbf6..1d744d4eb 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -23,7 +23,6 @@
of RENAME */
#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */
-
afr_fd_ctx_t *
__afr_fd_ctx_get (fd_t *fd, xlator_t *this)
{
@@ -267,63 +266,41 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this)
}
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending)
+afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
+ int child, afr_xattrop_type_t op)
{
int i = 0;
int ret = 0;
+ if (op == LOCAL_FIRST) {
+ ret = dict_set_static_bin (xattr, priv->pending_key[child],
+ pending[child],
+ AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ if (ret)
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
+ if (i == child)
+ continue;
ret = dict_set_static_bin (xattr, priv->pending_key[i],
pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
/* 3 = data+metadata+entry */
if (ret < 0)
goto out;
}
-
-out:
- return ret;
-}
-
-
-static int
-afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
- afr_transaction_type type)
-{
- int i = 0;
- int ret = 0;
- int *arr = NULL;
- int index = 0;
- size_t pending_xattr_size = 3 * sizeof (int32_t);
- /* 3 = data+metadata+entry */
-
- index = afr_index_for_transaction_type (type);
-
- for (i = 0; i < priv->child_count; i++) {
- arr = GF_CALLOC (1, pending_xattr_size,
- gf_afr_mt_char);
- if (!arr) {
- ret = -1;
- goto out;
- }
-
- memcpy (arr, pending[i], pending_xattr_size);
-
- arr[index] = hton32 (ntoh32(arr[index]) + 1);
-
- ret = dict_set_bin (xattr, priv->pending_key[i],
- arr, pending_xattr_size);
-
- if (ret < 0)
+ if (op == LOCAL_LAST) {
+ ret = dict_set_static_bin (xattr, priv->pending_key[child],
+ pending[child],
+ AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ if (ret)
goto out;
}
-
out:
return ret;
}
-
int
afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
{
@@ -491,12 +468,65 @@ afr_changelog_post_op_call_count (afr_transaction_type type,
return call_count;
}
+void
+afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv)
+{
+ int i = 0;
+ int index = 0;
+ int32_t postop = 0;
+ int32_t preop = 1;
+ int32_t **txn_changelog = NULL;
+
+ txn_changelog = local->transaction.txn_changelog;
+ index = afr_index_for_transaction_type (local->transaction.type);
+ for (i = 0; i < priv->child_count; i++) {
+ postop = ntoh32 (local->pending[i][index]);
+ txn_changelog[i][index] = hton32 (postop + preop);
+ }
+}
+
+afr_xattrop_type_t
+afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child,
+ afr_transaction_type type)
+{
+ int index = 0;
+ afr_xattrop_type_t op = LOCAL_LAST;
+
+ index = afr_index_for_transaction_type (type);
+ if (optimized && !pending[child][index])
+ op = LOCAL_FIRST;
+ return op;
+}
+
+void
+afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr,
+ int optimized, int child)
+{
+ int32_t **txn_changelog = NULL;
+ int32_t **changelog = NULL;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ afr_xattrop_type_t op = LOCAL_LAST;
+
+ priv = this->private;
+ txn_changelog = local->transaction.txn_changelog;
+ op = afr_get_postop_xattrop_type (local->pending, optimized, child,
+ local->transaction.type);
+ if (optimized)
+ changelog = txn_changelog;
+ else
+ changelog = local->pending;
+ ret = afr_set_pending_dict (priv, xattr, changelog, child, op);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_INFO,
+ "failed to set pending entry");
+}
+
int
afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
{
afr_private_t * priv = this->private;
afr_internal_lock_t *int_lock = NULL;
- int ret = 0;
int i = 0;
int call_count = 0;
@@ -506,7 +536,6 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
int piggyback = 0;
int index = 0;
int nothing_failed = 1;
- int32_t changelog = 0;
local = frame->local;
int_lock = &local->internal_lock;
@@ -550,26 +579,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
}
}
- index = afr_index_for_transaction_type (local->transaction.type);
- if (local->optimistic_change_log &&
- local->transaction.type != AFR_DATA_TRANSACTION) {
- /* if nothing_failed, then local->pending[..] == {0 .. 0} */
- for (i = 0; i < priv->child_count; i++) {
- changelog = ntoh32 (local->pending[i][index]);
- local->pending[i][index] = hton32 (changelog + 1);
- }
- }
+ afr_compute_txn_changelog (local , priv);
for (i = 0; i < priv->child_count; i++) {
if (!local->transaction.pre_op[i])
continue;
- ret = afr_set_pending_dict (priv, xattr[i], local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
-
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ afr_set_postop_dict (local, this, xattr[i],
+ local->optimistic_change_log, i);
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
{
@@ -593,10 +611,8 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
}
UNLOCK (&local->fd->lock);
- if (piggyback && !nothing_failed)
- ret = afr_set_piggyback_dict (priv, xattr[i],
- local->pending,
- local->transaction.type);
+ afr_set_postop_dict (local, this, xattr[i],
+ piggyback, i);
if (nothing_failed && piggyback) {
afr_changelog_post_op_cbk (frame, (void *)(long)i,
@@ -616,7 +632,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
break;
case AFR_METADATA_TRANSACTION:
{
- if (nothing_failed) {
+ if (nothing_failed && local->optimistic_change_log) {
afr_changelog_post_op_cbk (frame, (void *)(long)i,
this, 1, 0, xattr[i],
NULL);
@@ -642,7 +658,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
case AFR_ENTRY_RENAME_TRANSACTION:
{
- if (nothing_failed) {
+ if (nothing_failed && local->optimistic_change_log) {
afr_changelog_post_op_cbk (frame, (void *)(long)i,
this, 1, 0, xattr[i],
NULL);
@@ -666,17 +682,14 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
value
*/
- ret = afr_set_pending_dict (priv, xattr[i], local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ afr_set_postop_dict (local, this, xattr[i],
+ local->optimistic_change_log, i);
/* fall through */
case AFR_ENTRY_TRANSACTION:
{
- if (nothing_failed) {
+ if (nothing_failed && local->optimistic_change_log) {
afr_changelog_post_op_cbk (frame, (void *)(long)i,
this, 1, 0, xattr[i],
NULL);
@@ -820,7 +833,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
for (i = 0; i < priv->child_count; i++) {
if (!locked_nodes[i])
continue;
- ret = afr_set_pending_dict (priv, xattr[i], local->pending);
+ ret = afr_set_pending_dict (priv, xattr[i], local->pending,
+ i, LOCAL_FIRST);
if (ret < 0)
gf_log (this->name, GF_LOG_INFO,
@@ -929,7 +943,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
value
*/
- ret = afr_set_pending_dict (priv, xattr[i], local->pending);
+ ret = afr_set_pending_dict (priv, xattr[i], local->pending,
+ i, LOCAL_FIRST);
if (ret < 0)
gf_log (this->name, GF_LOG_INFO,
@@ -1268,7 +1283,6 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index
child_index, local->transaction.type);
}
-
int
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
{