summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPranith Kumar K <pranithk@gluster.com>2012-04-14 14:33:53 +0530
committerAnand Avati <avati@redhat.com>2012-05-18 20:30:30 -0700
commitfa90243e20eeef91eda5a5bb249bed05066852e3 (patch)
tree7add1693e67dc25e69b8c1f8ed6d6e096c84461f
parent24c1cbf4f7afd54a506a8265de9d22ce2b2e670f (diff)
cluster/afr: Enforce order in pre/post op
The xattrop order in pre/post op on all the subvols is client-0, client-1... client-n where n is (replica-count - 1). This order can lead to invalid split-brains if the brick dies in the middle of xattrops. Example: transaction completed pre-op, so on all the subvolumes xattrs have '1' changelog. Now post-op is sent to both the subvols. On subvol-0 change-log of client-0 is decremented to 0, before decrementing change-log of client-1 to 0 the brick dies. This change-log status on subvol-0 gives the meaning that a change is done on subvol-0 successfully but on subvol-1 it failed. Which is not what happened. Changes done when the subvol-0 was down will lead to pending change-log on subvol-1 for subvol-0. Which is correct. When the subvol-0 is brought back up, the change-log will be in split-brain state even when it is not a legitimate split-brain. If the brick dies in the middle of xattrops it should remain fool. Pre-op should perform xattrop of the local change-log first and post-op should perform xattrop of the local change-log last. In case of optimistic changelogs txn_changelog should be done last on local if it succeeds, first if it fails. Change-Id: Ib6eeb20cdc49b0b1fd2f454f25a9c8e08388c6e7 BUG: 765194 Signed-off-by: Pranith Kumar K <pranithk@gluster.com> Reviewed-on: http://review.gluster.com/3226 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Anand Avati <avati@redhat.com>
-rw-r--r--xlators/cluster/afr/src/afr-common.c13
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.c38
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h2
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c2
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c5
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c2
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c158
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h8
-rw-r--r--xlators/cluster/afr/src/afr.h7
9 files changed, 135 insertions, 100 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 171137b91..1d1ac0881 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -798,6 +798,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
priv = this->private;
afr_matrix_cleanup (local->pending, priv->child_count);
+ afr_matrix_cleanup (local->transaction.txn_changelog,
+ priv->child_count);
if (local->internal_lock.locked_nodes)
GF_FREE (local->internal_lock.locked_nodes);
@@ -813,7 +815,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
GF_FREE (local->transaction.pre_op);
- GF_FREE (local->transaction.child_errno);
GF_FREE (local->transaction.eager_lock);
GF_FREE (local->transaction.basename);
@@ -3924,12 +3925,10 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
if (!local->pending)
goto out;
- local->transaction.child_errno =
- GF_CALLOC (sizeof (*local->transaction.child_errno),
- priv->child_count,
- gf_afr_mt_int32_t);
- local->transaction.erase_pending = 1;
-
+ local->transaction.txn_changelog = afr_matrix_create (priv->child_count,
+ AFR_NUM_CHANGE_LOGS);
+ if (!local->transaction.txn_changelog)
+ goto out;
ret = 0;
out:
return ret;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index e0af66952..5d0db8f63 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -847,20 +847,24 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
int
-afr_sh_delta_to_xattr (afr_private_t *priv,
+afr_sh_delta_to_xattr (xlator_t *this,
int32_t *delta_matrix[], dict_t *xattr[],
int child_count, afr_transaction_type type)
{
- int i = 0;
- int j = 0;
- int k = 0;
- int ret = 0;
- int32_t *pending = NULL;
+ int i = 0;
+ int j = 0;
+ int k = 0;
+ int ret = 0;
+ int32_t *pending = NULL;
+ int32_t *local_pending = NULL;
+ afr_private_t *priv = NULL;
+ priv = this->private;
for (i = 0; i < child_count; i++) {
if (!xattr[i])
continue;
+ local_pending = NULL;
for (j = 0; j < child_count; j++) {
pending = GF_CALLOC (sizeof (int32_t), 3,
gf_afr_mt_int32_t);
@@ -873,12 +877,28 @@ afr_sh_delta_to_xattr (afr_private_t *priv,
pending[k] = hton32 (delta_matrix[i][j]);
+ if (j == i) {
+ local_pending = pending;
+ continue;
+ }
ret = dict_set_bin (xattr[i], priv->pending_key[j],
pending,
- 3 * sizeof (int32_t));
- if (ret < 0)
- gf_log (THIS->name, GF_LOG_WARNING,
+ AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
"Unable to set dict value.");
+ GF_FREE (pending);
+ }
+ }
+ if (local_pending) {
+ ret = dict_set_bin (xattr[i], priv->pending_key[i],
+ local_pending,
+ AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to set dict value.");
+ GF_FREE (local_pending);
+ }
}
}
return 0;
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
index d104593ac..cea554670 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ b/xlators/cluster/afr/src/afr-self-heal-common.h
@@ -52,7 +52,7 @@ afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix,
int32_t *success_children, int32_t *subvol_status);
int
-afr_sh_delta_to_xattr (afr_private_t *priv,
+afr_sh_delta_to_xattr (xlator_t *this,
int32_t *delta_matrix[], dict_t *xattr[],
int child_count, afr_transaction_type type);
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 78798cba1..a5cbc5f04 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -421,7 +421,7 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
}
}
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
+ afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr,
priv->child_count, AFR_DATA_TRANSACTION);
GF_ASSERT (call_count);
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 9fa128e42..d528f9842 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -190,7 +190,7 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
if (call_count == 0)
need_unwind = 1;
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
+ afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr,
priv->child_count, AFR_ENTRY_TRANSACTION);
local->call_count = call_count;
@@ -1125,7 +1125,8 @@ afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame,
goto out;
}
- afr_set_pending_dict (priv, xattr, impunge_local->pending);
+ afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src,
+ LOCAL_LAST);
STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk,
(void *) (long) active_src,
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 79718315c..1bb22a09b 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -166,7 +166,7 @@ afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
}
}
- afr_sh_delta_to_xattr (priv, sh->delta_matrix, erase_xattr,
+ afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr,
priv->child_count, AFR_METADATA_TRANSACTION);
local->call_count = call_count;
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index e0729fbf6..1d744d4eb 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -23,7 +23,6 @@
of RENAME */
#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */
-
afr_fd_ctx_t *
__afr_fd_ctx_get (fd_t *fd, xlator_t *this)
{
@@ -267,63 +266,41 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this)
}
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending)
+afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
+ int child, afr_xattrop_type_t op)
{
int i = 0;
int ret = 0;
+ if (op == LOCAL_FIRST) {
+ ret = dict_set_static_bin (xattr, priv->pending_key[child],
+ pending[child],
+ AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ if (ret)
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
+ if (i == child)
+ continue;
ret = dict_set_static_bin (xattr, priv->pending_key[i],
pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
/* 3 = data+metadata+entry */
if (ret < 0)
goto out;
}
-
-out:
- return ret;
-}
-
-
-static int
-afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
- afr_transaction_type type)
-{
- int i = 0;
- int ret = 0;
- int *arr = NULL;
- int index = 0;
- size_t pending_xattr_size = 3 * sizeof (int32_t);
- /* 3 = data+metadata+entry */
-
- index = afr_index_for_transaction_type (type);
-
- for (i = 0; i < priv->child_count; i++) {
- arr = GF_CALLOC (1, pending_xattr_size,
- gf_afr_mt_char);
- if (!arr) {
- ret = -1;
- goto out;
- }
-
- memcpy (arr, pending[i], pending_xattr_size);
-
- arr[index] = hton32 (ntoh32(arr[index]) + 1);
-
- ret = dict_set_bin (xattr, priv->pending_key[i],
- arr, pending_xattr_size);
-
- if (ret < 0)
+ if (op == LOCAL_LAST) {
+ ret = dict_set_static_bin (xattr, priv->pending_key[child],
+ pending[child],
+ AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ if (ret)
goto out;
}
-
out:
return ret;
}
-
int
afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
{
@@ -491,12 +468,65 @@ afr_changelog_post_op_call_count (afr_transaction_type type,
return call_count;
}
+void
+afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv)
+{
+ int i = 0;
+ int index = 0;
+ int32_t postop = 0;
+ int32_t preop = 1;
+ int32_t **txn_changelog = NULL;
+
+ txn_changelog = local->transaction.txn_changelog;
+ index = afr_index_for_transaction_type (local->transaction.type);
+ for (i = 0; i < priv->child_count; i++) {
+ postop = ntoh32 (local->pending[i][index]);
+ txn_changelog[i][index] = hton32 (postop + preop);
+ }
+}
+
+afr_xattrop_type_t
+afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child,
+ afr_transaction_type type)
+{
+ int index = 0;
+ afr_xattrop_type_t op = LOCAL_LAST;
+
+ index = afr_index_for_transaction_type (type);
+ if (optimized && !pending[child][index])
+ op = LOCAL_FIRST;
+ return op;
+}
+
+void
+afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr,
+ int optimized, int child)
+{
+ int32_t **txn_changelog = NULL;
+ int32_t **changelog = NULL;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ afr_xattrop_type_t op = LOCAL_LAST;
+
+ priv = this->private;
+ txn_changelog = local->transaction.txn_changelog;
+ op = afr_get_postop_xattrop_type (local->pending, optimized, child,
+ local->transaction.type);
+ if (optimized)
+ changelog = txn_changelog;
+ else
+ changelog = local->pending;
+ ret = afr_set_pending_dict (priv, xattr, changelog, child, op);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_INFO,
+ "failed to set pending entry");
+}
+
int
afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
{
afr_private_t * priv = this->private;
afr_internal_lock_t *int_lock = NULL;
- int ret = 0;
int i = 0;
int call_count = 0;
@@ -506,7 +536,6 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
int piggyback = 0;
int index = 0;
int nothing_failed = 1;
- int32_t changelog = 0;
local = frame->local;
int_lock = &local->internal_lock;
@@ -550,26 +579,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
}
}
- index = afr_index_for_transaction_type (local->transaction.type);
- if (local->optimistic_change_log &&
- local->transaction.type != AFR_DATA_TRANSACTION) {
- /* if nothing_failed, then local->pending[..] == {0 .. 0} */
- for (i = 0; i < priv->child_count; i++) {
- changelog = ntoh32 (local->pending[i][index]);
- local->pending[i][index] = hton32 (changelog + 1);
- }
- }
+ afr_compute_txn_changelog (local , priv);
for (i = 0; i < priv->child_count; i++) {
if (!local->transaction.pre_op[i])
continue;
- ret = afr_set_pending_dict (priv, xattr[i], local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
-
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ afr_set_postop_dict (local, this, xattr[i],
+ local->optimistic_change_log, i);
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
{
@@ -593,10 +611,8 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
}
UNLOCK (&local->fd->lock);
- if (piggyback && !nothing_failed)
- ret = afr_set_piggyback_dict (priv, xattr[i],
- local->pending,
- local->transaction.type);
+ afr_set_postop_dict (local, this, xattr[i],
+ piggyback, i);
if (nothing_failed && piggyback) {
afr_changelog_post_op_cbk (frame, (void *)(long)i,
@@ -616,7 +632,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
break;
case AFR_METADATA_TRANSACTION:
{
- if (nothing_failed) {
+ if (nothing_failed && local->optimistic_change_log) {
afr_changelog_post_op_cbk (frame, (void *)(long)i,
this, 1, 0, xattr[i],
NULL);
@@ -642,7 +658,7 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
case AFR_ENTRY_RENAME_TRANSACTION:
{
- if (nothing_failed) {
+ if (nothing_failed && local->optimistic_change_log) {
afr_changelog_post_op_cbk (frame, (void *)(long)i,
this, 1, 0, xattr[i],
NULL);
@@ -666,17 +682,14 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
value
*/
- ret = afr_set_pending_dict (priv, xattr[i], local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ afr_set_postop_dict (local, this, xattr[i],
+ local->optimistic_change_log, i);
/* fall through */
case AFR_ENTRY_TRANSACTION:
{
- if (nothing_failed) {
+ if (nothing_failed && local->optimistic_change_log) {
afr_changelog_post_op_cbk (frame, (void *)(long)i,
this, 1, 0, xattr[i],
NULL);
@@ -820,7 +833,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
for (i = 0; i < priv->child_count; i++) {
if (!locked_nodes[i])
continue;
- ret = afr_set_pending_dict (priv, xattr[i], local->pending);
+ ret = afr_set_pending_dict (priv, xattr[i], local->pending,
+ i, LOCAL_FIRST);
if (ret < 0)
gf_log (this->name, GF_LOG_INFO,
@@ -929,7 +943,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
value
*/
- ret = afr_set_pending_dict (priv, xattr[i], local->pending);
+ ret = afr_set_pending_dict (priv, xattr[i], local->pending,
+ i, LOCAL_FIRST);
if (ret < 0)
gf_log (this->name, GF_LOG_INFO,
@@ -1268,7 +1283,6 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index
child_index, local->transaction.type);
}
-
int
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
{
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index 965ee6737..d19d16fb3 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -11,6 +11,11 @@
#ifndef __TRANSACTION_H__
#define __TRANSACTION_H__
+typedef enum {
+ LOCAL_FIRST = 1,
+ LOCAL_LAST = 2
+} afr_xattrop_type_t;
+
void
afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
int child_index);
@@ -24,5 +29,6 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
afr_fd_ctx_t *
afr_fd_ctx_get (fd_t *fd, xlator_t *this);
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);
+afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
+ int child, afr_xattrop_type_t op);
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index dfad29de1..27cb83a57 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -670,12 +670,7 @@ typedef struct _afr_local {
afr_transaction_type type;
- int success_count;
- int erase_pending;
- int failure_count;
-
- int last_tried;
- int32_t *child_errno;
+ int32_t **txn_changelog;//changelog after pre+post ops
unsigned char *pre_op;
call_frame_t *main_frame;