summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnand V. Avati <avati@amp.gluster.com>2010-10-18 00:16:31 +0000
committerVijay Bellur <vijay@dev.gluster.com>2010-10-18 03:25:46 -0700
commitf213c1b051d7e91e33a2e4631a9ef383ae48921e (patch)
treebd4fb173a3f2d099d6b2b7a7b8fe2c5dd32f2435
parent9be13aff89232c5ede11bdb37c49c2e5dca5d840 (diff)
replicate: replace first-write-to-flush optimization
use a changelog piggybacking optimization instead of first-write-to-flush optimization and do other cleanups (removal of post-post-op hook etc.) Signed-off-by: Anand V. Avati <avati@amp.gluster.com> Signed-off-by: Vijay Bellur <vijay@dev.gluster.com> BUG: 1235 (Bug for all pump/migrate commits) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=1235
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c16
-rw-r--r--xlators/cluster/afr/src/afr-open.c77
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c742
-rw-r--r--xlators/cluster/afr/src/afr.c128
-rw-r--r--xlators/cluster/afr/src/afr.h27
5 files changed, 423 insertions, 567 deletions
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index a7751e68e9c..8ebba6d4ad2 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -305,13 +305,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- if (fd_ctx->down_count < priv->down_count) {
- local->up_down_flush_cbk = afr_do_writev;
- afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH);
-
- } else if (fd_ctx->up_count < priv->up_count) {
- local->up_down_flush_cbk = afr_do_writev;
- afr_up_down_flush (frame, this, fd, AFR_CHILD_UP_FLUSH);
+ if (fd_ctx->up_count < priv->up_count) {
+ local->openfd_flush_cbk = afr_do_writev;
+ afr_openfd_flush (frame, this, fd);
} else {
afr_do_writev (frame, this);
@@ -787,9 +783,9 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- if (fd_ctx->down_count < priv->down_count) {
- local->up_down_flush_cbk = afr_do_ftruncate;
- afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH);
+ if (fd_ctx->up_count < priv->up_count) {
+ local->openfd_flush_cbk = afr_do_ftruncate;
+ afr_openfd_flush (frame, this, fd);
} else {
afr_do_ftruncate (frame, this);
}
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index 43a38c0b112..00958438a4c 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -215,9 +215,8 @@ out:
int
-afr_up_down_flush_open_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd)
+afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
@@ -257,7 +256,7 @@ out:
call_count = afr_frame_return (frame);
if (call_count == 0) {
- local->transaction.post_post_op (frame, this);
+ local->transaction.resume (frame, this);
}
return 0;
@@ -265,7 +264,7 @@ out:
static int
-__unopened_count (int child_count, unsigned char *opened_on, unsigned char *child_up)
+__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up)
{
int i;
int count = 0;
@@ -280,7 +279,7 @@ __unopened_count (int child_count, unsigned char *opened_on, unsigned char *chil
int
-afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)
+afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
@@ -311,8 +310,17 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- call_count = __unopened_count (priv->child_count, fd_ctx->opened_on,
- local->child_up);
+ LOCK (&local->fd->lock);
+ {
+ call_count = __unopened_count (priv->child_count,
+ fd_ctx->opened_on,
+ local->child_up);
+ for (i = 0; i < priv->child_count; i++) {
+ fd_ctx->pre_op_done[i] = 0;
+ fd_ctx->pre_op_piggyback[i] = 0;
+ }
+ }
+ UNLOCK (&local->fd->lock);
if (call_count == 0) {
abandon = 1;
@@ -332,7 +340,7 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)
"opening fd for %s on subvolume %s",
local->loc.path, priv->children[i]->name);
- STACK_WIND_COOKIE (frame, afr_up_down_flush_open_cbk,
+ STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk,
(void *)(long) i,
priv->children[i],
priv->children[i]->fops->open,
@@ -346,14 +354,14 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)
out:
if (abandon)
- local->transaction.post_post_op (frame, this);
+ local->transaction.resume (frame, this);
return 0;
}
int
-afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)
+afr_openfd_sh (call_frame_t *frame, xlator_t *this)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -369,7 +377,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)
if (ret < 0) {
gf_log (this->name, GF_LOG_TRACE,
"Inode path failed. Possible open-unlink-write detected");
- afr_up_down_flush_sh_unwind (frame, this);
+ afr_openfd_sh_unwind (frame, this);
return 0;
}
@@ -386,7 +394,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)
sh->need_data_self_heal = _gf_true;
sh->mode = local->fd->inode->st_mode;
sh->background = _gf_false;
- sh->unwind = afr_up_down_flush_sh_unwind;
+ sh->unwind = afr_openfd_sh_unwind;
afr_self_heal (frame, this);
@@ -395,21 +403,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)
int
-afr_up_down_flush_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- local->transaction.resume (frame, this);
- return 0;
-}
-
-
-int
-afr_up_down_flush_done (call_frame_t *frame, xlator_t *this)
+afr_openfd_flush_done (call_frame_t *frame, xlator_t *this)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -418,7 +412,6 @@ afr_up_down_flush_done (call_frame_t *frame, xlator_t *this)
afr_fd_ctx_t * fd_ctx = NULL;
int _ret = -1;
- int i = 0;
priv = this->private;
local = frame->local;
@@ -435,26 +428,20 @@ afr_up_down_flush_done (call_frame_t *frame, xlator_t *this)
fd_ctx->down_count = priv->down_count;
fd_ctx->up_count = priv->up_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i])
- fd_ctx->pre_op_done[i] = 0;
- }
}
out:
UNLOCK (&local->fd->lock);
afr_local_transaction_cleanup (local, this);
- local->up_down_flush_cbk (frame, this);
+ local->openfd_flush_cbk (frame, this);
return 0;
}
int
-afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd,
- afr_flush_type type)
+afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
{
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
@@ -473,18 +460,8 @@ afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd,
// local->fd = fd_ref (local->fd);
- local->transaction.fop = afr_up_down_flush_wind;
- local->transaction.done = afr_up_down_flush_done;
-
- switch (type) {
- case AFR_CHILD_UP_FLUSH:
- local->transaction.post_post_op = afr_up_down_flush_post_post_op;
- break;
-
- case AFR_CHILD_DOWN_FLUSH:
- local->transaction.post_post_op = NULL;
- break;
- }
+ local->transaction.fop = afr_openfd_sh;
+ local->transaction.done = afr_openfd_flush_done;
local->transaction.start = 0;
local->transaction.len = 0;
@@ -493,7 +470,7 @@ afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd,
"doing up/down flush on fd=%p",
fd);
- afr_transaction (frame, this, AFR_FLUSH_TRANSACTION);
+ afr_transaction (frame, this, AFR_DATA_TRANSACTION);
op_ret = 0;
out:
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 75a059f303c..53a2b380d70 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -33,6 +33,25 @@
#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+{
+ uint64_t ctx = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = 0;
+
+ ret = fd_ctx_get (fd, this, &ctx);
+
+ if (ret < 0)
+ goto out;
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+out:
+ return fd_ctx;
+}
+
+
static void
afr_pid_save (call_frame_t *frame)
{
@@ -82,79 +101,53 @@ __mark_child_dead (int32_t *pending[], int child_count, int child,
static void
-__mark_fop_failed_on_fd (fd_t *fd, xlator_t *this,
- int child_index)
-{
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- int ret = 0;
-
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0)
- goto out;
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->child_failed[child_index] = 1;
-out:
- return;
-}
-
-
-static void
-__mark_failed_children (int32_t *pending[], int child_count,
- xlator_t *this, fd_t *fd, afr_transaction_type type)
+__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
{
- uint64_t ctx;
+ afr_local_t *local = NULL;
afr_fd_ctx_t * fd_ctx = NULL;
- int ret = 0;
- int i = 0;
- int j = 0;
+ local = frame->local;
- ret = fd_ctx_get (fd, this, &ctx);
+ if (!local->fd)
+ return;
- if (ret < 0)
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
+ if (!fd_ctx)
goto out;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
-
- if (fd_ctx->child_failed[i])
- pending[i][j] = 0;
+ LOCK (&local->fd->lock);
+ {
+ if (local->transaction.type == AFR_DATA_TRANSACTION)
+ fd_ctx->pre_op_done[child_index]++;
}
-
+ UNLOCK (&local->fd->lock);
+
out:
return;
}
static void
-__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
{
afr_local_t *local = NULL;
-
- uint64_t ctx;
afr_fd_ctx_t * fd_ctx = NULL;
- int ret = 0;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &ctx);
+ if (!local->fd)
+ return;
- if (ret < 0)
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
+ if (!fd_ctx)
goto out;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if ((local->op == GF_FOP_WRITE)
- || (local->op == GF_FOP_FTRUNCATE)) {
- fd_ctx->pre_op_done[child_index] = 1;
+ LOCK (&local->fd->lock);
+ {
+ if (local->transaction.type == AFR_DATA_TRANSACTION)
+ fd_ctx->pre_op_done[child_index]--;
}
+ UNLOCK (&local->fd->lock);
out:
return;
@@ -191,115 +184,6 @@ __mark_all_success (int32_t *pending[], int child_count,
}
-static int
-__is_first_write_on_fd (xlator_t *this, fd_t *fd)
-{
- int op_ret = 0;
- int _ret = -1;
- int i = 0;
-
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
-
- if (_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not get fd ctx on fd=%p",
- fd);
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- op_ret = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (fd_ctx->pre_op_done[i] == 0)
- continue;
-
- op_ret = 0;
- }
- }
-out:
- UNLOCK (&fd->lock);
-
- return op_ret;
-}
-
-
-static int
-__if_fd_pre_op_done (xlator_t *this, fd_t *fd, int child_index)
-{
- int op_ret = 0;
- int _ret = -1;
-
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
-
- if (_ret < 0) {
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (fd_ctx->pre_op_done[child_index]) {
- op_ret = 1;
- }
- fd_ctx->pre_op_done[child_index] = 0;
- }
-out:
- UNLOCK (&fd->lock);
-
- return op_ret;
-}
-
-
-static int
-afr_pre_op_done_count (xlator_t *this, fd_t *fd, unsigned char *child_up)
-{
- int i = 0;
- int count = 0;
-
- int _ret = 0;
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
-
- if (_ret < 0) {
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- for (i = 0; i < priv->child_count; i++) {
- if (fd_ctx->pre_op_done[i] && child_up[i]) {
- count++;
- }
- }
- }
-out:
- UNLOCK (&fd->lock);
-
- return count;
-}
-
static int
__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
@@ -325,9 +209,6 @@ __changelog_enabled (afr_private_t *priv, afr_transaction_type type)
ret = 1;
break;
-
- case AFR_FLUSH_TRANSACTION:
- ret = 1;
}
return ret;
@@ -339,7 +220,6 @@ __changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
{
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
- fd_t * fd = NULL;
int op_ret = 0;
@@ -351,15 +231,7 @@ __changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
case GF_FOP_WRITE:
case GF_FOP_FTRUNCATE:
- /*
- if it's a data transaction, we write the changelog
- only on the first write on an fd
- */
-
- fd = local->fd;
- if (!fd || __is_first_write_on_fd (this, fd))
- op_ret = 1;
-
+ op_ret = 1;
break;
case GF_FOP_FLUSH:
@@ -395,11 +267,11 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
case GF_FOP_WRITE:
case GF_FOP_FTRUNCATE:
- op_ret = 0;
+ op_ret = 1;
break;
case GF_FOP_FLUSH:
- op_ret = 1;
+ op_ret = 0;
break;
default:
@@ -412,6 +284,39 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
static int
+afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
+ afr_transaction_type type)
+{
+ int i;
+ int ret = 0;
+ int *arr = NULL;
+ int index = 0;
+
+ index = afr_index_for_transaction_type (type);
+
+ for (i = 0; i < priv->child_count; i++) {
+ arr = CALLOC (3 * sizeof (int32_t), priv->child_count);
+ if (!arr) {
+ ret = -1;
+ goto out;
+ }
+
+ memcpy (arr, pending[i], 3 * sizeof (int32_t));
+ arr[index]++;
+ ret = dict_set_bin (xattr, priv->pending_key[i],
+ arr, 3 * sizeof (int32_t));
+ /* 3 = data+metadata+entry */
+
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+
+static int
afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending)
{
int i;
@@ -437,7 +342,6 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
int ret = 0;
switch (type) {
- case AFR_FLUSH_TRANSACTION:
case AFR_DATA_TRANSACTION:
ret = priv->data_lock_server_count;
break;
@@ -562,7 +466,6 @@ afr_unlock (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
if (local->transaction.locked_nodes[i] & LOCKED_YES) {
if (local->fd) {
@@ -671,11 +574,10 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
+ int child_index = -1;
int call_count = -1;
- int (*post_post_op) (call_frame_t *, xlator_t *);
-
priv = this->private;
local = frame->local;
@@ -685,25 +587,23 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK (&frame->lock);
- if (call_count == 0) {
- if (local->transaction.post_post_op) {
- post_post_op = local->transaction.post_post_op;
+ child_index = (long) cookie;
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.post_post_op = local->transaction.done;
- } else {
- local->transaction.post_post_op = afr_unlock;
- }
+ if (op_ret == 1) {
+ /* cached */
+ }
+
+ if (op_ret == 0) {
+ __mark_pre_op_undone_on_fd (frame, this, child_index);
+ }
- post_post_op (frame, this);
+ if (call_count == 0) {
+ if (afr_lock_server_count (priv, local->transaction.type) == 0) {
+ local->transaction.done (frame, this);
} else {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- afr_unlock (frame, this);
- }
+ afr_unlock (frame, this);
}
- }
+ }
return 0;
}
@@ -720,18 +620,17 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
afr_local_t * local = NULL;
dict_t **xattr = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
+ int piggyback = 0;
+ int index = 0;
+ int nothing_failed = 1;
+
local = frame->local;
__mark_down_children (local->pending, priv->child_count,
local->child_up, local->transaction.type);
- if (local->op == GF_FOP_FLUSH) {
- __mark_failed_children (local->pending, priv->child_count,
- this, local->fd,
- local->transaction.type);
- }
-
xattr = alloca (priv->child_count * sizeof (*xattr));
memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
for (i = 0; i < priv->child_count; i++) {
@@ -739,16 +638,15 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
dict_ref (xattr[i]);
}
- if (local->op == GF_FOP_FLUSH) {
- call_count = afr_pre_op_done_count (this, local->fd, local->child_up);
- } else {
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- call_count *= 2;
- }
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ call_count *= 2;
}
+ if (local->fd)
+ fdctx = afr_fd_ctx_get (local->fd, this);
+
local->call_count = call_count;
if (call_count == 0) {
@@ -761,100 +659,136 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
return 0;
}
+ /* check if something has failed, to handle piggybacking */
+ nothing_failed = 1;
+ index = afr_index_for_transaction_type (local->transaction.type);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->pending[i][index] == 0) {
+ nothing_failed = 0;
+ break;
+ }
+ }
+
+
for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- call_count--;
- }
- break;
+ if (!local->child_up[i])
+ continue;
+
+ ret = afr_set_pending_dict (priv, xattr[i],
+ local->pending);
+
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ {
+ if (!fdctx) {
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ break;
+ }
- case AFR_FLUSH_TRANSACTION:
- {
- if (__if_fd_pre_op_done (this, local->fd, i)) {
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- call_count--;
+ LOCK (&local->fd->lock);
+ {
+ piggyback = 0;
+ if (fdctx->pre_op_piggyback[i]) {
+ fdctx->pre_op_piggyback[i]--;
+ piggyback = 1;
}
- }
- break;
+ }
+ UNLOCK (&local->fd->lock);
+
+ if (piggyback && !nothing_failed)
+ ret = afr_set_piggyback_dict (priv, xattr[i],
+ local->pending,
+ local->transaction.type);
+ if (nothing_failed && piggyback) {
+ afr_changelog_post_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ } else {
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_post_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
+ }
+ break;
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
-
- call_count--;
- }
+ case AFR_METADATA_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ call_count--;
+ }
+ break;
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
-
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
-
- /* fall through */
-
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- call_count--;
- }
- break;
- }
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ call_count--;
+ }
- if (!call_count)
- break;
- }
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
+ ret = afr_set_pending_dict (priv, xattr[i],
+ local->pending);
+
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to set pending entry");
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ call_count--;
+ }
+ break;
+ }
+
+ if (!call_count)
+ break;
}
for (i = 0; i < priv->child_count; i++) {
@@ -881,6 +815,10 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
+ if (op_ret == 1) {
+ /* special op_ret for piggyback */
+ }
+
if (op_ret == 0) {
__mark_pre_op_done_on_fd (frame, this, child_index);
}
@@ -934,6 +872,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
int ret = 0;
int call_count = 0;
dict_t **xattr = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
+ int piggyback = 0;
afr_local_t *local = NULL;
@@ -969,97 +909,139 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
__mark_all_pending (local->pending, priv->child_count,
local->transaction.type);
+ if (local->fd)
+ fdctx = afr_fd_ctx_get (local->fd, this);
+
for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
+ if (!local->child_up[i])
+ continue;
+
+ ret = afr_set_pending_dict (priv, xattr[i],
+ local->pending);
+
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ {
+ if (!fdctx) {
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &(local->loc),
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ break;
+ }
+
+ LOCK (&local->fd->lock);
+ {
+ piggyback = 0;
+ if (fdctx->pre_op_done[i]) {
+ fdctx->pre_op_piggyback[i]++;
+ piggyback = 1;
+ fdctx->hit++;
+ } else {
+ fdctx->miss++;
+ }
+ }
+ UNLOCK (&local->fd->lock);
+
+ if (piggyback)
+ afr_changelog_pre_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
+ break;
+
+ case AFR_METADATA_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &(local->loc),
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
+ break;
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
-
- call_count--;
- }
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+
+ call_count--;
+ }
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
+ ret = afr_set_pending_dict (priv, xattr[i],
+ local->pending);
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to set pending entry");
- /* fall through */
+ /* fall through */
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
- break;
- }
+ break;
+ }
- if (!--call_count)
- break;
- }
+ if (!--call_count)
+ break;
}
for (i = 0; i < priv->child_count; i++) {
@@ -1294,7 +1276,6 @@ int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
if (local->fd) {
STACK_WIND_COOKIE (frame, afr_lock_cbk,
@@ -1421,15 +1402,8 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index
local = frame->local;
priv = this->private;
- switch (local->op) {
- case GF_FOP_WRITE:
- __mark_fop_failed_on_fd (local->fd, this, child_index);
- break;
- default:
- __mark_child_dead (local->pending, priv->child_count,
- child_index, local->transaction.type);
- break;
- }
+ __mark_child_dead (local->pending, priv->child_count,
+ child_index, local->transaction.type);
}
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 90c7d7ab511..ced89bdb1c9 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -1121,10 +1121,10 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto unlock;
}
- fd_ctx->child_failed = CALLOC (sizeof (*fd_ctx->child_failed),
- priv->child_count);
+ fd_ctx->pre_op_piggyback = CALLOC (sizeof (*fd_ctx->pre_op_piggyback),
+ priv->child_count);
- if (!fd_ctx->child_failed) {
+ if (!fd_ctx->pre_op_piggyback) {
gf_log (this->name, GF_LOG_ERROR,
"Out of memory");
@@ -1278,73 +1278,6 @@ afr_flush_done (call_frame_t *frame, xlator_t *this)
int
-afr_plain_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (flush, frame, local->op_ret, local->op_errno);
-
- return 0;
-}
-
-
-static int
-__no_pre_op_done (xlator_t *this, fd_t *fd)
-{
- int i = 0;
- int op_ret = 1;
-
- int _ret = 0;
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
-
- if (_ret < 0) {
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- for (i = 0; i < priv->child_count; i++) {
- if (fd_ctx->pre_op_done[i]) {
- op_ret = 0;
- break;
- }
- }
- }
-out:
- UNLOCK (&fd->lock);
-
- return op_ret;
-}
-
-
-int
afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
{
afr_private_t * priv = NULL;
@@ -1357,7 +1290,6 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
int op_ret = -1;
int op_errno = 0;
- int i = 0;
int call_count = 0;
VALIDATE_OR_GOTO (frame, out);
@@ -1376,45 +1308,29 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (__no_pre_op_done (this, fd)) {
- frame->local = local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_plain_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- fd);
- if (!--call_count)
- break;
- }
- }
- } else {
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory.");
+ goto out;
+ }
- transaction_frame->local = local;
+ transaction_frame->local = local;
- local->op = GF_FOP_FLUSH;
+ local->op = GF_FOP_FLUSH;
- local->transaction.fop = afr_flush_wind;
- local->transaction.done = afr_flush_done;
- local->transaction.unwind = afr_flush_unwind;
+ local->transaction.fop = afr_flush_wind;
+ local->transaction.done = afr_flush_done;
+ local->transaction.unwind = afr_flush_unwind;
- local->fd = fd_ref (fd);
+ local->fd = fd_ref (fd);
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = 0;
- afr_transaction (transaction_frame, this, AFR_FLUSH_TRANSACTION);
- }
+ afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
op_ret = 0;
out:
@@ -1446,8 +1362,8 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
if (fd_ctx) {
- if (fd_ctx->child_failed)
- FREE (fd_ctx->child_failed);
+ if (fd_ctx->pre_op_piggyback)
+ FREE (fd_ctx->pre_op_piggyback);
if (fd_ctx->pre_op_done)
FREE (fd_ctx->pre_op_done);
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 6e1f810b67d..17310c8fd96 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -162,7 +162,6 @@ typedef enum {
AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
AFR_ENTRY_RENAME_TRANSACTION, /* rename */
- AFR_FLUSH_TRANSACTION, /* flush */
} afr_transaction_type;
@@ -179,7 +178,6 @@ afr_index_for_transaction_type (afr_transaction_type type)
switch (type) {
case AFR_DATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
return 0;
case AFR_METADATA_TRANSACTION:
@@ -194,12 +192,6 @@ afr_index_for_transaction_type (afr_transaction_type type)
}
-typedef enum {
- AFR_CHILD_UP_FLUSH,
- AFR_CHILD_DOWN_FLUSH,
-} afr_flush_type;
-
-
typedef struct _afr_local {
unsigned int call_count;
unsigned int success_count;
@@ -235,7 +227,7 @@ typedef struct _afr_local {
int32_t inodelk_count;
int32_t entrylk_count;
- int (*up_down_flush_cbk) (call_frame_t *, xlator_t *);
+ int (*openfd_flush_cbk) (call_frame_t *, xlator_t *);
/*
This struct contains the arguments for the "continuation"
@@ -535,8 +527,6 @@ typedef struct _afr_local {
int (*unwind) (call_frame_t *frame, xlator_t *this);
- /* post-op hook */
- int (*post_post_op) (call_frame_t *frame, xlator_t *this);
} transaction;
afr_self_heal_t self_heal;
@@ -544,15 +534,17 @@ typedef struct _afr_local {
typedef struct {
- unsigned char *pre_op_done;
- unsigned char *opened_on; /* which subvolumes the fd is open on */
- unsigned char *child_failed;
+ unsigned int *pre_op_done;
+ unsigned int *opened_on; /* which subvolumes the fd is open on */
+ unsigned int *pre_op_piggyback;
int flags;
int32_t wbflags;
uint64_t up_count; /* number of CHILD_UPs this fd has seen */
uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
int32_t last_tried;
+
+ int hit, miss;
gf_boolean_t failed_over;
struct list_head entries; /* needed for readdir failover */
} afr_fd_ctx_t;
@@ -623,9 +615,6 @@ int
afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
fd_t *fd, int32_t wbflags);
-int
-afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, afr_flush_type type);
-
void
afr_set_opendir_done (xlator_t *this, inode_t *inode);
@@ -638,6 +627,10 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
int
afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
+int
+afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd);
+
+
#define AFR_STACK_UNWIND(fop, frame, params ...) \
do { \
afr_local_t *__local = NULL; \