summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnand V. Avati <avati@blackhole.gluster.com>2010-09-29 00:28:07 +0000
committerVijay Bellur <vijay@dev.gluster.com>2010-09-29 01:43:24 -0700
commitaee339605337916aaa1e38a0e9ed2422f0f0dcfb (patch)
treea667cceff6b38226cc1cc3f03a1d1c78daeb9414
parent25e4eefc40de8fc47260c1d8209679269686c162 (diff)
replicate: replace first-write-to-flush optimization
use a changelog piggybacking optimization instead of first-write-to-flush optimization and do other cleanups (removal of post-post-op hook etc.) Signed-off-by: Anand V. Avati <avati@blackhole.gluster.com> Signed-off-by: Anand V. Avati <avati@amp.gluster.com> Signed-off-by: Vijay Bellur <vijay@dev.gluster.com> BUG: 1235 (Bug for all pump/migrate commits) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=1235
-rw-r--r--xlators/cluster/afr/src/afr-common.c142
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c17
-rw-r--r--xlators/cluster/afr/src/afr-lk-common.c4
-rw-r--r--xlators/cluster/afr/src/afr-open.c61
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c768
-rw-r--r--xlators/cluster/afr/src/afr.h24
6 files changed, 422 insertions, 594 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 13eb57e3e..b5f060a87 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -1188,25 +1188,22 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto unlock;
}
- fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->opened_on) {
+ fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!fd_ctx->pre_op_piggyback) {
gf_log (this->name, GF_LOG_ERROR,
"Out of memory");
ret = -ENOMEM;
goto unlock;
}
- fd_ctx->child_failed = GF_CALLOC (
- sizeof (*fd_ctx->child_failed),
- priv->child_count,
- gf_afr_mt_char);
-
- if (!fd_ctx->child_failed) {
+ fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!fd_ctx->opened_on) {
gf_log (this->name, GF_LOG_ERROR,
"Out of memory");
-
ret = -ENOMEM;
goto unlock;
}
@@ -1352,73 +1349,6 @@ afr_flush_done (call_frame_t *frame, xlator_t *this)
int
-afr_plain_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-
-{
- afr_local_t *local = NULL;
-
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- AFR_STACK_UNWIND (flush, frame, local->op_ret, local->op_errno);
-
- return 0;
-}
-
-
-static int
-__no_pre_op_done (xlator_t *this, fd_t *fd)
-{
- int i = 0;
- int op_ret = 1;
-
- int _ret = 0;
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
-
- if (_ret < 0) {
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- for (i = 0; i < priv->child_count; i++) {
- if (fd_ctx->pre_op_done[i]) {
- op_ret = 0;
- break;
- }
- }
- }
-out:
- UNLOCK (&fd->lock);
-
- return op_ret;
-}
-
-
-int
afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
{
afr_private_t * priv = NULL;
@@ -1431,7 +1361,6 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
int op_ret = -1;
int op_errno = 0;
- int i = 0;
int call_count = 0;
VALIDATE_OR_GOTO (frame, out);
@@ -1450,45 +1379,30 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (__no_pre_op_done (this, fd)) {
- frame->local = local;
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory.");
+ goto out;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_plain_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- fd);
- if (!--call_count)
- break;
- }
- }
- } else {
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- goto out;
- }
+ transaction_frame->local = local;
- transaction_frame->local = local;
+ local->op = GF_FOP_FLUSH;
- local->op = GF_FOP_FLUSH;
+ local->transaction.fop = afr_flush_wind;
+ local->transaction.done = afr_flush_done;
+ local->transaction.unwind = afr_flush_unwind;
- local->transaction.fop = afr_flush_wind;
- local->transaction.done = afr_flush_done;
- local->transaction.unwind = afr_flush_unwind;
+ local->fd = fd_ref (fd);
- local->fd = fd_ref (fd);
+ local->transaction.main_frame = frame;
+ local->transaction.start = 0;
+ local->transaction.len = 0;
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+ afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- afr_transaction (transaction_frame, this, AFR_FLUSH_TRANSACTION);
- }
op_ret = 0;
out:
@@ -1519,10 +1433,10 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- if (fd_ctx) {
- if (fd_ctx->child_failed)
- GF_FREE (fd_ctx->child_failed);
+ gf_log (this->name, GF_LOG_TRACE,
+ "hits=%d, miss=%d", fd_ctx->hit, fd_ctx->miss);
+ if (fd_ctx) {
if (fd_ctx->pre_op_done)
GF_FREE (fd_ctx->pre_op_done);
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 587e48400..5a2c40cc4 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -291,14 +291,9 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- if (fd_ctx->down_count < priv->down_count) {
- local->up_down_flush_cbk = afr_do_writev;
- afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH);
-
- } else if (fd_ctx->up_count < priv->up_count) {
- local->up_down_flush_cbk = afr_do_writev;
- afr_up_down_flush (frame, this, fd, AFR_CHILD_UP_FLUSH);
-
+ if (fd_ctx->up_count < priv->up_count) {
+ local->openfd_flush_cbk = afr_do_writev;
+ afr_openfd_flush (frame, this, fd);
} else {
afr_do_writev (frame, this);
}
@@ -769,9 +764,9 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this,
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- if (fd_ctx->down_count < priv->down_count) {
- local->up_down_flush_cbk = afr_do_ftruncate;
- afr_up_down_flush (frame, this, fd, AFR_CHILD_DOWN_FLUSH);
+ if (fd_ctx->up_count < priv->up_count) {
+ local->openfd_flush_cbk = afr_do_ftruncate;
+ afr_openfd_flush (frame, this, fd);
} else {
afr_do_ftruncate (frame, this);
}
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index 749c6bf9a..ee53d1d7b 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -433,7 +433,6 @@ is_afr_lock_transaction (afr_local_t *local)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
ret = 1;
break;
@@ -878,7 +877,6 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
memcpy (int_lock->inode_locked_nodes,
int_lock->locked_nodes,
priv->child_count);
@@ -998,7 +996,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
if (local->fd) {
afr_trace_inodelk_in (frame, AFR_INODELK_TRANSACTION,
@@ -1110,7 +1107,6 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
initialize_inodelk_variables (frame, this);
break;
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index 69bbb56b8..d943213b2 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -216,9 +216,8 @@ out:
int
-afr_up_down_flush_open_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd)
+afr_openfd_sh_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
@@ -261,7 +260,7 @@ out:
if (call_count == 0) {
int_lock->lock_cbk = local->transaction.done;
- local->transaction.post_post_op (frame, this);
+ local->transaction.resume (frame, this);
}
return 0;
@@ -269,7 +268,7 @@ out:
static int
-__unopened_count (int child_count, unsigned char *opened_on, unsigned char *child_up)
+__unopened_count (int child_count, unsigned int *opened_on, unsigned char *child_up)
{
int i;
int count = 0;
@@ -284,7 +283,7 @@ __unopened_count (int child_count, unsigned char *opened_on, unsigned char *chil
int
-afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)
+afr_openfd_sh_unwind (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
@@ -331,7 +330,7 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)
"opening fd for %s on subvolume %s",
local->loc.path, priv->children[i]->name);
- STACK_WIND_COOKIE (frame, afr_up_down_flush_open_cbk,
+ STACK_WIND_COOKIE (frame, afr_openfd_sh_open_cbk,
(void *)(long) i,
priv->children[i],
priv->children[i]->fops->open,
@@ -345,14 +344,14 @@ afr_up_down_flush_sh_unwind (call_frame_t *frame, xlator_t *this)
out:
if (abandon)
- local->transaction.post_post_op (frame, this);
+ local->transaction.resume (frame, this);
return 0;
}
int
-afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)
+afr_openfd_sh (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_self_heal_t *sh = NULL;
@@ -375,7 +374,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)
sh->need_data_self_heal = _gf_true;
sh->type = local->fd->inode->ia_type;
sh->background = _gf_false;
- sh->unwind = afr_up_down_flush_sh_unwind;
+ sh->unwind = afr_openfd_sh_unwind;
afr_self_heal_type_str_get(&local->self_heal,
sh_type_str,
@@ -391,19 +390,7 @@ afr_up_down_flush_post_post_op (call_frame_t *frame, xlator_t *this)
int
-afr_up_down_flush_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.resume (frame, this);
- return 0;
-}
-
-
-int
-afr_up_down_flush_done (call_frame_t *frame, xlator_t *this)
+afr_openfd_flush_done (call_frame_t *frame, xlator_t *this)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -412,7 +399,6 @@ afr_up_down_flush_done (call_frame_t *frame, xlator_t *this)
afr_fd_ctx_t * fd_ctx = NULL;
int _ret = -1;
- int i = 0;
priv = this->private;
local = frame->local;
@@ -429,11 +415,6 @@ afr_up_down_flush_done (call_frame_t *frame, xlator_t *this)
fd_ctx->down_count = priv->down_count;
fd_ctx->up_count = priv->up_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i])
- fd_ctx->pre_op_done[i] = 0;
- }
}
out:
UNLOCK (&local->fd->lock);
@@ -444,15 +425,14 @@ out:
"The up/down flush is over");
fd_unref (local->fd);
- local->up_down_flush_cbk (frame, this);
+ local->openfd_flush_cbk (frame, this);
return 0;
}
int
-afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd,
- afr_flush_type type)
+afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd)
{
afr_local_t * local = NULL;
@@ -466,18 +446,8 @@ afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd,
local->fd = fd_ref (fd);
- local->transaction.fop = afr_up_down_flush_wind;
- local->transaction.done = afr_up_down_flush_done;
-
- switch (type) {
- case AFR_CHILD_UP_FLUSH:
- local->transaction.post_post_op = afr_up_down_flush_post_post_op;
- break;
-
- case AFR_CHILD_DOWN_FLUSH:
- local->transaction.post_post_op = NULL;
- break;
- }
+ local->transaction.fop = afr_openfd_sh;
+ local->transaction.done = afr_openfd_flush_done;
local->transaction.start = 0;
local->transaction.len = 0;
@@ -486,8 +456,9 @@ afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd,
"doing up/down flush on fd=%p",
fd);
- afr_transaction (frame, this, AFR_FLUSH_TRANSACTION);
+ afr_transaction (frame, this, AFR_DATA_TRANSACTION);
out:
return 0;
}
+
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index b46153267..e31f0c1df 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -33,6 +33,25 @@
#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+{
+ uint64_t ctx = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = 0;
+
+ ret = fd_ctx_get (fd, this, &ctx);
+
+ if (ret < 0)
+ goto out;
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+
+out:
+ return fd_ctx;
+}
+
+
static void
afr_pid_save (call_frame_t *frame)
{
@@ -82,80 +101,54 @@ __mark_child_dead (int32_t *pending[], int child_count, int child,
static void
-__mark_fop_failed_on_fd (fd_t *fd, xlator_t *this,
- int child_index)
+__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
{
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- int ret = 0;
-
- ret = fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0)
- goto out;
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->child_failed[child_index] = 1;
-out:
- return;
-}
-
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
-static void
-__mark_failed_children (int32_t *pending[], int child_count,
- xlator_t *this, fd_t *fd, afr_transaction_type type)
-{
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
+ local = frame->local;
- int ret = 0;
- int i = 0;
- int j = 0;
+ if (!local->fd)
+ return;
- ret = fd_ctx_get (fd, this, &ctx);
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
- if (ret < 0)
+ if (!fd_ctx)
goto out;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
-
- if (fd_ctx->child_failed[i])
- pending[i][j] = 0;
+ LOCK (&local->fd->lock);
+ {
+ if (local->transaction.type == AFR_DATA_TRANSACTION)
+ fd_ctx->pre_op_done[child_index]++;
}
-
+ UNLOCK (&local->fd->lock);
out:
return;
}
static void
-__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
{
- afr_local_t *local = NULL;
-
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &ctx);
+ if (!local->fd)
+ return;
- if (ret < 0)
- goto out;
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ if (!fd_ctx)
+ goto out;
- if ((local->op == GF_FOP_WRITE)
- || (local->op == GF_FOP_FTRUNCATE)) {
- fd_ctx->pre_op_done[child_index] = 1;
+ LOCK (&local->fd->lock);
+ {
+ if (local->transaction.type == AFR_DATA_TRANSACTION)
+ fd_ctx->pre_op_done[child_index]--;
}
-
+ UNLOCK (&local->fd->lock);
out:
return;
}
@@ -192,116 +185,6 @@ __mark_all_success (int32_t *pending[], int child_count,
static int
-__is_first_write_on_fd (xlator_t *this, fd_t *fd)
-{
- int op_ret = 0;
- int _ret = -1;
- int i = 0;
-
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
-
- if (_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "could not get fd ctx on fd=%p",
- fd);
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- op_ret = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (fd_ctx->pre_op_done[i] == 0)
- continue;
-
- op_ret = 0;
- }
- }
-out:
- UNLOCK (&fd->lock);
-
- return op_ret;
-}
-
-
-static int
-__if_fd_pre_op_done (xlator_t *this, fd_t *fd, int child_index)
-{
- int op_ret = 0;
- int _ret = -1;
-
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
-
- if (_ret < 0) {
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (fd_ctx->pre_op_done[child_index]) {
- op_ret = 1;
- }
- fd_ctx->pre_op_done[child_index] = 0;
- }
-out:
- UNLOCK (&fd->lock);
-
- return op_ret;
-}
-
-
-static int
-afr_pre_op_done_count (xlator_t *this, fd_t *fd, unsigned char *child_up)
-{
- int i = 0;
- int count = 0;
-
- int _ret = 0;
- uint64_t ctx;
- afr_fd_ctx_t * fd_ctx = NULL;
-
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- LOCK (&fd->lock);
- {
- _ret = __fd_ctx_get (fd, this, &ctx);
-
- if (_ret < 0) {
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- for (i = 0; i < priv->child_count; i++) {
- if (fd_ctx->pre_op_done[i] && child_up[i]) {
- count++;
- }
- }
- }
-out:
- UNLOCK (&fd->lock);
-
- return count;
-}
-
-
-static int
__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
{
int ret = 0;
@@ -325,9 +208,6 @@ __changelog_enabled (afr_private_t *priv, afr_transaction_type type)
ret = 1;
break;
-
- case AFR_FLUSH_TRANSACTION:
- ret = 1;
}
return ret;
@@ -339,7 +219,6 @@ __changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
{
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
- fd_t * fd = NULL;
int op_ret = 0;
@@ -351,20 +230,10 @@ __changelog_needed_pre_op (call_frame_t *frame, xlator_t *this)
case GF_FOP_WRITE:
case GF_FOP_FTRUNCATE:
- /*
- if it's a data transaction, we write the changelog
- only on the first write on an fd
- */
-
- fd = local->fd;
- if (!fd || __is_first_write_on_fd (this, fd))
- op_ret = 1;
-
+ op_ret = 1;
break;
case GF_FOP_FLUSH:
- /* only do post-op on flush() */
-
op_ret = 0;
break;
@@ -395,11 +264,11 @@ __changelog_needed_post_op (call_frame_t *frame, xlator_t *this)
case GF_FOP_WRITE:
case GF_FOP_FTRUNCATE:
- op_ret = 0;
+ op_ret = 1;
break;
case GF_FOP_FLUSH:
- op_ret = 1;
+ op_ret = 0;
break;
default:
@@ -431,13 +300,48 @@ out:
}
+static int
+afr_set_piggyback_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
+ afr_transaction_type type)
+{
+ int i;
+ int ret = 0;
+ int *arr = NULL;
+ int index = 0;
+
+ index = afr_index_for_transaction_type (type);
+
+ for (i = 0; i < priv->child_count; i++) {
+ arr = GF_CALLOC (3 * sizeof (int32_t), priv->child_count,
+ gf_afr_mt_char);
+ if (!arr) {
+ ret = -1;
+ goto out;
+ }
+
+ memcpy (arr, pending[i], 3 * sizeof (int32_t));
+
+ arr[index]++;
+
+ ret = dict_set_bin (xattr, priv->pending_key[i],
+ arr, 3 * sizeof (int32_t));
+ /* 3 = data+metadata+entry */
+
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+
int
afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
{
int ret = 0;
switch (type) {
- case AFR_FLUSH_TRANSACTION:
case AFR_DATA_TRANSACTION:
ret = priv->child_count;
break;
@@ -464,15 +368,23 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_internal_lock_t *int_lock = NULL;
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
+ int child_index = 0;
int call_count = -1;
- int (*post_post_op) (call_frame_t *, xlator_t *);
-
priv = this->private;
local = frame->local;
int_lock = &local->internal_lock;
+ child_index = (long) cookie;
+
+ if (op_ret == 1) {
+ }
+
+ if (op_ret == 0) {
+ __mark_pre_op_undone_on_fd (frame, this, child_index);
+ }
+
LOCK (&frame->lock);
{
call_count = --local->call_count;
@@ -480,24 +392,11 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
UNLOCK (&frame->lock);
if (call_count == 0) {
- if (local->transaction.post_post_op) {
- post_post_op = local->transaction.post_post_op;
-
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.post_post_op = local->transaction.done;
- } else {
- int_lock->lock_cbk = local->transaction.done;
- local->transaction.post_post_op = afr_unlock;
- }
-
- post_post_op (frame, this);
+ if (afr_lock_server_count (priv, local->transaction.type) == 0) {
+ local->transaction.done (frame, this);
} else {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
+ int_lock->lock_cbk = local->transaction.done;
+ afr_unlock (frame, this);
}
}
@@ -515,7 +414,11 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
int call_count = 0;
afr_local_t * local = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
dict_t **xattr = NULL;
+ int piggyback = 0;
+ int index = 0;
+ int nothing_failed = 1;
local = frame->local;
int_lock = &local->internal_lock;
@@ -523,12 +426,6 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
__mark_down_children (local->pending, priv->child_count,
local->child_up, local->transaction.type);
- if (local->op == GF_FOP_FLUSH) {
- __mark_failed_children (local->pending, priv->child_count,
- this, local->fd,
- local->transaction.type);
- }
-
xattr = alloca (priv->child_count * sizeof (*xattr));
memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
for (i = 0; i < priv->child_count; i++) {
@@ -536,18 +433,17 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
dict_ref (xattr[i]);
}
- if (local->op == GF_FOP_FLUSH) {
- call_count = afr_pre_op_done_count (this, local->fd, local->child_up);
- } else {
- call_count = afr_up_children_count (priv->child_count, local->child_up);
+ call_count = afr_up_children_count (priv->child_count, local->child_up);
- if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- call_count *= 2;
- }
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ call_count *= 2;
}
local->call_count = call_count;
+ if (local->fd)
+ fdctx = afr_fd_ctx_get (local->fd, this);
+
if (call_count == 0) {
/* no child is up */
for (i = 0; i < priv->child_count; i++) {
@@ -559,100 +455,134 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
return 0;
}
+ /* check if something has failed, to handle piggybacking */
+ nothing_failed = 1;
+ index = afr_index_for_transaction_type (local->transaction.type);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->pending[i][index] == 0) {
+ nothing_failed = 0;
+ break;
+ }
+ }
+
for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- call_count--;
- }
- break;
+ if (!local->child_up[i])
+ continue;
- case AFR_FLUSH_TRANSACTION:
- {
- if (__if_fd_pre_op_done (this, local->fd, i)) {
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- call_count--;
+ ret = afr_set_pending_dict (priv, xattr[i],
+ local->pending);
+
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ {
+ if (!fdctx) {
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ break;
+ }
+
+ LOCK (&local->fd->lock);
+ {
+ piggyback = 0;
+ if (fdctx->pre_op_piggyback[i]) {
+ fdctx->pre_op_piggyback[i]--;
+ piggyback = 1;
}
- }
- break;
+ }
+ UNLOCK (&local->fd->lock);
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
+ if (piggyback && !nothing_failed)
+ ret = afr_set_piggyback_dict (priv, xattr[i],
+ local->pending,
+ local->transaction.type);
- call_count--;
- }
+ if (nothing_failed && piggyback) {
+ afr_changelog_post_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ } else {
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_post_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
+ }
+ break;
+ case AFR_METADATA_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
+ break;
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
-
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
-
- /* fall through */
-
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- call_count--;
- }
- break;
- }
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ call_count--;
+ }
- if (!call_count)
- break;
- }
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
+
+ ret = afr_set_pending_dict (priv, xattr[i],
+ local->pending);
+
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to set pending entry");
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ else
+ STACK_WIND (frame, afr_changelog_post_op_cbk,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
+ break;
+ }
+
+ if (!--call_count)
+ break;
}
for (i = 0; i < priv->child_count; i++) {
@@ -679,6 +609,10 @@ afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
+ if (op_ret == 1) {
+ /* special op_ret for piggyback */
+ }
+
if (op_ret == 0) {
__mark_pre_op_done_on_fd (frame, this, child_index);
}
@@ -731,8 +665,10 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
int ret = 0;
int call_count = 0;
dict_t **xattr = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
afr_local_t *local = NULL;
+ int piggyback = 0;
local = frame->local;
@@ -768,97 +704,136 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
__mark_all_pending (local->pending, priv->child_count,
local->transaction.type);
+ if (local->fd)
+ fdctx = afr_fd_ctx_get (local->fd, this);
+
for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
-
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
+ if (!local->child_up[i])
+ continue;
+ ret = afr_set_pending_dict (priv, xattr[i],
+ local->pending);
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
-
- call_count--;
- }
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to set pending entry");
+
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ {
+ if (!fdctx) {
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &(local->loc),
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ break;
+ }
+
+ LOCK (&local->fd->lock);
+ {
+ piggyback = 0;
+ if (fdctx->pre_op_done[i]) {
+ fdctx->pre_op_piggyback[i]++;
+ piggyback = 1;
+ fdctx->hit++;
+ } else {
+ fdctx->miss++;
+ }
+ }
+ UNLOCK (&local->fd->lock);
+
+ if (piggyback)
+ afr_changelog_pre_op_cbk (frame, (void *)(long)i,
+ this, 1, 0, xattr[i]);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
+ break;
+ case AFR_METADATA_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &(local->loc),
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ {
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+
+ call_count--;
+ }
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
-
- ret = afr_set_pending_dict (priv, xattr[i],
- local->pending);
-
- if (ret < 0)
- gf_log (this->name, GF_LOG_DEBUG,
- "failed to set pending entry");
-
- /* fall through */
-
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i]);
- }
- break;
- }
+ /*
+ set it again because previous stack_wind
+ might have already returned (think of case
+ where subvolume is posix) and would have
+ used the dict as placeholder for return
+ value
+ */
- if (!--call_count)
- break;
- }
+ ret = afr_set_pending_dict (priv, xattr[i],
+ local->pending);
+
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to set pending entry");
+
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ {
+ if (local->fd)
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ else
+ STACK_WIND_COOKIE (frame,
+ afr_changelog_pre_op_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr[i]);
+ }
+ break;
+ }
+
+ if (!--call_count)
+ break;
}
for (i = 0; i < priv->child_count; i++) {
@@ -1038,7 +1013,6 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
afr_set_transaction_flock (local);
int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk;
@@ -1098,7 +1072,7 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
afr_changelog_pre_op (frame, this);
} else {
__mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
+ local->transaction.type);
afr_pid_restore (frame);
@@ -1148,15 +1122,8 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index
local = frame->local;
priv = this->private;
- switch (local->op) {
- case GF_FOP_WRITE:
- __mark_fop_failed_on_fd (local->fd, this, child_index);
- break;
- default:
- __mark_child_dead (local->pending, priv->child_count,
- child_index, local->transaction.type);
- break;
- }
+ __mark_child_dead (local->pending, priv->child_count,
+ child_index, local->transaction.type);
}
@@ -1175,16 +1142,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
local->transaction.type = type;
if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- if (__changelog_needed_pre_op (frame, this)) {
- afr_changelog_pre_op (frame, this);
- } else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
-
- afr_pid_restore (frame);
-
- local->transaction.fop (frame, this);
- }
+ afr_internal_lock_finish (frame, this);
} else {
afr_lock (frame, this);
}
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index db762c11e..749264a8d 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -174,7 +174,6 @@ typedef enum {
AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
AFR_ENTRY_RENAME_TRANSACTION, /* rename */
- AFR_FLUSH_TRANSACTION, /* flush */
} afr_transaction_type;
typedef enum {
@@ -217,7 +216,6 @@ afr_index_for_transaction_type (afr_transaction_type type)
switch (type) {
case AFR_DATA_TRANSACTION:
- case AFR_FLUSH_TRANSACTION:
return 0;
case AFR_METADATA_TRANSACTION:
@@ -232,11 +230,6 @@ afr_index_for_transaction_type (afr_transaction_type type)
}
-typedef enum {
- AFR_CHILD_UP_FLUSH,
- AFR_CHILD_DOWN_FLUSH,
-} afr_flush_type;
-
typedef struct {
loc_t *lk_loc;
struct flock lk_flock;
@@ -309,7 +302,7 @@ typedef struct _afr_local {
dict_t *dict;
- int (*up_down_flush_cbk) (call_frame_t *, xlator_t *);
+ int (*openfd_flush_cbk) (call_frame_t *frame, xlator_t *this);
/*
This struct contains the arguments for the "continuation"
@@ -606,7 +599,6 @@ typedef struct _afr_local {
int (*unwind) (call_frame_t *frame, xlator_t *this);
/* post-op hook */
- int (*post_post_op) (call_frame_t *frame, xlator_t *this);
} transaction;
afr_self_heal_t self_heal;
@@ -614,15 +606,17 @@ typedef struct _afr_local {
typedef struct {
- unsigned char *pre_op_done;
- unsigned char *opened_on; /* which subvolumes the fd is open on */
- unsigned char *child_failed;
+ unsigned int *pre_op_done;
+ unsigned int *opened_on; /* which subvolumes the fd is open on */
+ unsigned int *pre_op_piggyback;
int flags;
int32_t wbflags;
uint64_t up_count; /* number of CHILD_UPs this fd has seen */
uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
int32_t last_tried;
+
+ int hit, miss;
gf_boolean_t failed_over;
struct list_head entries; /* needed for readdir failover */
} afr_fd_ctx_t;
@@ -729,9 +723,6 @@ int
afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
fd_t *fd, int32_t wbflags);
-int
-afr_up_down_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, afr_flush_type type);
-
void
afr_set_opendir_done (xlator_t *this, inode_t *inode);
@@ -744,6 +735,9 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
int
afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
+int
+afr_openfd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd);
+
#define AFR_STACK_UNWIND(fop, frame, params ...) \
do { \
afr_local_t *__local = NULL; \