From c9c2c08d34003f49bc3a509757a135665fb20518 Mon Sep 17 00:00:00 2001 From: Ravishankar N Date: Tue, 5 Apr 2016 15:16:52 +0530 Subject: arbiter: write performance improvement Backport of: http://review.gluster.org/#/c/13906 Problem: The throughput for a 'dd' workload was much lower for the arbiter configuration when compared to a normal replica-3 volume. There were 2 issues: i) arbiter_writev was using the request dict as the response dict while unwinding, leading to incorrect GLUSTERFS_WRITE_IS_APPEND and GLUSTERFS_OPEN_FD_COUNT values (=4), leading to immediate post-ops because is_afr_delayed_changelog_post_op_needed() failed due to the afr_are_multiple_fds_opened() check. ii) The arbiter code in afr was setting local->transaction.{start and len} = 0 to take full file locks. What this meant was that even for simultaneous but non-overlapping writevs, afr_transaction_eager_lock_init() was not happening because afr_locals_overlap() always returned true. Consequently is_afr_delayed_changelog_post_op_needed() failed due to local->delayed_post_op not being set. Fix: i) Send appropriate response dict values in arbiter_writev. ii) Modify flock params instead of local->transaction.{start and len} to take full file locks in the transaction. Also changed _fill_writev_xdata() in posix to fill rsp_xdata for whichever keys are requested. 
Change-Id: I1c5fc5e98aba49ade540bb441a022e65b753432a BUG: 1324809 Signed-off-by: Ravishankar N Reported-by: Robert Rauch Reported-by: Russel Purinton Reviewed-on: http://review.gluster.org/13925 Smoke: Gluster Build System Reviewed-by: Pranith Kumar Karampuri CentOS-regression: Gluster Build System NetBSD-regression: NetBSD Build System --- xlators/cluster/afr/src/afr-inode-write.c | 8 ----- xlators/cluster/afr/src/afr-transaction.c | 16 ++++++--- xlators/features/arbiter/src/arbiter.c | 57 ++++++++++++++++++++++++++----- xlators/storage/posix/src/posix.c | 35 +++++++++++-------- 4 files changed, 82 insertions(+), 34 deletions(-) diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 4206ef2f111..084a78ecf47 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -417,7 +417,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) { call_frame_t *transaction_frame = NULL; afr_local_t *local = NULL; - afr_private_t *priv = NULL; int ret = -1; int op_errno = ENOMEM; @@ -426,7 +425,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) goto out; local = frame->local; - priv = this->private; transaction_frame->local = local; frame->local = NULL; @@ -456,12 +454,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->transaction.start = local->cont.writev.offset; local->transaction.len = iov_length (local->cont.writev.vector, local->cont.writev.count); - - /*Lock entire file to avoid network split brains.*/ - if (priv->arbiter_count == 1) { - local->transaction.start = 0; - local->transaction.len = 0; - } } ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 4c85a4b0d03..59d03e3ed9e 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1356,16 +1356,24 @@ afr_post_lower_unlock_cbk 
(call_frame_t *frame, xlator_t *this) int -afr_set_transaction_flock (afr_local_t *local) +afr_set_transaction_flock (xlator_t *this, afr_local_t *local) { afr_internal_lock_t *int_lock = NULL; afr_inodelk_t *inodelk = NULL; + afr_private_t *priv = NULL; int_lock = &local->internal_lock; inodelk = afr_get_inodelk (int_lock, int_lock->domain); + priv = this->private; - inodelk->flock.l_len = local->transaction.len; - inodelk->flock.l_start = local->transaction.start; + if (priv->arbiter_count) { + /*Lock entire file to avoid network split brains.*/ + inodelk->flock.l_len = 0; + inodelk->flock.l_start = 0; + } else { + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + } inodelk->flock.l_type = F_WRLCK; return 0; @@ -1386,7 +1394,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - afr_set_transaction_flock (local); + afr_set_transaction_flock (this, local); int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk; diff --git a/xlators/features/arbiter/src/arbiter.c b/xlators/features/arbiter/src/arbiter.c index 87145da5680..92dcbeaf9eb 100644 --- a/xlators/features/arbiter/src/arbiter.c +++ b/xlators/features/arbiter/src/arbiter.c @@ -133,8 +133,7 @@ arbiter_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, } buf = ctx->iattbuf; unwind: - STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, buf, buf, - xdata); + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, buf, buf, NULL); return 0; } @@ -157,10 +156,48 @@ arbiter_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, buf = ctx->iattbuf; unwind: STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, buf, buf, - xdata); + NULL); return 0; } +dict_t* +arbiter_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this) +{ + dict_t *rsp_xdata = NULL; + int32_t ret = 0; + int is_append = 1; + + if (!fd || !fd->inode || 
gf_uuid_is_null (fd->inode->gfid)) { + goto out; + } + + if (!xdata) + goto out; + + rsp_xdata = dict_new(); + if (!rsp_xdata) + goto out; + + if (dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) { + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_msg_debug (this->name, 0, "Failed to set dict value" + " for GLUSTERFS_OPEN_FD_COUNT"); + } + } + if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_msg_debug (this->name, 0, "Failed to set dict value" + " for GLUSTERFS_WRITE_IS_APPEND"); + } + } +out: + return rsp_xdata; +} + int32_t arbiter_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, int32_t count, off_t off, uint32_t flags, @@ -168,6 +205,7 @@ arbiter_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, { arbiter_inode_ctx_t *ctx = NULL; struct iatt *buf = NULL; + dict_t *rsp_xdata = NULL; int op_ret = 0; int op_errno = 0; @@ -179,8 +217,12 @@ arbiter_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, } buf = ctx->iattbuf; op_ret = iov_length (vector, count); + rsp_xdata = arbiter_fill_writev_xdata (fd, xdata, this); unwind: - STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, buf, buf, xdata); + STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, buf, buf, + rsp_xdata); + if (rsp_xdata) + dict_unref (rsp_xdata); return 0; } @@ -201,8 +243,7 @@ arbiter_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, } buf = ctx->iattbuf; unwind: - STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, buf, buf, - xdata); + STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, buf, buf, NULL); return 0; } @@ -223,7 +264,7 @@ arbiter_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, } buf = ctx->iattbuf; unwind: - STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, buf, buf, xdata); + STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, buf, buf, NULL); return 0; } @@ -244,7 
+285,7 @@ arbiter_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, } buf = ctx->iattbuf; unwind: - STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, buf, buf, xdata); + STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, buf, buf, NULL); return 0; } diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index c43029edd72..5249cc6ba8e 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -2997,28 +2997,35 @@ _fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) goto out; } - if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) + if (!xdata) goto out; rsp_xdata = dict_new(); if (!rsp_xdata) goto out; - ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, - fd->inode->fd_count); - if (ret < 0) { - gf_msg (this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, - "%s: Failed to set dictionary value for %s", - uuid_utoa (fd->inode->gfid), GLUSTERFS_OPEN_FD_COUNT); + if (dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) { + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, + fd->inode->fd_count); + if (ret < 0) { + gf_msg (this->name, GF_LOG_WARNING, 0, + P_MSG_DICT_SET_FAILED, "%s: Failed to set " + "dictionary value for %s", + uuid_utoa (fd->inode->gfid), + GLUSTERFS_OPEN_FD_COUNT); + } } - ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, - is_append); - if (ret < 0) { - gf_msg (this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, - "%s: Failed to set dictionary value for %s", - uuid_utoa (fd->inode->gfid), - GLUSTERFS_WRITE_IS_APPEND); + if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); + if (ret < 0) { + gf_msg (this->name, GF_LOG_WARNING, 0, + P_MSG_DICT_SET_FAILED, "%s: Failed to set " + "dictionary value for %s", + uuid_utoa (fd->inode->gfid), + GLUSTERFS_WRITE_IS_APPEND); + } } out: return rsp_xdata; -- cgit