summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2016-04-05 15:16:52 +0530
committerPranith Kumar Karampuri <pkarampu@redhat.com>2016-04-11 04:32:37 -0700
commite1004679563ef17c460f83098983baf105655712 (patch)
tree86b6d63da1aee0131fcc285203f90213f06328d8
parent09c9da3b12da73dd718522fdf7587d3be5a14137 (diff)
arbiter: write performance improvement
Problem: The throughput for a 'dd' workload was much lower for an arbiter configuration than for a normal replica-3 volume. There were 2 issues: i) arbiter_writev was using the request dict as the response dict while unwinding, leading to incorrect GLUSTERFS_WRITE_IS_APPEND and GLUSTERFS_OPEN_FD_COUNT values (=4), leading to immediate post-ops because is_afr_delayed_changelog_post_op_needed() failed due to the afr_are_multiple_fds_opened() check. ii) The arbiter code in afr was setting local->transaction.{start and len} = 0 to take full file locks. What this meant was that even for simultaneous but non-overlapping writevs, afr_transaction_eager_lock_init() was not happening because afr_locals_overlap() always stays true. Consequently is_afr_delayed_changelog_post_op_needed() failed due to local->delayed_post_op not being set. Fix: i) Send appropriate response dict values in arbiter_writev. ii) Modify flock params instead of local->transaction.{start and len} to take full file locks in the transaction. Also changed _fill_writev_xdata() in posix to fill rsp_xdata for whichever keys are requested. Change-Id: I1c5fc5e98aba49ade540bb441a022e65b753432a BUG: 1324004 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Reported-by: Robert Rauch <robert.rauch@gns-systems.de> Reported-by: Russel Purinton <russell.purinton@gmail.com> Reviewed-on: http://review.gluster.org/13906 Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com> Smoke: Gluster Build System <jenkins@build.gluster.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
-rw-r--r--xlators/cluster/afr/src/afr-inode-write.c8
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c16
-rw-r--r--xlators/features/arbiter/src/arbiter.c57
-rw-r--r--xlators/storage/posix/src/posix.c35
4 files changed, 82 insertions, 34 deletions
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index e0928425c6a..15bae87a4f4 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -412,7 +412,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
{
call_frame_t *transaction_frame = NULL;
afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
int ret = -1;
int op_errno = ENOMEM;
@@ -421,7 +420,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
goto out;
local = frame->local;
- priv = this->private;
transaction_frame->local = local;
frame->local = NULL;
@@ -451,12 +449,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
local->transaction.start = local->cont.writev.offset;
local->transaction.len = iov_length (local->cont.writev.vector,
local->cont.writev.count);
-
- /*Lock entire file to avoid network split brains.*/
- if (priv->arbiter_count == 1) {
- local->transaction.start = 0;
- local->transaction.len = 0;
- }
}
ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 92f68b91113..9c27f07b5d4 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -1356,16 +1356,24 @@ afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this)
int
-afr_set_transaction_flock (afr_local_t *local)
+afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
{
afr_internal_lock_t *int_lock = NULL;
afr_inodelk_t *inodelk = NULL;
+ afr_private_t *priv = NULL;
int_lock = &local->internal_lock;
inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ priv = this->private;
- inodelk->flock.l_len = local->transaction.len;
- inodelk->flock.l_start = local->transaction.start;
+ if (priv->arbiter_count) {
+ /*Lock entire file to avoid network split brains.*/
+ inodelk->flock.l_len = 0;
+ inodelk->flock.l_start = 0;
+ } else {
+ inodelk->flock.l_len = local->transaction.len;
+ inodelk->flock.l_start = local->transaction.start;
+ }
inodelk->flock.l_type = F_WRLCK;
return 0;
@@ -1386,7 +1394,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- afr_set_transaction_flock (local);
+ afr_set_transaction_flock (this, local);
int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk;
diff --git a/xlators/features/arbiter/src/arbiter.c b/xlators/features/arbiter/src/arbiter.c
index b404597be9d..786f60b7bc9 100644
--- a/xlators/features/arbiter/src/arbiter.c
+++ b/xlators/features/arbiter/src/arbiter.c
@@ -128,8 +128,7 @@ arbiter_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
}
buf = ctx->iattbuf;
unwind:
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, buf, buf,
- xdata);
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, buf, buf, NULL);
return 0;
}
@@ -152,10 +151,48 @@ arbiter_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
buf = ctx->iattbuf;
unwind:
STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, buf, buf,
- xdata);
+ NULL);
return 0;
}
+dict_t*
+arbiter_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this)
+{
+ dict_t *rsp_xdata = NULL;
+ int32_t ret = 0;
+ int is_append = 1;
+
+ if (!fd || !fd->inode || gf_uuid_is_null (fd->inode->gfid)) {
+ goto out;
+ }
+
+ if (!xdata)
+ goto out;
+
+ rsp_xdata = dict_new();
+ if (!rsp_xdata)
+ goto out;
+
+ if (dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) {
+ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT,
+ fd->inode->fd_count);
+ if (ret < 0) {
+ gf_msg_debug (this->name, 0, "Failed to set dict value"
+ " for GLUSTERFS_OPEN_FD_COUNT");
+ }
+ }
+ if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) {
+ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND,
+ is_append);
+ if (ret < 0) {
+ gf_msg_debug (this->name, 0, "Failed to set dict value"
+ " for GLUSTERFS_WRITE_IS_APPEND");
+ }
+ }
+out:
+ return rsp_xdata;
+}
+
int32_t
arbiter_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t off, uint32_t flags,
@@ -163,6 +200,7 @@ arbiter_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
{
arbiter_inode_ctx_t *ctx = NULL;
struct iatt *buf = NULL;
+ dict_t *rsp_xdata = NULL;
int op_ret = 0;
int op_errno = 0;
@@ -174,8 +212,12 @@ arbiter_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
buf = ctx->iattbuf;
op_ret = iov_length (vector, count);
+ rsp_xdata = arbiter_fill_writev_xdata (fd, xdata, this);
unwind:
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, buf, buf, xdata);
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, buf, buf,
+ rsp_xdata);
+ if (rsp_xdata)
+ dict_unref (rsp_xdata);
return 0;
}
@@ -196,8 +238,7 @@ arbiter_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
buf = ctx->iattbuf;
unwind:
- STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, buf, buf,
- xdata);
+ STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, buf, buf, NULL);
return 0;
}
@@ -218,7 +259,7 @@ arbiter_discard (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
buf = ctx->iattbuf;
unwind:
- STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, buf, buf, xdata);
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, buf, buf, NULL);
return 0;
}
@@ -239,7 +280,7 @@ arbiter_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
buf = ctx->iattbuf;
unwind:
- STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, buf, buf, xdata);
+ STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, buf, buf, NULL);
return 0;
}
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index f72c13d9066..8cce4be1c34 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -3055,28 +3055,35 @@ _fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append)
goto out;
}
- if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT))
+ if (!xdata)
goto out;
rsp_xdata = dict_new();
if (!rsp_xdata)
goto out;
- ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT,
- fd->inode->fd_count);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value for %s",
- uuid_utoa (fd->inode->gfid), GLUSTERFS_OPEN_FD_COUNT);
+ if (dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) {
+ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT,
+ fd->inode->fd_count);
+ if (ret < 0) {
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ P_MSG_DICT_SET_FAILED, "%s: Failed to set "
+ "dictionary value for %s",
+ uuid_utoa (fd->inode->gfid),
+ GLUSTERFS_OPEN_FD_COUNT);
+ }
}
- ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND,
- is_append);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value for %s",
- uuid_utoa (fd->inode->gfid),
- GLUSTERFS_WRITE_IS_APPEND);
+ if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) {
+ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND,
+ is_append);
+ if (ret < 0) {
+ gf_msg (this->name, GF_LOG_WARNING, 0,
+ P_MSG_DICT_SET_FAILED, "%s: Failed to set "
+ "dictionary value for %s",
+ uuid_utoa (fd->inode->gfid),
+ GLUSTERFS_WRITE_IS_APPEND);
+ }
}
out:
return rsp_xdata;