diff options
| author | Ravishankar N <ravishankar@redhat.com> | 2016-04-05 15:16:52 +0530 | 
|---|---|---|
| committer | Kaushal M <kaushal@redhat.com> | 2016-04-11 05:04:04 -0700 | 
| commit | c9c2c08d34003f49bc3a509757a135665fb20518 (patch) | |
| tree | c66231eac65674b43b7866bde6e40351e334232c | |
| parent | 0a01154c68cb5eb884096fc67288a71c391d9160 (diff) | |
arbiter: write performance improvement
Backport of: http://review.gluster.org/#/c/13906
Problem: The throughput for a 'dd' workload was much lower for an arbiter
configuration than for a normal replica-3 volume. There were 2
issues:
i) arbiter_writev was using the request dict as response dict while
unwinding, leading to incorrect GLUSTERFS_WRITE_IS_APPEND and
GLUSTERFS_OPEN_FD_COUNT values (=4), leading to immediate post-ops
because is_afr_delayed_changelog_post_op_needed() failed due to
afr_are_multiple_fds_opened() check.
ii) The arbiter code in afr was setting local->transaction.{start and len} = 0
to take full file locks. What this meant was even for simultaneous but
non-overlapping writevs, afr_transaction_eager_lock_init() was not
happening because afr_locals_overlap() always stays true. Consequently
is_afr_delayed_changelog_post_op_needed() failed due to
local->delayed_post_op not being set.
Fix:
i) Send appropriate response dict values in arbiter_writev.
ii) Modify flock params instead of local->transaction.{start and len} to
take full file locks in the transaction.
Also changed _fill_writev_xdata() in posix to fill rsp_xdata for
whichever keys are requested.
Change-Id: I1c5fc5e98aba49ade540bb441a022e65b753432a
BUG: 1324809
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reported-by: Robert Rauch <robert.rauch@gns-systems.de>
Reported-by: Russel Purinton <russell.purinton@gmail.com>
Reviewed-on: http://review.gluster.org/13925
Smoke: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 8 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 16 | ||||
| -rw-r--r-- | xlators/features/arbiter/src/arbiter.c | 57 | ||||
| -rw-r--r-- | xlators/storage/posix/src/posix.c | 35 | 
4 files changed, 82 insertions, 34 deletions
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 4206ef2f111..084a78ecf47 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -417,7 +417,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)  {          call_frame_t    *transaction_frame = NULL;          afr_local_t     *local             = NULL; -        afr_private_t   *priv              = NULL;          int             ret   = -1;          int             op_errno = ENOMEM; @@ -426,7 +425,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)                  goto out;          local = frame->local; -        priv = this->private;          transaction_frame->local = local;  	frame->local = NULL; @@ -456,12 +454,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)                  local->transaction.start   = local->cont.writev.offset;                  local->transaction.len     = iov_length (local->cont.writev.vector,                                                           local->cont.writev.count); - -                /*Lock entire file to avoid network split brains.*/ -                if (priv->arbiter_count == 1) { -                        local->transaction.start   = 0; -                        local->transaction.len     = 0; -                }          }          ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 4c85a4b0d03..59d03e3ed9e 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -1356,16 +1356,24 @@ afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this)  int -afr_set_transaction_flock (afr_local_t *local) +afr_set_transaction_flock (xlator_t *this, afr_local_t *local)  {          afr_internal_lock_t *int_lock = NULL;          afr_inodelk_t       *inodelk  = NULL; +        afr_private_t       
*priv     = NULL;          int_lock = &local->internal_lock;          inodelk = afr_get_inodelk (int_lock, int_lock->domain); +        priv = this->private; -        inodelk->flock.l_len   = local->transaction.len; -        inodelk->flock.l_start = local->transaction.start; +        if (priv->arbiter_count) { +                /*Lock entire file to avoid network split brains.*/ +                inodelk->flock.l_len   = 0; +                inodelk->flock.l_start = 0; +        } else { +                inodelk->flock.l_len   = local->transaction.len; +                inodelk->flock.l_start = local->transaction.start; +        }          inodelk->flock.l_type  = F_WRLCK;          return 0; @@ -1386,7 +1394,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)          switch (local->transaction.type) {          case AFR_DATA_TRANSACTION:          case AFR_METADATA_TRANSACTION: -                afr_set_transaction_flock (local); +                afr_set_transaction_flock (this, local);                  int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk; diff --git a/xlators/features/arbiter/src/arbiter.c b/xlators/features/arbiter/src/arbiter.c index 87145da5680..92dcbeaf9eb 100644 --- a/xlators/features/arbiter/src/arbiter.c +++ b/xlators/features/arbiter/src/arbiter.c @@ -133,8 +133,7 @@ arbiter_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,          }          buf = ctx->iattbuf;  unwind: -        STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, buf, buf, -                             xdata); +        STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, buf, buf, NULL);          return 0;  } @@ -157,10 +156,48 @@ arbiter_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,          buf = ctx->iattbuf;  unwind:          STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, buf, buf, -                             xdata); +                             NULL);          return 0;  } +dict_t* 
+arbiter_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this) +{ +        dict_t  *rsp_xdata = NULL; +        int32_t ret = 0; +        int is_append = 1; + +        if (!fd || !fd->inode || gf_uuid_is_null (fd->inode->gfid)) { +                goto out; +        } + +        if (!xdata) +                goto out; + +        rsp_xdata = dict_new(); +        if (!rsp_xdata) +                goto out; + +        if (dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) { +                ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, +                                       fd->inode->fd_count); +                if (ret < 0) { +                        gf_msg_debug (this->name, 0, "Failed to set dict value" +                                      " for GLUSTERFS_OPEN_FD_COUNT"); +                } +        } +        if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { +                ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, +                                       is_append); +                if (ret < 0) { +                        gf_msg_debug (this->name, 0, "Failed to set dict value" +                                      " for GLUSTERFS_WRITE_IS_APPEND"); +                } +        } +out: +        return rsp_xdata; +} +  int32_t  arbiter_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,                  struct iovec *vector, int32_t count, off_t off, uint32_t flags, @@ -168,6 +205,7 @@ arbiter_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,  {          arbiter_inode_ctx_t *ctx      = NULL;          struct iatt         *buf      = NULL; +        dict_t           *rsp_xdata   = NULL;          int                  op_ret   = 0;          int                  op_errno = 0; @@ -179,8 +217,12 @@ arbiter_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,          }          buf = ctx->iattbuf;          op_ret = iov_length (vector, count); +        rsp_xdata = arbiter_fill_writev_xdata (fd, xdata, this);  unwind: -        
STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, buf, buf, xdata); +        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, buf, buf, +                             rsp_xdata); +        if (rsp_xdata) +                dict_unref (rsp_xdata);          return 0;  } @@ -201,8 +243,7 @@ arbiter_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,          }          buf = ctx->iattbuf;  unwind: -        STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, buf, buf, -                            xdata); +        STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, buf, buf, NULL);          return 0;  } @@ -223,7 +264,7 @@ arbiter_discard (call_frame_t *frame, xlator_t *this, fd_t *fd,          }          buf = ctx->iattbuf;  unwind: -        STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, buf, buf, xdata); +        STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, buf, buf, NULL);          return 0;  } @@ -244,7 +285,7 @@ arbiter_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd,          }          buf = ctx->iattbuf;  unwind: -        STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, buf, buf, xdata); +        STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, buf, buf, NULL);          return 0;  } diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index c43029edd72..5249cc6ba8e 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -2997,28 +2997,35 @@ _fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append)                  goto out;          } -        if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) +        if (!xdata)                  goto out;          rsp_xdata = dict_new();          if (!rsp_xdata)                  goto out; -        ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, -                               fd->inode->fd_count); -        if (ret < 0) { -                gf_msg (this->name, GF_LOG_WARNING, 
0, P_MSG_DICT_SET_FAILED, -                        "%s: Failed to set dictionary value for %s", -                        uuid_utoa (fd->inode->gfid), GLUSTERFS_OPEN_FD_COUNT); +        if (dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT)) { +                ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT, +                                       fd->inode->fd_count); +                if (ret < 0) { +                        gf_msg (this->name, GF_LOG_WARNING, 0, +                                P_MSG_DICT_SET_FAILED, "%s: Failed to set " +                                "dictionary value for %s", +                                uuid_utoa (fd->inode->gfid), +                                GLUSTERFS_OPEN_FD_COUNT); +                }          } -        ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, -                               is_append); -        if (ret < 0) { -                gf_msg (this->name, GF_LOG_WARNING, 0, P_MSG_DICT_SET_FAILED, -                        "%s: Failed to set dictionary value for %s", -                        uuid_utoa (fd->inode->gfid), -                        GLUSTERFS_WRITE_IS_APPEND); +        if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { +                ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, +                                       is_append); +                if (ret < 0) { +                        gf_msg (this->name, GF_LOG_WARNING, 0, +                                P_MSG_DICT_SET_FAILED, "%s: Failed to set " +                                "dictionary value for %s", +                                uuid_utoa (fd->inode->gfid), +                                GLUSTERFS_WRITE_IS_APPEND); +                }          }  out:          return rsp_xdata;  | 
