| author | Xavier Hernandez <xhernandez@datalab.es> | 2015-05-20 15:17:35 +0200 |
|---|---|---|
| committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2015-05-27 03:25:47 -0700 |
| commit | 3b666b40efbed157e8c5991f29b345d93b28c659 (patch) | |
| tree | a6fb9a20bed31bbcb0e5dd2025e51d7f4ea6e257 /xlators/cluster/ec/src/ec-inode-write.c | |
| parent | 5513144feb5b062b733d7514adf194429e31666f (diff) | |
cluster/ec: Forced unlock when lock contention is detected
EC uses an eager lock mechanism to optimize multiple read/write
requests on the same entry or inode. This increases performance
but can have adverse results when other clients try to access the
same entry/inode.
To solve this, this patch adds functionality to detect when this
happens and force an earlier release of the lock so that other
clients are not blocked.
The method consists of requesting GF_GLUSTERFS_INODELK_COUNT and
GF_GLUSTERFS_ENTRYLK_COUNT for all fops that take a lock. When this
count is greater than one, the lock is marked to be released. All
fops already waiting for this lock will be executed normally before
releasing the lock, but new requests that also require it will be
blocked and restarted after the lock has been released and
reacquired.
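
A minimal sketch of this detection idea follows. It is not code from this
patch: the example_* helpers are invented, the key string and the assumption
that the count comes back as a uint32 in the reply dict are mine, and the
installed header path may differ.

```c
#include <stdbool.h>
#include <stdint.h>
#include <glusterfs/dict.h>   /* assumed installed header path for dict_t */

/* Assumed value of the GF_GLUSTERFS_INODELK_COUNT key mentioned above; the
 * real macro name/value in glusterfs.h may differ. */
#define EXAMPLE_INODELK_COUNT_KEY "glusterfs.inodelk-count"

/* Before winding a fop that holds the eager lock: ask the locks xlator to
 * report how many inodelks exist on this inode. */
static int
example_request_lock_count(dict_t *xdata)
{
    return dict_set_uint32(xdata, EXAMPLE_INODELK_COUNT_KEY, 0);
}

/* In the fop callback: a count greater than one means another client is
 * waiting, so the caller should flag the eager lock for early release once
 * the fops already granted on it have completed. */
static bool
example_lock_is_contended(dict_t *xdata)
{
    uint32_t count = 0;

    if ((xdata != NULL) &&
        (dict_get_uint32(xdata, EXAMPLE_INODELK_COUNT_KEY, &count) == 0)) {
        return count > 1;
    }

    return false;
}
```

With something like this in place, the eager lock is only surrendered when
someone is actually waiting, so the single-client fast path keeps its
performance.
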
Another problem was that some operations correctly locked the parent
of an entry when needed, but read the size and version xattrs from
the entry itself instead of from the parent.
This patch solves this problem by binding all size and version
queries to each lock and by replacing all entrylk calls with inodelk
ones to prevent concurrent updates of directory metadata. This also
allows rename to correctly update the source and destination
directories.
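
To illustrate the entrylk-to-inodelk replacement, the sketch below locks a
parent directory with a full-range inodelk instead of an entrylk on
(parent, basename). Again this is only an illustration: ec_example_lock_parent
is an invented name, the header path is assumed, and the real patch routes
this through ec's own lock helpers.

```c
#include <fcntl.h>
#include <unistd.h>
#include <glusterfs/xlator.h>   /* assumed installed header path */

/* Take a blocking, full-range write inodelk on the parent directory.
 * Serializing on the parent inode also serializes updates of its size and
 * version metadata, which an entrylk on a single basename would not do. */
static void
ec_example_lock_parent(call_frame_t *frame, xlator_t *subvol,
                       const char *volume, loc_t *parent,
                       fop_inodelk_cbk_t cbk, dict_t *xdata)
{
    struct gf_flock flock = {
        .l_type   = F_WRLCK,
        .l_whence = SEEK_SET,
        .l_start  = 0,
        .l_len    = 0    /* 0 means "to the end": lock the whole inode */
    };

    /* The old approach would have used subvol->fops->entrylk on
     * (parent, basename) here. */
    STACK_WIND(frame, cbk, subvol, subvol->fops->inodelk,
               volume, parent, F_SETLKW, &flock, xdata);
}
```
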
Change-Id: I2df0b22bc6f407d49f3cbf0733b0720015bacfbd
BUG: 1165041
Signed-off-by: Xavier Hernandez <xhernandez@datalab.es>
Reviewed-on: http://review.gluster.org/10852
Tested-by: NetBSD Build System
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
Diffstat (limited to 'xlators/cluster/ec/src/ec-inode-write.c')
-rw-r--r-- | xlators/cluster/ec/src/ec-inode-write.c | 159 |
1 file changed, 78 insertions, 81 deletions
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
index 6b485a26fbc..368b3ae5edf 100644
--- a/xlators/cluster/ec/src/ec-inode-write.c
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -123,11 +123,13 @@ ec_manager_xattr (ec_fop_data_t *fop, int32_t state)
     switch (state) {
     case EC_STATE_INIT:
     case EC_STATE_LOCK:
-        if (fop->fd == NULL)
-            ec_lock_prepare_inode(fop, &fop->loc[0], 1);
-        else
-            ec_lock_prepare_fd(fop, fop->fd, 1);
-
+        if (fop->fd == NULL) {
+            ec_lock_prepare_inode(fop, &fop->loc[0],
+                                  EC_UPDATE_META | EC_QUERY_INFO);
+        } else {
+            ec_lock_prepare_fd(fop, fop->fd,
+                               EC_UPDATE_META | EC_QUERY_INFO);
+        }
         ec_lock(fop);

         return EC_STATE_DISPATCH;
@@ -378,21 +380,15 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)
     {
     case EC_STATE_INIT:
     case EC_STATE_LOCK:
-        if (fop->fd == NULL)
-        {
-            ec_lock_prepare_inode(fop, &fop->loc[0], 1);
-        }
-        else
-        {
-            ec_lock_prepare_fd(fop, fop->fd, 1);
+        if (fop->fd == NULL) {
+            ec_lock_prepare_inode(fop, &fop->loc[0],
+                                  EC_UPDATE_META | EC_QUERY_INFO);
+        } else {
+            ec_lock_prepare_fd(fop, fop->fd,
+                               EC_UPDATE_META | EC_QUERY_INFO);
         }

         ec_lock(fop);
-        return EC_STATE_GET_SIZE_AND_VERSION;
-
-    case EC_STATE_GET_SIZE_AND_VERSION:
-        ec_get_size_version(fop);
-
         return EC_STATE_DISPATCH;

     case EC_STATE_DISPATCH:
@@ -412,17 +408,17 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)
                     cbk->op_errno = EIO;
                 }
             }
-            if (cbk->op_ret < 0)
-            {
+            if (cbk->op_ret < 0) {
                 ec_fop_set_error(fop, cbk->op_errno);
-            }
-            else
-            {
+            } else if (cbk->iatt[0].ia_type == IA_IFREG) {
                 ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,
                                 cbk->count);

-                cbk->iatt[0].ia_size = fop->pre_size;
-                cbk->iatt[1].ia_size = fop->pre_size;
+                /* This shouldn't fail because we have the inode locked. */
+                GF_ASSERT(ec_get_inode_size(fop,
+                                            fop->locks[0].lock->loc.inode,
+                                            &cbk->iatt[0].ia_size));
+                cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
             }
         }
         else
@@ -462,7 +458,6 @@ int32_t ec_manager_setattr(ec_fop_data_t * fop, int32_t state)

     case -EC_STATE_INIT:
     case -EC_STATE_LOCK:
-    case -EC_STATE_GET_SIZE_AND_VERSION:
     case -EC_STATE_DISPATCH:
     case -EC_STATE_PREPARE_ANSWER:
     case -EC_STATE_REPORT:
@@ -992,21 +987,17 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
     /* Fall through */
     case EC_STATE_LOCK:
-        if (fop->id == GF_FOP_TRUNCATE)
-        {
-            ec_lock_prepare_inode(fop, &fop->loc[0], 1);
-        }
-        else
-        {
-            ec_lock_prepare_fd(fop, fop->fd, 1);
+        if (fop->id == GF_FOP_TRUNCATE) {
+            ec_lock_prepare_inode(fop, &fop->loc[0],
+                                  EC_UPDATE_DATA | EC_UPDATE_META |
+                                  EC_QUERY_INFO);
+        } else {
+            ec_lock_prepare_fd(fop, fop->fd,
+                               EC_UPDATE_DATA | EC_UPDATE_META |
+                               EC_QUERY_INFO);
         }

         ec_lock(fop);

-        return EC_STATE_GET_SIZE_AND_VERSION;
-
-    case EC_STATE_GET_SIZE_AND_VERSION:
-        ec_prepare_update(fop);
-
         return EC_STATE_DISPATCH;

     case EC_STATE_DISPATCH:
@@ -1035,14 +1026,18 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
                 ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,
                                 cbk->count);

-                cbk->iatt[0].ia_size = fop->pre_size;
+                /* This shouldn't fail because we have the inode locked. */
+                GF_ASSERT(ec_get_inode_size(fop,
+                                            fop->locks[0].lock->loc.inode,
+                                            &cbk->iatt[0].ia_size));
                 cbk->iatt[1].ia_size = fop->user_size;
-                fop->post_size = fop->user_size;
-                if ((fop->pre_size > fop->post_size) &&
-                    (fop->user_size != fop->offset))
-                {
-                    if (!ec_truncate_clean(fop))
-                    {
+                /* This shouldn't fail because we have the inode locked. */
+                GF_ASSERT(ec_set_inode_size(fop,
+                                            fop->locks[0].lock->loc.inode,
+                                            fop->user_size));
+                if ((cbk->iatt[0].ia_size > cbk->iatt[1].ia_size) &&
+                    (fop->user_size != fop->offset)) {
+                    if (!ec_truncate_clean(fop)) {
                         ec_fop_set_error(fop, EIO);
                     }
                 }
@@ -1085,7 +1080,6 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)

     case -EC_STATE_INIT:
     case -EC_STATE_LOCK:
-    case -EC_STATE_GET_SIZE_AND_VERSION:
     case -EC_STATE_DISPATCH:
     case -EC_STATE_PREPARE_ANSWER:
     case -EC_STATE_REPORT:
@@ -1355,9 +1349,13 @@ void ec_writev_start(ec_fop_data_t *fop)
     ec_fd_t *ctx;
     fd_t *fd;
     size_t tail;
+    uint64_t current;
     uid_t uid;
     gid_t gid;

+    /* This shouldn't fail because we have the inode locked. */
+    GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode, &current));
+
     fd = fd_anonymous(fop->fd->inode);
     if (fd == NULL) {
         ec_fop_set_error(fop, EIO);
@@ -1373,7 +1371,7 @@ void ec_writev_start(ec_fop_data_t *fop)
     ctx = ec_fd_get(fop->fd, fop->xl);
     if (ctx != NULL) {
         if ((ctx->flags & O_APPEND) != 0) {
-            fop->offset = fop->pre_size;
+            fop->offset = current;
         }
     }

@@ -1404,22 +1402,17 @@ void ec_writev_start(ec_fop_data_t *fop)
         iobref_unref(fop->buffers);
         fop->buffers = iobref;
-        if (fop->head > 0)
-        {
+        if (fop->head > 0) {
             ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
                      ec_writev_merge_head, NULL, fd, ec->stripe_size,
                      fop->offset, 0, NULL);
         }
         tail = fop->size - fop->user_size - fop->head;
-        if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size)))
-        {
-            if (fop->pre_size > fop->offset + fop->head + fop->user_size)
-            {
+        if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
+            if (current > fop->offset + fop->head + fop->user_size) {
                 ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
                          ec_writev_merge_tail, NULL, fd, ec->stripe_size,
                          fop->offset + fop->size - ec->stripe_size, 0, NULL);
-            }
-            else
-            {
+            } else {
                 memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
             }
         }
@@ -1530,14 +1523,11 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)
     {
     case EC_STATE_INIT:
     case EC_STATE_LOCK:
-        ec_lock_prepare_fd(fop, fop->fd, 1);
+        ec_lock_prepare_fd(fop, fop->fd,
+                           EC_UPDATE_DATA | EC_UPDATE_META |
+                           EC_QUERY_INFO);
         ec_lock(fop);

-        return EC_STATE_GET_SIZE_AND_VERSION;
-
-    case EC_STATE_GET_SIZE_AND_VERSION:
-        ec_prepare_update(fop);
-
         return EC_STATE_DISPATCH;

     case EC_STATE_DISPATCH:
@@ -1574,27 +1564,34 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)
                 ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,
                                 cbk->count);

+                /* This shouldn't fail because we have the inode locked. */
+                GF_ASSERT(ec_get_inode_size(fop, fop->fd->inode,
+                                            &cbk->iatt[0].ia_size));
+                cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
                 size = fop->offset + fop->head + fop->user_size;
-                if (size > fop->pre_size)
-                {
-                    fop->post_size = size;
-                }
-
-                cbk->iatt[0].ia_size = fop->pre_size;
-                cbk->iatt[1].ia_size = fop->post_size;
-
-                cbk->op_ret *= ec->fragments;
-                if (cbk->op_ret < fop->head)
-                {
-                    cbk->op_ret = 0;
-                }
-                else
-                {
-                    cbk->op_ret -= fop->head;
+                if (size > cbk->iatt[0].ia_size) {
+                    /* Only update inode size if this is a top level fop.
+                     * Otherwise this is an internal write and the top
+                     * level fop should take care of the real inode size.
+                     */
+                    if (fop->parent == NULL) {
+                        /* This shouldn't fail because we have the inode
+                         * locked. */
+                        GF_ASSERT(ec_set_inode_size(fop, fop->fd->inode,
+                                                    size));
+                    }
+                    cbk->iatt[1].ia_size = size;
                 }
-                if (cbk->op_ret > fop->user_size)
-                {
-                    cbk->op_ret = fop->user_size;
+                if (fop->error == 0) {
+                    cbk->op_ret *= ec->fragments;
+                    if (cbk->op_ret < fop->head) {
+                        cbk->op_ret = 0;
+                    } else {
+                        cbk->op_ret -= fop->head;
+                    }
+                    if (cbk->op_ret > fop->user_size) {
+                        cbk->op_ret = fop->user_size;
+                    }
                 }
             }
         }
@@ -1621,8 +1618,8 @@ int32_t ec_manager_writev(ec_fop_data_t * fop, int32_t state)

     case -EC_STATE_INIT:
     case -EC_STATE_LOCK:
-    case -EC_STATE_GET_SIZE_AND_VERSION:
     case -EC_STATE_DISPATCH:
+    case -EC_STATE_DELAYED_START:
     case -EC_STATE_PREPARE_ANSWER:
     case -EC_STATE_REPORT:
         GF_ASSERT(fop->error != 0);
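
The hunks above only use ec_get_inode_size()/ec_set_inode_size(); their
definitions are not part of this file's diff. As a rough, self-contained model
of what such helpers can do (the ec_size_ctx_t type and example_* names are
invented for illustration, not the actual implementation):

```c
#include <stdbool.h>
#include <stdint.h>

typedef struct {
    bool     have_size;   /* true once the size was read under the lock */
    uint64_t size;        /* cached file size, valid while the lock is held */
} ec_size_ctx_t;

/* Returns true only if a size was cached while holding the inode lock;
 * the GF_ASSERT() calls in the diff rely on this never failing there. */
static bool
example_get_inode_size(const ec_size_ctx_t *ctx, uint64_t *size)
{
    if (!ctx->have_size)
        return false;
    *size = ctx->size;
    return true;
}

/* Updates the cached size so later callbacks (e.g. truncate/writev) read it
 * back instead of relying on fop->pre_size/post_size as the old code did. */
static bool
example_set_inode_size(ec_size_ctx_t *ctx, uint64_t size)
{
    ctx->have_size = true;
    ctx->size = size;
    return true;
}
```
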