author | Pranith Kumar K <pkarampu@redhat.com> | 2018-01-31 16:41:14 +0530 |
---|---|---|
committer | Pranith Kumar Karampuri <pkarampu@redhat.com> | 2018-03-14 13:32:35 +0000 |
commit | 346714305f9de30d5f78494091770c1555c601bb (patch) | |
tree | ebdb744fd1858a98495e77069cb4e670b2ef87c6 /xlators/cluster/afr/src/afr-common.c | |
parent | f32f85c4e6c8128643e1f88fe981a63680e79fe0 (diff) | |
cluster/afr: Make AFR eager-locking similar to EC
Problem:
1) AFR's eager-lock only works for data transactions.
2) When there are conflicting writes, a write with a conflicting region
initiates an unlock of the eager-lock, leading to extra pre-ops and post-ops
on the file. When the eager-lock goes off, it leads to extra fsyncs for
random-write workloads in AFR.
Solution (modeled after EC):
In EC, when there is a conflicting write, it waits for the current write to
complete before it winds the conflicting write. This leads to better
utilization of the network and disk, because we will not be issuing the extra
xattrops, FSYNCs and inodelk/unlock calls (see the first sketch after the
commit message). The fd-based counters are moved to inode-based counters.
I tried to model the solution on EC's locking, but it could not be kept
identical because AFR has to maintain backward compatibility.
Lifecycle of lock:
==================
The first transaction is added to the inode->owners list and an inodelk is
sent on the wire. All subsequent transactions are put on the inode->waiters
list until the first transaction completes both the inodelk and the
[f]xattrop. Once the [f]xattrop completes, every request on the
inode->waiters list is checked for conflicts against the locks already on the
inode->owners list; those that do not conflict are added to inode->owners and
resume their transactions. When these transactions complete their fop phase,
they are moved to the inode->post_op list and the transactions that were
paused because of conflicts are resumed. Post-op and unlock are not issued on
the wire until a transaction is the last one on that inode. When the last
transaction has to perform the post-op, it can choose to sleep for the
delayed post-op interval (post-op-delay-secs). If another transaction arrives
during that time, it wakes up the sleeping transaction, takes over ownership
of the lock, and the cycle continues. If the delay expires, the timer thread
wakes up the sleeping transaction, which sets lock->release to true and then
performs the post-op followed by the unlock. Any transactions that arrive
during this window are put on the inode->frozen list. Once the previous
unlock completes, the frozen list is moved to the waiters list, the first
element of the waiters list is moved to the owners list, the lock is
attempted again, and the cycle continues. This is the general idea; a
standalone sketch of these list transitions follows the commit message. There
is additional logic (at delay time, when a new transaction arrives, and in
the flush fop) for waking up existing sleeping transactions and for deciding
whether to delay a transaction, which is subject to change with future
enhancements.
Fixes: #418
BUG: 1549606
Change-Id: I88b570bbcf332a27c82d2767dfa82472f60055dc
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
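
The behavioural change in problem (2) comes down to a single decision: when an incoming write overlaps the byte range of an in-flight write, the new code waits for the in-flight write instead of releasing the eager-lock. Below is a minimal, standalone sketch of that decision only; `write_region_t`, `regions_overlap` and `classify_incoming_write` are invented names for illustration and are not AFR functions.

```c
/* Illustrative sketch only -- these names are not part of AFR. */
#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>   /* off_t, size_t */

typedef struct {
        off_t  off;
        size_t len;
} write_region_t;

/* Two writes conflict when their byte ranges overlap. */
static bool
regions_overlap (write_region_t a, write_region_t b)
{
        return a.off < b.off + (off_t)b.len &&
               b.off < a.off + (off_t)a.len;
}

/* Old behaviour: a conflicting write released the eager-lock, costing an
 * extra pre-op/post-op (xattrop), fsync and inodelk/unlock cycle.
 * New behaviour (borrowed from EC): the conflicting write waits for the
 * in-flight write to complete and the eager-lock is kept. */
typedef enum { WIND_NOW, WAIT_FOR_CURRENT_WRITE } write_action_t;

static write_action_t
classify_incoming_write (write_region_t in_flight, write_region_t incoming)
{
        if (regions_overlap (in_flight, incoming))
                return WAIT_FOR_CURRENT_WRITE;   /* was: drop the eager-lock */
        return WIND_NOW;                         /* non-conflicting: share the lock */
}

int
main (void)
{
        write_region_t in_flight = { 0,    4096 };
        write_region_t overlap   = { 2048, 4096 };
        write_region_t disjoint  = { 8192, 4096 };

        printf ("overlapping write -> %s\n",
                classify_incoming_write (in_flight, overlap) ==
                WAIT_FOR_CURRENT_WRITE ? "wait, keep eager-lock" : "wind now");
        printf ("disjoint write    -> %s\n",
                classify_incoming_write (in_flight, disjoint) ==
                WIND_NOW ? "wind now, share the lock" : "wait");
        return 0;
}
```

Keeping the lock means the pre-op/post-op xattrops, the fsyncs and the inodelk/unlock round trips are paid once per lock tenure rather than once per conflicting write.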
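The lifecycle above is easiest to read as a small state machine over the four lists. The following standalone model illustrates those transitions under simplified assumptions (conflicts are keyed on a single region id; timers and the wire are omitted); every name in it (`model_txn_t`, `model_lock_t`, `txn_arrive`, and so on) is invented for this sketch, while the real lists live in `afr_lock_t` inside the AFR inode context.

```c
/* Illustrative model only -- not GlusterFS code. */
#include <stdbool.h>
#include <stdio.h>

enum txn_list { TXN_NONE, TXN_OWNERS, TXN_WAITING, TXN_FROZEN, TXN_POST_OP };

typedef struct {
        int           id;
        int           region;   /* two txns conflict iff they touch the same region */
        enum txn_list list;     /* which of the four lists the txn is on */
} model_txn_t;

typedef struct {
        model_txn_t *txn;
        int          count;
        bool         lock_held; /* first owner has wound inodelk + [f]xattrop */
        bool         release;   /* delayed post-op expired, unlock is on the wire */
} model_lock_t;

static bool
conflicts_with_owner (model_lock_t *lk, model_txn_t *t)
{
        for (int i = 0; i < lk->count; i++)
                if (lk->txn[i].list == TXN_OWNERS && lk->txn[i].region == t->region)
                        return true;
        return false;
}

/* A new transaction arrives on the inode. */
static void
txn_arrive (model_lock_t *lk, model_txn_t *t)
{
        if (lk->release) {
                t->list = TXN_FROZEN;            /* unlock already decided: freeze */
        } else if (!lk->lock_held) {
                lk->lock_held = true;            /* first txn winds the inodelk */
                t->list = TXN_OWNERS;
        } else if (conflicts_with_owner (lk, t)) {
                t->list = TXN_WAITING;           /* conflicting region: wait */
        } else {
                t->list = TXN_OWNERS;            /* share the already-held lock */
        }
}

/* A transaction finished its fop phase: park it for a delayed post-op and
 * resume waiters that no longer conflict with any owner. */
static void
txn_fop_done (model_lock_t *lk, model_txn_t *t)
{
        t->list = TXN_POST_OP;

        for (int i = 0; i < lk->count; i++)
                if (lk->txn[i].list == TXN_WAITING &&
                    !conflicts_with_owner (lk, &lk->txn[i]))
                        lk->txn[i].list = TXN_OWNERS;
}

/* The delayed post-op expired and the unlock completed: thaw frozen txns. */
static void
unlock_done (model_lock_t *lk)
{
        lk->lock_held = false;
        lk->release   = false;
        for (int i = 0; i < lk->count; i++)
                if (lk->txn[i].list == TXN_FROZEN)
                        lk->txn[i].list = TXN_WAITING;
        /* the first waiter would now re-acquire the lock and the cycle continues */
}

int
main (void)
{
        model_txn_t  t[3] = { {1, 10, TXN_NONE}, {2, 10, TXN_NONE}, {3, 20, TXN_NONE} };
        model_lock_t lk   = { t, 3, false, false };

        txn_arrive (&lk, &t[0]);   /* becomes owner, winds the inodelk       */
        txn_arrive (&lk, &t[1]);   /* same region as txn 1: goes to WAITING  */
        txn_arrive (&lk, &t[2]);   /* different region: becomes an owner too */
        txn_fop_done (&lk, &t[0]); /* txn 1 parks in POST_OP, txn 2 resumes  */

        for (int i = 0; i < 3; i++)
                printf ("txn %d is on list %d\n", t[i].id, (int)t[i].list);

        txn_fop_done (&lk, &t[1]);
        txn_fop_done (&lk, &t[2]); /* last txn on the inode: once the delayed */
        unlock_done (&lk);         /* post-op expires, post-op + unlock go out */
        return 0;
}
```

Running it shows the second write to the same region parked on the waiting list until the first one finishes its fop phase, while the write to a different region shares the lock immediately.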
Diffstat (limited to 'xlators/cluster/afr/src/afr-common.c')
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 315 |
1 file changed, 146 insertions, 169 deletions
```diff
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index c9953139b7e..bfd8c2e8c2c 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -121,37 +121,77 @@ afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local,
         return _gf_false;
 }
 
+static void
+afr_inode_ctx_destroy (afr_inode_ctx_t *ctx)
+{
+        int i = 0;
+
+        if (!ctx)
+                return;
+
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+                GF_FREE (ctx->pre_op_done[i]);
+        }
+
+        GF_FREE (ctx);
+}
+
 int
 __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
 {
-        uint64_t ctx_int = 0;
-        int ret = -1;
-        afr_inode_ctx_t *tmp_ctx = NULL;
+        uint64_t        ctx_int   = 0;
+        int             ret       = -1;
+        int             i         = -1;
+        int             num_locks = -1;
+        afr_inode_ctx_t *ictx     = NULL;
+        afr_lock_t      *lock     = NULL;
+        afr_private_t   *priv     = this->private;
 
         ret = __inode_ctx_get (inode, this, &ctx_int);
-        if (ret) {
-                tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t),
-                                     gf_afr_mt_inode_ctx_t);
-                if (!tmp_ctx)
-                        goto out;
+        if (ret == 0) {
+                *ctx = (afr_inode_ctx_t *)ctx_int;
+                return 0;
+        }
 
-                ctx_int = (long) tmp_ctx;
-                ret = __inode_ctx_set (inode, this, &ctx_int);
-                if (ret) {
-                        GF_FREE (tmp_ctx);
+        ictx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), gf_afr_mt_inode_ctx_t);
+        if (!ictx)
+                goto out;
+
+        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+                ictx->pre_op_done[i] = GF_CALLOC (sizeof *ictx->pre_op_done[i],
+                                                  priv->child_count,
+                                                  gf_afr_mt_int32_t);
+                if (!ictx->pre_op_done[i]) {
+                        ret = -ENOMEM;
                         goto out;
                 }
-                tmp_ctx->spb_choice = -1;
-                tmp_ctx->read_subvol = 0;
-                tmp_ctx->write_subvol = 0;
-                tmp_ctx->lock_count = 0;
-        } else {
-                tmp_ctx = (afr_inode_ctx_t *) ctx_int;
         }
 
-        *ctx = tmp_ctx;
+        num_locks = sizeof(ictx->lock)/sizeof(afr_lock_t);
+        for (i = 0; i < num_locks; i++) {
+                lock = &ictx->lock[i];
+                INIT_LIST_HEAD (&lock->post_op);
+                INIT_LIST_HEAD (&lock->frozen);
+                INIT_LIST_HEAD (&lock->waiting);
+                INIT_LIST_HEAD (&lock->owners);
+        }
+
+        ctx_int = (uint64_t)ictx;
+        ret = __inode_ctx_set (inode, this, &ctx_int);
+        if (ret) {
+                goto out;
+        }
+
+        ictx->spb_choice = -1;
+        ictx->read_subvol = 0;
+        ictx->write_subvol = 0;
+        ictx->lock_count = 0;
         ret = 0;
+        *ctx = ictx;
 out:
+        if (ret) {
+                afr_inode_ctx_destroy (ictx);
+        }
         return ret;
 }
 
@@ -1752,10 +1792,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
 
         GF_FREE (local->internal_lock.locked_nodes);
 
-        for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
-                GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
-        }
-
         GF_FREE (local->internal_lock.lower_locked_nodes);
 
         afr_entry_lockee_cleanup (&local->internal_lock);
@@ -1772,7 +1808,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
                 GF_FREE (local->transaction.changelog_xdata);
         }
 
-        GF_FREE (local->transaction.eager_lock);
         GF_FREE (local->transaction.failed_subvols);
 
         GF_FREE (local->transaction.basename);
@@ -1819,16 +1854,6 @@ afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv)
         memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);
 }
 
-void
-afr_remove_eager_lock_stub (afr_local_t *local)
-{
-        LOCK (&local->fd->lock);
-        {
-                list_del_init (&local->transaction.eager_locked);
-        }
-        UNLOCK (&local->fd->lock);
-}
-
 static gf_boolean_t
 afr_fop_lock_is_unlock (call_frame_t *frame)
 {
@@ -1933,10 +1958,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
 
         syncbarrier_destroy (&local->barrier);
 
-        if (local->transaction.eager_lock_on &&
-            !list_empty (&local->transaction.eager_locked))
-                afr_remove_eager_lock_stub (local);
-
         afr_local_transaction_cleanup (local, this);
 
         priv = this->private;
@@ -3228,22 +3249,8 @@ out:
 void
 _afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx)
 {
-        int i = 0;
-
-
-        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++)
-                GF_FREE (fd_ctx->pre_op_done[i]);
-
         GF_FREE (fd_ctx->opened_on);
-
-        GF_FREE (fd_ctx->lock_piggyback);
-
-        GF_FREE (fd_ctx->lock_acquired);
-
-        pthread_mutex_destroy (&fd_ctx->delay_lock);
-
         GF_FREE (fd_ctx);
-
         return;
 }
 
@@ -3261,15 +3268,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
         fd_ctx = (afr_fd_ctx_t *)(long) ctx;
 
         if (fd_ctx) {
-                /*no need to take any locks*/
-                if (!list_empty (&fd_ctx->eager_locked))
-                        gf_msg (this->name, GF_LOG_WARNING, 0,
-                                AFR_MSG_INVALID_DATA, "%s: Stale "
-                                "Eager-lock stubs found",
-                                uuid_utoa (fd->inode->gfid));
-
                 _afr_cleanup_fd_ctx (fd_ctx);
-
         }
 
 out:
@@ -3350,23 +3349,6 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
                 goto out;
         }
 
-        ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL);
-        if (ret) {
-                GF_FREE (fd_ctx);
-                fd_ctx = NULL;
-                goto out;
-        }
-
-        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
-                fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]),
-                                                    priv->child_count,
-                                                    gf_afr_mt_int32_t);
-                if (!fd_ctx->pre_op_done[i]) {
-                        ret = -ENOMEM;
-                        goto out;
-                }
-        }
-
         fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
                                        priv->child_count,
                                        gf_afr_mt_int32_t);
@@ -3382,26 +3364,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
                 fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
         }
 
-        fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),
-                                            priv->child_count,
-                                            gf_afr_mt_char);
-        if (!fd_ctx->lock_piggyback) {
-                ret = -ENOMEM;
-                goto out;
-        }
-
-        fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired),
-                                           priv->child_count,
-                                           gf_afr_mt_char);
-        if (!fd_ctx->lock_acquired) {
-                ret = -ENOMEM;
-                goto out;
-        }
-
         fd_ctx->readdir_subvol = -1;
 
-        INIT_LIST_HEAD (&fd_ctx->eager_locked);
-
         ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
         if (ret)
                 gf_msg_debug (this->name, 0,
@@ -3473,12 +3437,70 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
         return 0;
 }
 
+afr_local_t*
+afr_wakeup_same_fd_delayed_op (xlator_t *this, afr_lock_t *lock, fd_t *fd)
+{
+        afr_local_t *local = NULL;
+
+        if (lock->delay_timer) {
+                local = list_entry(lock->post_op.next, afr_local_t,
+                                   transaction.owner_list);
+                if (fd == local->fd) {
+                        if (gf_timer_call_cancel (this->ctx,
+                                                  lock->delay_timer)) {
+                                local = NULL;
+                        } else {
+                                lock->delay_timer = NULL;
+                        }
+                } else {
+                        local = NULL;
+                }
+        }
+
+        return local;
+}
+
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, inode_t *inode,
+                                   call_stub_t *stub)
+{
+        afr_inode_ctx_t *ctx = NULL;
+        afr_lock_t      *lock = NULL;
+        afr_local_t     *metadata_local = NULL;
+        afr_local_t     *data_local = NULL;
+        LOCK (&inode->lock);
+        {
+                (void)__afr_inode_ctx_get (this, inode, &ctx);
+                lock = &ctx->lock[AFR_DATA_TRANSACTION];
+                data_local = afr_wakeup_same_fd_delayed_op (this, lock,
+                                                            stub->args.fd);
+                lock = &ctx->lock[AFR_METADATA_TRANSACTION];
+                metadata_local = afr_wakeup_same_fd_delayed_op (this, lock,
+                                                                stub->args.fd);
+        }
+        UNLOCK (&inode->lock);
+
+        if (data_local) {
+                data_local->transaction.resume_stub = stub;
+        } else if (metadata_local) {
+                metadata_local->transaction.resume_stub = stub;
+        } else {
+                call_resume (stub);
+        }
+        if (data_local) {
+                afr_delayed_changelog_wake_up_cbk (data_local);
+        }
+        if (metadata_local) {
+                afr_delayed_changelog_wake_up_cbk (metadata_local);
+        }
+}
+
 int
 afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
 {
-        afr_local_t *local = NULL;
-        call_stub_t *stub = NULL;
-        int op_errno = ENOMEM;
+        afr_local_t   *local    = NULL;
+        call_stub_t   *stub     = NULL;
+        int            op_errno = ENOMEM;
 
         local = AFR_FRAME_INIT (frame, op_errno);
         if (!local)
@@ -3494,7 +3516,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
         if (!stub)
                 goto out;
 
-        afr_delayed_changelog_wake_resume (this, fd, stub);
+        afr_delayed_changelog_wake_resume (this, fd->inode, stub);
 
         return 0;
 out:
@@ -3502,7 +3524,6 @@ out:
         return 0;
 }
 
-
 int
 afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
                   int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -4565,7 +4586,7 @@ afr_forget (xlator_t *this, inode_t *inode)
                 return 0;
 
         ctx = (afr_inode_ctx_t *)ctx_int;
-        GF_FREE (ctx);
+        afr_inode_ctx_destroy (ctx);
         return 0;
 }
 
@@ -5382,21 +5403,6 @@ out:
 }
 
 int
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
-{
-        int ret = -ENOMEM;
-
-        lk->domain = dom;
-        lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
-                                      child_count, gf_afr_mt_char);
-        if (NULL == lk->locked_nodes)
-                goto out;
-        ret = 0;
-out:
-        return ret;
-}
-
-int
 afr_transaction_local_init (afr_local_t *local, xlator_t *this)
 {
         int ret = -ENOMEM;
@@ -5407,25 +5413,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
         if (ret < 0)
                 goto out;
 
-        if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
-            (local->transaction.type == AFR_METADATA_TRANSACTION)) {
-                ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
-                                        this->name, priv->child_count);
-                if (ret < 0)
-                        goto out;
-        }
-
         ret = -ENOMEM;
         local->pre_op_compat = priv->pre_op_compat;
 
-        local->transaction.eager_lock =
-                GF_CALLOC (sizeof (*local->transaction.eager_lock),
-                           priv->child_count,
-                           gf_afr_mt_int32_t);
-
-        if (!local->transaction.eager_lock)
-                goto out;
-
         local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
                                                priv->child_count,
                                                gf_afr_mt_char);
@@ -5457,9 +5447,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
         if (!local->pending)
                 goto out;
 
-        INIT_LIST_HEAD (&local->transaction.eager_locked);
-
         ret = 0;
+        INIT_LIST_HEAD (&local->transaction.wait_list);
+        INIT_LIST_HEAD (&local->transaction.owner_list);
 out:
         return ret;
 }
@@ -5494,24 +5484,6 @@ out:
         return;
 }
 
-void
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
-{
-        afr_local_t  *local = NULL;
-        afr_fd_ctx_t *fd_ctx = NULL;
-
-        local = frame->local;
-
-        if (!local->fd)
-                return;
-
-        fd_ctx = afr_fd_ctx_get (local->fd, this);
-        if (!fd_ctx)
-                return;
-
-        fd_ctx->open_fd_count = local->open_fd_count;
-}
-
 int**
 afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,
                             dict_t *xattr, ia_type_t iat)
@@ -5620,7 +5592,7 @@ out:
 
 int
 afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
-                                  inode_t *inode, gf_boolean_t *dsh,
+                                  fd_t *fd, gf_boolean_t *dsh,
                                   gf_boolean_t *pflag)
 {
         int ret = -1;
@@ -5630,8 +5602,8 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
         unsigned char *healed_sinks = NULL;
         unsigned char *undid_pending = NULL;
         afr_private_t *priv = NULL;
-        fd_t *fd = NULL;
         struct afr_reply *locked_replies = NULL;
+        inode_t *inode = fd->inode;
 
         priv = this->private;
         data_lock = alloca0 (priv->child_count);
@@ -5640,18 +5612,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
         healed_sinks = alloca0 (priv->child_count);
         undid_pending = alloca0 (priv->child_count);
 
-        /* Heal-info does an open() on the file being examined so that the
-         * current eager-lock holding client, if present, at some point sees
-         * open-fd count being > 1 and releases the eager-lock so that heal-info
-         * doesn't remain blocked forever until IO completes.
-         */
-        ret = afr_selfheal_data_open (this, inode, &fd);
-        if (ret < 0) {
-                gf_msg_debug (this->name, -ret, "%s: Failed to open",
-                              uuid_utoa (inode->gfid));
-                goto out;
-        }
-
         locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
 
         ret = afr_selfheal_inodelk (frame, this, inode, this->name,
@@ -5674,8 +5634,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
 out:
         if (locked_replies)
                 afr_replies_wipe (locked_replies, priv->child_count);
-        if (fd)
-                fd_unref (fd);
         return ret;
 }
 
@@ -5760,6 +5718,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
 {
         int ret = -1;
+        fd_t *fd = NULL;
         gf_boolean_t dsh = _gf_false;
         gf_boolean_t msh = _gf_false;
         gf_boolean_t esh = _gf_false;
@@ -5771,6 +5730,21 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
 
         /* For every heal type hold locks and check if it indeed needs heal */
 
+
+        /* Heal-info does an open() on the file being examined so that the
+         * current eager-lock holding client, if present, at some point sees
+         * open-fd count being > 1 and releases the eager-lock so that heal-info
+         * doesn't remain blocked forever until IO completes.
+         */
+        if ((*inode)->ia_type == IA_IFREG) {
+                ret = afr_selfheal_data_open (this, *inode, &fd);
+                if (ret < 0) {
+                        gf_msg_debug (this->name, -ret, "%s: Failed to open",
+                                      uuid_utoa ((*inode)->gfid));
+                        goto out;
+                }
+        }
+
         if (msh) {
                 ret = afr_selfheal_locked_metadata_inspect (frame, this,
                                                             *inode, &msh,
@@ -5780,7 +5754,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
         }
 
         if (dsh) {
-                ret = afr_selfheal_locked_data_inspect (frame, this, *inode,
+                ret = afr_selfheal_locked_data_inspect (frame, this, fd,
                                                         &dsh, pending);
                 if (ret == -EIO || (ret == -EAGAIN))
                         goto out;
@@ -5795,6 +5769,8 @@ out:
         *data_selfheal = dsh;
         *entry_selfheal = esh;
         *metadata_selfheal = msh;
+        if (fd)
+                fd_unref (fd);
         return ret;
 }
 
@@ -6429,6 +6405,7 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)
         local = frame->local;
         LOCK(&local->inode->lock);
         {
+                GF_ASSERT (local->inode_ctx->lock_count > 0);
                 local->inode_ctx->lock_count--;
 
                 if (!local->inode_ctx->lock_count)
```