diff options
-rw-r--r-- | tests/bugs/replicate/bug-966018.t | 36 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 315 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 6 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-lk-common.c | 348 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 13 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 14 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 2 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 913 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 13 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 97 |
10 files changed, 813 insertions, 944 deletions
diff --git a/tests/bugs/replicate/bug-966018.t b/tests/bugs/replicate/bug-966018.t deleted file mode 100644 index 1b5296b498b..00000000000 --- a/tests/bugs/replicate/bug-966018.t +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -. $(dirname $0)/../../include.rc -. $(dirname $0)/../../volume.rc -. $(dirname $0)/../../nfs.rc - -#This tests if cluster.eager-lock blocks metadata operations on nfs/fuse mounts. -#If it is not woken up, INODELK from the next command waits -#for post-op-delay secs. - -cleanup; -TEST glusterd -TEST pidof glusterd - -TEST $CLI volume create $V0 replica 2 $H0:$B0/r2_0 $H0:$B0/r2_1 -TEST $CLI volume set $V0 ensure-durability off -TEST $CLI volume set $V0 cluster.eager-lock on -TEST $CLI volume set $V0 cluster.post-op-delay-secs 3 -TEST $CLI volume set $V0 nfs.disable false - -TEST $CLI volume start $V0 -TEST $CLI volume profile $V0 start -EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available; -TEST mount_nfs $H0:/$V0 $N0 nolock; -TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0 -echo 1 > $N0/1 && chmod +x $N0/1 -echo 1 > $M0/1 && chmod +x $M0/1 - -#Check that INODELK MAX latency is not in the order of seconds -#Test if the MAX INODELK fop latency is of the order of seconds. -inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {max = 0} {if ($6 > max) max=$6;} END {print max}' | cut -d. -f 1 | egrep "[0-9]{7,}") - -TEST [ -z $inodelk_max_latency ] -EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0 - -cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index c9953139b7e..bfd8c2e8c2c 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -121,37 +121,77 @@ afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local, return _gf_false; } +static void +afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) +{ + int i = 0; + + if (!ctx) + return; + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + GF_FREE (ctx->pre_op_done[i]); + } + + GF_FREE (ctx); +} + int __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx) { - uint64_t ctx_int = 0; - int ret = -1; - afr_inode_ctx_t *tmp_ctx = NULL; + uint64_t ctx_int = 0; + int ret = -1; + int i = -1; + int num_locks = -1; + afr_inode_ctx_t *ictx = NULL; + afr_lock_t *lock = NULL; + afr_private_t *priv = this->private; ret = __inode_ctx_get (inode, this, &ctx_int); - if (ret) { - tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), - gf_afr_mt_inode_ctx_t); - if (!tmp_ctx) - goto out; + if (ret == 0) { + *ctx = (afr_inode_ctx_t *)ctx_int; + return 0; + } - ctx_int = (long) tmp_ctx; - ret = __inode_ctx_set (inode, this, &ctx_int); - if (ret) { - GF_FREE (tmp_ctx); + ictx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), gf_afr_mt_inode_ctx_t); + if (!ictx) + goto out; + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + ictx->pre_op_done[i] = GF_CALLOC (sizeof *ictx->pre_op_done[i], + priv->child_count, + gf_afr_mt_int32_t); + if (!ictx->pre_op_done[i]) { + ret = -ENOMEM; goto out; } - tmp_ctx->spb_choice = -1; - tmp_ctx->read_subvol = 0; - tmp_ctx->write_subvol = 0; - tmp_ctx->lock_count = 0; - } else { - tmp_ctx = (afr_inode_ctx_t *) ctx_int; } - *ctx = tmp_ctx; + num_locks = sizeof(ictx->lock)/sizeof(afr_lock_t); + for (i = 0; i < num_locks; i++) { + lock = &ictx->lock[i]; + INIT_LIST_HEAD (&lock->post_op); + INIT_LIST_HEAD (&lock->frozen); + INIT_LIST_HEAD (&lock->waiting); + INIT_LIST_HEAD (&lock->owners); + } + + ctx_int = (uint64_t)ictx; + ret = __inode_ctx_set (inode, this, &ctx_int); + if (ret) { + goto out; + } + + ictx->spb_choice = -1; + ictx->read_subvol = 0; + ictx->write_subvol = 0; + ictx->lock_count = 0; ret = 0; + *ctx = ictx; out: + if (ret) { + afr_inode_ctx_destroy (ictx); + } return ret; } @@ -1752,10 +1792,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->internal_lock.locked_nodes); - for (i = 0; local->internal_lock.inodelk[i].domain; i++) { - GF_FREE (local->internal_lock.inodelk[i].locked_nodes); - } - GF_FREE (local->internal_lock.lower_locked_nodes); afr_entry_lockee_cleanup (&local->internal_lock); @@ -1772,7 +1808,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->transaction.changelog_xdata); } - GF_FREE (local->transaction.eager_lock); GF_FREE (local->transaction.failed_subvols); GF_FREE (local->transaction.basename); @@ -1819,16 +1854,6 @@ afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv) memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); } -void -afr_remove_eager_lock_stub (afr_local_t *local) -{ - LOCK (&local->fd->lock); - { - list_del_init (&local->transaction.eager_locked); - } - UNLOCK (&local->fd->lock); -} - static gf_boolean_t afr_fop_lock_is_unlock (call_frame_t *frame) { @@ -1933,10 +1958,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) syncbarrier_destroy (&local->barrier); - if (local->transaction.eager_lock_on && - !list_empty (&local->transaction.eager_locked)) - afr_remove_eager_lock_stub (local); - afr_local_transaction_cleanup (local, this); priv = this->private; @@ -3228,22 +3249,8 @@ out: void _afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx) { - int i = 0; - - - for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) - GF_FREE (fd_ctx->pre_op_done[i]); - GF_FREE (fd_ctx->opened_on); - - GF_FREE (fd_ctx->lock_piggyback); - - GF_FREE (fd_ctx->lock_acquired); - - pthread_mutex_destroy (&fd_ctx->delay_lock); - GF_FREE (fd_ctx); - return; } @@ -3261,15 +3268,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) fd_ctx = (afr_fd_ctx_t *)(long) ctx; if (fd_ctx) { - /*no need to take any locks*/ - if (!list_empty (&fd_ctx->eager_locked)) - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_INVALID_DATA, "%s: Stale " - "Eager-lock stubs found", - uuid_utoa (fd->inode->gfid)); - _afr_cleanup_fd_ctx (fd_ctx); - } out: @@ -3350,23 +3349,6 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) goto out; } - ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL); - if (ret) { - GF_FREE (fd_ctx); - fd_ctx = NULL; - goto out; - } - - for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { - fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), - priv->child_count, - gf_afr_mt_int32_t); - if (!fd_ctx->pre_op_done[i]) { - ret = -ENOMEM; - goto out; - } - } - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), priv->child_count, gf_afr_mt_int32_t); @@ -3382,26 +3364,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; } - fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->lock_piggyback) { - ret = -ENOMEM; - goto out; - } - - fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->lock_acquired) { - ret = -ENOMEM; - goto out; - } - fd_ctx->readdir_subvol = -1; - INIT_LIST_HEAD (&fd_ctx->eager_locked); - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); if (ret) gf_msg_debug (this->name, 0, @@ -3473,12 +3437,70 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) return 0; } +afr_local_t* +afr_wakeup_same_fd_delayed_op (xlator_t *this, afr_lock_t *lock, fd_t *fd) +{ + afr_local_t *local = NULL; + + if (lock->delay_timer) { + local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + if (fd == local->fd) { + if (gf_timer_call_cancel (this->ctx, + lock->delay_timer)) { + local = NULL; + } else { + lock->delay_timer = NULL; + } + } else { + local = NULL; + } + } + + return local; +} + +void +afr_delayed_changelog_wake_resume (xlator_t *this, inode_t *inode, + call_stub_t *stub) +{ + afr_inode_ctx_t *ctx = NULL; + afr_lock_t *lock = NULL; + afr_local_t *metadata_local = NULL; + afr_local_t *data_local = NULL; + LOCK (&inode->lock); + { + (void)__afr_inode_ctx_get (this, inode, &ctx); + lock = &ctx->lock[AFR_DATA_TRANSACTION]; + data_local = afr_wakeup_same_fd_delayed_op (this, lock, + stub->args.fd); + lock = &ctx->lock[AFR_METADATA_TRANSACTION]; + metadata_local = afr_wakeup_same_fd_delayed_op (this, lock, + stub->args.fd); + } + UNLOCK (&inode->lock); + + if (data_local) { + data_local->transaction.resume_stub = stub; + } else if (metadata_local) { + metadata_local->transaction.resume_stub = stub; + } else { + call_resume (stub); + } + if (data_local) { + afr_delayed_changelog_wake_up_cbk (data_local); + } + if (metadata_local) { + afr_delayed_changelog_wake_up_cbk (metadata_local); + } +} + int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - call_stub_t *stub = NULL; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int op_errno = ENOMEM; local = AFR_FRAME_INIT (frame, op_errno); if (!local) @@ -3494,7 +3516,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) if (!stub) goto out; - afr_delayed_changelog_wake_resume (this, fd, stub); + afr_delayed_changelog_wake_resume (this, fd->inode, stub); return 0; out: @@ -3502,7 +3524,6 @@ out: return 0; } - int afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) @@ -4565,7 +4586,7 @@ afr_forget (xlator_t *this, inode_t *inode) return 0; ctx = (afr_inode_ctx_t *)ctx_int; - GF_FREE (ctx); + afr_inode_ctx_destroy (ctx); return 0; } @@ -5382,21 +5403,6 @@ out: } int -afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) -{ - int ret = -ENOMEM; - - lk->domain = dom; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->locked_nodes) - goto out; - ret = 0; -out: - return ret; -} - -int afr_transaction_local_init (afr_local_t *local, xlator_t *this) { int ret = -ENOMEM; @@ -5407,25 +5413,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (ret < 0) goto out; - if ((local->transaction.type == AFR_DATA_TRANSACTION) || - (local->transaction.type == AFR_METADATA_TRANSACTION)) { - ret = afr_inodelk_init (&local->internal_lock.inodelk[0], - this->name, priv->child_count); - if (ret < 0) - goto out; - } - ret = -ENOMEM; local->pre_op_compat = priv->pre_op_compat; - local->transaction.eager_lock = - GF_CALLOC (sizeof (*local->transaction.eager_lock), - priv->child_count, - gf_afr_mt_int32_t); - - if (!local->transaction.eager_lock) - goto out; - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), priv->child_count, gf_afr_mt_char); @@ -5457,9 +5447,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->pending) goto out; - INIT_LIST_HEAD (&local->transaction.eager_locked); - ret = 0; + INIT_LIST_HEAD (&local->transaction.wait_list); + INIT_LIST_HEAD (&local->transaction.owner_list); out: return ret; } @@ -5494,24 +5484,6 @@ out: return; } -void -afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - - local = frame->local; - - if (!local->fd) - return; - - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) - return; - - fd_ctx->open_fd_count = local->open_fd_count; -} - int** afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending, dict_t *xattr, ia_type_t iat) @@ -5620,7 +5592,7 @@ out: int afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, - inode_t *inode, gf_boolean_t *dsh, + fd_t *fd, gf_boolean_t *dsh, gf_boolean_t *pflag) { int ret = -1; @@ -5630,8 +5602,8 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, unsigned char *healed_sinks = NULL; unsigned char *undid_pending = NULL; afr_private_t *priv = NULL; - fd_t *fd = NULL; struct afr_reply *locked_replies = NULL; + inode_t *inode = fd->inode; priv = this->private; data_lock = alloca0 (priv->child_count); @@ -5640,18 +5612,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, healed_sinks = alloca0 (priv->child_count); undid_pending = alloca0 (priv->child_count); - /* Heal-info does an open() on the file being examined so that the - * current eager-lock holding client, if present, at some point sees - * open-fd count being > 1 and releases the eager-lock so that heal-info - * doesn't remain blocked forever until IO completes. - */ - ret = afr_selfheal_data_open (this, inode, &fd); - if (ret < 0) { - gf_msg_debug (this->name, -ret, "%s: Failed to open", - uuid_utoa (inode->gfid)); - goto out; - } - locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); ret = afr_selfheal_inodelk (frame, this, inode, this->name, @@ -5674,8 +5634,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, out: if (locked_replies) afr_replies_wipe (locked_replies, priv->child_count); - if (fd) - fd_unref (fd); return ret; } @@ -5760,6 +5718,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, { int ret = -1; + fd_t *fd = NULL; gf_boolean_t dsh = _gf_false; gf_boolean_t msh = _gf_false; gf_boolean_t esh = _gf_false; @@ -5771,6 +5730,21 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, /* For every heal type hold locks and check if it indeed needs heal */ + + /* Heal-info does an open() on the file being examined so that the + * current eager-lock holding client, if present, at some point sees + * open-fd count being > 1 and releases the eager-lock so that heal-info + * doesn't remain blocked forever until IO completes. + */ + if ((*inode)->ia_type == IA_IFREG) { + ret = afr_selfheal_data_open (this, *inode, &fd); + if (ret < 0) { + gf_msg_debug (this->name, -ret, "%s: Failed to open", + uuid_utoa ((*inode)->gfid)); + goto out; + } + } + if (msh) { ret = afr_selfheal_locked_metadata_inspect (frame, this, *inode, &msh, @@ -5780,7 +5754,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, } if (dsh) { - ret = afr_selfheal_locked_data_inspect (frame, this, *inode, + ret = afr_selfheal_locked_data_inspect (frame, this, fd, &dsh, pending); if (ret == -EIO || (ret == -EAGAIN)) goto out; @@ -5795,6 +5769,8 @@ out: *data_selfheal = dsh; *entry_selfheal = esh; *metadata_selfheal = msh; + if (fd) + fd_unref (fd); return ret; } @@ -6429,6 +6405,7 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this) local = frame->local; LOCK(&local->inode->lock); { + GF_ASSERT (local->inode_ctx->lock_count > 0); local->inode_ctx->lock_count--; if (!local->inode_ctx->lock_count) diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 8893a7db670..9cab08c653a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -344,14 +344,14 @@ afr_process_post_writev (call_frame_t *frame, xlator_t *this) the xattrs are not reliably pointing at a stale file. */ - afr_fd_report_unstable_write (this, local->fd); + afr_fd_report_unstable_write (this, local); __afr_inode_write_finalize (frame, this); afr_writev_handle_short_writes (frame, this); if (local->update_open_fd_count) - afr_handle_open_fd_count (frame, this); + local->inode_ctx->open_fd_count = local->open_fd_count; } @@ -2593,7 +2593,7 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, local->op = GF_FOP_FSYNC; local->cont.fsync.datasync = datasync; - if (afr_fd_has_witnessed_unstable_write (this, fd)) { + if (afr_fd_has_witnessed_unstable_write (this, fd->inode)) { /* don't care. we only wanted to CLEAR the bit */ } diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 260815f23d2..be3de01924d 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -52,31 +52,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2) int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); -static int -afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); - -static uint64_t afr_lock_number = 1; - -static uint64_t -get_afr_lock_number () -{ - return (++afr_lock_number); -} - -int -afr_set_lock_number (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_number = get_afr_lock_number (); - - return 0; -} - void afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) { @@ -203,21 +178,16 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; - afr_inodelk_t *inodelk = NULL; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - - inodelk->lock_count = 0; + int_lock->lock_count = 0; int_lock->lk_attempted_count = 0; int_lock->lock_op_ret = -1; int_lock->lock_op_errno = 0; - memset (inodelk->locked_nodes, 0, - sizeof (*inodelk->locked_nodes) * priv->child_count); memset (int_lock->locked_nodes, 0, sizeof (*int_lock->locked_nodes) * priv->child_count); @@ -286,12 +256,7 @@ void afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock, int32_t child_index) { - afr_inodelk_t *inodelk = NULL; - - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - inodelk->locked_nodes[child_index] &= LOCKED_NO; - if (local->transaction.eager_lock) - local->transaction.eager_lock[child_index] = 0; + int_lock->locked_nodes[child_index] &= LOCKED_NO; } @@ -331,35 +296,27 @@ static int afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; - struct gf_flock full_flock = {0,}; - struct gf_flock *flock_use = NULL; int call_count = 0; int i = 0; - int piggyback = 0; - afr_fd_ctx_t *fd_ctx = NULL; - local = frame->local; int_lock = &local->internal_lock; priv = this->private; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - - flock.l_start = inodelk->flock.l_start; - flock.l_len = inodelk->flock.l_len; + flock.l_start = int_lock->flock.l_start; + flock.l_len = int_lock->flock.l_len; flock.l_type = F_UNLCK; - full_flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (inodelk->locked_nodes, + call_count = afr_locked_nodes_count (int_lock->locked_nodes, priv->child_count); int_lock->lk_call_count = call_count; if (!call_count) { + GF_ASSERT (!local->transaction.do_eager_unlock); gf_msg_trace (this->name, 0, "No internal locks unlocked"); @@ -367,64 +324,28 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) goto out; } - if (local->fd) - fd_ctx = afr_fd_ctx_get (local->fd, this); - for (i = 0; i < priv->child_count; i++) { - if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) + if ((int_lock->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) continue; if (local->fd) { - flock_use = &flock; - if (!local->transaction.eager_lock[i]) { - goto wind; - } - - piggyback = 0; - - LOCK (&local->fd->lock); - { - if (fd_ctx->lock_piggyback[i]) { - fd_ctx->lock_piggyback[i]--; - piggyback = 1; - } else { - fd_ctx->lock_acquired[i]--; - } - } - UNLOCK (&local->fd->lock); - - if (piggyback) { - afr_unlock_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0, NULL); - if (!--call_count) - break; - continue; - } - - flock_use = &full_flock; - wind: STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, (void *) (long)i, priv->children[i], priv->children[i]->fops->finodelk, int_lock->domain, local->fd, - F_SETLK, flock_use, NULL); - - if (!--call_count) - break; - + F_SETLK, &flock, NULL); } else { - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, (void *) (long)i, priv->children[i], priv->children[i]->fops->inodelk, int_lock->domain, &local->loc, F_SETLK, &flock, NULL); - - if (!--call_count) - break; } + + if (!--call_count) + break; } out: return 0; @@ -512,6 +433,18 @@ out: } +int32_t +afr_unlock_now (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + + if (afr_is_inodelk_transaction(local->transaction.type)) + afr_unlock_inodelk (frame, this); + else + afr_unlock_entrylk (frame, this); + return 0; +} + static int32_t afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) @@ -553,7 +486,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if ((op_ret == -1) && (op_errno == ENOSYS)) { - afr_unlock (frame, this); + afr_unlock_now (frame, this); } else { if (op_ret == 0) { if (local->transaction.type == AFR_ENTRY_TRANSACTION || @@ -598,38 +531,6 @@ afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } -static int -afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - memcpy (inodelk->locked_nodes, int_lock->locked_nodes, - sizeof (*inodelk->locked_nodes) * priv->child_count); - inodelk->lock_count = int_lock->lock_count; - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_TRANSACTION: - /*entrylk_count is being used in both non-blocking and blocking - * modes */ - break; - } - - return 0; - -} - static gf_boolean_t afr_is_entrylk (afr_transaction_type trans_type) { @@ -733,7 +634,6 @@ int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) { afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; @@ -752,10 +652,9 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) if (!is_entrylk) { - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - flock.l_start = inodelk->flock.l_start; - flock.l_len = inodelk->flock.l_len; - flock.l_type = inodelk->flock.l_type; + flock.l_start = int_lock->flock.l_start; + flock.l_len = int_lock->flock.l_len; + flock.l_type = int_lock->flock.l_type; } if (local->fd) { @@ -770,9 +669,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) local->op_ret = -1; int_lock->lock_op_ret = -1; - afr_copy_locked_nodes (frame, this); - - afr_unlock (frame, this); + afr_unlock_now (frame, this); return 0; } @@ -784,9 +681,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) local->op_ret = -1; int_lock->lock_op_ret = -1; - afr_copy_locked_nodes (frame, this); - - afr_unlock(frame, this); + afr_unlock_now(frame, this); return 0; } @@ -798,8 +693,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) gf_msg_debug (this->name, 0, "we're done locking"); - afr_copy_locked_nodes (frame, this); - int_lock->lock_op_ret = 0; int_lock->lock_cbk (frame, this); return 0; @@ -815,7 +708,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) case AFR_METADATA_TRANSACTION: if (local->fd) { - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, (void *) (long) child_index, priv->children[child_index], @@ -824,7 +716,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) F_SETLKW, &flock, NULL); } else { - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, (void *) (long) child_index, priv->children[child_index], @@ -841,7 +732,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) *and 'fd-less' children */ if (local->fd) { - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, (void *) (long) cookie, priv->children[child_index], @@ -850,7 +740,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } else { - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, (void *) (long) cookie, priv->children[child_index], @@ -922,7 +811,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; int_lock = &local->internal_lock; - LOCK (&frame->lock); { if (op_ret < 0 ) { @@ -969,7 +857,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "with blocking calls", int_lock->lock_count); - afr_unlock(frame, this); + afr_unlock_now(frame, this); } } @@ -1009,7 +897,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; - afr_unlock (frame, this); + afr_unlock_now (frame, this); return -1; } @@ -1021,7 +909,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON, "fd not open on any subvolumes. aborting."); - afr_unlock (frame, this); + afr_unlock_now (frame, this); goto out; } @@ -1031,7 +919,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) index = i%copies; lockee_no = i/copies; if (local->child_up[index]) { - STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, priv->children[index], @@ -1053,7 +940,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) index = i%copies; lockee_no = i/copies; if (local->child_up[index]) { - STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, priv->children[index], @@ -1077,18 +963,12 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; int call_count = 0; int child_index = (long) cookie; local = frame->local; int_lock = &local->internal_lock; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - - if (local->fd) - fd_ctx = afr_fd_ctx_get (local->fd, this); LOCK (&frame->lock); { @@ -1105,43 +985,27 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int_lock->lock_op_errno = op_errno; local->op_errno = op_errno; } - if (local->transaction.eager_lock) - local->transaction.eager_lock[child_index] = 0; } else { - inodelk->locked_nodes[child_index] |= LOCKED_YES; - inodelk->lock_count++; - - if (local->transaction.eager_lock && - local->transaction.eager_lock[child_index] && - local->fd) { - /* piggybacked */ - if (op_ret == 1) { - /* piggybacked */ - } else if (op_ret == 0) { - /* lock acquired from server */ - fd_ctx->lock_acquired[child_index]++; - } - } - - if (local->transaction.type == AFR_DATA_TRANSACTION && - op_ret == 0) { - LOCK(&local->inode->lock); - { - local->inode_ctx->lock_count++; - } - UNLOCK (&local->inode->lock); - } + int_lock->locked_nodes[child_index] |= LOCKED_YES; + int_lock->lock_count++; } call_count = --int_lock->lk_call_count; } UNLOCK (&frame->lock); + if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) { + LOCK (&local->inode->lock); + { + local->inode_ctx->lock_count++; + } + UNLOCK (&local->inode->lock); + } if (call_count == 0) { gf_msg_trace (this->name, 0, "Last inode locking reply received"); /* all locks successful. Proceed to call FOP */ - if (inodelk->lock_count == int_lock->lk_expected_count) { + if (int_lock->lock_count == int_lock->lk_expected_count) { gf_msg_trace (this->name, 0, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; @@ -1155,7 +1019,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "Trying again with blocking calls", int_lock->lock_count); - afr_unlock(frame, this); + afr_unlock_now(frame, this); } } @@ -1166,30 +1030,17 @@ int afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; int32_t call_count = 0; int i = 0; int ret = 0; - struct gf_flock flock = {0,}; - struct gf_flock full_flock = {0,}; - struct gf_flock *flock_use = NULL; - int piggyback = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - - flock.l_start = inodelk->flock.l_start; - flock.l_len = inodelk->flock.l_len; - flock.l_type = inodelk->flock.l_type; - - full_flock.l_type = inodelk->flock.l_type; - initialize_inodelk_variables (frame, this); if (local->fd) { @@ -1205,88 +1056,48 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; - afr_unlock (frame, this); + afr_unlock_now (frame, this); ret = -1; goto out; } + } - call_count = internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; - - if (!call_count) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_SUBVOLS_DOWN, - "All bricks are down, aborting."); - afr_unlock (frame, this); - goto out; - } - - /* Send non-blocking inodelk calls only on up children - and where the fd has been opened */ - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - - flock_use = &flock; - if (!local->transaction.eager_lock_on) { - goto wind; - } - - piggyback = 0; - local->transaction.eager_lock[i] = 1; - - afr_set_delayed_post_op (frame, this); + call_count = internal_lock_count (frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; - LOCK (&local->fd->lock); - { - if (fd_ctx->lock_acquired[i]) { - fd_ctx->lock_piggyback[i]++; - piggyback = 1; - } - } - UNLOCK (&local->fd->lock); + if (!call_count) { + gf_msg (this->name, GF_LOG_INFO, 0, + AFR_MSG_SUBVOLS_DOWN, + "All bricks are down, aborting."); + afr_unlock_now (frame, this); + goto out; + } - if (piggyback) { - /* (op_ret == 1) => indicate piggybacked lock */ - afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0, NULL); - if (!--call_count) - break; - continue; - } - flock_use = &full_flock; - wind: + /* Send non-blocking inodelk calls only on up children + and where the fd has been opened */ + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + if (local->fd) { STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->finodelk, int_lock->domain, local->fd, - F_SETLK, flock_use, NULL); - - if (!--call_count) - break; - } - } else { - call_count = internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; + F_SETLK, &int_lock->flock, NULL); + } else { STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, (void *) (long) i, priv->children[i], priv->children[i]->fops->inodelk, int_lock->domain, &local->loc, - F_SETLK, &flock, NULL); - - if (!--call_count) - break; + F_SETLK, &int_lock->flock, NULL); } + if (!--call_count) + break; } out: return ret; @@ -1296,13 +1107,32 @@ int32_t afr_unlock (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; + afr_lock_t *lock = NULL; local = frame->local; - if (afr_is_inodelk_transaction(local->transaction.type)) - afr_unlock_inodelk (frame, this); - else - afr_unlock_entrylk (frame, this); + if (!local->transaction.eager_lock_on) + goto out; + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK (&local->inode->lock); + { + list_del_init (&local->transaction.owner_list); + if (list_empty (&lock->owners) && list_empty (&lock->post_op)) { + local->transaction.do_eager_unlock = _gf_true; + /*TODO: Need to get metadata use on_disk and inherit/uninherit + *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]); + *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]); + */ + GF_ASSERT (lock->release); + } + } + UNLOCK (&local->inode->lock); + if (!local->transaction.do_eager_unlock) { + local->internal_lock.lock_cbk (frame, this); + return 0; + } +out: + afr_unlock_now (frame, this); return 0; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 7195dfe058c..dc380c6d280 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -2469,6 +2469,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) int data_ret = 1; int or_ret = 0; inode_t *inode = NULL; + fd_t *fd = NULL; gf_boolean_t data_selfheal = _gf_false; gf_boolean_t metadata_selfheal = _gf_false; gf_boolean_t entry_selfheal = _gf_false; @@ -2493,8 +2494,16 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) goto out; } + if (inode->ia_type == IA_IFREG) { + ret = afr_selfheal_data_open (this, inode, &fd); + if (!fd) { + ret = -EIO; + goto out; + } + } + if (data_selfheal && dataheal_enabled) - data_ret = afr_selfheal_data (frame, this, inode); + data_ret = afr_selfheal_data (frame, this, fd); if (metadata_selfheal && priv->metadata_self_heal) metadata_ret = afr_selfheal_metadata (frame, this, inode); @@ -2516,6 +2525,8 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) out: if (inode) inode_unref (inode); + if (fd) + fd_unref (fd); return ret; } /* diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 3cf5b32b01d..40dee7a7d6c 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -869,22 +869,15 @@ out: } int -afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) +afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd) { afr_private_t *priv = NULL; unsigned char *locked_on = NULL; int ret = 0; - fd_t *fd = NULL; + inode_t *inode = fd->inode; priv = this->private; - ret = afr_selfheal_data_open (this, inode, &fd); - if (!fd) { - gf_msg_debug (this->name, -ret, "%s: Failed to open", - uuid_utoa (inode->gfid)); - return -EIO; - } - locked_on = alloca0 (priv->child_count); ret = afr_selfheal_tie_breaker_inodelk (frame, this, inode, @@ -911,8 +904,5 @@ unlock: afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on); - if (fd) - fd_unref (fd); - return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 8e976905e97..cd67d2a3192 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -102,7 +102,7 @@ afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name, void *gfid_req, dict_t *xdata); int -afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode); +afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd); int afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index a253c0835f5..ec72d46fb36 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -25,6 +25,18 @@ typedef enum { AFR_TRANSACTION_POST_OP, } afr_xattrop_type_t; +static void +afr_lock_resume_shared (struct list_head *list); + +void +__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared); + +void +afr_changelog_post_op (call_frame_t *frame, xlator_t *this); + +int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this); + gf_boolean_t afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); @@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this) return 0; } - int afr_transaction_done (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - gf_boolean_t unwind = _gf_false; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t unwind = _gf_false; + afr_lock_t *lock = NULL; + afr_local_t *lock_local = NULL; priv = this->private; local = frame->local; @@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this) if (unwind)/*It definitely did post-op*/ afr_zero_fill_stat (local); } + + if (local->transaction.do_eager_unlock) { + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK (&local->inode->lock); + { + lock->acquired = _gf_false; + lock->release = _gf_false; + list_splice_init (&lock->frozen, + &lock->waiting); + if (list_empty (&lock->waiting)) + goto unlock; + lock_local = list_entry (lock->waiting.next, + afr_local_t, + transaction.wait_list); + list_del_init (&lock_local->transaction.wait_list); + list_add (&lock_local->transaction.owner_list, + &lock->owners); + } +unlock: + UNLOCK (&local->inode->lock); + } + if (lock_local) { + afr_lock (lock_local->transaction.frame, + lock_local->transaction.frame->this); + } local->transaction.unwind (frame, this); AFR_STACK_DESTROY (frame); @@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this) return 0; } +static void +afr_lock_fail_shared (afr_local_t *local, struct list_head *list) +{ + afr_local_t *each = NULL; + + while (!list_empty(list)) { + each = list_entry (list->next, afr_local_t, + transaction.wait_list); + list_del_init(&each->transaction.wait_list); + each->op_ret = -1; + each->op_errno = local->op_errno; + afr_transaction_done (each->transaction.frame, + each->transaction.frame->this); + } +} + +static void +afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked) +{ + struct list_head shared; + afr_lock_t *lock = NULL; + + if (!local->transaction.eager_lock_on) + goto out; + + lock = &local->inode_ctx->lock[local->transaction.type]; + + INIT_LIST_HEAD (&shared); + LOCK (&local->inode->lock); + { + list_splice_init (&lock->waiting, &shared); + } + UNLOCK (&local->inode->lock); + + afr_lock_fail_shared (local, &shared); + local->transaction.do_eager_unlock = _gf_true; +out: + if (locked) { + local->internal_lock.lock_cbk = afr_transaction_done; + afr_unlock (local->transaction.frame, + local->transaction.frame->this); + } else { + afr_transaction_done (local->transaction.frame, + local->transaction.frame->this); + } +} call_frame_t* afr_transaction_detach_fop_frame (call_frame_t *frame) @@ -334,6 +418,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_private_t *priv = NULL; int pre_op_sources_count = 0; + int i = 0; priv = this->private; local = frame->local; @@ -345,11 +430,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) /* If arbiter is the only source, do not proceed. */ if (pre_op_sources_count < 2 && local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { - local->internal_lock.lock_cbk = afr_transaction_done; local->op_ret = -1; local->op_errno = ENOTCONN; - afr_restore_lk_owner (frame); - afr_unlock (frame, this); + for (i = 0; i < priv->child_count; i++) + local->transaction.failed_subvols[i] = 1; + afr_changelog_post_op (frame, this);/*uninherit should happen*/ } else { afr_transaction_fop (frame, this); } @@ -362,14 +447,16 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - fd_t *fd = NULL; int i = 0; int ret = 0; + int failure_count = 0; + struct list_head shared; + afr_lock_t *lock = NULL; local = frame->local; priv = this->private; - fd = local->fd; + INIT_LIST_HEAD (&shared); if (local->transaction.type == AFR_DATA_TRANSACTION && !local->transaction.inherited) { ret = afr_write_subvol_set (frame, this); @@ -394,22 +481,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) just now, before OP */ afr_changelog_pre_op_update (frame, this); - /* The wake up needs to happen independent of - what type of fop arrives here. If it was - a write, then it has already inherited the - lock and changelog. If it was not a write, - then the presumption of the optimization (of - optimizing for successive write operations) - fails. - */ - if (fd) - afr_delayed_changelog_wake_up (this, fd); + if (!local->transaction.eager_lock_on || + local->transaction.inherited) + goto fop; + failure_count = AFR_COUNT (local->transaction.failed_subvols, + priv->child_count); + if (failure_count == priv->child_count) { + afr_handle_lock_acquire_failure (local, _gf_true); + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK (&local->inode->lock); + { + lock->acquired = _gf_true; + __afr_transaction_wake_shared (local, &shared); + } + UNLOCK (&local->inode->lock); + } + +fop: if (priv->arbiter_count == 1) { afr_txn_arbitrate_fop (frame, this); } else { afr_transaction_fop (frame, this); } + afr_lock_resume_shared (&shared); return 0; } @@ -486,30 +582,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) } -afr_inodelk_t* -afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) -{ - afr_inodelk_t *inodelk = NULL; - int i = 0; - - for (i = 0; int_lock->inodelk[i].domain; i++) { - inodelk = &int_lock->inodelk[i]; - if (strcmp (dom, inodelk->domain) == 0) - return inodelk; - } - return NULL; -} - unsigned char* afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) { unsigned char *locked_nodes = NULL; - afr_inodelk_t *inodelk = NULL; switch (type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - locked_nodes = inodelk->locked_nodes; + locked_nodes = int_lock->locked_nodes; break; case AFR_ENTRY_TRANSACTION: @@ -834,27 +914,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - fd_t *fd = NULL; + afr_inode_ctx_t *ctx = NULL; int i = 0; gf_boolean_t ret = _gf_false; - afr_fd_ctx_t *fd_ctx = NULL; int type = 0; local = frame->local; priv = this->private; - fd = local->fd; + ctx = local->inode_ctx; type = afr_index_for_transaction_type (local->transaction.type); if (type != AFR_DATA_TRANSACTION) return !local->transaction.dirtied; - if (!fd) - return !local->transaction.dirtied; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - return _gf_false; - if (local->transaction.no_uninherit) return _gf_false; @@ -868,34 +940,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) if (local->transaction.uninherit_done) return local->transaction.uninherit_value; - LOCK(&fd->lock); + LOCK(&local->inode->lock); { for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i] != - fd_ctx->pre_op_done[type][i]) { + ctx->pre_op_done[type][i]) { ret = !local->transaction.dirtied; goto unlock; } } - if (fd_ctx->inherited[type]) { + if (ctx->inherited[type]) { ret = _gf_true; - fd_ctx->inherited[type]--; - } else if (fd_ctx->on_disk[type]) { + ctx->inherited[type]--; + } else if (ctx->on_disk[type]) { ret = _gf_false; - fd_ctx->on_disk[type]--; + ctx->on_disk[type]--; } else { /* ASSERT */ ret = _gf_false; } - if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { + if (!ctx->inherited[type] && !ctx->on_disk[type]) { for (i = 0; i < priv->child_count; i++) - fd_ctx->pre_op_done[type][i] = 0; + ctx->pre_op_done[type][i] = 0; } } unlock: - UNLOCK(&fd->lock); + UNLOCK(&local->inode->lock); local->transaction.uninherit_done = _gf_true; local->transaction.uninherit_value = ret; @@ -909,31 +981,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - fd_t *fd = NULL; int i = 0; gf_boolean_t ret = _gf_false; - afr_fd_ctx_t *fd_ctx = NULL; int type = 0; local = frame->local; priv = this->private; - fd = local->fd; if (local->transaction.type != AFR_DATA_TRANSACTION) return _gf_false; type = afr_index_for_transaction_type (local->transaction.type); - if (!fd) - return _gf_false; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - return _gf_false; - - LOCK(&fd->lock); + LOCK(&local->inode->lock); { - if (!fd_ctx->on_disk[type]) { + if (!local->inode_ctx->on_disk[type]) { /* nothing to inherit yet */ ret = _gf_false; goto unlock; @@ -941,21 +1003,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i] != - fd_ctx->pre_op_done[type][i]) { + local->inode_ctx->pre_op_done[type][i]) { /* either inherit exactly, or don't */ ret = _gf_false; goto unlock; } } - fd_ctx->inherited[type]++; + local->inode_ctx->inherited[type]++; ret = _gf_true; local->transaction.inherited = _gf_true; } unlock: - UNLOCK(&fd->lock); + UNLOCK(&local->inode->lock); return ret; } @@ -966,22 +1028,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; - fd_t *fd = NULL; - afr_fd_ctx_t *fd_ctx = NULL; int i = 0; gf_boolean_t ret = _gf_false; int type = 0; local = frame->local; priv = this->private; - fd = local->fd; - if (!fd) - return _gf_false; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - return _gf_false; + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) + return _gf_false; if (local->transaction.inherited) /* was already inherited in afr_changelog_pre_op */ @@ -997,26 +1053,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) ret = _gf_false; - LOCK(&fd->lock); + LOCK(&local->inode->lock); { - if (!fd_ctx->on_disk[type]) { + if (!local->inode_ctx->on_disk[type]) { for (i = 0; i < priv->child_count; i++) - fd_ctx->pre_op_done[type][i] = + local->inode_ctx->pre_op_done[type][i] = (!local->transaction.failed_subvols[i]); } else { for (i = 0; i < priv->child_count; i++) - if (fd_ctx->pre_op_done[type][i] != + if (local->inode_ctx->pre_op_done[type][i] != (!local->transaction.failed_subvols[i])) { local->transaction.no_uninherit = 1; goto unlock; } } - fd_ctx->on_disk[type]++; + local->inode_ctx->on_disk[type]++; ret = _gf_true; } unlock: - UNLOCK(&fd->lock); + UNLOCK(&local->inode->lock); return ret; } @@ -1322,6 +1378,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) afr_init_optimistic_changelog_for_txn (this, local); + if (afr_changelog_pre_op_inherit (frame, this)) + goto next; + /* This condition should not be met with present code, as * transaction.done will be called if locks are not acquired on even a * single node. @@ -1347,9 +1406,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) goto err; } - if (afr_changelog_pre_op_inherit (frame, this)) - goto next; - if (call_count < priv->child_count) pre_nop = _gf_false; @@ -1406,7 +1462,7 @@ err: local->op_ret = -1; local->op_errno = op_errno; - afr_unlock (frame, this); + afr_handle_lock_acquire_failure (local, _gf_true); if (xdata_req) dict_unref (xdata_req); @@ -1416,31 +1472,6 @@ err: int -afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_msg (this->name, GF_LOG_INFO, - 0, AFR_MSG_BLOCKING_LKS_FAILED, - "Blocking inodelks failed."); - afr_transaction_done (frame, this); - } else { - - gf_msg_debug (this->name, 0, - "Blocking inodelks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } - - return 0; -} - - -int afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1453,7 +1484,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_msg_debug (this->name, 0, "Non blocking inodelks failed. Proceeding to blocking"); - int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; + int_lock->lock_cbk = afr_internal_lock_finish; afr_blocking_lock (frame, this); } else { @@ -1467,31 +1498,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) int -afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_BLOCKING_LKS_FAILED, - "Blocking entrylks failed."); - afr_transaction_done (frame, this); - } else { - - gf_msg_debug (this->name, 0, - "Blocking entrylks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } - - return 0; -} - - -int afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1504,7 +1510,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_msg_debug (this->name, 0, "Non blocking entrylks failed. Proceeding to blocking"); - int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; + int_lock->lock_cbk = afr_internal_lock_finish; afr_blocking_lock (frame, this); } else { @@ -1565,29 +1571,28 @@ int afr_set_transaction_flock (xlator_t *this, afr_local_t *local) { afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; afr_private_t *priv = NULL; int_lock = &local->internal_lock; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); priv = this->private; - if ((priv->arbiter_count || priv->full_lock) && + if ((priv->arbiter_count || local->transaction.eager_lock_on || + priv->full_lock) && local->transaction.type == AFR_DATA_TRANSACTION) { /*Lock entire file to avoid network split brains.*/ - inodelk->flock.l_len = 0; - inodelk->flock.l_start = 0; + int_lock->flock.l_len = 0; + int_lock->flock.l_start = 0; } else { - inodelk->flock.l_len = local->transaction.len; - inodelk->flock.l_start = local->transaction.start; + int_lock->flock.l_len = local->transaction.len; + int_lock->flock.l_start = local->transaction.start; } - inodelk->flock.l_type = F_WRLCK; + int_lock->flock.l_type = F_WRLCK; return 0; } int -afr_lock_rec (call_frame_t *frame, xlator_t *this) +afr_lock (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; @@ -1628,74 +1633,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) return 0; } +static gf_boolean_t +afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +{ + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; -int -afr_lock (call_frame_t *frame, xlator_t *this) + return ((end1 >= start2) && (end2 >= start1)); +} + +gf_boolean_t +afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check) { - afr_set_lock_number (frame, this); + afr_local_t *each = NULL; + afr_lock_t *lock = NULL; - return afr_lock_rec (frame, this); + lock = &local->inode_ctx->lock[local->transaction.type]; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. + * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. + */ + list_for_each_entry (each, &lock->owners, transaction.owner_list) { + if (afr_locals_overlap (each, local)) { + return _gf_true; + } + } + + if (!waitlist_check) + return _gf_false; + list_for_each_entry (each, &lock->waiting, transaction.wait_list) { + if (afr_locals_overlap (each, local)) { + return _gf_true; + } + } + return _gf_false; } /* }}} */ - -int -afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +static void +afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src, + xlator_t *this) { - afr_changelog_pre_op (frame, this); + afr_private_t *priv = this->private; - return 0; + dst->domain = src->domain; + dst->flock.l_len = src->flock.l_len; + dst->flock.l_start = src->flock.l_start; + dst->flock.l_type = src->flock.l_type; + dst->lock_count = src->lock_count; + memcpy (dst->locked_nodes, src->locked_nodes, + priv->child_count * sizeof (*dst->locked_nodes)); } - void -afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) +__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + gf_boolean_t conflict = _gf_false; + afr_local_t *each = NULL; + afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type]; - /* call this function from any of the related optimizations - which benefit from delaying post op are enabled, namely: - - - changelog piggybacking - - eager locking - */ + while (!conflict) { + if (list_empty (&lock->waiting)) + return; + each = list_entry(lock->waiting.next, afr_local_t, + transaction.wait_list); + if (afr_has_lock_conflict (each, _gf_false)) { + conflict = _gf_true; + } + if (conflict && !list_empty (&lock->owners)) + return; + afr_copy_inodelk_vars (&each->internal_lock, + &local->internal_lock, + each->transaction.frame->this); + list_move_tail (&each->transaction.wait_list, shared); + list_add_tail(&each->transaction.owner_list, &lock->owners); + } +} - priv = this->private; - if (!priv) - return; +static void +afr_lock_resume_shared (struct list_head *list) +{ + afr_local_t *each = NULL; - if (!priv->post_op_delay_secs) - return; + while (!list_empty(list)) { + each = list_entry(list->next, afr_local_t, + transaction.wait_list); + list_del_init(&each->transaction.wait_list); + afr_changelog_pre_op (each->transaction.frame, + each->transaction.frame->this); + } +} - local = frame->local; - if (!local) - return; +int +afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + afr_lock_t *lock = NULL; - if (!local->transaction.eager_lock_on) - return; - if (!local->fd) - return; + local->internal_lock.lock_cbk = NULL; + if (!local->transaction.eager_lock_on) { + if (local->internal_lock.lock_op_ret < 0) { + afr_transaction_done (frame, this); + return 0; + } + afr_changelog_pre_op (frame, this); + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + if (local->internal_lock.lock_op_ret < 0) { + afr_handle_lock_acquire_failure (local, _gf_false); + } else { + lock->event_generation = local->event_generation; + afr_changelog_pre_op (frame, this); + } + } - if (local->op == GF_FOP_WRITE) - local->delayed_post_op = _gf_true; + return 0; } gf_boolean_t -afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) +afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this) { - afr_fd_ctx_t *fd_ctx = NULL; - - if (!fd) { - /* If false is returned, it may keep on taking eager-lock - * which may lead to starvation, so return true to avoid that. - */ - gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF, - AFR_MSG_INVALID_ARG, "Invalid fd"); - return _gf_true; - } /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock * is taken mount2 opened the same file, it won't be able to * perform any data operations until mount1 releases eager-lock. @@ -1703,11 +1787,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) * if open-fd-count is > 1 */ - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - return _gf_true; - - if (fd_ctx->open_fd_count > 1) + if (local->inode_ctx->open_fd_count > 1) return _gf_true; return _gf_false; @@ -1715,24 +1795,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) gf_boolean_t -is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) +afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this, + int delay) { - afr_local_t *local = NULL; - gf_boolean_t res = _gf_false; + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; + gf_boolean_t res = _gf_false; local = frame->local; - if (!local) + lock = &local->inode_ctx->lock[local->transaction.type]; + + if (!afr_txn_nothing_failed (frame, this)) { + lock->release = _gf_true; goto out; + } - if (!local->delayed_post_op) + if (afr_are_multiple_fds_opened (local, this)) { + lock->release = _gf_true; goto out; + } - //Mark pending changelog ASAP - if (!afr_txn_nothing_failed (frame, this)) + if (!list_empty (&lock->owners)) + goto out; + else + GF_ASSERT (list_empty (&lock->waiting)); + + if (lock->release) { + goto out; + } + + if (!delay) { goto out; + } - if (local->fd && afr_are_multiple_fds_opened (local->fd, this)) + if ((local->op != GF_FOP_WRITE) && + (local->op != GF_FOP_FXATTROP)) { + /*Only allow writes but shard does [f]xattrops on writes, so + * they are fine too*/ goto out; + } res = _gf_true; out: @@ -1743,50 +1844,61 @@ out: void afr_delayed_changelog_wake_up_cbk (void *data) { - fd_t *fd = NULL; + afr_lock_t *lock = NULL; + afr_local_t *local = data; + afr_local_t *timer_local = NULL; + struct list_head shared; - fd = data; - - afr_delayed_changelog_wake_up (THIS, fd); + INIT_LIST_HEAD (&shared); + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK (&local->inode->lock); + { + timer_local = list_entry(lock->post_op.next, + afr_local_t, + transaction.owner_list); + if (list_empty (&lock->owners) && (local == timer_local)) { + GF_ASSERT (list_empty (&lock->waiting)); + /*Last owner*/ + lock->release = _gf_true; + lock->delay_timer = NULL; + } + } + UNLOCK (&local->inode->lock); + afr_changelog_post_op_now (local->transaction.frame, + local->transaction.frame->this); } /* SET operation */ int -afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local) { - afr_fd_ctx_t *fdctx = NULL; - - fdctx = afr_fd_ctx_get (fd, this); - - LOCK(&fd->lock); + LOCK(&local->inode->lock); { - fdctx->witnessed_unstable_write = _gf_true; + local->inode_ctx->witnessed_unstable_write = _gf_true; } - UNLOCK(&fd->lock); + UNLOCK(&local->inode->lock); return 0; } /* TEST and CLEAR operation */ gf_boolean_t -afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode) { - afr_fd_ctx_t *fdctx = NULL; + afr_inode_ctx_t *ctx = NULL; gf_boolean_t witness = _gf_false; - fdctx = afr_fd_ctx_get (fd, this); - if (!fdctx) - return _gf_true; - - LOCK(&fd->lock); + LOCK(&inode->lock); { - if (fdctx->witnessed_unstable_write) { + (void)__afr_inode_ctx_get (this, inode, &ctx); + + if (ctx->witnessed_unstable_write) { witness = _gf_true; - fdctx->witnessed_unstable_write = _gf_false; + ctx->witnessed_unstable_write = _gf_false; } } - UNLOCK (&fd->lock); + UNLOCK (&inode->lock); return witness; } @@ -1929,7 +2041,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) mark a flag in the fdctx whenever an unstable write is witnessed. */ - if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) { afr_changelog_post_op_now (frame, this); return 0; } @@ -1947,87 +2059,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) return 0; } - void -afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub) +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) { - afr_fd_ctx_t *fd_ctx = NULL; - call_frame_t *prev_frame = NULL; - struct timespec delta = {0, }; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + struct timespec delta = {0, }; + afr_private_t *priv = NULL; + afr_local_t *local = frame->local; + afr_lock_t *lock = NULL; + gf_boolean_t post_op = _gf_true; + struct list_head shared; priv = this->private; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - goto out; - delta.tv_sec = priv->post_op_delay_secs; delta.tv_nsec = 0; - pthread_mutex_lock (&fd_ctx->delay_lock); - { - prev_frame = fd_ctx->delay_frame; - fd_ctx->delay_frame = NULL; - if (fd_ctx->delay_timer) - gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); - fd_ctx->delay_timer = NULL; - if (!frame) - goto unlock; - fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, - afr_delayed_changelog_wake_up_cbk, - fd); - fd_ctx->delay_frame = frame; - } -unlock: - pthread_mutex_unlock (&fd_ctx->delay_lock); - -out: - if (prev_frame) { - local = prev_frame->local; - local->transaction.resume_stub = stub; - afr_changelog_post_op_now (prev_frame, this); - } else if (stub) { - call_resume (stub); - } -} - - -void -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - if (is_afr_delayed_changelog_post_op_needed (frame, this)) - afr_delayed_changelog_post_op (this, frame, local->fd, NULL); - else - afr_changelog_post_op_safe (frame, this); -} - + INIT_LIST_HEAD (&shared); + if (!local->transaction.eager_lock_on) + goto out; + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK (&local->inode->lock); + { + list_del_init (&local->transaction.owner_list); + list_add (&local->transaction.owner_list, &lock->post_op); + __afr_transaction_wake_shared (local, &shared); + + if (!afr_is_delayed_changelog_post_op_needed (frame, this, + delta.tv_sec)) { + if (list_empty (&lock->owners)) + lock->release = _gf_true; + goto unlock; + } -/* Wake up the sleeping/delayed post-op, and also register - a stub to have it resumed after this transaction - completely finishes. + GF_ASSERT (lock->delay_timer == NULL); + lock->delay_timer = gf_timer_call_after (this->ctx, delta, + afr_delayed_changelog_wake_up_cbk, + local); + if (!lock->delay_timer) { + lock->release = _gf_true; + } else { + post_op = _gf_false; + } - The @stub gets saved in @local and gets resumed in - afr_local_cleanup() - */ -void -afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) -{ - afr_delayed_changelog_post_op (this, NULL, fd, stub); -} + } +unlock: + UNLOCK (&local->inode->lock); + if (!list_empty (&shared)) { + afr_lock_resume_shared (&shared); + } -void -afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) -{ - afr_delayed_changelog_post_op (this, NULL, fd, NULL); +out: + if (post_op) { + if (!local->transaction.eager_lock_on || lock->release) { + afr_changelog_post_op_safe (frame, this); + } else { + afr_changelog_post_op_now (frame, this); + } + } } int @@ -2037,13 +2126,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) local = frame->local; - if (local->transaction.eager_lock_on) { - /* We don't need to retain "local" in the - fd list anymore, writes to all subvols - are finished by now */ - afr_remove_eager_lock_stub (local); - } - afr_restore_lk_owner (frame); afr_handle_symmetric_errors (frame, this); @@ -2074,114 +2156,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, local->transaction.failed_subvols[child_index] = 1; } - - static gf_boolean_t -afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +__need_previous_lock_unlocked (afr_local_t *local) { - uint64_t start1 = local1->transaction.start; - uint64_t start2 = local2->transaction.start; - uint64_t end1 = 0; - uint64_t end2 = 0; - - if (local1->transaction.len) - end1 = start1 + local1->transaction.len - 1; - else - end1 = ULLONG_MAX; + afr_lock_t *lock = NULL; - if (local2->transaction.len) - end2 = start2 + local2->transaction.len - 1; - else - end2 = ULLONG_MAX; + if (!local->transaction.eager_lock_on) + return _gf_true; - return ((end1 >= start2) && (end2 >= start1)); + lock = &local->inode_ctx->lock[local->transaction.type]; + if (!lock->acquired) + return _gf_false; + if (lock->acquired && lock->event_generation != local->event_generation) + return _gf_true; + return _gf_false; } void -afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock, + gf_boolean_t *do_pre_op, afr_local_t **timer_local) { - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *each = NULL; + afr_lock_t *lock = NULL; + afr_local_t *owner_local = NULL; + xlator_t *this = local->transaction.frame->this; - priv = this->private; - - if (!local->fd) - return; - - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; + if (local->fd && !afr_are_multiple_fds_opened (local, this)) { + local->transaction.eager_lock_on = _gf_true; + } - if (!priv->eager_lock) - return; + lock = &local->inode_ctx->lock[local->transaction.type]; + if (__need_previous_lock_unlocked (local)) { + if (!list_empty (&lock->owners)) { + lock->release = _gf_true; + } else if (lock->delay_timer) { + lock->release = _gf_true; + if (gf_timer_call_cancel (this->ctx, + lock->delay_timer)) { + /* It will be put in frozen list + * in the code flow below*/ + } else { + *timer_local = list_entry(lock->post_op.next, + afr_local_t, + transaction.owner_list); + lock->delay_timer = NULL; + } + } + if (!local->transaction.eager_lock_on) + goto out; + } - fdctx = afr_fd_ctx_get (local->fd, this); - if (!fdctx) - return; + if (lock->release) { + list_add_tail (&local->transaction.wait_list, + &lock->frozen); + *take_lock = _gf_false; + goto out; + } - if (afr_are_multiple_fds_opened (local->fd, this)) - return; - /* - * Once full file lock is acquired in eager-lock phase, overlapping - * writes do not compete for inode-locks, instead are transferred to the - * next writes. Because of this overlapping writes are not ordered. - * This can cause inconsistencies in replication. - * Example: - * Two overlapping writes w1, w2 are sent in parallel on same fd - * in two threads t1, t2. - * Both threads can execute afr_writev_wind in the following manner. - * t1 winds w1 on brick-0 - * t2 winds w2 on brick-0 - * t2 winds w2 on brick-1 - * t1 winds w1 on brick-1 - * - * This check makes sure the locks are not transferred for - * overlapping writes. - */ - LOCK (&local->fd->lock); - { - list_for_each_entry (each, &fdctx->eager_locked, - transaction.eager_locked) { - if (afr_locals_overlap (each, local)) { - local->transaction.eager_lock_on = _gf_false; - goto unlock; - } + if (lock->delay_timer) { + *take_lock = _gf_false; + if (gf_timer_call_cancel (this->ctx, + lock->delay_timer)) { + list_add_tail (&local->transaction.wait_list, + &lock->frozen); + } else { + *timer_local = list_entry(lock->post_op.next, + afr_local_t, + transaction.owner_list); + afr_copy_inodelk_vars (&local->internal_lock, + &(*timer_local)->internal_lock, + this); + lock->delay_timer = NULL; + *do_pre_op = _gf_true; + list_add_tail (&local->transaction.owner_list, + &lock->owners); } + goto out; + } - local->transaction.eager_lock_on = _gf_true; - list_add_tail (&local->transaction.eager_locked, - &fdctx->eager_locked); + if (!list_empty (&lock->owners)) { + if (!lock->acquired || + afr_has_lock_conflict (local, _gf_true)) { + list_add_tail (&local->transaction.wait_list, + &lock->waiting); + *take_lock = _gf_false; + goto out; + } + owner_local = list_entry (lock->owners.next, + afr_local_t, + transaction.owner_list); + afr_copy_inodelk_vars (&local->internal_lock, + &owner_local->internal_lock, + this); + *take_lock = _gf_false; + *do_pre_op = _gf_true; } -unlock: - UNLOCK (&local->fd->lock); + + if (lock->acquired) + GF_ASSERT (!(*take_lock)); + list_add_tail (&local->transaction.owner_list, &lock->owners); +out: + return; } void -afr_transaction_start (call_frame_t *frame, xlator_t *this) +afr_transaction_start (afr_local_t *local, xlator_t *this) { - afr_local_t *local = frame->local; - fd_t *fd = NULL; + afr_private_t *priv = NULL; + gf_boolean_t take_lock = _gf_true; + gf_boolean_t do_pre_op = _gf_false; + afr_local_t *timer_local = NULL; - afr_transaction_eager_lock_init (local, this); + priv = this->private; - if (local->fd && local->transaction.eager_lock_on) - afr_set_lk_owner (frame, this, local->fd); - else - afr_set_lk_owner (frame, this, frame->root); + if (local->transaction.type != AFR_DATA_TRANSACTION && + local->transaction.type != AFR_METADATA_TRANSACTION) + goto lock_phase; - if (!local->transaction.eager_lock_on && local->loc.inode) { - fd = fd_lookup (local->loc.inode, frame->root->pid); - if (fd == NULL) - fd = fd_lookup_anonymous (local->loc.inode, - GF_ANON_FD_FLAGS); + if (!priv->eager_lock) + goto lock_phase; - if (fd) { - afr_delayed_changelog_wake_up (this, fd); - fd_unref (fd); - } + LOCK (&local->inode->lock); + { + __afr_eager_lock_handle (local, &take_lock, &do_pre_op, + &timer_local); } + UNLOCK (&local->inode->lock); +lock_phase: + if (!local->transaction.eager_lock_on) { + afr_set_lk_owner (local->transaction.frame, this, + local->transaction.frame->root); + } else { + afr_set_lk_owner (local->transaction.frame, this, local->inode); + } + - afr_lock (frame, this); + if (take_lock) { + afr_lock (local->transaction.frame, this); + } else if (do_pre_op) { + afr_changelog_pre_op (local->transaction.frame, this); + } + /*Always call delayed_changelog_wake_up_cbk after calling pre-op above + * so that any inheriting can happen*/ + if (timer_local) + afr_delayed_changelog_wake_up_cbk (timer_local); } int @@ -2194,7 +2311,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) goto fail; } - afr_transaction_start (frame, this); + afr_transaction_start (local, this); return 0; fail: local->transaction.unwind (frame, this); @@ -2212,6 +2329,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) local = frame->local; priv = this->private; + local->transaction.frame = frame; local->transaction.type = type; @@ -2224,11 +2342,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) if (ret < 0) goto out; - if (type == AFR_ENTRY_TRANSACTION || - type == AFR_ENTRY_RENAME_TRANSACTION) { - afr_transaction_start (frame, this); - ret = 0; - goto out; + + if (type != AFR_METADATA_TRANSACTION) { + goto txn_start; } ret = afr_inode_get_readable (frame, local->inode, this, @@ -2238,10 +2354,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) event_generation)) { afr_inode_refresh (frame, this, local->inode, local->loc.gfid, afr_write_txn_refresh_done); - } else { - afr_transaction_start (frame, this); + ret = 0; + goto out; } + +txn_start: ret = 0; + afr_transaction_start (local, this); out: return ret; } diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index ddcb1ebe3eb..a27e9a3c0b4 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -17,12 +17,6 @@ void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index); -int -afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); - -afr_inodelk_t* -afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); - int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); @@ -30,9 +24,6 @@ int afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); void -afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); - -void afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); void @@ -57,4 +48,8 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv, inode_t *inode2, unsigned char *readable2); int afr_transaction_resume (call_frame_t *frame, xlator_t *this); +int +afr_lock (call_frame_t *frame, xlator_t *this); +void +afr_delayed_changelog_wake_up_cbk (void *data); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index dcaf2887173..b2f3af136bd 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -229,19 +229,12 @@ int afr_entry_lockee_cmp (const void *l1, const void *l2); typedef struct { - char *domain; /* Domain on which inodelk is taken */ - struct gf_flock flock; - unsigned char *locked_nodes; - int32_t lock_count; -} afr_inodelk_t; - -typedef struct { loc_t *lk_loc; int lockee_count; afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; - afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; + struct gf_flock flock; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -254,7 +247,6 @@ typedef struct { int32_t lock_count; int32_t entrylk_lock_count; - uint64_t lock_number; int32_t lk_call_count; int32_t lk_expected_count; int32_t lk_attempted_count; @@ -292,37 +284,9 @@ typedef enum { } afr_fd_open_status_t; typedef struct { - unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; - int inherited[AFR_NUM_CHANGE_LOGS]; - int on_disk[AFR_NUM_CHANGE_LOGS]; afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ - - unsigned int *lock_piggyback; - unsigned int *lock_acquired; - int flags; - /* used for delayed-post-op optimization */ - pthread_mutex_t delay_lock; - gf_timer_t *delay_timer; - call_frame_t *delay_frame; - - /* set if any write on this fd was a non stable write - (i.e, without O_SYNC or O_DSYNC) - */ - gf_boolean_t witnessed_unstable_write; - - /* @open_fd_count: - Number of open FDs queried from the server, as queried through - xdata in FOPs. Currently, used to decide if eager-locking must be - temporarily disabled. - */ - uint32_t open_fd_count; - - - /* list of frames currently in progress */ - struct list_head eager_locked; - /* the subvolume on which the latest sequence of readdirs (starting at offset 0) has begun. Till the next readdir request with 0 offset arrives, we continue to read off this subvol. @@ -336,6 +300,20 @@ typedef enum { AFR_FOP_LOCK_QUORUM_FAILED, } afr_fop_lock_state_t; +typedef struct _afr_inode_lock_t { + unsigned int event_generation; + gf_boolean_t release; + gf_boolean_t acquired; + gf_timer_t *delay_timer; + struct list_head owners; /*Transactions that are performing fop*/ + struct list_head post_op;/*Transactions that are done with the fop + *So can not conflict with the fops*/ + struct list_head waiting;/*Transaction that are waiting for + *conflicting transactions to complete*/ + struct list_head frozen;/*Transactions that need to go as part of + * next batch of eager-lock*/ +} afr_lock_t; + typedef struct _afr_inode_ctx { uint64_t read_subvol; uint64_t write_subvol; @@ -343,6 +321,23 @@ typedef struct _afr_inode_ctx { int spb_choice; gf_timer_t *timer; gf_boolean_t need_refresh; + unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; + int inherited[AFR_NUM_CHANGE_LOGS]; + int on_disk[AFR_NUM_CHANGE_LOGS]; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* @open_fd_count: + Number of open FDs queried from the server, as queried through + xdata in FOPs. Currently, used to decide if eager-locking must be + temporarily disabled. + */ + uint32_t open_fd_count; + /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/ + afr_lock_t lock[2]; } afr_inode_ctx_t; @@ -457,7 +452,6 @@ typedef struct _afr_local { dict_t *dict; int optimistic_change_log; - gf_boolean_t delayed_post_op; /* Is the current writev() going to perform a stable write? i.e, is fd->flags or @flags writev param have O_SYNC or @@ -693,7 +687,7 @@ typedef struct _afr_local { off_t start, len; gf_boolean_t eager_lock_on; - int *eager_lock; + gf_boolean_t do_eager_unlock; char *basename; char *new_basename; @@ -707,7 +701,8 @@ typedef struct _afr_local { of the transaction frame */ call_stub_t *resume_stub; - struct list_head eager_locked; + struct list_head owner_list; + struct list_head wait_list; unsigned char *pre_op; @@ -768,7 +763,8 @@ typedef struct _afr_local { */ afr_changelog_resume_t changelog_resume; - call_frame_t *main_frame; + call_frame_t *main_frame; /*Fop frame*/ + call_frame_t *frame; /*Transaction frame*/ int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); @@ -1009,7 +1005,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); afr_local_cleanup (frame->local, THIS); \ mem_put (frame->local); \ frame->local = NULL; }; \ - frame->local;}) + frame->local; }) #define AFR_STACK_RESET(frame) \ do { \ @@ -1096,22 +1092,10 @@ afr_filter_xattrs (dict_t *xattr); #define AFR_QUORUM_AUTO INT_MAX int -afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); +afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local); gf_boolean_t -afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); - -void -afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); - -int -afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); - -void -afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); - -void -afr_remove_eager_lock_stub (afr_local_t *local); +afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode); void afr_reply_wipe (struct afr_reply *reply); @@ -1225,5 +1209,4 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this); int afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode); - #endif /* __AFR_H__ */ |