diff options
| -rw-r--r-- | tests/bugs/replicate/bug-966018.t | 36 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 315 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-inode-write.c | 6 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-lk-common.c | 348 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 13 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 14 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-self-heal.h | 2 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 913 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 13 | ||||
| -rw-r--r-- | xlators/cluster/afr/src/afr.h | 97 | 
10 files changed, 813 insertions, 944 deletions
diff --git a/tests/bugs/replicate/bug-966018.t b/tests/bugs/replicate/bug-966018.t deleted file mode 100644 index 1b5296b498b..00000000000 --- a/tests/bugs/replicate/bug-966018.t +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -. $(dirname $0)/../../include.rc -. $(dirname $0)/../../volume.rc -. $(dirname $0)/../../nfs.rc - -#This tests if cluster.eager-lock blocks metadata operations on nfs/fuse mounts. -#If it is not woken up, INODELK from the next command waits -#for post-op-delay secs. - -cleanup; -TEST glusterd -TEST pidof glusterd - -TEST $CLI volume create $V0 replica 2 $H0:$B0/r2_0 $H0:$B0/r2_1 -TEST $CLI volume set $V0 ensure-durability off -TEST $CLI volume set $V0 cluster.eager-lock on -TEST $CLI volume set $V0 cluster.post-op-delay-secs 3 -TEST $CLI volume set $V0 nfs.disable false - -TEST $CLI volume start $V0 -TEST $CLI volume profile $V0 start -EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available; -TEST mount_nfs $H0:/$V0 $N0 nolock; -TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0 -echo 1 > $N0/1 && chmod +x $N0/1 -echo 1 > $M0/1 && chmod +x $M0/1 - -#Check that INODELK MAX latency is not in the order of seconds -#Test if the MAX INODELK fop latency is of the order of seconds. -inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {max = 0} {if ($6 > max) max=$6;} END {print max}' | cut -d. -f 1 | egrep "[0-9]{7,}") - -TEST [ -z $inodelk_max_latency ] -EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0 - -cleanup; diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index c9953139b7e..bfd8c2e8c2c 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -121,37 +121,77 @@ afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local,          return _gf_false;  } +static void +afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) +{ +        int i = 0; + +        if (!ctx) +                return; + +        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { +                GF_FREE (ctx->pre_op_done[i]); +        } + +        GF_FREE (ctx); +} +  int  __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)  { -        uint64_t                ctx_int = 0; -        int                     ret     = -1; -        afr_inode_ctx_t        *tmp_ctx = NULL; +        uint64_t        ctx_int   = 0; +        int             ret       = -1; +        int             i         = -1; +        int             num_locks = -1; +        afr_inode_ctx_t *ictx     = NULL; +        afr_lock_t      *lock     = NULL; +        afr_private_t   *priv     = this->private;          ret = __inode_ctx_get (inode, this, &ctx_int); -        if (ret) { -                tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), -                                     gf_afr_mt_inode_ctx_t); -                if (!tmp_ctx) -                        goto out; +        if (ret == 0) { +                *ctx = (afr_inode_ctx_t *)ctx_int; +                return 0; +        } -                ctx_int = (long) tmp_ctx; -                ret = __inode_ctx_set (inode, this, &ctx_int); -                if (ret) { -                        GF_FREE (tmp_ctx); +        ictx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), gf_afr_mt_inode_ctx_t); +        if (!ictx) +                goto out; + +        for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { +                ictx->pre_op_done[i] = GF_CALLOC (sizeof *ictx->pre_op_done[i], +                                                  priv->child_count, +                                                  gf_afr_mt_int32_t); +                if (!ictx->pre_op_done[i]) { +                        ret = -ENOMEM;                          goto out;                  } -                tmp_ctx->spb_choice = -1; -                tmp_ctx->read_subvol = 0; -                tmp_ctx->write_subvol = 0; -                tmp_ctx->lock_count = 0; -        } else { -                tmp_ctx = (afr_inode_ctx_t *) ctx_int;          } -        *ctx = tmp_ctx; +        num_locks = sizeof(ictx->lock)/sizeof(afr_lock_t); +        for (i = 0; i < num_locks; i++) { +                lock = &ictx->lock[i]; +                INIT_LIST_HEAD (&lock->post_op); +                INIT_LIST_HEAD (&lock->frozen); +                INIT_LIST_HEAD (&lock->waiting); +                INIT_LIST_HEAD (&lock->owners); +        } + +        ctx_int = (uint64_t)ictx; +        ret = __inode_ctx_set (inode, this, &ctx_int); +        if (ret) { +                goto out; +        } + +        ictx->spb_choice = -1; +        ictx->read_subvol = 0; +        ictx->write_subvol = 0; +        ictx->lock_count = 0;          ret = 0; +        *ctx = ictx;  out: +        if (ret) { +                afr_inode_ctx_destroy (ictx); +        }          return ret;  } @@ -1752,10 +1792,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)          GF_FREE (local->internal_lock.locked_nodes); -        for (i = 0; local->internal_lock.inodelk[i].domain; i++) { -                GF_FREE (local->internal_lock.inodelk[i].locked_nodes); -        } -          GF_FREE (local->internal_lock.lower_locked_nodes);          afr_entry_lockee_cleanup (&local->internal_lock); @@ -1772,7 +1808,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)                  GF_FREE (local->transaction.changelog_xdata);          } -        GF_FREE (local->transaction.eager_lock);          GF_FREE (local->transaction.failed_subvols);          GF_FREE (local->transaction.basename); @@ -1819,16 +1854,6 @@ afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv)  	memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);  } -void -afr_remove_eager_lock_stub (afr_local_t *local) -{ -        LOCK (&local->fd->lock); -        { -                list_del_init (&local->transaction.eager_locked); -        } -        UNLOCK (&local->fd->lock); -} -  static gf_boolean_t  afr_fop_lock_is_unlock (call_frame_t *frame)  { @@ -1933,10 +1958,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)  	syncbarrier_destroy (&local->barrier); -        if (local->transaction.eager_lock_on && -            !list_empty (&local->transaction.eager_locked)) -                afr_remove_eager_lock_stub (local); -          afr_local_transaction_cleanup (local, this);          priv = this->private; @@ -3228,22 +3249,8 @@ out:  void  _afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx)  { -        int i = 0; - - -	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) -		GF_FREE (fd_ctx->pre_op_done[i]); -          GF_FREE (fd_ctx->opened_on); - -        GF_FREE (fd_ctx->lock_piggyback); - -        GF_FREE (fd_ctx->lock_acquired); - -	pthread_mutex_destroy (&fd_ctx->delay_lock); -          GF_FREE (fd_ctx); -          return;  } @@ -3261,15 +3268,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)          fd_ctx = (afr_fd_ctx_t *)(long) ctx;          if (fd_ctx) { -                /*no need to take any locks*/ -                if (!list_empty (&fd_ctx->eager_locked)) -                        gf_msg (this->name, GF_LOG_WARNING, 0, -                                AFR_MSG_INVALID_DATA, "%s: Stale " -                                "Eager-lock stubs found", -                                uuid_utoa (fd->inode->gfid)); -                  _afr_cleanup_fd_ctx (fd_ctx); -          }  out: @@ -3350,23 +3349,6 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)                  goto out;          } -        ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL); -        if (ret) { -                GF_FREE (fd_ctx); -                fd_ctx = NULL; -                goto out; -        } - -	for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { -		fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), -						    priv->child_count, -						    gf_afr_mt_int32_t); -		if (!fd_ctx->pre_op_done[i]) { -			ret = -ENOMEM; -			goto out; -		} -	} -          fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),                                         priv->child_count,                                         gf_afr_mt_int32_t); @@ -3382,26 +3364,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)  			fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;  	} -        fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), -                                            priv->child_count, -                                            gf_afr_mt_char); -        if (!fd_ctx->lock_piggyback) { -                ret = -ENOMEM; -                goto out; -        } - -        fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), -                                           priv->child_count, -                                           gf_afr_mt_char); -        if (!fd_ctx->lock_acquired) { -                ret = -ENOMEM; -                goto out; -        } -  	fd_ctx->readdir_subvol = -1; -        INIT_LIST_HEAD (&fd_ctx->eager_locked); -          ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);          if (ret)                  gf_msg_debug (this->name, 0, @@ -3473,12 +3437,70 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)          return 0;  } +afr_local_t* +afr_wakeup_same_fd_delayed_op (xlator_t *this, afr_lock_t *lock, fd_t *fd) +{ +        afr_local_t *local = NULL; + +        if (lock->delay_timer) { +                local = list_entry(lock->post_op.next, afr_local_t, +                                   transaction.owner_list); +                if (fd == local->fd) { +                        if (gf_timer_call_cancel (this->ctx, +                                                  lock->delay_timer)) { +                                local = NULL; +                        } else { +                                lock->delay_timer = NULL; +                        } +                } else { +                        local = NULL; +                } +        } + +        return local; +} + +void +afr_delayed_changelog_wake_resume (xlator_t *this, inode_t *inode, +                                   call_stub_t *stub) +{ +        afr_inode_ctx_t *ctx = NULL; +        afr_lock_t      *lock = NULL; +        afr_local_t     *metadata_local = NULL; +        afr_local_t     *data_local = NULL; +        LOCK (&inode->lock); +        { +                (void)__afr_inode_ctx_get (this, inode, &ctx); +                lock = &ctx->lock[AFR_DATA_TRANSACTION]; +                data_local = afr_wakeup_same_fd_delayed_op (this, lock, +                                                            stub->args.fd); +                lock = &ctx->lock[AFR_METADATA_TRANSACTION]; +                metadata_local = afr_wakeup_same_fd_delayed_op (this, lock, +                                                                stub->args.fd); +        } +        UNLOCK (&inode->lock); + +        if (data_local) { +                data_local->transaction.resume_stub = stub; +        } else if (metadata_local) { +                metadata_local->transaction.resume_stub = stub; +        } else { +                call_resume (stub); +        } +        if (data_local) { +                afr_delayed_changelog_wake_up_cbk (data_local); +        } +        if (metadata_local) { +                afr_delayed_changelog_wake_up_cbk (metadata_local); +        } +} +  int  afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)  { -        afr_local_t   *local = NULL; -        call_stub_t   *stub = NULL; -        int            op_errno   = ENOMEM; +        afr_local_t *local   = NULL; +        call_stub_t *stub    = NULL; +        int         op_errno = ENOMEM;  	local = AFR_FRAME_INIT (frame, op_errno);  	if (!local) @@ -3494,7 +3516,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)          if (!stub)                  goto out; -        afr_delayed_changelog_wake_resume (this, fd, stub); +        afr_delayed_changelog_wake_resume (this, fd->inode, stub);  	return 0;  out: @@ -3502,7 +3524,6 @@ out:          return 0;  } -  int  afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  		  int32_t op_ret, int32_t op_errno, dict_t *xdata) @@ -4565,7 +4586,7 @@ afr_forget (xlator_t *this, inode_t *inode)                  return 0;          ctx = (afr_inode_ctx_t *)ctx_int; -        GF_FREE (ctx); +        afr_inode_ctx_destroy (ctx);          return 0;  } @@ -5382,21 +5403,6 @@ out:  }  int -afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) -{ -        int             ret = -ENOMEM; - -        lk->domain = dom; -        lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), -                                      child_count, gf_afr_mt_char); -        if (NULL == lk->locked_nodes) -                goto out; -        ret = 0; -out: -        return ret; -} - -int  afr_transaction_local_init (afr_local_t *local, xlator_t *this)  {          int            ret = -ENOMEM; @@ -5407,25 +5413,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)          if (ret < 0)                  goto out; -        if ((local->transaction.type == AFR_DATA_TRANSACTION) || -            (local->transaction.type == AFR_METADATA_TRANSACTION)) { -                ret = afr_inodelk_init (&local->internal_lock.inodelk[0], -                                        this->name, priv->child_count); -                if (ret < 0) -                        goto out; -        } -          ret = -ENOMEM;  	local->pre_op_compat = priv->pre_op_compat; -        local->transaction.eager_lock = -                GF_CALLOC (sizeof (*local->transaction.eager_lock), -                           priv->child_count, -                           gf_afr_mt_int32_t); - -        if (!local->transaction.eager_lock) -                goto out; -          local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),                                                 priv->child_count,                                                 gf_afr_mt_char); @@ -5457,9 +5447,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)          if (!local->pending)                  goto out; -	INIT_LIST_HEAD (&local->transaction.eager_locked); -          ret = 0; +        INIT_LIST_HEAD (&local->transaction.wait_list); +        INIT_LIST_HEAD (&local->transaction.owner_list);  out:          return ret;  } @@ -5494,24 +5484,6 @@ out:          return;  } -void -afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t     *local = NULL; -        afr_fd_ctx_t    *fd_ctx   = NULL; - -        local = frame->local; - -        if (!local->fd) -		return; - -	fd_ctx = afr_fd_ctx_get (local->fd, this); -	if (!fd_ctx) -		return; - -	fd_ctx->open_fd_count = local->open_fd_count; -} -  int**  afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,                              dict_t *xattr, ia_type_t iat) @@ -5620,7 +5592,7 @@ out:  int  afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, -                                  inode_t *inode, gf_boolean_t *dsh, +                                  fd_t *fd, gf_boolean_t *dsh,                                    gf_boolean_t *pflag)  {          int ret = -1; @@ -5630,8 +5602,8 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,          unsigned char *healed_sinks = NULL;          unsigned char *undid_pending = NULL;          afr_private_t   *priv = NULL; -        fd_t          *fd = NULL;          struct afr_reply *locked_replies = NULL; +        inode_t *inode = fd->inode;          priv = this->private;          data_lock = alloca0 (priv->child_count); @@ -5640,18 +5612,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,          healed_sinks = alloca0 (priv->child_count);          undid_pending = alloca0 (priv->child_count); -        /* Heal-info does an open() on the file being examined so that the -         * current eager-lock holding client, if present, at some point sees -         * open-fd count being > 1 and releases the eager-lock so that heal-info -         * doesn't remain blocked forever until IO completes. -         */ -        ret = afr_selfheal_data_open (this, inode, &fd); -        if (ret < 0) { -                gf_msg_debug (this->name, -ret, "%s: Failed to open", -                              uuid_utoa (inode->gfid)); -                goto out; -        } -          locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);          ret = afr_selfheal_inodelk (frame, this, inode, this->name, @@ -5674,8 +5634,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,  out:          if (locked_replies)                  afr_replies_wipe (locked_replies, priv->child_count); -        if (fd) -                fd_unref (fd);          return ret;  } @@ -5760,6 +5718,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,  {          int ret             = -1; +        fd_t *fd            = NULL;          gf_boolean_t    dsh = _gf_false;          gf_boolean_t    msh = _gf_false;          gf_boolean_t    esh = _gf_false; @@ -5771,6 +5730,21 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,          /* For every heal type hold locks and check if it indeed needs heal */ + +        /* Heal-info does an open() on the file being examined so that the +         * current eager-lock holding client, if present, at some point sees +         * open-fd count being > 1 and releases the eager-lock so that heal-info +         * doesn't remain blocked forever until IO completes. +         */ +        if ((*inode)->ia_type == IA_IFREG) { +                ret = afr_selfheal_data_open (this, *inode, &fd); +                if (ret < 0) { +                        gf_msg_debug (this->name, -ret, "%s: Failed to open", +                                      uuid_utoa ((*inode)->gfid)); +                        goto out; +                } +        } +          if (msh) {                  ret = afr_selfheal_locked_metadata_inspect (frame, this,                                                              *inode, &msh, @@ -5780,7 +5754,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,          }          if (dsh) { -                ret = afr_selfheal_locked_data_inspect (frame, this, *inode, +                ret = afr_selfheal_locked_data_inspect (frame, this, fd,                                                          &dsh, pending);                  if (ret == -EIO || (ret == -EAGAIN))                          goto out; @@ -5795,6 +5769,8 @@ out:          *data_selfheal = dsh;          *entry_selfheal = esh;          *metadata_selfheal = msh; +        if (fd) +                fd_unref (fd);          return ret;  } @@ -6429,6 +6405,7 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this)          local = frame->local;          LOCK(&local->inode->lock);          { +                GF_ASSERT (local->inode_ctx->lock_count > 0);                  local->inode_ctx->lock_count--;                  if (!local->inode_ctx->lock_count) diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 8893a7db670..9cab08c653a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -344,14 +344,14 @@ afr_process_post_writev (call_frame_t *frame, xlator_t *this)                     the xattrs are not reliably pointing at                     a stale file.                  */ -                afr_fd_report_unstable_write (this, local->fd); +                afr_fd_report_unstable_write (this, local);          __afr_inode_write_finalize (frame, this);          afr_writev_handle_short_writes (frame, this);          if (local->update_open_fd_count) -                afr_handle_open_fd_count (frame, this); +                local->inode_ctx->open_fd_count = local->open_fd_count;  } @@ -2593,7 +2593,7 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,          local->op = GF_FOP_FSYNC;          local->cont.fsync.datasync = datasync; -	if (afr_fd_has_witnessed_unstable_write (this, fd)) { +	if (afr_fd_has_witnessed_unstable_write (this, fd->inode)) {  		/* don't care. we only wanted to CLEAR the bit */  	} diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 260815f23d2..be3de01924d 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -52,31 +52,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2)  int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); -static int -afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); - -static uint64_t afr_lock_number = 1; - -static uint64_t -get_afr_lock_number () -{ -        return (++afr_lock_number); -} - -int -afr_set_lock_number (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t         *local    = NULL; -        afr_internal_lock_t *int_lock = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; - -        int_lock->lock_number = get_afr_lock_number (); - -        return 0; -} -  void  afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner)  { @@ -203,21 +178,16 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)          afr_local_t         *local    = NULL;          afr_internal_lock_t *int_lock = NULL;          afr_private_t       *priv     = NULL; -        afr_inodelk_t       *inodelk  = NULL;          priv     = this->private;          local    = frame->local;          int_lock = &local->internal_lock; -        inodelk = afr_get_inodelk (int_lock, int_lock->domain); - -        inodelk->lock_count    = 0; +        int_lock->lock_count    = 0;          int_lock->lk_attempted_count = 0;          int_lock->lock_op_ret   = -1;          int_lock->lock_op_errno = 0; -        memset (inodelk->locked_nodes, 0, -                sizeof (*inodelk->locked_nodes) * priv->child_count);          memset (int_lock->locked_nodes, 0,                  sizeof (*int_lock->locked_nodes) * priv->child_count); @@ -286,12 +256,7 @@ void  afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock,                      int32_t child_index)  { -        afr_inodelk_t       *inodelk = NULL; - -        inodelk = afr_get_inodelk (int_lock, int_lock->domain); -        inodelk->locked_nodes[child_index] &= LOCKED_NO; -        if (local->transaction.eager_lock) -                local->transaction.eager_lock[child_index] = 0; +        int_lock->locked_nodes[child_index] &= LOCKED_NO;  } @@ -331,35 +296,27 @@ static int  afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)  {          afr_internal_lock_t *int_lock = NULL; -        afr_inodelk_t       *inodelk  = NULL;          afr_local_t         *local    = NULL;          afr_private_t       *priv     = NULL;          struct gf_flock flock = {0,}; -        struct gf_flock full_flock = {0,}; -        struct gf_flock *flock_use = NULL;          int call_count = 0;          int i = 0; -        int piggyback = 0; -        afr_fd_ctx_t        *fd_ctx      = NULL; -          local    = frame->local;          int_lock = &local->internal_lock;          priv     = this->private; -        inodelk = afr_get_inodelk (int_lock, int_lock->domain); - -        flock.l_start = inodelk->flock.l_start; -        flock.l_len   = inodelk->flock.l_len; +        flock.l_start = int_lock->flock.l_start; +        flock.l_len   = int_lock->flock.l_len;          flock.l_type  = F_UNLCK; -        full_flock.l_type = F_UNLCK; -        call_count = afr_locked_nodes_count (inodelk->locked_nodes, +        call_count = afr_locked_nodes_count (int_lock->locked_nodes,                                               priv->child_count);          int_lock->lk_call_count = call_count;          if (!call_count) { +                GF_ASSERT (!local->transaction.do_eager_unlock);                  gf_msg_trace (this->name, 0,                                "No internal locks unlocked"); @@ -367,64 +324,28 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)                  goto out;          } -        if (local->fd) -                fd_ctx = afr_fd_ctx_get (local->fd, this); -          for (i = 0; i < priv->child_count; i++) { -                if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) +                if ((int_lock->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)                          continue;                  if (local->fd) { -                        flock_use = &flock; -                        if (!local->transaction.eager_lock[i]) { -                                goto wind; -                        } - -                        piggyback = 0; - -                        LOCK (&local->fd->lock); -                        { -                                if (fd_ctx->lock_piggyback[i]) { -                                        fd_ctx->lock_piggyback[i]--; -                                        piggyback = 1; -                                } else { -                                        fd_ctx->lock_acquired[i]--; -                                } -                        } -                        UNLOCK (&local->fd->lock); - -                        if (piggyback) { -                                afr_unlock_inodelk_cbk (frame, (void *) (long) i, -                                                        this, 1, 0, NULL); -                                if (!--call_count) -                                        break; -                                continue; -                        } - -                        flock_use = &full_flock; -                wind:                          STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,                                             (void *) (long)i,                                             priv->children[i],                                             priv->children[i]->fops->finodelk,                                             int_lock->domain, local->fd, -                                           F_SETLK, flock_use, NULL); - -                        if (!--call_count) -                                break; - +                                           F_SETLK, &flock, NULL);                  } else { -                          STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,                                             (void *) (long)i,                                             priv->children[i],                                             priv->children[i]->fops->inodelk,                                             int_lock->domain, &local->loc,                                             F_SETLK, &flock, NULL); - -                        if (!--call_count) -                                break;                  } + +                if (!--call_count) +                        break;          }  out:          return 0; @@ -512,6 +433,18 @@ out:  } +int32_t +afr_unlock_now (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = frame->local; + +        if (afr_is_inodelk_transaction(local->transaction.type)) +                afr_unlock_inodelk (frame, this); +        else +                afr_unlock_entrylk (frame, this); +        return 0; +} +  static int32_t  afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                int32_t op_ret, int32_t op_errno, dict_t *xdata) @@ -553,7 +486,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          if ((op_ret == -1) &&              (op_errno == ENOSYS)) { -                afr_unlock (frame, this); +                afr_unlock_now (frame, this);          } else {                  if (op_ret == 0) {                          if (local->transaction.type == AFR_ENTRY_TRANSACTION || @@ -598,38 +531,6 @@ afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          return 0;  } -static int -afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_inodelk_t       *inodelk  = NULL; -        afr_local_t         *local    = NULL; -        afr_private_t       *priv     = NULL; - -        priv     = this->private; -        local    = frame->local; -        int_lock = &local->internal_lock; - -        switch (local->transaction.type) { -        case AFR_DATA_TRANSACTION: -        case AFR_METADATA_TRANSACTION: -                inodelk = afr_get_inodelk (int_lock, int_lock->domain); -                memcpy (inodelk->locked_nodes, int_lock->locked_nodes, -                        sizeof (*inodelk->locked_nodes) * priv->child_count); -                inodelk->lock_count = int_lock->lock_count; -                break; - -        case AFR_ENTRY_RENAME_TRANSACTION: -        case AFR_ENTRY_TRANSACTION: -                /*entrylk_count is being used in both non-blocking and blocking -                 * modes */ -                break; -        } - -        return 0; - -} -  static gf_boolean_t  afr_is_entrylk (afr_transaction_type trans_type)  { @@ -733,7 +634,6 @@ int  afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)  {          afr_internal_lock_t *int_lock    = NULL; -        afr_inodelk_t       *inodelk     = NULL;          afr_local_t         *local       = NULL;          afr_private_t       *priv        = NULL;          struct gf_flock flock = {0,}; @@ -752,10 +652,9 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)          if (!is_entrylk) { -                inodelk = afr_get_inodelk (int_lock, int_lock->domain); -                flock.l_start = inodelk->flock.l_start; -                flock.l_len   = inodelk->flock.l_len; -                flock.l_type  = inodelk->flock.l_type; +                flock.l_start = int_lock->flock.l_start; +                flock.l_len   = int_lock->flock.l_len; +                flock.l_type  = int_lock->flock.l_type;          }          if (local->fd) { @@ -770,9 +669,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)                          local->op_ret           = -1;                          int_lock->lock_op_ret   = -1; -                        afr_copy_locked_nodes (frame, this); - -                        afr_unlock (frame, this); +                        afr_unlock_now (frame, this);                          return 0;                  } @@ -784,9 +681,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)                          local->op_ret           = -1;                          int_lock->lock_op_ret   = -1; -                        afr_copy_locked_nodes (frame, this); - -                        afr_unlock(frame, this); +                        afr_unlock_now(frame, this);                          return 0;                  } @@ -798,8 +693,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)                  gf_msg_debug (this->name, 0,                                "we're done locking"); -                afr_copy_locked_nodes (frame, this); -                  int_lock->lock_op_ret = 0;                  int_lock->lock_cbk (frame, this);                  return 0; @@ -815,7 +708,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)          case AFR_METADATA_TRANSACTION:                  if (local->fd) { -                          STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,                                             (void *) (long) child_index,                                             priv->children[child_index], @@ -824,7 +716,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)                                             F_SETLKW, &flock, NULL);                  } else { -                          STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,                                             (void *) (long) child_index,                                             priv->children[child_index], @@ -841,7 +732,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)                   *and 'fd-less' children */                  if (local->fd) { -                          STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,                                             (void *) (long) cookie,                                             priv->children[child_index], @@ -850,7 +740,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)                                             int_lock->lockee[lockee_no].basename,                                             ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);                  } else { -                          STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,                                             (void *) (long) cookie,                                             priv->children[child_index], @@ -922,7 +811,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,          local    = frame->local;          int_lock = &local->internal_lock; -  	LOCK (&frame->lock);  	{  		if (op_ret < 0 ) { @@ -969,7 +857,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                                        "with blocking calls",                                        int_lock->lock_count); -                        afr_unlock(frame, this); +                        afr_unlock_now(frame, this);                  }          } @@ -1009,7 +897,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)                          local->op_errno         = EINVAL;                          int_lock->lock_op_errno = EINVAL; -			afr_unlock (frame, this); +			afr_unlock_now (frame, this);                          return -1;                  } @@ -1021,7 +909,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)                          gf_msg (this->name, GF_LOG_INFO, 0,                                  AFR_MSG_INFO_COMMON,                                  "fd not open on any subvolumes. aborting."); -                        afr_unlock (frame, this); +                        afr_unlock_now (frame, this);                          goto out;                  } @@ -1031,7 +919,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)                          index = i%copies;                          lockee_no = i/copies;                          if (local->child_up[index]) { -                                  STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,                                                     (void *) (long) i,                                                     priv->children[index], @@ -1053,7 +940,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)                          index = i%copies;                          lockee_no = i/copies;                          if (local->child_up[index]) { -                                  STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,                                                     (void *) (long) i,                                                     priv->children[index], @@ -1077,18 +963,12 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                               int32_t op_ret, int32_t op_errno, dict_t *xdata)  {          afr_internal_lock_t *int_lock    = NULL; -        afr_inodelk_t       *inodelk     = NULL;          afr_local_t         *local       = NULL; -        afr_fd_ctx_t        *fd_ctx      = NULL;          int                  call_count  = 0;          int                  child_index = (long) cookie;          local    = frame->local;          int_lock = &local->internal_lock; -        inodelk = afr_get_inodelk (int_lock, int_lock->domain); - -	if (local->fd) -		fd_ctx = afr_fd_ctx_get (local->fd, this);          LOCK (&frame->lock);          { @@ -1105,43 +985,27 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,  				int_lock->lock_op_errno      = op_errno;  				local->op_errno              = op_errno;  			} -			if (local->transaction.eager_lock) -				local->transaction.eager_lock[child_index] = 0;  		} else { -			inodelk->locked_nodes[child_index] |= LOCKED_YES; -			inodelk->lock_count++; - -			if (local->transaction.eager_lock && -			    local->transaction.eager_lock[child_index] && -			    local->fd) { -				/* piggybacked */ -				if (op_ret == 1) { -					/* piggybacked */ -				} else if (op_ret == 0) { -					/* lock acquired from server */ -                                        fd_ctx->lock_acquired[child_index]++; -				} -			} - -                        if (local->transaction.type == AFR_DATA_TRANSACTION && -                            op_ret == 0) { -                                LOCK(&local->inode->lock); -                                { -                                        local->inode_ctx->lock_count++; -                                } -                                UNLOCK (&local->inode->lock); -                        } +			int_lock->locked_nodes[child_index] |= LOCKED_YES; +			int_lock->lock_count++;  		}                  call_count = --int_lock->lk_call_count;          }          UNLOCK (&frame->lock); +        if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) { +                LOCK (&local->inode->lock); +                { +                        local->inode_ctx->lock_count++; +                } +                UNLOCK (&local->inode->lock); +        }          if (call_count == 0) {                  gf_msg_trace (this->name, 0,                                "Last inode locking reply received");                  /* all locks successful. Proceed to call FOP */ -                if (inodelk->lock_count == int_lock->lk_expected_count) { +                if (int_lock->lock_count == int_lock->lk_expected_count) {                          gf_msg_trace (this->name, 0,                                        "All servers locked. Calling the cbk");                          int_lock->lock_op_ret = 0; @@ -1155,7 +1019,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,                                        "Trying again with blocking calls",                                        int_lock->lock_count); -                        afr_unlock(frame, this); +                        afr_unlock_now(frame, this);                  }          } @@ -1166,30 +1030,17 @@ int  afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)  {          afr_internal_lock_t *int_lock = NULL; -        afr_inodelk_t       *inodelk  = NULL;          afr_local_t         *local    = NULL;          afr_private_t       *priv     = NULL;          afr_fd_ctx_t        *fd_ctx   = NULL;          int32_t             call_count = 0;          int                 i          = 0;          int                 ret        = 0; -        struct              gf_flock flock = {0,}; -        struct              gf_flock full_flock = {0,}; -        struct              gf_flock *flock_use = NULL; -        int                 piggyback = 0;          local    = frame->local;          int_lock = &local->internal_lock;          priv     = this->private; -        inodelk = afr_get_inodelk (int_lock, int_lock->domain); - -        flock.l_start = inodelk->flock.l_start; -        flock.l_len   = inodelk->flock.l_len; -        flock.l_type  = inodelk->flock.l_type; - -        full_flock.l_type = inodelk->flock.l_type; -          initialize_inodelk_variables (frame, this);          if (local->fd) { @@ -1205,88 +1056,48 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)                          local->op_errno         = EINVAL;                          int_lock->lock_op_errno = EINVAL; -			afr_unlock (frame, this); +			afr_unlock_now (frame, this);                          ret = -1;                          goto out;                  } +        } -                call_count = internal_lock_count (frame, this); -                int_lock->lk_call_count = call_count; -                int_lock->lk_expected_count = call_count; - -                if (!call_count) { -                        gf_msg (this->name, GF_LOG_INFO, 0, -                                AFR_MSG_SUBVOLS_DOWN, -                                "All bricks are down, aborting."); -                        afr_unlock (frame, this); -                        goto out; -                } - -                /* Send non-blocking inodelk calls only on up children -                   and where the fd has been opened */ -                for (i = 0; i < priv->child_count; i++) { -                        if (!local->child_up[i]) -                                continue; - -                        flock_use = &flock; -                        if (!local->transaction.eager_lock_on) { -                                goto wind; -                        } - -                        piggyback = 0; -                        local->transaction.eager_lock[i] = 1; - -			afr_set_delayed_post_op (frame, this); +        call_count = internal_lock_count (frame, this); +        int_lock->lk_call_count = call_count; +        int_lock->lk_expected_count = call_count; -                        LOCK (&local->fd->lock); -                        { -                                if (fd_ctx->lock_acquired[i]) { -                                        fd_ctx->lock_piggyback[i]++; -                                        piggyback = 1; -                                } -                        } -                        UNLOCK (&local->fd->lock); +        if (!call_count) { +                gf_msg (this->name, GF_LOG_INFO, 0, +                        AFR_MSG_SUBVOLS_DOWN, +                        "All bricks are down, aborting."); +                afr_unlock_now (frame, this); +                goto out; +        } -                        if (piggyback) { -                                /* (op_ret == 1) => indicate piggybacked lock */ -                                afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, -                                                             this, 1, 0, NULL); -                                if (!--call_count) -                                        break; -                                continue; -                        } -                        flock_use = &full_flock; -                wind: +        /* Send non-blocking inodelk calls only on up children +           and where the fd has been opened */ +        for (i = 0; i < priv->child_count; i++) { +                if (!local->child_up[i]) +                        continue; +                if (local->fd) {                          STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,                                             (void *) (long) i,                                             priv->children[i],                                             priv->children[i]->fops->finodelk,                                             int_lock->domain, local->fd, -                                           F_SETLK, flock_use, NULL); - -                        if (!--call_count) -                                break; -                } -        } else { -                call_count = internal_lock_count (frame, this); -                int_lock->lk_call_count = call_count; -                int_lock->lk_expected_count = call_count; - -                for (i = 0; i < priv->child_count; i++) { -                        if (!local->child_up[i]) -                                continue; +                                           F_SETLK, &int_lock->flock, NULL); +                } else {                          STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,                                             (void *) (long) i,                                             priv->children[i],                                             priv->children[i]->fops->inodelk,                                             int_lock->domain, &local->loc, -                                           F_SETLK, &flock, NULL); - -                        if (!--call_count) -                                break; +                                           F_SETLK, &int_lock->flock, NULL);                  } +                if (!--call_count) +                        break;          }  out:          return ret; @@ -1296,13 +1107,32 @@ int32_t  afr_unlock (call_frame_t *frame, xlator_t *this)  {          afr_local_t *local = NULL; +        afr_lock_t  *lock  = NULL;          local = frame->local; -        if (afr_is_inodelk_transaction(local->transaction.type)) -                afr_unlock_inodelk (frame, this); -        else -                afr_unlock_entrylk (frame, this); +        if (!local->transaction.eager_lock_on) +                goto out; +        lock = &local->inode_ctx->lock[local->transaction.type]; +        LOCK (&local->inode->lock); +        { +                list_del_init (&local->transaction.owner_list); +                if (list_empty (&lock->owners) && list_empty (&lock->post_op)) { +                        local->transaction.do_eager_unlock = _gf_true; +        /*TODO: Need to get metadata use on_disk and inherit/uninherit +         *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]); +         *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]); +        */ +                        GF_ASSERT (lock->release); +                } +        } +        UNLOCK (&local->inode->lock); +        if (!local->transaction.do_eager_unlock) { +                local->internal_lock.lock_cbk (frame, this); +                return 0; +        } +out: +        afr_unlock_now (frame, this);          return 0;  } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 7195dfe058c..dc380c6d280 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -2469,6 +2469,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)          int           data_ret          = 1;          int           or_ret            = 0;          inode_t      *inode             = NULL; +        fd_t         *fd                = NULL;  	gf_boolean_t  data_selfheal     = _gf_false;  	gf_boolean_t  metadata_selfheal = _gf_false;  	gf_boolean_t  entry_selfheal    = _gf_false; @@ -2493,8 +2494,16 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)                  goto out;          } +        if (inode->ia_type == IA_IFREG) { +                ret = afr_selfheal_data_open (this, inode, &fd); +                if (!fd) { +                        ret = -EIO; +                        goto out; +                } +        } +  	if (data_selfheal && dataheal_enabled) -                data_ret = afr_selfheal_data (frame, this, inode); +                data_ret = afr_selfheal_data (frame, this, fd);  	if (metadata_selfheal && priv->metadata_self_heal)                  metadata_ret = afr_selfheal_metadata (frame, this, inode); @@ -2516,6 +2525,8 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)  out:          if (inode)                  inode_unref (inode); +        if (fd) +                fd_unref (fd);          return ret;  }  /* diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 3cf5b32b01d..40dee7a7d6c 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -869,22 +869,15 @@ out:  }  int -afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) +afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd)  {  	afr_private_t *priv = NULL;  	unsigned char *locked_on = NULL;  	int ret = 0; -	fd_t *fd = NULL; +        inode_t *inode = fd->inode;  	priv = this->private; -	ret = afr_selfheal_data_open (this, inode, &fd); -	if (!fd) { -                gf_msg_debug (this->name, -ret, "%s: Failed to open", -                              uuid_utoa (inode->gfid)); -                return -EIO; -        } -  	locked_on = alloca0 (priv->child_count);  	ret = afr_selfheal_tie_breaker_inodelk (frame, this, inode, @@ -911,8 +904,5 @@ unlock:  	afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0,  	                        locked_on); -	if (fd) -		fd_unref (fd); -  	return ret;  } diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 8e976905e97..cd67d2a3192 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -102,7 +102,7 @@ afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name,                     void *gfid_req, dict_t *xdata);  int -afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode); +afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd);  int  afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index a253c0835f5..ec72d46fb36 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -25,6 +25,18 @@ typedef enum {          AFR_TRANSACTION_POST_OP,  } afr_xattrop_type_t; +static void +afr_lock_resume_shared (struct list_head *list); + +void +__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared); + +void +afr_changelog_post_op (call_frame_t *frame, xlator_t *this); + +int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this); +  gf_boolean_t  afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); @@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this)          return 0;  } -  int  afr_transaction_done (call_frame_t *frame, xlator_t *this)  { -        afr_local_t *local = NULL; -        afr_private_t *priv = NULL; -        gf_boolean_t unwind = _gf_false; +        afr_local_t   *local      = NULL; +        afr_private_t *priv       = NULL; +        gf_boolean_t  unwind      = _gf_false; +        afr_lock_t    *lock       = NULL; +        afr_local_t   *lock_local = NULL;          priv  = this->private;          local = frame->local; @@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)                  if (unwind)/*It definitely did post-op*/                          afr_zero_fill_stat (local);          } + +        if (local->transaction.do_eager_unlock) { +                lock = &local->inode_ctx->lock[local->transaction.type]; +                LOCK (&local->inode->lock); +                { +                        lock->acquired = _gf_false; +                        lock->release = _gf_false; +                        list_splice_init (&lock->frozen, +                                          &lock->waiting); +                        if (list_empty (&lock->waiting)) +                                goto unlock; +                        lock_local = list_entry (lock->waiting.next, +                                                 afr_local_t, +                                                transaction.wait_list); +                        list_del_init (&lock_local->transaction.wait_list); +                        list_add (&lock_local->transaction.owner_list, +                                  &lock->owners); +                } +unlock: +                UNLOCK (&local->inode->lock); +        } +        if (lock_local) { +                afr_lock (lock_local->transaction.frame, +                          lock_local->transaction.frame->this); +        }          local->transaction.unwind (frame, this);          AFR_STACK_DESTROY (frame); @@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)          return 0;  } +static void +afr_lock_fail_shared (afr_local_t *local, struct list_head *list) +{ +        afr_local_t *each = NULL; + +        while (!list_empty(list)) { +                each = list_entry (list->next, afr_local_t, +                                   transaction.wait_list); +                list_del_init(&each->transaction.wait_list); +                each->op_ret = -1; +                each->op_errno = local->op_errno; +                afr_transaction_done (each->transaction.frame, +                                      each->transaction.frame->this); +        } +} + +static void +afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked) +{ +        struct list_head shared; +        afr_lock_t *lock = NULL; + +        if (!local->transaction.eager_lock_on) +                goto out; + +        lock = &local->inode_ctx->lock[local->transaction.type]; + +        INIT_LIST_HEAD (&shared); +        LOCK (&local->inode->lock); +        { +                list_splice_init (&lock->waiting, &shared); +        } +        UNLOCK (&local->inode->lock); + +        afr_lock_fail_shared (local, &shared); +        local->transaction.do_eager_unlock = _gf_true; +out: +        if (locked) { +                local->internal_lock.lock_cbk = afr_transaction_done; +                afr_unlock (local->transaction.frame, +                            local->transaction.frame->this); +        } else { +                afr_transaction_done (local->transaction.frame, +                                      local->transaction.frame->this); +        } +}  call_frame_t*  afr_transaction_detach_fop_frame (call_frame_t *frame) @@ -334,6 +418,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)          afr_local_t *local = NULL;          afr_private_t *priv = NULL;          int pre_op_sources_count = 0; +        int i = 0;          priv = this->private;          local = frame->local; @@ -345,11 +430,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)          /* If arbiter is the only source, do not proceed. */          if (pre_op_sources_count < 2 &&              local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { -                local->internal_lock.lock_cbk = afr_transaction_done;                  local->op_ret = -1;                  local->op_errno =  ENOTCONN; -                afr_restore_lk_owner (frame); -                afr_unlock (frame, this); +                for (i = 0; i < priv->child_count; i++) +                        local->transaction.failed_subvols[i] = 1; +                afr_changelog_post_op (frame, this);/*uninherit should happen*/          } else {                  afr_transaction_fop (frame, this);          } @@ -362,14 +447,16 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)  {          afr_local_t   *local = NULL;          afr_private_t *priv  = NULL; -        fd_t          *fd    = NULL;          int           i      = 0;          int           ret    = 0; +        int     failure_count = 0; +        struct list_head shared; +        afr_lock_t *lock = NULL;          local = frame->local;          priv = this->private; -        fd    = local->fd; +        INIT_LIST_HEAD (&shared);          if (local->transaction.type == AFR_DATA_TRANSACTION &&              !local->transaction.inherited) {                  ret = afr_write_subvol_set (frame, this); @@ -394,22 +481,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)  		   just now, before OP */  		afr_changelog_pre_op_update (frame, this); -        /* The wake up needs to happen independent of -           what type of fop arrives here. If it was -           a write, then it has already inherited the -           lock and changelog. If it was not a write, -           then the presumption of the optimization (of -           optimizing for successive write operations) -           fails. -        */ -        if (fd) -                afr_delayed_changelog_wake_up (this, fd); +        if (!local->transaction.eager_lock_on || +            local->transaction.inherited) +                goto fop; +        failure_count = AFR_COUNT (local->transaction.failed_subvols, +                                   priv->child_count); +        if (failure_count == priv->child_count) { +                afr_handle_lock_acquire_failure (local, _gf_true); +        } else { +                lock = &local->inode_ctx->lock[local->transaction.type]; +                LOCK (&local->inode->lock); +                { +                        lock->acquired = _gf_true; +                        __afr_transaction_wake_shared (local, &shared); +                } +                UNLOCK (&local->inode->lock); +        } + +fop:          if (priv->arbiter_count == 1) {                  afr_txn_arbitrate_fop (frame, this);          } else {                  afr_transaction_fop (frame, this);          } +        afr_lock_resume_shared (&shared);  	return 0;  } @@ -486,30 +582,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)  } -afr_inodelk_t* -afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) -{ -        afr_inodelk_t *inodelk = NULL; -        int           i = 0; - -        for (i = 0; int_lock->inodelk[i].domain; i++) { -                inodelk = &int_lock->inodelk[i]; -                if (strcmp (dom, inodelk->domain) == 0) -                        return inodelk; -        } -        return NULL; -} -  unsigned char*  afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)  {          unsigned char *locked_nodes = NULL; -        afr_inodelk_t *inodelk = NULL;          switch (type) {          case AFR_DATA_TRANSACTION:          case AFR_METADATA_TRANSACTION: -                inodelk = afr_get_inodelk (int_lock, int_lock->domain); -                locked_nodes = inodelk->locked_nodes; +                locked_nodes = int_lock->locked_nodes;          break;          case AFR_ENTRY_TRANSACTION: @@ -834,27 +914,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)  {  	afr_local_t *local = NULL;  	afr_private_t *priv = NULL; -	fd_t *fd = NULL; +        afr_inode_ctx_t *ctx = NULL;  	int i = 0;  	gf_boolean_t ret = _gf_false; -	afr_fd_ctx_t *fd_ctx = NULL;  	int type = 0;  	local = frame->local;  	priv = this->private; -	fd = local->fd; +        ctx = local->inode_ctx;  	type = afr_index_for_transaction_type (local->transaction.type);  	if (type != AFR_DATA_TRANSACTION)  		return !local->transaction.dirtied; -	if (!fd) -		return !local->transaction.dirtied; - -	fd_ctx = afr_fd_ctx_get (fd, this); -	if (!fd_ctx) -		return _gf_false; -  	if (local->transaction.no_uninherit)  		return _gf_false; @@ -868,34 +940,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)  	if (local->transaction.uninherit_done)  		return local->transaction.uninherit_value; -	LOCK(&fd->lock); +	LOCK(&local->inode->lock);  	{  		for (i = 0; i < priv->child_count; i++) {  			if (local->transaction.pre_op[i] != -			    fd_ctx->pre_op_done[type][i]) { +			    ctx->pre_op_done[type][i]) {  				ret = !local->transaction.dirtied;  				goto unlock;  			}  		} -		if (fd_ctx->inherited[type]) { +		if (ctx->inherited[type]) {  			ret = _gf_true; -			fd_ctx->inherited[type]--; -		} else if (fd_ctx->on_disk[type]) { +			ctx->inherited[type]--; +		} else if (ctx->on_disk[type]) {  			ret = _gf_false; -			fd_ctx->on_disk[type]--; +			ctx->on_disk[type]--;  		} else {  			/* ASSERT */  			ret = _gf_false;  		} -		if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { +		if (!ctx->inherited[type] && !ctx->on_disk[type]) {  			for (i = 0; i < priv->child_count; i++) -				fd_ctx->pre_op_done[type][i] = 0; +				ctx->pre_op_done[type][i] = 0;  		}  	}  unlock: -	UNLOCK(&fd->lock); +	UNLOCK(&local->inode->lock);  	local->transaction.uninherit_done = _gf_true;  	local->transaction.uninherit_value = ret; @@ -909,31 +981,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)  {  	afr_local_t *local = NULL;  	afr_private_t *priv = NULL; -	fd_t *fd = NULL;  	int i = 0;  	gf_boolean_t ret = _gf_false; -	afr_fd_ctx_t *fd_ctx = NULL;  	int type = 0;  	local = frame->local;  	priv = this->private; -	fd = local->fd;  	if (local->transaction.type != AFR_DATA_TRANSACTION)  		return _gf_false;  	type = afr_index_for_transaction_type (local->transaction.type); -	if (!fd) -		return _gf_false; - -	fd_ctx = afr_fd_ctx_get (fd, this); -	if (!fd_ctx) -		return _gf_false; - -	LOCK(&fd->lock); +	LOCK(&local->inode->lock);  	{ -		if (!fd_ctx->on_disk[type]) { +		if (!local->inode_ctx->on_disk[type]) {  			/* nothing to inherit yet */  			ret = _gf_false;  			goto unlock; @@ -941,21 +1003,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)  		for (i = 0; i < priv->child_count; i++) {  			if (local->transaction.pre_op[i] != -			    fd_ctx->pre_op_done[type][i]) { +			    local->inode_ctx->pre_op_done[type][i]) {  				/* either inherit exactly, or don't */  				ret = _gf_false;  				goto unlock;  			}  		} -		fd_ctx->inherited[type]++; +		local->inode_ctx->inherited[type]++;  		ret = _gf_true;  		local->transaction.inherited = _gf_true;  	}  unlock: -	UNLOCK(&fd->lock); +	UNLOCK(&local->inode->lock);  	return ret;  } @@ -966,22 +1028,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)  {  	afr_local_t *local = NULL;  	afr_private_t *priv = NULL; -	fd_t *fd = NULL; -	afr_fd_ctx_t *fd_ctx = NULL;  	int i = 0;  	gf_boolean_t ret = _gf_false;  	int type = 0;  	local = frame->local;  	priv = this->private; -	fd = local->fd; -	if (!fd) -		return _gf_false; - -	fd_ctx = afr_fd_ctx_get (fd, this); -	if (!fd_ctx) -		return _gf_false; +        if (local->transaction.type == AFR_ENTRY_TRANSACTION || +            local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) +                return _gf_false;  	if (local->transaction.inherited)  		/* was already inherited in afr_changelog_pre_op */ @@ -997,26 +1053,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)  	ret = _gf_false; -	LOCK(&fd->lock); +	LOCK(&local->inode->lock);  	{ -		if (!fd_ctx->on_disk[type]) { +		if (!local->inode_ctx->on_disk[type]) {  			for (i = 0; i < priv->child_count; i++) -				fd_ctx->pre_op_done[type][i] = +				local->inode_ctx->pre_op_done[type][i] =                                          (!local->transaction.failed_subvols[i]);  		} else {  			for (i = 0; i < priv->child_count; i++) -				if (fd_ctx->pre_op_done[type][i] != +				if (local->inode_ctx->pre_op_done[type][i] !=  				    (!local->transaction.failed_subvols[i])) {  					local->transaction.no_uninherit = 1;  					goto unlock;  				}  		} -		fd_ctx->on_disk[type]++; +		local->inode_ctx->on_disk[type]++;  		ret = _gf_true;  	}  unlock: -	UNLOCK(&fd->lock); +	UNLOCK(&local->inode->lock);  	return ret;  } @@ -1322,6 +1378,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)          afr_init_optimistic_changelog_for_txn (this, local); +        if (afr_changelog_pre_op_inherit (frame, this)) +                goto next; +          /* This condition should not be met with present code, as           * transaction.done will be called if locks are not acquired on even a           * single node. @@ -1347,9 +1406,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)  		goto err;  	} -	if (afr_changelog_pre_op_inherit (frame, this)) -		goto next; -          if (call_count < priv->child_count)                  pre_nop = _gf_false; @@ -1406,7 +1462,7 @@ err:  	local->op_ret = -1;  	local->op_errno = op_errno; -	afr_unlock (frame, this); +        afr_handle_lock_acquire_failure (local, _gf_true);  	if (xdata_req)  		dict_unref (xdata_req); @@ -1416,31 +1472,6 @@ err:  int -afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; - -        if (int_lock->lock_op_ret < 0) { -                gf_msg (this->name, GF_LOG_INFO, -                        0, AFR_MSG_BLOCKING_LKS_FAILED, -                        "Blocking inodelks failed."); -                afr_transaction_done (frame, this); -        } else { - -                gf_msg_debug (this->name, 0, -                              "Blocking inodelks done. Proceeding to FOP"); -                afr_internal_lock_finish (frame, this); -        } - -        return 0; -} - - -int  afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)  {          afr_internal_lock_t *int_lock = NULL; @@ -1453,7 +1484,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)          if (int_lock->lock_op_ret < 0) {                  gf_msg_debug (this->name, 0,                                "Non blocking inodelks failed. Proceeding to blocking"); -                int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; +                int_lock->lock_cbk = afr_internal_lock_finish;                  afr_blocking_lock (frame, this);          } else { @@ -1467,31 +1498,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)  int -afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) -{ -        afr_internal_lock_t *int_lock = NULL; -        afr_local_t         *local    = NULL; - -        local    = frame->local; -        int_lock = &local->internal_lock; - -        if (int_lock->lock_op_ret < 0) { -                gf_msg (this->name, GF_LOG_INFO, 0, -                        AFR_MSG_BLOCKING_LKS_FAILED, -                        "Blocking entrylks failed."); -                afr_transaction_done (frame, this); -        } else { - -                gf_msg_debug (this->name, 0, -                             "Blocking entrylks done. Proceeding to FOP"); -                afr_internal_lock_finish (frame, this); -        } - -        return 0; -} - - -int  afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)  {          afr_internal_lock_t *int_lock = NULL; @@ -1504,7 +1510,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)          if (int_lock->lock_op_ret < 0) {                  gf_msg_debug (this->name, 0,                                "Non blocking entrylks failed. Proceeding to blocking"); -                int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; +                int_lock->lock_cbk = afr_internal_lock_finish;                  afr_blocking_lock (frame, this);          } else { @@ -1565,29 +1571,28 @@ int  afr_set_transaction_flock (xlator_t *this, afr_local_t *local)  {          afr_internal_lock_t *int_lock = NULL; -        afr_inodelk_t       *inodelk  = NULL;          afr_private_t       *priv     = NULL;          int_lock = &local->internal_lock; -        inodelk = afr_get_inodelk (int_lock, int_lock->domain);          priv = this->private; -        if ((priv->arbiter_count || priv->full_lock) && +        if ((priv->arbiter_count || local->transaction.eager_lock_on || +             priv->full_lock) &&              local->transaction.type == AFR_DATA_TRANSACTION) {                  /*Lock entire file to avoid network split brains.*/ -                inodelk->flock.l_len   = 0; -                inodelk->flock.l_start = 0; +                int_lock->flock.l_len   = 0; +                int_lock->flock.l_start = 0;          } else { -                inodelk->flock.l_len   = local->transaction.len; -                inodelk->flock.l_start = local->transaction.start; +                int_lock->flock.l_len   = local->transaction.len; +                int_lock->flock.l_start = local->transaction.start;          } -        inodelk->flock.l_type  = F_WRLCK; +        int_lock->flock.l_type  = F_WRLCK;          return 0;  }  int -afr_lock_rec (call_frame_t *frame, xlator_t *this) +afr_lock (call_frame_t *frame, xlator_t *this)  {          afr_internal_lock_t *int_lock = NULL;          afr_local_t         *local    = NULL; @@ -1628,74 +1633,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)          return 0;  } +static gf_boolean_t +afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +{ +        uint64_t start1 = local1->transaction.start; +        uint64_t start2 = local2->transaction.start; +        uint64_t end1 = 0; +        uint64_t end2 = 0; + +        if (local1->transaction.len) +                end1 = start1 + local1->transaction.len - 1; +        else +                end1 = ULLONG_MAX; + +        if (local2->transaction.len) +                end2 = start2 + local2->transaction.len - 1; +        else +                end2 = ULLONG_MAX; -int -afr_lock (call_frame_t *frame, xlator_t *this) +        return ((end1 >= start2) && (end2 >= start1)); +} + +gf_boolean_t +afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check)  { -        afr_set_lock_number (frame, this); +        afr_local_t     *each = NULL; +        afr_lock_t      *lock = NULL; -        return afr_lock_rec (frame, this); +        lock = &local->inode_ctx->lock[local->transaction.type]; +        /* +         * Once full file lock is acquired in eager-lock phase, overlapping +         * writes do not compete for inode-locks, instead are transferred to the +         * next writes. Because of this overlapping writes are not ordered. +         * This can cause inconsistencies in replication. +         * Example: +         * Two overlapping writes w1, w2 are sent in parallel on same fd +         * in two threads t1, t2. +         * Both threads can execute afr_writev_wind in the following manner. +         * t1 winds w1 on brick-0 +         * t2 winds w2 on brick-0 +         * t2 winds w2 on brick-1 +         * t1 winds w1 on brick-1 +         * +         * This check makes sure the locks are not transferred for +         * overlapping writes. +         */ +        list_for_each_entry (each, &lock->owners, transaction.owner_list) { +                if (afr_locals_overlap (each, local)) { +                        return _gf_true; +                } +        } + +        if (!waitlist_check) +                return _gf_false; +        list_for_each_entry (each, &lock->waiting, transaction.wait_list) { +                if (afr_locals_overlap (each, local)) { +                        return _gf_true; +                } +        } +        return _gf_false;  }  /* }}} */ - -int -afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +static void +afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src, +                       xlator_t *this)  { -        afr_changelog_pre_op (frame, this); +        afr_private_t *priv = this->private; -        return 0; +        dst->domain = src->domain; +        dst->flock.l_len  = src->flock.l_len; +        dst->flock.l_start  = src->flock.l_start; +        dst->flock.l_type  = src->flock.l_type; +        dst->lock_count = src->lock_count; +        memcpy (dst->locked_nodes, src->locked_nodes, +                priv->child_count * sizeof (*dst->locked_nodes));  } -  void -afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) +__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared)  { -        afr_local_t    *local = NULL; -        afr_private_t  *priv = NULL; +        gf_boolean_t conflict = _gf_false; +        afr_local_t *each = NULL; +        afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type]; -        /* call this function from any of the related optimizations -           which benefit from delaying post op are enabled, namely: - -           - changelog piggybacking -           - eager locking -        */ +        while (!conflict) { +                if (list_empty (&lock->waiting)) +                        return; +                each = list_entry(lock->waiting.next, afr_local_t, +                                  transaction.wait_list); +                if (afr_has_lock_conflict (each, _gf_false)) { +                        conflict = _gf_true; +                } +                if (conflict && !list_empty (&lock->owners)) +                        return; +                afr_copy_inodelk_vars (&each->internal_lock, +                                       &local->internal_lock, +                                       each->transaction.frame->this); +                list_move_tail (&each->transaction.wait_list, shared); +                list_add_tail(&each->transaction.owner_list, &lock->owners); +        } +} -        priv = this->private; -        if (!priv) -                return; +static void +afr_lock_resume_shared (struct list_head *list) +{ +        afr_local_t *each = NULL; -        if (!priv->post_op_delay_secs) -                return; +        while (!list_empty(list)) { +                each = list_entry(list->next, afr_local_t, +                                  transaction.wait_list); +                list_del_init(&each->transaction.wait_list); +                afr_changelog_pre_op (each->transaction.frame, +                                      each->transaction.frame->this); +        } +} -        local = frame->local; -        if (!local) -                return; +int +afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +{ +        afr_local_t *local = frame->local; +        afr_lock_t   *lock  = NULL; -        if (!local->transaction.eager_lock_on) -                return; -        if (!local->fd) -                return; +        local->internal_lock.lock_cbk = NULL; +        if (!local->transaction.eager_lock_on) { +                if (local->internal_lock.lock_op_ret < 0) { +                        afr_transaction_done (frame, this); +                        return 0; +                } +                afr_changelog_pre_op (frame, this); +        } else { +                lock = &local->inode_ctx->lock[local->transaction.type]; +                if (local->internal_lock.lock_op_ret < 0) { +                        afr_handle_lock_acquire_failure (local, _gf_false); +                } else { +                        lock->event_generation = local->event_generation; +                        afr_changelog_pre_op (frame, this); +                } +        } -        if (local->op == GF_FOP_WRITE) -                local->delayed_post_op = _gf_true; +        return 0;  }  gf_boolean_t -afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) +afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this)  { -        afr_fd_ctx_t *fd_ctx = NULL; - -        if (!fd) { -                /* If false is returned, it may keep on taking eager-lock -                 * which may lead to starvation, so return true to avoid that. -                 */ -                gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF, -                                  AFR_MSG_INVALID_ARG, "Invalid fd"); -                return _gf_true; -        }          /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock           * is taken mount2 opened the same file, it won't be able to           * perform any data operations until mount1 releases eager-lock. @@ -1703,11 +1787,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)           * if open-fd-count is > 1           */ -        fd_ctx = afr_fd_ctx_get (fd, this); -        if (!fd_ctx) -                return _gf_true; - -        if (fd_ctx->open_fd_count > 1) +        if (local->inode_ctx->open_fd_count > 1)                  return _gf_true;          return _gf_false; @@ -1715,24 +1795,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)  gf_boolean_t -is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) +afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this, +                                         int delay)  { -        afr_local_t      *local = NULL; -        gf_boolean_t      res = _gf_false; +        afr_local_t  *local = NULL; +        afr_lock_t   *lock  = NULL; +        gf_boolean_t res    = _gf_false;          local = frame->local; -        if (!local) +        lock = &local->inode_ctx->lock[local->transaction.type]; + +        if (!afr_txn_nothing_failed (frame, this)) { +                lock->release = _gf_true;                  goto out; +        } -        if (!local->delayed_post_op) +        if (afr_are_multiple_fds_opened (local, this)) { +                lock->release = _gf_true;                  goto out; +        } -        //Mark pending changelog ASAP -        if (!afr_txn_nothing_failed (frame, this)) +        if (!list_empty (&lock->owners)) +                goto out; +        else +                GF_ASSERT (list_empty (&lock->waiting)); + +        if (lock->release) { +                goto out; +        } + +        if (!delay) {                  goto out; +        } -        if (local->fd && afr_are_multiple_fds_opened (local->fd, this)) +        if ((local->op != GF_FOP_WRITE) && +            (local->op != GF_FOP_FXATTROP)) { +                /*Only allow writes but shard does [f]xattrops on writes, so +                 * they are fine too*/                  goto out; +        }          res = _gf_true;  out: @@ -1743,50 +1844,61 @@ out:  void  afr_delayed_changelog_wake_up_cbk (void *data)  { -        fd_t           *fd = NULL; +        afr_lock_t  *lock  = NULL; +        afr_local_t *local = data; +        afr_local_t *timer_local = NULL; +        struct list_head shared; -        fd = data; - -        afr_delayed_changelog_wake_up (THIS, fd); +        INIT_LIST_HEAD (&shared); +        lock = &local->inode_ctx->lock[local->transaction.type]; +        LOCK (&local->inode->lock); +        { +                timer_local = list_entry(lock->post_op.next, +                                         afr_local_t, +                                        transaction.owner_list); +                if (list_empty (&lock->owners) && (local == timer_local)) { +                        GF_ASSERT (list_empty (&lock->waiting)); +                        /*Last owner*/ +                        lock->release = _gf_true; +                        lock->delay_timer = NULL; +                } +        } +        UNLOCK (&local->inode->lock); +        afr_changelog_post_op_now (local->transaction.frame, +                                   local->transaction.frame->this);  }  /* SET operation */  int -afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local)  { -        afr_fd_ctx_t *fdctx = NULL; - -        fdctx = afr_fd_ctx_get (fd, this); - -        LOCK(&fd->lock); +        LOCK(&local->inode->lock);          { -                fdctx->witnessed_unstable_write = _gf_true; +                local->inode_ctx->witnessed_unstable_write = _gf_true;          } -        UNLOCK(&fd->lock); +        UNLOCK(&local->inode->lock);          return 0;  }  /* TEST and CLEAR operation */  gf_boolean_t -afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode)  { -        afr_fd_ctx_t *fdctx = NULL; +        afr_inode_ctx_t *ctx = NULL;          gf_boolean_t witness = _gf_false; -        fdctx = afr_fd_ctx_get (fd, this); -        if (!fdctx) -                return _gf_true; - -        LOCK(&fd->lock); +        LOCK(&inode->lock);          { -                if (fdctx->witnessed_unstable_write) { +                (void)__afr_inode_ctx_get (this, inode, &ctx); + +                if (ctx->witnessed_unstable_write) {                          witness = _gf_true; -                        fdctx->witnessed_unstable_write = _gf_false; +                        ctx->witnessed_unstable_write = _gf_false;                  }          } -        UNLOCK (&fd->lock); +        UNLOCK (&inode->lock);          return witness;  } @@ -1929,7 +2041,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)             mark a flag in the fdctx whenever an unstable write is witnessed.             */ -        if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { +        if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) {                  afr_changelog_post_op_now (frame, this);                  return 0;          } @@ -1947,87 +2059,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)          return 0;  } -  void -afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, -                               call_stub_t *stub) +afr_changelog_post_op (call_frame_t *frame, xlator_t *this)  { -	afr_fd_ctx_t      *fd_ctx = NULL; -	call_frame_t      *prev_frame = NULL; -	struct timespec    delta = {0, }; -	afr_private_t     *priv = NULL; -	afr_local_t       *local = NULL; +	struct timespec delta   = {0, }; +	afr_private_t   *priv   = NULL; +	afr_local_t     *local  = frame->local; +        afr_lock_t      *lock   = NULL; +        gf_boolean_t    post_op = _gf_true; +        struct list_head  shared;  	priv = this->private; - -	fd_ctx = afr_fd_ctx_get (fd, this); -	if (!fd_ctx) -                goto out; -  	delta.tv_sec = priv->post_op_delay_secs;  	delta.tv_nsec = 0; -	pthread_mutex_lock (&fd_ctx->delay_lock); -	{ -		prev_frame = fd_ctx->delay_frame; -		fd_ctx->delay_frame = NULL; -		if (fd_ctx->delay_timer) -			gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); -		fd_ctx->delay_timer = NULL; -		if (!frame) -			goto unlock; -		fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, -							   afr_delayed_changelog_wake_up_cbk, -							   fd); -		fd_ctx->delay_frame = frame; -	} -unlock: -	pthread_mutex_unlock (&fd_ctx->delay_lock); - -out: -	if (prev_frame) { -		local = prev_frame->local; -		local->transaction.resume_stub = stub; -		afr_changelog_post_op_now (prev_frame, this); -	} else if (stub) { -		call_resume (stub); -	} -} - - -void -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) -{ -        afr_local_t  *local = NULL; - -        local = frame->local; - -        if (is_afr_delayed_changelog_post_op_needed (frame, this)) -                afr_delayed_changelog_post_op (this, frame, local->fd, NULL); -        else -                afr_changelog_post_op_safe (frame, this); -} - +        INIT_LIST_HEAD (&shared); +        if (!local->transaction.eager_lock_on) +                goto out; +        lock = &local->inode_ctx->lock[local->transaction.type]; +        LOCK (&local->inode->lock); +	{ +                list_del_init (&local->transaction.owner_list); +                list_add (&local->transaction.owner_list, &lock->post_op); +                __afr_transaction_wake_shared (local, &shared); + +                if (!afr_is_delayed_changelog_post_op_needed (frame, this, +                                                              delta.tv_sec)) { +                        if (list_empty (&lock->owners)) +                                lock->release = _gf_true; +                        goto unlock; +                } -/* Wake up the sleeping/delayed post-op, and also register -   a stub to have it resumed after this transaction -   completely finishes. +                GF_ASSERT (lock->delay_timer == NULL); +		lock->delay_timer = gf_timer_call_after (this->ctx, delta, +                                              afr_delayed_changelog_wake_up_cbk, +                                              local); +                if (!lock->delay_timer) { +                        lock->release = _gf_true; +                } else { +                        post_op = _gf_false; +                } -   The @stub gets saved in @local and gets resumed in -   afr_local_cleanup() -   */ -void -afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) -{ -        afr_delayed_changelog_post_op (this, NULL, fd, stub); -} +	} +unlock: +        UNLOCK (&local->inode->lock); +        if (!list_empty (&shared)) { +                afr_lock_resume_shared (&shared); +        } -void -afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) -{ -        afr_delayed_changelog_post_op (this, NULL, fd, NULL); +out: +        if (post_op) { +                if (!local->transaction.eager_lock_on || lock->release) { +                        afr_changelog_post_op_safe (frame, this); +                } else { +                        afr_changelog_post_op_now (frame, this); +                } +        }  }  int @@ -2037,13 +2126,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)          local    = frame->local; -        if (local->transaction.eager_lock_on) { -                /* We don't need to retain "local" in the -                   fd list anymore, writes to all subvols -                   are finished by now */ -                afr_remove_eager_lock_stub (local); -        } -          afr_restore_lk_owner (frame);          afr_handle_symmetric_errors (frame, this); @@ -2074,114 +2156,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,  	local->transaction.failed_subvols[child_index] = 1;  } - -  static gf_boolean_t -afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +__need_previous_lock_unlocked (afr_local_t *local)  { -        uint64_t start1 = local1->transaction.start; -        uint64_t start2 = local2->transaction.start; -        uint64_t end1 = 0; -        uint64_t end2 = 0; - -        if (local1->transaction.len) -                end1 = start1 + local1->transaction.len - 1; -        else -                end1 = ULLONG_MAX; +        afr_lock_t      *lock = NULL; -        if (local2->transaction.len) -                end2 = start2 + local2->transaction.len - 1; -        else -                end2 = ULLONG_MAX; +        if (!local->transaction.eager_lock_on) +                return _gf_true; -        return ((end1 >= start2) && (end2 >= start1)); +        lock = &local->inode_ctx->lock[local->transaction.type]; +        if (!lock->acquired) +                return _gf_false; +        if (lock->acquired && lock->event_generation != local->event_generation) +                return _gf_true; +        return _gf_false;  }  void -afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock, +                         gf_boolean_t *do_pre_op, afr_local_t **timer_local)  { -        afr_private_t *priv = NULL; -        afr_fd_ctx_t  *fdctx = NULL; -        afr_local_t   *each = NULL; +        afr_lock_t      *lock = NULL; +        afr_local_t     *owner_local = NULL; +        xlator_t        *this = local->transaction.frame->this; -        priv = this->private; - -        if (!local->fd) -                return; - -        if (local->transaction.type != AFR_DATA_TRANSACTION) -                return; +        if (local->fd && !afr_are_multiple_fds_opened (local, this)) { +                local->transaction.eager_lock_on = _gf_true; +        } -        if (!priv->eager_lock) -                return; +        lock = &local->inode_ctx->lock[local->transaction.type]; +        if (__need_previous_lock_unlocked (local)) { +                if (!list_empty (&lock->owners)) { +                        lock->release = _gf_true; +                } else if (lock->delay_timer) { +                        lock->release = _gf_true; +                        if (gf_timer_call_cancel (this->ctx, +                                                  lock->delay_timer)) { +                                /* It will be put in frozen list +                                 * in the code flow below*/ +                        } else { +                                *timer_local = list_entry(lock->post_op.next, +                                                          afr_local_t, +                                                        transaction.owner_list); +                                lock->delay_timer = NULL; +                        } +                } +                if (!local->transaction.eager_lock_on) +                        goto out; +        } -        fdctx = afr_fd_ctx_get (local->fd, this); -        if (!fdctx) -                return; +        if (lock->release) { +                list_add_tail (&local->transaction.wait_list, +                               &lock->frozen); +                *take_lock = _gf_false; +                goto out; +        } -        if (afr_are_multiple_fds_opened (local->fd, this)) -                return; -        /* -         * Once full file lock is acquired in eager-lock phase, overlapping -         * writes do not compete for inode-locks, instead are transferred to the -         * next writes. Because of this overlapping writes are not ordered. -         * This can cause inconsistencies in replication. -         * Example: -         * Two overlapping writes w1, w2 are sent in parallel on same fd -         * in two threads t1, t2. -         * Both threads can execute afr_writev_wind in the following manner. -         * t1 winds w1 on brick-0 -         * t2 winds w2 on brick-0 -         * t2 winds w2 on brick-1 -         * t1 winds w1 on brick-1 -         * -         * This check makes sure the locks are not transferred for -         * overlapping writes. -         */ -        LOCK (&local->fd->lock); -        { -                list_for_each_entry (each, &fdctx->eager_locked, -                                     transaction.eager_locked) { -                        if (afr_locals_overlap (each, local)) { -                                local->transaction.eager_lock_on = _gf_false; -                                goto unlock; -                        } +        if (lock->delay_timer) { +                *take_lock = _gf_false; +                if (gf_timer_call_cancel (this->ctx, +                                          lock->delay_timer)) { +                        list_add_tail (&local->transaction.wait_list, +                                       &lock->frozen); +                } else { +                        *timer_local = list_entry(lock->post_op.next, +                                                  afr_local_t, +                                                  transaction.owner_list); +                        afr_copy_inodelk_vars (&local->internal_lock, +                                               &(*timer_local)->internal_lock, +                                               this); +                        lock->delay_timer = NULL; +                        *do_pre_op = _gf_true; +                        list_add_tail (&local->transaction.owner_list, +                                       &lock->owners);                  } +                goto out; +        } -                local->transaction.eager_lock_on = _gf_true; -                list_add_tail (&local->transaction.eager_locked, -                               &fdctx->eager_locked); +        if (!list_empty (&lock->owners)) { +                if (!lock->acquired || +                    afr_has_lock_conflict (local, _gf_true)) { +                        list_add_tail (&local->transaction.wait_list, +                                       &lock->waiting); +                        *take_lock = _gf_false; +                        goto out; +                } +                owner_local = list_entry (lock->owners.next, +                                          afr_local_t, +                                          transaction.owner_list); +                afr_copy_inodelk_vars (&local->internal_lock, +                                       &owner_local->internal_lock, +                                       this); +                *take_lock = _gf_false; +                *do_pre_op = _gf_true;          } -unlock: -        UNLOCK (&local->fd->lock); + +        if (lock->acquired) +                GF_ASSERT (!(*take_lock)); +        list_add_tail (&local->transaction.owner_list, &lock->owners); +out: +        return;  }  void -afr_transaction_start (call_frame_t *frame, xlator_t *this) +afr_transaction_start (afr_local_t *local, xlator_t *this)  { -        afr_local_t   *local = frame->local; -        fd_t          *fd    = NULL; +        afr_private_t   *priv = NULL; +        gf_boolean_t    take_lock  = _gf_true; +        gf_boolean_t    do_pre_op  = _gf_false; +        afr_local_t     *timer_local = NULL; -        afr_transaction_eager_lock_init (local, this); +        priv = this->private; -        if (local->fd && local->transaction.eager_lock_on) -                afr_set_lk_owner (frame, this, local->fd); -        else -                afr_set_lk_owner (frame, this, frame->root); +        if (local->transaction.type != AFR_DATA_TRANSACTION && +            local->transaction.type != AFR_METADATA_TRANSACTION) +                goto lock_phase; -        if (!local->transaction.eager_lock_on && local->loc.inode) { -                fd = fd_lookup (local->loc.inode, frame->root->pid); -                if (fd == NULL) -                        fd = fd_lookup_anonymous (local->loc.inode, -                                                  GF_ANON_FD_FLAGS); +        if (!priv->eager_lock) +                goto lock_phase; -                if (fd) { -                        afr_delayed_changelog_wake_up (this, fd); -                        fd_unref (fd); -                } +        LOCK (&local->inode->lock); +        { +                __afr_eager_lock_handle (local, &take_lock, &do_pre_op, +                                         &timer_local);          } +        UNLOCK (&local->inode->lock); +lock_phase: +        if (!local->transaction.eager_lock_on) { +                afr_set_lk_owner (local->transaction.frame, this, +                                  local->transaction.frame->root); +        } else { +                afr_set_lk_owner (local->transaction.frame, this, local->inode); +        } + -        afr_lock (frame, this); +        if (take_lock) { +                afr_lock (local->transaction.frame, this); +        } else if (do_pre_op) { +                afr_changelog_pre_op (local->transaction.frame, this); +        } +        /*Always call delayed_changelog_wake_up_cbk after calling pre-op above +         * so that any inheriting can happen*/ +        if (timer_local) +                afr_delayed_changelog_wake_up_cbk (timer_local);  }  int @@ -2194,7 +2311,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)                  goto fail;          } -        afr_transaction_start (frame, this); +        afr_transaction_start (local, this);          return 0;  fail:          local->transaction.unwind (frame, this); @@ -2212,6 +2329,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)          local = frame->local;          priv  = this->private; +        local->transaction.frame = frame;          local->transaction.type   = type; @@ -2224,11 +2342,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)          if (ret < 0)                  goto out; -        if (type == AFR_ENTRY_TRANSACTION || -            type == AFR_ENTRY_RENAME_TRANSACTION) { -                afr_transaction_start (frame, this); -                ret = 0; -                goto out; + +        if (type != AFR_METADATA_TRANSACTION) { +                goto txn_start;          }          ret = afr_inode_get_readable (frame, local->inode, this, @@ -2238,10 +2354,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)                                                    event_generation)) {                  afr_inode_refresh (frame, this, local->inode, local->loc.gfid,                                     afr_write_txn_refresh_done); -        } else { -                afr_transaction_start (frame, this); +                ret = 0; +                goto out;          } + +txn_start:          ret = 0; +        afr_transaction_start (local, this);  out:          return ret;  } diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index ddcb1ebe3eb..a27e9a3c0b4 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -17,12 +17,6 @@ void  afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,  			    int child_index); -int -afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); - -afr_inodelk_t* -afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); -  int32_t  afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); @@ -30,9 +24,6 @@ int  afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);  void -afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); - -void  afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);  void @@ -57,4 +48,8 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv,                        inode_t *inode2, unsigned char *readable2);  int  afr_transaction_resume (call_frame_t *frame, xlator_t *this); +int +afr_lock (call_frame_t *frame, xlator_t *this); +void +afr_delayed_changelog_wake_up_cbk (void *data);  #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index dcaf2887173..b2f3af136bd 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -229,19 +229,12 @@ int  afr_entry_lockee_cmp (const void *l1, const void *l2);  typedef struct { -        char    *domain; /* Domain on which inodelk is taken */ -        struct gf_flock flock; -        unsigned char *locked_nodes; -        int32_t lock_count; -} afr_inodelk_t; - -typedef struct {          loc_t *lk_loc;          int                     lockee_count;          afr_entry_lockee_t      lockee[AFR_LOCKEE_COUNT_MAX]; -        afr_inodelk_t       inodelk[AFR_DOM_COUNT_MAX]; +        struct gf_flock flock;          const char *lk_basename;          const char *lower_basename;          const char *higher_basename; @@ -254,7 +247,6 @@ typedef struct {          int32_t lock_count;          int32_t entrylk_lock_count; -        uint64_t lock_number;          int32_t lk_call_count;          int32_t lk_expected_count;          int32_t lk_attempted_count; @@ -292,37 +284,9 @@ typedef enum {  } afr_fd_open_status_t;  typedef struct { -        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; -	int inherited[AFR_NUM_CHANGE_LOGS]; -	int on_disk[AFR_NUM_CHANGE_LOGS];          afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ - -        unsigned int *lock_piggyback; -        unsigned int *lock_acquired; -          int flags; -	/* used for delayed-post-op optimization */ -	pthread_mutex_t    delay_lock; -	gf_timer_t        *delay_timer; -	call_frame_t      *delay_frame; - -	/* set if any write on this fd was a non stable write -	   (i.e, without O_SYNC or O_DSYNC) -	*/ -	gf_boolean_t      witnessed_unstable_write; - -	/* @open_fd_count: -	   Number of open FDs queried from the server, as queried through -	   xdata in FOPs. Currently, used to decide if eager-locking must be -	   temporarily disabled. -	*/ -        uint32_t        open_fd_count; - - -	/* list of frames currently in progress */ -	struct list_head  eager_locked; -  	/* the subvolume on which the latest sequence of readdirs (starting  	   at offset 0) has begun. Till the next readdir request with 0 offset  	   arrives, we continue to read off this subvol. @@ -336,6 +300,20 @@ typedef enum {          AFR_FOP_LOCK_QUORUM_FAILED,  } afr_fop_lock_state_t; +typedef struct _afr_inode_lock_t { +        unsigned int event_generation; +        gf_boolean_t    release; +        gf_boolean_t    acquired; +        gf_timer_t        *delay_timer; +        struct list_head  owners; /*Transactions that are performing fop*/ +        struct list_head  post_op;/*Transactions that are done with the fop +                                   *So can not conflict with the fops*/ +        struct list_head waiting;/*Transaction that are waiting for +                                   *conflicting transactions to complete*/ +        struct list_head frozen;/*Transactions that need to go as part of +                                 * next batch of eager-lock*/ +} afr_lock_t; +  typedef struct _afr_inode_ctx {          uint64_t        read_subvol;          uint64_t        write_subvol; @@ -343,6 +321,23 @@ typedef struct _afr_inode_ctx {          int             spb_choice;          gf_timer_t      *timer;          gf_boolean_t    need_refresh; +        unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; +        int inherited[AFR_NUM_CHANGE_LOGS]; +        int on_disk[AFR_NUM_CHANGE_LOGS]; + +        /* set if any write on this fd was a non stable write +           (i.e, without O_SYNC or O_DSYNC) +        */ +        gf_boolean_t      witnessed_unstable_write; + +        /* @open_fd_count: +           Number of open FDs queried from the server, as queried through +           xdata in FOPs. Currently, used to decide if eager-locking must be +           temporarily disabled. +        */ +        uint32_t        open_fd_count; +        /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/ +        afr_lock_t lock[2];  } afr_inode_ctx_t; @@ -457,7 +452,6 @@ typedef struct _afr_local {          dict_t  *dict;          int      optimistic_change_log; -	gf_boolean_t      delayed_post_op;  	/* Is the current writev() going to perform a stable write?  	   i.e, is fd->flags or @flags writev param have O_SYNC or @@ -693,7 +687,7 @@ typedef struct _afr_local {                  off_t start, len;                  gf_boolean_t    eager_lock_on; -                int *eager_lock; +                gf_boolean_t    do_eager_unlock;                  char *basename;                  char *new_basename; @@ -707,7 +701,8 @@ typedef struct _afr_local {  		   of the transaction frame */  		call_stub_t      *resume_stub; -		struct list_head  eager_locked; +		struct list_head  owner_list; +                struct list_head  wait_list;                  unsigned char   *pre_op; @@ -768,7 +763,8 @@ typedef struct _afr_local {  		*/  		afr_changelog_resume_t changelog_resume; -                call_frame_t *main_frame; +                call_frame_t *main_frame; /*Fop frame*/ +                call_frame_t *frame; /*Transaction frame*/                  int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); @@ -1009,7 +1005,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);  		afr_local_cleanup (frame->local, THIS);		       \  		mem_put (frame->local);				       \  		frame->local = NULL; };				       \ -	frame->local;}) +	frame->local; })  #define AFR_STACK_RESET(frame)                                         \          do {                                                           \ @@ -1096,22 +1092,10 @@ afr_filter_xattrs (dict_t *xattr);  #define AFR_QUORUM_AUTO INT_MAX  int -afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); +afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local);  gf_boolean_t -afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); - -void -afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); - -int -afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); - -void -afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); - -void -afr_remove_eager_lock_stub (afr_local_t *local); +afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode);  void  afr_reply_wipe (struct afr_reply *reply); @@ -1225,5 +1209,4 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this);  int  afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode); -  #endif /* __AFR_H__ */  | 
