summaryrefslogtreecommitdiffstats
path: root/xlators/cluster/afr/src/afr-transaction.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/cluster/afr/src/afr-transaction.c')
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c913
1 files changed, 516 insertions, 397 deletions
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index a253c0835f5..ec72d46fb36 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -25,6 +25,18 @@ typedef enum {
AFR_TRANSACTION_POST_OP,
} afr_xattrop_type_t;
+static void
+afr_lock_resume_shared (struct list_head *list);
+
+void
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared);
+
+void
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this);
+
gf_boolean_t
afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
@@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this)
return 0;
}
-
int
afr_transaction_done (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t unwind = _gf_false;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t unwind = _gf_false;
+ afr_lock_t *lock = NULL;
+ afr_local_t *lock_local = NULL;
priv = this->private;
local = frame->local;
@@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
if (unwind)/*It definitely did post-op*/
afr_zero_fill_stat (local);
}
+
+ if (local->transaction.do_eager_unlock) {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK (&local->inode->lock);
+ {
+ lock->acquired = _gf_false;
+ lock->release = _gf_false;
+ list_splice_init (&lock->frozen,
+ &lock->waiting);
+ if (list_empty (&lock->waiting))
+ goto unlock;
+ lock_local = list_entry (lock->waiting.next,
+ afr_local_t,
+ transaction.wait_list);
+ list_del_init (&lock_local->transaction.wait_list);
+ list_add (&lock_local->transaction.owner_list,
+ &lock->owners);
+ }
+unlock:
+ UNLOCK (&local->inode->lock);
+ }
+ if (lock_local) {
+ afr_lock (lock_local->transaction.frame,
+ lock_local->transaction.frame->this);
+ }
local->transaction.unwind (frame, this);
AFR_STACK_DESTROY (frame);
@@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this)
return 0;
}
+static void
+afr_lock_fail_shared (afr_local_t *local, struct list_head *list)
+{
+ afr_local_t *each = NULL;
+
+ while (!list_empty(list)) {
+ each = list_entry (list->next, afr_local_t,
+ transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ each->op_ret = -1;
+ each->op_errno = local->op_errno;
+ afr_transaction_done (each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
+
+static void
+afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked)
+{
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
+
+ if (!local->transaction.eager_lock_on)
+ goto out;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ INIT_LIST_HEAD (&shared);
+ LOCK (&local->inode->lock);
+ {
+ list_splice_init (&lock->waiting, &shared);
+ }
+ UNLOCK (&local->inode->lock);
+
+ afr_lock_fail_shared (local, &shared);
+ local->transaction.do_eager_unlock = _gf_true;
+out:
+ if (locked) {
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ afr_unlock (local->transaction.frame,
+ local->transaction.frame->this);
+ } else {
+ afr_transaction_done (local->transaction.frame,
+ local->transaction.frame->this);
+ }
+}
call_frame_t*
afr_transaction_detach_fop_frame (call_frame_t *frame)
@@ -334,6 +418,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
int pre_op_sources_count = 0;
+ int i = 0;
priv = this->private;
local = frame->local;
@@ -345,11 +430,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
/* If arbiter is the only source, do not proceed. */
if (pre_op_sources_count < 2 &&
local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
- local->internal_lock.lock_cbk = afr_transaction_done;
local->op_ret = -1;
local->op_errno = ENOTCONN;
- afr_restore_lk_owner (frame);
- afr_unlock (frame, this);
+ for (i = 0; i < priv->child_count; i++)
+ local->transaction.failed_subvols[i] = 1;
+ afr_changelog_post_op (frame, this);/*uninherit should happen*/
} else {
afr_transaction_fop (frame, this);
}
@@ -362,14 +447,16 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- fd_t *fd = NULL;
int i = 0;
int ret = 0;
+ int failure_count = 0;
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
local = frame->local;
priv = this->private;
- fd = local->fd;
+ INIT_LIST_HEAD (&shared);
if (local->transaction.type == AFR_DATA_TRANSACTION &&
!local->transaction.inherited) {
ret = afr_write_subvol_set (frame, this);
@@ -394,22 +481,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
just now, before OP */
afr_changelog_pre_op_update (frame, this);
- /* The wake up needs to happen independent of
- what type of fop arrives here. If it was
- a write, then it has already inherited the
- lock and changelog. If it was not a write,
- then the presumption of the optimization (of
- optimizing for successive write operations)
- fails.
- */
- if (fd)
- afr_delayed_changelog_wake_up (this, fd);
+ if (!local->transaction.eager_lock_on ||
+ local->transaction.inherited)
+ goto fop;
+ failure_count = AFR_COUNT (local->transaction.failed_subvols,
+ priv->child_count);
+ if (failure_count == priv->child_count) {
+ afr_handle_lock_acquire_failure (local, _gf_true);
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK (&local->inode->lock);
+ {
+ lock->acquired = _gf_true;
+ __afr_transaction_wake_shared (local, &shared);
+ }
+ UNLOCK (&local->inode->lock);
+ }
+
+fop:
if (priv->arbiter_count == 1) {
afr_txn_arbitrate_fop (frame, this);
} else {
afr_transaction_fop (frame, this);
}
+ afr_lock_resume_shared (&shared);
return 0;
}
@@ -486,30 +582,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
}
-afr_inodelk_t*
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
-{
- afr_inodelk_t *inodelk = NULL;
- int i = 0;
-
- for (i = 0; int_lock->inodelk[i].domain; i++) {
- inodelk = &int_lock->inodelk[i];
- if (strcmp (dom, inodelk->domain) == 0)
- return inodelk;
- }
- return NULL;
-}
-
unsigned char*
afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
{
unsigned char *locked_nodes = NULL;
- afr_inodelk_t *inodelk = NULL;
switch (type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- locked_nodes = inodelk->locked_nodes;
+ locked_nodes = int_lock->locked_nodes;
break;
case AFR_ENTRY_TRANSACTION:
@@ -834,27 +914,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- fd_t *fd = NULL;
+ afr_inode_ctx_t *ctx = NULL;
int i = 0;
gf_boolean_t ret = _gf_false;
- afr_fd_ctx_t *fd_ctx = NULL;
int type = 0;
local = frame->local;
priv = this->private;
- fd = local->fd;
+ ctx = local->inode_ctx;
type = afr_index_for_transaction_type (local->transaction.type);
if (type != AFR_DATA_TRANSACTION)
return !local->transaction.dirtied;
- if (!fd)
- return !local->transaction.dirtied;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_false;
-
if (local->transaction.no_uninherit)
return _gf_false;
@@ -868,34 +940,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
if (local->transaction.uninherit_done)
return local->transaction.uninherit_value;
- LOCK(&fd->lock);
+ LOCK(&local->inode->lock);
{
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i] !=
- fd_ctx->pre_op_done[type][i]) {
+ ctx->pre_op_done[type][i]) {
ret = !local->transaction.dirtied;
goto unlock;
}
}
- if (fd_ctx->inherited[type]) {
+ if (ctx->inherited[type]) {
ret = _gf_true;
- fd_ctx->inherited[type]--;
- } else if (fd_ctx->on_disk[type]) {
+ ctx->inherited[type]--;
+ } else if (ctx->on_disk[type]) {
ret = _gf_false;
- fd_ctx->on_disk[type]--;
+ ctx->on_disk[type]--;
} else {
/* ASSERT */
ret = _gf_false;
}
- if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
+ if (!ctx->inherited[type] && !ctx->on_disk[type]) {
for (i = 0; i < priv->child_count; i++)
- fd_ctx->pre_op_done[type][i] = 0;
+ ctx->pre_op_done[type][i] = 0;
}
}
unlock:
- UNLOCK(&fd->lock);
+ UNLOCK(&local->inode->lock);
local->transaction.uninherit_done = _gf_true;
local->transaction.uninherit_value = ret;
@@ -909,31 +981,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- fd_t *fd = NULL;
int i = 0;
gf_boolean_t ret = _gf_false;
- afr_fd_ctx_t *fd_ctx = NULL;
int type = 0;
local = frame->local;
priv = this->private;
- fd = local->fd;
if (local->transaction.type != AFR_DATA_TRANSACTION)
return _gf_false;
type = afr_index_for_transaction_type (local->transaction.type);
- if (!fd)
- return _gf_false;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_false;
-
- LOCK(&fd->lock);
+ LOCK(&local->inode->lock);
{
- if (!fd_ctx->on_disk[type]) {
+ if (!local->inode_ctx->on_disk[type]) {
/* nothing to inherit yet */
ret = _gf_false;
goto unlock;
@@ -941,21 +1003,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
for (i = 0; i < priv->child_count; i++) {
if (local->transaction.pre_op[i] !=
- fd_ctx->pre_op_done[type][i]) {
+ local->inode_ctx->pre_op_done[type][i]) {
/* either inherit exactly, or don't */
ret = _gf_false;
goto unlock;
}
}
- fd_ctx->inherited[type]++;
+ local->inode_ctx->inherited[type]++;
ret = _gf_true;
local->transaction.inherited = _gf_true;
}
unlock:
- UNLOCK(&fd->lock);
+ UNLOCK(&local->inode->lock);
return ret;
}
@@ -966,22 +1028,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
int i = 0;
gf_boolean_t ret = _gf_false;
int type = 0;
local = frame->local;
priv = this->private;
- fd = local->fd;
- if (!fd)
- return _gf_false;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_false;
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+ return _gf_false;
if (local->transaction.inherited)
/* was already inherited in afr_changelog_pre_op */
@@ -997,26 +1053,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
ret = _gf_false;
- LOCK(&fd->lock);
+ LOCK(&local->inode->lock);
{
- if (!fd_ctx->on_disk[type]) {
+ if (!local->inode_ctx->on_disk[type]) {
for (i = 0; i < priv->child_count; i++)
- fd_ctx->pre_op_done[type][i] =
+ local->inode_ctx->pre_op_done[type][i] =
(!local->transaction.failed_subvols[i]);
} else {
for (i = 0; i < priv->child_count; i++)
- if (fd_ctx->pre_op_done[type][i] !=
+ if (local->inode_ctx->pre_op_done[type][i] !=
(!local->transaction.failed_subvols[i])) {
local->transaction.no_uninherit = 1;
goto unlock;
}
}
- fd_ctx->on_disk[type]++;
+ local->inode_ctx->on_disk[type]++;
ret = _gf_true;
}
unlock:
- UNLOCK(&fd->lock);
+ UNLOCK(&local->inode->lock);
return ret;
}
@@ -1322,6 +1378,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
afr_init_optimistic_changelog_for_txn (this, local);
+ if (afr_changelog_pre_op_inherit (frame, this))
+ goto next;
+
/* This condition should not be met with present code, as
* transaction.done will be called if locks are not acquired on even a
* single node.
@@ -1347,9 +1406,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
goto err;
}
- if (afr_changelog_pre_op_inherit (frame, this))
- goto next;
-
if (call_count < priv->child_count)
pre_nop = _gf_false;
@@ -1406,7 +1462,7 @@ err:
local->op_ret = -1;
local->op_errno = op_errno;
- afr_unlock (frame, this);
+ afr_handle_lock_acquire_failure (local, _gf_true);
if (xdata_req)
dict_unref (xdata_req);
@@ -1416,31 +1472,6 @@ err:
int
-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_msg (this->name, GF_LOG_INFO,
- 0, AFR_MSG_BLOCKING_LKS_FAILED,
- "Blocking inodelks failed.");
- afr_transaction_done (frame, this);
- } else {
-
- gf_msg_debug (this->name, 0,
- "Blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
-
- return 0;
-}
-
-
-int
afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
@@ -1453,7 +1484,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
if (int_lock->lock_op_ret < 0) {
gf_msg_debug (this->name, 0,
"Non blocking inodelks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
+ int_lock->lock_cbk = afr_internal_lock_finish;
afr_blocking_lock (frame, this);
} else {
@@ -1467,31 +1498,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
int
-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_BLOCKING_LKS_FAILED,
- "Blocking entrylks failed.");
- afr_transaction_done (frame, this);
- } else {
-
- gf_msg_debug (this->name, 0,
- "Blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
-
- return 0;
-}
-
-
-int
afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
@@ -1504,7 +1510,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
if (int_lock->lock_op_ret < 0) {
gf_msg_debug (this->name, 0,
"Non blocking entrylks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
+ int_lock->lock_cbk = afr_internal_lock_finish;
afr_blocking_lock (frame, this);
} else {
@@ -1565,29 +1571,28 @@ int
afr_set_transaction_flock (xlator_t *this, afr_local_t *local)
{
afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
afr_private_t *priv = NULL;
int_lock = &local->internal_lock;
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
priv = this->private;
- if ((priv->arbiter_count || priv->full_lock) &&
+ if ((priv->arbiter_count || local->transaction.eager_lock_on ||
+ priv->full_lock) &&
local->transaction.type == AFR_DATA_TRANSACTION) {
/*Lock entire file to avoid network split brains.*/
- inodelk->flock.l_len = 0;
- inodelk->flock.l_start = 0;
+ int_lock->flock.l_len = 0;
+ int_lock->flock.l_start = 0;
} else {
- inodelk->flock.l_len = local->transaction.len;
- inodelk->flock.l_start = local->transaction.start;
+ int_lock->flock.l_len = local->transaction.len;
+ int_lock->flock.l_start = local->transaction.start;
}
- inodelk->flock.l_type = F_WRLCK;
+ int_lock->flock.l_type = F_WRLCK;
return 0;
}
int
-afr_lock_rec (call_frame_t *frame, xlator_t *this)
+afr_lock (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
@@ -1628,74 +1633,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
return 0;
}
+static gf_boolean_t
+afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
+{
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
+
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
+
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
-int
-afr_lock (call_frame_t *frame, xlator_t *this)
+ return ((end1 >= start2) && (end2 >= start1));
+}
+
+gf_boolean_t
+afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check)
{
- afr_set_lock_number (frame, this);
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = NULL;
- return afr_lock_rec (frame, this);
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ /*
+ * Once full file lock is acquired in eager-lock phase, overlapping
+ * writes do not compete for inode-locks, instead are transferred to the
+ * next writes. Because of this overlapping writes are not ordered.
+ * This can cause inconsistencies in replication.
+ * Example:
+ * Two overlapping writes w1, w2 are sent in parallel on same fd
+ * in two threads t1, t2.
+ * Both threads can execute afr_writev_wind in the following manner.
+ * t1 winds w1 on brick-0
+ * t2 winds w2 on brick-0
+ * t2 winds w2 on brick-1
+ * t1 winds w1 on brick-1
+ *
+ * This check makes sure the locks are not transferred for
+ * overlapping writes.
+ */
+ list_for_each_entry (each, &lock->owners, transaction.owner_list) {
+ if (afr_locals_overlap (each, local)) {
+ return _gf_true;
+ }
+ }
+
+ if (!waitlist_check)
+ return _gf_false;
+ list_for_each_entry (each, &lock->waiting, transaction.wait_list) {
+ if (afr_locals_overlap (each, local)) {
+ return _gf_true;
+ }
+ }
+ return _gf_false;
}
/* }}} */
-
-int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+static void
+afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src,
+ xlator_t *this)
{
- afr_changelog_pre_op (frame, this);
+ afr_private_t *priv = this->private;
- return 0;
+ dst->domain = src->domain;
+ dst->flock.l_len = src->flock.l_len;
+ dst->flock.l_start = src->flock.l_start;
+ dst->flock.l_type = src->flock.l_type;
+ dst->lock_count = src->lock_count;
+ memcpy (dst->locked_nodes, src->locked_nodes,
+ priv->child_count * sizeof (*dst->locked_nodes));
}
-
void
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
+__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ gf_boolean_t conflict = _gf_false;
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
- /* call this function from any of the related optimizations
- which benefit from delaying post op are enabled, namely:
-
- - changelog piggybacking
- - eager locking
- */
+ while (!conflict) {
+ if (list_empty (&lock->waiting))
+ return;
+ each = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ if (afr_has_lock_conflict (each, _gf_false)) {
+ conflict = _gf_true;
+ }
+ if (conflict && !list_empty (&lock->owners))
+ return;
+ afr_copy_inodelk_vars (&each->internal_lock,
+ &local->internal_lock,
+ each->transaction.frame->this);
+ list_move_tail (&each->transaction.wait_list, shared);
+ list_add_tail(&each->transaction.owner_list, &lock->owners);
+ }
+}
- priv = this->private;
- if (!priv)
- return;
+static void
+afr_lock_resume_shared (struct list_head *list)
+{
+ afr_local_t *each = NULL;
- if (!priv->post_op_delay_secs)
- return;
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t,
+ transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ afr_changelog_pre_op (each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
- local = frame->local;
- if (!local)
- return;
+int
+afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
- if (!local->transaction.eager_lock_on)
- return;
- if (!local->fd)
- return;
+ local->internal_lock.lock_cbk = NULL;
+ if (!local->transaction.eager_lock_on) {
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_transaction_done (frame, this);
+ return 0;
+ }
+ afr_changelog_pre_op (frame, this);
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_handle_lock_acquire_failure (local, _gf_false);
+ } else {
+ lock->event_generation = local->event_generation;
+ afr_changelog_pre_op (frame, this);
+ }
+ }
- if (local->op == GF_FOP_WRITE)
- local->delayed_post_op = _gf_true;
+ return 0;
}
gf_boolean_t
-afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
+afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this)
{
- afr_fd_ctx_t *fd_ctx = NULL;
-
- if (!fd) {
- /* If false is returned, it may keep on taking eager-lock
- * which may lead to starvation, so return true to avoid that.
- */
- gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF,
- AFR_MSG_INVALID_ARG, "Invalid fd");
- return _gf_true;
- }
/* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
* is taken mount2 opened the same file, it won't be able to
* perform any data operations until mount1 releases eager-lock.
@@ -1703,11 +1787,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
* if open-fd-count is > 1
*/
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_true;
-
- if (fd_ctx->open_fd_count > 1)
+ if (local->inode_ctx->open_fd_count > 1)
return _gf_true;
return _gf_false;
@@ -1715,24 +1795,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
gf_boolean_t
-is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
+afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this,
+ int delay)
{
- afr_local_t *local = NULL;
- gf_boolean_t res = _gf_false;
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t res = _gf_false;
local = frame->local;
- if (!local)
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ if (!afr_txn_nothing_failed (frame, this)) {
+ lock->release = _gf_true;
goto out;
+ }
- if (!local->delayed_post_op)
+ if (afr_are_multiple_fds_opened (local, this)) {
+ lock->release = _gf_true;
goto out;
+ }
- //Mark pending changelog ASAP
- if (!afr_txn_nothing_failed (frame, this))
+ if (!list_empty (&lock->owners))
+ goto out;
+ else
+ GF_ASSERT (list_empty (&lock->waiting));
+
+ if (lock->release) {
+ goto out;
+ }
+
+ if (!delay) {
goto out;
+ }
- if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
+ if ((local->op != GF_FOP_WRITE) &&
+ (local->op != GF_FOP_FXATTROP)) {
+ /*Only allow writes but shard does [f]xattrops on writes, so
+ * they are fine too*/
goto out;
+ }
res = _gf_true;
out:
@@ -1743,50 +1844,61 @@ out:
void
afr_delayed_changelog_wake_up_cbk (void *data)
{
- fd_t *fd = NULL;
+ afr_lock_t *lock = NULL;
+ afr_local_t *local = data;
+ afr_local_t *timer_local = NULL;
+ struct list_head shared;
- fd = data;
-
- afr_delayed_changelog_wake_up (THIS, fd);
+ INIT_LIST_HEAD (&shared);
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK (&local->inode->lock);
+ {
+ timer_local = list_entry(lock->post_op.next,
+ afr_local_t,
+ transaction.owner_list);
+ if (list_empty (&lock->owners) && (local == timer_local)) {
+ GF_ASSERT (list_empty (&lock->waiting));
+ /*Last owner*/
+ lock->release = _gf_true;
+ lock->delay_timer = NULL;
+ }
+ }
+ UNLOCK (&local->inode->lock);
+ afr_changelog_post_op_now (local->transaction.frame,
+ local->transaction.frame->this);
}
/* SET operation */
int
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local)
{
- afr_fd_ctx_t *fdctx = NULL;
-
- fdctx = afr_fd_ctx_get (fd, this);
-
- LOCK(&fd->lock);
+ LOCK(&local->inode->lock);
{
- fdctx->witnessed_unstable_write = _gf_true;
+ local->inode_ctx->witnessed_unstable_write = _gf_true;
}
- UNLOCK(&fd->lock);
+ UNLOCK(&local->inode->lock);
return 0;
}
/* TEST and CLEAR operation */
gf_boolean_t
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode)
{
- afr_fd_ctx_t *fdctx = NULL;
+ afr_inode_ctx_t *ctx = NULL;
gf_boolean_t witness = _gf_false;
- fdctx = afr_fd_ctx_get (fd, this);
- if (!fdctx)
- return _gf_true;
-
- LOCK(&fd->lock);
+ LOCK(&inode->lock);
{
- if (fdctx->witnessed_unstable_write) {
+ (void)__afr_inode_ctx_get (this, inode, &ctx);
+
+ if (ctx->witnessed_unstable_write) {
witness = _gf_true;
- fdctx->witnessed_unstable_write = _gf_false;
+ ctx->witnessed_unstable_write = _gf_false;
}
}
- UNLOCK (&fd->lock);
+ UNLOCK (&inode->lock);
return witness;
}
@@ -1929,7 +2041,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
mark a flag in the fdctx whenever an unstable write is witnessed.
*/
- if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+ if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) {
afr_changelog_post_op_now (frame, this);
return 0;
}
@@ -1947,87 +2059,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
return 0;
}
-
void
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
- call_stub_t *stub)
+afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_fd_ctx_t *fd_ctx = NULL;
- call_frame_t *prev_frame = NULL;
- struct timespec delta = {0, };
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ struct timespec delta = {0, };
+ afr_private_t *priv = NULL;
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t post_op = _gf_true;
+ struct list_head shared;
priv = this->private;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- goto out;
-
delta.tv_sec = priv->post_op_delay_secs;
delta.tv_nsec = 0;
- pthread_mutex_lock (&fd_ctx->delay_lock);
- {
- prev_frame = fd_ctx->delay_frame;
- fd_ctx->delay_frame = NULL;
- if (fd_ctx->delay_timer)
- gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer);
- fd_ctx->delay_timer = NULL;
- if (!frame)
- goto unlock;
- fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta,
- afr_delayed_changelog_wake_up_cbk,
- fd);
- fd_ctx->delay_frame = frame;
- }
-unlock:
- pthread_mutex_unlock (&fd_ctx->delay_lock);
-
-out:
- if (prev_frame) {
- local = prev_frame->local;
- local->transaction.resume_stub = stub;
- afr_changelog_post_op_now (prev_frame, this);
- } else if (stub) {
- call_resume (stub);
- }
-}
-
-
-void
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- if (is_afr_delayed_changelog_post_op_needed (frame, this))
- afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
- else
- afr_changelog_post_op_safe (frame, this);
-}
-
+ INIT_LIST_HEAD (&shared);
+ if (!local->transaction.eager_lock_on)
+ goto out;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK (&local->inode->lock);
+ {
+ list_del_init (&local->transaction.owner_list);
+ list_add (&local->transaction.owner_list, &lock->post_op);
+ __afr_transaction_wake_shared (local, &shared);
+
+ if (!afr_is_delayed_changelog_post_op_needed (frame, this,
+ delta.tv_sec)) {
+ if (list_empty (&lock->owners))
+ lock->release = _gf_true;
+ goto unlock;
+ }
-/* Wake up the sleeping/delayed post-op, and also register
- a stub to have it resumed after this transaction
- completely finishes.
+ GF_ASSERT (lock->delay_timer == NULL);
+ lock->delay_timer = gf_timer_call_after (this->ctx, delta,
+ afr_delayed_changelog_wake_up_cbk,
+ local);
+ if (!lock->delay_timer) {
+ lock->release = _gf_true;
+ } else {
+ post_op = _gf_false;
+ }
- The @stub gets saved in @local and gets resumed in
- afr_local_cleanup()
- */
-void
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
-{
- afr_delayed_changelog_post_op (this, NULL, fd, stub);
-}
+ }
+unlock:
+ UNLOCK (&local->inode->lock);
+ if (!list_empty (&shared)) {
+ afr_lock_resume_shared (&shared);
+ }
-void
-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
-{
- afr_delayed_changelog_post_op (this, NULL, fd, NULL);
+out:
+ if (post_op) {
+ if (!local->transaction.eager_lock_on || lock->release) {
+ afr_changelog_post_op_safe (frame, this);
+ } else {
+ afr_changelog_post_op_now (frame, this);
+ }
+ }
}
int
@@ -2037,13 +2126,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
local = frame->local;
- if (local->transaction.eager_lock_on) {
- /* We don't need to retain "local" in the
- fd list anymore, writes to all subvols
- are finished by now */
- afr_remove_eager_lock_stub (local);
- }
-
afr_restore_lk_owner (frame);
afr_handle_symmetric_errors (frame, this);
@@ -2074,114 +2156,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
local->transaction.failed_subvols[child_index] = 1;
}
-
-
static gf_boolean_t
-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
+__need_previous_lock_unlocked (afr_local_t *local)
{
- uint64_t start1 = local1->transaction.start;
- uint64_t start2 = local2->transaction.start;
- uint64_t end1 = 0;
- uint64_t end2 = 0;
-
- if (local1->transaction.len)
- end1 = start1 + local1->transaction.len - 1;
- else
- end1 = ULLONG_MAX;
+ afr_lock_t *lock = NULL;
- if (local2->transaction.len)
- end2 = start2 + local2->transaction.len - 1;
- else
- end2 = ULLONG_MAX;
+ if (!local->transaction.eager_lock_on)
+ return _gf_true;
- return ((end1 >= start2) && (end2 >= start1));
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (!lock->acquired)
+ return _gf_false;
+ if (lock->acquired && lock->event_generation != local->event_generation)
+ return _gf_true;
+ return _gf_false;
}
void
-afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
+__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock,
+ gf_boolean_t *do_pre_op, afr_local_t **timer_local)
{
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- afr_local_t *each = NULL;
+ afr_lock_t *lock = NULL;
+ afr_local_t *owner_local = NULL;
+ xlator_t *this = local->transaction.frame->this;
- priv = this->private;
-
- if (!local->fd)
- return;
-
- if (local->transaction.type != AFR_DATA_TRANSACTION)
- return;
+ if (local->fd && !afr_are_multiple_fds_opened (local, this)) {
+ local->transaction.eager_lock_on = _gf_true;
+ }
- if (!priv->eager_lock)
- return;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (__need_previous_lock_unlocked (local)) {
+ if (!list_empty (&lock->owners)) {
+ lock->release = _gf_true;
+ } else if (lock->delay_timer) {
+ lock->release = _gf_true;
+ if (gf_timer_call_cancel (this->ctx,
+ lock->delay_timer)) {
+ /* It will be put in frozen list
+ * in the code flow below*/
+ } else {
+ *timer_local = list_entry(lock->post_op.next,
+ afr_local_t,
+ transaction.owner_list);
+ lock->delay_timer = NULL;
+ }
+ }
+ if (!local->transaction.eager_lock_on)
+ goto out;
+ }
- fdctx = afr_fd_ctx_get (local->fd, this);
- if (!fdctx)
- return;
+ if (lock->release) {
+ list_add_tail (&local->transaction.wait_list,
+ &lock->frozen);
+ *take_lock = _gf_false;
+ goto out;
+ }
- if (afr_are_multiple_fds_opened (local->fd, this))
- return;
- /*
- * Once full file lock is acquired in eager-lock phase, overlapping
- * writes do not compete for inode-locks, instead are transferred to the
- * next writes. Because of this overlapping writes are not ordered.
- * This can cause inconsistencies in replication.
- * Example:
- * Two overlapping writes w1, w2 are sent in parallel on same fd
- * in two threads t1, t2.
- * Both threads can execute afr_writev_wind in the following manner.
- * t1 winds w1 on brick-0
- * t2 winds w2 on brick-0
- * t2 winds w2 on brick-1
- * t1 winds w1 on brick-1
- *
- * This check makes sure the locks are not transferred for
- * overlapping writes.
- */
- LOCK (&local->fd->lock);
- {
- list_for_each_entry (each, &fdctx->eager_locked,
- transaction.eager_locked) {
- if (afr_locals_overlap (each, local)) {
- local->transaction.eager_lock_on = _gf_false;
- goto unlock;
- }
+ if (lock->delay_timer) {
+ *take_lock = _gf_false;
+ if (gf_timer_call_cancel (this->ctx,
+ lock->delay_timer)) {
+ list_add_tail (&local->transaction.wait_list,
+ &lock->frozen);
+ } else {
+ *timer_local = list_entry(lock->post_op.next,
+ afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars (&local->internal_lock,
+ &(*timer_local)->internal_lock,
+ this);
+ lock->delay_timer = NULL;
+ *do_pre_op = _gf_true;
+ list_add_tail (&local->transaction.owner_list,
+ &lock->owners);
}
+ goto out;
+ }
- local->transaction.eager_lock_on = _gf_true;
- list_add_tail (&local->transaction.eager_locked,
- &fdctx->eager_locked);
+ if (!list_empty (&lock->owners)) {
+ if (!lock->acquired ||
+ afr_has_lock_conflict (local, _gf_true)) {
+ list_add_tail (&local->transaction.wait_list,
+ &lock->waiting);
+ *take_lock = _gf_false;
+ goto out;
+ }
+ owner_local = list_entry (lock->owners.next,
+ afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars (&local->internal_lock,
+ &owner_local->internal_lock,
+ this);
+ *take_lock = _gf_false;
+ *do_pre_op = _gf_true;
}
-unlock:
- UNLOCK (&local->fd->lock);
+
+ if (lock->acquired)
+ GF_ASSERT (!(*take_lock));
+ list_add_tail (&local->transaction.owner_list, &lock->owners);
+out:
+ return;
}
void
-afr_transaction_start (call_frame_t *frame, xlator_t *this)
+afr_transaction_start (afr_local_t *local, xlator_t *this)
{
- afr_local_t *local = frame->local;
- fd_t *fd = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t take_lock = _gf_true;
+ gf_boolean_t do_pre_op = _gf_false;
+ afr_local_t *timer_local = NULL;
- afr_transaction_eager_lock_init (local, this);
+ priv = this->private;
- if (local->fd && local->transaction.eager_lock_on)
- afr_set_lk_owner (frame, this, local->fd);
- else
- afr_set_lk_owner (frame, this, frame->root);
+ if (local->transaction.type != AFR_DATA_TRANSACTION &&
+ local->transaction.type != AFR_METADATA_TRANSACTION)
+ goto lock_phase;
- if (!local->transaction.eager_lock_on && local->loc.inode) {
- fd = fd_lookup (local->loc.inode, frame->root->pid);
- if (fd == NULL)
- fd = fd_lookup_anonymous (local->loc.inode,
- GF_ANON_FD_FLAGS);
+ if (!priv->eager_lock)
+ goto lock_phase;
- if (fd) {
- afr_delayed_changelog_wake_up (this, fd);
- fd_unref (fd);
- }
+ LOCK (&local->inode->lock);
+ {
+ __afr_eager_lock_handle (local, &take_lock, &do_pre_op,
+ &timer_local);
}
+ UNLOCK (&local->inode->lock);
+lock_phase:
+ if (!local->transaction.eager_lock_on) {
+ afr_set_lk_owner (local->transaction.frame, this,
+ local->transaction.frame->root);
+ } else {
+ afr_set_lk_owner (local->transaction.frame, this, local->inode);
+ }
+
- afr_lock (frame, this);
+ if (take_lock) {
+ afr_lock (local->transaction.frame, this);
+ } else if (do_pre_op) {
+ afr_changelog_pre_op (local->transaction.frame, this);
+ }
+ /*Always call delayed_changelog_wake_up_cbk after calling pre-op above
+ * so that any inheriting can happen*/
+ if (timer_local)
+ afr_delayed_changelog_wake_up_cbk (timer_local);
}
int
@@ -2194,7 +2311,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
goto fail;
}
- afr_transaction_start (frame, this);
+ afr_transaction_start (local, this);
return 0;
fail:
local->transaction.unwind (frame, this);
@@ -2212,6 +2329,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
local = frame->local;
priv = this->private;
+ local->transaction.frame = frame;
local->transaction.type = type;
@@ -2224,11 +2342,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
if (ret < 0)
goto out;
- if (type == AFR_ENTRY_TRANSACTION ||
- type == AFR_ENTRY_RENAME_TRANSACTION) {
- afr_transaction_start (frame, this);
- ret = 0;
- goto out;
+
+ if (type != AFR_METADATA_TRANSACTION) {
+ goto txn_start;
}
ret = afr_inode_get_readable (frame, local->inode, this,
@@ -2238,10 +2354,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
event_generation)) {
afr_inode_refresh (frame, this, local->inode, local->loc.gfid,
afr_write_txn_refresh_done);
- } else {
- afr_transaction_start (frame, this);
+ ret = 0;
+ goto out;
}
+
+txn_start:
ret = 0;
+ afr_transaction_start (local, this);
out:
return ret;
}