diff options
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 35 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-lk-common.c | 134 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-algorithm.c | 21 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.c | 89 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-common.h | 6 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-data.c | 44 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-self-heal-metadata.c | 9 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 565 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 3 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.c | 2 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 20 |
11 files changed, 500 insertions, 428 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 6c6964deceb..fbf39ed4e1c 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -826,7 +826,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; @@ -836,7 +837,9 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->internal_lock.locked_nodes); - GF_FREE (local->internal_lock.inode_locked_nodes); + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } GF_FREE (local->internal_lock.lower_locked_nodes); @@ -4480,11 +4483,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, { int ret = -ENOMEM; - lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->inode_locked_nodes) - goto out; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), child_count, gf_afr_mt_char); if (NULL == lk->locked_nodes) @@ -4543,6 +4541,21 @@ out: } int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: + return ret; +} + +int afr_transaction_local_init (afr_local_t *local, xlator_t *this) { int child_up_count = 0; @@ -4555,6 +4568,14 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (ret < 0) goto out; + if ((local->transaction.type == AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + ret = -ENOMEM; child_up_count = afr_up_children_count (local->child_up, priv->child_count); diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 812609819f5..116e1ef6983 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -559,19 +559,20 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; - int i = 0; + afr_inodelk_t *inodelk = NULL; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - int_lock->inodelk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - for (i = 0; i < priv->child_count; i++) { - int_lock->inode_locked_nodes[i] = 0; - } + inodelk->lock_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + memset (inodelk->locked_nodes, 0, + sizeof (*inodelk->locked_nodes) * priv->child_count); return 0; } @@ -652,6 +653,7 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int32_t child_index = (long)cookie; local = frame->local; @@ -668,7 +670,8 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } - int_lock->inode_locked_nodes[child_index] &= LOCKED_NO; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->locked_nodes[child_index] &= LOCKED_NO; if (local->transaction.eager_lock) local->transaction.eager_lock[child_index] = 0; @@ -682,6 +685,7 @@ static int afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; @@ -697,12 +701,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; flock.l_type = F_UNLCK; full_flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, + call_count = afr_locked_nodes_count (inodelk->locked_nodes, priv->child_count); int_lock->lk_call_count = call_count; @@ -718,8 +724,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) fd_ctx = afr_fd_ctx_get (local->fd, this); for (i = 0; i < priv->child_count; i++) { - if ((int_lock->inode_locked_nodes[i] & LOCKED_YES) - != LOCKED_YES) + if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) continue; if (local->fd) { @@ -760,7 +765,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long)i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLK, flock_use, NULL); if (!--call_count) @@ -775,7 +780,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long)i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLK, &flock, NULL); if (!--call_count) @@ -861,7 +866,7 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[index], priv->children[index]->fops->entrylk, - this->name, + int_lock->domain, &int_lock->lockee[lockee_no].loc, int_lock->lockee[lockee_no].basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); @@ -964,6 +969,7 @@ static int afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -974,10 +980,10 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - memcpy (int_lock->inode_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->inodelk_lock_count = int_lock->lock_count; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + memcpy (inodelk->locked_nodes, int_lock->locked_nodes, + sizeof (*inodelk->locked_nodes) * priv->child_count); + inodelk->lock_count = int_lock->lock_count; break; case AFR_ENTRY_RENAME_TRANSACTION: @@ -1028,6 +1034,7 @@ int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; @@ -1042,10 +1049,15 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) priv = this->private; child_index = cookie % priv->child_count; lockee_no = cookie / priv->child_count; + is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); + - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + if (!is_entrylk) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + } if (local->fd) { ret = fd_ctx_get (local->fd, this, &ctx); @@ -1067,8 +1079,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) } if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { - is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); - if ((is_entrylk && int_lock->entrylk_lock_count == 0) || (!is_entrylk && int_lock->lock_count == 0)) { gf_log (this->name, GF_LOG_INFO, @@ -1117,7 +1127,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLKW, &flock, NULL); } else { @@ -1130,7 +1140,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLKW, &flock, NULL); } @@ -1151,7 +1161,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->fentrylk, - this->name, local->fd, + int_lock->domain, local->fd, int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } else { @@ -1164,7 +1174,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->entrylk, - this->name, + int_lock->domain, &int_lock->lockee[lockee_no].loc, int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); @@ -1393,6 +1403,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; int call_count = 0; int child_index = (long) cookie; @@ -1401,6 +1412,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, @@ -1426,9 +1438,8 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->transaction.eager_lock) local->transaction.eager_lock[child_index] = 0; } else { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->inodelk_lock_count++; + inodelk->locked_nodes[child_index] |= LOCKED_YES; + inodelk->lock_count++; if (local->transaction.eager_lock && local->transaction.eager_lock[child_index] && @@ -1451,8 +1462,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "Last inode locking reply received"); /* all locks successful. Proceed to call FOP */ - if (int_lock->inodelk_lock_count == - int_lock->lk_expected_count) { + if (inodelk->lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; @@ -1476,6 +1486,7 @@ int afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; @@ -1491,11 +1502,13 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - full_flock.l_type = int_lock->lk_flock.l_type; + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + + full_flock.l_type = inodelk->flock.l_type; initialize_inodelk_variables (frame, this); @@ -1569,7 +1582,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLK, flock_use, NULL); if (!--call_count) @@ -1591,7 +1604,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLK, &flock, NULL); if (!--call_count) @@ -2115,29 +2128,38 @@ out: return ret; } -void -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { - afr_local_t *dst_local = NULL; - afr_local_t *src_local = NULL; - afr_internal_lock_t *dst_lock = NULL; - afr_internal_lock_t *src_lock = NULL; + afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_internal_lock_t *dst_lock = NULL; + afr_internal_lock_t *src_lock = NULL; + afr_inodelk_t *dst_inodelk = NULL; + afr_inodelk_t *src_inodelk = NULL; + int ret = -1; - dst_local = dst->local; - dst_lock = &dst_local->internal_lock; src_local = src->local; src_lock = &src_local->internal_lock; - if (src_lock->inode_locked_nodes) { - memcpy (dst_lock->inode_locked_nodes, - src_lock->inode_locked_nodes, - sizeof (*dst_lock->inode_locked_nodes) * child_count); - memset (src_lock->inode_locked_nodes, 0, - sizeof (*src_lock->inode_locked_nodes) * child_count); + src_inodelk = afr_get_inodelk (src_lock, dom); + dst_local = dst->local; + dst_lock = &dst_local->internal_lock; + dst_inodelk = afr_get_inodelk (dst_lock, dom); + if (!dst_inodelk || !src_inodelk) + goto out; + if (src_inodelk->locked_nodes) { + memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, + sizeof (*dst_inodelk->locked_nodes) * child_count); + memset (src_inodelk->locked_nodes, 0, + sizeof (*src_inodelk->locked_nodes) * child_count); } dst_lock->transaction_lk_type = src_lock->transaction_lk_type; dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; - dst_lock->inodelk_lock_count = src_lock->inodelk_lock_count; - src_lock->inodelk_lock_count = 0; + dst_inodelk->lock_count = src_inodelk->lock_count; + src_inodelk->lock_count = 0; + ret = 0; +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 1d577cfb5ab..83846f152d2 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -143,7 +143,7 @@ sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) } if (loop_sh && loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, + afr_sh_data_unlock (loop_frame, this, this->name, sh_destroy_frame); } else { sh_destroy_frame (loop_frame, this); @@ -214,7 +214,7 @@ sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, goto out; //We want the frame to have same lk_owner as sh_frame //so that locks translator allows conflicting locks - new_loop_local = afr_local_copy (local, this); + new_loop_local = afr_self_heal_local_init (local, this); if (!new_loop_local) goto out; new_loop_frame->local = new_loop_local; @@ -273,7 +273,7 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, new_loop_sh->offset = offset; new_loop_sh->block_size = sh->block_size; afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - _gf_true, sh_loop_lock_success, sh_loop_lock_failure); + _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); return 0; out: afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); @@ -754,14 +754,15 @@ out: return sh_priv; } -void -afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, +int +afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { afr_local_t *dst_local = NULL; afr_self_heal_t *dst_sh = NULL; afr_local_t *src_local = NULL; afr_self_heal_t *src_sh = NULL; + int ret = -1; dst_local = dst->local; dst_sh = &dst_local->self_heal; @@ -769,9 +770,12 @@ afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, src_sh = &src_local->self_heal; GF_ASSERT (src_sh->data_lock_held); GF_ASSERT (!dst_sh->data_lock_held); - afr_lk_transfer_datalock (dst, src, child_count); + ret = afr_lk_transfer_datalock (dst, src, dom, child_count); + if (ret) + return ret; src_sh->data_lock_held = _gf_false; dst_sh->data_lock_held = _gf_true; + return 0; } int @@ -793,7 +797,10 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); if (ret) goto out; - afr_sh_transfer_lock (first_loop_frame, sh_frame, priv->child_count); + ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, + priv->child_count); + if (ret) + goto out; sh->private = afr_sh_priv_init (); if (!sh->private) { ret = -1; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index f0915b01d2e..14283163372 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1997,6 +1997,7 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, int_lock->lk_basename = base_name; int_lock->lk_loc = loc; int_lock->lock_cbk = lock_cbk; + int_lock->domain = this->name; int_lock->lockee_count = 0; afr_init_entry_lockee (&int_lock->lockee[0], local, loc, @@ -2057,13 +2058,13 @@ afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) } afr_local_t* -afr_local_copy (afr_local_t *l, xlator_t *this) +afr_self_heal_local_init (afr_local_t *l, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *lc = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *shc = NULL; + int ret = 0; priv = this->private; @@ -2088,11 +2089,19 @@ afr_local_copy (afr_local_t *l, xlator_t *this) shc->type = sh->type; uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); + if (l->loc.path) { + ret = loc_copy (&lc->loc, &l->loc); + if (ret < 0) + goto out; + } lc->child_up = memdup (l->child_up, sizeof (*lc->child_up) * priv->child_count); + if (!lc->child_up) { + ret = -1; + goto out; + } + if (l->xattr_req) lc->xattr_req = dict_ref (l->xattr_req); @@ -2100,59 +2109,25 @@ afr_local_copy (afr_local_t *l, xlator_t *this) lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); if (l->cont.lookup.xattr) lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - sizeof (*lc->internal_lock.locked_nodes) * priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - - for (i = 0; i < l->internal_lock.lockee_count; i++) { - loc_copy (&lc->internal_lock.lockee[i].loc, - &l->internal_lock.lockee[i].loc); - - lc->internal_lock.lockee[i].locked_count = - l->internal_lock.lockee[i].locked_count; - - if (l->internal_lock.lockee[i].basename) - lc->internal_lock.lockee[i].basename = - gf_strdup (l->internal_lock.lockee[i].basename); - - if (l->internal_lock.lockee[i].locked_nodes) { - lc->internal_lock.lockee[i].locked_nodes = - memdup (l->internal_lock.lockee[i].locked_nodes, - sizeof (*lc->internal_lock.lockee[i].locked_nodes) * - priv->child_count); - } else { - lc->internal_lock.lockee[i].locked_nodes = - GF_CALLOC (priv->child_count, - sizeof (*lc->internal_lock.lockee[i].locked_nodes), - gf_afr_mt_char); - } + lc->internal_lock.locked_nodes = + GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), + priv->child_count, gf_afr_mt_char); + if (!lc->internal_lock.locked_nodes) { + ret = -1; + goto out; } - lc->internal_lock.lockee_count = l->internal_lock.lockee_count; - - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret) + goto out; out: + if (ret) { + afr_local_cleanup (lc, this); + lc = NULL; + } return lc; } @@ -2241,7 +2216,7 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) afr_set_lk_owner (sh_frame, this, sh_frame->root); afr_set_low_priority (sh_frame); - sh_local = afr_local_copy (local, this); + sh_local = afr_self_heal_local_init (local, this); if (!sh_local) goto out; sh_frame->local = sh_local; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index cc67e23fe95..9be1fdff924 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -91,13 +91,13 @@ int afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, int child_index); int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk); afr_local_t * -afr_local_copy (afr_local_t *l, xlator_t *this); +afr_self_heal_local_init (afr_local_t *l, xlator_t *this); int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, + off_t start, off_t len, gf_boolean_t block, char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler); void diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 9c2f3d53c83..3c2726b8df6 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -283,23 +283,33 @@ afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) //Fun fact, lock_cbk is being used for both lock & unlock int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_self_heal_t *sh = NULL; + int ret = 0; local = frame->local; int_lock = &local->internal_lock; sh = &local->self_heal; - GF_ASSERT (sh->data_lock_held); - - sh->data_lock_held = _gf_false; + if (strcmp (dom, this->name) == 0) { + sh->data_lock_held = _gf_false; + } else { + ret = -1; + goto out; + } int_lock->lock_cbk = lock_cbk; + int_lock->domain = dom; afr_unlock (frame, this); +out: + if (ret) { + int_lock->lock_op_ret = -1; + int_lock->lock_cbk (frame, this); + } return 0; } @@ -316,7 +326,7 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this) "finishing data selfheal of %s", local->loc.path); if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); + afr_sh_data_unlock (frame, this, this->name, afr_sh_data_close); else afr_sh_data_close (frame, this); @@ -337,7 +347,7 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this) afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); + afr_sh_data_unlock (frame, this, this->name, afr_sh_data_close); else afr_sh_data_close (frame, this); return 0; @@ -380,7 +390,7 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, goto out; } GF_ASSERT (sh->old_loop_frame); - afr_sh_data_lock (frame, this, 0, 0, _gf_true, + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, afr_post_sh_big_lock_success, afr_post_sh_big_lock_failure); } @@ -1237,9 +1247,11 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len) +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, + off_t start, off_t len) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; @@ -1250,11 +1262,14 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t le afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = start; - int_lock->lk_flock.l_len = len; - int_lock->lk_flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; + int_lock->domain = dom; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->flock.l_start = start; + inodelk->flock.l_len = len; + inodelk->flock.l_type = F_WRLCK; + afr_nonblocking_inodelk (frame, this); return 0; @@ -1298,7 +1313,7 @@ afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, off_t start, off_t len, gf_boolean_t block, - afr_lock_cbk_t success_handler, + char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler) { afr_local_t * local = NULL; @@ -1310,7 +1325,7 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this, sh->data_lock_success_handler = success_handler; sh->data_lock_failure_handler = failure_handler; sh->data_lock_block = block; - return afr_sh_data_lock_rec (frame, this, start, len); + return afr_sh_data_lock_rec (frame, this, dom, start, len); } int @@ -1371,7 +1386,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, */ block = sh->unwind ? _gf_true : _gf_false; - afr_sh_data_lock (frame, this, 0, 0, block, + afr_sh_data_lock (frame, this, 0, 0, block, this->name, afr_sh_data_big_lock_success, afr_sh_data_fail); } @@ -1493,6 +1508,7 @@ afr_self_heal_data (call_frame_t *frame, xlator_t *this) afr_sh_data_open (frame, this); } else { afr_sh_data_lock (frame, this, 0, 0, _gf_true, + this->name, afr_sh_non_reg_lock_success, afr_sh_data_fail); } diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 1f663b692fc..6d629ce9813 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -579,19 +579,22 @@ int afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; int_lock = &local->internal_lock; + int_lock->domain = this->name; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); int_lock->transaction_lk_type = AFR_SELFHEAL_LK; int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = LLONG_MAX - 1; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_start = LLONG_MAX - 1; + inodelk->flock.l_len = 0; + inodelk->flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; afr_nonblocking_inodelk (frame, this); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index d307d5b3eee..d6d420910cf 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -79,7 +79,7 @@ afr_save_lk_owner (call_frame_t *frame) local = frame->local; - local->saved_lk_owner = frame->root->lk_owner; + local->saved_lk_owner = frame->root->lk_owner; } @@ -90,10 +90,9 @@ afr_restore_lk_owner (call_frame_t *frame) local = frame->local; - frame->root->lk_owner = local->saved_lk_owner; + frame->root->lk_owner = local->saved_lk_owner; } - static void __mark_all_pending (int32_t *pending[], int child_count, afr_transaction_type type) @@ -443,14 +442,30 @@ out: return; } +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +{ + afr_inodelk_t *inodelk = NULL; + int i = 0; + + for (i = 0; int_lock->inodelk[i].domain; i++) { + inodelk = &int_lock->inodelk[i]; + if (strcmp (dom, inodelk->domain) == 0) + return inodelk; + } + return NULL; +} + unsigned char* afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) { unsigned char *locked_nodes = NULL; + afr_inodelk_t *inodelk = NULL; switch (type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - locked_nodes = int_lock->inode_locked_nodes; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + locked_nodes = inodelk->locked_nodes; break; case AFR_ENTRY_TRANSACTION: @@ -553,22 +568,22 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, gf_boolean_t afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int index = -1; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int index = -1; + int i = 0; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->pending[i][index] == 0) - return _gf_false; + return _gf_false; } - return _gf_true; + return _gf_true; } @@ -619,7 +634,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) goto out; } - nothing_failed = afr_txn_nothing_failed (frame, this); + nothing_failed = afr_txn_nothing_failed (frame, this); afr_compute_txn_changelog (local , priv); @@ -645,14 +660,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) break; } - /* local->transaction.postop_piggybacked[] was - precomputed in is_piggyback_postop() when called from - afr_changelog_post_op_safe() - */ + /* local->transaction.postop_piggybacked[] was + precomputed in is_piggyback_postop() when called from + afr_changelog_post_op_safe() + */ - piggyback = 0; - if (local->transaction.postop_piggybacked[i]) - piggyback = 1; + piggyback = 0; + if (local->transaction.postop_piggybacked[i]) + piggyback = 1; afr_set_postop_dict (local, this, xattr[i], piggyback, i); @@ -906,7 +921,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) } UNLOCK (&local->fd->lock); - afr_set_delayed_post_op (frame, this); + afr_set_delayed_post_op (frame, this); if (piggyback) afr_changelog_pre_op_cbk (frame, (void *)(long)i, @@ -1179,12 +1194,14 @@ int afr_set_transaction_flock (afr_local_t *local) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->lk_flock.l_len = local->transaction.len; - int_lock->lk_flock.l_start = local->transaction.start; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + inodelk->flock.l_type = F_WRLCK; return 0; } @@ -1199,6 +1216,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; switch (local->transaction.type) { case AFR_DATA_TRANSACTION: @@ -1259,69 +1277,69 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) void afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - /* call this function from any of the related optimizations - which benefit from delaying post op are enabled, namely: + /* call this function from any of the related optimizations + which benefit from delaying post op are enabled, namely: - - changelog piggybacking - - eager locking - */ + - changelog piggybacking + - eager locking + */ - priv = this->private; - if (!priv) - return; + priv = this->private; + if (!priv) + return; - if (!priv->post_op_delay_secs) - return; + if (!priv->post_op_delay_secs) + return; local = frame->local; if (!local->transaction.eager_lock_on) return; - if (!local) - return; + if (!local) + return; - if (!local->fd) - return; + if (!local->fd) + return; - if (local->op == GF_FOP_WRITE) - local->delayed_post_op = _gf_true; + if (local->op == GF_FOP_WRITE) + local->delayed_post_op = _gf_true; } gf_boolean_t is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - gf_boolean_t res = _gf_false; + afr_local_t *local = NULL; + gf_boolean_t res = _gf_false; - local = frame->local; - if (!local) - goto out; + local = frame->local; + if (!local) + goto out; - if (!local->delayed_post_op) - goto out; + if (!local->delayed_post_op) + goto out; - res = _gf_true; + res = _gf_true; out: - return res; + return res; } void afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub); + call_stub_t *stub); void afr_delayed_changelog_wake_up_cbk (void *data) { - fd_t *fd = NULL; + fd_t *fd = NULL; - fd = data; + fd = data; - afr_delayed_changelog_wake_up (THIS, fd); + afr_delayed_changelog_wake_up (THIS, fd); } @@ -1332,39 +1350,39 @@ afr_delayed_changelog_wake_up_cbk (void *data) static gf_boolean_t is_piggyback_post_op (call_frame_t *frame, fd_t *fd) { - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *local = NULL; - gf_boolean_t piggyback = _gf_true; - afr_private_t *priv = NULL; - int i = 0; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *local = NULL; + gf_boolean_t piggyback = _gf_true; + afr_private_t *priv = NULL; + int i = 0; - priv = frame->this->private; - local = frame->local; - fdctx = afr_fd_ctx_get (fd, frame->this); + priv = frame->this->private; + local = frame->local; + fdctx = afr_fd_ctx_get (fd, frame->this); - LOCK(&fd->lock); - { - piggyback = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - local->transaction.postop_piggybacked[i] = 1; - } else { - /* For at least _one_ subvolume we cannot - piggyback on the changelog, and have to - perform a hard POST-OP and therefore fsync - if necesssary - */ - piggyback = _gf_false; + LOCK(&fd->lock); + { + piggyback = _gf_true; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + if (fdctx->pre_op_piggyback[i]) { + fdctx->pre_op_piggyback[i]--; + local->transaction.postop_piggybacked[i] = 1; + } else { + /* For at least _one_ subvolume we cannot + piggyback on the changelog, and have to + perform a hard POST-OP and therefore fsync + if necesssary + */ + piggyback = _gf_false; GF_ASSERT (fdctx->pre_op_done[i]); fdctx->pre_op_done[i]--; - } - } - } - UNLOCK(&fd->lock); + } + } + } + UNLOCK(&fd->lock); if (!afr_txn_nothing_failed (frame, frame->this)) { /* something failed in this transaction, @@ -1381,117 +1399,117 @@ is_piggyback_post_op (call_frame_t *frame, fd_t *fd) int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) { - afr_fd_ctx_t *fdctx = NULL; + afr_fd_ctx_t *fdctx = NULL; - fdctx = afr_fd_ctx_get (fd, this); + fdctx = afr_fd_ctx_get (fd, this); - LOCK(&fd->lock); - { - fdctx->witnessed_unstable_write = _gf_true; - } - UNLOCK(&fd->lock); + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&fd->lock); - return 0; + return 0; } /* TEST and CLEAR operation */ gf_boolean_t afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) { - afr_fd_ctx_t *fdctx = NULL; - gf_boolean_t witness = _gf_false; + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; fdctx = afr_fd_ctx_get (fd, this); if (!fdctx) return _gf_true; - LOCK(&fd->lock); - { - if (fdctx->witnessed_unstable_write) { - witness = _gf_true; - fdctx->witnessed_unstable_write = _gf_false; - } - } - UNLOCK (&fd->lock); + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); - return witness; + return witness; } int afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, - struct iatt *post, dict_t *xdata) + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; int child_index = (long) cookie; - int call_count = -1; - afr_local_t *local = NULL; + int call_count = -1; + afr_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - if (afr_fop_failed (op_ret, op_errno)) { - /* Failure of fsync() is as good as failure of previous - write(). So treat it like one. - */ - gf_log (this->name, GF_LOG_WARNING, - "fsync(%s) failed on subvolume %s. Transaction was %s", - uuid_utoa (local->fd->inode->gfid), - priv->children[child_index]->name, - gf_fop_list[local->op]); - - afr_transaction_fop_failed (frame, this, child_index); - } + if (afr_fop_failed (op_ret, op_errno)) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); + } call_count = afr_frame_return (frame); if (call_count == 0) - afr_changelog_post_op_now (frame, this); + afr_changelog_post_op_now (frame, this); - return 0; + return 0; } int afr_changelog_fsync (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; call_count = afr_pre_op_done_children_count (local->transaction.pre_op, priv->child_count); - if (!call_count) { - /* will go straight to unlock */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } - local->call_count = call_count; + local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; - STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, local->fd, - 1, NULL); - if (!--call_count) - break; - } + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, NULL); + if (!--call_count) + break; + } - return 0; + return 0; } -int + int afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; @@ -1500,55 +1518,55 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) local = frame->local; priv = this->private; - if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } - if (is_piggyback_post_op (frame, local->fd)) { - /* just detected that this post-op is about to - be optimized away as a new write() has - already piggybacked on this frame's changelog. - */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (is_piggyback_post_op (frame, local->fd)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now (frame, this); + return 0; + } - /* Calling afr_changelog_post_op_now() now will result in - issuing ->[f]xattrop(). - - Performing a hard POST-OP (->[f]xattrop() FOP) is a more - responsible operation that what it might appear on the surface. - - The changelog of a file (in the xattr of the file on the server) - stores information (pending count) about the state of the file - on the OTHER server. This changelog is blindly trusted, and must - therefore be updated in such a way it remains trustworthy. This - implies that decrementing the pending count (essentially "clearing - the dirty flag") must be done STRICTLY after we are sure that the - operation on the other server has reached stable storage. - - While the backend filesystem on that server will eventually flush - it to stable storage, we (being in userspace) have no mechanism - to get notified when the write became "stable". - - This means we need take matter into our own hands and issue an - fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, - and get an acknowledgement for it. And we need to wait for the - fsync() acknowledgement before initiating the hard POST-OP. - - However if the FD itself was opened in O_SYNC or O_DSYNC then - we are already guaranteed that the writes were made stable as - part of the FOP itself. The same holds true for NFS stable - writes which happen on an anonymous FD with O_DSYNC or O_SYNC - flag set in the writev() @flags param. For all other write types, - mark a flag in the fdctx whenever an unstable write is witnessed. - */ - - if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { - afr_changelog_post_op_now (frame, this); - return 0; - } + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + } /* Check whether users want durability and perform fsync/post-op * accordingly. @@ -1560,13 +1578,13 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) afr_changelog_post_op_now (frame, this); } - return 0; + return 0; } -void + void afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub) + call_stub_t *stub) { afr_fd_ctx_t *fd_ctx = NULL; call_frame_t *prev_frame = NULL; @@ -1611,17 +1629,17 @@ out: } -void + void afr_changelog_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (is_afr_delayed_changelog_post_op_needed (frame, this)) - afr_delayed_changelog_post_op (this, frame, local->fd, NULL); - else - afr_changelog_post_op_safe (frame, this); + if (is_afr_delayed_changelog_post_op_needed (frame, this)) + afr_delayed_changelog_post_op (this, frame, local->fd, NULL); + else + afr_changelog_post_op_safe (frame, this); } @@ -1632,22 +1650,22 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) The @stub gets saved in @local and gets resumed in afr_local_cleanup() -*/ -void + */ + void afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) { - afr_delayed_changelog_post_op (this, NULL, fd, stub); + afr_delayed_changelog_post_op (this, NULL, fd, stub); } -void + void afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) { - afr_delayed_changelog_post_op (this, NULL, fd, NULL); + afr_delayed_changelog_post_op (this, NULL, fd, NULL); } -int + int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1658,18 +1676,18 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - if (local->transaction.eager_lock_on) { - /* We don't need to retain "local" in the - fd list anymore, writes to all subvols - are finished by now */ - LOCK (&local->fd->lock); - { - list_del_init (&local->transaction.eager_locked); - } - UNLOCK (&local->fd->lock); - } + if (local->transaction.eager_lock_on) { + /* We don't need to retain "local" in the + fd list anymore, writes to all subvols + are finished by now */ + LOCK (&local->fd->lock); + { + list_del_init (&local->transaction.eager_locked); + } + UNLOCK (&local->fd->lock); + } - afr_restore_lk_owner (frame); + afr_restore_lk_owner (frame); if (__fop_changelog_needed (frame, this)) { afr_changelog_post_op (frame, this); @@ -1690,7 +1708,7 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) * afr_transaction_fop_failed - inform that an fop failed */ -void + void afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) { afr_local_t * local = NULL; @@ -1700,54 +1718,54 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index priv = this->private; __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + child_index, local->transaction.type); } -static gf_boolean_t + static gf_boolean_t afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) { - uint64_t start1 = local1->transaction.start; - uint64_t start2 = local2->transaction.start; - uint64_t end1 = 0; - uint64_t end2 = 0; - - if (local1->transaction.len) - end1 = start1 + local1->transaction.len - 1; - else - end1 = ULLONG_MAX; - - if (local2->transaction.len) - end2 = start2 + local2->transaction.len - 1; - else - end2 = ULLONG_MAX; - - return ((end1 >= start2) && (end2 >= start1)); + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); } -void + void afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) { - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *each = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *each = NULL; - priv = this->private; + priv = this->private; - if (!local->fd) - return; + if (!local->fd) + return; - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; - if (!priv->eager_lock) - return; + if (!priv->eager_lock) + return; - fdctx = afr_fd_ctx_get (local->fd, this); - if (!fdctx) - return; + fdctx = afr_fd_ctx_get (local->fd, this); + if (!fdctx) + return; /* * Once full file lock is acquired in eager-lock phase, overlapping @@ -1766,22 +1784,22 @@ afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) * This check makes sure the locks are not transferred for * overlapping writes. */ - LOCK (&local->fd->lock); - { - list_for_each_entry (each, &fdctx->eager_locked, - transaction.eager_locked) { - if (afr_locals_overlap (each, local)) { - local->transaction.eager_lock_on = _gf_false; - goto unlock; - } - } - - local->transaction.eager_lock_on = _gf_true; - list_add_tail (&local->transaction.eager_locked, - &fdctx->eager_locked); - } + LOCK (&local->fd->lock); + { + list_for_each_entry (each, &fdctx->eager_locked, + transaction.eager_locked) { + if (afr_locals_overlap (each, local)) { + local->transaction.eager_lock_on = _gf_false; + goto unlock; + } + } + + local->transaction.eager_lock_on = _gf_true; + list_add_tail (&local->transaction.eager_locked, + &fdctx->eager_locked); + } unlock: - UNLOCK (&local->fd->lock); + UNLOCK (&local->fd->lock); } @@ -1800,11 +1818,10 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) local->transaction.type = type; ret = afr_transaction_local_init (local, this); - if (ret < 0) { + if (ret < 0) goto out; - } - afr_transaction_eager_lock_init (local, this); + afr_transaction_eager_lock_init (local, this); if (local->fd && local->transaction.eager_lock_on) afr_set_lk_owner (frame, this, local->fd); diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 55e8bbcca08..108131276ea 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -23,6 +23,9 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); + int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 9a13fbdacd9..4dd1b1be395 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -380,8 +380,6 @@ init (xlator_t *this) AFR_XATTR_PREFIX, trav->xlator->name); if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); ret = -ENOMEM; goto out; } diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index e72398a62bc..940f84189d5 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -30,6 +30,7 @@ #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" #define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 2 struct _pump_private; @@ -385,12 +386,19 @@ int afr_entry_lockee_cmp (const void *l1, const void *l2); typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; + +typedef struct { loc_t *lk_loc; - struct gf_flock lk_flock; int lockee_count; afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -399,13 +407,11 @@ typedef struct { unsigned char *locked_nodes; unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; selfheal_lk_type_t selfheal_lk_type; transaction_lk_type_t transaction_lk_type; int32_t lock_count; - int32_t inodelk_lock_count; int32_t entrylk_lock_count; uint64_t lock_number; @@ -416,6 +422,7 @@ typedef struct { int32_t lock_op_ret; int32_t lock_op_errno; afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; typedef struct _afr_locked_fd { @@ -869,8 +876,8 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this); int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); -void -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count); int pump_start (call_frame_t *frame, xlator_t *this); @@ -1158,4 +1165,7 @@ afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); void afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + #endif /* __AFR_H__ */ |