diff options
Diffstat (limited to 'xlators/cluster/afr/src/afr-transaction.c')
| -rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 4032 |
1 files changed, 2498 insertions, 1534 deletions
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 0b91f466e11..a51f79b1f43 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -8,1956 +8,2920 @@ cases as published by the Free Software Foundation. */ -#include "dict.h" -#include "byte-order.h" -#include "common-utils.h" -#include "timer.h" +#include <glusterfs/dict.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/timer.h> #include "afr.h" #include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-messages.h" #include <signal.h> +typedef enum { + AFR_TRANSACTION_PRE_OP, + AFR_TRANSACTION_POST_OP, +} afr_xattrop_type_t; -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path - of RENAME */ -#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */ +static void +afr_lock_resume_shared(struct list_head *list); -afr_fd_ctx_t * -__afr_fd_ctx_get (fd_t *fd, xlator_t *this) -{ - uint64_t ctx = 0; - int ret = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int i = 0; - afr_private_t *priv = NULL; +static void +afr_post_op_handle_success(call_frame_t *frame, xlator_t *this); - priv = this->private; +static void +afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno); - ret = __fd_ctx_get (fd, this, &ctx); +void +__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared); - if (ret < 0 && fd_is_anonymous (fd)) { - ret = __afr_fd_ctx_set (this, fd); - if (ret < 0) - goto out; +void +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this); - ret = __fd_ctx_get (fd, this, &ctx); - if (ret < 0) - goto out; +int +afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - for (i = 0; i < priv->child_count; i++) - fd_ctx->opened_on[i] = AFR_FD_OPENED; - } +gf_boolean_t +afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this); - fd_ctx = (afr_fd_ctx_t *)(long) ctx; -out: - return fd_ctx; -} +gf_boolean_t +afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this); +int +afr_changelog_call_count(afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned char *failed_subvols, + unsigned int child_count); +int +afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op); -afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) -{ - afr_fd_ctx_t *fd_ctx = NULL; +static void +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this); - LOCK(&fd->lock); - { - fd_ctx = __afr_fd_ctx_get (fd, this); - } - UNLOCK(&fd->lock); +static int +afr_ta_post_op_do(void *opaque); - return fd_ctx; -} +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local); +static int +afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this); static void -afr_save_lk_owner (call_frame_t *frame) -{ - afr_local_t * local = NULL; +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno); - local = frame->local; - - local->saved_lk_owner = frame->root->lk_owner; +void +afr_ta_locked_priv_invalidate(afr_private_t *priv) +{ + priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; + priv->release_ta_notify_dom_lock = _gf_false; + priv->ta_notify_dom_lock_offset = 0; } - static void -afr_restore_lk_owner (call_frame_t *frame) +afr_ta_process_waitq(xlator_t *this) { - afr_local_t * local = NULL; - - local = frame->local; - - frame->root->lk_owner = local->saved_lk_owner; + afr_local_t *entry = NULL; + afr_private_t *priv = this->private; + struct list_head waitq = { + 0, + }; + + INIT_LIST_HEAD(&waitq); + LOCK(&priv->lock); + list_splice_init(&priv->ta_waitq, &waitq); + UNLOCK(&priv->lock); + list_for_each_entry(entry, &waitq, ta_waitq) + { + afr_ta_decide_post_op_state(entry->transaction.frame, this); + } } -static void -__mark_all_pending (int32_t *pending[], int child_count, - afr_transaction_type type) +int +afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque) { - int i = 0; - int j = 0; + afr_ta_process_waitq(ta_frame->this); + STACK_DESTROY(ta_frame->root); + return 0; +} - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (1); - } +int +afr_release_notify_lock_for_ta(void *opaque) +{ + xlator_t *this = NULL; + afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + int ret = -1; + + this = (xlator_t *)opaque; + priv = this->private; + ret = afr_fill_ta_loc(this, &loc, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate loc for thin-arbiter."); + goto out; + } + flock.l_type = F_UNLCK; + flock.l_start = priv->ta_notify_dom_lock_offset; + flock.l_len = 1; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_NOTIFY lock."); + } + + LOCK(&priv->lock); + { + afr_ta_locked_priv_invalidate(priv); + } + UNLOCK(&priv->lock); +out: + loc_wipe(&loc); + return ret; } +void +afr_zero_fill_stat(afr_local_t *local) +{ + if (!local) + return; + if (local->transaction.type == AFR_DATA_TRANSACTION || + local->transaction.type == AFR_METADATA_TRANSACTION) { + gf_zero_fill_stat(&local->cont.inode_wfop.prebuf); + gf_zero_fill_stat(&local->cont.inode_wfop.postbuf); + } else if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + gf_zero_fill_stat(&local->cont.dir_fop.buf); + gf_zero_fill_stat(&local->cont.dir_fop.preparent); + gf_zero_fill_stat(&local->cont.dir_fop.postparent); + if (local->transaction.type == AFR_ENTRY_TRANSACTION) + return; + gf_zero_fill_stat(&local->cont.dir_fop.prenewparent); + gf_zero_fill_stat(&local->cont.dir_fop.postnewparent); + } +} -static void -__mark_child_dead (int32_t *pending[], int child_count, int child, - afr_transaction_type type) +/* In case of errors afr needs to choose which xdata from lower xlators it needs + * to unwind with. The way it is done is by checking if there are + * any good subvols which failed. Give preference to errnos other than + * ENOTCONN even if the child is source */ +void +afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1, + unsigned char *readable1, inode_t *inode2, + unsigned char *readable2) { - int j = 0; + int s = -1; /*selection*/ + int i = 0; + unsigned char *readable = NULL; + + if (local->xdata_rsp) { + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + } + + readable = alloca0(priv->child_count * sizeof(*readable)); + if (inode2 && readable2) { /*rename fop*/ + AFR_INTERSECT(readable, readable1, readable2, priv->child_count); + } else { + memcpy(readable, readable1, sizeof(*readable) * priv->child_count); + } + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + + if (local->replies[i].op_ret >= 0) + continue; + + if (local->replies[i].op_errno == ENOTCONN) + continue; + + /*Order is important in the following condition*/ + if ((s < 0) || (!readable[s] && readable[i])) + s = i; + } + + if (s != -1 && local->replies[s].xdata) { + local->xdata_rsp = dict_ref(local->replies[s].xdata); + } else if (s == -1) { + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - j = afr_index_for_transaction_type (type); + if (local->replies[i].op_ret >= 0) + continue; - pending[child][j] = 0; + if (!local->replies[i].xdata) + continue; + local->xdata_rsp = dict_ref(local->replies[i].xdata); + break; + } + } } +gf_boolean_t +afr_needs_changelog_update(afr_local_t *local) +{ + if (local->transaction.type == AFR_DATA_TRANSACTION) + return _gf_true; + if (!local->optimistic_change_log) + return _gf_true; + return _gf_false; +} -static void -__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index) +gf_boolean_t +afr_changelog_has_quorum(afr_local_t *local, xlator_t *this) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + unsigned char *success_children = NULL; - local = frame->local; + priv = this->private; + success_children = alloca0(priv->child_count); - if (!local->fd) - return; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.failed_subvols[i]) { + success_children[i] = 1; + } + } - fd_ctx = afr_fd_ctx_get (local->fd, this); + if (afr_has_quorum(success_children, this, NULL)) { + return _gf_true; + } - if (!fd_ctx) - goto out; + return _gf_false; +} - LOCK (&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]++; - } - UNLOCK (&local->fd->lock); -out: - return; +gf_boolean_t +afr_is_write_subvol_valid(call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + uint64_t write_subvol = 0; + unsigned char *writable = NULL; + uint16_t datamap = 0; + + local = frame->local; + priv = this->private; + writable = alloca0(priv->child_count); + + write_subvol = afr_write_subvol_get(frame, this); + datamap = (write_subvol & 0x00000000ffff0000) >> 16; + for (i = 0; i < priv->child_count; i++) { + if (datamap & (1 << i)) + writable[i] = 1; + + if (writable[i] && !local->transaction.failed_subvols[i]) + return _gf_true; + } + + return _gf_false; } -static void -__mark_non_participant_children (int32_t *pending[], int child_count, - unsigned char *participants, - afr_transaction_type type) +int +afr_transaction_fop(call_frame_t *frame, xlator_t *this) { - int i = 0; - int j = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + unsigned char *failed_subvols = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + failed_subvols = local->transaction.failed_subvols; + call_count = priv->child_count - + AFR_COUNT(failed_subvols, priv->child_count); + /* Fail if pre-op did not succeed on quorum no. of bricks. */ + if (!afr_changelog_has_quorum(local, this) || !call_count) { + local->op_ret = -1; + /* local->op_errno is already captured in changelog cbk. */ + afr_transaction_resume(frame, this); + return 0; + } + + /* Fail if at least one writeable brick isn't up.*/ + if (local->transaction.type == AFR_DATA_TRANSACTION && + !afr_is_write_subvol_valid(frame, this)) { + local->op_ret = -1; + local->op_errno = EIO; + afr_transaction_resume(frame, this); + return 0; + } - j = afr_index_for_transaction_type (type); - for (i = 0; i < child_count; i++) { - if (!participants[i]) - pending[i][j] = 0; + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && !failed_subvols[i]) { + local->transaction.wind(frame, this, i); + + if (!--call_count) + break; } -} + } + return 0; +} -void -__mark_all_success (int32_t *pending[], int child_count, - afr_transaction_type type) +int +afr_transaction_done(call_frame_t *frame, xlator_t *this) { - int i; - int j; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t unwind = _gf_false; + afr_lock_t *lock = NULL; + afr_local_t *lock_local = NULL; + + priv = this->private; + local = frame->local; - for (i = 0; i < child_count; i++) { - j = afr_index_for_transaction_type (type); - pending[i][j] = hton32 (-1); + if (priv->consistent_metadata) { + LOCK(&frame->lock); + { + unwind = (local->transaction.main_frame != NULL); + } + UNLOCK(&frame->lock); + if (unwind) /*It definitely did post-op*/ + afr_zero_fill_stat(local); + } + + if (local->transaction.do_eager_unlock) { + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + lock->acquired = _gf_false; + lock->release = _gf_false; + list_splice_init(&lock->frozen, &lock->waiting); + if (list_empty(&lock->waiting)) + goto unlock; + lock_local = list_entry(lock->waiting.next, afr_local_t, + transaction.wait_list); + list_del_init(&lock_local->transaction.wait_list); + list_add(&lock_local->transaction.owner_list, &lock->owners); } + unlock: + UNLOCK(&local->inode->lock); + } + if (lock_local) { + afr_lock(lock_local->transaction.frame, + lock_local->transaction.frame->this); + } + local->transaction.unwind(frame, this); + + GF_ASSERT(list_empty(&local->transaction.owner_list)); + GF_ASSERT(list_empty(&local->transaction.wait_list)); + AFR_STACK_DESTROY(frame); + + return 0; } -void -_set_all_child_errno (int *child_errno, unsigned int child_count) +static void +afr_lock_fail_shared(afr_local_t *local, struct list_head *list) { - int i = 0; - - for (i = 0; i < child_count; i++) - if (child_errno[i] == 0) - child_errno[i] = ENOTCONN; + afr_local_t *each = NULL; + + while (!list_empty(list)) { + each = list_entry(list->next, afr_local_t, transaction.wait_list); + list_del_init(&each->transaction.wait_list); + each->op_ret = -1; + each->op_errno = local->op_errno; + afr_transaction_done(each->transaction.frame, + each->transaction.frame->this); + } } -void -afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) +static void +afr_handle_lock_acquire_failure(afr_local_t *local) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; + struct list_head shared; + afr_lock_t *lock = NULL; - local = frame->local; - priv = this->private; - fd = local->fd; + if (!local->transaction.eager_lock_on) + goto out; - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); + lock = &local->inode_ctx->lock[local->transaction.type]; - _set_all_child_errno (local->child_errno, priv->child_count); + INIT_LIST_HEAD(&shared); + LOCK(&local->inode->lock); + { + lock->release = _gf_true; + list_splice_init(&lock->waiting, &shared); + } + UNLOCK(&local->inode->lock); - /* Perform fops with the lk-owner from top xlator. - * Eg: lk-owner of posix-lk and flush should be same, - * flush cant clear the posix-lks without that lk-owner. - */ - afr_save_lk_owner (frame); - frame->root->lk_owner = - local->transaction.main_frame->root->lk_owner; - - - /* The wake up needs to happen independent of - what type of fop arrives here. If it was - a write, then it has already inherited the - lock and changelog. If it was not a write, - then the presumption of the optimization (of - optimizing for successive write operations) - fails. - */ - if (fd) - afr_delayed_changelog_wake_up (this, fd); - local->transaction.fop (frame, this); + afr_lock_fail_shared(local, &shared); + local->transaction.do_eager_unlock = _gf_true; +out: + local->internal_lock.lock_cbk = afr_transaction_done; + afr_unlock(local->transaction.frame, local->transaction.frame->this); } - -static int -__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +call_frame_t * +afr_transaction_detach_fop_frame(call_frame_t *frame) { - int ret = 0; + afr_local_t *local = NULL; + call_frame_t *fop_frame = NULL; - switch (type) { - case AFR_DATA_TRANSACTION: - if (priv->data_change_log) - ret = 1; + local = frame->local; - break; + afr_handle_inconsistent_fop(frame, &local->op_ret, &local->op_errno); + LOCK(&frame->lock); + { + fop_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK(&frame->lock); - case AFR_METADATA_TRANSACTION: - if (priv->metadata_change_log) - ret = 1; + return fop_frame; +} - break; +static void +afr_save_lk_owner(call_frame_t *frame) +{ + afr_local_t *local = NULL; - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - if (priv->entry_change_log) - ret = 1; + local = frame->local; - break; - } - - return ret; + local->saved_lk_owner = frame->root->lk_owner; } - -static int -__fop_changelog_needed (call_frame_t *frame, xlator_t *this) +static void +afr_restore_lk_owner(call_frame_t *frame) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int op_ret = 0; - afr_transaction_type type = -1; - - priv = this->private; - local = frame->local; - type = local->transaction.type; + afr_local_t *local = NULL; - if (__changelog_enabled (priv, type)) { - switch (local->op) { + local = frame->local; - case GF_FOP_WRITE: - case GF_FOP_FTRUNCATE: - op_ret = 1; - break; + frame->root->lk_owner = local->saved_lk_owner; +} - case GF_FOP_FLUSH: - op_ret = 0; - break; +void +__mark_all_success(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i; - default: - op_ret = 1; - } - } + local = frame->local; + priv = this->private; - return op_ret; + for (i = 0; i < priv->child_count; i++) { + local->transaction.failed_subvols[i] = 0; + } } -int -afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending, - int child, afr_xattrop_type_t op) +void +afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this) { - int i = 0; - int ret = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_transaction_type type = -1; + dict_t *xdata = NULL; + int **matrix = NULL; + int idx = -1; + int i = 0; + int j = 0; + + priv = this->private; + local = frame->local; + type = local->transaction.type; + idx = afr_index_for_transaction_type(type); + matrix = ALLOC_MATRIX(priv->child_count, int); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.changelog_xdata[i]) + continue; + xdata = local->transaction.changelog_xdata[i]; + afr_selfheal_fill_matrix(this, matrix, i, idx, xdata); + } + + memset(local->transaction.pre_op_sources, 1, priv->child_count); + + /*If lock or pre-op failed on a brick, it is not a source. */ + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->transaction.pre_op_sources[i] = 0; + } + + /* If brick is blamed by others, it is not a source. */ + for (i = 0; i < priv->child_count; i++) + for (j = 0; j < priv->child_count; j++) + if (matrix[i][j] != 0) + local->transaction.pre_op_sources[j] = 0; +} - if (op == LOCAL_FIRST) { - ret = dict_set_static_bin (xattr, priv->pending_key[child], - pending[child], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret) - goto out; - } - for (i = 0; i < priv->child_count; i++) { - if (i == child) - continue; - ret = dict_set_static_bin (xattr, priv->pending_key[i], - pending[i], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - /* 3 = data+metadata+entry */ +void +afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_sources_count = 0; + int i = 0; + + priv = this->private; + local = frame->local; + + afr_compute_pre_op_sources(frame, this); + pre_op_sources_count = AFR_COUNT(local->transaction.pre_op_sources, + priv->child_count); + + /* If arbiter is the only source, do not proceed. */ + if (pre_op_sources_count < 2 && + local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + for (i = 0; i < priv->child_count; i++) + local->transaction.failed_subvols[i] = 1; + } + + afr_transaction_fop(frame, this); + + return; +} - if (ret < 0) - goto out; +int +afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int ret = 0; + int failure_count = 0; + struct list_head shared; + afr_lock_t *lock = NULL; + + local = frame->local; + priv = this->private; + + INIT_LIST_HEAD(&shared); + if (local->transaction.type == AFR_DATA_TRANSACTION && + !local->transaction.inherited) { + ret = afr_write_subvol_set(frame, this); + if (ret) { + /*act as if operation failed on all subvols*/ + local->op_ret = -1; + local->op_errno = -ret; + for (i = 0; i < priv->child_count; i++) + local->transaction.failed_subvols[i] = 1; } - if (op == LOCAL_LAST) { - ret = dict_set_static_bin (xattr, priv->pending_key[child], - pending[child], - AFR_NUM_CHANGE_LOGS * sizeof (int32_t)); - if (ret) - goto out; + } + + if (local->pre_op_compat) + /* old mode, pre-op was done as afr_changelog_do() + just now, before OP */ + afr_changelog_pre_op_update(frame, this); + + if (!local->transaction.eager_lock_on || local->transaction.inherited) + goto fop; + failure_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); + if (failure_count == priv->child_count) { + afr_handle_lock_acquire_failure(local); + return 0; + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + lock->acquired = _gf_true; + __afr_transaction_wake_shared(local, &shared); } -out: - return ret; + UNLOCK(&local->inode->lock); + } + +fop: + /* Perform fops with the lk-owner from top xlator. + * Eg: lk-owner of posix-lk and flush should be same, + * flush cant clear the posix-lks without that lk-owner. + */ + afr_save_lk_owner(frame); + frame->root->lk_owner = local->transaction.main_frame->root->lk_owner; + + if (priv->arbiter_count == 1) { + afr_txn_arbitrate_fop(frame, this); + } else { + afr_transaction_fop(frame, this); + } + + afr_lock_resume_shared(&shared); + return 0; } int -afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending) { - int ret = 0; + int i = 0; + int ret = 0; - switch (type) { - case AFR_DATA_TRANSACTION: - ret = priv->child_count; - break; + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_static_bin(xattr, priv->pending_key[i], pending[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + /* 3 = data+metadata+entry */ - case AFR_METADATA_TRANSACTION: - ret = priv->child_count; - break; + if (ret) + break; + } - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - ret = priv->child_count; - break; - } - - return ret; + return ret; } -/* {{{ pending */ - -int32_t -afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) +static void +afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int call_count = -1; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - call_count = --local->call_count; + afr_private_t *priv = this->private; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + gf_boolean_t release = _gf_false; + + LOCK(&priv->lock); + { + /*Once we get notify lock release upcall notification, + if any of the fop state counters are non-zero, we will + not release the lock. + */ + onwire_count = priv->ta_on_wire_txn_count; + inmem_count = priv->ta_in_mem_txn_count; + switch (fop_state) { + case TA_GET_INFO_FROM_TA_FILE: + onwire_count = --priv->ta_on_wire_txn_count; + break; + case TA_INFO_IN_MEMORY_SUCCESS: + case TA_INFO_IN_MEMORY_FAILED: + inmem_count = --priv->ta_in_mem_txn_count; + break; + case TA_WAIT_FOR_NOTIFY_LOCK_REL: + GF_ASSERT(0); + break; + case TA_SUCCESS: + break; } - UNLOCK (&frame->lock); - - if (call_count == 0) { - if (local->transaction.resume_stub) { - call_resume (local->transaction.resume_stub); - local->transaction.resume_stub = NULL; - } + release = priv->release_ta_notify_dom_lock; + } + UNLOCK(&priv->lock); - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } - } + if (inmem_count != 0 || release == _gf_false || onwire_count != 0) + return; - return 0; + afr_ta_lock_release_synctask(this); } - -void -afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this, - inode_t *inode, afr_transaction_type type) -{ - int i = -1; - int count = 0; - int read_child = -1; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int **pending = NULL; - int idx = 0; - int32_t *stale_children = NULL; - int32_t *fresh_children = NULL; - gf_boolean_t rm_stale_children = _gf_false; - - idx = afr_index_for_transaction_type (type); - - priv = this->private; - local = frame->local; - pending = local->pending; - - if (local->op_ret < 0) - goto out; - fresh_children = local->fresh_children; - read_child = afr_inode_get_read_ctx (this, inode, fresh_children); - if (read_child < 0) { - gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain " - "for %s", uuid_utoa (inode->gfid)); - goto out; +static void +afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *entry = NULL; + int bad_child = AFR_CHILD_UNKNOWN; + + struct list_head onwireq = { + 0, + }; + INIT_LIST_HEAD(&onwireq); + + LOCK(&priv->lock); + { + bad_child = priv->ta_bad_child_index; + if (bad_child == AFR_CHILD_UNKNOWN) { + /*The previous on-wire ta_post_op was a failure. Just dequeue + *one element to wind on-wire again. */ + entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq); + list_del_init(&entry->ta_onwireq); + } else { + /* Prepare to process all fops based on bad_child_index. */ + list_splice_init(&priv->ta_onwireq, &onwireq); } + } + UNLOCK(&priv->lock); - for (i = 0; i < priv->child_count; i++) { - if (!afr_is_child_present (fresh_children, - priv->child_count, i)) - continue; - if (pending[i][idx]) - continue; - /* child is down or op failed on it */ - if (!stale_children) - stale_children = afr_children_create (priv->child_count); - if (!stale_children) - goto out; - - rm_stale_children = _gf_true; - stale_children[count++] = i; - gf_log (this->name, GF_LOG_DEBUG, "Removing stale child " - "%d for %s", i, uuid_utoa (inode->gfid)); + if (entry) { + afr_ta_post_op_synctask(this, entry); + return; + } else { + while (!list_empty(&onwireq)) { + entry = list_entry(onwireq.next, afr_local_t, ta_onwireq); + list_del_init(&entry->ta_onwireq); + if (entry->ta_failed_subvol == bad_child) { + afr_post_op_handle_success(entry->transaction.frame, this); + } else { + afr_post_op_handle_failure(entry->transaction.frame, this, EIO); + } } + } +} - if (!rm_stale_children) - goto out; - - afr_inode_rm_stale_children (this, inode, stale_children); -out: - GF_FREE (stale_children); - return; +int +afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + int_lock = &local->internal_lock; + + if (priv->thin_arbiter_count) { + /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */ + afr_ta_dom_lock_check_and_release(local->fop_state, this); + } + + /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */ + if (!afr_changelog_has_quorum(local, this)) { + local->op_ret = -1; + /*local->op_errno is already captured in changelog cbk*/ + } + + if (local->transaction.resume_stub) { + call_resume(local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } + + int_lock->lock_cbk = afr_transaction_done; + afr_unlock(frame, this); + + return 0; } -afr_inodelk_t* -afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +static void +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno) { - afr_inodelk_t *inodelk = NULL; - int i = 0; + afr_local_t *local = frame->local; + local->op_ret = -1; + local->op_errno = op_errno; - for (i = 0; int_lock->inodelk[i].domain; i++) { - inodelk = &int_lock->inodelk[i]; - if (strcmp (dom, inodelk->domain) == 0) - return inodelk; - } - return NULL; + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB, + "Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op], + uuid_utoa(local->inode->gfid), local->fop_state); + + afr_changelog_post_op_done(frame, this); } -unsigned char* -afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) +unsigned char * +afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock) { - unsigned char *locked_nodes = NULL; - afr_inodelk_t *inodelk = NULL; - switch (type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - locked_nodes = inodelk->locked_nodes; - break; - - case AFR_ENTRY_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: - /*Because same set of subvols participate in all lockee - * entities*/ - locked_nodes = int_lock->lockee[0].locked_nodes; - break; - } - return locked_nodes; + /*Because same set of subvols participate in all lockee + * entities*/ + return int_lock->lockee[0].locked_nodes; } int -afr_changelog_pre_op_call_count (afr_transaction_type type, - afr_internal_lock_t *int_lock, - unsigned int child_count) +afr_changelog_call_count(afr_transaction_type type, + unsigned char *pre_op_subvols, + unsigned char *failed_subvols, + unsigned int child_count) { - int call_count = 0; - unsigned char *locked_nodes = NULL; + int i = 0; + int call_count = 0; - locked_nodes = afr_locked_nodes_get (type, int_lock); - GF_ASSERT (locked_nodes); + for (i = 0; i < child_count; i++) { + if (pre_op_subvols[i] && !failed_subvols[i]) { + call_count++; + } + } - call_count = afr_locked_children_count (locked_nodes, child_count); - if (type == AFR_ENTRY_RENAME_TRANSACTION) - call_count *= 2; + if (type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; - return call_count; + return call_count; } -int -afr_changelog_post_op_call_count (afr_transaction_type type, - unsigned char *pre_op, - unsigned int child_count) +gf_boolean_t +afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this) { - int call_count = 0; - - call_count = afr_pre_op_done_children_count (pre_op, child_count); - if (type == AFR_ENTRY_RENAME_TRANSACTION) - call_count *= 2; - - return call_count; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + if (priv->thin_arbiter_count) { + /* We need to perform post-op even if 1 data brick was down + * before the txn started.*/ + if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count)) + return _gf_false; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + local->transaction.failed_subvols[i]) + return _gf_false; + } + + return _gf_true; } void -afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv) +afr_handle_symmetric_errors(call_frame_t *frame, xlator_t *this) { - int i = 0; - int index = 0; - int32_t postop = 0; - int32_t preop = 1; - int32_t **txn_changelog = NULL; + if (afr_is_symmetric_error(frame, this)) + __mark_all_success(frame, this); +} - txn_changelog = local->transaction.txn_changelog; - index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { - postop = ntoh32 (local->pending[i][index]); - txn_changelog[i][index] = hton32 (postop + preop); +gf_boolean_t +afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame) +{ + unsigned int quorum_count = 0; + afr_private_t *priv = NULL; + unsigned int up_children_count = 0; + + priv = this->private; + up_children_count = AFR_COUNT(subvols, priv->child_count); + + if (afr_lookup_has_quorum(frame, up_children_count)) + return _gf_true; + + if (priv->quorum_count == AFR_QUORUM_AUTO) { + /* + * Special case for auto-quorum with an even number of nodes. + * + * A replica set with even count N can only handle the same + * number of failures as odd N-1 before losing "vanilla" + * quorum, and the probability of more simultaneous failures is + * actually higher. For example, with a 1% chance of failure + * we'd have a 0.03% chance of two simultaneous failures with + * N=3 but a 0.06% chance with N=4. However, the special case + * is necessary for N=2 because there's no real quorum in that + * case (i.e. can't normally survive *any* failures). In that + * case, we treat the first node as a tie-breaker, allowing + * quorum to be retained in some cases while still honoring the + * all-important constraint that there can not simultaneously + * be two partitioned sets of nodes each believing they have + * quorum. Of two equally sized sets, the one without that + * first node will lose. + * + * It turns out that the special case is beneficial for higher + * values of N as well. Continuing the example above, the + * probability of losing quorum with N=4 and this type of + * quorum is (very) slightly lower than with N=3 and vanilla + * quorum. The difference becomes even more pronounced with + * higher N. Therefore, even though such replica counts are + * unlikely to be seen in practice, we might as well use the + * "special" quorum then as well. + */ + if ((up_children_count * 2) == priv->child_count) { + return subvols[0]; } + } + + if (priv->quorum_count == AFR_QUORUM_AUTO) { + quorum_count = priv->child_count / 2 + 1; + } else { + quorum_count = priv->quorum_count; + } + + if (up_children_count >= quorum_count) + return _gf_true; + + return _gf_false; } -afr_xattrop_type_t -afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child, - afr_transaction_type type) +static gf_boolean_t +afr_has_fop_quorum(call_frame_t *frame) { - int index = 0; - afr_xattrop_type_t op = LOCAL_LAST; + xlator_t *this = frame->this; + afr_local_t *local = frame->local; + unsigned char *locked_nodes = NULL; - index = afr_index_for_transaction_type (type); - if (optimized && !pending[child][index]) - op = LOCAL_FIRST; - return op; + locked_nodes = afr_locked_nodes_get(local->transaction.type, + &local->internal_lock); + return afr_has_quorum(locked_nodes, this, NULL); } -void -afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, - int optimized, int child) -{ - int32_t **txn_changelog = NULL; - int32_t **changelog = NULL; - afr_private_t *priv = NULL; - int ret = 0; - afr_xattrop_type_t op = LOCAL_LAST; - - priv = this->private; - txn_changelog = local->transaction.txn_changelog; - op = afr_get_postop_xattrop_type (local->pending, optimized, child, - local->transaction.type); - if (optimized) - changelog = txn_changelog; - else - changelog = local->pending; - ret = afr_set_pending_dict (priv, xattr, changelog, child, op); - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); +static gf_boolean_t +afr_has_fop_cbk_quorum(call_frame_t *frame) +{ + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + unsigned char *success = alloca0(priv->child_count); + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) + if (!local->transaction.failed_subvols[i]) + success[i] = 1; + } + + return afr_has_quorum(success, this, NULL); } - gf_boolean_t -afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +afr_need_dirty_marking(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int index = -1; - int i = 0; + afr_private_t *priv = this->private; + afr_local_t *local = NULL; + gf_boolean_t need_dirty = _gf_false; - local = frame->local; - priv = this->private; + local = frame->local; - index = afr_index_for_transaction_type (local->transaction.type); + if (!priv->quorum_count || !local->optimistic_change_log) + return _gf_false; - for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) - return _gf_false; - } + if (local->transaction.type == AFR_DATA_TRANSACTION || + local->transaction.type == AFR_METADATA_TRANSACTION) + return _gf_false; - return _gf_true; + if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == + priv->child_count) + return _gf_false; + + if (!afr_has_fop_cbk_quorum(frame)) + need_dirty = _gf_true; + + return need_dirty; } -static void -afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) +void +afr_handle_quorum(call_frame_t *frame, xlator_t *this) { - xlator_t *this = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + const char *file = NULL; + uuid_t gfid = {0}; - this = frame->this; - local = frame->local; - priv = this->private; + local = frame->local; + priv = frame->this->private; - if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && - (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) - return; + if (priv->quorum_count == 0) + return; - if (local->op_ret >= 0) - goto out; + /* If the fop already failed return right away to preserve errno */ + if (local->op_ret == -1) + return; - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); -out: + /* + * Network split may happen just after the fops are unwound, so check + * if the fop succeeded in a way it still follows quorum. If it doesn't, + * mark the fop as failure, mark the changelogs so it reflects that + * failure. + * + * Scenario: + * There are 3 mounts on 3 machines(node1, node2, node3) all writing to + * single file. Network split happened in a way that node1 can't see + * node2, node3. Node2, node3 both of them can't see node1. Now at the + * time of sending write all the bricks are up. Just after write fop is + * wound on node1, network split happens. Node1 thinks write fop failed + * on node2, node3 so marks pending changelog for those 2 extended + * attributes on node1. Node2, node3 thinks writes failed on node1 so + * they mark pending changelog for node1. When the network is stable + * again the file already is in split-brain. These checks prevent + * marking pending changelog on other subvolumes if the fop doesn't + * succeed in a way it is still following quorum. So with this fix what + * is happening is, node1 will have all pending changelog(FOOL) because + * the write succeeded only on node1 but failed on node2, node3 so + * instead of marking pending changelogs on node2, node3 it just treats + * the fop as failure and goes into DIRTY state. Where as node2, node3 + * say they are sources and have pending changelog to node1 so there is + * no split-brain with the fix. The problem is eliminated completely. + */ + + if (afr_has_fop_cbk_quorum(frame)) return; + + if (afr_need_dirty_marking(frame, this)) + goto set_response; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) + afr_transaction_fop_failed(frame, frame->this, i); + } + +set_response: + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + if (local->op_errno == 0) + local->op_errno = afr_quorum_errno(priv); + + if (local->fd) { + gf_uuid_copy(gfid, local->fd->inode->gfid); + file = uuid_utoa(gfid); + } else { + loc_path(&local->loc, local->loc.name); + file = local->loc.path; + } + + gf_msg(frame->this->name, GF_LOG_WARNING, local->op_errno, + AFR_MSG_QUORUM_FAIL, "%s: Failing %s as quorum is not met", file, + gf_fop_list[local->op]); + + switch (local->transaction.type) { + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + afr_pick_error_xdata(local, priv, local->parent, local->readable, + local->parent2, local->readable2); + break; + default: + afr_pick_error_xdata(local, priv, local->inode, local->readable, + NULL, NULL); + break; + } } -static void -afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) +int +afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop) { - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - gf_boolean_t all_quota_failures = _gf_false; + afr_private_t *priv = NULL; + + priv = this->private; + loc->parent = inode_ref(priv->root_inode); + gf_uuid_copy(loc->pargfid, loc->parent->gfid); + loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX]; + if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) { + /* Except afr_ta_id_file_check() which is path based, all other gluster + * FOPS need gfid.*/ + return -EINVAL; + } + gf_uuid_copy(loc->gfid, priv->ta_gfid); + loc->inode = inode_new(loc->parent->table); + if (!loc->inode) { + loc_wipe(loc); + return -ENOMEM; + } + return 0; +} - local = frame->local; - priv = this->private; - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; - /* - * Idea is to not leave the file in FOOL-FOOL scenario in case on - * all the bricks data transaction failed with EDQUOT to avoid - * increasing un-necessary load of self-heals in the system. - */ - all_quota_failures = _gf_true; - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i] && - (local->child_errno[i] != EDQUOT)) { - all_quota_failures = _gf_false; - break; - } - } - if (all_quota_failures) - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); +static int +afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque) +{ + xlator_t *this = NULL; + afr_local_t *local = NULL; + call_frame_t *txn_frame = NULL; + afr_ta_fop_state_t fop_state; + + local = (afr_local_t *)opaque; + fop_state = local->fop_state; + txn_frame = local->transaction.frame; + this = frame->this; + + if (ret == 0) { + /*Mark pending xattrs on the up data brick.*/ + afr_post_op_handle_success(txn_frame, this); + } else { + afr_post_op_handle_failure(txn_frame, this, -ret); + } + + STACK_DESTROY(frame->root); + afr_ta_process_onwireq(fop_state, this); + + return 0; } -int -afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) +int ** +afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending, + dict_t *xattr, afr_local_t *local) { - afr_private_t * priv = this->private; - afr_internal_lock_t *int_lock = NULL; - int i = 0; - int call_count = 0; + int **changelog = NULL; + int idx = 0; + int ret = 0; + int i; + + if (local->is_new_entry == _gf_true) { + changelog = afr_mark_pending_changelog(priv, pending, xattr, + local->cont.dir_fop.buf.ia_type); + } else { + idx = afr_index_for_transaction_type(local->transaction.type); + changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) { + goto out; + } + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + changelog[i][idx] = hton32(1); + } + ret = afr_set_pending_dict(priv, xattr, changelog); + if (ret < 0) { + afr_matrix_cleanup(changelog, priv->child_count); + return NULL; + } + } - afr_local_t * local = NULL; - afr_fd_ctx_t *fdctx = NULL; - dict_t **xattr = NULL; - int piggyback = 0; - int nothing_failed = 1; +out: + return changelog; +} - local = frame->local; - int_lock = &local->internal_lock; +static void +afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local, + gf_boolean_t *valid) +{ + if (priv->ta_event_gen > local->ta_event_gen) { + /* We can't trust the ta's response anymore.*/ + afr_ta_locked_priv_invalidate(priv); + *valid = _gf_false; + return; + } + return; +} - __mark_non_participant_children (local->pending, priv->child_count, - local->transaction.pre_op, - local->transaction.type); +static int +afr_ta_post_op_do(void *opaque) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t *this = NULL; + dict_t *xattr = NULL; + unsigned char *pending = NULL; + int **changelog = NULL; + int failed_subvol = -1; + int success_subvol = -1; + loc_t loc = { + 0, + }; + int i = 0; + int ret = 0; + gf_boolean_t valid = _gf_true; + + local = (afr_local_t *)opaque; + this = local->transaction.frame->this; + priv = this->private; + + ret = afr_fill_ta_loc(this, &loc, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate loc for thin-arbiter."); + goto out; + } + + xattr = dict_new(); + if (!xattr) { + ret = -ENOMEM; + goto out; + } + + pending = alloca0(priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) { + pending[i] = 1; + failed_subvol = i; + } else { + success_subvol = i; + } + } + + changelog = afr_set_changelog_xattr(priv, pending, xattr, local); + + if (!changelog) { + ret = -ENOMEM; + goto out; + } + + ret = afr_ta_post_op_lock(this, &loc); + if (ret) + goto out; + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s failed for gfid %s.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + uuid_utoa(local->inode->gfid)); + } + LOCK(&priv->lock); + { + if (ret == 0) { + priv->ta_bad_child_index = failed_subvol; + } else if (ret == -EINVAL) { + priv->ta_bad_child_index = success_subvol; + ret = -EIO; /* TA failed the fop. Return EIO to application. */ + } - afr_data_handle_quota_errors (frame, this); - afr_dir_fop_handle_all_fop_failures (frame); + afr_ta_locked_xattrop_validate(priv, local, &valid); + } + UNLOCK(&priv->lock); + if (valid == _gf_false) { + gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s for gfid %s invalidated due " + "to event-gen mismatch.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + uuid_utoa(local->inode->gfid)); + ret = -EIO; + } + + afr_ta_post_op_unlock(this, &loc); +out: + if (xattr) + dict_unref(xattr); - if (local->fd) - afr_transaction_rm_stale_children (frame, this, - local->fd->inode, - local->transaction.type); + if (changelog) + afr_matrix_cleanup(changelog, priv->child_count); - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); - for (i = 0; i < priv->child_count; i++) { - xattr[i] = dict_new (); - } + loc_wipe(&loc); - call_count = afr_changelog_post_op_call_count (local->transaction.type, - local->transaction.pre_op, - priv->child_count); - local->call_count = call_count; + return ret; +} - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local) +{ + call_frame_t *ta_frame = NULL; + int ret = 0; + + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + goto err; + } + ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done, + ta_frame, local); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to launch post-op on thin arbiter for gfid %s", + uuid_utoa(local->inode->gfid)); + STACK_DESTROY(ta_frame->root); + goto err; + } + + return ret; +err: + afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM); + return ret; +} - if (call_count == 0) { - /* no child is up */ - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - goto out; +static void +afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local, + int *on_wire_count) +{ + LOCK(&priv->lock); + { + if (priv->release_ta_notify_dom_lock == _gf_true) { + /* Put the fop in waitq until notify dom lock is released.*/ + local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL; + list_add_tail(&local->ta_waitq, &priv->ta_waitq); + } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) { + /* Post-op on thin-arbiter to decide success/failure. */ + local->fop_state = TA_GET_INFO_FROM_TA_FILE; + *on_wire_count = ++priv->ta_on_wire_txn_count; + if (*on_wire_count > 1) { + /*Avoid sending multiple on-wire post-ops on TA*/ + list_add_tail(&local->ta_onwireq, &priv->ta_onwireq); + } + } else if (local->ta_failed_subvol == priv->ta_bad_child_index) { + /* Post-op on TA not needed as the fop failed on the in-memory bad + * brick. Just mark pending xattrs on the good data brick.*/ + local->fop_state = TA_INFO_IN_MEMORY_SUCCESS; + priv->ta_in_mem_txn_count++; + } else { + /* Post-op on TA not needed as the fop succeeded only on the + * in-memory bad data brick and not the good one. Fail the fop.*/ + local->fop_state = TA_INFO_IN_MEMORY_FAILED; + priv->ta_in_mem_txn_count++; } + } + UNLOCK(&priv->lock); +} - nothing_failed = afr_txn_nothing_failed (frame, this); +static void +afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local) +{ + int i = 0; - afr_compute_txn_changelog (local , priv); + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) { + local->ta_failed_subvol = i; + break; + } + } +} - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; +static void +afr_post_op_handle_success(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; - if (local->transaction.type != AFR_DATA_TRANSACTION) - afr_set_postop_dict (local, this, xattr[i], - local->optimistic_change_log, i); - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - afr_set_postop_dict (local, this, xattr[i], - 0, i); - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - break; - } - - /* local->transaction.postop_piggybacked[] was - precomputed in is_piggyback_postop() when called from - afr_changelog_post_op_safe() - */ - - piggyback = 0; - if (local->transaction.postop_piggybacked[i]) - piggyback = 1; - - afr_set_postop_dict (local, this, xattr[i], - piggyback, i); - - if (nothing_failed && piggyback) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], NULL); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - } - break; - case AFR_METADATA_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } - - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; + local = frame->local; + if (local->is_new_entry == _gf_true) { + afr_mark_new_entry_changelog(frame, this); + } + afr_changelog_post_op_do(frame, this); - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - call_count--; - } + return; +} - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ +static void +afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno) +{ + afr_changelog_post_op_fail(frame, this, op_errno); - afr_set_postop_dict (local, this, xattr[i], - local->optimistic_change_log, i); + return; +} - /* fall through */ +static void +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int on_wire_count = 0; + + priv = this->private; + local = frame->local; + + afr_ta_set_fop_state(priv, local, &on_wire_count); + + switch (local->fop_state) { + case TA_GET_INFO_FROM_TA_FILE: + if (on_wire_count == 1) + afr_ta_post_op_synctask(this, local); + /*else, fop is queued in ta_onwireq.*/ + break; + case TA_WAIT_FOR_NOTIFY_LOCK_REL: + /*Post releasing the notify lock, we will act on this queue*/ + break; + case TA_INFO_IN_MEMORY_SUCCESS: + afr_post_op_handle_success(frame, this); + break; + case TA_INFO_IN_MEMORY_FAILED: + afr_post_op_handle_failure(frame, this, EIO); + break; + default: + break; + } + return; +} - case AFR_ENTRY_TRANSACTION: - { - if (nothing_failed && local->optimistic_change_log) { - afr_changelog_post_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } - - if (local->fd) - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND (frame, afr_changelog_post_op_cbk, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - } +static void +afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + + afr_ta_fill_failed_subvol(priv, local); + gf_msg_debug(this->name, 0, + "Fop failed on data brick (%s) for gfid=%s. " + "ta info needed to decide fop result.", + priv->children[local->ta_failed_subvol]->name, + uuid_utoa(local->inode->gfid)); + afr_ta_decide_post_op_state(frame, this); +} - if (!--call_count) - break; +void +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = NULL; + dict_t *xattr = NULL; + int i = 0; + int ret = 0; + int idx = 0; + int nothing_failed = 1; + gf_boolean_t need_undirty = _gf_false; + + afr_handle_quorum(frame, this); + local = frame->local; + idx = afr_index_for_transaction_type(local->transaction.type); + + xattr = dict_new(); + if (!xattr) { + afr_changelog_post_op_fail(frame, this, ENOMEM); + goto out; + } + + nothing_failed = afr_txn_nothing_failed(frame, this); + + if (afr_changelog_pre_op_uninherit(frame, this)) + need_undirty = _gf_false; + else + need_undirty = _gf_true; + + if (local->op_ret < 0 && !nothing_failed) { + if (afr_need_dirty_marking(frame, this)) { + local->dirty[idx] = hton32(1); + goto set_dirty; } + afr_changelog_post_op_done(frame, this); + goto out; + } + + if (nothing_failed && !need_undirty) { + afr_changelog_post_op_done(frame, this); + goto out; + } + + if (local->transaction.in_flight_sb) { + afr_changelog_post_op_fail(frame, this, + local->transaction.in_flight_sb_errno); + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) + local->pending[i][idx] = hton32(1); + } + + ret = afr_set_pending_dict(priv, xattr, local->pending); + if (ret < 0) { + afr_changelog_post_op_fail(frame, this, ENOMEM); + goto out; + } + + if (need_undirty) + local->dirty[idx] = hton32(-1); + else + local->dirty[idx] = hton32(0); + +set_dirty: + ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + afr_changelog_post_op_fail(frame, this, ENOMEM); + goto out; + } + + afr_changelog_do(frame, this, xattr, afr_changelog_post_op_done, + AFR_TRANSACTION_POST_OP); out: - for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); - } + if (xattr) + dict_unref(xattr); - return 0; + return; } - -int32_t -afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xattr, - dict_t *xdata) +static int +afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = this->private; - int call_count = -1; - int child_index = (long) cookie; - - local = frame->local; - - LOCK (&frame->lock); - { - switch (op_ret) { - case 0: - __mark_pre_op_done_on_fd (frame, this, child_index); - //fallthrough we need to mark the pre_op - case 1: - local->transaction.pre_op[child_index] = 1; - /* special op_ret for piggyback */ - break; - case -1: - if (op_errno == ENOTSUP) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop not supported by %s", - priv->children[child_index]->name); - local->op_ret = -1; - - } else if (!child_went_down (op_ret, op_errno)) { - gf_log (this->name, GF_LOG_ERROR, - "xattrop failed on child %s: %s", - priv->children[child_index]->name, - strerror (op_errno)); - } - local->op_errno = op_errno; - break; - } - - call_count = --local->call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - if ((local->op_ret == -1) && - (local->op_errno == ENOTSUP)) { - local->transaction.resume (frame, this); - } else { - afr_transaction_perform_fop (frame, this); - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int failed_count = 0; + + priv = this->private; + local = frame->local; + + if (priv->thin_arbiter_count) { + failed_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); + if (failed_count == 1) { + afr_handle_failure_using_thin_arbiter(frame, this); + return 0; + } else { + /* Txn either succeeded or failed on both data bricks. Let + * post_op_do handle it as the case might be. */ } + } - return 0; + afr_changelog_post_op_do(frame, this); + return 0; } -int -afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this) { - afr_private_t * priv = this->private; - int i = 0; - int ret = 0; - int call_count = 0; - dict_t **xattr = NULL; - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *local = NULL; - int piggyback = 0; - afr_internal_lock_t *int_lock = NULL; - unsigned char *locked_nodes = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_inode_ctx_t *ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; + + local = frame->local; + priv = this->private; + ctx = local->inode_ctx; + + type = afr_index_for_transaction_type(local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; + + if (local->transaction.no_uninherit) + return _gf_false; - local = frame->local; - int_lock = &local->internal_lock; + /* This function must be idempotent. So check if we + were called before and return the same answer again. - xattr = alloca (priv->child_count * sizeof (*xattr)); - memset (xattr, 0, (priv->child_count * sizeof (*xattr))); + It is important to keep this function idempotent for + the call in afr_changelog_post_op_safe() to not have + side effects on the call from afr_changelog_post_op_now() + */ + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + LOCK(&local->inode->lock); + { for (i = 0; i < priv->child_count; i++) { - xattr[i] = dict_new (); + if (local->transaction.pre_op[i] != ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; + } } - call_count = afr_changelog_pre_op_call_count (local->transaction.type, - int_lock, - priv->child_count); - if (call_count == 0) { - local->internal_lock.lock_cbk = - local->transaction.done; - afr_unlock (frame, this); - goto out; + if (ctx->inherited[type]) { + ret = _gf_true; + ctx->inherited[type]--; + } else if (ctx->on_disk[type]) { + ret = _gf_false; + ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; } - local->call_count = call_count; - - __mark_all_pending (local->pending, priv->child_count, - local->transaction.type); - - if (local->fd) - fdctx = afr_fd_ctx_get (local->fd, this); - - locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock); - for (i = 0; i < priv->child_count; i++) { - if (!locked_nodes[i]) - continue; - ret = afr_set_pending_dict (priv, xattr[i], local->pending, - i, LOCAL_FIRST); - - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); - - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - { - if (!fdctx) { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - break; - } - - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_done[i]) { - fdctx->pre_op_piggyback[i]++; - piggyback = 1; - fdctx->hit++; - } else { - fdctx->miss++; - } - } - UNLOCK (&local->fd->lock); - - afr_set_delayed_post_op (frame, this); - - if (piggyback) - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - case AFR_METADATA_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } - - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &(local->loc), - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - } else { - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.new_parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - - call_count--; - } + if (!ctx->inherited[type] && !ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + ctx->pre_op_done[type][i] = 0; + } + } +unlock: + UNLOCK(&local->inode->lock); + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; - /* - set it again because previous stack_wind - might have already returned (think of case - where subvolume is posix) and would have - used the dict as placeholder for return - value - */ + return ret; +} - ret = afr_set_pending_dict (priv, xattr[i], local->pending, - i, LOCAL_FIRST); +gf_boolean_t +afr_changelog_pre_op_inherit(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; - if (ret < 0) - gf_log (this->name, GF_LOG_INFO, - "failed to set pending entry"); + local = frame->local; + priv = this->private; - /* fall through */ + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; - case AFR_ENTRY_TRANSACTION: - { - if (local->optimistic_change_log) { - afr_changelog_pre_op_cbk (frame, (void *)(long)i, - this, 1, 0, xattr[i], - NULL); - break; - } - - if (local->fd) - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fxattrop, - local->fd, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - else - STACK_WIND_COOKIE (frame, - afr_changelog_pre_op_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->xattrop, - &local->transaction.parent_loc, - GF_XATTROP_ADD_ARRAY, xattr[i], - NULL); - } - break; - } + type = afr_index_for_transaction_type(local->transaction.type); - if (!--call_count) - break; + LOCK(&local->inode->lock); + { + if (!local->inode_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; } -out: + for (i = 0; i < priv->child_count; i++) { - dict_unref (xattr[i]); + if (local->transaction.pre_op[i] != + local->inode_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } } - return 0; -} + local->inode_ctx->inherited[type]++; + ret = _gf_true; -int -afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Blocking inodelks failed."); - local->transaction.done (frame, this); - } else { - - gf_log (this->name, GF_LOG_DEBUG, - "Blocking inodelks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } + local->transaction.inherited = _gf_true; + } +unlock: + UNLOCK(&local->inode->lock); - return 0; + return ret; } - -int -afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +gf_boolean_t +afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; - local = frame->local; - int_lock = &local->internal_lock; + local = frame->local; + priv = this->private; - /* Initiate blocking locks if non-blocking has failed */ - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking inodelks failed. Proceeding to blocking"); - int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; - afr_blocking_lock (frame, this); - } else { + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) + return _gf_false; - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking inodelks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } + if (local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ + return _gf_false; - return 0; -} + if (!local->transaction.dirtied) + return _gf_false; + if (!afr_txn_nothing_failed(frame, this)) + return _gf_false; -int -afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + type = afr_index_for_transaction_type(local->transaction.type); - local = frame->local; - int_lock = &local->internal_lock; + ret = _gf_false; - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Blocking entrylks failed."); - local->transaction.done (frame, this); + LOCK(&local->inode->lock); + { + if (!local->inode_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) + local->inode_ctx->pre_op_done[type][i] = + (!local->transaction.failed_subvols[i]); } else { - - gf_log (this->name, GF_LOG_DEBUG, - "Blocking entrylks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); + for (i = 0; i < priv->child_count; i++) + if (local->inode_ctx->pre_op_done[type][i] != + (!local->transaction.failed_subvols[i])) { + local->transaction.no_uninherit = 1; + goto unlock; + } } + local->inode_ctx->on_disk[type]++; - return 0; -} + ret = _gf_true; + } +unlock: + UNLOCK(&local->inode->lock); + return ret; +} int -afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +afr_changelog_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xattr, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + int call_count = -1; + int child_index = -1; - local = frame->local; - int_lock = &local->internal_lock; + local = frame->local; + child_index = (long)cookie; - /* Initiate blocking locks if non-blocking has failed */ - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks failed. Proceeding to blocking"); - int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; - afr_blocking_lock (frame, this); - } else { + if (op_ret == -1) { + local->op_errno = op_errno; + afr_transaction_fop_failed(frame, this, child_index); + } - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); - } + if (xattr) + local->transaction.changelog_xdata[child_index] = dict_ref(xattr); - return 0; -} + call_count = afr_frame_return(frame); + if (call_count == 0) { + local->transaction.changelog_resume(frame, this); + } -int -afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; + return 0; +} - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Blocking entrylks failed."); - local->transaction.done (frame, this); - } else { +void +afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op, + dict_t **xdata, dict_t **newloc_xdata) +{ + int i = 0; + int ret = 0; + char *key = NULL; + int keylen = 0; + const char *name = NULL; + dict_t *xdata1 = NULL; + dict_t *xdata2 = NULL; + xlator_t *this = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t need_entry_key_set = _gf_true; + + local = frame->local; + this = THIS; + priv = this->private; + + if (local->transaction.type == AFR_DATA_TRANSACTION || + local->transaction.type == AFR_METADATA_TRANSACTION) + goto out; + + if (!priv->esh_granular) + goto out; + + xdata1 = dict_new(); + if (!xdata1) + goto out; + + name = local->loc.name; + if (local->op == GF_FOP_LINK) + name = local->newloc.name; + + switch (op) { + case AFR_TRANSACTION_PRE_OP: + key = GF_XATTROP_ENTRY_IN_KEY; + break; + case AFR_TRANSACTION_POST_OP: + if (afr_txn_nothing_failed(frame, this)) { + key = GF_XATTROP_ENTRY_OUT_KEY; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.failed_subvols[i]) + continue; + need_entry_key_set = _gf_false; + break; + } + /* If the transaction itself did not fail and there + * are no failed subvolumes, check whether the fop + * failed due to a symmetric error. If it did, do + * not set the ENTRY_OUT xattr which would end up + * deleting a name index which was created possibly by + * an earlier entry txn that may have failed on some + * of the sub-volumes. + */ + if (local->op_ret) + need_entry_key_set = _gf_false; + } else { + key = GF_XATTROP_ENTRY_IN_KEY; + } + break; + } + + if (need_entry_key_set) { + keylen = strlen(key); + ret = dict_set_strn(xdata1, key, keylen, (char *)name); + if (ret) + gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "%s/%s: Could not set %s key during xattrop", + uuid_utoa(local->loc.pargfid), local->loc.name, key); + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + xdata2 = dict_new(); + if (!xdata2) + goto out; - gf_log (this->name, GF_LOG_DEBUG, - "Blocking entrylks done. Proceeding to FOP"); - afr_internal_lock_finish (frame, this); + ret = dict_set_strn(xdata2, key, keylen, + (char *)local->newloc.name); + if (ret) + gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "%s/%s: Could not set %s key during " + "xattrop", + uuid_utoa(local->newloc.pargfid), local->newloc.name, + key); } - return 0; -} + } + *xdata = xdata1; + *newloc_xdata = xdata2; + xdata1 = xdata2 = NULL; +out: + if (xdata1) + dict_unref(xdata1); + return; +} int -afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this) +afr_changelog_prepare(xlator_t *this, call_frame_t *frame, int *call_count, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op, dict_t **xdata, + dict_t **newloc_xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - local = frame->local; - int_lock = &local->internal_lock; + local = frame->local; + priv = this->private; - GF_ASSERT (!int_lock->higher_locked); + *call_count = afr_changelog_call_count( + local->transaction.type, local->transaction.pre_op, + local->transaction.failed_subvols, priv->child_count); - int_lock->lock_cbk = afr_post_blocking_rename_cbk; - afr_blocking_lock (frame, this); + if (*call_count == 0) { + changelog_resume(frame, this); + return -1; + } - return 0; -} + afr_changelog_populate_xdata(frame, op, xdata, newloc_xdata); + local->call_count = *call_count; + local->transaction.changelog_resume = changelog_resume; + return 0; +} int -afr_set_transaction_flock (afr_local_t *local) +afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, afr_xattrop_type_t op) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - - int_lock = &local->internal_lock; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + dict_t *newloc_xdata = NULL; + int i = 0; + int call_count = 0; + int ret = 0; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.changelog_xdata[i]) { + dict_unref(local->transaction.changelog_xdata[i]); + local->transaction.changelog_xdata[i] = NULL; + } + } - inodelk->flock.l_len = local->transaction.len; - inodelk->flock.l_start = local->transaction.start; - inodelk->flock.l_type = F_WRLCK; + ret = afr_changelog_prepare(this, frame, &call_count, changelog_resume, op, + &xdata, &newloc_xdata); + if (ret) return 0; -} - -int -afr_lock_rec (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - int_lock->transaction_lk_type = AFR_TRANSACTION_LK; - int_lock->domain = this->name; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i] || + local->transaction.failed_subvols[i]) + continue; switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - afr_set_transaction_flock (local); - - int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk; - - afr_nonblocking_inodelk (frame, this); + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + if (!local->fd) { + STACK_WIND_COOKIE( + frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->xattrop, + &local->loc, GF_XATTROP_ADD_ARRAY, xattr, xdata); + } else { + STACK_WIND_COOKIE( + frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->fxattrop, + local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata); + } break; + case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_RENAME_TRANSACTION: + STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, newloc_xdata); + call_count--; - int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; - afr_nonblocking_entrylk (frame, this); - break; + /* fall through */ - case AFR_ENTRY_TRANSACTION: - int_lock->lk_basename = local->transaction.basename; - if (&local->transaction.parent_loc) - int_lock->lk_loc = &local->transaction.parent_loc; + case AFR_ENTRY_TRANSACTION: + if (local->fd) + STACK_WIND_COOKIE( + frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->fxattrop, + local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata); else - GF_ASSERT (local->fd); - - int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; - afr_nonblocking_entrylk (frame, this); + STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr, xdata); break; } - return 0; -} + if (!--call_count) + break; + } + if (xdata) + dict_unref(xdata); + if (newloc_xdata) + dict_unref(newloc_xdata); + return 0; +} -int -afr_lock (call_frame_t *frame, xlator_t *this) +static void +afr_init_optimistic_changelog_for_txn(xlator_t *this, afr_local_t *local) { - afr_set_lock_number (frame, this); + int locked_count = 0; + afr_private_t *priv = NULL; - return afr_lock_rec (frame, this); -} + priv = this->private; + locked_count = AFR_COUNT(local->transaction.pre_op, priv->child_count); + if (priv->optimistic_change_log && locked_count == priv->child_count) + local->optimistic_change_log = 1; -/* }}} */ + return; +} int -afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) +afr_changelog_pre_op(call_frame_t *frame, xlator_t *this) { - if (__fop_changelog_needed (frame, this)) { - afr_changelog_pre_op (frame, this); + afr_private_t *priv = this->private; + int i = 0; + int ret = 0; + int call_count = 0; + int op_errno = 0; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + unsigned char *locked_nodes = NULL; + int idx = -1; + gf_boolean_t pre_nop = _gf_true; + dict_t *xdata_req = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + idx = afr_index_for_transaction_type(local->transaction.type); + + locked_nodes = afr_locked_nodes_get(local->transaction.type, int_lock); + + for (i = 0; i < priv->child_count; i++) { + if (locked_nodes[i]) { + local->transaction.pre_op[i] = 1; + call_count++; } else { - afr_transaction_perform_fop (frame, this); + local->transaction.failed_subvols[i] = 1; + } + } + + afr_init_optimistic_changelog_for_txn(this, local); + + if (afr_changelog_pre_op_inherit(frame, this)) + goto next; + + /* This condition should not be met with present code, as + * transaction.done will be called if locks are not acquired on even a + * single node. + */ + if (call_count == 0) { + op_errno = ENOTCONN; + goto err; + } + + /* Check if the fop can be performed on at least + * quorum number of nodes. + */ + if (priv->quorum_count && !afr_has_fop_quorum(frame)) { + op_errno = int_lock->lock_op_errno; + if (op_errno == 0) + op_errno = afr_quorum_errno(priv); + goto err; + } + + xdata_req = dict_new(); + if (!xdata_req) { + op_errno = ENOMEM; + goto err; + } + + if (call_count < priv->child_count) + pre_nop = _gf_false; + + /* Set an all-zero pending changelog so that in the cbk, we can get the + * current on-disk values. In a replica 3 volume with arbiter enabled, + * these values are needed to arrive at a go/ no-go of the fop phase to + * avoid ending up in split-brain.*/ + + ret = afr_set_pending_dict(priv, xdata_req, local->pending); + if (ret < 0) { + op_errno = ENOMEM; + goto err; + } + + if (afr_needs_changelog_update(local)) { + local->dirty[idx] = hton32(1); + + ret = dict_set_static_bin(xdata_req, AFR_DIRTY, local->dirty, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + op_errno = ENOMEM; + goto err; } - return 0; -} + pre_nop = _gf_false; + local->transaction.dirtied = 1; + } + if (pre_nop) + goto next; -void -afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + if (!local->pre_op_compat) { + dict_copy(xdata_req, local->xdata_req); + goto next; + } - /* call this function from any of the related optimizations - which benefit from delaying post op are enabled, namely: + afr_changelog_do(frame, this, xdata_req, afr_transaction_perform_fop, + AFR_TRANSACTION_PRE_OP); - - changelog piggybacking - - eager locking - */ + if (xdata_req) + dict_unref(xdata_req); - priv = this->private; - if (!priv) - return; + return 0; +next: + afr_transaction_perform_fop(frame, this); - if (!priv->post_op_delay_secs) - return; + if (xdata_req) + dict_unref(xdata_req); - local = frame->local; - if (!local->transaction.eager_lock_on) - return; + return 0; +err: + local->internal_lock.lock_cbk = afr_transaction_done; + local->op_ret = -1; + local->op_errno = op_errno; - if (!local) - return; + afr_handle_lock_acquire_failure(local); - if (!local->fd) - return; + if (xdata_req) + dict_unref(xdata_req); - if (local->op == GF_FOP_WRITE) - local->delayed_post_op = _gf_true; + return 0; } -gf_boolean_t -afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +int +afr_post_nonblocking_lock_cbk(call_frame_t *frame, xlator_t *this) { - afr_inode_ctx_t *ictx = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + /* Initiate blocking locks if non-blocking has failed */ + if (int_lock->lock_op_ret < 0) { + gf_msg_debug(this->name, 0, + "Non blocking locks failed. Proceeding to blocking"); + int_lock->lock_cbk = afr_internal_lock_finish; + afr_blocking_lock(frame, this); + } else { + gf_msg_debug(this->name, 0, + "Non blocking locks done. Proceeding to FOP"); + + afr_internal_lock_finish(frame, this); + } + + return 0; +} - if (!inode) { - /* If false is returned, it may keep on taking eager-lock - * which may lead to starvation, so return true to avoid that. - */ - gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); - return _gf_true; - } - /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock - * is taken mount2 opened the same file, it won't be able to - * perform any data operations until mount1 releases eager-lock. - * To avoid such scenario do not enable eager-lock for this transaction - * if open-fd-count is > 1 - */ +int +afr_post_blocking_rename_cbk(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; - ictx = afr_inode_ctx_get (inode, this); - if (!ictx) - return _gf_true; + local = frame->local; + int_lock = &local->internal_lock; - if (ictx->open_fd_count > 1) - return _gf_true; + if (int_lock->lock_op_ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INTERNAL_LKS_FAILED, + "Blocking entrylks failed."); - return _gf_false; + afr_transaction_done(frame, this); + } else { + gf_msg_debug(this->name, 0, + "Blocking entrylks done. Proceeding to FOP"); + + afr_internal_lock_finish(frame, this); + } + return 0; } -gf_boolean_t -afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) +int +afr_post_lower_unlock_cbk(call_frame_t *frame, xlator_t *this) { - if (local->success_count != priv->child_count) - return _gf_true; - return _gf_false; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + + local = frame->local; + int_lock = &local->internal_lock; + + GF_ASSERT(!int_lock->higher_locked); + + int_lock->lock_cbk = afr_post_blocking_rename_cbk; + afr_blocking_lock(frame, this); + + return 0; } -gf_boolean_t -is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) +int +afr_set_transaction_flock(xlator_t *this, afr_local_t *local, + afr_lockee_t *lockee) { - afr_local_t *local = NULL; - gf_boolean_t res = _gf_false; - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; + struct gf_flock *flock = NULL; + + priv = this->private; + flock = &lockee->flock; + + if ((priv->arbiter_count || local->transaction.eager_lock_on || + priv->full_lock) && + local->transaction.type == AFR_DATA_TRANSACTION) { + /*Lock entire file to avoid network split brains.*/ + flock->l_len = 0; + flock->l_start = 0; + } else { + flock->l_len = local->transaction.len; + flock->l_start = local->transaction.start; + } + flock->l_type = F_WRLCK; + + return 0; +} - priv = this->private; +int +afr_lock(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + int i = 0; - local = frame->local; - if (!local) - goto out; + local = frame->local; + int_lock = &local->internal_lock; - if (!local->delayed_post_op) - goto out; + int_lock->lock_cbk = afr_post_nonblocking_lock_cbk; + int_lock->domain = this->name; - //Mark pending changelog ASAP - if (afr_any_fops_failed (local, priv)) - goto out; + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + for (i = 0; i < int_lock->lockee_count; i++) { + afr_set_transaction_flock(this, local, &int_lock->lockee[i]); + } - if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) - goto out; + break; - res = _gf_true; -out: - return res; -} + case AFR_ENTRY_TRANSACTION: + int_lock->lk_basename = local->transaction.basename; + if (local->transaction.parent_loc.path) + int_lock->lk_loc = &local->transaction.parent_loc; + else + GF_ASSERT(local->fd); + break; + case AFR_ENTRY_RENAME_TRANSACTION: + break; + } + afr_lock_nonblocking(frame, this); + return 0; +} -void -afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub); +static gf_boolean_t +afr_locals_overlap(afr_local_t *local1, afr_local_t *local2) +{ + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} -void -afr_delayed_changelog_wake_up_cbk (void *data) +gf_boolean_t +afr_has_lock_conflict(afr_local_t *local, gf_boolean_t waitlist_check) { - fd_t *fd = NULL; + afr_local_t *each = NULL; + afr_lock_t *lock = NULL; + + lock = &local->inode_ctx->lock[local->transaction.type]; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. + * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. + */ + list_for_each_entry(each, &lock->owners, transaction.owner_list) + { + if (afr_locals_overlap(each, local)) { + return _gf_true; + } + } - fd = data; + if (!waitlist_check) + return _gf_false; + list_for_each_entry(each, &lock->waiting, transaction.wait_list) + { + if (afr_locals_overlap(each, local)) { + return _gf_true; + } + } + return _gf_false; +} - afr_delayed_changelog_wake_up (THIS, fd); +/* }}} */ +static void +afr_copy_inodelk_vars(afr_internal_lock_t *dst, afr_internal_lock_t *src, + xlator_t *this, int lockee_num) +{ + afr_private_t *priv = this->private; + afr_lockee_t *sl = &src->lockee[lockee_num]; + afr_lockee_t *dl = &dst->lockee[lockee_num]; + + dst->domain = src->domain; + dl->flock.l_len = sl->flock.l_len; + dl->flock.l_start = sl->flock.l_start; + dl->flock.l_type = sl->flock.l_type; + dl->locked_count = sl->locked_count; + memcpy(dl->locked_nodes, sl->locked_nodes, + priv->child_count * sizeof(*dl->locked_nodes)); } +void +__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared) +{ + gf_boolean_t conflict = _gf_false; + afr_local_t *each = NULL; + afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type]; + + while (!conflict) { + if (list_empty(&lock->waiting)) + return; + each = list_entry(lock->waiting.next, afr_local_t, + transaction.wait_list); + if (afr_has_lock_conflict(each, _gf_false)) { + conflict = _gf_true; + } + if (conflict && !list_empty(&lock->owners)) + return; + afr_copy_inodelk_vars(&each->internal_lock, &local->internal_lock, + each->transaction.frame->this, 0); + list_move_tail(&each->transaction.wait_list, shared); + list_add_tail(&each->transaction.owner_list, &lock->owners); + } +} -/* - Check if the frame is destined to get optimized away - with changelog piggybacking -*/ -static gf_boolean_t -is_piggyback_post_op (call_frame_t *frame, fd_t *fd) +static void +afr_lock_resume_shared(struct list_head *list) { - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *local = NULL; - gf_boolean_t piggyback = _gf_true; - afr_private_t *priv = NULL; - int i = 0; + afr_local_t *each = NULL; + + while (!list_empty(list)) { + each = list_entry(list->next, afr_local_t, transaction.wait_list); + list_del_init(&each->transaction.wait_list); + afr_changelog_pre_op(each->transaction.frame, + each->transaction.frame->this); + } +} - priv = frame->this->private; - local = frame->local; - fdctx = afr_fd_ctx_get (fd, frame->this); +int +afr_internal_lock_finish(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + afr_lock_t *lock = NULL; + + local->internal_lock.lock_cbk = NULL; + if (!local->transaction.eager_lock_on) { + if (local->internal_lock.lock_op_ret < 0) { + afr_transaction_done(frame, this); + return 0; + } + afr_changelog_pre_op(frame, this); + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + if (local->internal_lock.lock_op_ret < 0) { + afr_handle_lock_acquire_failure(local); + } else { + lock->event_generation = local->event_generation; + afr_changelog_pre_op(frame, this); + } + } - LOCK(&fd->lock); - { - piggyback = _gf_true; + return 0; +} - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - local->transaction.postop_piggybacked[i] = 1; - } else { - /* For at least _one_ subvolume we cannot - piggyback on the changelog, and have to - perform a hard POST-OP and therefore fsync - if necesssary - */ - piggyback = _gf_false; - GF_ASSERT (fdctx->pre_op_done[i]); - fdctx->pre_op_done[i]--; - } - } +gf_boolean_t +afr_are_conflicting_ops_waiting(afr_local_t *local, xlator_t *this) +{ + afr_lock_t *lock = NULL; + lock = &local->inode_ctx->lock[local->transaction.type]; + + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any {meta,}data operations until mount1 releases eager-lock. + * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 for metadata transactions and if num-inodelks > 1 + * for data transactions + */ + + if (local->transaction.type == AFR_METADATA_TRANSACTION) { + if (local->inode_ctx->open_fd_count > 1) { + return _gf_true; } - UNLOCK(&fd->lock); - - if (!afr_txn_nothing_failed (frame, frame->this)) { - /* something failed in this transaction, - we will be performing a hard post-op - */ - return _gf_false; + } else if (local->transaction.type == AFR_DATA_TRANSACTION) { + if (lock->num_inodelks > 1) { + return _gf_true; } + } - return piggyback; + return _gf_false; } +gf_boolean_t +afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this, + int delay) +{ + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; + gf_boolean_t res = _gf_false; + + local = frame->local; + lock = &local->inode_ctx->lock[local->transaction.type]; + + if (!afr_txn_nothing_failed(frame, this)) { + lock->release = _gf_true; + goto out; + } + + if (afr_are_conflicting_ops_waiting(local, this)) { + lock->release = _gf_true; + goto out; + } + + if (!list_empty(&lock->owners)) + goto out; + else + GF_ASSERT(list_empty(&lock->waiting)); + + if (lock->release) { + goto out; + } + + if (!delay) { + goto out; + } + + if (local->transaction.disable_delayed_post_op) { + goto out; + } + + if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) && + (local->op != GF_FOP_FSYNC)) { + /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so + * they are fine too*/ + goto out; + } + + res = _gf_true; +out: + return res; +} + +void +afr_delayed_changelog_wake_up_cbk(void *data) +{ + afr_lock_t *lock = NULL; + afr_local_t *local = data; + afr_local_t *timer_local = NULL; + struct list_head shared; + + INIT_LIST_HEAD(&shared); + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + timer_local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + if (list_empty(&lock->owners) && (local == timer_local)) { + GF_ASSERT(list_empty(&lock->waiting)); + /*Last owner*/ + lock->release = _gf_true; + lock->delay_timer = NULL; + } + } + UNLOCK(&local->inode->lock); + afr_changelog_post_op_now(local->transaction.frame, + local->transaction.frame->this); +} /* SET operation */ int -afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local) { - afr_fd_ctx_t *fdctx = NULL; - - fdctx = afr_fd_ctx_get (fd, this); + LOCK(&local->inode->lock); + { + local->inode_ctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&local->inode->lock); - LOCK(&fd->lock); - { - fdctx->witnessed_unstable_write = _gf_true; - } - UNLOCK(&fd->lock); - - return 0; + return 0; } /* TEST and CLEAR operation */ gf_boolean_t -afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode) { - afr_fd_ctx_t *fdctx = NULL; - gf_boolean_t witness = _gf_false; + afr_inode_ctx_t *ctx = NULL; + gf_boolean_t witness = _gf_false; - fdctx = afr_fd_ctx_get (fd, this); - if (!fdctx) - return _gf_true; + LOCK(&inode->lock); + { + (void)__afr_inode_ctx_get(this, inode, &ctx); - LOCK(&fd->lock); - { - if (fdctx->witnessed_unstable_write) { - witness = _gf_true; - fdctx->witnessed_unstable_write = _gf_false; - } + if (ctx->witnessed_unstable_write) { + witness = _gf_true; + ctx->witnessed_unstable_write = _gf_false; } - UNLOCK (&fd->lock); + } + UNLOCK(&inode->lock); - return witness; + return witness; } - int -afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, - struct iatt *post, dict_t *xdata) -{ - afr_private_t *priv = NULL; - int child_index = (long) cookie; - int call_count = -1; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (afr_fop_failed (op_ret, op_errno)) { - /* Failure of fsync() is as good as failure of previous - write(). So treat it like one. - */ - gf_log (this->name, GF_LOG_WARNING, - "fsync(%s) failed on subvolume %s. Transaction was %s", - uuid_utoa (local->fd->inode->gfid), - priv->children[child_index]->name, - gf_fop_list[local->op]); - - afr_transaction_fop_failed (frame, this, child_index); - } +afr_changelog_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int child_index = (long)cookie; + int call_count = -1; + afr_local_t *local = NULL; - call_count = afr_frame_return (frame); + priv = this->private; + local = frame->local; - if (call_count == 0) - afr_changelog_post_op_now (frame, this); + if (op_ret != 0) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_msg(this->name, GF_LOG_WARNING, op_errno, AFR_MSG_FSYNC_FAILED, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, gf_fop_list[local->op]); - return 0; -} + afr_transaction_fop_failed(frame, this, child_index); + } + + call_count = afr_frame_return(frame); + if (call_count == 0) + afr_changelog_post_op_now(frame, this); + + return 0; +} int -afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +afr_changelog_fsync(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; - dict_t *xdata = NULL; - GF_UNUSED int ret = -1; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - call_count = afr_pre_op_done_children_count (local->transaction.pre_op, - priv->child_count); + call_count = AFR_COUNT(local->transaction.pre_op, priv->child_count); - if (!call_count) { - /* will go straight to unlock */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now(frame, this); + return 0; + } - local->call_count = call_count; + local->call_count = call_count; - xdata = dict_new(); - if (xdata) - ret = dict_set_int32 (xdata, "batch-fsync", 1); + xdata = dict_new(); + if (xdata) { + ret = dict_set_int32_sizen(xdata, "batch-fsync", 1); + ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + } - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; - STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, local->fd, - 1, xdata); - if (!--call_count) - break; - } + STACK_WIND_COOKIE(frame, afr_changelog_fsync_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->fsync, + local->fd, 1, xdata); + if (!--call_count) + break; + } - if (xdata) - dict_unref (xdata); + if (xdata) + dict_unref(xdata); - return 0; + return 0; } - - int -afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +int +afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; - - if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { - afr_changelog_post_op_now (frame, this); - return 0; - } + local = frame->local; + priv = this->private; - if (is_piggyback_post_op (frame, local->fd)) { - /* just detected that this post-op is about to - be optimized away as a new write() has - already piggybacked on this frame's changelog. - */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now(frame, this); + return 0; + } - /* Calling afr_changelog_post_op_now() now will result in - issuing ->[f]xattrop(). - - Performing a hard POST-OP (->[f]xattrop() FOP) is a more - responsible operation that what it might appear on the surface. - - The changelog of a file (in the xattr of the file on the server) - stores information (pending count) about the state of the file - on the OTHER server. This changelog is blindly trusted, and must - therefore be updated in such a way it remains trustworthy. This - implies that decrementing the pending count (essentially "clearing - the dirty flag") must be done STRICTLY after we are sure that the - operation on the other server has reached stable storage. - - While the backend filesystem on that server will eventually flush - it to stable storage, we (being in userspace) have no mechanism - to get notified when the write became "stable". - - This means we need take matter into our own hands and issue an - fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, - and get an acknowledgement for it. And we need to wait for the - fsync() acknowledgement before initiating the hard POST-OP. - - However if the FD itself was opened in O_SYNC or O_DSYNC then - we are already guaranteed that the writes were made stable as - part of the FOP itself. The same holds true for NFS stable - writes which happen on an anonymous FD with O_DSYNC or O_SYNC - flag set in the writev() @flags param. For all other write types, - mark a flag in the fdctx whenever an unstable write is witnessed. + if (afr_changelog_pre_op_uninherit(frame, this) && + afr_txn_nothing_failed(frame, this)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. */ - - if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { - afr_changelog_post_op_now (frame, this); - return 0; - } - - /* Check whether users want durability and perform fsync/post-op - * accordingly. - */ - if (priv->ensure_durability) { - /* Time to fsync() */ - afr_changelog_fsync (frame, this); - } else { - afr_changelog_post_op_now (frame, this); - } - + afr_changelog_post_op_now(frame, this); + return 0; + } + + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write(this, local->inode)) { + afr_changelog_post_op_now(frame, this); return 0; + } + + /* Check whether users want durability and perform fsync/post-op + * accordingly. + */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync(frame, this); + } else { + afr_changelog_post_op_now(frame, this); + } + + return 0; } - void -afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub) +afr_changelog_post_op(call_frame_t *frame, xlator_t *this) { - afr_fd_ctx_t *fd_ctx = NULL; - call_frame_t *prev_frame = NULL; - struct timeval delta = {0, }; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - goto out; + struct timespec delta = { + 0, + }; + afr_private_t *priv = NULL; + afr_local_t *local = frame->local; + afr_lock_t *lock = NULL; + gf_boolean_t post_op = _gf_true; + struct list_head shared; + + priv = this->private; + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + + INIT_LIST_HEAD(&shared); + if (!local->transaction.eager_lock_on) + goto out; + + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + list_del_init(&local->transaction.owner_list); + list_add(&local->transaction.owner_list, &lock->post_op); + __afr_transaction_wake_shared(local, &shared); + + if (!afr_is_delayed_changelog_post_op_needed(frame, this, + delta.tv_sec)) { + if (list_empty(&lock->owners)) + lock->release = _gf_true; + goto unlock; + } - delta.tv_sec = priv->post_op_delay_secs; - delta.tv_usec = 0; - - pthread_mutex_lock (&fd_ctx->delay_lock); - { - prev_frame = fd_ctx->delay_frame; - fd_ctx->delay_frame = NULL; - if (fd_ctx->delay_timer) - gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); - fd_ctx->delay_timer = NULL; - if (!frame) - goto unlock; - fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, - afr_delayed_changelog_wake_up_cbk, - fd); - fd_ctx->delay_frame = frame; - } + GF_ASSERT(lock->delay_timer == NULL); + lock->delay_timer = gf_timer_call_after( + this->ctx, delta, afr_delayed_changelog_wake_up_cbk, local); + if (!lock->delay_timer) { + lock->release = _gf_true; + } else { + post_op = _gf_false; + } + } unlock: - pthread_mutex_unlock (&fd_ctx->delay_lock); + UNLOCK(&local->inode->lock); + + if (!list_empty(&shared)) { + afr_lock_resume_shared(&shared); + } out: - if (prev_frame) { - local = prev_frame->local; - local->transaction.resume_stub = stub; - afr_changelog_post_op_safe (prev_frame, this); - } else if (stub) { - call_resume (stub); - } + if (post_op) { + if (!local->transaction.eager_lock_on || lock->release) { + afr_changelog_post_op_safe(frame, this); + } else { + afr_changelog_post_op_now(frame, this); + } + } } - -void -afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +int +afr_transaction_resume(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (is_afr_delayed_changelog_post_op_needed (frame, this)) - afr_delayed_changelog_post_op (this, frame, local->fd, NULL); - else - afr_changelog_post_op_safe (frame, this); -} + afr_restore_lk_owner(frame); + afr_handle_symmetric_errors(frame, this); + if (!local->pre_op_compat) + /* new mode, pre-op was done along + with OP */ + afr_changelog_pre_op_update(frame, this); -/* Wake up the sleeping/delayed post-op, and also register - a stub to have it resumed after this transaction - completely finishes. + afr_changelog_post_op(frame, this); - The @stub gets saved in @local and gets resumed in - afr_local_cleanup() - */ -void -afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) -{ - afr_delayed_changelog_post_op (this, NULL, fd, stub); + return 0; } +/** + * afr_transaction_fop_failed - inform that an fop failed + */ void -afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, int child_index) { - afr_delayed_changelog_post_op (this, NULL, fd, NULL); -} + afr_local_t *local = NULL; + + local = frame->local; + local->transaction.failed_subvols[child_index] = 1; +} - int -afr_transaction_resume (call_frame_t *frame, xlator_t *this) +static gf_boolean_t +__need_previous_lock_unlocked(afr_local_t *local) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_lock_t *lock = NULL; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; + lock = &local->inode_ctx->lock[local->transaction.type]; + if (!lock->acquired) + return _gf_false; + if (lock->acquired && lock->event_generation != local->event_generation) + return _gf_true; + return _gf_false; +} - if (local->transaction.eager_lock_on) { - /* We don't need to retain "local" in the - fd list anymore, writes to all subvols - are finished by now */ - LOCK (&local->fd->lock); - { - list_del_init (&local->transaction.eager_locked); - } - UNLOCK (&local->fd->lock); +void +__afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock, + gf_boolean_t *do_pre_op, afr_local_t **timer_local) +{ + afr_lock_t *lock = NULL; + afr_local_t *owner_local = NULL; + xlator_t *this = local->transaction.frame->this; + + local->transaction.eager_lock_on = _gf_true; + afr_set_lk_owner(local->transaction.frame, this, local->inode); + + lock = &local->inode_ctx->lock[local->transaction.type]; + if (__need_previous_lock_unlocked(local)) { + if (!list_empty(&lock->owners)) { + lock->release = _gf_true; + } else if (lock->delay_timer) { + lock->release = _gf_true; + if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) { + /* It will be put in frozen list + * in the code flow below*/ + } else { + *timer_local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + lock->delay_timer = NULL; + } } - - afr_restore_lk_owner (frame); - - if (__fop_changelog_needed (frame, this)) { - afr_changelog_post_op (frame, this); + } + + if (lock->release) { + list_add_tail(&local->transaction.wait_list, &lock->frozen); + *take_lock = _gf_false; + goto out; + } + + if (lock->delay_timer) { + *take_lock = _gf_false; + if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) { + list_add_tail(&local->transaction.wait_list, &lock->frozen); } else { - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - local->transaction.done (frame, this); - } else { - int_lock->lock_cbk = local->transaction.done; - afr_unlock (frame, this); - } + *timer_local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + afr_copy_inodelk_vars(&local->internal_lock, + &(*timer_local)->internal_lock, this, 0); + lock->delay_timer = NULL; + *do_pre_op = _gf_true; + list_add_tail(&local->transaction.owner_list, &lock->owners); } + goto out; + } - return 0; + if (!list_empty(&lock->owners)) { + if (!lock->acquired || afr_has_lock_conflict(local, _gf_true)) { + list_add_tail(&local->transaction.wait_list, &lock->waiting); + *take_lock = _gf_false; + goto out; + } + owner_local = list_entry(lock->owners.next, afr_local_t, + transaction.owner_list); + afr_copy_inodelk_vars(&local->internal_lock, + &owner_local->internal_lock, this, 0); + *take_lock = _gf_false; + *do_pre_op = _gf_true; + } + + if (lock->acquired) + GF_ASSERT(!(*take_lock)); + list_add_tail(&local->transaction.owner_list, &lock->owners); +out: + return; } - -/** - * afr_transaction_fop_failed - inform that an fop failed - */ - void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, - int child_index) +afr_transaction_start(afr_local_t *local, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - - local = frame->local; - priv = this->private; - - __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + afr_private_t *priv = NULL; + gf_boolean_t take_lock = _gf_true; + gf_boolean_t do_pre_op = _gf_false; + afr_local_t *timer_local = NULL; + + priv = this->private; + + if (local->transaction.type != AFR_DATA_TRANSACTION && + local->transaction.type != AFR_METADATA_TRANSACTION) + goto lock_phase; + + if (!priv->eager_lock) + goto lock_phase; + + LOCK(&local->inode->lock); + { + __afr_eager_lock_handle(local, &take_lock, &do_pre_op, &timer_local); + } + UNLOCK(&local->inode->lock); +lock_phase: + if (!local->transaction.eager_lock_on) { + afr_set_lk_owner(local->transaction.frame, this, + local->transaction.frame->root); + } + + if (take_lock) { + afr_lock(local->transaction.frame, this); + } else if (do_pre_op) { + afr_changelog_pre_op(local->transaction.frame, this); + } + /*Always call delayed_changelog_wake_up_cbk after calling pre-op above + * so that any inheriting can happen*/ + if (timer_local) + afr_delayed_changelog_wake_up_cbk(timer_local); } - - - static gf_boolean_t -afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) +int +afr_write_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) { - uint64_t start1 = local1->transaction.start; - uint64_t start2 = local2->transaction.start; - uint64_t end1 = 0; - uint64_t end2 = 0; - - if (local1->transaction.len) - end1 = start1 + local1->transaction.len - 1; - else - end1 = ULLONG_MAX; - - if (local2->transaction.len) - end2 = start2 + local2->transaction.len - 1; - else - end2 = ULLONG_MAX; - - return ((end1 >= start2) && (end2 >= start1)); + afr_local_t *local = frame->local; + + if (err) { + AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err); + goto fail; + } + + afr_transaction_start(local, this); + return 0; +fail: + local->transaction.unwind(frame, this); + AFR_STACK_DESTROY(frame); + return 0; } -void -afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +int +afr_transaction_lockee_init(call_frame_t *frame) { - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *each = NULL; - - priv = this->private; + afr_local_t *local = frame->local; + afr_internal_lock_t *int_lock = &local->internal_lock; + afr_private_t *priv = frame->this->private; + int ret = 0; - if (!local->fd) - return; - - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; - - if (!priv->eager_lock) - return; - - fdctx = afr_fd_ctx_get (local->fd, this); - if (!fdctx) - return; + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + ret = afr_add_inode_lockee(local, priv->child_count); + break; - if (afr_are_multiple_fds_opened (local->fd->inode, this)) - return; - /* - * Once full file lock is acquired in eager-lock phase, overlapping - * writes do not compete for inode-locks, instead are transferred to the - * next writes. Because of this overlapping writes are not ordered. - * This can cause inconsistencies in replication. - * Example: - * Two overlapping writes w1, w2 are sent in parallel on same fd - * in two threads t1, t2. - * Both threads can execute afr_writev_wind in the following manner. - * t1 winds w1 on brick-0 - * t2 winds w2 on brick-0 - * t2 winds w2 on brick-1 - * t1 winds w1 on brick-1 - * - * This check makes sure the locks are not transferred for - * overlapping writes. - */ - LOCK (&local->fd->lock); - { - list_for_each_entry (each, &fdctx->eager_locked, - transaction.eager_locked) { - if (afr_locals_overlap (each, local)) { - local->transaction.eager_lock_on = _gf_false; - goto unlock; - } + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + ret = afr_add_entry_lockee(local, &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) { + goto out; + } + if (local->op == GF_FOP_RENAME) { + ret = afr_add_entry_lockee( + local, &local->transaction.new_parent_loc, + local->transaction.new_basename, priv->child_count); + if (ret) { + goto out; } - local->transaction.eager_lock_on = _gf_true; - list_add_tail (&local->transaction.eager_locked, - &fdctx->eager_locked); - } -unlock: - UNLOCK (&local->fd->lock); + if (local->newloc.inode && + IA_ISDIR(local->newloc.inode->ia_type)) { + ret = afr_add_entry_lockee(local, &local->newloc, NULL, + priv->child_count); + if (ret) { + goto out; + } + } + } else if (local->op == GF_FOP_RMDIR) { + ret = afr_add_entry_lockee(local, &local->loc, NULL, + priv->child_count); + if (ret) { + goto out; + } + } + + if (int_lock->lockee_count > 1) { + qsort(int_lock->lockee, int_lock->lockee_count, + sizeof(*int_lock->lockee), afr_entry_lockee_cmp); + } + break; + } +out: + return ret; } - int -afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) +afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - fd_t *fd = NULL; - int ret = -1; - - local = frame->local; - priv = this->private; - - local->transaction.resume = afr_transaction_resume; - local->transaction.type = type; - - ret = afr_transaction_local_init (local, this); - if (ret < 0) - goto out; - - afr_transaction_eager_lock_init (local, this); - - if (local->fd && local->transaction.eager_lock_on) - afr_set_lk_owner (frame, this, local->fd); - else - afr_set_lk_owner (frame, this, frame->root); - - if (!local->transaction.eager_lock_on && local->loc.inode) { - fd = fd_lookup (local->loc.inode, frame->root->pid); - if (fd == NULL) - fd = fd_lookup_anonymous (local->loc.inode); - - if (fd) { - afr_delayed_changelog_wake_up (this, fd); - fd_unref (fd); - } - } - - if (afr_lock_server_count (priv, local->transaction.type) == 0) { - afr_internal_lock_finish (frame, this); - } else { - afr_lock (frame, this); - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int ret = -1; + int event_generation = 0; + + local = frame->local; + priv = this->private; + local->transaction.frame = frame; + + local->transaction.type = type; + + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + ret = -afr_quorum_errno(priv); + goto out; + } + + if (!afr_is_consistent_io_possible(local, priv, &ret)) { + ret = -ret; /*op_errno to ret conversion*/ + goto out; + } + + if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) { + ret = -afr_quorum_errno(priv); + goto out; + } + + ret = afr_transaction_local_init(local, this); + if (ret < 0) + goto out; + + ret = afr_transaction_lockee_init(frame); + if (ret) + goto out; + + if (type != AFR_METADATA_TRANSACTION) { + goto txn_start; + } + + ret = afr_inode_get_readable(frame, local->inode, this, local->readable, + &event_generation, type); + if (ret < 0 || + afr_is_inode_refresh_reqd(local->inode, this, priv->event_generation, + event_generation)) { + afr_inode_refresh(frame, this, local->inode, local->loc.gfid, + afr_write_txn_refresh_done); ret = 0; + goto out; + } + +txn_start: + ret = 0; + afr_transaction_start(local, this); out: - return ret; + return ret; } |
