summaryrefslogtreecommitdiffstats
path: root/xlators
diff options
context:
space:
mode:
authorRavishankar N <ravishankar@redhat.com>2018-09-23 16:59:58 +0530
committerRavishankar N <ravishankar@redhat.com>2018-10-25 12:26:22 +0000
commit053b1309dc8fbc05fcde5223e734da9f694cf5cc (patch)
treee2eba5c81024b5dc07eef5966289d5e71c3567ee /xlators
parentaae1c402b74fd02ed2f6473b896f108d82aef8e3 (diff)
afr: thin-arbiter 2 domain locking and in-memory state
2 domain locking + xattrop for write-txn failures: -------------------------------------------------- - A post-op wound on TA takes AFR_TA_DOM_NOTIFY range lock and AFR_TA_DOM_MODIFY full lock, does xattrop on TA and releases AFR_TA_DOM_MODIFY lock and stores in-memory which brick is bad. - All further write txn failures are handled based on this in-memory value without querying the TA. - When shd heals the files, it does so by requesting full lock on AFR_TA_DOM_NOTIFY domain. Client uses this as a cue (via upcall), releases AFR_TA_DOM_NOTIFY range lock and invalidates its in-memory notion of which brick is bad. The next write txn failure is wound on TA to again update the in-memory state. - Any incomplete write txns before the AFR_TA_DOM_NOTIFY upcall release request is got is completed before the lock is released. - Any write txns got after the release request are maintained in a ta_waitq. - After the release is complete, the ta_waitq elements are spliced to a separate queue which is then processed one by one. - For fops that come in parallel when the in-memory bad brick is still unknown, only one is wound to TA on wire. The other ones are maintained in a ta_onwireq which is then processed after we get the response from TA. Change-Id: I32c7b61a61776663601ab0040e2f0767eca1fd64 updates: bz#1579788 Signed-off-by: Ravishankar N <ravishankar@redhat.com> Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r--xlators/cluster/afr/src/afr-common.c204
-rw-r--r--xlators/cluster/afr/src/afr-read-txn.c8
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c485
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h5
-rw-r--r--xlators/cluster/afr/src/afr.c22
-rw-r--r--xlators/cluster/afr/src/afr.h31
6 files changed, 679 insertions, 76 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index e655999c6a6..47120503980 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -4893,6 +4893,13 @@ afr_priv_dump(xlator_t *this)
gf_proc_dump_write("quorum-count", "%d", priv->quorum_count);
}
gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this));
+ if (priv->thin_arbiter_count) {
+ gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up);
+ gf_proc_dump_write("ta_bad_child_index", "%d",
+ priv->ta_bad_child_index);
+ gf_proc_dump_write("ta_notify_dom_lock_offset", "%lld",
+ priv->ta_notify_dom_lock_offset);
+ }
return 0;
}
@@ -4904,14 +4911,19 @@ afr_priv_dump(xlator_t *this)
*/
static int
-find_child_index(xlator_t *this, xlator_t *child)
+afr_find_child_index(xlator_t *this, xlator_t *child)
{
afr_private_t *priv = NULL;
+ int child_count = -1;
int i = -1;
priv = this->private;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ child_count++;
+ }
- for (i = 0; i < priv->child_count; i++) {
+ for (i = 0; i < child_count; i++) {
if ((xlator_t *)child == priv->children[i])
break;
}
@@ -5310,6 +5322,103 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
priv->last_event[idx] = *event;
}
+void
+afr_ta_lock_release_synctask(xlator_t *this)
+{
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
+
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ return;
+ }
+
+ ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta,
+ afr_ta_lock_release_done, ta_frame, this);
+ if (ret) {
+ STACK_DESTROY(ta_frame->root);
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to release "
+ "AFR_TA_DOM_NOTIFY lock.");
+ }
+}
+
+static void
+afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_inodelk_contention *lc = NULL;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ afr_private_t *priv = this->private;
+
+ lc = upcall->data;
+
+ if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0)
+ return;
+
+ if (priv->shd.iamshd) {
+ /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */
+ return;
+ }
+ LOCK(&priv->lock);
+ {
+ priv->release_ta_notify_dom_lock = _gf_true;
+ inmem_count = priv->ta_in_mem_txn_count;
+ onwire_count = priv->ta_on_wire_txn_count;
+ }
+ UNLOCK(&priv->lock);
+ if (inmem_count || onwire_count)
+ /* lock release will happen in txn code path after
+ * inflight or on-wire txns are over.*/
+ return;
+
+ afr_ta_lock_release_synctask(this);
+}
+
+static void
+afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_cache_invalidation *up_ci = NULL;
+ afr_private_t *priv = this->private;
+ inode_t *inode = NULL;
+ inode_table_t *itable = NULL;
+ int i = 0;
+
+ switch (upcall->event_type) {
+ case GF_UPCALL_INODELK_CONTENTION:
+ afr_handle_inodelk_contention(this, upcall);
+ break;
+ case GF_UPCALL_CACHE_INVALIDATION:
+ up_ci = (struct gf_upcall_cache_invalidation *)upcall->data;
+
+ /* Since md-cache will be aggressively filtering
+ * lookups, the stale read issue will be more
+ * pronounced. Hence when a pending xattr is set notify
+ * all the md-cache clients to invalidate the existing
+ * stat cache and send the lookup next time */
+ if (!up_ci->dict)
+ break;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!dict_get(up_ci->dict, priv->pending_key[i]))
+ continue;
+ up_ci->flags |= UP_INVAL_ATTR;
+ itable = ((xlator_t *)this->graph->top)->itable;
+ /*Internal processes may not have itable for
+ *top xlator*/
+ if (itable)
+ inode = inode_find(itable, upcall->gfid);
+ if (inode)
+ afr_inode_need_refresh_set(inode, this);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
int32_t
afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
{
@@ -5327,10 +5436,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
dict_t *output = NULL;
gf_boolean_t had_quorum = _gf_false;
gf_boolean_t has_quorum = _gf_false;
- struct gf_upcall *up_data = NULL;
- struct gf_upcall_cache_invalidation *up_ci = NULL;
- inode_table_t *itable = NULL;
- inode_t *inode = NULL;
int64_t halo_max_latency_msec = 0;
int64_t child_latency_msec = -1;
@@ -5358,7 +5463,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
* subsequent revalidate lookup happens on all the dht's subvolumes
* which triggers afr self-heals if any.
*/
- idx = find_child_index(this, child_xlator);
+ idx = afr_find_child_index(this, child_xlator);
if (idx < 0) {
gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
"Received child_up from invalid subvolume");
@@ -5407,6 +5512,10 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
goto out;
}
+ if (event == GF_EVENT_UPCALL) {
+ afr_handle_upcall_event(this, data);
+ }
+
LOCK(&priv->lock);
{
had_heard_from_all = __get_heard_from_all_status(this);
@@ -5416,12 +5525,22 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
propagate = 1;
break;
case GF_EVENT_CHILD_UP:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 1;
+ break;
+ }
__afr_handle_child_up_event(this, child_xlator, idx,
child_latency_msec, &event,
&call_psh, &up_child);
break;
case GF_EVENT_CHILD_DOWN:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 0;
+ break;
+ }
__afr_handle_child_down_event(this, child_xlator, idx,
child_latency_msec, &event,
&call_psh, &up_child);
@@ -5435,34 +5554,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
case GF_EVENT_SOME_DESCENDENT_DOWN:
priv->last_event[idx] = event;
break;
- case GF_EVENT_UPCALL:
- up_data = (struct gf_upcall *)data;
- if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
- break;
- up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
-
- /* Since md-cache will be aggressively filtering
- * lookups, the stale read issue will be more
- * pronounced. Hence when a pending xattr is set notify
- * all the md-cache clients to invalidate the existing
- * stat cache and send the lookup next time */
- if (!up_ci->dict)
- break;
- for (i = 0; i < priv->child_count; i++) {
- if (dict_get(up_ci->dict, priv->pending_key[i])) {
- up_ci->flags |= UP_INVAL_ATTR;
- itable = ((xlator_t *)this->graph->top)->itable;
- /*Internal processes may not have itable for top
- * xlator*/
- if (itable)
- inode = inode_find(itable, up_data->gfid);
- if (inode)
- afr_inode_need_refresh_set(inode, this);
-
- break;
- }
- }
- break;
default:
propagate = 1;
break;
@@ -5602,6 +5693,10 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
}
local->need_full_crawl = _gf_false;
+ if (priv->thin_arbiter_count) {
+ local->ta_child_up = priv->ta_child_up;
+ local->ta_failed_subvol = AFR_CHILD_UNKNOWN;
+ }
INIT_LIST_HEAD(&local->healer);
return 0;
@@ -5715,6 +5810,8 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
ret = 0;
INIT_LIST_HEAD(&local->transaction.wait_list);
INIT_LIST_HEAD(&local->transaction.owner_list);
+ INIT_LIST_HEAD(&local->ta_waitq);
+ INIT_LIST_HEAD(&local->ta_onwireq);
out:
return ret;
}
@@ -6703,9 +6800,6 @@ afr_ta_is_fop_called_from_synctask(xlator_t *this)
int
afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
{
- /*Note: At any given time, only one instance of this function must
- * be in progress.*/
-
int ret = 0;
uuid_t gfid = {
0,
@@ -6720,6 +6814,11 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
};
int32_t cmd = 0;
+ /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock
+ * has been released in afr_notify due to upcall notification from shd.
+ */
+ GF_ASSERT(priv->ta_notify_dom_lock_offset == 0);
+
if (!priv->shd.iamshd)
GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
flock1.l_type = F_WRLCK;
@@ -6731,14 +6830,10 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
flock1.l_len = 0;
} else {
cmd = F_SETLK;
- if (priv->ta_notify_dom_lock_offset) {
- flock1.l_start = priv->ta_notify_dom_lock_offset;
- } else {
- gf_uuid_generate(gfid);
- flock1.l_start = gfid_to_ino(gfid);
- if (flock1.l_start < 0)
- flock1.l_start = -flock1.l_start;
- }
+ gf_uuid_generate(gfid);
+ flock1.l_start = gfid_to_ino(gfid);
+ if (flock1.l_start < 0)
+ flock1.l_start = -flock1.l_start;
flock1.l_len = 1;
}
ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
@@ -6764,7 +6859,7 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL);
if (ret) {
gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
- "Failed to get AFR_TA_DOM_MODIFY lock.");
+ "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name);
flock1.l_type = F_UNLCK;
ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL,
@@ -6829,3 +6924,18 @@ afr_ta_frame_create(xlator_t *this)
afr_set_lk_owner(frame, this, lk_owner);
return frame;
}
+
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local)
+{
+ int data_count = 0;
+
+ data_count = AFR_COUNT(local->child_up, priv->child_count);
+ if (data_count == 2) {
+ return _gf_true;
+ } else if (data_count == 1 && local->ta_child_up) {
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index 1df39c35fce..1cd5c2eee3b 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -261,6 +261,8 @@ afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this)
if (!ta_frame) {
local->op_ret = -1;
local->op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
goto out;
}
ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done,
@@ -440,6 +442,12 @@ afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
goto read;
}
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ local->op_ret = -1;
+ local->op_errno = -afr_quorum_errno(priv);
+ goto read;
+ }
+
if (priv->thin_arbiter_count &&
AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
afr_ta_read_txn_synctask(frame, this);
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index cf153dea9cc..fb78c198d9c 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -32,7 +32,7 @@ void
__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);
void
-afr_changelog_post_op(call_frame_t *frame, xlator_t *this);
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this);
int
afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this);
@@ -53,6 +53,90 @@ afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
afr_changelog_resume_t changelog_resume,
afr_xattrop_type_t op);
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this);
+
+static int
+afr_ta_post_op_do(void *opaque);
+
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local);
+
+static int
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this);
+
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno);
+
+static void
+afr_ta_process_waitq(xlator_t *this)
+{
+ afr_local_t *entry = NULL;
+ afr_private_t *priv = this->private;
+ struct list_head waitq = {
+ 0,
+ };
+
+ INIT_LIST_HEAD(&waitq);
+ LOCK(&priv->lock);
+ list_splice_init(&priv->ta_waitq, &waitq);
+ UNLOCK(&priv->lock);
+ list_for_each_entry(entry, &waitq, ta_waitq)
+ {
+ afr_ta_decide_post_op_state(entry->transaction.frame, this);
+ }
+}
+
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ afr_ta_process_waitq(ta_frame->this);
+ STACK_DESTROY(ta_frame->root);
+ return 0;
+}
+
+int
+afr_release_notify_lock_for_ta(void *opaque)
+{
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = -1;
+
+ this = (xlator_t *)opaque;
+ priv = this->private;
+ ret = afr_fill_ta_loc(this, &loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to populate loc for thin-arbiter.");
+ goto out;
+ }
+ flock.l_type = F_UNLCK;
+ flock.l_start = priv->ta_notify_dom_lock_offset;
+ flock.l_len = 1;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL);
+ if (!ret) {
+ LOCK(&priv->lock);
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ priv->ta_notify_dom_lock_offset = 0;
+ UNLOCK(&priv->lock);
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+ }
+
+out:
+ loc_wipe(&loc);
+ return ret;
+}
+
void
afr_zero_fill_stat(afr_local_t *local)
{
@@ -576,18 +660,109 @@ afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)
return ret;
}
+static void
+afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ gf_boolean_t release = _gf_false;
+
+ LOCK(&priv->lock);
+ {
+ /*Once we get notify lock release upcall notification,
+ if two fop states are non empty/non zero, we will
+ not release lock.
+ 1 - If anything in memory txn
+ 2 - If anything in onwire or onwireq
+ */
+ if (local->fop_state == TA_INFO_IN_MEMORY_SUCCESS) {
+ inmem_count = --priv->ta_in_mem_txn_count;
+ } else {
+ inmem_count = priv->ta_in_mem_txn_count;
+ }
+ onwire_count = priv->ta_on_wire_txn_count;
+ release = priv->release_ta_notify_dom_lock;
+ }
+ UNLOCK(&priv->lock);
+
+ if (inmem_count != 0 || release == _gf_false || onwire_count != 0)
+ return;
+
+ afr_ta_lock_release_synctask(this);
+}
+
+static void
+afr_ta_process_onwireq(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *entry = NULL;
+ int bad_child = AFR_CHILD_UNKNOWN;
+
+ struct list_head onwireq = {
+ 0,
+ };
+ INIT_LIST_HEAD(&onwireq);
-/* {{{ pending */
+ LOCK(&priv->lock);
+ {
+ if (--priv->ta_on_wire_txn_count == 0) {
+ UNLOCK(&priv->lock);
+ /*Only one write fop came and after taking notify
+ *lock and before doing xattrop, it has received
+ *lock contention upcall, so this is the only place
+ *to find this out and release the lock*/
+ afr_ta_dom_lock_check_and_release(local, this);
+ return;
+ }
+ bad_child = priv->ta_bad_child_index;
+ if (bad_child == AFR_CHILD_UNKNOWN) {
+ /*The previous on-wire ta_post_op was a failure. Just dequeue
+ *one element to wind on-wire again. */
+ entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ } else {
+ /* Prepare to process all fops based on bad_child_index. */
+ list_splice_init(&priv->ta_onwireq, &onwireq);
+ }
+ }
+ UNLOCK(&priv->lock);
+
+ if (entry) {
+ afr_ta_post_op_synctask(this, entry);
+ return;
+ } else {
+ while (!list_empty(&onwireq)) {
+ entry = list_entry(onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ LOCK(&priv->lock);
+ --priv->ta_on_wire_txn_count;
+ UNLOCK(&priv->lock);
+ if (entry->ta_failed_subvol == bad_child) {
+ afr_changelog_post_op_do(entry->transaction.frame, this);
+ } else {
+ afr_changelog_post_op_fail(entry->transaction.frame, this, EIO);
+ }
+ }
+ }
+}
int
afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
+ priv = this->private;
int_lock = &local->internal_lock;
+ if (priv->thin_arbiter_count) {
+ /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */
+ afr_ta_dom_lock_check_and_release(frame->local, this);
+ }
+
/* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
if (!afr_changelog_has_quorum(local, this)) {
local->op_ret = -1;
@@ -605,6 +780,20 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
return 0;
}
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+ afr_local_t *local = frame->local;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB,
+ "Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op],
+ uuid_utoa(local->inode->gfid), local->fop_state);
+
+ afr_changelog_post_op_done(frame, this);
+}
+
unsigned char *
afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock)
{
@@ -983,8 +1172,240 @@ out:
return ret;
}
-int
-afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
+static int
+afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)
+{
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+
+ local = (afr_local_t *)opaque;
+ this = frame->this;
+
+ STACK_DESTROY(frame->root);
+ afr_ta_process_onwireq(local, this);
+
+ return 0;
+}
+
+static int
+afr_ta_post_op_do(void *opaque)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t *this = NULL;
+ call_frame_t *txn_frame = NULL;
+ dict_t *xattr = NULL;
+ int **pending = NULL;
+ int failed_subvol = -1;
+ int success_subvol = -1;
+ loc_t loc = {
+ 0,
+ };
+ int idx = 0;
+ int i = 0;
+ int ret = 0;
+
+ local = (afr_local_t *)opaque;
+ txn_frame = local->transaction.frame;
+ this = txn_frame->this;
+ priv = this->private;
+ idx = afr_index_for_transaction_type(local->transaction.type);
+
+ ret = afr_fill_ta_loc(this, &loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to populate loc for thin-arbiter.");
+ goto out;
+ }
+
+ xattr = dict_new();
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!pending) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i]) {
+ pending[i][idx] = hton32(1);
+ failed_subvol = i;
+ } else {
+ success_subvol = i;
+ }
+ }
+
+ ret = afr_set_pending_dict(priv, xattr, pending);
+ if (ret < 0)
+ goto out;
+
+ ret = afr_ta_post_op_lock(this, &loc);
+ if (ret)
+ goto out;
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+ LOCK(&priv->lock);
+ {
+ if (ret == 0) {
+ priv->ta_bad_child_index = failed_subvol;
+ } else if (ret == -EINVAL) {
+ priv->ta_bad_child_index = success_subvol;
+ }
+ }
+ UNLOCK(&priv->lock);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s failed for gfid %s.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+ if (ret == -EINVAL)
+ ret = -EIO; /* TA failed the fop. Return EIO to application. */
+ }
+
+ afr_ta_post_op_unlock(this, &loc);
+out:
+ if (xattr)
+ dict_unref(xattr);
+
+ if (pending)
+ afr_matrix_cleanup(pending, priv->child_count);
+
+ loc_wipe(&loc);
+
+ if (ret == 0) {
+ /*Mark pending xattrs on the up data brick.*/
+ afr_changelog_post_op_do(local->transaction.frame, this);
+ } else {
+ afr_changelog_post_op_fail(local->transaction.frame, this, -ret);
+ }
+ return ret;
+}
+
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local)
+{
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
+
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ goto err;
+ }
+ ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done,
+ ta_frame, local);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to launch post-op on thin arbiter for gfid %s",
+ uuid_utoa(local->inode->gfid));
+ STACK_DESTROY(ta_frame->root);
+ goto err;
+ }
+
+ return ret;
+err:
+ afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM);
+ return ret;
+}
+
+static void
+afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local,
+ int *on_wire_count)
+{
+ LOCK(&priv->lock);
+ {
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Put the fop in waitq until notify dom lock is released.*/
+ local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL;
+ list_add_tail(&local->ta_waitq, &priv->ta_waitq);
+ } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) {
+ /* Post-op on thin-arbiter to decide success/failure. */
+ local->fop_state = TA_GET_INFO_FROM_TA_FILE;
+ *on_wire_count = ++priv->ta_on_wire_txn_count;
+ if (*on_wire_count > 1) {
+ /*Avoid sending multiple on-wire post-ops on TA*/
+ list_add_tail(&local->ta_onwireq, &priv->ta_onwireq);
+ }
+ } else if (local->ta_failed_subvol == priv->ta_bad_child_index) {
+ /* Post-op on TA not needed as the fop failed on the in-memory bad
+ * brick. Just mark pending xattrs on the good data brick.*/
+ local->fop_state = TA_INFO_IN_MEMORY_SUCCESS;
+ priv->ta_in_mem_txn_count++;
+ } else {
+ /* Post-op on TA not needed as the fop succeeded only on the
+ * in-memory bad data brick and not the good one. Fail the fop.*/
+ local->fop_state = TA_INFO_IN_MEMORY_FAILED;
+ }
+ }
+ UNLOCK(&priv->lock);
+}
+
+static void
+afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local)
+{
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i]) {
+ local->ta_failed_subvol = i;
+ break;
+ }
+ }
+}
+
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int on_wire_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_ta_set_fop_state(priv, local, &on_wire_count);
+
+ switch (local->fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ if (on_wire_count == 1)
+ afr_ta_post_op_synctask(this, local);
+ /*else, fop is queued in ta_onwireq.*/
+ break;
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ /*Post releasing the notify lock, we will act on this queue*/
+ break;
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ afr_changelog_post_op_do(frame, this);
+ break;
+ case TA_INFO_IN_MEMORY_FAILED:
+ afr_changelog_post_op_fail(frame, this, EIO);
+ break;
+ }
+ return;
+}
+
+static void
+afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+
+ afr_ta_fill_failed_subvol(priv, local);
+ gf_msg_debug(this->name, 0,
+ "Fop failed on data brick (%s) for gfid=%s. "
+ "ta info needed to decide fop result.",
+ priv->children[local->ta_failed_subvol]->name,
+ uuid_utoa(local->inode->gfid));
+ afr_ta_decide_post_op_state(frame, this);
+}
+
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this)
{
afr_private_t *priv = this->private;
afr_local_t *local = NULL;
@@ -1001,9 +1422,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
xattr = dict_new();
if (!xattr) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done(frame, this);
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
goto out;
}
@@ -1030,9 +1449,8 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
}
if (local->transaction.in_flight_sb) {
- local->op_ret = -1;
- local->op_errno = local->transaction.in_flight_sb_errno;
- afr_changelog_post_op_done(frame, this);
+ afr_changelog_post_op_fail(frame, this,
+ local->transaction.in_flight_sb_errno);
goto out;
}
@@ -1043,17 +1461,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
ret = afr_set_pending_dict(priv, xattr, local->pending);
if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done(frame, this);
- goto out;
- }
-
- ret = afr_changelog_thin_arbiter_post_op(this, local);
- if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = -ret;
- afr_changelog_post_op_done(frame, this);
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
goto out;
}
@@ -1066,9 +1474,7 @@ set_dirty:
ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty,
sizeof(int) * AFR_NUM_CHANGE_LOGS);
if (ret) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done(frame, this);
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
goto out;
}
@@ -1078,6 +1484,32 @@ out:
if (xattr)
dict_unref(xattr);
+ return;
+}
+
+static int
+afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int failed_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (priv->thin_arbiter_count) {
+ failed_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failed_count == 1) {
+ afr_handle_failure_using_thin_arbiter(frame, this);
+ return 0;
+ } else {
+ /* Txn either succeeded or failed on both data bricks. Let
+ * post_op_do handle it as the case might be. */
+ }
+ }
+
+ afr_changelog_post_op_do(frame, this);
return 0;
}
@@ -2457,6 +2889,11 @@ afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
goto out;
}
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ ret = -afr_quorum_errno(priv);
+ goto out;
+ }
+
ret = afr_transaction_local_init(local, this);
if (ret < 0)
goto out;
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index fff8c65e976..35a922544bc 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -67,4 +67,9 @@ afr_lock(call_frame_t *frame, xlator_t *this);
void
afr_delayed_changelog_wake_up_cbk(void *data);
+int
+afr_release_notify_lock_for_ta(void *opaque);
+
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque);
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 26950fd7927..5d5e536ff60 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -336,6 +336,22 @@ out:
return ret;
}
+void
+afr_ta_init(afr_private_t *priv)
+{
+ priv->thin_arbiter_count = 1;
+ priv->child_count--;
+ priv->ta_child_up = 0;
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->ta_notify_dom_lock_offset = 0;
+ priv->ta_in_mem_txn_count = 0;
+ priv->ta_on_wire_txn_count = 0;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ INIT_LIST_HEAD(&priv->ta_waitq);
+ INIT_LIST_HEAD(&priv->ta_onwireq);
+ *priv->ta_gfid = 0;
+}
+
int32_t
init(xlator_t *this)
{
@@ -380,11 +396,7 @@ init(xlator_t *this)
GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
if (thin_arbiter && strlen(thin_arbiter) > 0) {
- priv->thin_arbiter_count = 1;
- priv->child_count--;
- priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
- priv->ta_notify_dom_lock_offset = 0;
- *priv->ta_gfid = 0;
+ afr_ta_init(priv);
}
INIT_LIST_HEAD(&priv->healing);
INIT_LIST_HEAD(&priv->heal_waiting);
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 3d2c1950571..6f8015380f0 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -107,8 +107,20 @@ typedef enum {
AFR_CHILD_UNKNOWN = -1,
AFR_CHILD_ZERO,
AFR_CHILD_ONE,
+ AFR_CHILD_THIN_ARBITER,
} afr_child_index;
+typedef enum {
+ TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall
+ notification and waiting for its release.*/
+ TA_GET_INFO_FROM_TA_FILE, /*FOP needs post-op on ta file to get
+ *info about which brick is bad.*/
+ TA_INFO_IN_MEMORY_SUCCESS, /*Bad brick info is in memory and fop failed
+ *on BAD brick - Success*/
+ TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed
+ *on GOOD brick - Failed*/
+} afr_ta_fop_state_t;
+
struct afr_nfsd {
gf_boolean_t iamnfsd;
uint32_t halo_max_latency_msec;
@@ -127,8 +139,14 @@ typedef struct _afr_private {
/* For thin-arbiter. */
unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
uuid_t ta_gfid;
+ unsigned char ta_child_up;
int ta_bad_child_index;
off_t ta_notify_dom_lock_offset;
+ gf_boolean_t release_ta_notify_dom_lock;
+ unsigned int ta_in_mem_txn_count;
+ unsigned int ta_on_wire_txn_count;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
unsigned char *child_up;
int64_t *child_latency;
@@ -855,6 +873,13 @@ typedef struct _afr_local {
gf_boolean_t is_read_txn;
afr_inode_ctx_t *inode_ctx;
+
+ /*For thin-arbiter transactions.*/
+ unsigned char ta_child_up;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
+ afr_ta_fop_state_t fop_state;
+ int ta_failed_subvol;
} afr_local_t;
typedef struct afr_spbc_timeout {
@@ -1289,4 +1314,10 @@ __afr_get_up_children_count(afr_private_t *priv);
call_frame_t *
afr_ta_frame_create(xlator_t *this);
+
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);
+
+void
+afr_ta_lock_release_synctask(xlator_t *this);
#endif /* __AFR_H__ */