diff options
author | Ravishankar N <ravishankar@redhat.com> | 2018-09-23 16:59:58 +0530 |
---|---|---|
committer | Ravishankar N <ravishankar@redhat.com> | 2018-10-25 12:26:22 +0000 |
commit | 053b1309dc8fbc05fcde5223e734da9f694cf5cc (patch) | |
tree | e2eba5c81024b5dc07eef5966289d5e71c3567ee /xlators | |
parent | aae1c402b74fd02ed2f6473b896f108d82aef8e3 (diff) |
afr: thin-arbiter 2 domain locking and in-memory state
2 domain locking + xattrop for write-txn failures:
--------------------------------------------------
- A post-op wound on TA takes AFR_TA_DOM_NOTIFY range lock and
AFR_TA_DOM_MODIFY full lock, does xattrop on TA and releases
AFR_TA_DOM_MODIFY lock and stores in-memory which brick is bad.
- All further write txn failures are handled based on this in-memory
value without querying the TA.
- When shd heals the files, it does so by requesting full lock on
AFR_TA_DOM_NOTIFY domain. Client uses this as a cue (via upcall),
releases AFR_TA_DOM_NOTIFY range lock and invalidates its in-memory
notion of which brick is bad. The next write txn failure is wound on TA
to again update the in-memory state.
- Any incomplete write txns that arrived before the AFR_TA_DOM_NOTIFY upcall
release request is received are completed before the lock is released.
- Any write txns received after the release request are maintained in a ta_waitq.
- After the release is complete, the ta_waitq elements are spliced to a
separate queue which is then processed one by one.
- For fops that come in parallel when the in-memory bad brick is still
unknown, only one is wound to TA on wire. The other ones are maintained
in a ta_onwireq which is then processed after we get the response from
TA.
Change-Id: I32c7b61a61776663601ab0040e2f0767eca1fd64
updates: bz#1579788
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Signed-off-by: Ashish Pandey <aspandey@redhat.com>
Diffstat (limited to 'xlators')
-rw-r--r-- | xlators/cluster/afr/src/afr-common.c | 204 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-read-txn.c | 8 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-transaction.c | 485 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr-transaction.h | 5 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.c | 22 | ||||
-rw-r--r-- | xlators/cluster/afr/src/afr.h | 31 |
6 files changed, 679 insertions, 76 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index e655999c6a6..47120503980 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -4893,6 +4893,13 @@ afr_priv_dump(xlator_t *this) gf_proc_dump_write("quorum-count", "%d", priv->quorum_count); } gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this)); + if (priv->thin_arbiter_count) { + gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up); + gf_proc_dump_write("ta_bad_child_index", "%d", + priv->ta_bad_child_index); + gf_proc_dump_write("ta_notify_dom_lock_offset", "%lld", + priv->ta_notify_dom_lock_offset); + } return 0; } @@ -4904,14 +4911,19 @@ afr_priv_dump(xlator_t *this) */ static int -find_child_index(xlator_t *this, xlator_t *child) +afr_find_child_index(xlator_t *this, xlator_t *child) { afr_private_t *priv = NULL; + int child_count = -1; int i = -1; priv = this->private; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } - for (i = 0; i < priv->child_count; i++) { + for (i = 0; i < child_count; i++) { if ((xlator_t *)child == priv->children[i]) break; } @@ -5310,6 +5322,103 @@ __afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, priv->last_event[idx] = *event; } +void +afr_ta_lock_release_synctask(xlator_t *this) +{ + call_frame_t *ta_frame = NULL; + int ret = 0; + + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + return; + } + + ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta, + afr_ta_lock_release_done, ta_frame, this); + if (ret) { + STACK_DESTROY(ta_frame->root); + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to release " + "AFR_TA_DOM_NOTIFY lock."); + } +} + +static void +afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_inodelk_contention *lc = 
NULL; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + afr_private_t *priv = this->private; + + lc = upcall->data; + + if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0) + return; + + if (priv->shd.iamshd) { + /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */ + return; + } + LOCK(&priv->lock); + { + priv->release_ta_notify_dom_lock = _gf_true; + inmem_count = priv->ta_in_mem_txn_count; + onwire_count = priv->ta_on_wire_txn_count; + } + UNLOCK(&priv->lock); + if (inmem_count || onwire_count) + /* lock release will happen in txn code path after + * inflight or on-wire txns are over.*/ + return; + + afr_ta_lock_release_synctask(this); +} + +static void +afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_cache_invalidation *up_ci = NULL; + afr_private_t *priv = this->private; + inode_t *inode = NULL; + inode_table_t *itable = NULL; + int i = 0; + + switch (upcall->event_type) { + case GF_UPCALL_INODELK_CONTENTION: + afr_handle_inodelk_contention(this, upcall); + break; + case GF_UPCALL_CACHE_INVALIDATION: + up_ci = (struct gf_upcall_cache_invalidation *)upcall->data; + + /* Since md-cache will be aggressively filtering + * lookups, the stale read issue will be more + * pronounced. 
Hence when a pending xattr is set notify + * all the md-cache clients to invalidate the existing + * stat cache and send the lookup next time */ + if (!up_ci->dict) + break; + for (i = 0; i < priv->child_count; i++) { + if (!dict_get(up_ci->dict, priv->pending_key[i])) + continue; + up_ci->flags |= UP_INVAL_ATTR; + itable = ((xlator_t *)this->graph->top)->itable; + /*Internal processes may not have itable for + *top xlator*/ + if (itable) + inode = inode_find(itable, upcall->gfid); + if (inode) + afr_inode_need_refresh_set(inode, this); + break; + } + break; + default: + break; + } +} + int32_t afr_notify(xlator_t *this, int32_t event, void *data, void *data2) { @@ -5327,10 +5436,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) dict_t *output = NULL; gf_boolean_t had_quorum = _gf_false; gf_boolean_t has_quorum = _gf_false; - struct gf_upcall *up_data = NULL; - struct gf_upcall_cache_invalidation *up_ci = NULL; - inode_table_t *itable = NULL; - inode_t *inode = NULL; int64_t halo_max_latency_msec = 0; int64_t child_latency_msec = -1; @@ -5358,7 +5463,7 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) * subsequent revalidate lookup happens on all the dht's subvolumes * which triggers afr self-heals if any. 
*/ - idx = find_child_index(this, child_xlator); + idx = afr_find_child_index(this, child_xlator); if (idx < 0) { gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, "Received child_up from invalid subvolume"); @@ -5407,6 +5512,10 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) goto out; } + if (event == GF_EVENT_UPCALL) { + afr_handle_upcall_event(this, data); + } + LOCK(&priv->lock); { had_heard_from_all = __get_heard_from_all_status(this); @@ -5416,12 +5525,22 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) propagate = 1; break; case GF_EVENT_CHILD_UP: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 1; + break; + } __afr_handle_child_up_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); break; case GF_EVENT_CHILD_DOWN: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 0; + break; + } __afr_handle_child_down_event(this, child_xlator, idx, child_latency_msec, &event, &call_psh, &up_child); @@ -5435,34 +5554,6 @@ afr_notify(xlator_t *this, int32_t event, void *data, void *data2) case GF_EVENT_SOME_DESCENDENT_DOWN: priv->last_event[idx] = event; break; - case GF_EVENT_UPCALL: - up_data = (struct gf_upcall *)data; - if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) - break; - up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; - - /* Since md-cache will be aggressively filtering - * lookups, the stale read issue will be more - * pronounced. 
Hence when a pending xattr is set notify - * all the md-cache clients to invalidate the existing - * stat cache and send the lookup next time */ - if (!up_ci->dict) - break; - for (i = 0; i < priv->child_count; i++) { - if (dict_get(up_ci->dict, priv->pending_key[i])) { - up_ci->flags |= UP_INVAL_ATTR; - itable = ((xlator_t *)this->graph->top)->itable; - /*Internal processes may not have itable for top - * xlator*/ - if (itable) - inode = inode_find(itable, up_data->gfid); - if (inode) - afr_inode_need_refresh_set(inode, this); - - break; - } - } - break; default: propagate = 1; break; @@ -5602,6 +5693,10 @@ afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) } local->need_full_crawl = _gf_false; + if (priv->thin_arbiter_count) { + local->ta_child_up = priv->ta_child_up; + local->ta_failed_subvol = AFR_CHILD_UNKNOWN; + } INIT_LIST_HEAD(&local->healer); return 0; @@ -5715,6 +5810,8 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this) ret = 0; INIT_LIST_HEAD(&local->transaction.wait_list); INIT_LIST_HEAD(&local->transaction.owner_list); + INIT_LIST_HEAD(&local->ta_waitq); + INIT_LIST_HEAD(&local->ta_onwireq); out: return ret; } @@ -6703,9 +6800,6 @@ afr_ta_is_fop_called_from_synctask(xlator_t *this) int afr_ta_post_op_lock(xlator_t *this, loc_t *loc) { - /*Note: At any given time, only one instance of this function must - * be in progress.*/ - int ret = 0; uuid_t gfid = { 0, @@ -6720,6 +6814,11 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) }; int32_t cmd = 0; + /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock + * has been released in afr_notify due to upcall notification from shd. 
+ */ + GF_ASSERT(priv->ta_notify_dom_lock_offset == 0); + if (!priv->shd.iamshd) GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); flock1.l_type = F_WRLCK; @@ -6731,14 +6830,10 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) flock1.l_len = 0; } else { cmd = F_SETLK; - if (priv->ta_notify_dom_lock_offset) { - flock1.l_start = priv->ta_notify_dom_lock_offset; - } else { - gf_uuid_generate(gfid); - flock1.l_start = gfid_to_ino(gfid); - if (flock1.l_start < 0) - flock1.l_start = -flock1.l_start; - } + gf_uuid_generate(gfid); + flock1.l_start = gfid_to_ino(gfid); + if (flock1.l_start < 0) + flock1.l_start = -flock1.l_start; flock1.l_len = 1; } ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], @@ -6764,7 +6859,7 @@ afr_ta_post_op_lock(xlator_t *this, loc_t *loc) AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL); if (ret) { gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, - "Failed to get AFR_TA_DOM_MODIFY lock."); + "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name); flock1.l_type = F_UNLCK; ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL, @@ -6829,3 +6924,18 @@ afr_ta_frame_create(xlator_t *this) afr_set_lk_owner(frame, this, lk_owner); return frame; } + +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local) +{ + int data_count = 0; + + data_count = AFR_COUNT(local->child_up, priv->child_count); + if (data_count == 2) { + return _gf_true; + } else if (data_count == 1 && local->ta_child_up) { + return _gf_true; + } + + return _gf_false; +} diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 1df39c35fce..1cd5c2eee3b 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -261,6 +261,8 @@ afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this) if (!ta_frame) { local->op_ret = -1; local->op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 
AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); goto out; } ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done, @@ -440,6 +442,12 @@ afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode, goto read; } + if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) { + local->op_ret = -1; + local->op_errno = -afr_quorum_errno(priv); + goto read; + } + if (priv->thin_arbiter_count && AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { afr_ta_read_txn_synctask(frame, this); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index cf153dea9cc..fb78c198d9c 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -32,7 +32,7 @@ void __afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared); void -afr_changelog_post_op(call_frame_t *frame, xlator_t *this); +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this); int afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this); @@ -53,6 +53,90 @@ afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr, afr_changelog_resume_t changelog_resume, afr_xattrop_type_t op); +static void +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this); + +static int +afr_ta_post_op_do(void *opaque); + +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local); + +static int +afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this); + +static void +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno); + +static void +afr_ta_process_waitq(xlator_t *this) +{ + afr_local_t *entry = NULL; + afr_private_t *priv = this->private; + struct list_head waitq = { + 0, + }; + + INIT_LIST_HEAD(&waitq); + LOCK(&priv->lock); + list_splice_init(&priv->ta_waitq, &waitq); + UNLOCK(&priv->lock); + list_for_each_entry(entry, &waitq, ta_waitq) + { + afr_ta_decide_post_op_state(entry->transaction.frame, this); + } +} + 
+int +afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque) +{ + afr_ta_process_waitq(ta_frame->this); + STACK_DESTROY(ta_frame->root); + return 0; +} + +int +afr_release_notify_lock_for_ta(void *opaque) +{ + xlator_t *this = NULL; + afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + int ret = -1; + + this = (xlator_t *)opaque; + priv = this->private; + ret = afr_fill_ta_loc(this, &loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to populate loc for thin-arbiter."); + goto out; + } + flock.l_type = F_UNLCK; + flock.l_start = priv->ta_notify_dom_lock_offset; + flock.l_len = 1; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL); + if (!ret) { + LOCK(&priv->lock); + priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; + priv->release_ta_notify_dom_lock = _gf_false; + priv->ta_notify_dom_lock_offset = 0; + UNLOCK(&priv->lock); + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_NOTIFY lock."); + } + +out: + loc_wipe(&loc); + return ret; +} + void afr_zero_fill_stat(afr_local_t *local) { @@ -576,18 +660,109 @@ afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending) return ret; } +static void +afr_ta_dom_lock_check_and_release(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = this->private; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + gf_boolean_t release = _gf_false; + + LOCK(&priv->lock); + { + /*Once we get notify lock release upcall notification, + if two fop states are non empty/non zero, we will + not release lock. 
+ 1 - If anything in memory txn + 2 - If anything in onwire or onwireq + */ + if (local->fop_state == TA_INFO_IN_MEMORY_SUCCESS) { + inmem_count = --priv->ta_in_mem_txn_count; + } else { + inmem_count = priv->ta_in_mem_txn_count; + } + onwire_count = priv->ta_on_wire_txn_count; + release = priv->release_ta_notify_dom_lock; + } + UNLOCK(&priv->lock); + + if (inmem_count != 0 || release == _gf_false || onwire_count != 0) + return; + + afr_ta_lock_release_synctask(this); +} + +static void +afr_ta_process_onwireq(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *entry = NULL; + int bad_child = AFR_CHILD_UNKNOWN; + + struct list_head onwireq = { + 0, + }; + INIT_LIST_HEAD(&onwireq); -/* {{{ pending */ + LOCK(&priv->lock); + { + if (--priv->ta_on_wire_txn_count == 0) { + UNLOCK(&priv->lock); + /*Only one write fop came and after taking notify + *lock and before doing xattrop, it has received + *lock contention upcall, so this is the only place + *to find this out and release the lock*/ + afr_ta_dom_lock_check_and_release(local, this); + return; + } + bad_child = priv->ta_bad_child_index; + if (bad_child == AFR_CHILD_UNKNOWN) { + /*The previous on-wire ta_post_op was a failure. Just dequeue + *one element to wind on-wire again. */ + entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq); + list_del_init(&entry->ta_onwireq); + } else { + /* Prepare to process all fops based on bad_child_index. 
*/ + list_splice_init(&priv->ta_onwireq, &onwireq); + } + } + UNLOCK(&priv->lock); + + if (entry) { + afr_ta_post_op_synctask(this, entry); + return; + } else { + while (!list_empty(&onwireq)) { + entry = list_entry(onwireq.next, afr_local_t, ta_onwireq); + list_del_init(&entry->ta_onwireq); + LOCK(&priv->lock); + --priv->ta_on_wire_txn_count; + UNLOCK(&priv->lock); + if (entry->ta_failed_subvol == bad_child) { + afr_changelog_post_op_do(entry->transaction.frame, this); + } else { + afr_changelog_post_op_fail(entry->transaction.frame, this, EIO); + } + } + } +} int afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; local = frame->local; + priv = this->private; int_lock = &local->internal_lock; + if (priv->thin_arbiter_count) { + /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */ + afr_ta_dom_lock_check_and_release(frame->local, this); + } + /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */ if (!afr_changelog_has_quorum(local, this)) { local->op_ret = -1; @@ -605,6 +780,20 @@ afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this) return 0; } +static void +afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno) +{ + afr_local_t *local = frame->local; + local->op_ret = -1; + local->op_errno = op_errno; + + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB, + "Failing %s for gfid %s. 
Fop state is:%d", gf_fop_list[local->op], + uuid_utoa(local->inode->gfid), local->fop_state); + + afr_changelog_post_op_done(frame, this); +} + unsigned char * afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock) { @@ -983,8 +1172,240 @@ out: return ret; } -int -afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) +static int +afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque) +{ + xlator_t *this = NULL; + afr_local_t *local = NULL; + + local = (afr_local_t *)opaque; + this = frame->this; + + STACK_DESTROY(frame->root); + afr_ta_process_onwireq(local, this); + + return 0; +} + +static int +afr_ta_post_op_do(void *opaque) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t *this = NULL; + call_frame_t *txn_frame = NULL; + dict_t *xattr = NULL; + int **pending = NULL; + int failed_subvol = -1; + int success_subvol = -1; + loc_t loc = { + 0, + }; + int idx = 0; + int i = 0; + int ret = 0; + + local = (afr_local_t *)opaque; + txn_frame = local->transaction.frame; + this = txn_frame->this; + priv = this->private; + idx = afr_index_for_transaction_type(local->transaction.type); + + ret = afr_fill_ta_loc(this, &loc); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to populate loc for thin-arbiter."); + goto out; + } + + xattr = dict_new(); + if (!xattr) { + ret = -ENOMEM; + goto out; + } + + pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!pending) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) { + pending[i][idx] = hton32(1); + failed_subvol = i; + } else { + success_subvol = i; + } + } + + ret = afr_set_pending_dict(priv, xattr, pending); + if (ret < 0) + goto out; + + ret = afr_ta_post_op_lock(this, &loc); + if (ret) + goto out; + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL); + 
LOCK(&priv->lock); + { + if (ret == 0) { + priv->ta_bad_child_index = failed_subvol; + } else if (ret == -EINVAL) { + priv->ta_bad_child_index = success_subvol; + } + } + UNLOCK(&priv->lock); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Post-op on thin-arbiter id file %s failed for gfid %s.", + priv->pending_key[THIN_ARBITER_BRICK_INDEX], + uuid_utoa(local->inode->gfid)); + if (ret == -EINVAL) + ret = -EIO; /* TA failed the fop. Return EIO to application. */ + } + + afr_ta_post_op_unlock(this, &loc); +out: + if (xattr) + dict_unref(xattr); + + if (pending) + afr_matrix_cleanup(pending, priv->child_count); + + loc_wipe(&loc); + + if (ret == 0) { + /*Mark pending xattrs on the up data brick.*/ + afr_changelog_post_op_do(local->transaction.frame, this); + } else { + afr_changelog_post_op_fail(local->transaction.frame, this, -ret); + } + return ret; +} + +static int +afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local) +{ + call_frame_t *ta_frame = NULL; + int ret = 0; + + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + goto err; + } + ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done, + ta_frame, local); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to launch post-op on thin arbiter for gfid %s", + uuid_utoa(local->inode->gfid)); + STACK_DESTROY(ta_frame->root); + goto err; + } + + return ret; +err: + afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM); + return ret; +} + +static void +afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local, + int *on_wire_count) +{ + LOCK(&priv->lock); + { + if (priv->release_ta_notify_dom_lock == _gf_true) { + /* Put the fop in waitq until notify dom lock is released.*/ + local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL; + list_add_tail(&local->ta_waitq, &priv->ta_waitq); + } else if (priv->ta_bad_child_index == 
AFR_CHILD_UNKNOWN) { + /* Post-op on thin-arbiter to decide success/failure. */ + local->fop_state = TA_GET_INFO_FROM_TA_FILE; + *on_wire_count = ++priv->ta_on_wire_txn_count; + if (*on_wire_count > 1) { + /*Avoid sending multiple on-wire post-ops on TA*/ + list_add_tail(&local->ta_onwireq, &priv->ta_onwireq); + } + } else if (local->ta_failed_subvol == priv->ta_bad_child_index) { + /* Post-op on TA not needed as the fop failed on the in-memory bad + * brick. Just mark pending xattrs on the good data brick.*/ + local->fop_state = TA_INFO_IN_MEMORY_SUCCESS; + priv->ta_in_mem_txn_count++; + } else { + /* Post-op on TA not needed as the fop succeeded only on the + * in-memory bad data brick and not the good one. Fail the fop.*/ + local->fop_state = TA_INFO_IN_MEMORY_FAILED; + } + } + UNLOCK(&priv->lock); +} + +static void +afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local) +{ + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.failed_subvols[i]) { + local->ta_failed_subvol = i; + break; + } + } +} + +static void +afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int on_wire_count = 0; + + priv = this->private; + local = frame->local; + + afr_ta_set_fop_state(priv, local, &on_wire_count); + + switch (local->fop_state) { + case TA_GET_INFO_FROM_TA_FILE: + if (on_wire_count == 1) + afr_ta_post_op_synctask(this, local); + /*else, fop is queued in ta_onwireq.*/ + break; + case TA_WAIT_FOR_NOTIFY_LOCK_REL: + /*Post releasing the notify lock, we will act on this queue*/ + break; + case TA_INFO_IN_MEMORY_SUCCESS: + afr_changelog_post_op_do(frame, this); + break; + case TA_INFO_IN_MEMORY_FAILED: + afr_changelog_post_op_fail(frame, this, EIO); + break; + } + return; +} + +static void +afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + + 
afr_ta_fill_failed_subvol(priv, local); + gf_msg_debug(this->name, 0, + "Fop failed on data brick (%s) for gfid=%s. " + "ta info needed to decide fop result.", + priv->children[local->ta_failed_subvol]->name, + uuid_utoa(local->inode->gfid)); + afr_ta_decide_post_op_state(frame, this); +} + +void +afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this) { afr_private_t *priv = this->private; afr_local_t *local = NULL; @@ -1001,9 +1422,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) xattr = dict_new(); if (!xattr) { - local->op_ret = -1; - local->op_errno = ENOMEM; - afr_changelog_post_op_done(frame, this); + afr_changelog_post_op_fail(frame, this, ENOMEM); goto out; } @@ -1030,9 +1449,8 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) } if (local->transaction.in_flight_sb) { - local->op_ret = -1; - local->op_errno = local->transaction.in_flight_sb_errno; - afr_changelog_post_op_done(frame, this); + afr_changelog_post_op_fail(frame, this, + local->transaction.in_flight_sb_errno); goto out; } @@ -1043,17 +1461,7 @@ afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) ret = afr_set_pending_dict(priv, xattr, local->pending); if (ret < 0) { - local->op_ret = -1; - local->op_errno = ENOMEM; - afr_changelog_post_op_done(frame, this); - goto out; - } - - ret = afr_changelog_thin_arbiter_post_op(this, local); - if (ret < 0) { - local->op_ret = -1; - local->op_errno = -ret; - afr_changelog_post_op_done(frame, this); + afr_changelog_post_op_fail(frame, this, ENOMEM); goto out; } @@ -1066,9 +1474,7 @@ set_dirty: ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty, sizeof(int) * AFR_NUM_CHANGE_LOGS); if (ret) { - local->op_ret = -1; - local->op_errno = ENOMEM; - afr_changelog_post_op_done(frame, this); + afr_changelog_post_op_fail(frame, this, ENOMEM); goto out; } @@ -1078,6 +1484,32 @@ out: if (xattr) dict_unref(xattr); + return; +} + +static int +afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this) +{ + 
afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int failed_count = 0; + + priv = this->private; + local = frame->local; + + if (priv->thin_arbiter_count) { + failed_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); + if (failed_count == 1) { + afr_handle_failure_using_thin_arbiter(frame, this); + return 0; + } else { + /* Txn either succeeded or failed on both data bricks. Let + * post_op_do handle it as the case might be. */ + } + } + + afr_changelog_post_op_do(frame, this); return 0; } @@ -2457,6 +2889,11 @@ afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type) goto out; } + if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) { + ret = -afr_quorum_errno(priv); + goto out; + } + ret = afr_transaction_local_init(local, this); if (ret < 0) goto out; diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index fff8c65e976..35a922544bc 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -67,4 +67,9 @@ afr_lock(call_frame_t *frame, xlator_t *this); void afr_delayed_changelog_wake_up_cbk(void *data); +int +afr_release_notify_lock_for_ta(void *opaque); + +int +afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index 26950fd7927..5d5e536ff60 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -336,6 +336,22 @@ out: return ret; } +void +afr_ta_init(afr_private_t *priv) +{ + priv->thin_arbiter_count = 1; + priv->child_count--; + priv->ta_child_up = 0; + priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; + priv->ta_notify_dom_lock_offset = 0; + priv->ta_in_mem_txn_count = 0; + priv->ta_on_wire_txn_count = 0; + priv->release_ta_notify_dom_lock = _gf_false; + INIT_LIST_HEAD(&priv->ta_waitq); + INIT_LIST_HEAD(&priv->ta_onwireq); + *priv->ta_gfid = 0; +} + 
int32_t init(xlator_t *this) { @@ -380,11 +396,7 @@ init(xlator_t *this) GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out); GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out); if (thin_arbiter && strlen(thin_arbiter) > 0) { - priv->thin_arbiter_count = 1; - priv->child_count--; - priv->ta_bad_child_index = AFR_CHILD_UNKNOWN; - priv->ta_notify_dom_lock_offset = 0; - *priv->ta_gfid = 0; + afr_ta_init(priv); } INIT_LIST_HEAD(&priv->healing); INIT_LIST_HEAD(&priv->heal_waiting); diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 3d2c1950571..6f8015380f0 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -107,8 +107,20 @@ typedef enum { AFR_CHILD_UNKNOWN = -1, AFR_CHILD_ZERO, AFR_CHILD_ONE, + AFR_CHILD_THIN_ARBITER, } afr_child_index; +typedef enum { + TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall + notification and waiting for its release.*/ + TA_GET_INFO_FROM_TA_FILE, /*FOP needs post-op on ta file to get + *info about which brick is bad.*/ + TA_INFO_IN_MEMORY_SUCCESS, /*Bad brick info is in memory and fop failed + *on BAD brick - Success*/ + TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed + *on GOOD brick - Failed*/ +} afr_ta_fop_state_t; + struct afr_nfsd { gf_boolean_t iamnfsd; uint32_t halo_max_latency_msec; @@ -127,8 +139,14 @@ typedef struct _afr_private { /* For thin-arbiter. 
*/ unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/ uuid_t ta_gfid; + unsigned char ta_child_up; int ta_bad_child_index; off_t ta_notify_dom_lock_offset; + gf_boolean_t release_ta_notify_dom_lock; + unsigned int ta_in_mem_txn_count; + unsigned int ta_on_wire_txn_count; + struct list_head ta_waitq; + struct list_head ta_onwireq; unsigned char *child_up; int64_t *child_latency; @@ -855,6 +873,13 @@ typedef struct _afr_local { gf_boolean_t is_read_txn; afr_inode_ctx_t *inode_ctx; + + /*For thin-arbiter transactions.*/ + unsigned char ta_child_up; + struct list_head ta_waitq; + struct list_head ta_onwireq; + afr_ta_fop_state_t fop_state; + int ta_failed_subvol; } afr_local_t; typedef struct afr_spbc_timeout { @@ -1289,4 +1314,10 @@ __afr_get_up_children_count(afr_private_t *priv); call_frame_t * afr_ta_frame_create(xlator_t *this); + +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local); + +void +afr_ta_lock_release_synctask(xlator_t *this); #endif /* __AFR_H__ */ |