diff options
Diffstat (limited to 'xlators/cluster/afr/src')
21 files changed, 3496 insertions, 906 deletions
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index a9acb4094..af01f2ef2 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -51,7 +51,7 @@ #define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL #define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL - +#define AFR_STATISTICS_HISTORY_SIZE 50 int afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, gf_boolean_t fail_conflict); @@ -779,6 +779,12 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) sh = &local->self_heal; priv = this->private; + if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) + GF_FREE (sh->data_sh_info); + + if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) + GF_FREE (sh->metadata_sh_info); + GF_FREE (sh->buf); GF_FREE (sh->parentbufs); @@ -826,7 +832,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; @@ -836,7 +843,9 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->internal_lock.locked_nodes); - GF_FREE (local->internal_lock.inode_locked_nodes); + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } GF_FREE (local->internal_lock.lower_locked_nodes); @@ -1071,6 +1080,88 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) uuid_copy (loc->pargfid, postparent->ia_gfid); } +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (afr_local_t *local, xlator_t *this, + dict_t *rsp_dict) +{ + int32_t *sources = NULL; + dict_t *xattr = NULL; + data_t *max_data = NULL; + int64_t max_quota_size = -1; + data_t *data = NULL; + int64_t *size = NULL; + int64_t quota_size = -1; + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + gf_boolean_t source_present = _gf_false; + + priv = this->private; + sources = local->cont.lookup.sources; + + if (rsp_dict == NULL) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " + "response dictionary", local->loc.path); + return; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source_present = _gf_true; + break; + } + } + + for (i = 0; i < priv->child_count; i++) { + /* + * If there is at least one source lets check + * for maximum quota sizes among sources, otherwise take the + * maximum of the ones present to be on the safer side. + */ + if (source_present && !sources[i]) + continue; + + xattr = local->cont.lookup.xattrs[i]; + if (!xattr) + continue; + + data = dict_get (xattr, QUOTA_SIZE_KEY); + if (!data) + continue; + + size = (int64_t*)data->data; + quota_size = ntoh64(*size); + gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, + local->loc.path, i, quota_size); + if (quota_size > max_quota_size) { + if (max_data) + data_unref (max_data); + + max_quota_size = quota_size; + max_data = data_ref (data); + } + } + + if (max_data) { + ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "quota size", local->loc.path); + } + + data_unref (max_data); + } +} + int afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) { @@ -1115,13 +1206,18 @@ afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) ret = -1; goto out; } + gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", read_child); if (!*xattr) *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); + *buf = local->cont.lookup.bufs[read_child]; *postparent = local->cont.lookup.postparents[read_child]; + if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) + afr_handle_quota_size (local, this, *xattr); + if (IA_INVAL == local->cont.lookup.inode->ia_type) { /* fix for RT #602 */ local->cont.lookup.inode->ia_type = buf->ia_type; @@ -1270,7 +1366,7 @@ afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { /* mismatching gfid */ - gf_log (this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_DEBUG, "%s: gfid different on subvolume", local->loc.path); } } @@ -1494,7 +1590,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, child2 = &bufs[success_children[i-1]]; if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype " + gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " "differs on subvolumes (%d, %d)", path, success_children[i-1], success_children[i]); conflicting = _gf_true; @@ -1503,7 +1599,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, if (!gfid || uuid_is_null (child1->ia_gfid)) continue; if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs" + gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" " on subvolume %d", path, success_children[i]); conflicting = _gf_true; goto out; @@ -1632,7 +1728,6 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, afr_local_t *local = NULL; int ret = -1; dict_t *xattr = NULL; - int32_t spb = 0; local = frame->local; @@ -1663,10 +1758,6 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, local->loc.path, local->self_heal.actual_sh_started); } - - if (local->loc.inode) - spb = afr_is_split_brain (this, local->loc.inode); - ret = dict_set_int32 (xattr, "split-brain", spb); } out: AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, @@ -1741,7 +1832,8 @@ afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, afr_lookup_set_self_heal_params (local, this); if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local)) + if (afr_is_transaction_running (local) && + (!local->allow_sh_for_running_transaction)) goto out; reason = "lookup detected pending operations"; @@ -2336,7 +2428,7 @@ afr_lookup (call_frame_t *frame, xlator_t *this, int call_count = 0; uint64_t ctx = 0; int32_t op_errno = 0; - + int allow_sh = 0; priv = this->private; AFR_LOCAL_ALLOC_OR_GOTO (local, out); @@ -2353,6 +2445,13 @@ afr_lookup (call_frame_t *frame, xlator_t *this, goto out; } + if (local->loc.path && + (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { + op_errno = EPERM; + ret = -1; + goto out; + } + ret = inode_ctx_get (local->loc.inode, this, &ctx); if (ret == 0) { /* lookup is a revalidate */ @@ -2401,6 +2500,11 @@ afr_lookup (call_frame_t *frame, xlator_t *this, /* By default assume ENOTCONN. On success it will be set to 0. */ local->op_errno = ENOTCONN; + ret = dict_get_int32 (xattr_req, "allow-sh-for-running-transaction", + &allow_sh); + dict_del (xattr_req, "allow-sh-for-running-transaction"); + local->allow_sh_for_running_transaction = allow_sh; + ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, &gfid_req); if (ret) { @@ -2578,15 +2682,42 @@ afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, return 0; } +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + + priv = this->private; + local = frame->local; + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_flush_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + local->fd, NULL); + if (!--call_count) + break; + + } + } + + return 0; +} + int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_stub_t *stub = NULL; int ret = -1; int op_errno = 0; - int call_count = -1; - int i = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2602,23 +2733,14 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) goto out; local->fd = fd_ref(fd); - call_count = local->call_count; - - afr_delayed_changelog_wake_up (this, fd); - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - local->fd, NULL); - - if (!--call_count) - break; - } + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) { + ret = -1; + op_errno = ENOMEM; + goto out; } + afr_delayed_changelog_wake_resume (this, fd, stub); ret = 0; out: @@ -2694,6 +2816,16 @@ afr_release (xlator_t *this, fd_t *fd) /* {{{ fsync */ int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) @@ -2718,13 +2850,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = 0; if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } local->success_count++; @@ -2746,10 +2878,11 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, post-op. This guarantee is expected by FUSE graph switching for example. */ - stub = fop_fsync_cbk_stub (frame, default_fsync_cbk, + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf, xdata); + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + xdata); if (!stub) { AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); return 0; @@ -3966,6 +4099,8 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) goto out; } + local->append_write = _gf_false; + ret = 0; out: return ret; @@ -3977,11 +4112,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, { int ret = -ENOMEM; - lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->inode_locked_nodes) - goto out; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), child_count, gf_afr_mt_char); if (NULL == lk->locked_nodes) @@ -4040,6 +4170,21 @@ out: } int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: + return ret; +} + +int afr_transaction_local_init (afr_local_t *local, xlator_t *this) { int child_up_count = 0; @@ -4052,6 +4197,14 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (ret < 0) goto out; + if ((local->transaction.type == AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + ret = -ENOMEM; child_up_count = afr_up_children_count (local->child_up, priv->child_count); @@ -4278,6 +4431,16 @@ afr_priv_destroy (afr_private_t *priv) if (priv->shd.split_brain) eh_destroy (priv->shd.split_brain); + for (i = 0; i < priv->child_count; i++) + { + if (priv->shd.statistics[i]) + eh_destroy (priv->shd.statistics[i]); + } + + GF_FREE (priv->shd.statistics); + + GF_FREE (priv->shd.crawl_events); + GF_FREE (priv->last_event); if (priv->pending_key) { for (i = 0; i < priv->child_count; i++) @@ -4356,3 +4519,73 @@ afr_is_fd_fixable (fd_t *fd) return _gf_true; } + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + inode_t *inode = NULL; + afr_inode_ctx_t *ctx = NULL; + + local = frame->local; + + if (local->fd) + inode = local->fd->inode; + else + inode = local->loc.inode; + + if (!inode) + return; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); + ctx->open_fd_count = local->open_fd_count; + } + UNLOCK (&inode->lock); +} + +int +afr_initialise_statistics (xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = -1; + int i = 0; + int child_count = 0; + eh_t *stats_per_brick = NULL; + shd_crawl_event_t ***shd_crawl_events = NULL; + priv = this->private; + + priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!priv->shd.statistics) { + ret = -1; + goto out; + } + child_count = priv->child_count; + for (i=0; i < child_count ; i++) { + stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + _destroy_crawl_event_data); + if (!stats_per_brick) { + ret = -1; + goto out; + } + priv->shd.statistics[i] = stats_per_brick; + + } + + shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); + *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), + priv->child_count, + gf_afr_mt_shd_crawl_event_t); + + if (!priv->shd.crawl_events) { + ret = -1; + goto out; + } + ret = 0; +out: + return ret; + +} diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 0e1718814..689dd84e6 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -253,7 +253,7 @@ unlock: goto out; if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1) { + up_children_count > 1 && priv->entry_self_heal) { /* * This is the first opendir on this inode. We need @@ -383,14 +383,36 @@ afr_forget_entries (fd_t *fd) } } +static void +afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) +{ + gf_dirent_t * entry = NULL; + gf_dirent_t * tmp = NULL; + + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + list_del_init (&entry->list); + GF_FREE (entry); + } + } +} int32_t afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) { - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); + afr_local_t *local = NULL; + if (op_ret == -1) + goto out; + + local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); + +out: + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); return 0; } @@ -400,8 +422,16 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) { - AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); + afr_local_t *local = NULL; + + if (op_ret == -1) + goto out; + + local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); +out: + AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); return 0; } diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 5a22696ce..1943b719b 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -235,26 +235,6 @@ out: } void -afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) -{ - xlator_t *this = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - this = frame->this; - local = frame->local; - priv = this->private; - - if (local->op_ret >= 0) - goto out; - - __mark_all_success (local->pending, priv->child_count, - local->transaction.type); -out: - return; -} - -void afr_dir_fop_done (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; @@ -273,7 +253,6 @@ afr_dir_fop_done (call_frame_t *frame, xlator_t *this) done: local->transaction.unwind (frame, this); afr_dir_fop_mark_entry_pending_changelog (frame, this); - afr_dir_fop_handle_all_fop_failures (frame); local->transaction.resume (frame, this); } diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 051b7b164..e06e3b2f2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -117,6 +117,8 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -232,6 +234,8 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -348,10 +352,7 @@ afr_fstat (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd->inode, out); - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + AFR_SBRAIN_CHECK_FD (fd, out); AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -472,6 +473,8 @@ afr_readlink (call_frame_t *frame, xlator_t *this, children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -1323,6 +1326,62 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, return ret; } +static int +afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +{ + int ret = 0; + + if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_min_stime (THIS, data, key, value); + + return ret; +} + +int32_t +afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; + } + + if (!local->dict) + local->dict = dict_copy_with_ref (dict, NULL); + else + dict_foreach (dict, afr_aggregate_stime_xattr, + local->dict); + local->op_ret = 0; + } + +cleanup: + UNLOCK (&frame->lock); + + if (!callcnt) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, local->dict, xdata); + } + +out: + return 0; +} + + static gf_boolean_t afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, gf_boolean_t is_fgetxattr) @@ -1355,6 +1414,8 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, } else { *cbk = afr_getxattr_lockinfo_cbk; } + } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; } else { is_spl = _gf_false; } @@ -1403,7 +1464,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, int32_t read_child = -1; int ret = -1; fop_getxattr_cbk_t cbk = NULL; - + int afr_xtime_gauge[MCNT_MAX] = {0,}; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1414,6 +1475,8 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -1452,6 +1515,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, sub_volumes, priv->child_count, MARKER_UUID_TYPE, + marker_uuid_default_gauge, priv->vol_uuid)) { gf_log (this->name, GF_LOG_INFO, @@ -1498,12 +1562,20 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, } + /* don't err out on getting ENOTCONN (brick down) + * from a subset of the bricks + */ + memcpy (afr_xtime_gauge, marker_xtime_default_gauge, + sizeof (afr_xtime_gauge)); + afr_xtime_gauge[MCNT_NOTFOUND] = 0; + afr_xtime_gauge[MCNT_ENOTCONN] = 0; if (cluster_getmarkerattr (frame, this, loc, name, local, afr_getxattr_unwind, sub_volumes, priv->child_count, MARKER_XTIME_TYPE, + afr_xtime_gauge, priv->vol_uuid)) { gf_log (this->name, GF_LOG_INFO, "%s: failed to get marker attr (%s)", @@ -1660,10 +1732,8 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this, children = priv->children; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + AFR_SBRAIN_CHECK_FD (fd, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); frame->local = local; @@ -1819,10 +1889,7 @@ afr_readv (call_frame_t *frame, xlator_t *this, priv = this->private; children = priv->children; - if (afr_is_split_brain (this, fd->inode)) { - op_errno = EIO; - goto out; - } + AFR_SBRAIN_CHECK_FD (fd, out); AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index ce4fbf226..c1ec69a55 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -39,6 +39,46 @@ #include "afr-transaction.h" #include "afr-self-heal-common.h" +void +__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, + xlator_t *this, int32_t *op_ret, int32_t *op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (afr_fop_failed (*op_ret, *op_errno)) { + local->child_errno[child_index] = *op_errno; + + switch (local->op) { + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + if (*op_errno != EFBIG) + afr_transaction_fop_failed (frame, this, + child_index); + break; + default: + afr_transaction_fop_failed (frame, this, child_index); + break; + } + local->op_errno = *op_errno; + goto out; + } + + if ((local->success_count == 0) || (read_child == child_index)) { + local->op_ret = *op_ret; + if (prebuf) + local->cont.inode_wfop.prebuf = *prebuf; + if (postbuf) + local->cont.inode_wfop.postbuf = *postbuf; + } + + local->success_count++; +out: + return; +} + /* {{{ writev */ void @@ -52,8 +92,8 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) dst_local->op_ret = src_local->op_ret; dst_local->op_errno = src_local->op_errno; - dst_local->cont.writev.prebuf = src_local->cont.writev.prebuf; - dst_local->cont.writev.postbuf = src_local->cont.writev.postbuf; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; } void @@ -64,8 +104,8 @@ afr_writev_unwind (call_frame_t *frame, xlator_t *this) AFR_STACK_UNWIND (writev, frame, local->op_ret, local->op_errno, - &local->cont.writev.prebuf, - &local->cont.writev.postbuf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -133,12 +173,17 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; + afr_private_t *priv = NULL; call_frame_t *fop_frame = NULL; int child_index = (long) cookie; int call_count = -1; int read_child = 0; + int ret = 0; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; local = frame->local; + priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -148,12 +193,14 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + local->replies[child_index].valid = 1; local->replies[child_index].op_ret = op_ret; local->replies[child_index].op_errno = op_errno; - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); /* stage the best case return value for unwind */ if ((local->success_count == 0) || (op_ret > local->op_ret)) { @@ -162,12 +209,24 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if (op_ret != -1) { - if ((local->success_count == 0) || - (child_index == read_child)) { - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - local->success_count++; + if (xdata) { + ret = dict_get_uint32 (xdata, + GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if ((ret == 0) && + (open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; + } + + write_is_append = 0; + ret = dict_get_uint32 (xdata, + GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + } + } } UNLOCK (&frame->lock); @@ -176,10 +235,23 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) { - if (!local->stable_write) - afr_fd_report_unstable_write (this, local->fd); + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write (this, local->fd); afr_writev_handle_short_writes (frame, this); + if (afr_any_fops_failed (local, priv)) { + //Don't unwind until post-op is complete + local->transaction.resume (frame, this); + } else { /* * Generally inode-write fops do transaction.unwind then * transaction.resume, but writev needs to make sure that @@ -191,10 +263,11 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, * completed. */ - fop_frame = afr_transaction_detach_fop_frame (frame); - afr_writev_copy_outvars (frame, fop_frame); - local->transaction.resume (frame, this); - afr_writev_unwind (fop_frame, this); + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); + local->transaction.resume (frame, this); + afr_writev_unwind (fop_frame, this); + } } return 0; } @@ -206,6 +279,8 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; int i = 0; int call_count = -1; + dict_t *xdata = NULL; + GF_UNUSED int ret = 0; local = frame->local; priv = this->private; @@ -229,6 +304,19 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) return 0; } + xdata = dict_new (); + if (xdata) { + ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + sizeof (uint32_t)); + ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + 0); + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; + } + for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, @@ -241,13 +329,16 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) local->cont.writev.offset, local->cont.writev.flags, local->cont.writev.iobref, - NULL); + xdata); if (!--call_count) break; } } + if (xdata) + dict_unref (xdata); + return 0; } @@ -515,8 +606,8 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, - &local->cont.truncate.prebuf, - &local->cont.truncate.postbuf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -530,14 +621,11 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int read_child = 0; int call_count = -1; - int need_unwind = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); @@ -547,38 +635,22 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -606,6 +678,7 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { @@ -726,8 +799,8 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, - &local->cont.ftruncate.prebuf, - &local->cont.ftruncate.postbuf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } return 0; @@ -740,14 +813,11 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int call_count = -1; - int need_unwind = 0; int read_child = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -757,38 +827,22 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -816,6 +870,7 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { @@ -970,8 +1025,8 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, - &local->cont.setattr.preop_buf, - &local->cont.setattr.postop_buf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -1002,29 +1057,14 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1179,8 +1219,8 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, - &local->cont.fsetattr.preop_buf, - &local->cont.fsetattr.postop_buf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -1211,29 +1251,14 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1406,28 +1431,23 @@ int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1596,28 +1616,24 @@ int afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - if (local->success_count == priv->child_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1796,28 +1812,23 @@ int afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->wait_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1985,28 +1996,24 @@ int afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + if (local->success_count == priv->wait_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -2151,3 +2158,704 @@ out: return 0; } + +static int +afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fallocate, + local->fd, + local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_fallocate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_fallocate (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_FALLOCATE; + + local->transaction.fop = afr_fallocate_wind; + local->transaction.done = afr_fallocate_done; + local->transaction.unwind = afr_fallocate_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; + + /* fallocate can modify the file size */ + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(fallocate,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_fallocate (frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ discard */ + +static int +afr_discard_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_discard_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->discard, + local->fd, + local->cont.discard.offset, + local->cont.discard.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_discard_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_discard (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_DISCARD; + + local->transaction.fop = afr_discard_wind; + local->transaction.done = afr_discard_done; + local->transaction.unwind = afr_discard_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(discard, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.discard.offset = offset; + local->cont.discard.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_discard(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + + +/* {{{ zerofill */ + +static int +afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, + local->op_errno, + &local->cont.zerofill.prebuf, + &local->cont.zerofill.postbuf, + NULL); + } + return 0; +} + +static int +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + if (afr_fop_failed (op_ret, op_errno)) { + afr_transaction_fop_failed (frame, this, child_index); + } + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + if (child_index == read_child) { + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + local->success_count++; + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->zerofill, + local->fd, + local->cont.zerofill.offset, + local->cont.zerofill.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_zerofill_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_zerofill(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_ZEROFILL; + + local->transaction.fop = afr_zerofill_wind; + local->transaction.done = afr_zerofill_done; + local->transaction.unwind = afr_zerofill_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.zerofill.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, + AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(zerofill, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) { + goto out; + } + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_zerofill(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +/* }}} */ + + diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index ed11079fd..8e93ca44a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -68,4 +68,15 @@ int32_t afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata); +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index e091a7939..060d78f35 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -57,11 +57,15 @@ int afr_entry_lockee_cmp (const void *l1, const void *l2) { - const afr_entry_lockee_t *r1 = l1; - const afr_entry_lockee_t *r2 = l2; - int ret = 0; - - ret = uuid_compare (r1->loc.gfid, r2->loc.gfid); + const afr_entry_lockee_t *r1 = l1; + const afr_entry_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid ((loc_t*)&r1->loc, gfid1); + loc_gfid ((loc_t*)&r2->loc, gfid2); + ret = uuid_compare (gfid1, gfid2); /*Entrylks with NULL basename are the 'smallest'*/ if (ret == 0) { if (!r1->basename) @@ -75,7 +79,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2) return -1; else return 1; - } int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); @@ -556,19 +559,23 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; - int i = 0; + afr_inodelk_t *inodelk = NULL; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - int_lock->inodelk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - for (i = 0; i < priv->child_count; i++) { - int_lock->inode_locked_nodes[i] = 0; - } + inodelk->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + memset (inodelk->locked_nodes, 0, + sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); return 0; } @@ -649,7 +656,9 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; local = frame->local; int_lock = &local->internal_lock; @@ -658,14 +667,18 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, AFR_UNLOCK_OP, NULL, op_ret, op_errno, child_index); + priv = this->private; + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on %d " - "unlock by %s", local->loc.path, child_index, + gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s " + "with lock owner %s", local->loc.path, + priv->children[child_index]->name, lkowner_utoa (&frame->root->lk_owner)); } - int_lock->inode_locked_nodes[child_index] &= LOCKED_NO; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->locked_nodes[child_index] &= LOCKED_NO; if (local->transaction.eager_lock) local->transaction.eager_lock[child_index] = 0; @@ -679,6 +692,7 @@ static int afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; @@ -694,12 +708,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; flock.l_type = F_UNLCK; full_flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, + call_count = afr_locked_nodes_count (inodelk->locked_nodes, priv->child_count); int_lock->lk_call_count = call_count; @@ -715,8 +731,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) fd_ctx = afr_fd_ctx_get (local->fd, this); for (i = 0; i < priv->child_count; i++) { - if ((int_lock->inode_locked_nodes[i] & LOCKED_YES) - != LOCKED_YES) + if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) continue; if (local->fd) { @@ -757,7 +772,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long)i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLK, flock_use, NULL); if (!--call_count) @@ -772,7 +787,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long)i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLK, &flock, NULL); if (!--call_count) @@ -858,7 +873,7 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[index], priv->children[index]->fops->entrylk, - this->name, + int_lock->domain, &int_lock->lockee[lockee_no].loc, int_lock->lockee[lockee_no].basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); @@ -961,6 +976,7 @@ static int afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -971,10 +987,10 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - memcpy (int_lock->inode_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->inodelk_lock_count = int_lock->lock_count; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + memcpy (inodelk->locked_nodes, int_lock->locked_nodes, + sizeof (*inodelk->locked_nodes) * priv->child_count); + inodelk->lock_count = int_lock->lock_count; break; case AFR_ENTRY_RENAME_TRANSACTION: @@ -1025,6 +1041,7 @@ int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; @@ -1039,10 +1056,15 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) priv = this->private; child_index = cookie % priv->child_count; lockee_no = cookie / priv->child_count; + is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); + - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + if (!is_entrylk) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + } if (local->fd) { ret = fd_ctx_get (local->fd, this, &ctx); @@ -1064,8 +1086,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) } if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { - is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); - if ((is_entrylk && int_lock->entrylk_lock_count == 0) || (!is_entrylk && int_lock->lock_count == 0)) { gf_log (this->name, GF_LOG_INFO, @@ -1114,7 +1134,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLKW, &flock, NULL); } else { @@ -1127,7 +1147,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLKW, &flock, NULL); } @@ -1148,7 +1168,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->fentrylk, - this->name, local->fd, + int_lock->domain, local->fd, int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } else { @@ -1161,7 +1181,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->entrylk, - this->name, + int_lock->domain, &int_lock->lockee[lockee_no].loc, int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); @@ -1316,6 +1336,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); return -1; } @@ -1390,6 +1411,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; int call_count = 0; int child_index = (long) cookie; @@ -1398,6 +1420,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, @@ -1423,9 +1446,8 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->transaction.eager_lock) local->transaction.eager_lock[child_index] = 0; } else { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->inodelk_lock_count++; + inodelk->locked_nodes[child_index] |= LOCKED_YES; + inodelk->lock_count++; if (local->transaction.eager_lock && local->transaction.eager_lock[child_index] && @@ -1448,8 +1470,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "Last inode locking reply received"); /* all locks successful. Proceed to call FOP */ - if (int_lock->inodelk_lock_count == - int_lock->lk_expected_count) { + if (inodelk->lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; @@ -1473,6 +1494,7 @@ int afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; @@ -1488,11 +1510,13 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; - full_flock.l_type = int_lock->lk_flock.l_type; + full_flock.l_type = inodelk->flock.l_type; initialize_inodelk_variables (frame, this); @@ -1508,6 +1532,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); ret = -1; goto out; } @@ -1566,7 +1591,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLK, flock_use, NULL); if (!--call_count) @@ -1588,7 +1613,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLK, &flock, NULL); if (!--call_count) @@ -2112,29 +2137,38 @@ out: return ret; } -void -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { - afr_local_t *dst_local = NULL; - afr_local_t *src_local = NULL; - afr_internal_lock_t *dst_lock = NULL; - afr_internal_lock_t *src_lock = NULL; + afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_internal_lock_t *dst_lock = NULL; + afr_internal_lock_t *src_lock = NULL; + afr_inodelk_t *dst_inodelk = NULL; + afr_inodelk_t *src_inodelk = NULL; + int ret = -1; - dst_local = dst->local; - dst_lock = &dst_local->internal_lock; src_local = src->local; src_lock = &src_local->internal_lock; - if (src_lock->inode_locked_nodes) { - memcpy (dst_lock->inode_locked_nodes, - src_lock->inode_locked_nodes, - sizeof (*dst_lock->inode_locked_nodes) * child_count); - memset (src_lock->inode_locked_nodes, 0, - sizeof (*src_lock->inode_locked_nodes) * child_count); + src_inodelk = afr_get_inodelk (src_lock, dom); + dst_local = dst->local; + dst_lock = &dst_local->internal_lock; + dst_inodelk = afr_get_inodelk (dst_lock, dom); + if (!dst_inodelk || !src_inodelk) + goto out; + if (src_inodelk->locked_nodes) { + memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, + sizeof (*dst_inodelk->locked_nodes) * child_count); + memset (src_inodelk->locked_nodes, 0, + sizeof (*src_inodelk->locked_nodes) * child_count); } dst_lock->transaction_lk_type = src_lock->transaction_lk_type; dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; - dst_lock->inodelk_lock_count = src_lock->inodelk_lock_count; - src_lock->inodelk_lock_count = 0; + dst_inodelk->lock_count = src_inodelk->lock_count; + src_inodelk->lock_count = 0; + ret = 0; +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index e01ab366f..73594f265 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,7 +41,10 @@ enum gf_afr_mem_types_ { gf_afr_mt_shd_event_t, gf_afr_mt_time_t, gf_afr_mt_pos_data_t, - gf_afr_mt_reply_t, + gf_afr_mt_reply_t, + gf_afr_mt_stats_t, + gf_afr_mt_shd_crawl_event_t, + gf_afr_mt_uint64_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 1721fd270..83846f152 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -100,7 +100,7 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, } sh_private_cleanup (sh_frame, this); - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { GF_ASSERT (!last_loop_frame); //loop_finish should have happened and the old_loop should be NULL gf_log (this->name, GF_LOG_DEBUG, @@ -143,7 +143,7 @@ sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) } if (loop_sh && loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, + afr_sh_data_unlock (loop_frame, this, this->name, sh_destroy_frame); } else { sh_destroy_frame (loop_frame, this); @@ -214,7 +214,7 @@ sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, goto out; //We want the frame to have same lk_owner as sh_frame //so that locks translator allows conflicting locks - new_loop_local = afr_local_copy (local, this); + new_loop_local = afr_self_heal_local_init (local, this); if (!new_loop_local) goto out; new_loop_frame->local = new_loop_local; @@ -273,10 +273,10 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, new_loop_sh->offset = offset; new_loop_sh->block_size = sh->block_size; afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - _gf_true, sh_loop_lock_success, sh_loop_lock_failure); + _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); return 0; out: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); if (old_loop_frame) sh_loop_finish (old_loop_frame, this); sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); @@ -307,8 +307,9 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, sh_priv->loops_running--; offset = sh_priv->offset; block_size = sh->block_size; - while ((!sh->eof_reached) && (0 == sh->op_failed) && - (sh_priv->loops_running < priv->data_self_heal_window_size) + while ((!sh->eof_reached) && + (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && + (sh_priv->loops_running < priv->data_self_heal_window_size) && (sh_priv->offset < sh->file_size)) { loop++; @@ -327,7 +328,8 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, if (0 == loop) { //loop finish does unlock, but the erasing of the pending //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && !sh->op_failed) + if (is_driver_done && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) goto driver_done; if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -338,7 +340,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, //If we have more loops to form we should finish previous loop after //the next loop lock while (loop--) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { // op failed in other loop, stop spawning more loops if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -384,7 +386,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame } if (op_ret == -1) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); if (loop_frame) { sh_loop_finish (loop_frame, this); @@ -432,16 +434,16 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (loop_sh, op_errno); } else if (op_ret < loop_local->cont.writev.vector->iov_len) { - gf_log(this->name, GF_LOG_ERROR, - "incomplete write to %s on subvolume %s " - "(expected %lu, returned %d)", sh_local->loc.path, - priv->children[child_index]->name, - loop_local->cont.writev.vector->iov_len, op_ret); - sh->op_failed = 1; - } + gf_log (this->name, GF_LOG_ERROR, + "incomplete write to %s on subvolume %s " + "(expected %lu, returned %d)", sh_local->loc.path, + priv->children[child_index]->name, + loop_local->cont.writev.vector->iov_len, op_ret); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } call_count = afr_frame_return (loop_frame); @@ -514,7 +516,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, if (op_ret <= 0) { if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); gf_log (this->name, GF_LOG_ERROR, "read failed on %d " "for %s reason :%s", sh->source, sh_local->loc.path, strerror (errno)); @@ -624,7 +626,7 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, "checksum on %s failed on subvolume %s (%s)", sh_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, strong_checksum, MD5_DIGEST_LENGTH); @@ -662,7 +664,8 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, } UNLOCK (&sh_priv->lock); - if (write_needed && !sh->op_failed) { + if (write_needed && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh_loop_read (loop_frame, this); } else { sh_loop_return (sh_frame, this, loop_frame, @@ -751,14 +754,15 @@ out: return sh_priv; } -void -afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, +int +afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { afr_local_t *dst_local = NULL; afr_self_heal_t *dst_sh = NULL; afr_local_t *src_local = NULL; afr_self_heal_t *src_sh = NULL; + int ret = -1; dst_local = dst->local; dst_sh = &dst_local->self_heal; @@ -766,9 +770,12 @@ afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, src_sh = &src_local->self_heal; GF_ASSERT (src_sh->data_lock_held); GF_ASSERT (!dst_sh->data_lock_held); - afr_lk_transfer_datalock (dst, src, child_count); + ret = afr_lk_transfer_datalock (dst, src, dom, child_count); + if (ret) + return ret; src_sh->data_lock_held = _gf_false; dst_sh->data_lock_held = _gf_true; + return 0; } int @@ -790,7 +797,10 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); if (ret) goto out; - afr_sh_transfer_lock (first_loop_frame, sh_frame, priv->child_count); + ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, + priv->child_count); + if (ret) + goto out; sh->private = afr_sh_priv_init (); if (!sh->private) { ret = -1; @@ -800,7 +810,7 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, ret = 0; out: if (ret) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); sh_loop_driver_done (sh_frame, this, NULL); } return 0; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 2538f4c8b..ef92b4205 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -18,6 +18,28 @@ #include "afr-self-heal.h" #include "pump.h" +#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + +#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ + AFR_SELF_HEAL_FAILED == status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + + void afr_sh_reset (call_frame_t *frame, xlator_t *this) { @@ -141,9 +163,8 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) GF_FREE (buf); } -void -afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, - const char *loc) +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) { afr_private_t * priv = this->private; char *buf = NULL; @@ -173,10 +194,8 @@ afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + (child_count * child_count * pending_entry_strlen); buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); - if (!buf) { - buf = ""; + if (!buf) goto out; - } ptr = buf; ptr += sprintf (ptr, "%s", msg); @@ -192,11 +211,27 @@ afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, ptr += sprintf (ptr, "%s", matrix_end); out: + return buf; +} + +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc) +{ + char *buf = NULL; + char *free_ptr = NULL; + + buf = afr_get_pending_matrix_str (pending_matrix, this); + if (buf) + free_ptr = buf; + else + buf = ""; + + gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" " (possible split-brain). Please delete the file from all but " "the preferred subvolume.%s", loc, buf); - if (buf) - GF_FREE (buf); + GF_FREE (free_ptr); return; } @@ -466,6 +501,8 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, { int i = 0; int biggest_witness = -1; + int biggest_witness_idx = -1; + int biggest_witness_cnt = -1; GF_ASSERT (witnesses); GF_ASSERT (characters); @@ -475,10 +512,21 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, if (characters[i].type != AFR_NODE_FOOL) continue; - if (biggest_witness < witnesses[i]) + if (biggest_witness < witnesses[i]) { biggest_witness = witnesses[i]; + biggest_witness_idx = i; + biggest_witness_cnt = 1; + continue; + } + + if (biggest_witness == witnesses[i]) + biggest_witness_cnt++; } - return biggest_witness; + + if (biggest_witness_cnt != 1) + return -1; + + return biggest_witness_idx; } int @@ -506,10 +554,84 @@ afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, return nsources; } + +int +afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) +{ + if (idx >= 0 && idx < child_count) { + sources[idx] = 1; + return 1; + } + return 0; +} + + +static int +afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_size = 0; + uint64_t min_size = 0; + int num_children = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_size > max_size) { + max_size = bufs[child].ia_size; + idx = child; + } + + if ((num_children == 0) || (bufs[child].ia_size < min_size)) { + min_size = bufs[child].ia_size; + } + + num_children++; + } + + /* If sizes are same for all of them, finding sources will have to + * happen with pending changelog. So return -1 + */ + if ((num_children > 1) && (min_size == max_size)) + return -1; + return idx; +} + + +static int +afr_find_newest_file (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_ctime = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_ctime > max_ctime) { + max_ctime = bufs[child].ia_ctime; + idx = child; + } + } + + return idx; +} + + static int afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, afr_node_character *characters, - int child_count) + int32_t *success_children, + int child_count, struct iatt *bufs) { int32_t biggest_witness = 0; int nsources = 0; @@ -517,6 +639,11 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, GF_ASSERT (child_count > 0); + biggest_witness = afr_find_largest_file_size (bufs, success_children, + child_count); + if (biggest_witness != -1) + goto found; + witnesses = GF_CALLOC (child_count, sizeof (*witnesses), gf_afr_mt_int32_t); if (NULL == witnesses) { @@ -529,9 +656,15 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, biggest_witness = afr_find_biggest_witness_among_fools (witnesses, characters, child_count); - nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, - characters, child_count, - biggest_witness); + if (biggest_witness != -1) + goto found; + + biggest_witness = afr_find_newest_file (bufs, success_children, + child_count); + +found: + nsources = afr_mark_fool_as_source_by_idx (sources, child_count, + biggest_witness); out: GF_FREE (witnesses); return nsources; @@ -875,7 +1008,8 @@ afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, nsources = afr_mark_biggest_of_fools_as_source (sources, pending_matrix, characters, - child_count); + success_children, + child_count, bufs); } out: @@ -1012,14 +1146,13 @@ afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) afr_sh_reset (frame, this); - if (local->govinda_gOvinda) { + if (local->unhealable) { gf_log (this->name, GF_LOG_DEBUG, "split brain found, aborting selfheal of %s", local->loc.path); - sh->op_failed = 1; } - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh->completion_cbk (frame, this); } else { gf_log (this->name, GF_LOG_TRACE, @@ -1251,7 +1384,7 @@ out: if (ret) { gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " "reason: %s", local->loc.path, strerror (-ret)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } afr_sh_missing_entries_finish (frame, this); } @@ -1266,7 +1399,7 @@ afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, local = frame->local; sh = &local->self_heal; if (op_ret < 0) - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_finish (frame, this); return 0; } @@ -1290,7 +1423,7 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this) if (!afr_valid_ia_type (type)) { gf_log (this->name, GF_LOG_ERROR, "%s: unknown file type: 0%o", local->loc.path, type); - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); afr_sh_missing_entries_finish (frame, this); goto out; } @@ -1323,8 +1456,9 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, loc = &local->loc; if (op_ret < 0) { - if (op_errno == EIO) - local->govinda_gOvinda = 1; + if (op_errno == EIO) { + afr_set_local_for_unhealable (local); + } // EIO can happen if finding the fresh parent dir failed goto out; } @@ -1386,7 +1520,7 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, } return; out: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; @@ -1470,7 +1604,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, LOCK (&frame->lock); { afr_sh_set_error (sh, EIO); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } UNLOCK (&frame->lock); } @@ -1552,7 +1686,7 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_missing_entries_finish (frame, this); } else { if (afr_gfid_missing_count (this->name, sh->fresh_children, @@ -1766,7 +1900,7 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, priv->child_count, ENOENT); if (fresh_child_enoents == fresh_parent_count) { afr_sh_set_error (sh, ENOENT); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_purge_entry (frame, this); } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, priv->child_count, local->loc.path, @@ -1780,14 +1914,14 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, afr_sh_purge_stale_entry (frame, this); } else { op_errno = EIO; - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); goto fail; } return; fail: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; @@ -1858,8 +1992,8 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, out: afr_sh_set_error (sh, op_errno); - sh->op_failed = 1; - afr_sh_missing_entries_finish (frame, this); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); return; } @@ -1962,7 +2096,7 @@ afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "Non blocking entrylks failed."); - sh->op_failed = -1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_done (frame, this); } else { @@ -1997,6 +2131,7 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, int_lock->lk_basename = base_name; int_lock->lk_loc = loc; int_lock->lock_cbk = lock_cbk; + int_lock->domain = this->name; int_lock->lockee_count = 0; afr_init_entry_lockee (&int_lock->lockee[0], local, loc, @@ -2041,19 +2176,29 @@ out: static int afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) { + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_missing_entry_sh_cbk); + afr_sh_post_nb_entrylk_missing_entry_sh_cbk); return 0; } afr_local_t* -afr_local_copy (afr_local_t *l, xlator_t *this) +afr_self_heal_local_init (afr_local_t *l, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *lc = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *shc = NULL; + int ret = 0; priv = this->private; @@ -2076,13 +2221,23 @@ afr_local_copy (afr_local_t *l, xlator_t *this) shc->forced_merge = sh->forced_merge; shc->background = sh->background; shc->type = sh->type; + shc->data_sh_info = ""; + shc->metadata_sh_info = ""; uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); + if (l->loc.path) { + ret = loc_copy (&lc->loc, &l->loc); + if (ret < 0) + goto out; + } lc->child_up = memdup (l->child_up, sizeof (*lc->child_up) * priv->child_count); + if (!lc->child_up) { + ret = -1; + goto out; + } + if (l->xattr_req) lc->xattr_req = dict_ref (l->xattr_req); @@ -2090,59 +2245,25 @@ afr_local_copy (afr_local_t *l, xlator_t *this) lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); if (l->cont.lookup.xattr) lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - sizeof (*lc->internal_lock.locked_nodes) * priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - - for (i = 0; i < l->internal_lock.lockee_count; i++) { - loc_copy (&lc->internal_lock.lockee[i].loc, - &l->internal_lock.lockee[i].loc); - - lc->internal_lock.lockee[i].locked_count = - l->internal_lock.lockee[i].locked_count; - - if (l->internal_lock.lockee[i].basename) - lc->internal_lock.lockee[i].basename = - gf_strdup (l->internal_lock.lockee[i].basename); - - if (l->internal_lock.lockee[i].locked_nodes) { - lc->internal_lock.lockee[i].locked_nodes = - memdup (l->internal_lock.lockee[i].locked_nodes, - sizeof (*lc->internal_lock.lockee[i].locked_nodes) * - priv->child_count); - } else { - lc->internal_lock.lockee[i].locked_nodes = - GF_CALLOC (priv->child_count, - sizeof (*lc->internal_lock.lockee[i].locked_nodes), - gf_afr_mt_char); - } + lc->internal_lock.locked_nodes = + GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), + priv->child_count, gf_afr_mt_char); + if (!lc->internal_lock.locked_nodes) { + ret = -1; + goto out; } - lc->internal_lock.lockee_count = l->internal_lock.lockee_count; - - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret) + goto out; out: + if (ret) { + afr_local_cleanup (lc, this); + lc = NULL; + } return lc; } @@ -2155,32 +2276,28 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) afr_local_t * orig_frame_local = NULL; afr_self_heal_t * orig_frame_sh = NULL; char sh_type_str[256] = {0,}; + gf_loglevel_t loglevel = 0; priv = this->private; local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda) { + if (local->unhealable) { afr_set_split_brain (this, sh->inode, SPB, SPB); - sh->op_failed = 1; } afr_self_heal_type_str_get (sh, sh_type_str, sizeof(sh_type_str)); - if (sh->op_failed) { - gf_loglevel_t loglevel = GF_LOG_ERROR; - if (priv->shd.iamshd) - loglevel = GF_LOG_DEBUG; - - gf_log (this->name, loglevel, "background %s self-heal " - "failed on %s", sh_type_str, local->loc.path); - + if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { + loglevel = GF_LOG_ERROR; + } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { + loglevel = GF_LOG_INFO; } else { - gf_log (this->name, GF_LOG_DEBUG, "background %s self-heal " - "completed on %s", sh_type_str, local->loc.path); - + loglevel = GF_LOG_DEBUG; } + afr_log_self_heal_completion_status (local, loglevel); + FRAME_SU_UNDO (bgsh_frame, afr_local_t); if (!sh->unwound && sh->unwind) { @@ -2188,7 +2305,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) orig_frame_sh = &orig_frame_local->self_heal; orig_frame_sh->actual_sh_started = _gf_true; sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - sh->op_failed); + is_self_heal_failed (sh, AFR_CHECK_ALL)); } if (sh->background) { @@ -2237,7 +2354,7 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) afr_set_lk_owner (sh_frame, this, sh_frame->root); afr_set_low_priority (sh_frame); - sh_local = afr_local_copy (local, this); + sh_local = afr_self_heal_local_init (local, this); if (!sh_local) goto out; sh_frame->local = sh_local; @@ -2302,6 +2419,8 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) sh->do_gfid_self_heal = _gf_false; } + sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; + FRAME_SU_DO (sh_frame, afr_local_t); if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { afr_self_heal_missing_entries (sh_frame, this); @@ -2511,9 +2630,183 @@ out: GF_FREE (erase_xattr); if (ret < 0) { - sh->op_failed = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); finish (frame, this); } return 0; } + +void +afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) +{ + xlator_t *this = NULL; + afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); + afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" + "Structure"); + goto out; + } + + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + sh_status->gfid_or_missing_entry_self_heal = status; + break; + case AFR_SELF_HEAL_METADATA: + sh_status->metadata_self_heal = status; + break; + case AFR_SELF_HEAL_DATA: + sh_status->data_self_heal = status; + break; + case AFR_SELF_HEAL_ENTRY: + sh_status->entry_self_heal = status; + break; + case AFR_SELF_HEAL_INVALID: + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" + "self heal type in action"); + break; + } +out: + return; +} + +void +afr_set_local_for_unhealable (afr_local_t *local) +{ + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + local->unhealable = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +} + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) +{ + afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; + afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; + afr_self_heal_status status = AFR_SELF_HEAL_FAILED; + xlator_t *this = NULL; + int sh_failed = 0; + + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " + "structure"); + sh_failed = 1; + goto out; + } + + if (type == AFR_CHECK_ALL) { + if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) + sh_failed = 1; + } else if (type == AFR_CHECK_SPECIFIC) { + sh_type_in_action = sh->sh_type_in_action; + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + status = sh_status.gfid_or_missing_entry_self_heal; + break; + case AFR_SELF_HEAL_METADATA: + status = sh_status.metadata_self_heal; + break; + case AFR_SELF_HEAL_ENTRY: + status = sh_status.entry_self_heal; + break; + case AFR_SELF_HEAL_DATA: + status = sh_status.data_self_heal; + break; + case AFR_SELF_HEAL_INVALID: + status = AFR_SELF_HEAL_NOT_ATTEMPTED; + break; + } + if (status == AFR_SELF_HEAL_FAILED) + sh_failed = 1; + + } + +out: + return sh_failed; +} + +char * +get_sh_completion_status (afr_self_heal_status status) +{ + + char *not_attempted = " is not attempted"; + char *failed = " failed"; + char *started = " is started"; + char *sync_begin = " is successfully completed"; + char *result = " has unknown status"; + + switch (status) + { + case AFR_SELF_HEAL_NOT_ATTEMPTED: + result = not_attempted; + break; + case AFR_SELF_HEAL_FAILED: + result = failed; + break; + case AFR_SELF_HEAL_STARTED: + result = started; + break; + case AFR_SELF_HEAL_SYNC_BEGIN: + result = sync_begin; + break; + } + + return result; + +} + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) +{ + + char sh_log[4096] = {0}; + afr_self_heal_t *sh = &local->self_heal; + afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; + xlator_t *this = NULL; + size_t off = 0; + int data_sh = 0; + int metadata_sh = 0; + int print_log = 0; + + this = THIS; + + ADD_FMT_STRING (sh_log, off, "gfid or missing entry", + all_status.gfid_or_missing_entry_self_heal, print_log); + ADD_FMT_STRING_SYNC (sh_log, off, "metadata", + all_status.metadata_self_heal, print_log); + if (sh->background) { + ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", + all_status.data_self_heal, print_log); + } else { + ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", + all_status.data_self_heal, print_log); + } + ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, + print_log); + + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && + strcmp (sh->data_sh_info, "") && sh->data_sh_info ) + data_sh = 1; + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && + strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) + metadata_sh = 1; + + if (!print_log) + return; + + gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, + ((data_sh == 1) ? sh->data_sh_info : ""), + ((metadata_sh == 1) ? sh->metadata_sh_info : ""), + local->loc.path); +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 035fce543..473264776 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -15,13 +15,6 @@ #define AFR_SH_MIN_PARTICIPANTS 2 typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; - -typedef enum { AFR_LOOKUP_FAIL_CONFLICTS = 1, AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, } afr_lookup_flags_t; @@ -98,13 +91,13 @@ int afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, int child_index); int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk); afr_local_t * -afr_local_copy (afr_local_t *l, xlator_t *this); +afr_self_heal_local_init (afr_local_t *l, xlator_t *this); int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, + off_t start, off_t len, gf_boolean_t block, char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler); void @@ -133,4 +126,19 @@ int afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, afr_transaction_type type, afr_fxattrop_cbk_t cbk, int (*finish)(call_frame_t *frame, xlator_t *this)); + +void +afr_set_local_for_unhealable (afr_local_t *local); + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); + +void +afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl); + +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 2f63ed27d..9de26ee56 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -156,6 +156,25 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) } int +afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + if (sh->sh_dom_lock_held) + afr_sh_data_unlock (frame, this, priv->sh_domain, + afr_sh_data_close); + else + afr_sh_data_close (frame, this); + return 0; +} + +int afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *statpre, struct iatt *statpost, dict_t *xdata) @@ -283,31 +302,45 @@ afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) //Fun fact, lock_cbk is being used for both lock & unlock int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int ret = 0; local = frame->local; int_lock = &local->internal_lock; sh = &local->self_heal; + priv = this->private; - GF_ASSERT (sh->data_lock_held); - - sh->data_lock_held = _gf_false; + if (strcmp (dom, this->name) == 0) { + sh->data_lock_held = _gf_false; + } else if (strcmp (dom, priv->sh_domain) == 0) { + sh->sh_dom_lock_held = _gf_false; + } else { + ret = -1; + goto out; + } int_lock->lock_cbk = lock_cbk; + int_lock->domain = dom; afr_unlock (frame, this); +out: + if (ret) { + int_lock->lock_op_ret = -1; + int_lock->lock_cbk (frame, this); + } return 0; } int afr_sh_data_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; sh = &local->self_heal; @@ -316,9 +349,9 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this) "finishing data selfheal of %s", local->loc.path); if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); + afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); else - afr_sh_data_close (frame, this); + afr_sh_dom_unlock (frame, this); return 0; } @@ -335,11 +368,8 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_DEBUG, "finishing failed data selfheal of %s", local->loc.path); - sh->op_failed = 1; - if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); - else - afr_sh_data_close (frame, this); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_finish (frame, this); return 0; } @@ -362,13 +392,13 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, "log failed on %s for subvol %s, reason: %s", local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { if (sh->old_loop_frame) sh_loop_finish (sh->old_loop_frame, this); sh->old_loop_frame = NULL; @@ -380,7 +410,7 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, goto out; } GF_ASSERT (sh->old_loop_frame); - afr_sh_data_lock (frame, this, 0, 0, _gf_true, + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, afr_post_sh_big_lock_success, afr_post_sh_big_lock_failure); } @@ -418,7 +448,7 @@ afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, priv->children[child_index]->name, strerror (op_errno)); LOCK (&frame->lock); { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } UNLOCK (&frame->lock); if (sh->old_loop_frame) @@ -428,7 +458,7 @@ afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) afr_sh_data_fail (frame, this); else afr_sh_data_erase_pending (frame, this); @@ -604,7 +634,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_DEBUG, "ftruncate of %s on subvolume %s completed", @@ -617,7 +647,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) afr_sh_data_fail (frame, this); else afr_sh_data_sync_prepare (frame, this); @@ -697,6 +727,199 @@ out: return ret; } +char* +afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sizes_str = NULL; + size_t off = 0; + char *fmt_str = "%llu bytes on %s, "; + char *child_down = " %s,"; + char *child_unknown = " %s,"; + int down_child_present = 0; + int down_count = 0; + int unknown_count = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is "; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + len += snprintf (num, sizeof (num), fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), child_down, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count ++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), child_unknown, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } + + } + + if (down_child_present) { + if (down_count > 1) + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + if (unknown_child_present) { + if (unknown_count > 1) + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } + + len++;//for '\0' + + sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + + if (!sizes_str) + return NULL; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + off += snprintf (sizes_str + off, len - off, fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } + } + + if (down_child_present) { + if (down_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) { + off += snprintf (sizes_str + off, len - off, child_down, + priv->children[i]->name); + } + } + + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) { + off += snprintf (sizes_str + off, len - off, + child_unknown, + priv->children[i]->name); + + } + } + + return sizes_str; +} + +char* +afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sinks_str = NULL; + char *temp_str = " to sinks "; + char *str_format = " %s,"; + char off = 0; + + priv = this->private; + + len += snprintf (num, sizeof (num), "%s", temp_str); + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), str_format, + priv->children[i]->name); + } + } + + len ++; + + sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + + if (!sinks_str) + return NULL; + + off += snprintf (sinks_str + off, len - off, "%s", temp_str); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + off += snprintf (sinks_str + off, len - off, + str_format, + priv->children[i]->name); + } + } + + return sinks_str; + +} + + +void +afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) +{ + char *pending_matrix_str = NULL; + char *sizes_str = NULL; + char *sinks_str = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + if (!pending_matrix_str) + pending_matrix_str = ""; + + sizes_str = afr_get_sizes_str (local, sh->buf, this); + if (!sizes_str) + sizes_str = ""; + + sinks_str = afr_get_sinks_str (this, local, sh); + if (!sinks_str) + sinks_str = ""; + + gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " + "%s data %s", priv->children[sh->source]->name, sinks_str, + sizes_str, pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (sizes_str && strcmp (sizes_str, "")) + GF_FREE (sizes_str); +} + void afr_sh_data_fix (call_frame_t *frame, xlator_t *this) { @@ -718,7 +941,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) if (sh->background && sh->unwind && !sh->unwound) { sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - sh->op_failed); + is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); sh->unwound = _gf_true; } @@ -737,6 +960,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) sh->active_sinks); sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); afr_sh_data_trim_sinks (frame, this); } @@ -817,6 +1041,7 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) } afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); } else { + afr_set_data_sh_info_str (local, sh, this); if (nsources == 0) { gf_log (this->name, GF_LOG_DEBUG, "No self-heal needed for %s", @@ -1174,6 +1399,22 @@ afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) } int +afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_dom_lock_held = _gf_true; + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, + afr_sh_data_big_lock_success, + afr_sh_data_fail); + return 0; +} + +int afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1237,9 +1478,11 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len) +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, + off_t start, off_t len) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; @@ -1250,11 +1493,14 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t le afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = start; - int_lock->lk_flock.l_len = len; - int_lock->lk_flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; + int_lock->domain = dom; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->flock.l_start = start; + inodelk->flock.l_len = len; + inodelk->flock.l_type = F_WRLCK; + afr_nonblocking_inodelk (frame, this); return 0; @@ -1298,7 +1544,7 @@ afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, off_t start, off_t len, gf_boolean_t block, - afr_lock_cbk_t success_handler, + char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler) { afr_local_t * local = NULL; @@ -1310,7 +1556,7 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this, sh->data_lock_success_handler = success_handler; sh->data_lock_failure_handler = failure_handler; sh->data_lock_block = block; - return afr_sh_data_lock_rec (frame, this, start, len); + return afr_sh_data_lock_rec (frame, this, dom, start, len); } int @@ -1322,7 +1568,6 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_private_t *priv = NULL; int call_count = 0; int child_index = 0; - gf_boolean_t block = _gf_true; local = frame->local; sh = &local->self_heal; @@ -1342,7 +1587,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_TRACE, "open of %s succeeded on child %s", @@ -1355,7 +1600,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_data_fail (frame, this); return 0; } @@ -1364,15 +1609,8 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "fd for %s opened, commencing sync", local->loc.path); - /* - * The read and write self-heal trigger codepaths do not provide - * an unwind callback. We run a trylock in these codepaths - * because we are sensitive to locking latency. - */ - block = sh->unwind ? _gf_true : _gf_false; - afr_sh_data_lock (frame, this, 0, 0, block, - afr_sh_data_big_lock_success, - afr_sh_data_fail); + afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, + afr_sh_dom_lock_success, afr_sh_data_fail); } return 0; @@ -1477,18 +1715,31 @@ afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) int afr_self_heal_data (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + int ret = -1; local = frame->local; sh = &local->self_heal; + sh->sh_type_in_action = AFR_SELF_HEAL_DATA; + if (afr_can_start_data_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + ret = afr_inodelk_init (&local->internal_lock.inodelk[1], + priv->sh_domain, priv->child_count); + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_done (frame, this); + return 0; + } + if (IA_ISREG (sh->type)) { afr_sh_data_open (frame, this); } else { afr_sh_data_lock (frame, this, 0, 0, _gf_true, + this->name, afr_sh_non_reg_lock_success, afr_sh_data_fail); } diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index c3c9f9fca..53491a1d7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -162,7 +162,7 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; if (sh->entries_skipped) { - sh->op_failed = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); goto out; } afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, @@ -613,6 +613,19 @@ out: return 0; } +static gf_boolean_t +can_skip_entry_self_heal (char *name, loc_t *parent_loc) +{ + if (strcmp (name, ".") == 0) { + return _gf_true; + } else if (strcmp (name, "..") == 0) { + return _gf_true; + } else if (loc_is_root (parent_loc) && + (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { + return _gf_true; + } + return _gf_false; +} int afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, @@ -640,13 +653,7 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, sh->expunge_done = afr_sh_entry_expunge_entry_done; name = entry->d_name; - - if ((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0)) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); + if (can_skip_entry_self_heal (name, &local->loc)) { op_ret = 0; goto out; } @@ -799,7 +806,7 @@ afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_sink (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { goto out; } @@ -1256,6 +1263,35 @@ afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", impunge_local->loc.path); + /* + * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : + * + * Problem: + * While a brick is down in a replica pair, lets say the user creates + * one file(file-A) and a hard link to that file(h-file-A). After the + * brick comes back up, entry self-heal is attempted on parent dir of + * these two files. As part of readdir in self-heal it reads both the + * entries file-A and h-file-A for both of them it does name less lookup + * to check if there are any hardlinks already present in the + * destination brick. It finds that there are no hard links already + * present for files file-A, h-file-A. Self-heal does mknods for both + * file-A and h-file-A. This leads to file-A and h-file-A not being + * hardlinks anymore. + * + * Fix: (More like shrinking of race-window, the race itself is still + * present in posix-mknod). + * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then + * posix_mknod checks if there are already any gfid-links and does + * link() instead of mknod. There still can be a race where two + * posix_mknods same gfid see that + * gfid-link file is not present and proceeds with mknods and result in + * two different files with same gfid. + */ + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", + impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, (void *) (long) child_index, priv->children[child_index], @@ -1872,12 +1908,7 @@ afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, active_src = sh->active_source; sh->impunge_done = afr_sh_entry_impunge_entry_done; - if ((strcmp (entry->d_name, ".") == 0) - || (strcmp (entry->d_name, "..") == 0)) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - entry->d_name, local->loc.path); + if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { op_ret = 0; goto out; } @@ -1946,7 +1977,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, local->loc.path, priv->children[active_src]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_TRACE, "readdir of %s on subvolume %s complete", @@ -2019,7 +2050,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_source (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -2068,7 +2099,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } } UNLOCK (&frame->lock); @@ -2076,7 +2107,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -2209,6 +2240,7 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) local->loc.path); sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); afr_sh_entry_open (frame, this); return 0; @@ -2231,7 +2263,7 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, priv = this->private; if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_entry_finish (frame, this); goto out; @@ -2294,7 +2326,7 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " "failed for %s.", local->loc.path); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_entry_done (frame, this); } else { @@ -2313,14 +2345,18 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) int afr_self_heal_entry (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; - + afr_self_heal_t *sh = NULL; priv = this->private; local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_entrylk (frame, this, &local->loc, NULL, afr_sh_post_nonblocking_entry_cbk); } else { diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index cc85d9b9f..fd5da6cfd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -97,7 +97,7 @@ afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_metadata_finish (frame, this); return 0; } @@ -406,6 +406,140 @@ afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } +static void +afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, + xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *string = NULL; + size_t off = 0; + char *source_child = " from source %s to"; + char *format = " %s, "; + char *string_msg = " metadata self heal"; + char *pending_matrix_str = NULL; + int down_child_present = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is"; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; + int down_count = 0; + int unknown_count = 0; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + + if (!pending_matrix_str) + pending_matrix_str = ""; + + len += snprintf (num, sizeof (num), "%s", string_msg); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), source_child, + priv->children[i]->name); + } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } + } + + if (down_child_present) { + if (down_count > 1) { + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + } + if (unknown_child_present) { + if (unknown_count > 1) { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } + } + + len ++; + + string = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + if (!string) + return; + + off += snprintf (string + off, len - off, "%s", string_msg); + for (i=0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) + off += snprintf (string + off, len - off, source_child, + priv->children[i]->name); + } + + for (i = 0; i < priv->child_count; i++) { + if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + if (down_child_present) { + if (down_count > 1) { + off += snprintf (string + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + down_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, + pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (string && strcmp (string, "")) + GF_FREE (string); +} int afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) @@ -436,6 +570,8 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) sh->active_sinks); sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); + afr_set_metadata_sh_info_str (local, sh, this); STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, priv->children[source], priv->children[source]->fops->getxattr, @@ -461,7 +597,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, priv = this->private; if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_metadata_finish (frame, this); goto out; @@ -579,19 +715,22 @@ int afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; int_lock = &local->internal_lock; + int_lock->domain = this->name; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); int_lock->transaction_lk_type = AFR_SELFHEAL_LK; int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = LLONG_MAX - 1; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_start = LLONG_MAX - 1; + inodelk->flock.l_len = 0; + inodelk->flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; afr_nonblocking_inodelk (frame, this); @@ -618,8 +757,10 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; + sh->sh_type_in_action = AFR_SELF_HEAL_METADATA; if (afr_can_start_metadata_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_metadata_lock (frame, this); } else { afr_sh_metadata_done (frame, this); diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 37bc224f5..1b48a1bca 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -25,7 +25,8 @@ typedef enum { typedef enum { HEAL = 1, - INFO + INFO, + STATISTICS_TO_BE_HEALED, } shd_crawl_op; typedef struct shd_dump { @@ -84,6 +85,33 @@ _loc_assign_gfid_path (loc_t *loc) } void +_destroy_crawl_event_data (void *data) +{ + shd_crawl_event_t *crawl_event = NULL; + + if (!data) + goto out; + + crawl_event = (shd_crawl_event_t *)data; + GF_FREE (crawl_event->start_time_str); + GF_FREE (crawl_event->end_time_str); + +out: + return; +} + +void +_destroy_shd_event_data (void *data) +{ + shd_event_t *event = NULL; + if (!data) + goto out; + event = (shd_event_t*)data; + GF_FREE (event->path); +out: + return; +} +void shd_cleanup_event (void *event) { shd_event_t *shd_event = event; @@ -128,6 +156,123 @@ _build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent) } int +_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, + shd_crawl_event_t *shd_event, struct timeval *tv) +{ + int ret = 0; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = NULL; + char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + + healed_count = shd_event->healed_count; + split_brain_count = shd_event->split_brain_count; + heal_failed_count = shd_event->heal_failed_count; + start_time_str = shd_event->start_time_str; + end_time_str = shd_event->end_time_str; + crawl_type = shd_event->crawl_type; + + if (!start_time_str) { + ret = -1; + goto out; + } + + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, split_brain_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "split_brain_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup (crawl_type)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_type to output"); + goto out; + } + snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, heal_failed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_failed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup(start_time_str)); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_start_time to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, + xl_id, child, count); + + if (!end_time_str) + end_time_str = "Could not determine the end time"; + ret = dict_set_dynstr (output, key, gf_strdup(end_time_str)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_end_time to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, + xl_id, child, count); + + if (shd_event->crawl_inprogress == _gf_true) + progress = 1; + else + progress = 0; + + ret = dict_set_int32 (output, key, progress); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "inprogress to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment the " + "counter."); + goto out; + } +out: + return ret; +} + +int _add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, struct timeval *tv, gf_boolean_t dyn) { @@ -233,6 +378,20 @@ out: } int +_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data) +{ + int ret = 0; + shd_dump_t *dump_data = NULL; + shd_crawl_event_t *shd_event = NULL; + + dump_data = data; + shd_event = cb->data; + ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict, + dump_data->child, shd_event, &cb->tv); + return ret; +} + +int _add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) { shd_dump_t dump_data = {0}; @@ -244,6 +403,26 @@ _add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) return 0; } + +int +_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child) +{ + shd_dump_t dump_data = {0}; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + + priv = this->private; + shd = &priv->shd; + + dump_data.this = this; + dump_data.dict = dict; + dump_data.child = child; + eh_dump (shd->statistics[child], &dump_data, + _add_crawl_event_statistics_to_dict); + return 0; + +} + void _remove_stale_index (xlator_t *this, xlator_t *readdir_xl, loc_t *parent, char *fname) @@ -269,6 +448,46 @@ out: } int +_count_hard_links_under_base_indices_dir (xlator_t *this, + afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, loc_t *childloc, + loc_t *parentloc, struct iatt *iattr) +{ + xlator_t *readdir_xl = crawl_data->readdir_xl; + struct iatt parent = {0}; + int ret = 0; + dict_t *output = NULL; + int xl_id = 0; + char key[256] = {0}; + int child = -1; + uint64_t hardlinks = 0; + + output = crawl_data->op_data; + child = crawl_data->child; + + ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent); + if (ret) + goto out; + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) + goto out; + + snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child); + ret = dict_get_uint64 (output, key, &hardlinks); + + /*Removing the count of base_entry under indices/base_indicies and + * entry under indices/xattrop */ + hardlinks = hardlinks + iattr->ia_nlink - 2; + ret = dict_set_uint64 (output, key, hardlinks); + if (ret) + goto out; + +out: + return ret; +} + +int _add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, loc_t *childloc, loc_t *parentloc, struct iatt *iattr) @@ -307,16 +526,18 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, afr_crawl_data_t *crawl_data) { - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - eh_t *eh = NULL; - char *path = NULL; - char gfid_str[64] = {0}; - shd_event_t *event = NULL; - int32_t sh_failed = 0; - gf_boolean_t split_brain = 0; - int32_t actual_sh_done = 0; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + eh_t *eh = NULL; + char *path = NULL; + char gfid_str[64] = {0}; + shd_event_t *event = NULL; + int32_t sh_failed = 0; + gf_boolean_t split_brain = 0; + int32_t actual_sh_done = 0; + shd_crawl_event_t **shd_crawl_event = NULL; + priv = this->private; shd = &priv->shd; if (crawl_data->crawl == INDEX) { @@ -343,16 +564,19 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); } - split_brain = afr_is_split_brain (this, child->inode); + shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events); + split_brain = afr_is_split_brain (this, child->inode); if ((op_ret < 0 && op_errno == EIO) || split_brain) { eh = shd->split_brain; + shd_crawl_event[crawl_data->child]->split_brain_count += 1; } else if ((op_ret < 0) || sh_failed) { eh = shd->heal_failed; + shd_crawl_event[crawl_data->child]->heal_failed_count += 1; } else if (actual_sh_done == 1) { - eh = shd->healed; + eh = shd->healed; + shd_crawl_event[crawl_data->child]->healed_count += 1; } - ret = -1; if (eh != NULL) { @@ -408,10 +632,20 @@ _self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *ent struct iatt parentbuf = {0}; int ret = 0; dict_t *xattr_rsp = NULL; + dict_t *xattr_req = NULL; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + ret = -1; + goto out; + } + + ret = dict_set_int32 (xattr_req, "allow-sh-for-running-transaction", 1); gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); - ret = syncop_lookup (this, child, NULL, + ret = syncop_lookup (this, child, xattr_req, iattr, &xattr_rsp, &parentbuf); _crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp, crawl_data); @@ -420,6 +654,9 @@ _self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *ent if (ret == 0) ret = _link_inode_update_loc (this, child, iattr); +out: + if (xattr_req) + dict_unref(xattr_req); return ret; } @@ -528,12 +765,20 @@ _do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, status = "Started self-heal"; _do_self_heal_on_subvol (this, i, crawl); - } else if (output) { + } else if (output && (op == INFO)) { status = ""; afr_start_crawl (this, i, INDEX, _add_summary_to_dict, output, _gf_false, 0, NULL); + } else if (output && + (op == STATISTICS_TO_BE_HEALED)) { + status = ""; + afr_start_crawl (this, i, + INDEX_TO_BE_HEALED, + _count_hard_links_under_base_indices_dir, + output, _gf_false, + 0, NULL); } } if (output) { @@ -567,8 +812,103 @@ _get_index_summary_on_local_subvols (xlator_t *this, dict_t *output) return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output); } +void +afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd= &priv->shd; + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + _add_statistics_to_dict (this, dict, i); + } + + return ; +} + +static void +reset_crawl_event (shd_crawl_event_t *crawl_event) +{ + crawl_event->healed_count = 0; + crawl_event->split_brain_count = 0; + crawl_event->heal_failed_count = 0; + GF_FREE (crawl_event->start_time_str); + crawl_event->start_time_str = NULL; + crawl_event->end_time_str = NULL; + crawl_event->crawl_type = NULL; + crawl_event->crawl_inprogress = _gf_false; + return; +} + +static void +afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst) +{ + dst->healed_count = src->healed_count; + dst->split_brain_count = src->split_brain_count; + dst->heal_failed_count = src->heal_failed_count; + dst->start_time_str = gf_strdup (src->start_time_str); + dst->end_time_str = "Crawl is already in progress"; + dst->crawl_type = src->crawl_type; + dst->crawl_inprogress = _gf_true; + return; +} + +static int +afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict) +{ + shd_crawl_event_t *evnt = NULL; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd = &priv->shd; + + evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t), + gf_afr_mt_shd_crawl_event_t); + if (!evnt) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + + reset_crawl_event (evnt); + + if (!shd->crawl_events[i]) { + continue; + } + + afr_copy_crawl_event_struct (shd->crawl_events[i], + evnt); + _add_crawl_stats_to_dict (this, dict, i, evnt, NULL); + + } + } + UNLOCK (&priv->lock); + reset_crawl_event (evnt); + GF_FREE (evnt); + +out: + return ret; +} + +static int +_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + int ret = 0; + afr_fill_completed_crawl_statistics_to_dict (this, dict); + ret = afr_fill_crawl_statistics_of_running_crawl (this, dict); + return ret; +} int -_add_all_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) +_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) { afr_private_t *priv = NULL; afr_self_heald_t *shd = NULL; @@ -618,16 +958,25 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) ret = 0; break; case GF_AFR_OP_HEALED_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->healed, output); + ret = _add_local_subvols_eh_to_dict (this, shd->healed, output); break; case GF_AFR_OP_HEAL_FAILED_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->heal_failed, + ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed, output); break; case GF_AFR_OP_SPLIT_BRAIN_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->split_brain, + ret = _add_local_subvols_eh_to_dict (this, shd->split_brain, output); break; + case GF_AFR_OP_STATISTICS: + ret = _add_local_subvols_crawl_statistics_to_dict (this, output); + break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT: + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, + STATISTICS_TO_BE_HEALED, + output); + break; default: gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); break; @@ -642,7 +991,7 @@ afr_poll_self_heal (void *data) { afr_private_t *priv = NULL; afr_self_heald_t *shd = NULL; - struct timeval timeout = {0}; + struct timespec timeout = {0}; xlator_t *this = NULL; long child = (long)data; gf_timer_t *old_timer = NULL; @@ -666,7 +1015,7 @@ afr_poll_self_heal (void *data) if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) _do_self_heal_on_subvol (this, child, INDEX); timeout.tv_sec = shd->timeout; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; //notify and previous timer should be synchronized. LOCK (&priv->lock); { @@ -800,6 +1149,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, afr_private_t *priv = NULL; dict_t *xattr = NULL; void *index_gfid = NULL; + void *base_indices_holder_vgfid = NULL; loc_t rootloc = {0}; struct iatt iattr = {0}; struct iatt parent = {0}; @@ -809,7 +1159,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, priv = this->private; if (crawl_data->crawl == FULL) { afr_build_root_loc (this, dirloc); - } else { + } else if (crawl_data->crawl == INDEX) { afr_build_root_loc (this, &rootloc); ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, GF_XATTROP_INDEX_GFID); @@ -843,6 +1193,47 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, ret = _link_inode_update_loc (this, dirloc, &iattr); if (ret) goto out; + } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + afr_build_root_loc (this, &rootloc); + ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, + GF_BASE_INDICES_HOLDER_GFID); + if (ret < 0) + goto out; + ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID, + &base_indices_holder_vgfid); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "index gfid empty " + "on %s", readdir_xl->name); + ret = -1; + goto out; + } + if (!base_indices_holder_vgfid) { + gf_log (this->name, GF_LOG_ERROR, "Base indices holder" + "virtual gfid is null on %s", readdir_xl->name); + ret = -1; + goto out; + } + uuid_copy (dirloc->gfid, base_indices_holder_vgfid); + dirloc->path = ""; + dirloc->inode = inode_new (priv->root_inode->table); + ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL, + &parent); + if (ret < 0) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_ERROR, "lookup " + "failed for base_indices_holder dir" + " on %s - (%s)", readdir_xl->name, + strerror (errno)); + + } else { + gf_log (this->name, GF_LOG_ERROR, "base_indices" + "_holder is not yet created."); + } + goto out; + } + ret = _link_inode_update_loc (this, dirloc, &iattr); + if (ret) + goto out; } ret = 0; out: @@ -907,6 +1298,16 @@ afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, priv = this->private; if (crawl_data->crawl == FULL) { ret = afr_build_child_loc (this, child, parent, entry->d_name); + } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + ret = _build_index_loc (this, child, entry->d_name, parent); + if (ret) + goto out; + child->inode = inode_new (priv->root_inode->table); + if (!child->inode) { + ret = -1; + goto out; + } + child->path = NULL; } else { child->inode = inode_new (priv->root_inode->table); if (!child->inode) @@ -956,10 +1357,14 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, ret = crawl_data->process_entry (this, crawl_data, entry, &entry_loc, parentloc, &iattr); - if (ret) + if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) { + goto out; + } else if (ret) { continue; + } - if (crawl_data->crawl == INDEX) + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED)) continue; if (!IA_ISDIR (iattr.ia_type)) @@ -974,6 +1379,10 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, } ret = 0; out: + if ((crawl_data->crawl == INDEX_TO_BE_HEALED) && ret) { + gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink " + "count"); + } loc_wipe (&entry_loc); return ret; } @@ -1021,6 +1430,9 @@ _crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) ret = _process_entries (this, loc, &entries, &offset, crawl_data); + if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) { + goto out; + } gf_dirent_free (&entries); free_entries = _gf_false; } @@ -1126,8 +1538,13 @@ afr_dir_crawl (void *data) goto out; ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); - if (ret) + if (ret) { + if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open base_" + "indices_holder"); + } goto out; + } ret = _crawl_directory (fd, &dirloc, crawl_data); if (ret) @@ -1141,12 +1558,102 @@ afr_dir_crawl (void *data) out: if (fd) fd_unref (fd); - if (crawl_data->crawl == INDEX) + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED )) dirloc.path = NULL; loc_wipe (&dirloc); return ret; } +char * +get_crawl_type_in_string (afr_crawl_type_t crawl) +{ + char *index = "INDEX"; + char *full = "FULL"; + char *crawl_type = NULL; + + if (crawl == INDEX){ + crawl_type = index; + } else if (crawl == FULL) { + crawl_type = full; + } + + return crawl_type; +} + +static int +afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = 0; + shd_crawl_event_t *crawl_event = NULL; + time_t get_time = 0; + + priv = this->private; + shd = &priv->shd; + + crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1, + gf_afr_mt_shd_crawl_event_t); + if (!crawl_event) { + ret = -1; + goto out; + } + + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + + crawl_event->start_time_str = gf_strdup (ctime(&get_time)); + + crawl_event->crawl_type = get_crawl_type_in_string (crawl); + if (!crawl_event->crawl_type) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + shd->crawl_events[child] = crawl_event; + } + UNLOCK (&priv->lock); + ret = 0; +out: + return ret; + +} + +static int +afr_put_crawl_event_in_eh (xlator_t *this, int child) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = 0; + time_t get_time = 0; + shd_crawl_event_t **crawl_event = NULL; + + priv = this->private; + shd = &priv->shd; + + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + crawl_event = (shd_crawl_event_t**)shd->crawl_events; + LOCK (&priv->lock); + { + crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time)); + ret = eh_save_history (shd->statistics[child], + crawl_event[child]); + crawl_event[child] = NULL; + } + UNLOCK (&priv->lock); +out: + return ret; +} + static int afr_dir_exclusive_crawl (void *data) { @@ -1182,7 +1689,15 @@ afr_dir_exclusive_crawl (void *data) } do { + ret = afr_allocate_crawl_event (this, child, crawl_data->crawl); + if (ret) + goto out; afr_dir_crawl (data); + + ret = afr_put_crawl_event_in_eh (this, child); + if (ret < 0) + goto out; + LOCK (&priv->lock); { if (shd->pending[child] != NONE) { @@ -1239,8 +1754,8 @@ afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, ret = synctask_new (this->ctx->env, crawler, crawl_done, frame, crawl_data); if (ret) - gf_log (this->name, GF_LOG_ERROR, "Could not create the " - "task for %d ret %d", idx, ret); + gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child" + " %d with ret %d", idx, ret); out: return; } @@ -1270,4 +1785,3 @@ afr_set_root_gfid (dict_t *dict) return ret; } - diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index 32a8aaca5..e0c083754 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -29,6 +29,19 @@ typedef struct afr_crawl_data_ { struct iatt *iattr); } afr_crawl_data_t; +typedef struct crawl_event_stats_ { + uint64_t healed_count; + uint64_t split_brain_count; + uint64_t heal_failed_count; + char *start_time_str; + char *end_time_str; + char *crawl_type; + gf_boolean_t crawl_inprogress; +} shd_crawl_event_t; + +void _destroy_crawl_event_data (void *data); +void _destroy_shd_event_data (void *data); + typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, loc_t *child, loc_t *parent, struct iatt *iattr); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 217ff8548..20306e469 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -79,7 +79,7 @@ afr_save_lk_owner (call_frame_t *frame) local = frame->local; - local->saved_lk_owner = frame->root->lk_owner; + local->saved_lk_owner = frame->root->lk_owner; } @@ -90,10 +90,9 @@ afr_restore_lk_owner (call_frame_t *frame) local = frame->local; - frame->root->lk_owner = local->saved_lk_owner; + frame->root->lk_owner = local->saved_lk_owner; } - static void __mark_all_pending (int32_t *pending[], int child_count, afr_transaction_type type) @@ -369,7 +368,7 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (call_count == 0) { if (local->transaction.resume_stub) { - AFR_CALL_RESUME (local->transaction.resume_stub); + call_resume (local->transaction.resume_stub); local->transaction.resume_stub = NULL; } @@ -443,14 +442,30 @@ out: return; } +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +{ + afr_inodelk_t *inodelk = NULL; + int i = 0; + + for (i = 0; int_lock->inodelk[i].domain; i++) { + inodelk = &int_lock->inodelk[i]; + if (strcmp (dom, inodelk->domain) == 0) + return inodelk; + } + return NULL; +} + unsigned char* afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) { unsigned char *locked_nodes = NULL; + afr_inodelk_t *inodelk = NULL; switch (type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - locked_nodes = int_lock->inode_locked_nodes; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + locked_nodes = inodelk->locked_nodes; break; case AFR_ENTRY_TRANSACTION: @@ -553,24 +568,77 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, gf_boolean_t afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int index = -1; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int index = -1; + int i = 0; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { + for (i = 0; i < priv->child_count; i++) { if (local->pending[i][index] == 0) - return _gf_false; + return _gf_false; } - return _gf_true; + return _gf_true; } +static void +afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) +{ + xlator_t *this = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + this = frame->this; + local = frame->local; + priv = this->private; + + if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && + (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) + return; + + if (local->op_ret >= 0) + goto out; + + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +out: + return; +} + +static void +afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + gf_boolean_t all_quota_failures = _gf_false; + + local = frame->local; + priv = this->private; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + /* + * Idea is to not leave the file in FOOL-FOOL scenario in case on + * all the bricks data transaction failed with EDQUOT to avoid + * increasing un-necessary load of self-heals in the system. + */ + all_quota_failures = _gf_true; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + (local->child_errno[i] != EDQUOT)) { + all_quota_failures = _gf_false; + break; + } + } + if (all_quota_failures) + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +} int afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) @@ -593,6 +661,9 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) local->transaction.pre_op, local->transaction.type); + afr_data_handle_quota_errors (frame, this); + afr_dir_fop_handle_all_fop_failures (frame); + if (local->fd) afr_transaction_rm_stale_children (frame, this, local->fd->inode, @@ -619,7 +690,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) goto out; } - nothing_failed = afr_txn_nothing_failed (frame, this); + nothing_failed = afr_txn_nothing_failed (frame, this); afr_compute_txn_changelog (local , priv); @@ -645,14 +716,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) break; } - /* local->transaction.postop_piggybacked[] was - precomputed in is_piggyback_postop() when called from - afr_changelog_post_op_safe() - */ + /* local->transaction.postop_piggybacked[] was + precomputed in is_piggyback_postop() when called from + afr_changelog_post_op_safe() + */ - piggyback = 0; - if (local->transaction.postop_piggybacked[i]) - piggyback = 1; + piggyback = 0; + if (local->transaction.postop_piggybacked[i]) + piggyback = 1; afr_set_postop_dict (local, this, xattr[i], piggyback, i); @@ -906,7 +977,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) } UNLOCK (&local->fd->lock); - afr_set_delayed_post_op (frame, this); + afr_set_delayed_post_op (frame, this); if (piggyback) afr_changelog_pre_op_cbk (frame, (void *)(long)i, @@ -1179,12 +1250,14 @@ int afr_set_transaction_flock (afr_local_t *local) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->lk_flock.l_len = local->transaction.len; - int_lock->lk_flock.l_start = local->transaction.start; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + inodelk->flock.l_type = F_WRLCK; return 0; } @@ -1199,6 +1272,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; switch (local->transaction.type) { case AFR_DATA_TRANSACTION: @@ -1259,69 +1333,115 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) void afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - /* call this function from any of the related optimizations - which benefit from delaying post op are enabled, namely: + /* call this function from any of the related optimizations + which benefit from delaying post op are enabled, namely: - - changelog piggybacking - - eager locking - */ + - changelog piggybacking + - eager locking + */ - priv = this->private; - if (!priv) - return; + priv = this->private; + if (!priv) + return; - if (!priv->post_op_delay_secs) - return; + if (!priv->post_op_delay_secs) + return; local = frame->local; if (!local->transaction.eager_lock_on) return; - if (!local) - return; + if (!local) + return; - if (!local->fd) - return; + if (!local->fd) + return; - if (local->op == GF_FOP_WRITE) - local->delayed_post_op = _gf_true; + if (local->op == GF_FOP_WRITE) + local->delayed_post_op = _gf_true; } +gf_boolean_t +afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ictx = NULL; + + if (!inode) { + /* If false is returned, it may keep on taking eager-lock + * which may lead to starvation, so return true to avoid that. + */ + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); + return _gf_true; + } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. + * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 + */ + + ictx = afr_inode_ctx_get (inode, this); + if (!ictx) + return _gf_true; + + if (ictx->open_fd_count > 1) + return _gf_true; + + return _gf_false; +} + +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) +{ + if (local->success_count != priv->child_count) + return _gf_true; + return _gf_false; +} gf_boolean_t is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - gf_boolean_t res = _gf_false; + afr_local_t *local = NULL; + gf_boolean_t res = _gf_false; + afr_private_t *priv = NULL; - local = frame->local; - if (!local) - goto out; + priv = this->private; - if (!local->delayed_post_op) - goto out; + local = frame->local; + if (!local) + goto out; - res = _gf_true; + if (!local->delayed_post_op) + goto out; + + //Mark pending changelog ASAP + if (afr_any_fops_failed (local, priv)) + goto out; + + if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) + goto out; + + res = _gf_true; out: - return res; + return res; } void afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub); + call_stub_t *stub); void afr_delayed_changelog_wake_up_cbk (void *data) { - fd_t *fd = NULL; + fd_t *fd = NULL; - fd = data; + fd = data; - afr_delayed_changelog_wake_up (THIS, fd); + afr_delayed_changelog_wake_up (THIS, fd); } @@ -1332,39 +1452,39 @@ afr_delayed_changelog_wake_up_cbk (void *data) static gf_boolean_t is_piggyback_post_op (call_frame_t *frame, fd_t *fd) { - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *local = NULL; - gf_boolean_t piggyback = _gf_true; - afr_private_t *priv = NULL; - int i = 0; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *local = NULL; + gf_boolean_t piggyback = _gf_true; + afr_private_t *priv = NULL; + int i = 0; - priv = frame->this->private; - local = frame->local; - fdctx = afr_fd_ctx_get (fd, frame->this); + priv = frame->this->private; + local = frame->local; + fdctx = afr_fd_ctx_get (fd, frame->this); - LOCK(&fd->lock); - { - piggyback = _gf_true; - - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - local->transaction.postop_piggybacked[i] = 1; - } else { - /* For at least _one_ subvolume we cannot - piggyback on the changelog, and have to - perform a hard POST-OP and therefore fsync - if necesssary - */ - piggyback = _gf_false; + LOCK(&fd->lock); + { + piggyback = _gf_true; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + if (fdctx->pre_op_piggyback[i]) { + fdctx->pre_op_piggyback[i]--; + local->transaction.postop_piggybacked[i] = 1; + } else { + /* For at least _one_ subvolume we cannot + piggyback on the changelog, and have to + perform a hard POST-OP and therefore fsync + if necesssary + */ + piggyback = _gf_false; GF_ASSERT (fdctx->pre_op_done[i]); fdctx->pre_op_done[i]--; - } - } - } - UNLOCK(&fd->lock); + } + } + } + UNLOCK(&fd->lock); if (!afr_txn_nothing_failed (frame, frame->this)) { /* something failed in this transaction, @@ -1381,186 +1501,205 @@ is_piggyback_post_op (call_frame_t *frame, fd_t *fd) int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) { - afr_fd_ctx_t *fdctx = NULL; + afr_fd_ctx_t *fdctx = NULL; - fdctx = afr_fd_ctx_get (fd, this); + fdctx = afr_fd_ctx_get (fd, this); - LOCK(&fd->lock); - { - fdctx->witnessed_unstable_write = _gf_true; - } - UNLOCK(&fd->lock); + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&fd->lock); - return 0; + return 0; } /* TEST and CLEAR operation */ gf_boolean_t afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) { - afr_fd_ctx_t *fdctx = NULL; - gf_boolean_t witness = _gf_false; + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; fdctx = afr_fd_ctx_get (fd, this); + if (!fdctx) + return _gf_true; - LOCK(&fd->lock); - { - if (fdctx->witnessed_unstable_write) { - witness = _gf_true; - fdctx->witnessed_unstable_write = _gf_false; - } - } - UNLOCK (&fd->lock); + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); - return witness; + return witness; } int afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, - struct iatt *post, dict_t *xdata) + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - afr_private_t *priv = NULL; + afr_private_t *priv = NULL; int child_index = (long) cookie; - int call_count = -1; - afr_local_t *local = NULL; + int call_count = -1; + afr_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - if (afr_fop_failed (op_ret, op_errno)) { - /* Failure of fsync() is as good as failure of previous - write(). So treat it like one. - */ - gf_log (this->name, GF_LOG_WARNING, - "fsync(%s) failed on subvolume %s. Transaction was %s", - uuid_utoa (local->fd->inode->gfid), - priv->children[child_index]->name, - gf_fop_list[local->op]); - - afr_transaction_fop_failed (frame, this, child_index); - } + if (afr_fop_failed (op_ret, op_errno)) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); + } call_count = afr_frame_return (frame); if (call_count == 0) - afr_changelog_post_op_now (frame, this); + afr_changelog_post_op_now (frame, this); - return 0; + return 0; } int afr_changelog_fsync (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; call_count = afr_pre_op_done_children_count (local->transaction.pre_op, priv->child_count); - if (!call_count) { - /* will go straight to unlock */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } - local->call_count = call_count; + local->call_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op[i]) - continue; + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); - STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fsync, local->fd, - 1, NULL); - if (!--call_count) - break; - } + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); + if (!--call_count) + break; + } - return 0; + if (xdata) + dict_unref (xdata); + + return 0; } -int + int afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; + afr_private_t *priv = NULL; local = frame->local; + priv = this->private; - if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { - afr_changelog_post_op_now (frame, this); - return 0; - } - - if (is_piggyback_post_op (frame, local->fd)) { - /* just detected that this post-op is about to - be optimized away as a new write() has - already piggybacked on this frame's changelog. - */ - afr_changelog_post_op_now (frame, this); - return 0; - } + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } - /* Calling afr_changelog_post_op_now() now will result in - issuing ->[f]xattrop(). - - Performing a hard POST-OP (->[f]xattrop() FOP) is a more - responsible operation that what it might appear on the surface. - - The changelog of a file (in the xattr of the file on the server) - stores information (pending count) about the state of the file - on the OTHER server. This changelog is blindly trusted, and must - therefore be updated in such a way it remains trustworthy. This - implies that decrementing the pending count (essentially "clearing - the dirty flag") must be done STRICTLY after we are sure that the - operation on the other server has reached stable storage. - - While the backend filesystem on that server will eventually flush - it to stable storage, we (being in userspace) have no mechanism - to get notified when the write became "stable". - - This means we need take matter into our own hands and issue an - fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, - and get an acknowledgement for it. And we need to wait for the - fsync() acknowledgement before initiating the hard POST-OP. - - However if the FD itself was opened in O_SYNC or O_DSYNC then - we are already guaranteed that the writes were made stable as - part of the FOP itself. The same holds true for NFS stable - writes which happen on an anonymous FD with O_DSYNC or O_SYNC - flag set in the writev() @flags param. For all other write types, - mark a flag in the fdctx whenever an unstable write is witnessed. - */ - - if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { - afr_changelog_post_op_now (frame, this); - return 0; - } + if (is_piggyback_post_op (frame, local->fd)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now (frame, this); + return 0; + } - /* Time to fsync() */ + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + } - afr_changelog_fsync (frame, this); + /* Check whether users want durability and perform fsync/post-op + * accordingly. + */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync (frame, this); + } else { + afr_changelog_post_op_now (frame, this); + } - return 0; + return 0; } void afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, - call_stub_t *stub) + call_stub_t *stub) { afr_fd_ctx_t *fd_ctx = NULL; call_frame_t *prev_frame = NULL; - struct timeval delta = {0, }; + struct timespec delta = {0, }; afr_private_t *priv = NULL; afr_local_t *local = NULL; @@ -1568,10 +1707,10 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, fd_ctx = afr_fd_ctx_get (fd, this); if (!fd_ctx) - return; + goto out; delta.tv_sec = priv->post_op_delay_secs; - delta.tv_usec = 0; + delta.tv_nsec = 0; pthread_mutex_lock (&fd_ctx->delay_lock); { @@ -1590,6 +1729,7 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, unlock: pthread_mutex_unlock (&fd_ctx->delay_lock); +out: if (prev_frame) { local = prev_frame->local; local->transaction.resume_stub = stub; @@ -1603,14 +1743,14 @@ unlock: void afr_changelog_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (is_afr_delayed_changelog_post_op_needed (frame, this)) - afr_delayed_changelog_post_op (this, frame, local->fd, NULL); - else - afr_changelog_post_op_safe (frame, this); + if (is_afr_delayed_changelog_post_op_needed (frame, this)) + afr_delayed_changelog_post_op (this, frame, local->fd, NULL); + else + afr_changelog_post_op_safe (frame, this); } @@ -1621,22 +1761,22 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this) The @stub gets saved in @local and gets resumed in afr_local_cleanup() -*/ + */ void afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) { - afr_delayed_changelog_post_op (this, NULL, fd, stub); + afr_delayed_changelog_post_op (this, NULL, fd, stub); } void afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) { - afr_delayed_changelog_post_op (this, NULL, fd, NULL); + afr_delayed_changelog_post_op (this, NULL, fd, NULL); } -int + int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1647,18 +1787,18 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - if (local->transaction.eager_lock_on) { - /* We don't need to retain "local" in the - fd list anymore, writes to all subvols - are finished by now */ - LOCK (&local->fd->lock); - { - list_del_init (&local->transaction.eager_locked); - } - UNLOCK (&local->fd->lock); - } + if (local->transaction.eager_lock_on) { + /* We don't need to retain "local" in the + fd list anymore, writes to all subvols + are finished by now */ + LOCK (&local->fd->lock); + { + list_del_init (&local->transaction.eager_locked); + } + UNLOCK (&local->fd->lock); + } - afr_restore_lk_owner (frame); + afr_restore_lk_owner (frame); if (__fop_changelog_needed (frame, this)) { afr_changelog_post_op (frame, this); @@ -1680,7 +1820,8 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) */ void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1689,55 +1830,56 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index priv = this->private; __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + child_index, local->transaction.type); } -static gf_boolean_t + static gf_boolean_t afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) { - uint64_t start1 = local1->transaction.start; - uint64_t start2 = local2->transaction.start; - uint64_t end1 = 0; - uint64_t end2 = 0; - - if (local1->transaction.len) - end1 = start1 + local1->transaction.len - 1; - else - end1 = ULLONG_MAX; - - if (local2->transaction.len) - end2 = start2 + local2->transaction.len - 1; - else - end2 = ULLONG_MAX; - - return ((end1 >= start2) && (end2 >= start1)); -} + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); +} void afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) { - afr_private_t *priv = NULL; - afr_fd_ctx_t *fdctx = NULL; - afr_local_t *each = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *each = NULL; - priv = this->private; + priv = this->private; - if (!local->fd) - return; + if (!local->fd) + return; - if (local->transaction.type != AFR_DATA_TRANSACTION) - return; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; - if (!priv->eager_lock) - return; + if (!priv->eager_lock) + return; - fdctx = afr_fd_ctx_get (local->fd, this); - if (!fdctx) - return; + fdctx = afr_fd_ctx_get (local->fd, this); + if (!fdctx) + return; + if (afr_are_multiple_fds_opened (local->fd->inode, this)) + return; /* * Once full file lock is acquired in eager-lock phase, overlapping * writes do not compete for inode-locks, instead are transferred to the @@ -1755,22 +1897,22 @@ afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) * This check makes sure the locks are not transferred for * overlapping writes. */ - LOCK (&local->fd->lock); - { - list_for_each_entry (each, &fdctx->eager_locked, - transaction.eager_locked) { - if (afr_locals_overlap (each, local)) { - local->transaction.eager_lock_on = _gf_false; - goto unlock; - } - } - - local->transaction.eager_lock_on = _gf_true; - list_add_tail (&local->transaction.eager_locked, - &fdctx->eager_locked); - } + LOCK (&local->fd->lock); + { + list_for_each_entry (each, &fdctx->eager_locked, + transaction.eager_locked) { + if (afr_locals_overlap (each, local)) { + local->transaction.eager_lock_on = _gf_false; + goto unlock; + } + } + + local->transaction.eager_lock_on = _gf_true; + list_add_tail (&local->transaction.eager_locked, + &fdctx->eager_locked); + } unlock: - UNLOCK (&local->fd->lock); + UNLOCK (&local->fd->lock); } @@ -1789,11 +1931,10 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) local->transaction.type = type; ret = afr_transaction_local_init (local, this); - if (ret < 0) { + if (ret < 0) goto out; - } - afr_transaction_eager_lock_init (local, this); + afr_transaction_eager_lock_init (local, this); if (local->fd && local->transaction.eager_lock_on) afr_set_lk_owner (frame, this, local->fd); @@ -1802,6 +1943,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) if (!local->transaction.eager_lock_on && local->loc.inode) { fd = fd_lookup (local->loc.inode, frame->root->pid); + if (fd == NULL) + fd = fd_lookup_anonymous (local->loc.inode); + if (fd) { afr_delayed_changelog_wake_up (this, fd); fd_unref (fd); diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index 55e8bbcca..fa626fd0d 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -23,6 +23,9 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); + int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); @@ -40,4 +43,9 @@ afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); void __mark_all_success (int32_t *pending[], int child_count, afr_transaction_type type); +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv); + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index bee10fd01..c724eb2ae 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -191,6 +191,8 @@ reconfigure (xlator_t *this, dict_t *options) /* Reset this so we re-discover in case the topology changed. */ GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, bool, out); + GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, + bool, out); priv->did_discovery = _gf_false; ret = 0; @@ -335,6 +337,8 @@ init (xlator_t *this) GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); + GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, + out); priv->wait_count = 1; @@ -376,8 +380,6 @@ init (xlator_t *this) AFR_XATTR_PREFIX, trav->xlator->name); if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); ret = -ENOMEM; goto out; } @@ -386,6 +388,13 @@ init (xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, + this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), gf_afr_mt_int32_t); if (!priv->last_event) { @@ -430,15 +439,18 @@ init (xlator_t *this) if (!priv->shd.timer) goto out; - priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false); + priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + _destroy_shd_event_data); if (!priv->shd.healed) goto out; - priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false); + priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + _destroy_shd_event_data); if (!priv->shd.heal_failed) goto out; - priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false); + priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + _destroy_shd_event_data); if (!priv->shd.split_brain) goto out; @@ -448,7 +460,9 @@ init (xlator_t *this) priv->root_inode = inode_ref (this->itable->root); GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); - + ret = afr_initialise_statistics (this); + if (ret) + goto out; ret = 0; out: return ret; @@ -483,6 +497,9 @@ struct xlator_fops fops = { .finodelk = afr_finodelk, .entrylk = afr_entrylk, .fentrylk = afr_fentrylk, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, /* inode read */ .access = afr_access, @@ -765,5 +782,12 @@ struct volume_options options[] = { .description = "readdir(p) will not failover if this option is off", .default_value = "on", }, + { .key = {"ensure-durability"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "Afr performs fsyncs for transactions if this " + "option is on to make sure the changelogs/data is " + "written to the disk", + .default_value = "on", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 387ed12ec..21064db58 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -28,8 +28,10 @@ #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" #define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 struct _pump_private; @@ -86,26 +88,30 @@ typedef struct afr_inode_ctx_ { int32_t *fresh_children;//increasing order of latency afr_spb_state_t mdata_spb; afr_spb_state_t data_spb; + uint32_t open_fd_count; } afr_inode_ctx_t; typedef enum { NONE, INDEX, + INDEX_TO_BE_HEALED, FULL, } afr_crawl_type_t; typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t iamshd; - afr_crawl_type_t *pending; - gf_boolean_t *inprogress; - afr_child_pos_t *pos; - gf_timer_t **timer; - eh_t *healed; - eh_t *heal_failed; - eh_t *split_brain; - char *node_uuid; - int timeout; + gf_boolean_t enabled; + gf_boolean_t iamshd; + afr_crawl_type_t *pending; + gf_boolean_t *inprogress; + afr_child_pos_t *pos; + gf_timer_t **timer; + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; + void **crawl_events; + char *node_uuid; + int timeout; } afr_self_heald_t; typedef struct _afr_private { @@ -170,9 +176,38 @@ typedef struct _afr_private { gf_boolean_t did_discovery; gf_boolean_t readdir_failover; uint64_t sh_readdir_size; + gf_boolean_t ensure_durability; + char *sh_domain; } afr_private_t; +typedef enum { + AFR_SELF_HEAL_NOT_ATTEMPTED, + AFR_SELF_HEAL_STARTED, + AFR_SELF_HEAL_FAILED, + AFR_SELF_HEAL_SYNC_BEGIN, +} afr_self_heal_status; + typedef struct { + afr_self_heal_status gfid_or_missing_entry_self_heal; + afr_self_heal_status metadata_self_heal; + afr_self_heal_status data_self_heal; + afr_self_heal_status entry_self_heal; +} afr_sh_status_for_all_type; + +typedef enum { + AFR_SELF_HEAL_ENTRY, + AFR_SELF_HEAL_METADATA, + AFR_SELF_HEAL_DATA, + AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, + AFR_SELF_HEAL_INVALID = -1, +} afr_self_heal_type; + +typedef enum { + AFR_CHECK_ALL, + AFR_CHECK_SPECIFIC, +} afr_sh_fail_check_type; + +struct afr_self_heal_ { /* External interface: These are variables (some optional) that are set by whoever has triggered self-heal */ @@ -249,10 +284,10 @@ typedef struct { const char *linkname; gf_boolean_t entries_skipped; - int op_failed; gf_boolean_t actual_sh_started; gf_boolean_t sync_done; gf_boolean_t data_lock_held; + gf_boolean_t sh_dom_lock_held; gf_boolean_t eof_reached; fd_t *healing_fd; int file_has_holes; @@ -263,13 +298,17 @@ typedef struct { uint8_t *checksum; afr_post_remove_call_t post_remove_call; - loc_t parent_loc; + char *data_sh_info; + char *metadata_sh_info; + loc_t parent_loc; call_frame_t *orig_frame; call_frame_t *old_loop_frame; gf_boolean_t unwound; afr_sh_algo_private_t *private; + afr_sh_status_for_all_type afr_all_sh_status; + afr_self_heal_type sh_type_in_action; struct afr_sh_algorithm *algo; afr_lock_cbk_t data_lock_success_handler; @@ -282,7 +321,9 @@ typedef struct { void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); call_frame_t *sh_frame; -} afr_self_heal_t; +}; + +typedef struct afr_self_heal_ afr_self_heal_t; typedef enum { AFR_DATA_TRANSACTION, /* truncate, write, ... */ @@ -356,12 +397,19 @@ int afr_entry_lockee_cmp (const void *l1, const void *l2); typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; + +typedef struct { loc_t *lk_loc; - struct gf_flock lk_flock; int lockee_count; afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -370,13 +418,11 @@ typedef struct { unsigned char *locked_nodes; unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; selfheal_lk_type_t selfheal_lk_type; transaction_lk_type_t transaction_lk_type; int32_t lock_count; - int32_t inodelk_lock_count; int32_t entrylk_lock_count; uint64_t lock_number; @@ -387,6 +433,7 @@ typedef struct { int32_t lock_op_ret; int32_t lock_op_errno; afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; typedef struct _afr_locked_fd { @@ -406,9 +453,11 @@ typedef struct _afr_local { unsigned int call_count; unsigned int success_count; unsigned int enoent_count; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; - unsigned int govinda_gOvinda; + unsigned int unhealable; unsigned int read_child_index; unsigned char read_child_returned; @@ -448,15 +497,23 @@ typedef struct _afr_local { int optimistic_change_log; gf_boolean_t delayed_post_op; + /* Is the current writev() going to perform a stable write? i.e, is fd->flags or @flags writev param have O_SYNC or O_DSYNC? */ - gf_boolean_t stable_write; + gf_boolean_t stable_write; - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; + + int allow_sh_for_running_transaction; + + + /* This struct contains the arguments for the "continuation" + (scheme-like) of fops */ int op; @@ -552,7 +609,9 @@ typedef struct _afr_local { struct { struct iatt prebuf; struct iatt postbuf; + } inode_wfop; //common structure for all inode-write-fops + struct { int32_t op_ret; struct iovec *vector; @@ -563,34 +622,21 @@ typedef struct _afr_local { } writev; struct { - struct iatt prebuf; - struct iatt postbuf; - } fsync; - - struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } truncate; struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } ftruncate; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } setattr; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } fsetattr; struct { @@ -652,6 +698,26 @@ typedef struct _afr_local { dict_t *params; char *linkpath; } symlink; + + struct { + int32_t mode; + off_t offset; + size_t len; + } fallocate; + + struct { + off_t offset; + size_t len; + } discard; + + struct { + off_t offset; + size_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; + + } cont; struct { @@ -824,8 +890,8 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this); int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); -void -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count); int pump_start (call_frame_t *frame, xlator_t *this); @@ -927,22 +993,6 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd); } \ } while (0); -#define AFR_CALL_RESUME(stub) \ - do { \ - afr_local_t *__local = NULL; \ - xlator_t *__this = NULL; \ - \ - __local = stub->frame->local; \ - __this = stub->frame->this; \ - stub->frame->local = NULL; \ - \ - call_resume (stub); \ - if (__local) { \ - afr_local_cleanup (__local, __this); \ - mem_put (__local); \ - } \ - } while (0) - #define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/ /* allocate and return a string that is the basename of argument */ static inline char * @@ -1120,6 +1170,27 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); } while (0); +#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." + +#define AFR_SBRAIN_CHECK_FD(fd, label) do { \ + if (fd->inode && afr_is_split_brain (this, fd->inode)) { \ + op_errno = EIO; \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \ + goto label; \ + } \ +} while (0) + +#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \ + if (loc->inode && afr_is_split_brain (this, loc->inode)) { \ + op_errno = EIO; \ + loc_path (loc, NULL); \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG , loc->path); \ + goto label; \ + } \ +} while (0) + int afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); @@ -1129,4 +1200,13 @@ afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); void afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this); + #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index db83410c7..a7f72fb30 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -2501,6 +2501,12 @@ init (xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); + if (-1 == ret) { + op_errno = ENOMEM; + goto out; + } + priv->first_lookup = 1; priv->root_inode = NULL; @@ -2532,7 +2538,7 @@ init (xlator_t *this) goto out; } - pump_priv->env = syncenv_new (0); + pump_priv->env = this->ctx->env; if (!pump_priv->env) { gf_log (this->name, GF_LOG_ERROR, "Could not create new sync-environment"); @@ -2573,9 +2579,6 @@ fini (xlator_t *this) if (!pump_priv) goto afr_priv; - if (pump_priv->env) - syncenv_destroy (pump_priv->env); - GF_FREE (pump_priv->resume_path); LOCK_DESTROY (&pump_priv->resume_path_lock); LOCK_DESTROY (&pump_priv->pump_state_lock); |
