From bfc0e16e43815ab6d6e67f4bd26694ebd72b3360 Mon Sep 17 00:00:00 2001 From: Pranith K Date: Thu, 14 Jul 2011 08:07:04 +0000 Subject: cluster/afr: Add fresh children along with read-child to inode context Signed-off-by: Pranith Kumar K Signed-off-by: Anand Avati BUG: 2840 (files not getting self-healed when the first child goes down) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=2840 --- xlators/cluster/afr/src/afr-common.c | 680 +++++++++++++++++------ xlators/cluster/afr/src/afr-dir-read.c | 2 +- xlators/cluster/afr/src/afr-dir-write.c | 134 +++-- xlators/cluster/afr/src/afr-inode-read.c | 55 +- xlators/cluster/afr/src/afr-inode-write.c | 12 +- xlators/cluster/afr/src/afr-mem-types.h | 1 + xlators/cluster/afr/src/afr-open.c | 2 +- xlators/cluster/afr/src/afr-self-heal-common.c | 18 +- xlators/cluster/afr/src/afr-self-heal-data.c | 88 +-- xlators/cluster/afr/src/afr-self-heal-entry.c | 30 +- xlators/cluster/afr/src/afr-self-heal-metadata.c | 38 +- xlators/cluster/afr/src/afr-self-heal.h | 5 +- xlators/cluster/afr/src/afr-transaction.c | 38 +- xlators/cluster/afr/src/afr.c | 1 + xlators/cluster/afr/src/afr.h | 57 +- xlators/cluster/afr/src/pump.c | 9 +- 16 files changed, 817 insertions(+), 353 deletions(-) (limited to 'xlators') diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index b753cbfa8..e8afc6d8d 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -87,187 +87,444 @@ out: return ret; } -uint64_t -afr_is_split_brain (xlator_t *this, inode_t *inode) +afr_inode_ctx_t* +afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count) { - int ret = 0; + int ret = -1; + afr_inode_ctx_t *ctx = NULL; + size_t size = 0; - uint64_t ctx = 0; - uint64_t split_brain = 0; + GF_ASSERT (child_count > 0); - VALIDATE_OR_GOTO (inode, out); + if (!addr) { + ctx = GF_CALLOC (1, sizeof (*ctx), + gf_afr_mt_inode_ctx_t); + if (!ctx) + goto out; + size = sizeof (*ctx->fresh_children); 
+ ctx->fresh_children = GF_CALLOC (child_count, size, + gf_afr_mt_int32_t); + if (!ctx->fresh_children) + goto out; + } else { + ctx = (afr_inode_ctx_t*) (long) addr; + } + ret = 0; +out: + if (ret && ctx) { + if (ctx->fresh_children) + GF_FREE (ctx->fresh_children); + GF_FREE (ctx); + ctx = NULL; + } + return ctx; +} + +void +afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +{ + GF_ASSERT (inode); + GF_ASSERT (params); + int ret = 0; + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + uint64_t ctx_addr = 0; + int32_t read_child = -1; + int32_t *fresh_children = NULL; + + priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx); - + ret = __inode_ctx_get (inode, this, &ctx_addr); if (ret < 0) goto unlock; - - split_brain = ctx & AFR_ICTX_SPLIT_BRAIN_MASK; + ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + if (!ctx) + goto unlock; + switch (params->mask_type) { + case AFR_ICTX_READ_CHILD_MASK: + fresh_children = params->u.read_ctx.fresh_children; + read_child = (int32_t)(ctx->masks & + AFR_ICTX_READ_CHILD_MASK); + params->u.read_ctx.read_child = read_child; + if (!fresh_children) + goto unlock; + for (i = 0; i < priv->child_count; i++) + fresh_children[i] = ctx->fresh_children[i]; + break; + case AFR_ICTX_OPENDIR_DONE_MASK: + params->u.value = ctx->masks & + AFR_ICTX_OPENDIR_DONE_MASK; + break; + case AFR_ICTX_SPLIT_BRAIN_MASK: + params->u.value = ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK; + break; + } } unlock: UNLOCK (&inode->lock); +} -out: - return split_brain; +uint64_t +afr_is_split_brain (xlator_t *this, inode_t *inode) +{ + afr_inode_params_t params = {0}; + + params.mask_type = AFR_ICTX_SPLIT_BRAIN_MASK; + afr_inode_get_ctx (this, inode, &params); + return params.u.value; +} + +gf_boolean_t +afr_is_opendir_done (xlator_t *this, inode_t *inode) +{ + afr_inode_params_t params = {0}; + + params.mask_type = AFR_ICTX_OPENDIR_DONE_MASK; + afr_inode_get_ctx (this, 
inode, &params); + return params.u.value; } +int32_t +afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) +{ + afr_inode_params_t params = {0}; + + params.mask_type = AFR_ICTX_READ_CHILD_MASK; + params.u.read_ctx.fresh_children = fresh_children; + afr_inode_get_ctx (this, inode, &params); + return params.u.read_ctx.read_child; +} + void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child, + int32_t *fresh_children, int32_t child_count) { - uint64_t ctx = 0; - int ret = 0; + uint64_t rest_of_mask = 0; + uint64_t mask = 0; + int i = 0; - VALIDATE_OR_GOTO (inode, out); + rest_of_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks); + mask = (AFR_ICTX_READ_CHILD_MASK & read_child); + ctx->masks = rest_of_mask | mask; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + /* avoid memcpy as int, int32_t are used interchangeably + */ + for (i = 0; i < child_count; i++) { + if (fresh_children) + ctx->fresh_children[i] = fresh_children[i]; + else + ctx->fresh_children[i] = -1; + } +} - if (ret < 0) { - ctx = 0; - } +void +afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) +{ + uint64_t rest_of_mask = 0; + uint64_t mask = 0; - if (set) { - ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx) - | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - } else { - ctx = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx); - } + rest_of_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks); + mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); + ctx->masks = rest_of_mask | mask; +} - ret = __inode_ctx_put (inode, this, ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_INFO, - "failed to set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } +void +afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set) +{ + uint64_t rest_of_mask = 0; + uint64_t mask = 0; + + if (set) { + rest_of_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); + mask = 
(0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); + ctx->masks = rest_of_mask | mask; + } else { + ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); } - UNLOCK (&inode->lock); -out: - return; } - -uint64_t -afr_is_opendir_done (xlator_t *this, inode_t *inode) +void +afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) { - int ret = 0; - uint64_t ctx = 0; - uint64_t opendir_done = 0; + GF_ASSERT (inode); + GF_ASSERT (params); - VALIDATE_OR_GOTO (inode, out); + int ret = 0; + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + uint64_t ctx_addr = 0; + gf_boolean_t set = _gf_false; + int32_t read_child = -1; + int32_t *fresh_children = NULL; + priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx); - + ret = __inode_ctx_get (inode, this, &ctx_addr); if (ret < 0) + ctx_addr = 0; + ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + if (!ctx) goto unlock; - - opendir_done = ctx & AFR_ICTX_OPENDIR_DONE_MASK; + switch (params->mask_type) { + case AFR_ICTX_READ_CHILD_MASK: + read_child = params->u.read_ctx.read_child; + fresh_children = params->u.read_ctx.fresh_children; + afr_inode_ctx_set_read_ctx (ctx, read_child, + fresh_children, + priv->child_count); + break; + case AFR_ICTX_OPENDIR_DONE_MASK: + afr_inode_ctx_set_opendir_done (ctx); + break; + case AFR_ICTX_SPLIT_BRAIN_MASK: + set = params->u.value; + afr_inode_ctx_set_splitbrain (ctx, set); + break; + } + ret = __inode_ctx_put (inode, this, (uint64_t)ctx); + if (ret) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " + "set the inode ctx (%s)", + uuid_utoa (inode->gfid)); + } } unlock: UNLOCK (&inode->lock); - -out: - return opendir_done; } +void +afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +{ + afr_inode_params_t params = {0}; + + params.mask_type = AFR_ICTX_SPLIT_BRAIN_MASK; + params.u.value = set; + afr_inode_set_ctx (this, inode, &params); +} void afr_set_opendir_done (xlator_t *this, 
inode_t *inode) { - uint64_t ctx = 0; - int ret = 0; + afr_inode_params_t params = {0}; - VALIDATE_OR_GOTO (inode, out); + params.mask_type = AFR_ICTX_OPENDIR_DONE_MASK; + afr_inode_set_ctx (this, inode, &params); +} - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); +void +afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, + int32_t *fresh_children) +{ + afr_inode_params_t params = {0}; - if (ret < 0) { - ctx = 0; - } + GF_ASSERT (read_child >= 0); + GF_ASSERT (fresh_children); - ctx = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx) - | (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK); + params.mask_type = AFR_ICTX_READ_CHILD_MASK; + params.u.read_ctx.read_child = read_child; + params.u.read_ctx.fresh_children = fresh_children; + afr_inode_set_ctx (this, inode, &params); +} - ret = __inode_ctx_put (inode, this, ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_INFO, - "failed to set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } +gf_boolean_t +afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child) +{ + gf_boolean_t source_xattrs = _gf_false; + + GF_ASSERT (child < child_count); + + if ((child >= 0) && (child < child_count) && + sources[child]) { + source_xattrs = _gf_true; } - UNLOCK (&inode->lock); -out: - return; + return source_xattrs; } +gf_boolean_t +afr_is_success_child (int32_t *success_children, int32_t child_count, + int32_t child) +{ + gf_boolean_t success_child = _gf_false; + int i = 0; -uint64_t -afr_read_child (xlator_t *this, inode_t *inode) + GF_ASSERT (child < child_count); + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + if (child == success_children[i]) { + success_child = _gf_true; + break; + } + } + return success_child; +} + +gf_boolean_t +afr_is_read_child (int32_t *success_children, int32_t *sources, + int32_t child_count, int32_t child) { - int ret = 0; + gf_boolean_t success_child = _gf_false; + gf_boolean_t source = _gf_false; - uint64_t ctx 
= 0; - uint64_t read_child = 0; + GF_ASSERT (success_children); + GF_ASSERT (child_count > 0); - VALIDATE_OR_GOTO (inode, out); + success_child = afr_is_success_child (success_children, child_count, + child); + if (!success_child) + goto out; + if (NULL == sources) { + source = _gf_true; + goto out; + } + source = afr_is_source_child (sources, child_count, child); +out: + return (success_child && source); +} - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); +/* If sources is NULL the xattrs are assumed to be of source for all + * success_children. + */ +int +afr_select_read_child_from_policy (int32_t *success_children, int32_t child_count, + int32_t prev_read_child, + int32_t config_read_child, int32_t *sources) +{ + int32_t read_child = -1; + int i = 0; - if (ret < 0) - goto unlock; + GF_ASSERT (success_children); + + read_child = prev_read_child; + if (afr_is_read_child (success_children, sources, child_count, + read_child)) + goto out; + + read_child = config_read_child; + if (afr_is_read_child (success_children, sources, child_count, + read_child)) + goto out; - read_child = ctx & AFR_ICTX_READ_CHILD_MASK; + for (i = 0; i < child_count; i++) { + read_child = success_children[i]; + if (read_child < 0) + break; + if (afr_is_read_child (success_children, sources, child_count, + read_child)) + goto out; } -unlock: - UNLOCK (&inode->lock); + read_child = -1; out: return read_child; } - +/* This function should be used when all the success_children are sources + */ void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child) +afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, + int32_t *fresh_children, int32_t prev_read_child, + int32_t config_read_child) { - uint64_t ctx = 0; - int ret = 0; + int read_child = -1; + afr_private_t *priv = NULL; - VALIDATE_OR_GOTO (inode, out); + priv = this->private; + read_child = afr_select_read_child_from_policy (fresh_children, + priv->child_count, + prev_read_child, + 
config_read_child, + NULL); + afr_inode_set_read_ctx (this, inode, read_child, fresh_children); +} + +/* afr_next_call_child () + * This is a common function used by all the read-type fops + * This function should not be called with the inode's read_children array. + * The fop's handler should make a copy of the inode's read_children, + * preferred read_child into the local vars, because while this function is + * in execution there is a chance for inode's read_ctx to change. + */ +int32_t +afr_next_call_child (int32_t *fresh_children, size_t child_count, + int32_t *last_index, int32_t read_child) +{ + int next_index = 0; + int32_t next_call_child = -1; - LOCK (&inode->lock); - { - ret = __inode_ctx_get (inode, this, &ctx); + GF_ASSERT (last_index); - if (ret < 0) { - ctx = 0; - } + next_index = *last_index; +retry: + next_index++; + if (next_index >= child_count) + goto out; + if (fresh_children[next_index] == read_child) + goto retry; + if (fresh_children[next_index] == -1) + goto out; + *last_index = next_index; + next_call_child = fresh_children[next_index]; +out: + return next_call_child; +} - ctx = (~AFR_ICTX_READ_CHILD_MASK & ctx) - | (AFR_ICTX_READ_CHILD_MASK & read_child); + /* This function should not be called with the inode's read_children array. + * The fop's handler should make a copy of the inode's read_children, + * preferred read_child into the local vars, because while this function is + * in execution there is a chance for inode's read_ctx to change. 
+ */ +int32_t +afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child, + int32_t *fresh_children, + int32_t *call_child, int32_t *last_index) +{ + int ret = 0; + afr_private_t *priv = NULL; + int i = 0; - ret = __inode_ctx_put (inode, this, ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_INFO, - "failed to set the inode ctx (%s)", - uuid_utoa (inode->gfid)); + GF_ASSERT (child_up); + GF_ASSERT (call_child); + GF_ASSERT (last_index); + GF_ASSERT (fresh_children); + GF_ASSERT (read_child >= 0); + + priv = this->private; + *call_child = -1; + *last_index = -1; + + if (child_up[read_child]) { + *call_child = read_child; + } else { + for (i = 0; i < priv->child_count; i++) { + if (fresh_children[i] == -1) + break; + if (child_up[fresh_children[i]]) { + *call_child = fresh_children[i]; + ret = 0; + break; + } } - } - UNLOCK (&inode->lock); + if (*call_child == -1) { + ret = -ENOTCONN; + goto out; + } + + *last_index = i; + } out: - return; + gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, " + "last_index: %d", ret, *call_child, *last_index); + return ret; } - void afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) { @@ -325,8 +582,12 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) if (sh->linkname) GF_FREE ((char *)sh->linkname); - if (sh->child_success) - GF_FREE (sh->child_success); + + if (sh->success_children) + GF_FREE (sh->success_children); + + if (sh->fresh_children) + GF_FREE (sh->fresh_children); loc_wipe (&sh->parent_loc); } @@ -398,6 +659,9 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->child_up) GF_FREE (local->child_up); + if (local->fresh_children) + GF_FREE (local->fresh_children); + { /* lookup */ if (local->cont.lookup.xattrs) { for (i = 0; i < priv->child_count; i++) { @@ -424,8 +688,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) if (local->cont.lookup.bufs) GF_FREE (local->cont.lookup.bufs); - if (local->cont.lookup.child_success) - GF_FREE 
(local->cont.lookup.child_success); + if (local->cont.lookup.success_children) + GF_FREE (local->cont.lookup.success_children); if (local->cont.lookup.sources) GF_FREE (local->cont.lookup.sources); @@ -734,20 +998,21 @@ int afr_lookup_select_read_child (afr_local_t *local, xlator_t *this, int32_t *read_child) { - int32_t source = -1; - ia_type_t ia_type = 0; - int ret = -1; - afr_transaction_type type = AFR_METADATA_TRANSACTION; - dict_t **xattrs = NULL; - int32_t *child_success = NULL; - struct iatt *bufs = NULL; + int32_t source = -1; + ia_type_t ia_type = 0; + int ret = -1; + afr_transaction_type type = AFR_METADATA_TRANSACTION; + dict_t **xattrs = NULL; + int32_t *success_children = NULL; + struct iatt *bufs = NULL; GF_ASSERT (local); GF_ASSERT (this); + GF_ASSERT (local->success_count > 0); bufs = local->cont.lookup.bufs; - child_success = local->cont.lookup.child_success; - ia_type = local->cont.lookup.bufs[child_success[0]].ia_type; + success_children = local->cont.lookup.success_children; + ia_type = local->cont.lookup.bufs[success_children[0]].ia_type; if (IA_ISDIR (ia_type)) { type = AFR_ENTRY_TRANSACTION; } else if (IA_ISREG (ia_type)) { @@ -773,7 +1038,7 @@ afr_is_self_heal_running (afr_local_t *local) } static void -afr_launch_self_heal (call_frame_t *frame, xlator_t *this, +afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, gf_boolean_t is_background, ia_type_t ia_type, int (*unwind) (call_frame_t *frame, xlator_t *this)) { @@ -782,6 +1047,7 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, GF_ASSERT (frame); GF_ASSERT (this); + GF_ASSERT (inode); local = frame->local; local->self_heal.background = is_background; @@ -796,7 +1062,7 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, "background %s self-heal triggered. 
path: %s", sh_type_str, local->loc.path); - afr_self_heal (frame, this); + afr_self_heal (frame, this, inode); } static void @@ -813,8 +1079,8 @@ afr_lookup_detect_self_heal (afr_local_t *local, xlator_t *this) bufs = local->cont.lookup.bufs; for (i = 1; i < local->success_count; i++) { - child1 = local->cont.lookup.child_success[i-1]; - child2 = local->cont.lookup.child_success[i];; + child1 = local->cont.lookup.success_children[i-1]; + child2 = local->cont.lookup.success_children[i]; afr_detect_self_heal_by_iatt (local, this, &bufs[child1], &bufs[child2]); } @@ -822,7 +1088,7 @@ afr_lookup_detect_self_heal (afr_local_t *local, xlator_t *this) xattr = local->cont.lookup.xattrs; priv = this->private; for (i = 0; i < local->success_count; i++) { - child1 = local->cont.lookup.child_success[i];; + child1 = local->cont.lookup.success_children[i]; afr_lookup_detect_self_heal_by_xattr (local, this, xattr[child1]); } @@ -860,8 +1126,8 @@ afr_lookup_perform_self_heal_if_needed (call_frame_t *frame, xlator_t *this, goto out; } - afr_launch_self_heal (frame, this, _gf_true, - local->cont.lookup.buf.ia_type, + afr_launch_self_heal (frame, this, local->cont.lookup.inode, + _gf_true, local->cont.lookup.buf.ia_type, afr_self_heal_lookup_unwind); *sh_launched = _gf_true; } @@ -875,22 +1141,22 @@ afr_lookup_split_brain (afr_local_t *local, xlator_t *this) int i = 0; gf_boolean_t symptom = _gf_false; struct iatt *bufs = NULL; - int32_t *child_success = NULL; + int32_t *success_children = NULL; struct iatt *child1 = NULL; struct iatt *child2 = NULL; const char *path = NULL; bufs = local->cont.lookup.bufs; - child_success = local->cont.lookup.child_success; + success_children = local->cont.lookup.success_children; for (i = 1; i < local->success_count; i++) { - child1 = &bufs[child_success[i-1]]; - child2 = &bufs[child_success[i]]; + child1 = &bufs[success_children[i-1]]; + child2 = &bufs[success_children[i]]; /* * TODO: gfid self-heal * if (uuid_compare (child1->ia_gfid, 
child2->ia_gfid)) { * gf_log (this->name, GF_LOG_WARNING, "%s: gfid differs" * " on subvolumes (%d, %d)", local->loc.path, - * child_success[i-1], child_success[i]); + * success_children[i-1], success_children[i]); * symptom = _gf_true; * } */ @@ -899,7 +1165,7 @@ afr_lookup_split_brain (afr_local_t *local, xlator_t *this) path = local->loc.path; gf_log (this->name, GF_LOG_WARNING, "%s: filetype " "differs on subvolumes (%d, %d)", path, - child_success[i-1], child_success[i]); + success_children[i-1], success_children[i]); symptom = _gf_true; local->govinda_gOvinda = 1; } @@ -909,13 +1175,42 @@ afr_lookup_split_brain (afr_local_t *local, xlator_t *this) return symptom; } +void +afr_get_fresh_children (int32_t *success_children, int32_t *sources, + int32_t *fresh_children, unsigned int child_count) +{ + unsigned int i = 0; + unsigned int j = 0; + + GF_ASSERT (success_children); + GF_ASSERT (sources); + GF_ASSERT (fresh_children); + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + if (afr_is_read_child (success_children, sources, child_count, + success_children[i])) { + fresh_children[j] = success_children[i]; + j++; + } + } +} + static int -afr_lookup_set_read_child (afr_local_t *local, xlator_t *this, int32_t read_child) +afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child) { + afr_private_t *priv = NULL; + GF_ASSERT (read_child >= 0); - afr_set_read_child (this, local->cont.lookup.inode, read_child); + priv = this->private; local->cont.lookup.read_child = read_child; + afr_get_fresh_children (local->cont.lookup.success_children, + local->cont.lookup.sources, + local->fresh_children, priv->child_count); + afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child, + local->fresh_children); return 0; } @@ -949,7 +1244,7 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) goto unwind; } - ret = afr_lookup_set_read_child (local, this, read_child); + ret = afr_lookup_set_read_ctx (local, this, 
read_child); if (ret) goto unwind; @@ -1070,7 +1365,7 @@ afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_ind afr_lookup_cache_args (local, child_index, xattr, buf, postparent); - local->cont.lookup.child_success[local->success_count] = child_index; + local->cont.lookup.success_children[local->success_count] = child_index; local->success_count++; } @@ -1114,9 +1409,8 @@ int afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) { int ret = -ENOMEM; - int32_t *child_success = NULL; struct iatt *iatts = NULL; - int i = 0; + int32_t *success_children = NULL; GF_ASSERT (local); local->cont.lookup.xattrs = GF_CALLOC (child_count, @@ -1135,14 +1429,14 @@ afr_lookup_cont_init (afr_local_t *local, unsigned int child_count) goto out; local->cont.lookup.bufs = iatts; - child_success = GF_CALLOC (child_count, sizeof (*child_success), - gf_afr_mt_char); - if (NULL == child_success) + success_children = afr_fresh_children_create (child_count); + if (NULL == success_children) goto out; - for (i = 0; i < child_count; i++) - child_success[i] = -1; + local->cont.lookup.success_children = success_children; - local->cont.lookup.child_success = child_success; + local->fresh_children = afr_fresh_children_create (child_count); + if (NULL == local->fresh_children) + goto out; local->cont.lookup.read_child = -1; ret = 0; @@ -1181,8 +1475,8 @@ afr_lookup (call_frame_t *frame, xlator_t *this, if (ret == 0) { /* lookup is a revalidate */ - local->read_child_index = afr_read_child (this, - loc->inode); + local->read_child_index = afr_inode_get_read_ctx (this, loc->inode, + NULL); } else { LOCK (&priv->read_child_lock); { @@ -1611,7 +1905,7 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; - read_child = afr_read_child (this, local->fd->inode); + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { @@ -2532,6 +2826,25 @@ out: return 0; } +int +afr_forget (xlator_t *this, 
inode_t *inode) +{ + uint64_t ctx_addr = 0; + afr_inode_ctx_t *ctx = NULL; + + inode_ctx_get (inode, this, &ctx_addr); + + if (!ctx_addr) + goto out; + + ctx = (afr_inode_ctx_t *)(long)ctx_addr; + if (ctx->fresh_children) + GF_FREE (ctx->fresh_children); + GF_FREE (ctx); +out: + return 0; +} + int afr_priv_dump (xlator_t *this) { @@ -2759,6 +3072,16 @@ out: int AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) { + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->call_count = afr_up_children_count (priv->child_count, + priv->child_up); + if (local->call_count == 0) { + gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); + return -ENOTCONN; + } + + local->child_up = GF_CALLOC (sizeof (*local->child_up), priv->child_count, gf_afr_mt_char); @@ -2769,16 +3092,6 @@ AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) memcpy (local->child_up, priv->child_up, sizeof (*local->child_up) * priv->child_count); - local->call_count = afr_up_children_count (priv->child_count, - local->child_up); - local->op_ret = -1; - local->op_errno = EUCLEAN; - - if (local->call_count == 0) { - gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up"); - return -ENOTCONN; - } - return 0; } @@ -2849,6 +3162,10 @@ afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) if (!local->pending) goto out; + local->fresh_children = afr_fresh_children_create (priv->child_count); + if (!local->fresh_children) + goto out; + for (i = 0; i < priv->child_count; i++) { local->pending[i] = GF_CALLOC (sizeof (*local->pending[i]), 3, /* data + metadata + entry */ @@ -2867,3 +3184,50 @@ afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) out: return ret; } + +void +afr_reset_children (int32_t *fresh_children, int32_t child_count) +{ + unsigned int i = 0; + for (i = 0; i < child_count; i++) + fresh_children[i] = -1; +} + +int32_t* +afr_fresh_children_create (int32_t child_count) +{ + int32_t *fresh_children = NULL; + int i = 0; + + GF_ASSERT (child_count > 0); + + 
fresh_children = GF_CALLOC (child_count, sizeof (*fresh_children), + gf_afr_mt_int32_t); + if (NULL == fresh_children) + goto out; + for (i = 0; i < child_count; i++) + fresh_children[i] = -1; +out: + return fresh_children; +} + +void +afr_fresh_children_add_child (int32_t *fresh_children, int32_t child, + int32_t child_count) +{ + gf_boolean_t child_found = _gf_false; + int i = 0; + + for (i = 0; i < child_count; i++) { + if (fresh_children[i] == -1) + break; + if (fresh_children[i] == child) { + child_found = _gf_true; + break; + } + } + if (!child_found) { + GF_ASSERT (i < child_count); + fresh_children[i] = child; + } +} diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 1bd2cc963..8593d0c14 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -177,7 +177,7 @@ out: " forced merge option set", sh_type_str, local->loc.path); - afr_self_heal (frame, this); + afr_self_heal (frame, this, local->fd->inode); } else { afr_set_opendir_done (this, local->fd->inode); diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 88c3f728f..6da666804 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -119,13 +119,14 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, fd_t *fd, inode_t *inode, struct iatt *buf, struct iatt *preparent, struct iatt *postparent) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; - int call_count = -1; - int child_index = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = 0; + int call_count = -1; + int child_index = -1; + int32_t *fresh_children = NULL; local = frame->local; priv = this->private; @@ -166,18 +167,9 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
fd_ctx->opened_on[child_index] = 1; fd_ctx->flags = local->cont.create.flags; - if (local->success_count == 0) { + if (local->success_count == 0) local->cont.create.buf = *buf; - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - if (child_index == local->read_child_index) { local->cont.create.read_child_buf = *buf; local->cont.create.preparent = *preparent; @@ -186,6 +178,8 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->cont.create.inode = inode; + fresh_children = local->fresh_children; + fresh_children[local->success_count] = child_index; local->success_count++; } @@ -198,6 +192,10 @@ unlock: call_count = afr_frame_return (frame); if (call_count == 0) { + afr_set_read_ctx_from_policy (this, inode, + local->fresh_children, + local->read_child_index, + priv->read_child); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); @@ -382,10 +380,11 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int child_index = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int child_index = -1; + int32_t *fresh_children = NULL; local = frame->local; priv = this->private; @@ -400,18 +399,9 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != -1) { local->op_ret = op_ret; - if (local->success_count == 0){ + if (local->success_count == 0) local->cont.mknod.buf = *buf; - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - if (child_index == local->read_child_index) { local->cont.mknod.read_child_buf = *buf; local->cont.mknod.preparent = 
*preparent; @@ -420,6 +410,8 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->cont.mknod.inode = inode; + fresh_children = local->fresh_children; + fresh_children[local->success_count] = child_index; local->success_count++; } @@ -430,6 +422,10 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { + afr_set_read_ctx_from_policy (this, inode, + local->fresh_children, + local->read_child_index, + priv->read_child); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); @@ -609,10 +605,11 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int child_index = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int child_index = -1; + int32_t *fresh_children = NULL; local = frame->local; priv = this->private; @@ -627,18 +624,9 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != -1) { local->op_ret = op_ret; - if (local->success_count == 0) { + if (local->success_count == 0) local->cont.mkdir.buf = *buf; - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } - if (child_index == local->read_child_index) { local->cont.mkdir.read_child_buf = *buf; local->cont.mkdir.preparent = *preparent; @@ -647,6 +635,8 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->cont.mkdir.inode = inode; + fresh_children = local->fresh_children; + fresh_children[local->success_count] = child_index; local->success_count++; } @@ -657,6 +647,10 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { 
+ afr_set_read_ctx_from_policy (this, inode, + local->fresh_children, + local->read_child_index, + priv->read_child); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); @@ -837,10 +831,11 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int child_index = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int child_index = -1; + int32_t *fresh_children = NULL; local = frame->local; priv = this->private; @@ -857,14 +852,6 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->success_count == 0) { local->cont.link.buf = *buf; - - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } } if (child_index == local->read_child_index) { @@ -875,6 +862,8 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->cont.link.inode = inode; + fresh_children = local->fresh_children; + fresh_children[local->success_count] = child_index; local->success_count++; } @@ -885,6 +874,10 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { + afr_set_read_ctx_from_policy (this, inode, + local->fresh_children, + local->read_child_index, + priv->read_child); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); @@ -1062,10 +1055,11 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *buf, struct iatt *preparent, struct iatt *postparent) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int child_index = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int child_index = -1; + 
int32_t *fresh_children = NULL; local = frame->local; priv = this->private; @@ -1080,16 +1074,8 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != -1) { local->op_ret = op_ret; - if (local->success_count == 0) { + if (local->success_count == 0) local->cont.symlink.buf = *buf; - if (priv->read_child >= 0) { - afr_set_read_child (this, inode, - priv->read_child); - } else { - afr_set_read_child (this, inode, - local->read_child_index); - } - } if (child_index == local->read_child_index) { local->cont.symlink.read_child_buf = *buf; @@ -1099,6 +1085,8 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->cont.symlink.inode = inode; + fresh_children = local->fresh_children; + fresh_children[local->success_count] = child_index; local->success_count++; } @@ -1109,6 +1097,10 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { + afr_set_read_ctx_from_policy (this, inode, + local->fresh_children, + local->read_child_index, + priv->read_child); local->transaction.unwind (frame, this); local->transaction.resume (frame, this); @@ -1424,7 +1416,7 @@ afr_rename (call_frame_t *frame, xlator_t *this, loc_copy (&local->loc, oldloc); loc_copy (&local->newloc, newloc); - local->read_child_index = afr_read_child (this, oldloc->inode); + local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); local->cont.rename.ino = oldloc->inode->ino; diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 742d3687c..f2507f07e 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -134,7 +134,15 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask) ALLOC_OR_GOTO (local, afr_local_t, out); - read_child = afr_read_child (this, loc->inode); + local->fresh_children = GF_CALLOC (priv->child_count, + sizeof 
(*local->fresh_children), + gf_afr_mt_int32_t); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } + + read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); if ((read_child >= 0) && (priv->child_up[read_child])) { call_child = read_child; @@ -252,7 +260,14 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc) frame->local = local; - read_child = afr_read_child (this, loc->inode); + local->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*local->fresh_children), + gf_afr_mt_int32_t); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } + read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); if ((read_child >= 0) && (priv->child_up[read_child])) { call_child = read_child; @@ -375,7 +390,14 @@ afr_fstat (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd->inode, out); - read_child = afr_read_child (this, fd->inode); + local->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*local->fresh_children), + gf_afr_mt_int32_t); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } + read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); if ((read_child >= 0) && (priv->child_up[read_child])) { call_child = read_child; @@ -494,7 +516,14 @@ afr_readlink (call_frame_t *frame, xlator_t *this, frame->local = local; - read_child = afr_read_child (this, loc->inode); + local->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*local->fresh_children), + gf_afr_mt_int32_t); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } + read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); if ((read_child >= 0) && (priv->child_up[read_child])) { call_child = read_child; @@ -879,7 +908,14 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, } } - read_child = afr_read_child (this, loc->inode); + local->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*local->fresh_children), + gf_afr_mt_int32_t); 
+ if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } + read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); if ((read_child >= 0) && (priv->child_up[read_child])) { call_child = read_child; @@ -1020,7 +1056,14 @@ afr_readv (call_frame_t *frame, xlator_t *this, frame->local = local; - read_child = afr_read_child (this, fd->inode); + local->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*local->fresh_children), + gf_afr_mt_int32_t); + if (!local->fresh_children) { + op_errno = ENOMEM; + goto out; + } + read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children); if ((read_child >= 0) && (priv->child_up[read_child])) { call_child = read_child; diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 8b82add62..564bb953a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -87,7 +87,7 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; - read_child = afr_read_child (this, local->fd->inode); + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { @@ -343,7 +343,7 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; priv = this->private; - read_child = afr_read_child (this, local->loc.inode); + read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); LOCK (&frame->lock); { @@ -550,7 +550,7 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; priv = this->private; - read_child = afr_read_child (this, local->fd->inode); + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { @@ -712,8 +712,8 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, priv = this->private; ALLOC_OR_GOTO (local, afr_local_t, out); - ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { op_errno = -ret; goto out; 
@@ -797,7 +797,7 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; priv = this->private; - read_child = afr_read_child (this, local->loc.inode); + read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); LOCK (&frame->lock); { @@ -1004,7 +1004,7 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; priv = this->private; - read_child = afr_read_child (this, local->fd->inode); + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); LOCK (&frame->lock); { diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index 14064ebcd..de2049589 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -42,6 +42,7 @@ enum gf_afr_mem_types_ { gf_afr_mt_entry_name, gf_afr_mt_pump_priv, gf_afr_mt_locked_fd, + gf_afr_mt_inode_ctx_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index e6304a5ea..4aa587399 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -434,7 +434,7 @@ afr_openfd_sh (call_frame_t *frame, xlator_t *this) "path: %s, reason: Replicate up down flush, data lock is held", sh_type_str, local->loc.path); - afr_self_heal (frame, this); + afr_self_heal (frame, this, local->fd->inode); return 0; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index abc9ccb0f..16345bee7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -1640,18 +1640,15 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) afr_local_t * local = NULL; afr_self_heal_t * sh = NULL; char sh_type_str[256] = {0,}; + gf_boolean_t split_brain = _gf_false; priv = this->private; local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda) { - 
afr_set_split_brain (this, local->cont.lookup.inode, - _gf_true); - } else { - afr_set_split_brain (this, local->cont.lookup.inode, - _gf_false); - } + if (local->govinda_gOvinda) + split_brain = _gf_true; + afr_set_split_brain (this, sh->inode, split_brain); afr_self_heal_type_str_get (sh, sh_type_str, sizeof(sh_type_str)); @@ -1683,7 +1680,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) } int -afr_self_heal (call_frame_t *frame, xlator_t *this) +afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) { afr_local_t *local = NULL; afr_self_heal_t *sh = NULL; @@ -1726,6 +1723,7 @@ afr_self_heal (call_frame_t *frame, xlator_t *this) sh_local = afr_local_copy (local, this); sh_frame->local = sh_local; sh = &sh_local->self_heal; + sh->inode = inode; sh->orig_frame = frame; @@ -1761,8 +1759,8 @@ afr_self_heal (call_frame_t *frame, xlator_t *this) priv->child_count, gf_afr_mt_int32_t); } - sh->child_success = GF_CALLOC (sizeof (*sh->child_success), - priv->child_count, gf_afr_mt_int32_t); + sh->success_children = afr_fresh_children_create (priv->child_count); + sh->fresh_children = afr_fresh_children_create (priv->child_count); FRAME_SU_DO (sh_frame, afr_local_t); diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 3ee1db0e7..f9a257972 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -299,12 +299,25 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xattr) { + afr_local_t *local = NULL; int call_count = 0; + long i = 0; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + local = frame->local; + priv = this->private; + sh = &local->self_heal; + i = (long)cookie; + + afr_fresh_children_add_child (sh->fresh_children, i, priv->child_count); call_count = afr_frame_return (frame); - if (call_count == 0) + if (call_count == 0) { + 
afr_inode_set_read_ctx (this, sh->inode, sh->source, + sh->fresh_children); afr_sh_data_finish (frame, this); + } return 0; } @@ -602,7 +615,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf, priv->child_count, AFR_SELF_HEAL_DATA, - sh->child_success, this->name); + sh->success_children, this->name); if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, @@ -670,7 +683,11 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) sh->sources[i] = 0; } - afr_set_read_child (this, local->loc.inode, sh->source); + afr_reset_children (sh->fresh_children, priv->child_count); + afr_get_fresh_children (sh->success_children, sh->sources, + sh->fresh_children, priv->child_count); + afr_inode_set_read_ctx (this, sh->inode, sh->source, + sh->fresh_children); /* quick-read might have read the file, so send xattr from @@ -691,56 +708,6 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) return 0; } -gf_boolean_t -afr_is_fresh_read_child (int32_t *sources, int32_t child_count, - int32_t read_child) -{ - gf_boolean_t is_fresh_child = _gf_false; - - GF_ASSERT (read_child < child_count); - - if ((read_child >= 0) && (read_child < child_count) && - sources[read_child]) { - is_fresh_child = _gf_true; - } - return is_fresh_child; -} - -static int -afr_select_read_child_from_policy (int32_t *sources, int32_t child_count, - int32_t prev_read_child, - int32_t config_read_child, - int32_t *valid_children) -{ - int32_t read_child = -1; - int i = 0; - - GF_ASSERT (sources); - - read_child = prev_read_child; - if (_gf_true == afr_is_fresh_read_child (sources, child_count, - read_child)) - goto out; - - read_child = config_read_child; - if (_gf_true == afr_is_fresh_read_child (sources, child_count, - read_child)) - goto out; - - for (i = 0; i < child_count; i++) { - read_child = valid_children[i]; - if (read_child < 0) - break; - if (_gf_true == afr_is_fresh_read_child (sources, child_count, - read_child)) - 
goto out; - } - read_child = -1; - -out: - return read_child; -} - static void afr_destroy_pending_matrix (int32_t **pending_matrix, int32_t child_count) { @@ -796,7 +763,7 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID; int32_t **pending_matrix = NULL; int32_t *sources = NULL; - int32_t *valid_children = NULL; + int32_t *success_children = NULL; struct iatt *bufs = NULL; int32_t nsources = 0; int32_t prev_read_child = -1; @@ -805,7 +772,7 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, priv = this->private; bufs = local->cont.lookup.bufs; - valid_children = local->cont.lookup.child_success; + success_children = local->cont.lookup.success_children; sh = &local->self_heal; pending_matrix = afr_create_pending_matrix (priv->child_count); @@ -826,7 +793,7 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, nsources = afr_mark_sources (sources, pending_matrix, bufs, priv->child_count, sh_type, - valid_children, this->name); + success_children, this->name); if (nsources < 0) { ret = -1; goto out; @@ -834,11 +801,11 @@ afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, prev_read_child = local->read_child_index; config_read_child = priv->read_child; - read_child = afr_select_read_child_from_policy (sources, + read_child = afr_select_read_child_from_policy (success_children, priv->child_count, prev_read_child, config_read_child, - valid_children); + sources); ret = 0; local->cont.lookup.sources = sources; out: @@ -875,7 +842,7 @@ afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie, priv->children[child_index]->name); sh->buf[child_index] = *buf; - sh->child_success[sh->success_count] = child_index; + sh->success_children[sh->success_count] = child_index; sh->success_count++; } } @@ -909,8 +876,7 @@ afr_sh_data_fstat (call_frame_t *frame, xlator_t *this) local->call_count = call_count; - for (i = 0; 
i < priv->child_count; i++) - sh->child_success[i] = -1; + afr_reset_children (sh->success_children, priv->child_count); sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index 0425644b3..50870afb2 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -125,12 +125,16 @@ afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, afr_local_t *orig_local = NULL; call_frame_t *orig_frame = NULL; afr_private_t *priv = NULL; + int32_t read_child = -1; local = frame->local; priv = this->private; + sh = &local->self_heal; + i = (long)cookie; + + afr_fresh_children_add_child (sh->fresh_children, i, priv->child_count); if (op_ret == -1) { - i = (long)cookie; gf_log (this->name, GF_LOG_INFO, "%s: failed to erase pending xattrs on %s (%s)", local->loc.path, priv->children[i]->name, @@ -140,8 +144,14 @@ afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, call_count = afr_frame_return (frame); if (call_count == 0) { - sh = &local->self_heal; - + if (sh->source == -1) { + //this happens if the forced merge option is set + read_child = sh->fresh_children[0]; + } else { + read_child = sh->source; + } + afr_inode_set_read_ctx (this, sh->inode, read_child, + sh->fresh_children); orig_frame = sh->orig_frame; orig_local = orig_frame->local; @@ -2165,7 +2175,7 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf, priv->child_count, AFR_SELF_HEAL_ENTRY, - sh->child_success, this->name); + sh->success_children, this->name); if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, @@ -2180,6 +2190,13 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) sh->source = source; + afr_reset_children (sh->fresh_children, priv->child_count); + afr_get_fresh_children (sh->success_children, 
sh->sources, + sh->fresh_children, priv->child_count); + afr_inode_set_read_ctx (this, sh->inode, sh->source, + sh->fresh_children); + + heal: afr_sh_entry_sync_prepare (frame, this); @@ -2208,7 +2225,7 @@ afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, if (op_ret != -1) { sh->xattr[child_index] = dict_ref (xattr); sh->buf[child_index] = *buf; - sh->child_success[sh->success_count] = child_index; + sh->success_children[sh->success_count] = child_index; sh->success_count++; } } @@ -2258,8 +2275,7 @@ afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) } } - for (i = 0; i < priv->child_count; i++) - sh->child_success[i] = -1; + afr_reset_children (sh->success_children, priv->child_count); sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index fe1db60e2..5993e9596 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -147,15 +147,32 @@ afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xattr) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; int call_count = 0; + long i = 0; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; local = frame->local; + priv = this->private; + sh = &local->self_heal; + i = (long)cookie; + if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && + (!IA_ISDIR (sh->buf[sh->source].ia_type))) { + afr_fresh_children_add_child (sh->fresh_children, i, + priv->child_count); + } call_count = afr_frame_return (frame); - if (call_count == 0) + if (call_count == 0) { + if ((!IA_ISREG (sh->buf[sh->source].ia_type)) && + (!IA_ISDIR (sh->buf[sh->source].ia_type))) { + afr_inode_set_read_ctx (this, sh->inode, sh->source, + sh->fresh_children); + } afr_sh_metadata_finish (frame, this); + } return 0; } @@ -483,7 +500,7 @@ 
afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) nsources = afr_mark_sources (sh->sources, sh->pending_matrix, sh->buf, priv->child_count, AFR_SELF_HEAL_METADATA, - sh->child_success, this->name); + sh->success_children, this->name); if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, @@ -545,6 +562,16 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) sh->sources[i] = 0; } + if ((!IA_ISREG (sh->buf[source].ia_type)) && + (!IA_ISDIR (sh->buf[source].ia_type))) { + afr_reset_children (sh->fresh_children, + priv->child_count); + afr_get_fresh_children (sh->success_children, sh->sources, + sh->fresh_children, priv->child_count); + afr_inode_set_read_ctx (this, sh->inode, sh->source, + sh->fresh_children); + } + afr_sh_metadata_sync_prepare (frame, this); return 0; @@ -582,7 +609,7 @@ afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, sh->buf[child_index] = *buf; if (xattr) sh->xattr[child_index] = dict_ref (xattr); - sh->child_success[sh->success_count] = child_index; + sh->success_children[sh->success_count] = child_index; sh->success_count++; } else { gf_log (this->name, GF_LOG_INFO, @@ -637,8 +664,7 @@ afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) } } - for (i = 0; i < priv->child_count; i++) - sh->child_success[i] = -1; + afr_reset_children (sh->success_children, priv->child_count); sh->success_count = 0; for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h index 976dae475..1056a3662 100644 --- a/xlators/cluster/afr/src/afr-self-heal.h +++ b/xlators/cluster/afr/src/afr-self-heal.h @@ -49,11 +49,8 @@ int afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr); int -afr_self_heal (call_frame_t *frame, xlator_t *this); +afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode); -gf_boolean_t -afr_is_fresh_read_child (int32_t *sources, int32_t child_count, - 
int32_t read_child); int afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local, dict_t **xattr, diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 7652d3d1e..b8d2e27a4 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -407,24 +407,31 @@ void afr_update_read_child (call_frame_t *frame, xlator_t *this, inode_t *inode, afr_transaction_type type) { - int curr_read_child = -1; - int new_read_child = -1; + int curr_read_child = -1; + int new_read_child = -1; afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int **pending = NULL; - int idx = 0; + afr_local_t *local = NULL; + int **pending = NULL; + int idx = 0; + int32_t *fresh_children = NULL; + size_t success_count = 0; idx = afr_index_for_transaction_type (type); priv = this->private; local = frame->local; - curr_read_child = afr_read_child (this, inode); + curr_read_child = afr_inode_get_read_ctx (this, inode, NULL); pending = local->pending; + GF_ASSERT (curr_read_child >= 0); + if (pending[curr_read_child][idx] != 0) - return; + goto out; - /* need to set new read_child */ + fresh_children = GF_CALLOC (priv->child_count, sizeof (*fresh_children), + gf_afr_mt_int32_t); + if (!fresh_children) + goto out; for (new_read_child = 0; new_read_child < priv->child_count; new_read_child++) { @@ -435,15 +442,16 @@ afr_update_read_child (call_frame_t *frame, xlator_t *this, inode_t *inode, if (pending[new_read_child][idx] == 0) /* op just failed */ continue; - - break; + fresh_children[success_count] = new_read_child; + success_count++; } - if (new_read_child == priv->child_count) - /* all children uneligible. 
leave as-is */ - return; - - afr_set_read_child (this, inode, new_read_child); + afr_inode_set_read_ctx (this, inode, fresh_children[0], + fresh_children); +out: + if (fresh_children) + GF_FREE (fresh_children); + return; } diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index d8939ab4d..c6705fc68 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -814,6 +814,7 @@ struct xlator_dumpops dumpops = { struct xlator_cbks cbks = { .release = afr_release, .releasedir = afr_releasedir, + .forget = afr_forget, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index 55c485f14..0b8f96ec8 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -37,6 +37,22 @@ struct _pump_private; +typedef struct afr_inode_params_ { + uint64_t mask_type; + union { + gf_boolean_t value; + struct { + int32_t read_child; + int32_t *fresh_children; + } read_ctx; + } u; +} afr_inode_params_t; + +typedef struct afr_inode_ctx_ { + uint64_t masks; + int32_t *fresh_children;//increasing order of latency +} afr_inode_ctx_t; + typedef struct _afr_private { gf_lock_t lock; /* to guard access to child_count, etc */ unsigned int child_count; /* total number of children */ @@ -121,6 +137,8 @@ typedef struct { ia_type_t type; /* st_mode of the entry we're doing self-heal on */ + inode_t *inode; /* inode on which the self-heal is + performed on */ /* Function to call to unwind. If self-heal is being done in the background, this function will be called as soon as possible. 
*/ @@ -140,8 +158,10 @@ typedef struct { /* array containing if the lookups succeeded in the order of response */ - int32_t *child_success; + int32_t *success_children; int success_count; + /* array containing the fresh children found in the self-heal process */ + int32_t *fresh_children; /* array of errno's, one for each child */ int *child_errno; @@ -311,6 +331,7 @@ typedef struct _afr_local { glusterfs_fop_t fop; unsigned char *child_up; + int32_t *fresh_children; //in the order of response int32_t *child_errno; @@ -354,8 +375,8 @@ typedef struct _afr_local { struct iatt *postparents; struct iatt *bufs; int32_t read_child; - int32_t *child_success;//in the order of response int32_t *sources; + int32_t *success_children; } lookup; struct { @@ -732,11 +753,12 @@ int pump_start (call_frame_t *frame, xlator_t *this); int afr_fd_ctx_set (xlator_t *this, fd_t *fd); -uint64_t -afr_read_child (xlator_t *this, inode_t *inode); +int32_t +afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children); void -afr_set_read_child (xlator_t *this, inode_t *inode, int32_t read_child); +afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, + int32_t *fresh_children); void afr_build_parent_loc (loc_t *parent, loc_t *child); @@ -772,7 +794,7 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, void afr_set_opendir_done (xlator_t *this, inode_t *inode); -uint64_t +gf_boolean_t afr_is_opendir_done (xlator_t *this, inode_t *inode); void @@ -829,12 +851,24 @@ int32_t afr_marker_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv ); +int32_t * +afr_fresh_children_create (int32_t child_count); + int AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv); int afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, transaction_lk_type_t lk_type); +int +afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count, + int32_t 
prev_read_child, + int32_t config_read_child, int32_t *sources); + +void +afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode, + int32_t *fresh_children, int32_t prev_read_child, + int32_t config_read_child); /** * first_up_child - return the index of the first child that is up @@ -862,4 +896,15 @@ afr_first_up_child (afr_private_t *priv) return ret; } +int32_t +afr_next_call_child (int32_t *fresh_children, size_t child_count, + int32_t *last_index, int32_t read_child); +void +afr_get_fresh_children (int32_t *success_children, int32_t *sources, + int32_t *fresh_children, unsigned int child_count); +void +afr_fresh_children_add_child (int32_t *fresh_children, int32_t child, + int32_t child_count); +void +afr_reset_children (int32_t *fresh_children, int32_t child_count); #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index 48ce2c945..300b08504 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -1536,7 +1536,14 @@ pump_getxattr (call_frame_t *frame, xlator_t *this, return 0; } - read_child = afr_read_child (this, loc->inode); + local->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*local->fresh_children), + gf_afr_mt_int32_t); + if (local->fresh_children) { + op_errno = ENOMEM; + goto out; + } + read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children); if (read_child >= 0) { call_child = read_child; -- cgit