diff options
Diffstat (limited to 'xlators/cluster')
46 files changed, 8838 insertions, 3705 deletions
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am index ef6bf9786..35d18a6c0 100644 --- a/xlators/cluster/afr/src/Makefile.am +++ b/xlators/cluster/afr/src/Makefile.am @@ -7,11 +7,11 @@ afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \ afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \ $(top_builddir)/xlators/lib/src/libxlator.c -afr_la_LDFLAGS = -module -avoidversion +afr_la_LDFLAGS = -module -avoid-version afr_la_SOURCES = $(afr_common_source) afr.c afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -pump_la_LDFLAGS = -module -avoidversion +pump_la_LDFLAGS = -module -avoid-version pump_la_SOURCES = $(afr_common_source) pump.c pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index c38e52725..af01f2ef2 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -49,10 +49,9 @@ #include "afr-self-heald.h" #include "pump.h" -#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL -#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL +#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL #define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL - +#define AFR_STATISTICS_HISTORY_SIZE 50 int afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this, gf_boolean_t fail_conflict); @@ -203,59 +202,86 @@ out: return ret; } -afr_inode_ctx_t* -afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count) +void +afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) { - int ret = -1; - afr_inode_ctx_t *ctx = NULL; - size_t size = 0; + if (!ctx) + return; + GF_FREE (ctx->fresh_children); + GF_FREE (ctx); +} - GF_ASSERT (child_count > 0); +afr_inode_ctx_t* +__afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + int ret = 0; + uint64_t ctx_addr = 0; + afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; - if (!addr) { - ctx = GF_CALLOC (1, sizeof (*ctx), - gf_afr_mt_inode_ctx_t); - if (!ctx) - goto out; - size = sizeof (*ctx->fresh_children); - ctx->fresh_children = GF_CALLOC (child_count, size, - gf_afr_mt_int32_t); - if (!ctx->fresh_children) - goto out; - } else { - ctx = (afr_inode_ctx_t*) (long) addr; + priv = this->private; + ret = __inode_ctx_get (inode, this, &ctx_addr); + if (ret < 0) + ctx_addr = 0; + if (ctx_addr != 0) { + ctx = (afr_inode_ctx_t*) (long) ctx_addr; + goto out; } - ret = 0; + ctx = GF_CALLOC (1, sizeof (*ctx), + gf_afr_mt_inode_ctx_t); + if (!ctx) + goto fail; + ctx->fresh_children = GF_CALLOC (priv->child_count, + sizeof (*ctx->fresh_children), + gf_afr_mt_int32_t); + if (!ctx->fresh_children) + goto fail; + ret = __inode_ctx_put (inode, this, (uint64_t)ctx); + if (ret) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " + "set the inode ctx (%s)", + uuid_utoa (inode->gfid)); + goto fail; + } + out: - if (ret && ctx) { - GF_FREE (ctx->fresh_children); - GF_FREE (ctx); - ctx = NULL; + return ctx; + +fail: + afr_inode_ctx_destroy (ctx); + return NULL; +} + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ctx = NULL; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); } + UNLOCK (&inode->lock); return ctx; } void -afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_get_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t *ctx = NULL; afr_private_t *priv = NULL; int i = 0; - uint64_t ctx_addr = 0; int32_t read_child = -1; int32_t *fresh_children = NULL; priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - goto unlock; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -274,12 +300,6 @@ afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK) params->u.value = _gf_true; break; - case AFR_INODE_GET_SPLIT_BRAIN: - params->u.value = _gf_false; - if (ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK) - params->u.value = _gf_true; - ; - break; default: GF_ASSERT (0); break; @@ -292,11 +312,16 @@ unlock: gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; + gf_boolean_t spb = _gf_false; - params.op = AFR_INODE_GET_SPLIT_BRAIN; - afr_inode_get_ctx (this, inode, ¶ms); - return params.u.value; + ctx = afr_inode_ctx_get (inode, this); + if (!ctx) + goto out; + if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB)) + spb = _gf_true; +out: + return spb; } gf_boolean_t @@ -305,11 +330,10 @@ afr_is_opendir_done (xlator_t *this, inode_t *inode) afr_inode_params_t params = {0}; params.op = AFR_INODE_GET_OPENDIR_DONE; - afr_inode_get_ctx (this, inode, ¶ms); + afr_inode_get_ctx_params (this, inode, ¶ms); return params.u.value; } - int32_t afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) { @@ -317,7 +341,7 @@ afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children) params.op = AFR_INODE_GET_READ_CTX; params.u.read_ctx.children = fresh_children; - afr_inode_get_ctx (this, inode, ¶ms); + afr_inode_get_ctx_params (this, inode, ¶ms); return params.u.read_ctx.read_child; } @@ -379,31 +403,14 @@ afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx) } void -afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set) -{ - uint64_t remaining_mask = 0; - uint64_t mask = 0; - - if (set) { - remaining_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK); - ctx->masks = remaining_mask | mask; - } else { - ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks); - } -} - -void -afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) +afr_inode_set_ctx_params (xlator_t *this, inode_t *inode, + afr_inode_params_t *params) { GF_ASSERT (inode); GF_ASSERT (params); - int ret = 0; afr_inode_ctx_t *ctx = NULL; afr_private_t *priv = NULL; - uint64_t ctx_addr = 0; - gf_boolean_t set = _gf_false; int32_t read_child = -1; int32_t *fresh_children = NULL; int32_t *stale_children = NULL; @@ -411,10 +418,7 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) priv = this->private; LOCK (&inode->lock); { - ret = __inode_ctx_get (inode, this, &ctx_addr); - if (ret < 0) - ctx_addr = 0; - ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count); + ctx = __afr_inode_ctx_get (inode, this); if (!ctx) goto unlock; switch (params->op) { @@ -434,33 +438,26 @@ afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params) case AFR_INODE_SET_OPENDIR_DONE: afr_inode_ctx_set_opendir_done (ctx); break; - case AFR_INODE_SET_SPLIT_BRAIN: - set = params->u.value; - afr_inode_ctx_set_splitbrain (ctx, set); - break; default: GF_ASSERT (0); break; } - ret = __inode_ctx_put (inode, this, (uint64_t)ctx); - if (ret) { - gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to " - "set the inode ctx (%s)", - uuid_utoa (inode->gfid)); - } } unlock: UNLOCK (&inode->lock); } void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set) +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb) { - afr_inode_params_t params = {0}; + afr_inode_ctx_t *ctx = NULL; - params.op = AFR_INODE_SET_SPLIT_BRAIN; - params.u.value = set; - afr_inode_set_ctx (this, inode, ¶ms); + ctx = afr_inode_ctx_get (inode, this); + if (mdata_spb != DONT_KNOW) + ctx->mdata_spb = mdata_spb; + if (data_spb != DONT_KNOW) + ctx->data_spb = data_spb; } void @@ -469,7 +466,7 @@ afr_set_opendir_done (xlator_t *this, inode_t *inode) afr_inode_params_t params = {0}; params.op = AFR_INODE_SET_OPENDIR_DONE; - afr_inode_set_ctx (this, inode, ¶ms); + afr_inode_set_ctx_params (this, inode, ¶ms); } void @@ -488,7 +485,7 @@ afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child, params.op = AFR_INODE_SET_READ_CTX; params.u.read_ctx.read_child = read_child; params.u.read_ctx.children = fresh_children; - afr_inode_set_ctx (this, inode, ¶ms); + afr_inode_set_ctx_params (this, inode, ¶ms); } void @@ -501,7 +498,7 @@ afr_inode_rm_stale_children (xlator_t *this, inode_t *inode, params.op = AFR_INODE_RM_STALE_CHILDREN; params.u.read_ctx.children = stale_children; - afr_inode_set_ctx (this, inode, ¶ms); + afr_inode_set_ctx_params (this, inode, ¶ms); } gf_boolean_t @@ -782,6 +779,12 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) sh = &local->self_heal; priv = this->private; + if (sh->data_sh_info && strcmp (sh->data_sh_info, "")) + GF_FREE (sh->data_sh_info); + + if (sh->metadata_sh_info && strcmp (sh->metadata_sh_info, "")) + GF_FREE (sh->metadata_sh_info); + GF_FREE (sh->buf); GF_FREE (sh->parentbufs); @@ -829,7 +832,8 @@ afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) void afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) { - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; + int i = 0; priv = this->private; @@ -839,12 +843,13 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->internal_lock.locked_nodes); - GF_FREE (local->internal_lock.inode_locked_nodes); - - GF_FREE (local->internal_lock.entry_locked_nodes); + for (i = 0; local->internal_lock.inodelk[i].domain; i++) { + GF_FREE (local->internal_lock.inodelk[i].locked_nodes); + } GF_FREE (local->internal_lock.lower_locked_nodes); + afr_entry_lockee_cleanup (&local->internal_lock); GF_FREE (local->transaction.pre_op); GF_FREE (local->transaction.eager_lock); @@ -854,6 +859,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) loc_wipe (&local->transaction.parent_loc); loc_wipe (&local->transaction.new_parent_loc); + + GF_FREE (local->transaction.postop_piggybacked); } @@ -891,8 +898,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) GF_FREE (local->fresh_children); - GF_FREE (local->fd_open_on); - { /* lookup */ if (local->cont.lookup.xattrs) { afr_reset_xattr (local->cont.lookup.xattrs, @@ -1075,6 +1080,88 @@ afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent) uuid_copy (loc->pargfid, postparent->ia_gfid); } +/* + * Quota size xattrs are not maintained by afr. There is a + * possibility that they differ even when both the directory changelog xattrs + * suggest everything is fine. So if there is at least one 'source' check among + * the sources which has the maximum quota size. Otherwise check among all the + * available ones for maximum quota size. This way if there is a source and + * stale copies it always votes for the 'source'. + * */ + +static void +afr_handle_quota_size (afr_local_t *local, xlator_t *this, + dict_t *rsp_dict) +{ + int32_t *sources = NULL; + dict_t *xattr = NULL; + data_t *max_data = NULL; + int64_t max_quota_size = -1; + data_t *data = NULL; + int64_t *size = NULL; + int64_t quota_size = -1; + afr_private_t *priv = NULL; + int i = 0; + int ret = -1; + gf_boolean_t source_present = _gf_false; + + priv = this->private; + sources = local->cont.lookup.sources; + + if (rsp_dict == NULL) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "%s: Invalid " + "response dictionary", local->loc.path); + return; + } + + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source_present = _gf_true; + break; + } + } + + for (i = 0; i < priv->child_count; i++) { + /* + * If there is at least one source lets check + * for maximum quota sizes among sources, otherwise take the + * maximum of the ones present to be on the safer side. + */ + if (source_present && !sources[i]) + continue; + + xattr = local->cont.lookup.xattrs[i]; + if (!xattr) + continue; + + data = dict_get (xattr, QUOTA_SIZE_KEY); + if (!data) + continue; + + size = (int64_t*)data->data; + quota_size = ntoh64(*size); + gf_log (this->name, GF_LOG_DEBUG, "%s: %d, size: %"PRId64, + local->loc.path, i, quota_size); + if (quota_size > max_quota_size) { + if (max_data) + data_unref (max_data); + + max_quota_size = quota_size; + max_data = data_ref (data); + } + } + + if (max_data) { + ret = dict_set (rsp_dict, QUOTA_SIZE_KEY, max_data); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " + "quota size", local->loc.path); + } + + data_unref (max_data); + } +} + int afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) { @@ -1119,13 +1206,18 @@ afr_lookup_build_response_params (afr_local_t *local, xlator_t *this) ret = -1; goto out; } + gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d", read_child); if (!*xattr) *xattr = dict_ref (local->cont.lookup.xattrs[read_child]); + *buf = local->cont.lookup.bufs[read_child]; *postparent = local->cont.lookup.postparents[read_child]; + if (dict_get (local->xattr_req, QUOTA_SIZE_KEY)) + afr_handle_quota_size (local, this, *xattr); + if (IA_INVAL == local->cont.lookup.inode->ia_type) { /* fix for RT #602 */ local->cont.lookup.inode->ia_type = buf->ia_type; @@ -1274,7 +1366,7 @@ afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this, if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) { /* mismatching gfid */ - gf_log (this->name, GF_LOG_WARNING, + gf_log (this->name, GF_LOG_DEBUG, "%s: gfid different on subvolume", local->loc.path); } } @@ -1498,7 +1590,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, child2 = &bufs[success_children[i-1]]; if (FILETYPE_DIFFERS (child1, child2)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype " + gf_log (xlator_name, GF_LOG_DEBUG, "%s: filetype " "differs on subvolumes (%d, %d)", path, success_children[i-1], success_children[i]); conflicting = _gf_true; @@ -1507,7 +1599,7 @@ afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children, if (!gfid || uuid_is_null (child1->ia_gfid)) continue; if (uuid_compare (*gfid, child1->ia_gfid)) { - gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs" + gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid differs" " on subvolume %d", path, success_children[i]); conflicting = _gf_true; goto out; @@ -1641,8 +1733,8 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, if (op_ret == -1) { local->op_ret = -1; - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + local->op_errno = afr_most_important_error(local->op_errno, + op_errno, _gf_true); goto out; } else { @@ -1666,7 +1758,6 @@ afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this, local->loc.path, local->self_heal.actual_sh_started); } - } out: AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, @@ -1741,7 +1832,8 @@ afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this, afr_lookup_set_self_heal_params (local, this); if (afr_can_self_heal_proceed (&local->self_heal, priv)) { - if (afr_is_transaction_running (local)) + if (afr_is_transaction_running (local) && + (!local->allow_sh_for_running_transaction)) goto out; reason = "lookup detected pending operations"; @@ -1920,6 +2012,57 @@ out: return; } +gf_boolean_t +afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this) +{ + /* + * We need to perform this test in lookup done and treat on going + * create/DELETE as ENOENT. + * Reason: + Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c' + + 1 Client A is in the middle of mkdir(/a). It has acquired lock. + It has performed mkdir(/a) on one subvol, and second one is still + in progress + 2 Client B performs a lookup, sees directory /a on one, + ENOENT on the other, succeeds lookup. + 3 Client B performs lookup on /a/b on both subvols, both return ENOENT + (one subvol because /a/b does not exist, another because /a + itself does not exist) + 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with + basename=b on one subvol, but fails on other subvol as /a is yet to + be created by Client A. + 5 Client A finishes mkdir of /a on other subvol + 6 Client C also attempts to create /a/b, lookup returns ENOENT on + both subvols. + 7 Client C tries to obtain entrylk on on inode=/a with basename=b, + obtains on one subvol (where B had failed), and waits for B to unlock + on other subvol. + 8 Client B finishes mkdir() on one subvol with GFID-1 and completes + transaction and unlocks + 9 Client C gets the lock on the second subvol, At this stage second + subvol already has /a/b created from Client B, but Client C does not + check that in the middle of mkdir transaction + 10 Client C attempts mkdir /a/b on both subvols. It succeeds on + ONLY ONE (where Client B could not get lock because of + missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol. + This way we have /a/b in GFID mismatch. One subvol got GFID-1 because + Client B performed transaction on only one subvol (because entrylk() + could not be obtained on second subvol because of missing parent dir -- + caused by premature/speculative succeeding of lookup() on /a when locks + are detected). Other subvol gets GFID-2 from Client C because while + it was waiting for entrylk() on both subvols, Client B was in the + middle of creating mkdir() on only one subvol, and Client C does not + "expect" this when it is between lock() and pre-op()/op() phase of the + transaction. + */ + if (local->cont.lookup.parent_entrylk && local->enoent_count) + return _gf_true; + + return _gf_false; +} + + static void afr_lookup_done (call_frame_t *frame, xlator_t *this) { @@ -1936,6 +2079,12 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) priv = this->private; local = frame->local; + if (afr_is_entry_possibly_under_creation (local, this)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto unwind; + } + if (local->op_ret < 0) goto unwind; @@ -1993,25 +2142,20 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) * others in that they must be given higher priority while * returning to the user. * - * The hierarchy is ESTALE > ENOENT > others - * + * The hierarchy is ESTALE > EIO > ENOENT > others */ - -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno) +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, + gf_boolean_t eio) { - gf_boolean_t ret = _gf_true; - - /* Nothing should ever overwrite ESTALE */ - if (old_errno == ESTALE) - ret = _gf_false; - - /* Nothing should overwrite ENOENT, except ESTALE/EIO*/ - else if ((old_errno == ENOENT) && (new_errno != ESTALE) - && (new_errno != EIO)) - ret = _gf_false; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (eio && (old_errno == EIO || new_errno == EIO)) + return EIO; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; - return ret; + return new_errno; } int32_t @@ -2030,8 +2174,9 @@ afr_resultant_errno_get (int32_t *children, } else { child = i; } - if (afr_error_more_important (op_errno, child_errno[child])) - op_errno = child_errno[child]; + op_errno = afr_most_important_error(op_errno, + child_errno[child], + _gf_false); } return op_errno; } @@ -2043,8 +2188,8 @@ afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno) if (op_errno == ENOENT) local->enoent_count++; - if (afr_error_more_important (local->op_errno, op_errno)) - local->op_errno = op_errno; + local->op_errno = afr_most_important_error(local->op_errno, op_errno, + _gf_false); if (local->op_errno == ESTALE) { local->op_ret = -1; @@ -2283,7 +2428,7 @@ afr_lookup (call_frame_t *frame, xlator_t *this, int call_count = 0; uint64_t ctx = 0; int32_t op_errno = 0; - + int allow_sh = 0; priv = this->private; AFR_LOCAL_ALLOC_OR_GOTO (local, out); @@ -2300,6 +2445,13 @@ afr_lookup (call_frame_t *frame, xlator_t *this, goto out; } + if (local->loc.path && + (strcmp (local->loc.path, "/" GF_REPLICATE_TRASH_DIR) == 0)) { + op_errno = EPERM; + ret = -1; + goto out; + } + ret = inode_ctx_get (local->loc.inode, this, &ctx); if (ret == 0) { /* lookup is a revalidate */ @@ -2348,6 +2500,11 @@ afr_lookup (call_frame_t *frame, xlator_t *this, /* By default assume ENOTCONN. On success it will be set to 0. */ local->op_errno = ENOTCONN; + ret = dict_get_int32 (xattr_req, "allow-sh-for-running-transaction", + &allow_sh); + dict_del (xattr_req, "allow-sh-for-running-transaction"); + local->allow_sh_for_running_transaction = allow_sh; + ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc, &gfid_req); if (ret) { @@ -2464,10 +2621,11 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) } pthread_mutex_init (&fd_ctx->delay_lock, NULL); - INIT_LIST_HEAD (&fd_ctx->paused_calls); INIT_LIST_HEAD (&fd_ctx->entries); fd_ctx->call_child = -1; + INIT_LIST_HEAD (&fd_ctx->eager_locked); + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); if (ret) gf_log (this->name, GF_LOG_DEBUG, @@ -2524,15 +2682,42 @@ afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, return 0; } +static int +afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + + priv = this->private; + local = frame->local; + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_flush_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + local->fd, NULL); + if (!--call_count) + break; + + } + } + + return 0; +} + int afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { afr_private_t *priv = NULL; afr_local_t *local = NULL; + call_stub_t *stub = NULL; int ret = -1; int op_errno = 0; - int call_count = -1; - int i = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -2548,27 +2733,14 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) goto out; local->fd = fd_ref(fd); - call_count = local->call_count; - - /* - * Ideally we should synchronize flush against completion of writing - * the delayed changelog, but for now we just push it out first... - */ - afr_delayed_changelog_wake_up(this, fd); - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - local->fd, NULL); - - if (!--call_count) - break; - } + stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); + if (!stub) { + ret = -1; + op_errno = ENOMEM; + goto out; } + afr_delayed_changelog_wake_resume (this, fd, stub); ret = 0; out: @@ -2587,8 +2759,6 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) uint64_t ctx = 0; afr_fd_ctx_t *fd_ctx = NULL; int ret = 0; - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; ret = fd_ctx_get (fd, this, &ctx); if (ret < 0) @@ -2604,12 +2774,6 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) GF_FREE (fd_ctx->locked_on); GF_FREE (fd_ctx->pre_op_piggyback); - list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, - call_list) { - list_del_init (&paused_call->call_list); - GF_FREE (paused_call); - } - GF_FREE (fd_ctx->lock_piggyback); GF_FREE (fd_ctx->lock_acquired); @@ -2652,6 +2816,16 @@ afr_release (xlator_t *this, fd_t *fd) /* {{{ fsync */ int +afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, + xdata); + return 0; +} + +int afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) @@ -2660,6 +2834,7 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int call_count = -1; int child_index = (long) cookie; int read_child = 0; + call_stub_t *stub = NULL; local = frame->local; @@ -2675,13 +2850,13 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = 0; if (local->success_count == 0) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } if (child_index == read_child) { - local->cont.fsync.prebuf = *prebuf; - local->cont.fsync.postbuf = *postbuf; + local->cont.inode_wfop.prebuf = *prebuf; + local->cont.inode_wfop.postbuf = *postbuf; } local->success_count++; @@ -2694,10 +2869,32 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno, - &local->cont.fsync.prebuf, - &local->cont.fsync.postbuf, - NULL); + /* Make a stub out of the frame, and register it + with the waking up post-op. When the call-stub resumes, + we are guaranteed that there was no post-op pending + (i.e changelogs were unset in the server). This is an + essential "guarantee", that fsync() returns only after + completely finishing EVERYTHING, including the delayed + post-op. This guarantee is expected by FUSE graph switching + for example. + */ + stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + xdata); + if (!stub) { + AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); + return 0; + } + + /* If no new unstable writes happened between the + time we cleared the unstable write witness flag in afr_fsync + and now, calling afr_delayed_changelog_wake_up() should + wake up and skip over the fsync phase and go straight to + afr_changelog_post_op_now() + */ + afr_delayed_changelog_wake_resume (this, local->fd, stub); } return 0; @@ -2732,6 +2929,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, local->fd = fd_ref (fd); + if (afr_fd_has_witnessed_unstable_write (this, fd)) { + /* don't care. we only wanted to CLEAR the bit */ + } + for (i = 0; i < priv->child_count; i++) { if (local->child_up[i]) { STACK_WIND_COOKIE (frame, afr_fsync_cbk, @@ -3889,6 +4090,17 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) goto out; } + local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count, + sizeof (int), + gf_afr_mt_int32_t); + if (!local->transaction.postop_piggybacked) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->append_write = _gf_false; + ret = 0; out: return ret; @@ -3900,16 +4112,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, { int ret = -ENOMEM; - lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->inode_locked_nodes) - goto out; - - lk->entry_locked_nodes = GF_CALLOC (sizeof (*lk->entry_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->entry_locked_nodes) - goto out; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), child_count, gf_afr_mt_char); if (NULL == lk->locked_nodes) @@ -3968,6 +4170,21 @@ out: } int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +{ + int ret = -ENOMEM; + + lk->domain = dom; + lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->locked_nodes) + goto out; + ret = 0; +out: + return ret; +} + +int afr_transaction_local_init (afr_local_t *local, xlator_t *this) { int child_up_count = 0; @@ -3980,6 +4197,14 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (ret < 0) goto out; + if ((local->transaction.type == AFR_DATA_TRANSACTION) || + (local->transaction.type == AFR_METADATA_TRANSACTION)) { + ret = afr_inodelk_init (&local->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret < 0) + goto out; + } + ret = -ENOMEM; child_up_count = afr_up_children_count (local->child_up, priv->child_count); @@ -4001,14 +4226,6 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) if (!local->fresh_children) goto out; - if (local->fd) { - local->fd_open_on = GF_CALLOC (sizeof (*local->fd_open_on), - priv->child_count, - gf_afr_mt_char); - if (!local->fd_open_on) - goto out; - } - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), priv->child_count, gf_afr_mt_char); @@ -4024,6 +4241,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) AFR_NUM_CHANGE_LOGS); if (!local->transaction.txn_changelog) goto out; + + INIT_LIST_HEAD (&local->transaction.eager_locked); + ret = 0; out: return ret; @@ -4211,6 +4431,16 @@ afr_priv_destroy (afr_private_t *priv) if (priv->shd.split_brain) eh_destroy (priv->shd.split_brain); + for (i = 0; i < priv->child_count; i++) + { + if (priv->shd.statistics[i]) + eh_destroy (priv->shd.statistics[i]); + } + + GF_FREE (priv->shd.statistics); + + GF_FREE (priv->shd.crawl_events); + GF_FREE (priv->last_event); if (priv->pending_key) { for (i = 0; i < priv->child_count; i++) @@ -4276,3 +4506,86 @@ afr_prepare_new_entry_pending_matrix (int32_t **pending, } } } + +gf_boolean_t +afr_is_fd_fixable (fd_t *fd) +{ + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous (fd)) + return _gf_false; + else if (uuid_is_null (fd->inode->gfid)) + return _gf_false; + + return _gf_true; +} + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + inode_t *inode = NULL; + afr_inode_ctx_t *ctx = NULL; + + local = frame->local; + + if (local->fd) + inode = local->fd->inode; + else + inode = local->loc.inode; + + if (!inode) + return; + + LOCK (&inode->lock); + { + ctx = __afr_inode_ctx_get (inode, this); + ctx->open_fd_count = local->open_fd_count; + } + UNLOCK (&inode->lock); +} + +int +afr_initialise_statistics (xlator_t *this) +{ + afr_private_t *priv = NULL; + int ret = -1; + int i = 0; + int child_count = 0; + eh_t *stats_per_brick = NULL; + shd_crawl_event_t ***shd_crawl_events = NULL; + priv = this->private; + + priv->shd.statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count, + gf_common_mt_eh_t); + if (!priv->shd.statistics) { + ret = -1; + goto out; + } + child_count = priv->child_count; + for (i=0; i < child_count ; i++) { + stats_per_brick = eh_new (AFR_STATISTICS_HISTORY_SIZE, + _gf_false, + _destroy_crawl_event_data); + if (!stats_per_brick) { + ret = -1; + goto out; + } + priv->shd.statistics[i] = stats_per_brick; + + } + + shd_crawl_events = (shd_crawl_event_t***)(&priv->shd.crawl_events); + *shd_crawl_events = GF_CALLOC (sizeof(shd_crawl_event_t*), + priv->child_count, + gf_afr_mt_shd_crawl_event_t); + + if (!priv->shd.crawl_events) { + ret = -1; + goto out; + } + ret = 0; +out: + return ret; + +} diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index c201d45fd..689dd84e6 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -253,7 +253,7 @@ unlock: goto out; if (!afr_is_opendir_done (this, local->fd->inode) && - up_children_count > 1) { + up_children_count > 1 && priv->entry_self_heal) { /* * This is the first opendir on this inode. We need @@ -358,85 +358,6 @@ struct entry_name { struct list_head list; }; - -static gf_boolean_t -remembered_name (const char *name, struct list_head *entries) -{ - struct entry_name *e = NULL; - gf_boolean_t ret = _gf_false; - - list_for_each_entry (e, entries, list) { - if (!strcmp (name, e->name)) { - ret = _gf_true; - goto out; - } - } - -out: - return ret; -} - - -static void -afr_remember_entries (gf_dirent_t *entries, fd_t *fd) -{ - struct entry_name *n = NULL; - gf_dirent_t *entry = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry (entry, &entries->list, list) { - n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name); - n->name = gf_strdup (entry->d_name); - INIT_LIST_HEAD (&n->list); - - list_add (&n->list, &fd_ctx->entries); - } -} - - -static off_t -afr_filter_entries (gf_dirent_t *entries, fd_t *fd) -{ - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - int ret = 0; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - off_t offset = 0; - - ret = fd_ctx_get (fd, THIS, &ctx); - if (ret < 0) { - gf_log (THIS->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", fd); - return -1; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - list_for_each_entry_safe (entry, tmp, &entries->list, list) { - offset = entry->d_off; - - if (remembered_name (entry->d_name, &fd_ctx->entries)) { - list_del (&entry->list); - GF_FREE (entry); - } - } - - return offset; -} - - static void afr_forget_entries (fd_t *fd) { @@ -462,14 +383,36 @@ afr_forget_entries (fd_t *fd) } } +static void +afr_readdir_filter_trash_dir (gf_dirent_t *entries, fd_t *fd) +{ + gf_dirent_t * entry = NULL; + gf_dirent_t * tmp = NULL; + + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + if (__is_root_gfid (fd->inode->gfid) && + !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { + list_del_init (&entry->list); + GF_FREE (entry); + } + } +} int32_t afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) { - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); + afr_local_t *local = NULL; + + if (op_ret == -1) + goto out; + local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); + +out: + AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL); return 0; } @@ -479,116 +422,16 @@ afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - xlator_t ** children = NULL; - int32_t next_call_child = -1; - int ret = 0; - int32_t *last_index = NULL; - int32_t read_child = -1; - int32_t *fresh_children = NULL; - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - off_t offset = 0; - int32_t call_child = -1; - - priv = this->private; - children = priv->children; - - local = frame->local; + afr_local_t *local = NULL; - if ((priv->readdir_failover == _gf_false) && (op_ret < 0)) + if (op_ret == -1) goto out; - read_child = (long) cookie; - last_index = &local->cont.readdir.last_index; - fresh_children = local->fresh_children; - - /* the value of the last_index changes if afr_next_call_child is - * called. So to find the call_child of this callback use last_index - * before the next_call_child call. - */ - if (*last_index == -1) - call_child = read_child; - else - call_child = fresh_children[*last_index]; - - if (priv->strict_readdir) { - ret = fd_ctx_get (local->fd, this, &ctx); - if (ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "could not get fd ctx for fd=%p", local->fd); - op_ret = -1; - op_errno = -ret; - goto out; - } - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; - - if (op_ret == -1) { - next_call_child = afr_next_call_child (fresh_children, - local->child_up, - priv->child_count, - last_index, - read_child); - if (next_call_child < 0) - goto out; - gf_log (this->name, GF_LOG_TRACE, - "starting readdir afresh on child %d, offset %"PRId64, - next_call_child, (uint64_t) 0); - - fd_ctx->failed_over = _gf_true; - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) read_child, - children[next_call_child], - children[next_call_child]->fops->readdirp, - local->fd, - local->cont.readdir.size, 0, - local->cont.readdir.dict); - return 0; - } - } - - if (priv->strict_readdir) { - if (fd_ctx->failed_over) { - if (list_empty (&entries->list)) { - gf_log (this->name, GF_LOG_DEBUG, - "no entries found"); - goto out; - } - - offset = afr_filter_entries (entries, local->fd); - - afr_remember_entries (entries, local->fd); - - if (list_empty (&entries->list)) { - /* All the entries we got were duplicate. We - shouldn't send an empty list now, because - that will make the application stop reading. So - try to get more entries */ - - gf_log (this->name, GF_LOG_TRACE, - "trying to fetch non-duplicate entries " - "from offset %"PRId64", child %s", - offset, children[call_child]->name); - - STACK_WIND_COOKIE (frame, afr_readdirp_cbk, - (void *) (long) read_child, - children[call_child], - children[call_child]->fops->readdirp, - local->fd, local->cont.readdir.size, offset, - local->cont.readdir.dict); - return 0; - } - } else { - afr_remember_entries (entries, local->fd); - } - } + local = frame->local; + afr_readdir_filter_trash_dir (entries, local->fd); out: AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL); - return 0; } @@ -654,20 +497,6 @@ afr_do_readdir (call_frame_t *frame, xlator_t *this, local->cont.readdir.size = size; local->cont.readdir.dict = (dict)? dict_ref (dict) : NULL; - if (priv->strict_readdir) { - if (fd_ctx->last_tried != call_child) { - gf_log (this->name, GF_LOG_TRACE, - "first up child has changed from %d to %d, " - "restarting readdir from offset 0", - fd_ctx->last_tried, call_child); - - fd_ctx->failed_over = _gf_true; - offset = 0; - } - - fd_ctx->last_tried = call_child; - } - if (whichop == GF_FOP_READDIR) STACK_WIND_COOKIE (frame, afr_readdir_cbk, (void *) (long) call_child, diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index d86b08b87..1943b719b 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -231,7 +231,7 @@ afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) afr_mark_new_entry_changelog (frame, this); out: - local->transaction.resume (frame, this); + return; } void @@ -253,6 +253,7 @@ afr_dir_fop_done (call_frame_t *frame, xlator_t *this) done: local->transaction.unwind (frame, this); afr_dir_fop_mark_entry_pending_changelog (frame, this); + local->transaction.resume (frame, this); } /* {{{ create */ @@ -418,11 +419,12 @@ afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, mode_t mode, mode_t umask, fd_t *fd, dict_t *params) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -473,7 +475,17 @@ afr_create (call_frame_t *frame, xlator_t *this, local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { op_errno = -ret; @@ -612,11 +624,12 @@ int afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -666,7 +679,17 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { op_errno = -ret; @@ -802,16 +825,16 @@ afr_mkdir_done (call_frame_t *frame, xlator_t *this) return 0; } - int afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -848,6 +871,7 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, if (params) local->xdata_req = dict_ref (params); + local->op = GF_FOP_MKDIR; local->transaction.fop = afr_mkdir_wind; local->transaction.done = afr_mkdir_done; local->transaction.unwind = afr_mkdir_unwind; @@ -859,7 +883,17 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { op_errno = -ret; @@ -966,7 +1000,8 @@ afr_link_wind (call_frame_t *frame, xlator_t *this) for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, + (void *) (long) i, priv->children[i], priv->children[i]->fops->link, &local->loc, @@ -998,11 +1033,12 @@ int afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1037,6 +1073,7 @@ afr_link (call_frame_t *frame, xlator_t *this, } UNLOCK (&priv->read_child_lock); + local->op = GF_FOP_LINK; local->transaction.fop = afr_link_wind; local->transaction.done = afr_link_done; local->transaction.unwind = afr_link_unwind; @@ -1048,13 +1085,22 @@ afr_link (call_frame_t *frame, xlator_t *this, local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { op_errno = -ret; goto out; } - ret = 0; out: if (ret < 0) { @@ -1190,11 +1236,12 @@ int afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, loc_t *loc, mode_t umask, dict_t *params) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1231,6 +1278,7 @@ afr_symlink (call_frame_t *frame, xlator_t *this, if (params) local->xdata_req = dict_ref (params); + local->op = GF_FOP_SYMLINK; local->transaction.fop = afr_symlink_wind; local->transaction.done = afr_symlink_done; local->transaction.unwind = afr_symlink_unwind; @@ -1242,7 +1290,17 @@ afr_symlink (call_frame_t *frame, xlator_t *this, local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { op_errno = -ret; @@ -1317,6 +1375,7 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY) afr_transaction_fop_failed (frame, this, child_index); local->op_errno = op_errno; + local->child_errno[child_index] = op_errno; if (op_ret > -1) __dir_entry_fop_common_cbk (frame, child_index, this, @@ -1391,11 +1450,13 @@ int afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + int nlockee = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1423,6 +1484,7 @@ afr_rename (call_frame_t *frame, xlator_t *this, local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL); + local->op = GF_FOP_RENAME; local->transaction.fop = afr_rename_wind; local->transaction.done = afr_rename_done; local->transaction.unwind = afr_rename_unwind; @@ -1439,6 +1501,38 @@ afr_rename (call_frame_t *frame, xlator_t *this, local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (oldloc->path); local->transaction.new_basename = AFR_BASENAME (newloc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) { + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->newloc, + NULL, + priv->child_count); + if (ret) + goto out; + + nlockee++; + } + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); if (ret < 0) { @@ -1578,11 +1672,12 @@ int32_t afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1610,6 +1705,7 @@ afr_unlink (call_frame_t *frame, xlator_t *this, if (xdata) local->xdata_req = dict_ref (xdata); + local->op = GF_FOP_UNLINK; local->transaction.fop = afr_unlink_wind; local->transaction.done = afr_unlink_done; local->transaction.unwind = afr_unlink_unwind; @@ -1621,7 +1717,17 @@ afr_unlink (call_frame_t *frame, xlator_t *this, local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + int_lock->lockee_count = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[0], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + int_lock->lockee_count++; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { op_errno = -ret; @@ -1695,6 +1801,7 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY)) afr_transaction_fop_failed (frame, this, child_index); local->op_errno = op_errno; + local->child_errno[child_index] = op_errno; if (op_ret > -1) __dir_entry_fop_common_cbk (frame, child_index, this, op_ret, op_errno, NULL, NULL, @@ -1768,11 +1875,13 @@ int afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - call_frame_t * transaction_frame = NULL; - int ret = -1; - int op_errno = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + int nlockee = 0; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1798,6 +1907,7 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, local->cont.rmdir.flags = flags; loc_copy (&local->loc, loc); + local->op = GF_FOP_RMDIR; local->transaction.fop = afr_rmdir_wind; local->transaction.done = afr_rmdir_done; local->transaction.unwind = afr_rmdir_unwind; @@ -1809,6 +1919,28 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, local->transaction.main_frame = frame; local->transaction.basename = AFR_BASENAME (loc->path); + int_lock = &local->internal_lock; + + int_lock->lockee_count = nlockee = 0; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->transaction.parent_loc, + local->transaction.basename, + priv->child_count); + if (ret) + goto out; + + nlockee++; + ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, + &local->loc, + NULL, + priv->child_count); + if (ret) + goto out; + + nlockee++; + qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), + afr_entry_lockee_cmp); + int_lock->lockee_count = nlockee; ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); if (ret < 0) { diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index beff715b5..e06e3b2f2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -117,6 +117,8 @@ afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -232,6 +234,8 @@ afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -348,6 +352,8 @@ afr_fstat (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (fd->inode, out); + AFR_SBRAIN_CHECK_FD (fd, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -377,11 +383,8 @@ afr_fstat (call_frame_t *frame, xlator_t *this, local->fd = fd_ref (fd); - ret = afr_open_fd_fix (frame, this, _gf_false); - if (ret) { - op_errno = -ret; - goto out; - } + afr_open_fd_fix (fd, this); + STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child, children[call_child], children[call_child]->fops->fstat, @@ -470,6 +473,8 @@ afr_readlink (call_frame_t *frame, xlator_t *this, children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -1058,7 +1063,7 @@ unlock: } len = dict_serialized_length (local->dict); - if (len == 0) { + if (len <= 0) { goto unwind; } @@ -1321,6 +1326,62 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, return ret; } +static int +afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +{ + int ret = 0; + + if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_min_stime (THIS, data, key, value); + + return ret; +} + +int32_t +afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t callcnt = 0; + + if (!frame || !frame->local || !this) { + gf_log ("", GF_LOG_ERROR, "possible NULL deref"); + goto out; + } + + local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; + } + + if (!local->dict) + local->dict = dict_copy_with_ref (dict, NULL); + else + dict_foreach (dict, afr_aggregate_stime_xattr, + local->dict); + local->op_ret = 0; + } + +cleanup: + UNLOCK (&frame->lock); + + if (!callcnt) { + AFR_STACK_UNWIND (getxattr, frame, local->op_ret, + local->op_errno, local->dict, xdata); + } + +out: + return 0; +} + + static gf_boolean_t afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, gf_boolean_t is_fgetxattr) @@ -1353,6 +1414,8 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, } else { *cbk = afr_getxattr_lockinfo_cbk; } + } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; } else { is_spl = _gf_false; } @@ -1401,7 +1464,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, int32_t read_child = -1; int ret = -1; fop_getxattr_cbk_t cbk = NULL; - + int afr_xtime_gauge[MCNT_MAX] = {0,}; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (this, out); @@ -1412,6 +1475,8 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, children = priv->children; + AFR_SBRAIN_CHECK_LOC (loc, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -1434,7 +1499,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, goto out; } if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (-1 == frame->root->pid)) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { local->marker.call_count = priv->child_count; @@ -1450,6 +1515,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, sub_volumes, priv->child_count, MARKER_UUID_TYPE, + marker_uuid_default_gauge, priv->vol_uuid)) { gf_log (this->name, GF_LOG_INFO, @@ -1484,7 +1550,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, if (*priv->vol_uuid) { if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (-1 == frame->root->pid)) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { local->marker.call_count = priv->child_count; sub_volumes = alloca ( priv->child_count @@ -1496,12 +1562,20 @@ afr_getxattr (call_frame_t *frame, xlator_t *this, } + /* don't err out on getting ENOTCONN (brick down) + * from a subset of the bricks + */ + memcpy (afr_xtime_gauge, marker_xtime_default_gauge, + sizeof (afr_xtime_gauge)); + afr_xtime_gauge[MCNT_NOTFOUND] = 0; + afr_xtime_gauge[MCNT_ENOTCONN] = 0; if (cluster_getmarkerattr (frame, this, loc, name, local, afr_getxattr_unwind, sub_volumes, priv->child_count, MARKER_XTIME_TYPE, + afr_xtime_gauge, priv->vol_uuid)) { gf_log (this->name, GF_LOG_INFO, "%s: failed to get marker attr (%s)", @@ -1658,6 +1732,8 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this, children = priv->children; + AFR_SBRAIN_CHECK_FD (fd, out); + AFR_LOCAL_ALLOC_OR_GOTO (local, out); frame->local = local; @@ -1813,6 +1889,8 @@ afr_readv (call_frame_t *frame, xlator_t *this, priv = this->private; children = priv->children; + AFR_SBRAIN_CHECK_FD (fd, out); + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; @@ -1842,11 +1920,8 @@ afr_readv (call_frame_t *frame, xlator_t *this, local->cont.readv.offset = offset; local->cont.readv.flags = flags; - ret = afr_open_fd_fix (frame, this, _gf_false); - if (ret) { - op_errno = -ret; - goto out; - } + afr_open_fd_fix (fd, this); + STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) call_child, children[call_child], diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index d9d3800fc..c1ec69a55 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -39,34 +39,133 @@ #include "afr-transaction.h" #include "afr-self-heal-common.h" +void +__inode_write_fop_cbk (call_frame_t *frame, int child_index, int read_child, + xlator_t *this, int32_t *op_ret, int32_t *op_errno, + struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (afr_fop_failed (*op_ret, *op_errno)) { + local->child_errno[child_index] = *op_errno; + + switch (local->op) { + case GF_FOP_TRUNCATE: + case GF_FOP_FTRUNCATE: + if (*op_errno != EFBIG) + afr_transaction_fop_failed (frame, this, + child_index); + break; + default: + afr_transaction_fop_failed (frame, this, child_index); + break; + } + local->op_errno = *op_errno; + goto out; + } + + if ((local->success_count == 0) || (read_child == child_index)) { + local->op_ret = *op_ret; + if (prebuf) + local->cont.inode_wfop.prebuf = *prebuf; + if (postbuf) + local->cont.inode_wfop.postbuf = *postbuf; + } + + local->success_count++; +out: + return; +} + /* {{{ writev */ -int +void +afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) +{ + afr_local_t *src_local = NULL; + afr_local_t *dst_local = NULL; + + src_local = src_frame->local; + dst_local = dst_frame->local; + + dst_local->op_ret = src_local->op_ret; + dst_local->op_errno = src_local->op_errno; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; +} + +void afr_writev_unwind (call_frame_t *frame, xlator_t *this) { afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + local = frame->local; + + AFR_STACK_UNWIND (writev, frame, + local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); +} + +call_frame_t* +afr_transaction_detach_fop_frame (call_frame_t *frame) +{ + afr_local_t * local = NULL; + call_frame_t *fop_frame = NULL; local = frame->local; LOCK (&frame->lock); { - if (local->transaction.main_frame) - main_frame = local->transaction.main_frame; + fop_frame = local->transaction.main_frame; local->transaction.main_frame = NULL; } UNLOCK (&frame->lock); - if (main_frame) { - AFR_STACK_UNWIND (writev, main_frame, - local->op_ret, local->op_errno, - &local->cont.writev.prebuf, - &local->cont.writev.postbuf, - NULL); + return fop_frame; +} + +int +afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *fop_frame = NULL; + + fop_frame = afr_transaction_detach_fop_frame (frame); + + if (fop_frame) { + afr_writev_copy_outvars (frame, fop_frame); + afr_writev_unwind (fop_frame, this); } return 0; } +static void +afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. + */ + for (i = 0; i < priv->child_count; i++) { + if ((!local->replies[i].valid) || + (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); + } +} int afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -74,14 +173,17 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; + afr_private_t *priv = NULL; + call_frame_t *fop_frame = NULL; int child_index = (long) cookie; int call_count = -1; int read_child = 0; - afr_private_t *priv = NULL; - int i = 0; + int ret = 0; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; local = frame->local; - priv = this->private; + priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -91,12 +193,14 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + local->replies[child_index].valid = 1; local->replies[child_index].op_ret = op_ret; local->replies[child_index].op_errno = op_errno; - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); /* stage the best case return value for unwind */ if ((local->success_count == 0) || (op_ret > local->op_ret)) { @@ -105,12 +209,24 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if (op_ret != -1) { - if ((local->success_count == 0) || - (child_index == read_child)) { - local->cont.writev.prebuf = *prebuf; - local->cont.writev.postbuf = *postbuf; - } - local->success_count++; + if (xdata) { + ret = dict_get_uint32 (xdata, + GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + if ((ret == 0) && + (open_fd_count > local->open_fd_count)) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; + } + + write_is_append = 0; + ret = dict_get_uint32 (xdata, + GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + } + } } UNLOCK (&frame->lock); @@ -118,24 +234,40 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - /* - * We already have the best case result of the writev calls staged - * as the return value. Any writev that returns some value less - * than the best case is now out of sync, so mark the fop as - * failed. Note that fops that have returned with errors have - * already been marked as failed. - */ - for (i = 0; i < priv->child_count; i++) { - if ((!local->replies[i].valid) || - (local->replies[i].op_ret == -1)) - continue; - - if (local->replies[i].op_ret < local->op_ret) - afr_transaction_fop_failed(frame, this, i); - } - local->transaction.unwind (frame, this); - local->transaction.resume (frame, this); + if (local->update_open_fd_count) + afr_handle_open_fd_count (frame, this); + + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write (this, local->fd); + + afr_writev_handle_short_writes (frame, this); + if (afr_any_fops_failed (local, priv)) { + //Don't unwind until post-op is complete + local->transaction.resume (frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then + * transaction.resume, but writev needs to make sure that + * delayed post-op frame is placed in fdctx before unwind + * happens. This prevents the race of flush doing the + * changelog wakeup first in fuse thread and then this + * writev placing its delayed post-op frame in fdctx. + * This helps flush make sure all the delayed post-ops are + * completed. + */ + + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); + local->transaction.resume (frame, this); + afr_writev_unwind (fop_frame, this); + } } return 0; } @@ -147,6 +279,8 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; int i = 0; int call_count = -1; + dict_t *xdata = NULL; + GF_UNUSED int ret = 0; local = frame->local; priv = this->private; @@ -170,6 +304,19 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) return 0; } + xdata = dict_new (); + if (xdata) { + ret = dict_set_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, + sizeof (uint32_t)); + ret = dict_set_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, + 0); + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; + } + for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, @@ -182,13 +329,16 @@ afr_writev_wind (call_frame_t *frame, xlator_t *this) local->cont.writev.offset, local->cont.writev.flags, local->cont.writev.iobref, - NULL); + xdata); if (!--call_count) break; } } + if (xdata) + dict_unref (xdata); + return 0; } @@ -228,7 +378,7 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) } transaction_frame->local = local; - frame->local = NULL; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local->op = GF_FOP_WRITE; @@ -236,10 +386,17 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) local->transaction.fop = afr_writev_wind; local->transaction.done = afr_writev_done; - local->transaction.unwind = afr_writev_unwind; + local->transaction.unwind = afr_transaction_writev_unwind; local->transaction.main_frame = frame; if (local->fd->flags & O_APPEND) { + /* + * Backend vfs ignores the 'offset' for append mode fd so + * locking just the region provided for the writev does not + * give consistency gurantee. The actual write may happen at a + * completely different range than the one provided by the + * offset, len in the fop. So lock the entire file. + */ local->transaction.start = 0; local->transaction.len = 0; } else { @@ -265,153 +422,74 @@ out: return 0; } -static int -afr_prepare_loc (call_frame_t *frame, fd_t *fd) +static void +afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this) { - afr_local_t *local = NULL; - char *name = NULL; - char *path = NULL; - int ret = 0; - - if ((!fd) || (!fd->inode)) - return -1; - - local = frame->local; - ret = inode_path (fd->inode, NULL, (char **)&path); - if (ret <= 0) { - gf_log (frame->this->name, GF_LOG_DEBUG, - "Unable to get path for gfid: %s", - uuid_utoa (fd->inode->gfid)); - return -1; - } - - if (local->loc.path) { - if (strcmp (path, local->loc.path)) - gf_log (frame->this->name, GF_LOG_DEBUG, - "overwriting old loc->path %s with %s", - local->loc.path, path); - GF_FREE ((char *)local->loc.path); - } - local->loc.path = path; - - name = strrchr (local->loc.path, '/'); - if (name) - name++; - local->loc.name = name; - - if (local->loc.inode) { - inode_unref (local->loc.inode); - } - local->loc.inode = inode_ref (fd->inode); - - if (local->loc.parent) { - inode_unref (local->loc.parent); + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + char *reason = NULL; + int32_t op_errno = 0; + int ret = 0; + + if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: " + "fd: %p, inode: %p", fd, + fd ? fd->inode : NULL); + goto out; } - local->loc.parent = inode_parent (local->loc.inode, 0, NULL); - - return 0; -} - -afr_fd_paused_call_t* -afr_paused_call_create (call_frame_t *frame) -{ - afr_local_t *local = NULL; - afr_fd_paused_call_t *paused_call = NULL; + frame = create_frame (this, this->ctx->pool); + if (!frame) + goto out; + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); local = frame->local; - GF_ASSERT (local->fop_call_continue); - - paused_call = GF_CALLOC (1, sizeof (*paused_call), - gf_afr_fd_paused_call_t); - if (paused_call) { - INIT_LIST_HEAD (&paused_call->call_list); - paused_call->frame = frame; - } - - return paused_call; -} - -static int -afr_pause_fd_fop (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx) -{ - afr_fd_paused_call_t *paused_call = NULL; - int ret = 0; - - paused_call = afr_paused_call_create (frame); - if (paused_call) - list_add (&paused_call->call_list, &fd_ctx->paused_calls); - else - ret = -ENOMEM; - - return ret; -} + ret = afr_local_init (local, this->private, &op_errno); + if (ret < 0) + goto out; -static void -afr_trigger_open_fd_self_heal (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - inode_t *inode = NULL; - char *reason = NULL; + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) + goto out; - local = frame->local; sh = &local->self_heal; - inode = local->fd->inode; - - sh->do_missing_entry_self_heal = _gf_true; - sh->do_gfid_self_heal = _gf_true; - sh->do_data_self_heal = _gf_true; + sh->do_metadata_self_heal = _gf_true; + if (fd->inode->ia_type == IA_IFREG) + sh->do_data_self_heal = _gf_true; + else if (fd->inode->ia_type == IA_IFDIR) + sh->do_entry_self_heal = _gf_true; reason = "subvolume came online"; - afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type, - reason, NULL, NULL); + afr_launch_self_heal (frame, this, fd->inode, _gf_true, + fd->inode->ia_type, reason, NULL, NULL); + return; +out: + AFR_STACK_DESTROY (frame); } -int -afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop) -{ - int ret = 0; - int i = 0; - afr_fd_ctx_t *fd_ctx = NULL; - inode_t *inode = NULL; - gf_boolean_t need_self_heal = _gf_false; - int *need_open = NULL; - int need_open_count = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - gf_boolean_t fop_continue = _gf_true; +void +afr_open_fd_fix (fd_t *fd, xlator_t *this) +{ + int ret = 0; + int i = 0; + afr_fd_ctx_t *fd_ctx = NULL; + gf_boolean_t need_self_heal = _gf_false; + int *need_open = NULL; + size_t need_open_count = 0; + afr_private_t *priv = NULL; - local = frame->local; priv = this->private; - GF_ASSERT (local->fd); - - inode = local->fd->inode; - //gfid is not set in rebalance, that case needs to be handled. - if (fd_is_anonymous (local->fd) || - !inode || uuid_is_null (inode->gfid)) { - fop_continue = _gf_true; - goto out; - } - - if (pause_fop) - GF_ASSERT (local->fop_call_continue); - - ret = afr_prepare_loc (frame, local->fd); - if (ret < 0) { - //File does not exist we cant open it. - ret = 0; + if (!afr_is_fd_fixable (fd)) goto out; - } - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - ret = -EINVAL; + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) goto out; - } - LOCK (&local->fd->lock); + LOCK (&fd->lock); { if (fd_ctx->up_count < priv->up_count) { need_self_heal = _gf_true; @@ -419,55 +497,34 @@ afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop) fd_ctx->down_count = priv->down_count; } + need_open = alloca (priv->child_count * sizeof (*need_open)); for (i = 0; i < priv->child_count; i++) { - if ((fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED) && - local->child_up[i]) { - fd_ctx->opened_on[i] = AFR_FD_OPENING; - if (!need_open) - need_open = GF_CALLOC (priv->child_count, - sizeof (*need_open), - gf_afr_mt_int32_t); - need_open[i] = 1; - need_open_count++; - } else if (pause_fop && local->child_up[i] && - (fd_ctx->opened_on[i] == AFR_FD_OPENING)) { - local->fop_paused = _gf_true; - } - } + need_open[i] = 0; + if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED) + continue; + + if (!priv->child_up[i]) + continue; + + fd_ctx->opened_on[i] = AFR_FD_OPENING; - if (local->fop_paused) { - GF_ASSERT (pause_fop); - gf_log (this->name, GF_LOG_INFO, "Pause fd %p", - local->fd); - ret = afr_pause_fd_fop (frame, this, fd_ctx); - if (ret) - goto unlock; - fop_continue = _gf_false; + need_open[i] = 1; + need_open_count++; } } -unlock: - UNLOCK (&local->fd->lock); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "Failed to fix fd for %s", - local->loc.path); - fop_continue = _gf_false; + UNLOCK (&fd->lock); + if (ret) goto out; - } if (need_self_heal) - afr_trigger_open_fd_self_heal (frame, this); + afr_trigger_open_fd_self_heal (fd, this); if (!need_open_count) goto out; - gf_log (this->name, GF_LOG_INFO, "Opening fd %p", local->fd); - afr_fix_open (frame, this, fd_ctx, need_open_count, need_open); - fop_continue = _gf_false; + afr_fix_open (this, fd, need_open_count, need_open); out: - GF_FREE (need_open); - if (fop_continue && local->fop_call_continue) - local->fop_call_continue (frame, this); - return ret; + return; } int @@ -486,6 +543,11 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(writev,out); AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); @@ -502,13 +564,15 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, local->cont.writev.iobref = iobref_ref (iobref); local->fd = fd_ref (fd); - local->fop_call_continue = afr_do_writev; - ret = afr_open_fd_fix (frame, this, _gf_true); - if (ret) { - op_errno = -ret; - goto out; - } + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); + + afr_open_fd_fix (fd, this); + + afr_do_writev (frame, this); ret = 0; out: @@ -542,8 +606,8 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, - &local->cont.truncate.prebuf, - &local->cont.truncate.postbuf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -557,14 +621,11 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int read_child = 0; int call_count = -1; - int need_unwind = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL); @@ -574,38 +635,22 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.truncate.prebuf = *prebuf; - local->cont.truncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -633,6 +678,7 @@ afr_truncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { @@ -753,8 +799,8 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, - &local->cont.ftruncate.prebuf, - &local->cont.ftruncate.postbuf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } return 0; @@ -767,14 +813,11 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postbuf, dict_t *xdata) { afr_local_t * local = NULL; - afr_private_t * priv = NULL; int child_index = (long) cookie; int call_count = -1; - int need_unwind = 0; int read_child = 0; local = frame->local; - priv = this->private; read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); @@ -784,38 +827,22 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - if (child_index == read_child) { - local->cont.ftruncate.prebuf = *prebuf; - local->cont.ftruncate.postbuf = *postbuf; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if (prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; } - local->op_errno = op_errno; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); } UNLOCK (&frame->lock); - if (need_unwind) - local->transaction.unwind (frame, this); - call_count = afr_frame_return (frame); if (call_count == 0) { + if (local->stable_write && afr_txn_nothing_failed (frame, this)) + local->transaction.unwind (frame, this); + local->transaction.resume (frame, this); } @@ -843,6 +870,7 @@ afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) } local->call_count = call_count; + local->stable_write = _gf_true; for (i = 0; i < priv->child_count; i++) { if (local->transaction.pre_op[i]) { @@ -942,6 +970,10 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } QUORUM_CHECK(ftruncate,out); AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); @@ -954,13 +986,10 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, local->cont.ftruncate.offset = offset; local->fd = fd_ref (fd); - local->fop_call_continue = afr_do_ftruncate; - ret = afr_open_fd_fix (frame, this, _gf_true); - if (ret) { - op_errno = -ret; - goto out; - } + afr_open_fd_fix (fd, this); + + afr_do_ftruncate (frame, this); ret = 0; out: @@ -996,8 +1025,8 @@ afr_setattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, - &local->cont.setattr.preop_buf, - &local->cont.setattr.postop_buf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -1028,29 +1057,14 @@ afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } - - if (child_index == read_child) { - local->cont.setattr.preop_buf = *preop; - local->cont.setattr.postop_buf = *postop; - } - - local->success_count++; + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1205,8 +1219,8 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) if (main_frame) { AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, - &local->cont.fsetattr.preop_buf, - &local->cont.fsetattr.postop_buf, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, NULL); } @@ -1237,29 +1251,14 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->read_child_returned = _gf_true; } - if (afr_fop_failed (op_ret, op_errno)) - afr_transaction_fop_failed (frame, this, child_index); - - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, preop, postop, + xdata); - if (child_index == read_child) { - local->cont.fsetattr.preop_buf = *preop; - local->cont.fsetattr.postop_buf = *postop; - } - - local->success_count++; - - if ((local->success_count >= priv->wait_count) - && local->read_child_returned) { - need_unwind = 1; - } + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; } - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1347,6 +1346,11 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(fsetattr,out); transaction_frame = copy_frame (frame); @@ -1371,11 +1375,7 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, local->fd = fd_ref (fd); - ret = afr_open_fd_fix (transaction_frame, this, _gf_false); - if (ret) { - op_errno = -ret; - goto out; - } + afr_open_fd_fix (fd, this); local->transaction.main_frame = frame; local->transaction.start = LLONG_MAX - 1; @@ -1431,28 +1431,23 @@ int afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->child_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1621,28 +1616,24 @@ int afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - if (local->success_count == priv->child_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->child_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -1734,6 +1725,11 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(fsetxattr,out); AFR_LOCAL_ALLOC_OR_GOTO (local, out); @@ -1816,28 +1812,23 @@ int afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; - - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); + if (local->success_count == priv->wait_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -2005,28 +1996,24 @@ int afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int call_count = -1; - int need_unwind = 0; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int call_count = -1; + int need_unwind = 0; + int child_index = (long) cookie; local = frame->local; priv = this->private; LOCK (&frame->lock); { - if (op_ret != -1) { - if (local->success_count == 0) { - local->op_ret = op_ret; - } - local->success_count++; + __inode_write_fop_cbk (frame, child_index, -1, this, + &op_ret, &op_errno, NULL, NULL, + xdata); - if (local->success_count == priv->wait_count) { - need_unwind = 1; - } + if (local->success_count == priv->wait_count) { + need_unwind = 1; } - - local->op_errno = op_errno; } UNLOCK (&frame->lock); @@ -2119,6 +2106,10 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this->private, out); priv = this->private; + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } QUORUM_CHECK(fremovexattr, out); @@ -2167,3 +2158,704 @@ out: return 0; } + +static int +afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_fallocate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fallocate, + local->fd, + local->cont.fallocate.mode, + local->cont.fallocate.offset, + local->cont.fallocate.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_fallocate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_fallocate (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_FALLOCATE; + + local->transaction.fop = afr_fallocate_wind; + local->transaction.done = afr_fallocate_done; + local->transaction.unwind = afr_fallocate_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; + + /* fallocate can modify the file size */ + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(fallocate,out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_fallocate (frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ discard */ + +static int +afr_discard_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (discard, main_frame, local->op_ret, + local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, + NULL); + } + return 0; +} + +static int +afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + __inode_write_fop_cbk (frame, child_index, read_child, this, + &op_ret, &op_errno, prebuf, postbuf, + xdata); + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_discard_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->discard, + local->fd, + local->cont.discard.offset, + local->cont.discard.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_discard_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_discard (call_frame_t *frame, xlator_t *this) +{ + call_frame_t * transaction_frame = NULL; + afr_local_t * local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_DISCARD; + + local->transaction.fop = afr_discard_wind; + local->transaction.done = afr_discard_done; + local->transaction.unwind = afr_discard_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(discard, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->cont.discard.offset = offset; + local->cont.discard.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_discard(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + } + + return 0; +} + + +/* {{{ zerofill */ + +static int +afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, + local->op_errno, + &local->cont.zerofill.prebuf, + &local->cont.zerofill.postbuf, + NULL); + } + return 0; +} + +static int +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + int read_child = 0; + + local = frame->local; + priv = this->private; + + read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL); + + LOCK (&frame->lock); + { + if (child_index == read_child) { + local->read_child_returned = _gf_true; + } + + if (afr_fop_failed (op_ret, op_errno)) { + afr_transaction_fop_failed (frame, this, child_index); + } + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + if (child_index == read_child) { + local->cont.zerofill.prebuf = *prebuf; + local->cont.zerofill.postbuf = *postbuf; + } + + local->success_count++; + + if ((local->success_count >= priv->wait_count) + && local->read_child_returned) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + +static int +afr_zerofill_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i]) { + STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->zerofill, + local->fd, + local->cont.zerofill.offset, + local->cont.zerofill.len, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} + +static int +afr_zerofill_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + +static int +afr_do_zerofill(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int op_ret = -1; + int op_errno = 0; + + local = frame->local; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + goto out; + } + + transaction_frame->local = local; + frame->local = NULL; + + local->op = GF_FOP_ZEROFILL; + + local->transaction.fop = afr_zerofill_wind; + local->transaction.done = afr_zerofill_done; + local->transaction.unwind = afr_zerofill_unwind; + + local->transaction.main_frame = frame; + + local->transaction.start = local->cont.zerofill.offset; + local->transaction.len = 0; + + op_ret = afr_transaction (transaction_frame, this, + AFR_DATA_TRANSACTION); + if (op_ret < 0) { + op_errno = -op_ret; + goto out; + } + + op_ret = 0; +out: + if (op_ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, op_ret, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +int +afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + if (afr_is_split_brain (this, fd->inode)) { + op_errno = EIO; + goto out; + } + QUORUM_CHECK(zerofill, out); + + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) { + goto out; + } + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; + + local->fd = fd_ref (fd); + + afr_open_fd_fix (fd, this); + + afr_do_zerofill(frame, this); + + ret = 0; +out: + if (ret < 0) { + if (transaction_frame) { + AFR_STACK_DESTROY (transaction_frame); + } + AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, + NULL, NULL); + } + + return 0; +} + +/* }}} */ + + diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index ed11079fd..8e93ca44a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -68,4 +68,15 @@ int32_t afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, dict_t *xdata); +int +afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); + +int +afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); + +int +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 5e61be4d4..060d78f35 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -55,7 +55,33 @@ } while (0); int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); +afr_entry_lockee_cmp (const void *l1, const void *l2) +{ + const afr_entry_lockee_t *r1 = l1; + const afr_entry_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid ((loc_t*)&r1->loc, gfid1); + loc_gfid ((loc_t*)&r2->loc, gfid2); + ret = uuid_compare (gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp (r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 1; +} + +int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); static int afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); @@ -125,16 +151,9 @@ internal_lock_count (call_frame_t *frame, xlator_t *this) local = frame->local; priv = this->private; - if (local->fd) { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && local->fd_open_on[i]) - ++call_count; - } - } else { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) - ++call_count; - } + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + ++call_count; } return call_count; @@ -332,10 +351,13 @@ static void afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, const char *basename, - int32_t child_index) + int32_t cookie) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; + int child_index = 0; + int lockee_no = 0; char lock[256]; char lockee[256]; @@ -343,28 +365,40 @@ afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, local = frame->local; int_lock = &local->internal_lock; + priv = this->private; + + if (!priv->entrylk_trace) { + return; + } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); gf_log (this->name, GF_LOG_INFO, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", + "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } static void afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, afr_lock_call_type_t lock_call_type, afr_lock_op_type_t lk_op_type, const char *basename, - int op_ret, int op_errno, int32_t child_index) + int op_ret, int op_errno, int32_t cookie) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lockee_no = 0; + int child_index = 0; char lock[256]; char lockee[256]; @@ -373,20 +407,30 @@ afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, local = frame->local; int_lock = &local->internal_lock; + priv = this->private; - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); + if (!priv->entrylk_trace) { + return; + } + lockee_no = cookie / priv->child_count; + child_index = cookie % priv->child_count; + + afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); + afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, + child_index); afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); afr_print_verdict (op_ret, op_errno, verdict); gf_log (this->name, GF_LOG_INFO, - "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}", + "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", lock_call_type_str, lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", verdict, lock, lockee, - (unsigned long long) int_lock->lock_number); + (unsigned long long) int_lock->lock_number, + cookie); } @@ -439,6 +483,47 @@ is_afr_lock_transaction (afr_local_t *local) return ret; } +int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count) +{ + int ret = -1; + + loc_copy (&lockee->loc, loc); + lockee->basename = (basename)? gf_strdup (basename): NULL; + if (basename && !lockee->basename) + goto out; + + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC (child_count, + sizeof (*lockee->locked_nodes), + gf_afr_mt_afr_node_character); + + if (!lockee->locked_nodes) + goto out; + + ret = 0; +out: + return ret; + +} + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock) +{ + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) { + loc_wipe (&int_lock->lockee[i].loc); + if (int_lock->lockee[i].basename) + GF_FREE (int_lock->lockee[i].basename); + if (int_lock->lockee[i].locked_nodes) + GF_FREE (int_lock->lockee[i].locked_nodes); + } + + return; +} + static int initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) { @@ -456,8 +541,13 @@ initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) int_lock->lock_op_ret = -1; int_lock->lock_op_errno = 0; - for (i = 0; i < priv->child_count; i++) { - int_lock->entry_locked_nodes[i] = 0; + for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { + if (!int_lock->lockee[i].locked_nodes) + break; + int_lock->lockee[i].locked_count = 0; + memset (int_lock->lockee[i].locked_nodes, 0, + sizeof (*int_lock->lockee[i].locked_nodes) * + priv->child_count); } return 0; @@ -469,19 +559,23 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_private_t *priv = NULL; - int i = 0; + afr_inodelk_t *inodelk = NULL; priv = this->private; local = frame->local; int_lock = &local->internal_lock; - int_lock->inodelk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - for (i = 0; i < priv->child_count; i++) { - int_lock->inode_locked_nodes[i] = 0; - } + inodelk->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + + memset (inodelk->locked_nodes, 0, + sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); return 0; } @@ -503,6 +597,18 @@ lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) } int +afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) +{ + int call_count = 0; + int i = 0; + + for (i = 0; i < int_lock->lockee_count; i++) + call_count += int_lock->lockee[i].locked_count; + + return call_count; +} + +int afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) { @@ -550,7 +656,9 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; local = frame->local; int_lock = &local->internal_lock; @@ -559,14 +667,18 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, AFR_UNLOCK_OP, NULL, op_ret, op_errno, child_index); + priv = this->private; + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on %d " - "unlock by %s", local->loc.path, child_index, + gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s " + "with lock owner %s", local->loc.path, + priv->children[child_index]->name, lkowner_utoa (&frame->root->lk_owner)); } - int_lock->inode_locked_nodes[child_index] &= LOCKED_NO; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->locked_nodes[child_index] &= LOCKED_NO; if (local->transaction.eager_lock) local->transaction.eager_lock[child_index] = 0; @@ -580,6 +692,7 @@ static int afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; struct gf_flock flock = {0,}; @@ -595,12 +708,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; flock.l_type = F_UNLCK; full_flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes, + call_count = afr_locked_nodes_count (inodelk->locked_nodes, priv->child_count); int_lock->lk_call_count = call_count; @@ -616,8 +731,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) fd_ctx = afr_fd_ctx_get (local->fd, this); for (i = 0; i < priv->child_count; i++) { - if ((int_lock->inode_locked_nodes[i] & LOCKED_YES) - != LOCKED_YES) + if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) continue; if (local->fd) { @@ -658,7 +772,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long)i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLK, flock_use, NULL); if (!--call_count) @@ -673,7 +787,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long)i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLK, &flock, NULL); if (!--call_count) @@ -689,13 +803,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_local_t *local = NULL; - int32_t child_index = (long)cookie; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int32_t child_index = 0; + int lockee_no = 0; + + priv = this->private; + lockee_no = (int)((long) cookie) / priv->child_count; + child_index = (int) ((long) cookie) % priv->child_count; local = frame->local; + int_lock = &local->internal_lock; AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_UNLOCK_OP, NULL, op_ret, - op_errno, child_index); + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, + op_errno, (int) ((long)cookie)); if (op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, @@ -703,6 +826,7 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, child_index, strerror (op_errno)); } + int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO; afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL); return 0; @@ -711,24 +835,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, static int afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int call_count = 0; - int i = -1; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int index = 0; + int lockee_no = 0; + int copies = 0; + int i = -1; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; + call_count = afr_lockee_locked_nodes_count (int_lock); - call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes, - priv->child_count); int_lock->lk_call_count = call_count; if (!call_count){ @@ -738,18 +860,22 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) goto out; } - for (i = 0; i < priv->child_count; i++) { - if (int_lock->entry_locked_nodes[i] & LOCKED_YES) { - AFR_TRACE_ENTRYLK_IN (frame, this, - AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + lockee_no = i / copies; + index = i % copies; + if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_UNLOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, + priv->children[index], + priv->children[index]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); if (!--call_count) @@ -766,13 +892,20 @@ static int32_t afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - int child_index = (long) cookie; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int cky = (long) cookie; + int child_index = 0; + int lockee_no = 0; + priv = this->private; local = frame->local; int_lock = &local->internal_lock; + child_index = ((int)cky) % priv->child_count; + lockee_no = ((int)cky) / priv->child_count; + LOCK (&frame->lock); { if (op_ret == -1) { @@ -788,6 +921,8 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_errno = op_errno; int_lock->lock_op_errno = op_errno; } + + int_lock->lk_attempted_count++; } UNLOCK (&frame->lock); @@ -796,10 +931,17 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_unlock (frame, this); } else { if (op_ret == 0) { - int_lock->locked_nodes[child_index] |= LOCKED_YES; - int_lock->lock_count++; + if (local->transaction.type == AFR_ENTRY_TRANSACTION || + local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } else { + int_lock->locked_nodes[child_index] |= LOCKED_YES; + int_lock->lock_count++; + } } - afr_lock_blocking (frame, this, child_index + 1); + afr_lock_blocking (frame, this, cky + 1); } return 0; @@ -819,79 +961,6 @@ afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } static int32_t -afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *higher_name = NULL; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - - local->op_ret = op_ret; - } - - local->op_errno = op_errno; - } - } - UNLOCK (&frame->lock); - - if (op_ret != 0) { - afr_copy_locked_nodes (frame, this); - afr_unlock (frame, this); - goto out; - } else { - int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER; - int_lock->lock_count++; - } - - /* The lower path has been locked. Now lock the higher path */ - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, higher_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, higher, higher_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); - -out: - return 0; -} - -static int32_t afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { @@ -907,6 +976,7 @@ static int afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; @@ -917,18 +987,16 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - memcpy (int_lock->inode_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->inodelk_lock_count = int_lock->lock_count; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + memcpy (inodelk->locked_nodes, int_lock->locked_nodes, + sizeof (*inodelk->locked_nodes) * priv->child_count); + inodelk->lock_count = int_lock->lock_count; break; case AFR_ENTRY_RENAME_TRANSACTION: case AFR_ENTRY_TRANSACTION: - memcpy (int_lock->entry_locked_nodes, - int_lock->locked_nodes, - priv->child_count); - int_lock->entrylk_lock_count = int_lock->lock_count; + /*entrylk_count is being used in both non-blocking and blocking + * modes */ break; } @@ -936,25 +1004,67 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) } +static inline gf_boolean_t +afr_is_entrylk (afr_internal_lock_t *int_lock, + afr_transaction_type trans_type) +{ + gf_boolean_t is_entrylk = _gf_false; + + if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && + int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { + + is_entrylk = _gf_true; + + } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && + (trans_type == AFR_ENTRY_TRANSACTION || + trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { + + is_entrylk = _gf_true; + + } else { + is_entrylk = _gf_false; + } + + return is_entrylk; +} + +static gf_boolean_t +_is_lock_wind_needed (afr_local_t *local, int child_index) +{ + if (!local->child_up[child_index]) + return _gf_false; + + return _gf_true; +} + int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) +afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; - loc_t *lower = NULL; - const char *lower_name = NULL; struct gf_flock flock = {0,}; uint64_t ctx = 0; int ret = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + int child_index = 0; + int lockee_no = 0; + gf_boolean_t is_entrylk = _gf_false; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + child_index = cookie % priv->child_count; + lockee_no = cookie / priv->child_count; + is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); + + + if (!is_entrylk) { + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; + } if (local->fd) { ret = fd_ctx_get (local->fd, this, &ctx); @@ -973,42 +1083,26 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) return 0; } - - /* skip over children that or down - or don't have the fd open */ - - while ((child_index < priv->child_count) - && (!local->child_up[child_index] || - !local->fd_open_on[child_index])) - - child_index++; - } else { - /* skip over children that are down */ - while ((child_index < priv->child_count) - && !local->child_up[child_index]) - child_index++; } - if ((child_index == priv->child_count) && - int_lock->lock_count == 0) { - - gf_log (this->name, GF_LOG_INFO, - "unable to lock on even one child"); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + if ((is_entrylk && int_lock->entrylk_lock_count == 0) || + (!is_entrylk && int_lock->lock_count == 0)) { + gf_log (this->name, GF_LOG_INFO, + "unable to lock on even one child"); - afr_copy_locked_nodes (frame, this); + local->op_ret = -1; + int_lock->lock_op_ret = -1; - afr_unlock(frame, this); + afr_copy_locked_nodes (frame, this); - return 0; + afr_unlock(frame, this); + return 0; + } } - if ((child_index == priv->child_count) - || (int_lock->lock_count == int_lock->lk_expected_count)) { - + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { /* we're done locking */ gf_log (this->name, GF_LOG_DEBUG, @@ -1021,6 +1115,11 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) return 0; } + if (!_is_lock_wind_needed (local, child_index)) { + afr_lock_blocking (frame, this, cookie + 1); + return 0; + } + switch (local->transaction.type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: @@ -1035,7 +1134,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLKW, &flock, NULL); } else { @@ -1048,50 +1147,29 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) (void *) (long) child_index, priv->children[child_index], priv->children[child_index]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLKW, &flock, NULL); } break; case AFR_ENTRY_RENAME_TRANSACTION: - { - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, lower_name, child_index); - - - STACK_WIND_COOKIE (frame, afr_lock_lower_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - this->name, lower, lower_name, - ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); - - break; - } - case AFR_ENTRY_TRANSACTION: + /*Accounting for child_index increments on 'down' + *and 'fd-less' children */ + if (local->fd) { - AFR_TRACE_ENTRYLK_IN (frame, this, - AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, local->transaction.basename, - child_index); + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + cookie); STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, + (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->fentrylk, - this->name, local->fd, - local->transaction.basename, + int_lock->domain, local->fd, + int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } else { AFR_TRACE_ENTRYLK_IN (frame, this, @@ -1100,12 +1178,12 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index) child_index); STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) child_index, + (void *) (long) cookie, priv->children[child_index], priv->children[child_index]->fops->entrylk, - this->name, - &local->transaction.parent_loc, - local->transaction.basename, + int_lock->domain, + &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); } @@ -1134,11 +1212,12 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this) break; case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: up_count = afr_up_children_count (local->child_up, priv->child_count); - int_lock->lk_expected_count = 2 * up_count; - //fallthrough - case AFR_ENTRY_TRANSACTION: + int_lock->lk_call_count = int_lock->lk_expected_count + = (int_lock->lockee_count * + up_count); initialize_entrylk_variables (frame, this); break; } @@ -1154,35 +1233,48 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; - int call_count = 0; - int child_index = (long) cookie; + int call_count = 0; + int child_index = (long) cookie; + int copies = 0; + int index = 0; + int lockee_no = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + copies = priv->child_count; + index = child_index % copies; + lockee_no = child_index / copies; local = frame->local; int_lock = &local->internal_lock; AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, op_ret, op_errno, (long) cookie); - if (op_ret < 0 ) { - if (op_errno == ENOSYS) { + LOCK (&frame->lock); + { + if (op_ret < 0 ) { + if (op_errno == ENOSYS) { /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->entry_locked_nodes[child_index] |= LOCKED_YES; - int_lock->entrylk_lock_count++; - } + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + } else if (op_ret == 0) { + int_lock->lockee[lockee_no].locked_nodes[index] |= \ + LOCKED_YES; + int_lock->lockee[lockee_no].locked_count++; + int_lock->entrylk_lock_count++; + } - LOCK (&frame->lock); - { call_count = --int_lock->lk_call_count; } UNLOCK (&frame->lock); @@ -1212,42 +1304,26 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } -void -afr_mark_fd_open_on (afr_local_t *local, afr_fd_ctx_t *fd_ctx, - size_t child_count) -{ - int i = 0; - - GF_ASSERT (local->fd_open_on); - - memset (local->fd_open_on, 0, sizeof (*local->fd_open_on)*child_count); - for (i = 0; i < child_count; i++) - if (fd_ctx->opened_on[i] == AFR_FD_OPENED) - local->fd_open_on[i] = 1; -} - int afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int32_t call_count = 0; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int copies = 0; + int index = 0; + int lockee_no = 0; + int32_t call_count = 0; int i = 0; local = frame->local; int_lock = &local->internal_lock; priv = this->private; + copies = priv->child_count; initialize_entrylk_variables (frame, this); - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - if (local->fd) { fd_ctx = afr_fd_ctx_get (local->fd, this); if (!fd_ctx) { @@ -1260,11 +1336,11 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); return -1; } - afr_mark_fd_open_on (local, fd_ctx, priv->child_count); - call_count = internal_lock_count (frame, this); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; int_lock->lk_expected_count = call_count; @@ -1277,46 +1353,52 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) /* Send non-blocking entrylk calls only on up children and where the fd has been opened */ - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && local->fd_open_on[i]) { - AFR_TRACE_ENTRYLK_IN (frame, this, - AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fentrylk, + priv->children[index], + priv->children[index]->fops->fentrylk, this->name, local->fd, - basename, + int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); + if (!--call_count) + break; } } } else { - GF_ASSERT (loc); - - call_count = internal_lock_count (frame, this); + call_count = int_lock->lockee_count * internal_lock_count (frame, this); int_lock->lk_call_count = call_count; int_lock->lk_expected_count = call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - AFR_TRACE_ENTRYLK_IN (frame, this, - AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, basename, i); + for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { + AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, + AFR_LOCK_OP, + int_lock->lockee[lockee_no].basename, + i); STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, loc, basename, + priv->children[index], + priv->children[index]->fops->entrylk, + this->name, &int_lock->lockee[lockee_no].loc, + int_lock->lockee[lockee_no].basename, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); if (!--call_count) break; - } } } @@ -1329,6 +1411,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; int call_count = 0; int child_index = (long) cookie; @@ -1337,58 +1420,57 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, AFR_LOCK_OP, NULL, op_ret, op_errno, (long) cookie); - if (op_ret < 0) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_log (this->name, GF_LOG_ERROR, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - if (local->transaction.eager_lock) - local->transaction.eager_lock[child_index] = 0; - } else { - int_lock->inode_locked_nodes[child_index] - |= LOCKED_YES; - int_lock->inodelk_lock_count++; - - if (local->transaction.eager_lock && - local->transaction.eager_lock[child_index] && local->fd) { - fd_ctx = afr_fd_ctx_get (local->fd, this); - /* piggybacked */ - - if (op_ret == 1) { - /* piggybacked */ - } else if (op_ret == 0) { - /* lock acquired from server */ - LOCK (&local->fd->lock); - { - fd_ctx->lock_acquired[child_index]++; - } - UNLOCK (&local->fd->lock); - } - } - } + if (local->fd) + fd_ctx = afr_fd_ctx_get (local->fd, this); LOCK (&frame->lock); { + if (op_ret < 0) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/locks xlator on " + "server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + if (local->transaction.eager_lock) + local->transaction.eager_lock[child_index] = 0; + } else { + inodelk->locked_nodes[child_index] |= LOCKED_YES; + inodelk->lock_count++; + + if (local->transaction.eager_lock && + local->transaction.eager_lock[child_index] && + local->fd) { + /* piggybacked */ + if (op_ret == 1) { + /* piggybacked */ + } else if (op_ret == 0) { + /* lock acquired from server */ + fd_ctx->lock_acquired[child_index]++; + } + } + } + call_count = --int_lock->lk_call_count; } UNLOCK (&frame->lock); + if (call_count == 0) { gf_log (this->name, GF_LOG_TRACE, "Last inode locking reply received"); /* all locks successful. Proceed to call FOP */ - if (int_lock->inodelk_lock_count == - int_lock->lk_expected_count) { + if (inodelk->lock_count == int_lock->lk_expected_count) { gf_log (this->name, GF_LOG_TRACE, "All servers locked. Calling the cbk"); int_lock->lock_op_ret = 0; @@ -1412,6 +1494,7 @@ int afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_fd_ctx_t *fd_ctx = NULL; @@ -1427,11 +1510,13 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; priv = this->private; - flock.l_start = int_lock->lk_flock.l_start; - flock.l_len = int_lock->lk_flock.l_len; - flock.l_type = int_lock->lk_flock.l_type; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + + flock.l_start = inodelk->flock.l_start; + flock.l_len = inodelk->flock.l_len; + flock.l_type = inodelk->flock.l_type; - full_flock.l_type = int_lock->lk_flock.l_type; + full_flock.l_type = inodelk->flock.l_type; initialize_inodelk_variables (frame, this); @@ -1447,11 +1532,11 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) local->op_errno = EINVAL; int_lock->lock_op_errno = EINVAL; + afr_unlock (frame, this); ret = -1; goto out; } - afr_mark_fd_open_on (local, fd_ctx, priv->child_count); call_count = internal_lock_count (frame, this); int_lock->lk_call_count = call_count; int_lock->lk_expected_count = call_count; @@ -1466,11 +1551,11 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) /* Send non-blocking inodelk calls only on up children and where the fd has been opened */ for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i] || !local->fd_open_on[i]) + if (!local->child_up[i]) continue; flock_use = &flock; - if (!priv->eager_lock) { + if (!local->transaction.eager_lock_on) { goto wind; } @@ -1506,7 +1591,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->finodelk, - this->name, local->fd, + int_lock->domain, local->fd, F_SETLK, flock_use, NULL); if (!--call_count) @@ -1528,7 +1613,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->inodelk, - this->name, &local->loc, + int_lock->domain, &local->loc, F_SETLK, &flock, NULL); if (!--call_count) @@ -1539,201 +1624,6 @@ out: return ret; } -static int -__is_lower_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) - count++; - } - - return count; - -} - -static int -__is_higher_locked (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->locked_nodes[i] & LOCKED_YES) - count++; - } - - return count; - -} - -static int -afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - const char *basename = NULL; - loc_t *loc = NULL; - int call_count = 0; - int i = -1; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - basename = int_lock->lk_basename; - if (int_lock->lk_loc) - loc = int_lock->lk_loc; - - call_count = __is_lower_locked (frame, this); - int_lock->lk_call_count = call_count; - - if (!call_count){ - gf_log (this->name, GF_LOG_TRACE, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) { - AFR_TRACE_ENTRYLK_IN (frame, this, - AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, basename, i); - - STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - this->name, - loc, basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); - - if (!--call_count) - break; - - } - } - -out: - return 0; - -} - - -static int -afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - local->transaction.done (frame, this); - return 0; -} - -static int -afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - loc_t *higher = NULL; - const char *higher_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - higher = (lower == &local->transaction.parent_loc ? - &local->transaction.new_parent_loc : - &local->transaction.parent_loc); - - higher_name = (higher == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - if (__is_higher_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking higher"); - int_lock->lk_basename = higher_name; - int_lock->lk_loc = higher; - int_lock->lock_cbk = afr_post_unlock_higher_cbk; - - afr_unlock_entrylk (frame, this); - } else - local->transaction.done (frame, this); - - return 0; -} - -static int -afr_rename_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - loc_t *lower = NULL; - const char *lower_name = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - lower = lower_path (&local->transaction.parent_loc, - local->transaction.basename, - &local->transaction.new_parent_loc, - local->transaction.new_basename); - - lower_name = (lower == &local->transaction.parent_loc ? - local->transaction.basename : - local->transaction.new_basename); - - if (__is_lower_locked (frame, this)) { - gf_log (this->name, GF_LOG_DEBUG, - "unlocking lower"); - int_lock->lk_basename = lower_name; - int_lock->lk_loc = lower; - int_lock->lock_cbk = afr_post_unlock_lower_cbk; - - afr_unlock_lower_entrylk (frame, this); - } else - afr_post_unlock_lower_cbk (frame, this); - - return 0; -} - -static int -afr_rename_transaction (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - return (local->transaction.type == - AFR_ENTRY_RENAME_TRANSACTION); - -} - int32_t afr_unlock (call_frame_t *frame, xlator_t *this) { @@ -1745,10 +1635,8 @@ afr_unlock (call_frame_t *frame, xlator_t *this) if (is_afr_lock_transaction (local)) afr_unlock_inodelk (frame, this); else - if (!afr_rename_transaction (frame, this)) - afr_unlock_entrylk (frame, this); - else - afr_rename_unlock (frame, this); + afr_unlock_entrylk (frame, this); + } else { if (is_afr_lock_selfheal (local)) afr_unlock_inodelk (frame, this); @@ -2249,29 +2137,38 @@ out: return ret; } -void -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { - afr_local_t *dst_local = NULL; - afr_local_t *src_local = NULL; - afr_internal_lock_t *dst_lock = NULL; - afr_internal_lock_t *src_lock = NULL; + afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_internal_lock_t *dst_lock = NULL; + afr_internal_lock_t *src_lock = NULL; + afr_inodelk_t *dst_inodelk = NULL; + afr_inodelk_t *src_inodelk = NULL; + int ret = -1; - dst_local = dst->local; - dst_lock = &dst_local->internal_lock; src_local = src->local; src_lock = &src_local->internal_lock; - if (src_lock->inode_locked_nodes) { - memcpy (dst_lock->inode_locked_nodes, - src_lock->inode_locked_nodes, - sizeof (*dst_lock->inode_locked_nodes) * child_count); - memset (src_lock->inode_locked_nodes, 0, - sizeof (*src_lock->inode_locked_nodes) * child_count); + src_inodelk = afr_get_inodelk (src_lock, dom); + dst_local = dst->local; + dst_lock = &dst_local->internal_lock; + dst_inodelk = afr_get_inodelk (dst_lock, dom); + if (!dst_inodelk || !src_inodelk) + goto out; + if (src_inodelk->locked_nodes) { + memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, + sizeof (*dst_inodelk->locked_nodes) * child_count); + memset (src_inodelk->locked_nodes, 0, + sizeof (*src_inodelk->locked_nodes) * child_count); } dst_lock->transaction_lk_type = src_lock->transaction_lk_type; dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; - dst_lock->inodelk_lock_count = src_lock->inodelk_lock_count; - src_lock->inodelk_lock_count = 0; + dst_inodelk->lock_count = src_inodelk->lock_count; + src_inodelk->lock_count = 0; + ret = 0; +out: + return ret; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index e01ab366f..73594f265 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -41,7 +41,10 @@ enum gf_afr_mem_types_ { gf_afr_mt_shd_event_t, gf_afr_mt_time_t, gf_afr_mt_pos_data_t, - gf_afr_mt_reply_t, + gf_afr_mt_reply_t, + gf_afr_mt_stats_t, + gf_afr_mt_shd_crawl_event_t, + gf_afr_mt_uint64_t, gf_afr_mt_end }; #endif diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index c0be197f2..643a5d692 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -249,189 +249,126 @@ out: return 0; } -//NOTE: this function should be called with holding the lock on -//fd to which fd_ctx belongs -void -afr_get_resumable_calls (xlator_t *this, afr_fd_ctx_t *fd_ctx, - struct list_head *list) -{ - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; - afr_local_t *call_local = NULL; - afr_private_t *priv = NULL; - int i = 0; - gf_boolean_t call = _gf_false; - - priv = this->private; - list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls, - call_list) { - call = _gf_true; - call_local = paused_call->frame->local; - for (i = 0; i < priv->child_count; i++) { - if (call_local->child_up[i] && - (fd_ctx->opened_on[i] == AFR_FD_OPENING)) - call = _gf_false; - } - - if (call) { - list_del_init (&paused_call->call_list); - list_add (&paused_call->call_list, list); - } - } -} - -void -afr_resume_calls (xlator_t *this, struct list_head *list) -{ - afr_fd_paused_call_t *paused_call = NULL; - afr_fd_paused_call_t *tmp = NULL; - afr_local_t *call_local = NULL; - - list_for_each_entry_safe (paused_call, tmp, list, call_list) { - list_del_init (&paused_call->call_list); - call_local = paused_call->frame->local; - call_local->fop_call_continue (paused_call->frame, this); - GF_FREE (paused_call); - } -} - int afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int call_count = 0; - int child_index = (long) cookie; - struct list_head paused_calls = {0}; - gf_boolean_t fop_paused = _gf_false; - fd_t *local_fd = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + int child_index = (long) cookie; priv = this->private; local = frame->local; - fop_paused = local->fop_paused; if (op_ret >= 0) { - gf_log (this->name, GF_LOG_INFO, "fd for %s opened " + gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened " "successfully on subvolume %s", local->loc.path, priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, "Failed to open %s " + "on subvolume %s", local->loc.path, + priv->children[child_index]->name); } - local_fd = fd_ref (local->fd); - call_count = afr_frame_return (frame); - //Note: Do not access any thing using the frame outside call_count 0 - - //Note: No frame locking needed for this block of code - fd_ctx = afr_fd_ctx_get (local_fd, this); + fd_ctx = afr_fd_ctx_get (local->fd, this); if (!fd_ctx) { gf_log (this->name, GF_LOG_WARNING, - "failed to get fd context, %p", local_fd); + "failed to get fd context, %p", local->fd); goto out; } - LOCK (&local_fd->lock); + LOCK (&local->fd->lock); { if (op_ret >= 0) { fd_ctx->opened_on[child_index] = AFR_FD_OPENED; } else { fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; } - if (call_count == 0) { - INIT_LIST_HEAD (&paused_calls); - afr_get_resumable_calls (this, fd_ctx, &paused_calls); - } } - UNLOCK (&local_fd->lock); + UNLOCK (&local->fd->lock); out: - if (call_count == 0) { - afr_resume_calls (this, &paused_calls); - //If the fop is paused then resume_calls will continue the fop - if (fop_paused) - goto done; - - if (local->fop_call_continue) - local->fop_call_continue (frame, this); - else - AFR_STACK_DESTROY (frame); - } + call_count = afr_frame_return (frame); + if (call_count == 0) + AFR_STACK_DESTROY (frame); -done: - fd_unref (local_fd); return 0; } -int -afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, - int need_open_count, int *need_open) +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - call_frame_t *open_frame = NULL; - afr_local_t *open_local = NULL; - int ret = -1; - ia_type_t ia_type = IA_INVAL; - int32_t op_errno = 0; - - GF_ASSERT (fd_ctx); - GF_ASSERT (need_open_count > 0); - GF_ASSERT (need_open); + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; - local = frame->local; priv = this->private; - if (!local->fop_call_continue) { - open_frame = copy_frame (frame); - if (!open_frame) { - ret = -ENOMEM; - goto out; - } - AFR_LOCAL_ALLOC_OR_GOTO (open_frame->local, out); - open_local = open_frame->local; - ret = afr_local_init (open_local, priv, &op_errno); - if (ret < 0) - goto out; - loc_copy (&open_local->loc, &local->loc); - open_local->fd = fd_ref (local->fd); - } else { - ret = 0; - open_frame = frame; - open_local = local; + + if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count) + goto out; + + fd_ctx = afr_fd_ctx_get (fd, this); + if (!fd_ctx) { + ret = -1; + goto out; } - open_local->call_count = need_open_count; + frame = create_frame (this, this->ctx->pool); + if (!frame) { + ret = -1; + goto out; + } - gf_log (this->name, GF_LOG_DEBUG, "need open count: %d", + AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out); + local = frame->local; + ret = afr_local_init (local, priv, &op_errno); + if (ret < 0) + goto out; + + local->loc.inode = inode_ref (fd->inode); + ret = loc_path (&local->loc, NULL); + if (ret < 0) + goto out; + + local->fd = fd_ref (fd); + local->call_count = need_open_count; + + gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd", need_open_count); - ia_type = open_local->fd->inode->ia_type; - GF_ASSERT (ia_type != IA_INVAL); for (i = 0; i < priv->child_count; i++) { if (!need_open[i]) continue; - if (IA_IFDIR == ia_type) { + + if (IA_IFDIR == fd->inode->ia_type) { gf_log (this->name, GF_LOG_DEBUG, "opening fd for dir %s on subvolume %s", local->loc.path, priv->children[i]->name); - STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk, + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, (void*) (long) i, priv->children[i], priv->children[i]->fops->opendir, - &open_local->loc, open_local->fd, + &local->loc, local->fd, NULL); } else { gf_log (this->name, GF_LOG_DEBUG, "opening fd for file %s on subvolume %s", local->loc.path, priv->children[i]->name); - STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk, + STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, (void *)(long) i, priv->children[i], priv->children[i]->fops->open, - &open_local->loc, + &local->loc, fd_ctx->flags & (~O_TRUNC), - open_local->fd, NULL); + local->fd, NULL); } } @@ -439,8 +376,7 @@ afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, ret = 0; out: if (op_errno) - ret = -op_errno; - if (ret && open_frame) - AFR_STACK_DESTROY (open_frame); - return ret; + ret = -1; //For handling ALLOC_OR_GOTO + if (ret && frame) + AFR_STACK_DESTROY (frame); } diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c index 1721fd270..83846f152 100644 --- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c +++ b/xlators/cluster/afr/src/afr-self-heal-algorithm.c @@ -100,7 +100,7 @@ sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this, } sh_private_cleanup (sh_frame, this); - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { GF_ASSERT (!last_loop_frame); //loop_finish should have happened and the old_loop should be NULL gf_log (this->name, GF_LOG_DEBUG, @@ -143,7 +143,7 @@ sh_loop_finish (call_frame_t *loop_frame, xlator_t *this) } if (loop_sh && loop_sh->data_lock_held) { - afr_sh_data_unlock (loop_frame, this, + afr_sh_data_unlock (loop_frame, this, this->name, sh_destroy_frame); } else { sh_destroy_frame (loop_frame, this); @@ -214,7 +214,7 @@ sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this, goto out; //We want the frame to have same lk_owner as sh_frame //so that locks translator allows conflicting locks - new_loop_local = afr_local_copy (local, this); + new_loop_local = afr_self_heal_local_init (local, this); if (!new_loop_local) goto out; new_loop_frame->local = new_loop_local; @@ -273,10 +273,10 @@ sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset, new_loop_sh->offset = offset; new_loop_sh->block_size = sh->block_size; afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size, - _gf_true, sh_loop_lock_success, sh_loop_lock_failure); + _gf_true, this->name, sh_loop_lock_success, sh_loop_lock_failure); return 0; out: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); if (old_loop_frame) sh_loop_finish (old_loop_frame, this); sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM); @@ -307,8 +307,9 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, sh_priv->loops_running--; offset = sh_priv->offset; block_size = sh->block_size; - while ((!sh->eof_reached) && (0 == sh->op_failed) && - (sh_priv->loops_running < priv->data_self_heal_window_size) + while ((!sh->eof_reached) && + (!is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) && + (sh_priv->loops_running < priv->data_self_heal_window_size) && (sh_priv->offset < sh->file_size)) { loop++; @@ -327,7 +328,8 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, if (0 == loop) { //loop finish does unlock, but the erasing of the pending //xattrs needs to happen before that so do not finish the loop - if (is_driver_done && !sh->op_failed) + if (is_driver_done && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) goto driver_done; if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -338,7 +340,7 @@ sh_loop_driver (call_frame_t *sh_frame, xlator_t *this, //If we have more loops to form we should finish previous loop after //the next loop lock while (loop--) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { // op failed in other loop, stop spawning more loops if (old_loop_frame) { sh_loop_finish (old_loop_frame, this); @@ -384,7 +386,7 @@ sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame } if (op_ret == -1) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); if (loop_frame) { sh_loop_finish (loop_frame, this); @@ -432,16 +434,16 @@ sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (loop_sh, op_errno); } else if (op_ret < loop_local->cont.writev.vector->iov_len) { - gf_log(this->name, GF_LOG_ERROR, - "incomplete write to %s on subvolume %s " - "(expected %lu, returned %d)", sh_local->loc.path, - priv->children[child_index]->name, - loop_local->cont.writev.vector->iov_len, op_ret); - sh->op_failed = 1; - } + gf_log (this->name, GF_LOG_ERROR, + "incomplete write to %s on subvolume %s " + "(expected %lu, returned %d)", sh_local->loc.path, + priv->children[child_index]->name, + loop_local->cont.writev.vector->iov_len, op_ret); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } call_count = afr_frame_return (loop_frame); @@ -514,7 +516,7 @@ sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie, if (op_ret <= 0) { if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); gf_log (this->name, GF_LOG_ERROR, "read failed on %d " "for %s reason :%s", sh->source, sh_local->loc.path, strerror (errno)); @@ -624,7 +626,7 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, "checksum on %s failed on subvolume %s (%s)", sh_local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH, strong_checksum, MD5_DIGEST_LENGTH); @@ -662,7 +664,8 @@ sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this, } UNLOCK (&sh_priv->lock); - if (write_needed && !sh->op_failed) { + if (write_needed && + !is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh_loop_read (loop_frame, this); } else { sh_loop_return (sh_frame, this, loop_frame, @@ -751,14 +754,15 @@ out: return sh_priv; } -void -afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, +int +afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count) { afr_local_t *dst_local = NULL; afr_self_heal_t *dst_sh = NULL; afr_local_t *src_local = NULL; afr_self_heal_t *src_sh = NULL; + int ret = -1; dst_local = dst->local; dst_sh = &dst_local->self_heal; @@ -766,9 +770,12 @@ afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src, src_sh = &src_local->self_heal; GF_ASSERT (src_sh->data_lock_held); GF_ASSERT (!dst_sh->data_lock_held); - afr_lk_transfer_datalock (dst, src, child_count); + ret = afr_lk_transfer_datalock (dst, src, dom, child_count); + if (ret) + return ret; src_sh->data_lock_held = _gf_false; dst_sh->data_lock_held = _gf_true; + return 0; } int @@ -790,7 +797,10 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame); if (ret) goto out; - afr_sh_transfer_lock (first_loop_frame, sh_frame, priv->child_count); + ret = afr_sh_transfer_lock (first_loop_frame, sh_frame, this->name, + priv->child_count); + if (ret) + goto out; sh->private = afr_sh_priv_init (); if (!sh->private) { ret = -1; @@ -800,7 +810,7 @@ afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this, ret = 0; out: if (ret) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); sh_loop_driver_done (sh_frame, this, NULL); } return 0; diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 07603c5b2..ef92b4205 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -18,6 +18,28 @@ #include "afr-self-heal.h" #include "pump.h" +#define ADD_FMT_STRING(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_NOT_ATTEMPTED != status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + +#define ADD_FMT_STRING_SYNC(msg, off, sh_str, status, print_log) \ + do { \ + if (AFR_SELF_HEAL_SYNC_BEGIN == status || \ + AFR_SELF_HEAL_FAILED == status) { \ + off += snprintf (msg + off, sizeof (msg) - off, \ + " "sh_str" self heal %s,", \ + get_sh_completion_status (status));\ + print_log = 1; \ + } \ + } while (0) + + void afr_sh_reset (call_frame_t *frame, xlator_t *this) { @@ -112,8 +134,8 @@ void afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno) { sh->op_ret = -1; - if (afr_error_more_important (sh->op_errno, op_errno)) - sh->op_errno = op_errno; + sh->op_errno = afr_most_important_error(sh->op_errno, op_errno, + _gf_false); } void @@ -141,6 +163,79 @@ afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) GF_FREE (buf); } +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this) +{ + afr_private_t * priv = this->private; + char *buf = NULL; + char *ptr = NULL; + int i = 0; + int j = 0; + int child_count = priv->child_count; + char *matrix_begin = "[ [ "; + char *matrix_end = "] ]"; + char *seperator = "] [ "; + int pending_entry_strlen = 12; //Including space after entry + int matrix_begin_strlen = 0; + int matrix_end_strlen = 0; + int seperator_strlen = 0; + int string_length = 0; + char *msg = "- Pending matrix: "; + + /* + * for a list of lists of [ [ a b ] [ c d ] ] + * */ + + matrix_begin_strlen = strlen (matrix_begin); + matrix_end_strlen = strlen (matrix_end); + seperator_strlen = strlen (seperator); + string_length = matrix_begin_strlen + matrix_end_strlen + + (child_count -1) * seperator_strlen + + (child_count * child_count * pending_entry_strlen); + + buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char); + if (!buf) + goto out; + + ptr = buf; + ptr += sprintf (ptr, "%s", msg); + ptr += sprintf (ptr, "%s", matrix_begin); + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + if (i < priv->child_count -1) + ptr += sprintf (ptr, "%s", seperator); + } + + ptr += sprintf (ptr, "%s", matrix_end); + +out: + return buf; +} + +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc) +{ + char *buf = NULL; + char *free_ptr = NULL; + + buf = afr_get_pending_matrix_str (pending_matrix, this); + if (buf) + free_ptr = buf; + else + buf = ""; + + + gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'" + " (possible split-brain). Please delete the file from all but " + "the preferred subvolume.%s", loc, buf); + GF_FREE (free_ptr); + return; +} + + void afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count) { @@ -406,6 +501,8 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, { int i = 0; int biggest_witness = -1; + int biggest_witness_idx = -1; + int biggest_witness_cnt = -1; GF_ASSERT (witnesses); GF_ASSERT (characters); @@ -415,10 +512,21 @@ afr_find_biggest_witness_among_fools (int32_t *witnesses, if (characters[i].type != AFR_NODE_FOOL) continue; - if (biggest_witness < witnesses[i]) + if (biggest_witness < witnesses[i]) { biggest_witness = witnesses[i]; + biggest_witness_idx = i; + biggest_witness_cnt = 1; + continue; + } + + if (biggest_witness == witnesses[i]) + biggest_witness_cnt++; } - return biggest_witness; + + if (biggest_witness_cnt != 1) + return -1; + + return biggest_witness_idx; } int @@ -446,10 +554,84 @@ afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses, return nsources; } + +int +afr_mark_fool_as_source_by_idx (int32_t *sources, int child_count, int idx) +{ + if (idx >= 0 && idx < child_count) { + sources[idx] = 1; + return 1; + } + return 0; +} + + +static int +afr_find_largest_file_size (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_size = 0; + uint64_t min_size = 0; + int num_children = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_size > max_size) { + max_size = bufs[child].ia_size; + idx = child; + } + + if ((num_children == 0) || (bufs[child].ia_size < min_size)) { + min_size = bufs[child].ia_size; + } + + num_children++; + } + + /* If sizes are same for all of them, finding sources will have to + * happen with pending changelog. So return -1 + */ + if ((num_children > 1) && (min_size == max_size)) + return -1; + return idx; +} + + +static int +afr_find_newest_file (struct iatt *bufs, int32_t *success_children, + int child_count) +{ + int idx = -1; + int i = -1; + int child = -1; + uint64_t max_ctime = 0; + + for (i = 0; i < child_count; i++) { + if (success_children[i] == -1) + break; + + child = success_children[i]; + if (bufs[child].ia_ctime > max_ctime) { + max_ctime = bufs[child].ia_ctime; + idx = child; + } + } + + return idx; +} + + static int afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, afr_node_character *characters, - int child_count) + int32_t *success_children, + int child_count, struct iatt *bufs) { int32_t biggest_witness = 0; int nsources = 0; @@ -457,6 +639,11 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, GF_ASSERT (child_count > 0); + biggest_witness = afr_find_largest_file_size (bufs, success_children, + child_count); + if (biggest_witness != -1) + goto found; + witnesses = GF_CALLOC (child_count, sizeof (*witnesses), gf_afr_mt_int32_t); if (NULL == witnesses) { @@ -469,9 +656,15 @@ afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix, biggest_witness = afr_find_biggest_witness_among_fools (witnesses, characters, child_count); - nsources = afr_mark_fool_as_source_by_witness (sources, witnesses, - characters, child_count, - biggest_witness); + if (biggest_witness != -1) + goto found; + + biggest_witness = afr_find_newest_file (bufs, success_children, + child_count); + +found: + nsources = afr_mark_fool_as_source_by_idx (sources, child_count, + biggest_witness); out: GF_FREE (witnesses); return nsources; @@ -815,7 +1008,8 @@ afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix, nsources = afr_mark_biggest_of_fools_as_source (sources, pending_matrix, characters, - child_count); + success_children, + child_count, bufs); } out: @@ -833,14 +1027,46 @@ afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr, int32_t *delta_matrix[], unsigned char success[], int child_count, afr_transaction_type type) { - int i = 0; - int j = 0; + int tgt = 0; + int src = 0; + int value = 0; afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL, xattr, type, priv->child_count); - for (i = 0; i < priv->child_count; i++) - for (j = 0; j < priv->child_count; j++) - delta_matrix[i][j] = -delta_matrix[i][j]; + + /* + * The algorithm here has two parts. First, for each subvol indexed + * as tgt, we try to figure out what count everyone should have for it. + * If the self-heal succeeded, that's easy; the value is zero. + * Otherwise, the value is the maximum of the succeeding nodes' counts. + * Once we know the value, we loop through (possibly for a second time) + * setting each count to the difference so that when we're done all + * succeeding nodes will have the same count for tgt. + */ + for (tgt = 0; tgt < priv->child_count; ++tgt) { + value = 0; + if (!success[tgt]) { + /* Find the maximum. */ + for (src = 0; src < priv->child_count; ++src) { + if (!success[src]) { + continue; + } + if (delta_matrix[src][tgt] > value) { + value = delta_matrix[src][tgt]; + } + } + } + /* Force everyone who succeeded to the chosen value. */ + for (src = 0; src < priv->child_count; ++src) { + if (success[src]) { + delta_matrix[src][tgt] = value + - delta_matrix[src][tgt]; + } + else { + delta_matrix[src][tgt] = 0; + } + } + } } @@ -867,8 +1093,14 @@ afr_sh_delta_to_xattr (xlator_t *this, pending = GF_CALLOC (sizeof (int32_t), 3, gf_afr_mt_int32_t); - if (!pending) + if (!pending) { + gf_log (this->name, GF_LOG_ERROR, + "failed to allocate pending entry " + "for %s[%d] on %s", + priv->pending_key[j], type, + priv->children[i]->name); continue; + } /* 3 = data+metadata+entry */ k = afr_index_for_transaction_type (type); @@ -914,14 +1146,13 @@ afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) afr_sh_reset (frame, this); - if (local->govinda_gOvinda) { + if (local->unhealable) { gf_log (this->name, GF_LOG_DEBUG, "split brain found, aborting selfheal of %s", local->loc.path); - sh->op_failed = 1; } - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { sh->completion_cbk (frame, this); } else { gf_log (this->name, GF_LOG_TRACE, @@ -1153,7 +1384,7 @@ out: if (ret) { gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, " "reason: %s", local->loc.path, strerror (-ret)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } afr_sh_missing_entries_finish (frame, this); } @@ -1168,7 +1399,7 @@ afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this, local = frame->local; sh = &local->self_heal; if (op_ret < 0) - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_finish (frame, this); return 0; } @@ -1192,7 +1423,7 @@ sh_missing_entries_create (call_frame_t *frame, xlator_t *this) if (!afr_valid_ia_type (type)) { gf_log (this->name, GF_LOG_ERROR, "%s: unknown file type: 0%o", local->loc.path, type); - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); afr_sh_missing_entries_finish (frame, this); goto out; } @@ -1225,8 +1456,9 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, loc = &local->loc; if (op_ret < 0) { - if (op_errno == EIO) - local->govinda_gOvinda = 1; + if (op_errno == EIO) { + afr_set_local_for_unhealable (local); + } // EIO can happen if finding the fresh parent dir failed goto out; } @@ -1288,7 +1520,7 @@ afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this, } return; out: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; @@ -1372,7 +1604,7 @@ afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child, LOCK (&frame->lock); { afr_sh_set_error (sh, EIO); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } UNLOCK (&frame->lock); } @@ -1454,7 +1686,7 @@ afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; priv = this->private; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_missing_entries_finish (frame, this); } else { if (afr_gfid_missing_count (this->name, sh->fresh_children, @@ -1547,7 +1779,7 @@ afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this, if (!purge_condition (local, priv, i)) continue; gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s " - "on %d", local->loc.path, i); + "on %s", local->loc.path, priv->children[i]->name); afr_sh_call_entry_expunge_remove (frame, this, (long) i, &sh->buf[i], &sh->parentbufs[i], @@ -1667,10 +1899,8 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, sh->child_errno, priv->child_count, ENOENT); if (fresh_child_enoents == fresh_parent_count) { - gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s", - local->loc.path); afr_sh_set_error (sh, ENOENT); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_purge_entry (frame, this); } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children, priv->child_count, local->loc.path, @@ -1684,14 +1914,14 @@ afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this, afr_sh_purge_stale_entry (frame, this); } else { op_errno = EIO; - local->govinda_gOvinda = 1; + afr_set_local_for_unhealable (local); goto fail; } return; fail: - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_missing_entries_finish (frame, this); return; @@ -1762,8 +1992,8 @@ afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this, out: afr_sh_set_error (sh, op_errno); - sh->op_failed = 1; - afr_sh_missing_entries_finish (frame, this); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_missing_entries_finish (frame, this); return; } @@ -1852,7 +2082,8 @@ afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, int -afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) +afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame, + xlator_t *this) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; @@ -1865,7 +2096,7 @@ afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_INFO, "Non blocking entrylks failed."); - sh->op_failed = -1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_missing_entries_done (frame, this); } else { @@ -1881,40 +2112,14 @@ afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this) } int -afr_sh_post_nb_entrylk_gfid_sh_cbk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - - local = frame->local; - sh = &local->self_heal; - int_lock = &local->internal_lock; - - if (int_lock->lock_op_ret < 0) { - gf_log (this->name, GF_LOG_INFO, - "Non blocking entrylks failed."); - afr_sh_missing_entries_done (frame, this); - } else { - gf_log (this->name, GF_LOG_DEBUG, - "Non blocking entrylks done. Proceeding to FOP"); - afr_sh_common_lookup (frame, this, &local->loc, - afr_sh_missing_entries_lookup_done, - sh->sh_gfid_req, AFR_LOOKUP_FAIL_CONFLICTS| - AFR_LOOKUP_FAIL_MISSING_GFIDS, - NULL); - } - - return 0; -} - -int afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, char *base_name, afr_lock_cbk_t lock_cbk) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; + afr_private_t *priv = NULL; + priv = this->private; local = frame->local; int_lock = &local->internal_lock; @@ -1926,7 +2131,12 @@ afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, int_lock->lk_basename = base_name; int_lock->lk_loc = loc; int_lock->lock_cbk = lock_cbk; + int_lock->domain = this->name; + int_lock->lockee_count = 0; + afr_init_entry_lockee (&int_lock->lockee[0], local, loc, + base_name, priv->child_count); + int_lock->lockee_count++; afr_nonblocking_entrylk (frame, this); return 0; @@ -1964,27 +2174,31 @@ out: } static int -afr_self_heal_conflicting_entries (call_frame_t *frame, xlator_t *this) +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) { - afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_conflicting_sh_cbk); - return 0; -} + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); -static int -afr_self_heal_gfids (call_frame_t *frame, xlator_t *this) -{ afr_self_heal_parent_entrylk (frame, this, - afr_sh_post_nb_entrylk_gfid_sh_cbk); + afr_sh_post_nb_entrylk_missing_entry_sh_cbk); return 0; } -afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) +afr_local_t* +afr_self_heal_local_init (afr_local_t *l, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *lc = NULL; - afr_self_heal_t *sh = NULL; - afr_self_heal_t *shc = NULL; + afr_private_t *priv = NULL; + afr_local_t *lc = NULL; + afr_self_heal_t *sh = NULL; + afr_self_heal_t *shc = NULL; + int ret = 0; priv = this->private; @@ -2007,13 +2221,23 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) shc->forced_merge = sh->forced_merge; shc->background = sh->background; shc->type = sh->type; + shc->data_sh_info = ""; + shc->metadata_sh_info = ""; uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req); - if (l->loc.path) - loc_copy (&lc->loc, &l->loc); + if (l->loc.path) { + ret = loc_copy (&lc->loc, &l->loc); + if (ret < 0) + goto out; + } lc->child_up = memdup (l->child_up, sizeof (*lc->child_up) * priv->child_count); + if (!lc->child_up) { + ret = -1; + goto out; + } + if (l->xattr_req) lc->xattr_req = dict_ref (l->xattr_req); @@ -2021,40 +2245,25 @@ afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this) lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode); if (l->cont.lookup.xattr) lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr); - if (l->internal_lock.inode_locked_nodes) - lc->internal_lock.inode_locked_nodes = - memdup (l->internal_lock.inode_locked_nodes, - sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count); - else - lc->internal_lock.inode_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.entry_locked_nodes) - lc->internal_lock.entry_locked_nodes = - memdup (l->internal_lock.entry_locked_nodes, - sizeof (*lc->internal_lock.entry_locked_nodes) * priv->child_count); - else - lc->internal_lock.entry_locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes), - priv->child_count, - gf_afr_mt_char); - if (l->internal_lock.locked_nodes) - lc->internal_lock.locked_nodes = - memdup (l->internal_lock.locked_nodes, - sizeof (*lc->internal_lock.locked_nodes) * priv->child_count); - else - lc->internal_lock.locked_nodes = - GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), - priv->child_count, - gf_afr_mt_char); - lc->internal_lock.inodelk_lock_count = - l->internal_lock.inodelk_lock_count; - lc->internal_lock.entrylk_lock_count = - l->internal_lock.entrylk_lock_count; + lc->internal_lock.locked_nodes = + GF_CALLOC (sizeof (*l->internal_lock.locked_nodes), + priv->child_count, gf_afr_mt_char); + if (!lc->internal_lock.locked_nodes) { + ret = -1; + goto out; + } + + ret = afr_inodelk_init (&lc->internal_lock.inodelk[0], + this->name, priv->child_count); + if (ret) + goto out; out: + if (ret) { + afr_local_cleanup (lc, this); + lc = NULL; + } return lc; } @@ -2067,35 +2276,28 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) afr_local_t * orig_frame_local = NULL; afr_self_heal_t * orig_frame_sh = NULL; char sh_type_str[256] = {0,}; - gf_boolean_t split_brain = _gf_false; + gf_loglevel_t loglevel = 0; priv = this->private; local = bgsh_frame->local; sh = &local->self_heal; - if (local->govinda_gOvinda || sh->mdata_spb || sh->data_spb) { - split_brain = _gf_true; - sh->op_failed = 1; + if (local->unhealable) { + afr_set_split_brain (this, sh->inode, SPB, SPB); } - afr_set_split_brain (this, sh->inode, split_brain); - afr_self_heal_type_str_get (sh, sh_type_str, sizeof(sh_type_str)); - if (sh->op_failed) { - gf_loglevel_t loglevel = GF_LOG_ERROR; - if (priv->shd.iamshd) - loglevel = GF_LOG_DEBUG; - - gf_log (this->name, loglevel, "background %s self-heal " - "failed on %s", sh_type_str, local->loc.path); - + if (is_self_heal_failed (sh, AFR_CHECK_ALL) && !priv->shd.iamshd) { + loglevel = GF_LOG_ERROR; + } else if (!is_self_heal_failed (sh, AFR_CHECK_ALL)) { + loglevel = GF_LOG_INFO; } else { - gf_log (this->name, GF_LOG_DEBUG, "background %s self-heal " - "completed on %s", sh_type_str, local->loc.path); - + loglevel = GF_LOG_DEBUG; } + afr_log_self_heal_completion_status (local, loglevel); + FRAME_SU_UNDO (bgsh_frame, afr_local_t); if (!sh->unwound && sh->unwind) { @@ -2103,7 +2305,7 @@ afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this) orig_frame_sh = &orig_frame_local->self_heal; orig_frame_sh->actual_sh_started = _gf_true; sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - sh->op_failed); + is_self_heal_failed (sh, AFR_CHECK_ALL)); } if (sh->background) { @@ -2152,7 +2354,7 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) afr_set_lk_owner (sh_frame, this, sh_frame->root); afr_set_low_priority (sh_frame); - sh_local = afr_local_copy (local, this); + sh_local = afr_self_heal_local_init (local, this); if (!sh_local) goto out; sh_frame->local = sh_local; @@ -2217,12 +2419,11 @@ afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode) sh->do_gfid_self_heal = _gf_false; } + sh->sh_type_in_action = AFR_SELF_HEAL_INVALID; + FRAME_SU_DO (sh_frame, afr_local_t); - if (sh->do_missing_entry_self_heal) { - afr_self_heal_conflicting_entries (sh_frame, this); - } else if (sh->do_gfid_self_heal) { - GF_ASSERT (!uuid_is_null (sh->sh_gfid_req)); - afr_self_heal_gfids (sh_frame, this); + if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) { + afr_self_heal_missing_entries (sh_frame, this); } else { loc = &sh_local->loc; if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) { @@ -2429,9 +2630,183 @@ out: GF_FREE (erase_xattr); if (ret < 0) { - sh->op_failed = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); finish (frame, this); } return 0; } + +void +afr_set_self_heal_status(afr_self_heal_t *sh, afr_self_heal_status status) +{ + xlator_t *this = NULL; + afr_sh_status_for_all_type *sh_status = &(sh->afr_all_sh_status); + afr_self_heal_type sh_type_in_action = sh->sh_type_in_action; + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal" + "Structure"); + goto out; + } + + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + sh_status->gfid_or_missing_entry_self_heal = status; + break; + case AFR_SELF_HEAL_METADATA: + sh_status->metadata_self_heal = status; + break; + case AFR_SELF_HEAL_DATA: + sh_status->data_self_heal = status; + break; + case AFR_SELF_HEAL_ENTRY: + sh_status->entry_self_heal = status; + break; + case AFR_SELF_HEAL_INVALID: + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid" + "self heal type in action"); + break; + } +out: + return; +} + +void +afr_set_local_for_unhealable (afr_local_t *local) +{ + afr_self_heal_t *sh = NULL; + + sh = &local->self_heal; + + local->unhealable = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); +} + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type) +{ + afr_sh_status_for_all_type sh_status = sh->afr_all_sh_status; + afr_self_heal_type sh_type_in_action = AFR_SELF_HEAL_INVALID; + afr_self_heal_status status = AFR_SELF_HEAL_FAILED; + xlator_t *this = NULL; + int sh_failed = 0; + + this = THIS; + + if (!sh) { + gf_log_callingfn (this->name, GF_LOG_ERROR, "Null self heal " + "structure"); + sh_failed = 1; + goto out; + } + + if (type == AFR_CHECK_ALL) { + if ((sh_status.gfid_or_missing_entry_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.metadata_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.data_self_heal == AFR_SELF_HEAL_FAILED) + || (sh_status.entry_self_heal == AFR_SELF_HEAL_FAILED)) + sh_failed = 1; + } else if (type == AFR_CHECK_SPECIFIC) { + sh_type_in_action = sh->sh_type_in_action; + switch (sh_type_in_action) { + case AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY: + status = sh_status.gfid_or_missing_entry_self_heal; + break; + case AFR_SELF_HEAL_METADATA: + status = sh_status.metadata_self_heal; + break; + case AFR_SELF_HEAL_ENTRY: + status = sh_status.entry_self_heal; + break; + case AFR_SELF_HEAL_DATA: + status = sh_status.data_self_heal; + break; + case AFR_SELF_HEAL_INVALID: + status = AFR_SELF_HEAL_NOT_ATTEMPTED; + break; + } + if (status == AFR_SELF_HEAL_FAILED) + sh_failed = 1; + + } + +out: + return sh_failed; +} + +char * +get_sh_completion_status (afr_self_heal_status status) +{ + + char *not_attempted = " is not attempted"; + char *failed = " failed"; + char *started = " is started"; + char *sync_begin = " is successfully completed"; + char *result = " has unknown status"; + + switch (status) + { + case AFR_SELF_HEAL_NOT_ATTEMPTED: + result = not_attempted; + break; + case AFR_SELF_HEAL_FAILED: + result = failed; + break; + case AFR_SELF_HEAL_STARTED: + result = started; + break; + case AFR_SELF_HEAL_SYNC_BEGIN: + result = sync_begin; + break; + } + + return result; + +} + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t loglvl) +{ + + char sh_log[4096] = {0}; + afr_self_heal_t *sh = &local->self_heal; + afr_sh_status_for_all_type all_status = sh->afr_all_sh_status; + xlator_t *this = NULL; + size_t off = 0; + int data_sh = 0; + int metadata_sh = 0; + int print_log = 0; + + this = THIS; + + ADD_FMT_STRING (sh_log, off, "gfid or missing entry", + all_status.gfid_or_missing_entry_self_heal, print_log); + ADD_FMT_STRING_SYNC (sh_log, off, "metadata", + all_status.metadata_self_heal, print_log); + if (sh->background) { + ADD_FMT_STRING_SYNC (sh_log, off, "backgroung data", + all_status.data_self_heal, print_log); + } else { + ADD_FMT_STRING_SYNC (sh_log, off, "foreground data", + all_status.data_self_heal, print_log); + } + ADD_FMT_STRING_SYNC (sh_log, off, "entry", all_status.entry_self_heal, + print_log); + + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.data_self_heal && + strcmp (sh->data_sh_info, "") && sh->data_sh_info ) + data_sh = 1; + if (AFR_SELF_HEAL_SYNC_BEGIN == all_status.metadata_self_heal && + strcmp (sh->metadata_sh_info, "") && sh->metadata_sh_info) + metadata_sh = 1; + + if (!print_log) + return; + + gf_log (this->name, loglvl, "%s %s %s on %s", sh_log, + ((data_sh == 1) ? sh->data_sh_info : ""), + ((metadata_sh == 1) ? sh->metadata_sh_info : ""), + local->loc.path); +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h index 1c83289a9..473264776 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.h +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -15,13 +15,6 @@ #define AFR_SH_MIN_PARTICIPANTS 2 typedef enum { - AFR_SELF_HEAL_ENTRY, - AFR_SELF_HEAL_METADATA, - AFR_SELF_HEAL_DATA, - AFR_SELF_HEAL_INVALID = -1, -} afr_self_heal_type; - -typedef enum { AFR_LOOKUP_FAIL_CONFLICTS = 1, AFR_LOOKUP_FAIL_MISSING_GFIDS = 2, } afr_lookup_flags_t; @@ -35,6 +28,10 @@ afr_sh_source_count (int sources[], int child_count); void afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); +void +afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this, + const char *loc); + int afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix, unsigned char *ignorant_subvols, @@ -94,13 +91,13 @@ int afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this, int child_index); int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk); afr_local_t * -afr_local_copy (afr_local_t *l, xlator_t *this); +afr_self_heal_local_init (afr_local_t *l, xlator_t *this); int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, - off_t start, off_t len, gf_boolean_t block, + off_t start, off_t len, gf_boolean_t block, char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler); void @@ -129,4 +126,19 @@ int afr_sh_erase_pending (call_frame_t *frame, xlator_t *this, afr_transaction_type type, afr_fxattrop_cbk_t cbk, int (*finish)(call_frame_t *frame, xlator_t *this)); + +void +afr_set_local_for_unhealable (afr_local_t *local); + +int +is_self_heal_failed (afr_self_heal_t *sh, afr_sh_fail_check_type type); + +void +afr_set_self_heal_status (afr_self_heal_t *sh, afr_self_heal_status status); + +void +afr_log_self_heal_completion_status (afr_local_t *local, gf_loglevel_t logl); + +char* +afr_get_pending_matrix_str (int32_t *pending_matrix[], xlator_t *this); #endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index c90ed7c1a..9de26ee56 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -156,6 +156,25 @@ afr_sh_data_close (call_frame_t *frame, xlator_t *this) } int +afr_sh_dom_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + if (sh->sh_dom_lock_held) + afr_sh_data_unlock (frame, this, priv->sh_domain, + afr_sh_data_close); + else + afr_sh_data_close (frame, this); + return 0; +} + +int afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *statpre, struct iatt *statpost, dict_t *xdata) @@ -190,29 +209,20 @@ afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } int -afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) +afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf) { afr_local_t *local = NULL; afr_private_t *priv = NULL; afr_self_heal_t *sh = NULL; int i = 0; int call_count = 0; - int source = 0; int32_t valid = 0; - struct iatt stbuf = {0,}; local = frame->local; sh = &local->self_heal; priv = this->private; - source = sh->source; - - valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); - - stbuf.ia_atime = sh->buf[source].ia_atime; - stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec; - stbuf.ia_mtime = sh->buf[source].ia_mtime; - stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec; + valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME); call_count = afr_set_elem_count_get (sh->success, priv->child_count); @@ -232,7 +242,7 @@ afr_sh_data_setattr (call_frame_t *frame, xlator_t *this) (void *) (long) i, priv->children[i], priv->children[i]->fops->setattr, - &local->loc, &stbuf, valid, NULL); + &local->loc, stbuf, valid, NULL); if (!--call_count) break; @@ -256,7 +266,7 @@ afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie, GF_ASSERT (sh->source == child_index); if (op_ret != -1) { sh->buf[child_index] = *buf; - afr_sh_data_setattr (frame, this); + afr_sh_data_setattr (frame, this, buf); } else { gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set " "time-stamps after self-heal", local->loc.path); @@ -292,31 +302,45 @@ afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this) //Fun fact, lock_cbk is being used for both lock & unlock int -afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this, char *dom, afr_lock_cbk_t lock_cbk) { afr_local_t *local = NULL; afr_internal_lock_t *int_lock = NULL; afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int ret = 0; local = frame->local; int_lock = &local->internal_lock; sh = &local->self_heal; + priv = this->private; - GF_ASSERT (sh->data_lock_held); - - sh->data_lock_held = _gf_false; + if (strcmp (dom, this->name) == 0) { + sh->data_lock_held = _gf_false; + } else if (strcmp (dom, priv->sh_domain) == 0) { + sh->sh_dom_lock_held = _gf_false; + } else { + ret = -1; + goto out; + } int_lock->lock_cbk = lock_cbk; + int_lock->domain = dom; afr_unlock (frame, this); +out: + if (ret) { + int_lock->lock_op_ret = -1; + int_lock->lock_cbk (frame, this); + } return 0; } int afr_sh_data_finish (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; local = frame->local; sh = &local->self_heal; @@ -325,9 +349,9 @@ afr_sh_data_finish (call_frame_t *frame, xlator_t *this) "finishing data selfheal of %s", local->loc.path); if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); + afr_sh_data_unlock (frame, this, this->name, afr_sh_dom_unlock); else - afr_sh_data_close (frame, this); + afr_sh_dom_unlock (frame, this); return 0; } @@ -344,11 +368,8 @@ afr_sh_data_fail (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_DEBUG, "finishing failed data selfheal of %s", local->loc.path); - sh->op_failed = 1; - if (sh->data_lock_held) - afr_sh_data_unlock (frame, this, afr_sh_data_close); - else - afr_sh_data_close (frame, this); + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_finish (frame, this); return 0; } @@ -371,13 +392,13 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, "log failed on %s for subvol %s, reason: %s", local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { if (sh->old_loop_frame) sh_loop_finish (sh->old_loop_frame, this); sh->old_loop_frame = NULL; @@ -389,7 +410,7 @@ afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, goto out; } GF_ASSERT (sh->old_loop_frame); - afr_sh_data_lock (frame, this, 0, 0, _gf_true, + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, afr_post_sh_big_lock_success, afr_post_sh_big_lock_failure); } @@ -406,6 +427,80 @@ afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) return 0; } +int +afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + sh = &local->self_heal; + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on " + "%s - %s", local->loc.path, + priv->children[child_index]->name, strerror (op_errno)); + LOCK (&frame->lock); + { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + } + UNLOCK (&frame->lock); + if (sh->old_loop_frame) + sh_loop_finish (sh->old_loop_frame, this); + sh->old_loop_frame = NULL; + } + + call_count = afr_frame_return (frame); + if (call_count == 0) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) + afr_sh_data_fail (frame, this); + else + afr_sh_data_erase_pending (frame, this); + } + return 0; +} + +/* + * Before erasing xattrs, make sure the data is written to disk + */ +int +afr_sh_data_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; + + local = frame->local; + priv = this->private; + sh = &local->self_heal; + + call_count = sh->active_sinks; + if (call_count == 0) { + afr_sh_data_erase_pending (frame, this); + return 0; + } + + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!sh->success[i] || sh->sources[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, + sh->healing_fd, 1, NULL); + } + + return 0; +} static struct afr_sh_algorithm * sh_algo_from_name (xlator_t *this, char *name) @@ -503,7 +598,7 @@ afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - sh->algo_completion_cbk = afr_sh_data_erase_pending; + sh->algo_completion_cbk = afr_sh_data_fsync; sh->algo_abort_cbk = afr_sh_data_fail; sh_algo = afr_sh_data_pick_algo (frame, this); @@ -539,7 +634,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_DEBUG, "ftruncate of %s on subvolume %s completed", @@ -552,7 +647,7 @@ afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) afr_sh_data_fail (frame, this); else afr_sh_data_sync_prepare (frame, this); @@ -632,6 +727,199 @@ out: return ret; } +char* +afr_get_sizes_str (afr_local_t *local, struct iatt *bufs, xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sizes_str = NULL; + size_t off = 0; + char *fmt_str = "%llu bytes on %s, "; + char *child_down = " %s,"; + char *child_unknown = " %s,"; + int down_child_present = 0; + int down_count = 0; + int unknown_count = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is "; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + len += snprintf (num, sizeof (num), fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), child_down, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count ++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), child_unknown, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } + + } + + if (down_child_present) { + if (down_count > 1) + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + if (unknown_child_present) { + if (unknown_count > 1) + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + else + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } + + len++;//for '\0' + + sizes_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + + if (!sizes_str) + return NULL; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 1) { + off += snprintf (sizes_str + off, len - off, fmt_str, + (unsigned long long) bufs[i].ia_size, + priv->children[i]->name); + } + } + + if (down_child_present) { + if (down_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + down_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) { + off += snprintf (sizes_str + off, len - off, child_down, + priv->children[i]->name); + } + } + + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (sizes_str + off, len - off, "%s", + unknown_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) { + off += snprintf (sizes_str + off, len - off, + child_unknown, + priv->children[i]->name); + + } + } + + return sizes_str; +} + +char* +afr_get_sinks_str (xlator_t *this, afr_local_t *local, afr_self_heal_t *sh) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *sinks_str = NULL; + char *temp_str = " to sinks "; + char *str_format = " %s,"; + char off = 0; + + priv = this->private; + + len += snprintf (num, sizeof (num), "%s", temp_str); + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), str_format, + priv->children[i]->name); + } + } + + len ++; + + sinks_str = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + + if (!sinks_str) + return NULL; + + off += snprintf (sinks_str + off, len - off, "%s", temp_str); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) && (local->child_up[i] == 1)) { + off += snprintf (sinks_str + off, len - off, + str_format, + priv->children[i]->name); + } + } + + return sinks_str; + +} + + +void +afr_set_data_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, xlator_t *this) +{ + char *pending_matrix_str = NULL; + char *sizes_str = NULL; + char *sinks_str = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + if (!pending_matrix_str) + pending_matrix_str = ""; + + sizes_str = afr_get_sizes_str (local, sh->buf, this); + if (!sizes_str) + sizes_str = ""; + + sinks_str = afr_get_sinks_str (this, local, sh); + if (!sinks_str) + sinks_str = ""; + + gf_asprintf (&sh->data_sh_info, " data self heal from %s %s with " + "%s data %s", priv->children[sh->source]->name, sinks_str, + sizes_str, pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (sizes_str && strcmp (sizes_str, "")) + GF_FREE (sizes_str); +} + void afr_sh_data_fix (call_frame_t *frame, xlator_t *this) { @@ -653,7 +941,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) if (sh->background && sh->unwind && !sh->unwound) { sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno, - sh->op_failed); + is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)); sh->unwound = _gf_true; } @@ -672,6 +960,7 @@ afr_sh_data_fix (call_frame_t *frame, xlator_t *this) sh->active_sinks); sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); afr_sh_data_trim_sinks (frame, this); } @@ -683,6 +972,9 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) afr_private_t *priv = NULL; int nsources = 0; int ret = 0; + int *old_sources = NULL; + int tstamp_source = 0; + int i = 0; local = frame->local; sh = &local->self_heal; @@ -690,6 +982,13 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s", lkowner_utoa (&frame->root->lk_owner)); + if (sh->sync_done) { + //store sources before sync so that mtime can be set using the + //iatt buf from one of them. + old_sources = alloca (priv->child_count*sizeof (*old_sources)); + memcpy (old_sources, sh->sources, + priv->child_count * sizeof (*old_sources)); + } nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix, sh->sources, sh->success_children, @@ -711,17 +1010,16 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) } if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal contents of '%s' (possible " - "split-brain). Please delete the file from all but " - "the preferred subvolume.", local->loc.path); + afr_sh_print_split_brain_log (sh->pending_matrix, this, + local->loc.path); + afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB); - sh->data_spb = _gf_true; afr_sh_data_fail (frame, this); return 0; } - sh->data_spb = _gf_false; + afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB); + ret = afr_sh_inode_set_read_ctx (sh, this); if (ret) { gf_log (this->name, GF_LOG_DEBUG, @@ -732,8 +1030,18 @@ afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this) } if (sh->sync_done) { - afr_sh_data_setattr (frame, this); + /* Perform setattr from one of the old_sources if possible + * Because only they have the correct mtime, the new sources + * (i.e. old sinks) have mtime from last writev in sync. + */ + tstamp_source = sh->source; + for (i = 0; i < priv->child_count; i++) { + if (old_sources[i] && sh->sources[i]) + tstamp_source = i; + } + afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]); } else { + afr_set_data_sh_info_str (local, sh, this); if (nsources == 0) { gf_log (this->name, GF_LOG_DEBUG, "No self-heal needed for %s", @@ -1091,6 +1399,22 @@ afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this) } int +afr_sh_dom_lock_success (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + sh->sh_dom_lock_held = _gf_true; + afr_sh_data_lock (frame, this, 0, 0, _gf_true, this->name, + afr_sh_data_big_lock_success, + afr_sh_data_fail); + return 0; +} + +int afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; @@ -1154,9 +1478,11 @@ afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) } int -afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len) +afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, char *dom, + off_t start, off_t len) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; @@ -1167,11 +1493,14 @@ afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t le afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = start; - int_lock->lk_flock.l_len = len; - int_lock->lk_flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk; + int_lock->domain = dom; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + inodelk->flock.l_start = start; + inodelk->flock.l_len = len; + inodelk->flock.l_type = F_WRLCK; + afr_nonblocking_inodelk (frame, this); return 0; @@ -1215,7 +1544,7 @@ afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this) int afr_sh_data_lock (call_frame_t *frame, xlator_t *this, off_t start, off_t len, gf_boolean_t block, - afr_lock_cbk_t success_handler, + char *dom, afr_lock_cbk_t success_handler, afr_lock_cbk_t failure_handler) { afr_local_t * local = NULL; @@ -1227,7 +1556,7 @@ afr_sh_data_lock (call_frame_t *frame, xlator_t *this, sh->data_lock_success_handler = success_handler; sh->data_lock_failure_handler = failure_handler; sh->data_lock_block = block; - return afr_sh_data_lock_rec (frame, this, start, len); + return afr_sh_data_lock_rec (frame, this, dom, start, len); } int @@ -1239,7 +1568,6 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_private_t *priv = NULL; int call_count = 0; int child_index = 0; - gf_boolean_t block = _gf_true; local = frame->local; sh = &local->self_heal; @@ -1259,7 +1587,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_TRACE, "open of %s succeeded on child %s", @@ -1272,7 +1600,7 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_data_fail (frame, this); return 0; } @@ -1281,15 +1609,8 @@ afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, "fd for %s opened, commencing sync", local->loc.path); - /* - * The read and write self-heal trigger codepaths do not provide - * an unwind callback. We run a trylock in these codepaths - * because we are sensitive to locking latency. - */ - block = sh->unwind ? _gf_true : _gf_false; - afr_sh_data_lock (frame, this, 0, 0, block, - afr_sh_data_big_lock_success, - afr_sh_data_fail); + afr_sh_data_lock (frame, this, 0, 0, _gf_true, priv->sh_domain, + afr_sh_dom_lock_success, afr_sh_data_fail); } return 0; @@ -1394,25 +1715,31 @@ afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv) int afr_self_heal_data (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_self_heal_t *sh = NULL; - afr_private_t *priv = this->private; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + int ret = -1; local = frame->local; sh = &local->self_heal; - /* Self-heal completion cbk changes inode split-brain status based on - * govinda_gOvinda, mdata_spb, data_spb value. - * Initialize data_spb with current split-brain status. - * If for some reason self-heal fails(locking phase etc), it makes sure - * we retain the split-brain status before this self-heal started. - */ - sh->data_spb = afr_is_split_brain (this, sh->inode); + sh->sh_type_in_action = AFR_SELF_HEAL_DATA; + if (afr_can_start_data_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); + ret = afr_inodelk_init (&local->internal_lock.inodelk[1], + priv->sh_domain, priv->child_count); + if (ret < 0) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_data_done (frame, this); + return 0; + } + if (IA_ISREG (sh->type)) { afr_sh_data_open (frame, this); } else { afr_sh_data_lock (frame, this, 0, 0, _gf_true, + this->name, afr_sh_non_reg_lock_success, afr_sh_data_fail); } diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c index c3c9f9fca..53491a1d7 100644 --- a/xlators/cluster/afr/src/afr-self-heal-entry.c +++ b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -162,7 +162,7 @@ afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) sh = &local->self_heal; if (sh->entries_skipped) { - sh->op_failed = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); goto out; } afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION, @@ -613,6 +613,19 @@ out: return 0; } +static gf_boolean_t +can_skip_entry_self_heal (char *name, loc_t *parent_loc) +{ + if (strcmp (name, ".") == 0) { + return _gf_true; + } else if (strcmp (name, "..") == 0) { + return _gf_true; + } else if (loc_is_root (parent_loc) && + (strcmp (name, GF_REPLICATE_TRASH_DIR) == 0)) { + return _gf_true; + } + return _gf_false; +} int afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, @@ -640,13 +653,7 @@ afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, sh->expunge_done = afr_sh_entry_expunge_entry_done; name = entry->d_name; - - if ((strcmp (name, ".") == 0) - || (strcmp (name, "..") == 0)) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - name, local->loc.path); + if (can_skip_entry_self_heal (name, &local->loc)) { op_ret = 0; goto out; } @@ -799,7 +806,7 @@ afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_sink (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { goto out; } @@ -1256,6 +1263,35 @@ afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed", impunge_local->loc.path); + /* + * Reason for adding GLUSTERFS_INTERNAL_FOP_KEY : + * + * Problem: + * While a brick is down in a replica pair, lets say the user creates + * one file(file-A) and a hard link to that file(h-file-A). After the + * brick comes back up, entry self-heal is attempted on parent dir of + * these two files. As part of readdir in self-heal it reads both the + * entries file-A and h-file-A for both of them it does name less lookup + * to check if there are any hardlinks already present in the + * destination brick. It finds that there are no hard links already + * present for files file-A, h-file-A. Self-heal does mknods for both + * file-A and h-file-A. This leads to file-A and h-file-A not being + * hardlinks anymore. + * + * Fix: (More like shrinking of race-window, the race itself is still + * present in posix-mknod). + * If mknod comes with the presence of GLUSTERFS_INTERNAL_FOP_KEY then + * posix_mknod checks if there are already any gfid-links and does + * link() instead of mknod. There still can be a race where two + * posix_mknods same gfid see that + * gfid-link file is not present and proceeds with mknods and result in + * two different files with same gfid. + */ + ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); + if (ret) + gf_log (this->name, GF_LOG_INFO, "%s: %s set failed", + impunge_local->loc.path, GLUSTERFS_INTERNAL_FOP_KEY); + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, (void *) (long) child_index, priv->children[child_index], @@ -1872,12 +1908,7 @@ afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, active_src = sh->active_source; sh->impunge_done = afr_sh_entry_impunge_entry_done; - if ((strcmp (entry->d_name, ".") == 0) - || (strcmp (entry->d_name, "..") == 0)) { - - gf_log (this->name, GF_LOG_TRACE, - "skipping inspection of %s under %s", - entry->d_name, local->loc.path); + if (can_skip_entry_self_heal (entry->d_name, &local->loc)) { op_ret = 0; goto out; } @@ -1946,7 +1977,7 @@ afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, local->loc.path, priv->children[active_src]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } else { gf_log (this->name, GF_LOG_TRACE, "readdir of %s on subvolume %s complete", @@ -2019,7 +2050,7 @@ afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) active_src = next_active_source (frame, this, sh->active_source); sh->active_source = active_src; - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -2068,7 +2099,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, priv->children[child_index]->name, strerror (op_errno)); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); } } UNLOCK (&frame->lock); @@ -2076,7 +2107,7 @@ afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); if (call_count == 0) { - if (sh->op_failed) { + if (is_self_heal_failed (sh, AFR_CHECK_SPECIFIC)) { afr_sh_entry_finish (frame, this); return 0; } @@ -2209,6 +2240,7 @@ afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) local->loc.path); sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); afr_sh_entry_open (frame, this); return 0; @@ -2231,7 +2263,7 @@ afr_sh_entry_fix (call_frame_t *frame, xlator_t *this, priv = this->private; if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_entry_finish (frame, this); goto out; @@ -2294,7 +2326,7 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) if (int_lock->lock_op_ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks " "failed for %s.", local->loc.path); - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_entry_done (frame, this); } else { @@ -2313,14 +2345,18 @@ afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this) int afr_self_heal_entry (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; afr_private_t *priv = NULL; - + afr_self_heal_t *sh = NULL; priv = this->private; local = frame->local; + sh = &local->self_heal; + + sh->sh_type_in_action = AFR_SELF_HEAL_ENTRY; if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_entrylk (frame, this, &local->loc, NULL, afr_sh_post_nonblocking_entry_cbk); } else { diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c index 647ee47a1..fd5da6cfd 100644 --- a/xlators/cluster/afr/src/afr-self-heal-metadata.c +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -88,6 +88,19 @@ afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) return 0; } +int +afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); + afr_sh_metadata_finish (frame, this); + return 0; +} int afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, @@ -167,8 +180,13 @@ afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_count = afr_frame_return (frame); - if (call_count == 0) + if (call_count == 0) { + if (local->xattr_req) { + dict_unref (local->xattr_req); + local->xattr_req = NULL; + } afr_sh_metadata_erase_pending (frame, this); + } return 0; } @@ -194,6 +212,86 @@ afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, return 0; } +int +afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *xdata) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (op_ret < 0) { + afr_sh_metadata_sync_cbk (frame, cookie, + this, -1, op_errno, xdata); + goto out; + } + + i = (long) cookie; + + STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, local->xattr_req, 0, NULL); + + out: + return 0; +} + +inline void +afr_prune_special_keys (dict_t *xattr_dict) +{ + dict_del (xattr_dict, GF_SELINUX_XATTR_KEY); +} + +inline void +afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv) +{ + int i = 0; + + for (; i < priv->child_count; i++) { + dict_del (xattr_dict, priv->pending_key[i]); + } +} + +int +afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (op_ret < 0) { + afr_sh_metadata_sync_cbk (frame, cookie, + this, -1, op_errno, xdata); + goto out; + } + + afr_prune_pending_keys (xattr, priv); + + afr_prune_special_keys (xattr); + + i = (long) cookie; + + /* send removexattr in bulk via xdata */ + STACK_WIND_COOKIE (frame, afr_sh_removexattr_cbk, + cookie, + priv->children[i], + priv->children[i]->fops->removexattr, + &local->loc, "", xattr); + + out: + return 0; +} int afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) @@ -219,9 +317,10 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) /* * 2 calls per sink - setattr, setxattr */ - if (xattr) + if (xattr) { call_count = active_sinks * 2; - else + local->xattr_req = dict_ref (xattr); + } else call_count = active_sinks; local->call_count = call_count; @@ -264,11 +363,11 @@ afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) if (!xattr) continue; - STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk, (void *) (long) i, priv->children[i], - priv->children[i]->fops->setxattr, - &local->loc, xattr, 0, NULL); + priv->children[i]->fops->getxattr, + &local->loc, NULL, NULL); call_count--; } @@ -286,8 +385,6 @@ afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_private_t *priv = NULL; int source = 0; - int i; - local = frame->local; sh = &local->self_heal; priv = this->private; @@ -302,16 +399,147 @@ afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, afr_sh_metadata_sync (frame, this, NULL); } else { - for (i = 0; i < priv->child_count; i++) { - dict_del (xattr, priv->pending_key[i]); - } - + afr_prune_pending_keys (xattr, priv); afr_sh_metadata_sync (frame, this, xattr); } return 0; } +static void +afr_set_metadata_sh_info_str (afr_local_t *local, afr_self_heal_t *sh, + xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; + char num[1024] = {0}; + size_t len = 0; + char *string = NULL; + size_t off = 0; + char *source_child = " from source %s to"; + char *format = " %s, "; + char *string_msg = " metadata self heal"; + char *pending_matrix_str = NULL; + int down_child_present = 0; + int unknown_child_present = 0; + char *down_subvol_1 = " down subvolume is "; + char *unknown_subvol_1 = " unknown subvolume is"; + char *down_subvol_2 = " down subvolumes are "; + char *unknown_subvol_2 = " unknown subvolumes are "; + int down_count = 0; + int unknown_count = 0; + + priv = this->private; + + pending_matrix_str = afr_get_pending_matrix_str (sh->pending_matrix, + this); + + if (!pending_matrix_str) + pending_matrix_str = ""; + + len += snprintf (num, sizeof (num), "%s", string_msg); + + for (i = 0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) { + len += snprintf (num, sizeof (num), source_child, + priv->children[i]->name); + } else if ((local->child_up[i] == 1) && (sh->sources[i] == 0)) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + } else if (local->child_up[i] == 0) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!down_child_present) + down_child_present = 1; + down_count++; + } else if (local->child_up[i] == -1) { + len += snprintf (num, sizeof (num), format, + priv->children[i]->name); + if (!unknown_child_present) + unknown_child_present = 1; + unknown_count++; + } + } + + if (down_child_present) { + if (down_count > 1) { + len += snprintf (num, sizeof (num), "%s", + down_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + down_subvol_1); + } + } + if (unknown_child_present) { + if (unknown_count > 1) { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_2); + } else { + len += snprintf (num, sizeof (num), "%s", + unknown_subvol_1); + } + } + + len ++; + + string = GF_CALLOC (len, sizeof (char), gf_common_mt_char); + if (!string) + return; + + off += snprintf (string + off, len - off, "%s", string_msg); + for (i=0; i < priv->child_count; i++) { + if ((sh->source == i) && (local->child_up[i] == 1)) + off += snprintf (string + off, len - off, source_child, + priv->children[i]->name); + } + + for (i = 0; i < priv->child_count; i++) { + if ((local->child_up[i] == 1)&& (sh->sources[i] == 0)) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + if (down_child_present) { + if (down_count > 1) { + off += snprintf (string + off, len - off, "%s", + down_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + down_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == 0) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + if (unknown_child_present) { + if (unknown_count > 1) { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_2); + } else { + off += snprintf (string + off, len - off, "%s", + unknown_subvol_1); + } + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] == -1) + off += snprintf (string + off, len - off, format, + priv->children[i]->name); + } + + gf_asprintf (&sh->metadata_sh_info, "%s metadata %s,", string, + pending_matrix_str); + + if (pending_matrix_str && strcmp (pending_matrix_str, "")) + GF_FREE (pending_matrix_str); + + if (string && strcmp (string, "")) + GF_FREE (string); +} int afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) @@ -342,6 +570,8 @@ afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) sh->active_sinks); sh->actual_sh_started = _gf_true; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_SYNC_BEGIN); + afr_set_metadata_sh_info_str (local, sh, this); STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, priv->children[source], priv->children[source]->fops->getxattr, @@ -367,7 +597,7 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, priv = this->private; if (op_ret < 0) { - sh->op_failed = 1; + afr_set_self_heal_status (sh, AFR_SELF_HEAL_FAILED); afr_sh_set_error (sh, op_errno); afr_sh_metadata_finish (frame, this); goto out; @@ -392,18 +622,14 @@ afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this, } if (nsources == -1) { - gf_log (this->name, GF_LOG_ERROR, - "Unable to self-heal permissions/ownership of '%s' " - "(possible split-brain). Please fix the file on " - "all backend volumes", local->loc.path); - - sh->mdata_spb = _gf_true; - - afr_sh_metadata_finish (frame, this); + afr_sh_print_split_brain_log (sh->pending_matrix, this, + local->loc.path); + afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW); + afr_sh_metadata_fail (frame, this); goto out; } - sh->mdata_spb = _gf_false; + afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW); if (nsources == 0) { gf_log (this->name, GF_LOG_TRACE, "No self-heal needed for %s", @@ -489,19 +715,22 @@ int afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; afr_local_t *local = NULL; local = frame->local; int_lock = &local->internal_lock; + int_lock->domain = this->name; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); int_lock->transaction_lk_type = AFR_SELFHEAL_LK; int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK; afr_set_lock_number (frame, this); - int_lock->lk_flock.l_start = LLONG_MAX - 1; - int_lock->lk_flock.l_len = 0; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_start = LLONG_MAX - 1; + inodelk->flock.l_len = 0; + inodelk->flock.l_type = F_WRLCK; int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk; afr_nonblocking_inodelk (frame, this); @@ -528,16 +757,10 @@ afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) local = frame->local; sh = &local->self_heal; - - /* Self-heal completion cbk changes inode split-brain status based on - * govinda_gOvinda, mdata_spb, data_spb value. - * Initialize mdata_spb with current split-brain status. - * If for some reason self-heal fails(locking phase etc), it makes sure - * we retain the split-brain status before this self-heal started. - */ - sh->mdata_spb = afr_is_split_brain (this, sh->inode); + sh->sh_type_in_action = AFR_SELF_HEAL_METADATA; if (afr_can_start_metadata_self_heal (sh, priv)) { + afr_set_self_heal_status (sh, AFR_SELF_HEAL_STARTED); afr_sh_metadata_lock (frame, this); } else { afr_sh_metadata_done (frame, this); diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c index 214c0fff4..1b48a1bca 100644 --- a/xlators/cluster/afr/src/afr-self-heald.c +++ b/xlators/cluster/afr/src/afr-self-heald.c @@ -25,7 +25,8 @@ typedef enum { typedef enum { HEAL = 1, - INFO + INFO, + STATISTICS_TO_BE_HEALED, } shd_crawl_op; typedef struct shd_dump { @@ -84,6 +85,33 @@ _loc_assign_gfid_path (loc_t *loc) } void +_destroy_crawl_event_data (void *data) +{ + shd_crawl_event_t *crawl_event = NULL; + + if (!data) + goto out; + + crawl_event = (shd_crawl_event_t *)data; + GF_FREE (crawl_event->start_time_str); + GF_FREE (crawl_event->end_time_str); + +out: + return; +} + +void +_destroy_shd_event_data (void *data) +{ + shd_event_t *event = NULL; + if (!data) + goto out; + event = (shd_event_t*)data; + GF_FREE (event->path); +out: + return; +} +void shd_cleanup_event (void *event) { shd_event_t *shd_event = event; @@ -128,6 +156,123 @@ _build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent) } int +_add_crawl_stats_to_dict (xlator_t *this, dict_t *output, int child, + shd_crawl_event_t *shd_event, struct timeval *tv) +{ + int ret = 0; + uint64_t count = 0; + char key[256] = {0}; + int xl_id = 0; + uint64_t healed_count = 0; + uint64_t split_brain_count = 0; + uint64_t heal_failed_count = 0; + char *start_time_str = NULL; + char *end_time_str = NULL; + char *crawl_type = NULL; + int progress = -1; + + healed_count = shd_event->healed_count; + split_brain_count = shd_event->split_brain_count; + heal_failed_count = shd_event->heal_failed_count; + start_time_str = shd_event->start_time_str; + end_time_str = shd_event->end_time_str; + crawl_type = shd_event->crawl_type; + + if (!start_time_str) { + ret = -1; + goto out; + } + + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "xl does not have id"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child); + ret = dict_get_uint64 (output, key, &count); + + snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64(output, key, healed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, split_brain_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "split_brain_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup (crawl_type)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_type to output"); + goto out; + } + snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_uint64 (output, key, heal_failed_count); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "healed_failed_count to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64, + xl_id, child, count); + ret = dict_set_dynstr (output, key, gf_strdup(start_time_str)); + + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_start_time to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64, + xl_id, child, count); + + if (!end_time_str) + end_time_str = "Could not determine the end time"; + ret = dict_set_dynstr (output, key, gf_strdup(end_time_str)); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "crawl_end_time to outout"); + goto out; + } + snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64, + xl_id, child, count); + + if (shd_event->crawl_inprogress == _gf_true) + progress = 1; + else + progress = 0; + + ret = dict_set_int32 (output, key, progress); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not add statistics_" + "inprogress to outout"); + goto out; + } + + snprintf (key, sizeof (key), "statistics-%d-%d-count",xl_id, child); + ret = dict_set_uint64 (output, key, count + 1); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "Could not increment the " + "counter."); + goto out; + } +out: + return ret; +} + +int _add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path, struct timeval *tv, gf_boolean_t dyn) { @@ -182,16 +327,18 @@ out: int _get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child, - char **fpath) + char **fpath, gf_boolean_t *missing) { dict_t *xattr = NULL; char *path = NULL; int ret = -1; - ret = syncop_getxattr (readdir_xl, child, &xattr, - GFID_TO_PATH_KEY); - if (ret) + ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY); + if (ret < 0) { + if ((errno == ENOENT) && missing) + *missing = _gf_true; goto out; + } ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Failed to get path for " @@ -231,6 +378,20 @@ out: } int +_add_crawl_event_statistics_to_dict (circular_buffer_t *cb, void *data) +{ + int ret = 0; + shd_dump_t *dump_data = NULL; + shd_crawl_event_t *shd_event = NULL; + + dump_data = data; + shd_event = cb->data; + ret = _add_crawl_stats_to_dict (dump_data->this, dump_data->dict, + dump_data->child, shd_event, &cb->tv); + return ret; +} + +int _add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) { shd_dump_t dump_data = {0}; @@ -242,32 +403,24 @@ _add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child) return 0; } + int -_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, - gf_dirent_t *entry, - loc_t *childloc, loc_t *parentloc, struct iatt *iattr) +_add_statistics_to_dict (xlator_t *this, dict_t *dict, int child) { - dict_t *output = NULL; - xlator_t *readdir_xl = NULL; - int ret = -1; - char *path = NULL; - - if (uuid_is_null (childloc->gfid)) - goto out; + shd_dump_t dump_data = {0}; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; - output = crawl_data->op_data; - readdir_xl = crawl_data->readdir_xl; + priv = this->private; + shd = &priv->shd; - ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path); - if (ret) - goto out; + dump_data.this = this; + dump_data.dict = dict; + dump_data.child = child; + eh_dump (shd->statistics[child], &dump_data, + _add_crawl_event_statistics_to_dict); + return 0; - ret = _add_path_to_dict (this, output, crawl_data->child, path, NULL, - _gf_true); -out: - if (ret && path) - GF_FREE (path); - return ret; } void @@ -294,30 +447,108 @@ out: return; } +int +_count_hard_links_under_base_indices_dir (xlator_t *this, + afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, loc_t *childloc, + loc_t *parentloc, struct iatt *iattr) +{ + xlator_t *readdir_xl = crawl_data->readdir_xl; + struct iatt parent = {0}; + int ret = 0; + dict_t *output = NULL; + int xl_id = 0; + char key[256] = {0}; + int child = -1; + uint64_t hardlinks = 0; + + output = crawl_data->op_data; + child = crawl_data->child; + + ret = syncop_lookup (readdir_xl, childloc, NULL, iattr, NULL, &parent); + if (ret) + goto out; + + ret = dict_get_int32 (output, this->name, &xl_id); + if (ret) + goto out; + + snprintf (key, sizeof (key), "%d-%d-hardlinks", xl_id, child); + ret = dict_get_uint64 (output, key, &hardlinks); + + /*Removing the count of base_entry under indices/base_indicies and + * entry under indices/xattrop */ + hardlinks = hardlinks + iattr->ia_nlink - 2; + ret = dict_set_uint64 (output, key, hardlinks); + if (ret) + goto out; + +out: + return ret; +} + +int +_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data, + gf_dirent_t *entry, + loc_t *childloc, loc_t *parentloc, struct iatt *iattr) +{ + dict_t *output = NULL; + xlator_t *readdir_xl = NULL; + int ret = -1; + char *path = NULL; + gf_boolean_t missing = _gf_false; + char gfid_str[64] = {0}; + + if (uuid_is_null (childloc->gfid)) + goto out; + + output = crawl_data->op_data; + readdir_xl = crawl_data->readdir_xl; + + ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path, + &missing); + if (ret == 0) { + ret = _add_path_to_dict (this, output, crawl_data->child, path, + NULL, _gf_true); + } else if (missing) { + _remove_stale_index (this, readdir_xl, parentloc, + uuid_utoa_r (childloc->gfid, gfid_str)); + } + +out: + if (ret && path) + GF_FREE (path); + return ret; +} + void _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp, afr_crawl_data_t *crawl_data) { - int ret = 0; - afr_private_t *priv = NULL; - afr_self_heald_t *shd = NULL; - eh_t *eh = NULL; - char *path = NULL; - shd_event_t *event = NULL; - int32_t sh_failed = 0; - gf_boolean_t split_brain = 0; - int32_t actual_sh_done = 0; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + eh_t *eh = NULL; + char *path = NULL; + char gfid_str[64] = {0}; + shd_event_t *event = NULL; + int32_t sh_failed = 0; + gf_boolean_t split_brain = 0; + int32_t actual_sh_done = 0; + shd_crawl_event_t **shd_crawl_event = NULL; + priv = this->private; shd = &priv->shd; if (crawl_data->crawl == INDEX) { if ((op_ret < 0) && (op_errno == ENOENT)) { _remove_stale_index (this, crawl_data->readdir_xl, - parent, uuid_utoa (child->gfid)); + parent, uuid_utoa_r (child->gfid, + gfid_str)); goto out; } ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl, - child, &path); + child, &path, NULL); if (ret) goto out; } else { @@ -333,16 +564,19 @@ _crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child, ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done); } - split_brain = afr_is_split_brain (this, child->inode); + shd_crawl_event = (shd_crawl_event_t**)(shd->crawl_events); + split_brain = afr_is_split_brain (this, child->inode); if ((op_ret < 0 && op_errno == EIO) || split_brain) { eh = shd->split_brain; + shd_crawl_event[crawl_data->child]->split_brain_count += 1; } else if ((op_ret < 0) || sh_failed) { eh = shd->heal_failed; + shd_crawl_event[crawl_data->child]->heal_failed_count += 1; } else if (actual_sh_done == 1) { - eh = shd->healed; + eh = shd->healed; + shd_crawl_event[crawl_data->child]->healed_count += 1; } - ret = -1; if (eh != NULL) { @@ -373,21 +607,56 @@ out: } int +_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr) +{ + inode_t *link_inode = NULL; + int ret = -1; + + link_inode = inode_link (loc->inode, NULL, NULL, iattr); + if (link_inode == NULL) { + gf_log (this->name, GF_LOG_ERROR, "inode link failed " + "on the inode (%s)", uuid_utoa (iattr->ia_gfid)); + goto out; + } + inode_unref (loc->inode); + loc->inode = link_inode; + ret = 0; +out: + return ret; +} + +int _self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, loc_t *child, loc_t *parent, struct iatt *iattr) { struct iatt parentbuf = {0}; int ret = 0; dict_t *xattr_rsp = NULL; + dict_t *xattr_req = NULL; + + xattr_req = dict_new (); + if (!xattr_req) { + errno = ENOMEM; + ret = -1; + goto out; + } + + ret = dict_set_int32 (xattr_req, "allow-sh-for-running-transaction", 1); gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path); - ret = syncop_lookup (this, child, NULL, + ret = syncop_lookup (this, child, xattr_req, iattr, &xattr_rsp, &parentbuf); _crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp, crawl_data); if (xattr_rsp) dict_unref (xattr_rsp); + if (ret == 0) + ret = _link_inode_update_loc (this, child, iattr); + +out: + if (xattr_req) + dict_unref(xattr_req); return ret; } @@ -496,12 +765,20 @@ _do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl, status = "Started self-heal"; _do_self_heal_on_subvol (this, i, crawl); - } else if (output) { + } else if (output && (op == INFO)) { status = ""; afr_start_crawl (this, i, INDEX, _add_summary_to_dict, output, _gf_false, 0, NULL); + } else if (output && + (op == STATISTICS_TO_BE_HEALED)) { + status = ""; + afr_start_crawl (this, i, + INDEX_TO_BE_HEALED, + _count_hard_links_under_base_indices_dir, + output, _gf_false, + 0, NULL); } } if (output) { @@ -535,8 +812,103 @@ _get_index_summary_on_local_subvols (xlator_t *this, dict_t *output) return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output); } +void +afr_fill_completed_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd= &priv->shd; + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + _add_statistics_to_dict (this, dict, i); + } + + return ; +} + +static void +reset_crawl_event (shd_crawl_event_t *crawl_event) +{ + crawl_event->healed_count = 0; + crawl_event->split_brain_count = 0; + crawl_event->heal_failed_count = 0; + GF_FREE (crawl_event->start_time_str); + crawl_event->start_time_str = NULL; + crawl_event->end_time_str = NULL; + crawl_event->crawl_type = NULL; + crawl_event->crawl_inprogress = _gf_false; + return; +} + +static void +afr_copy_crawl_event_struct (shd_crawl_event_t *src, shd_crawl_event_t *dst) +{ + dst->healed_count = src->healed_count; + dst->split_brain_count = src->split_brain_count; + dst->heal_failed_count = src->heal_failed_count; + dst->start_time_str = gf_strdup (src->start_time_str); + dst->end_time_str = "Crawl is already in progress"; + dst->crawl_type = src->crawl_type; + dst->crawl_inprogress = _gf_true; + return; +} + +static int +afr_fill_crawl_statistics_of_running_crawl(xlator_t *this, dict_t *dict) +{ + shd_crawl_event_t *evnt = NULL; + int ret = 0; + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int i = 0; + priv = this->private; + shd = &priv->shd; + + evnt = GF_CALLOC (1, sizeof (shd_crawl_event_t), + gf_afr_mt_shd_crawl_event_t); + if (!evnt) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (shd->pos[i] != AFR_POS_LOCAL) + continue; + + reset_crawl_event (evnt); + + if (!shd->crawl_events[i]) { + continue; + } + + afr_copy_crawl_event_struct (shd->crawl_events[i], + evnt); + _add_crawl_stats_to_dict (this, dict, i, evnt, NULL); + + } + } + UNLOCK (&priv->lock); + reset_crawl_event (evnt); + GF_FREE (evnt); + +out: + return ret; +} + +static int +_add_local_subvols_crawl_statistics_to_dict (xlator_t *this, dict_t *dict) +{ + int ret = 0; + afr_fill_completed_crawl_statistics_to_dict (this, dict); + ret = afr_fill_crawl_statistics_of_running_crawl (this, dict); + return ret; +} int -_add_all_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) +_add_local_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict) { afr_private_t *priv = NULL; afr_self_heald_t *shd = NULL; @@ -586,16 +958,25 @@ afr_xl_op (xlator_t *this, dict_t *input, dict_t *output) ret = 0; break; case GF_AFR_OP_HEALED_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->healed, output); + ret = _add_local_subvols_eh_to_dict (this, shd->healed, output); break; case GF_AFR_OP_HEAL_FAILED_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->heal_failed, + ret = _add_local_subvols_eh_to_dict (this, shd->heal_failed, output); break; case GF_AFR_OP_SPLIT_BRAIN_FILES: - ret = _add_all_subvols_eh_to_dict (this, shd->split_brain, + ret = _add_local_subvols_eh_to_dict (this, shd->split_brain, output); break; + case GF_AFR_OP_STATISTICS: + ret = _add_local_subvols_crawl_statistics_to_dict (this, output); + break; + case GF_AFR_OP_STATISTICS_HEAL_COUNT: + case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA: + ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED, + STATISTICS_TO_BE_HEALED, + output); + break; default: gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op); break; @@ -610,7 +991,7 @@ afr_poll_self_heal (void *data) { afr_private_t *priv = NULL; afr_self_heald_t *shd = NULL; - struct timeval timeout = {0}; + struct timespec timeout = {0}; xlator_t *this = NULL; long child = (long)data; gf_timer_t *old_timer = NULL; @@ -634,7 +1015,7 @@ afr_poll_self_heal (void *data) if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL)) _do_self_heal_on_subvol (this, child, INDEX); timeout.tv_sec = shd->timeout; - timeout.tv_usec = 0; + timeout.tv_nsec = 0; //notify and previous timer should be synchronized. LOCK (&priv->lock); { @@ -733,25 +1114,6 @@ out: } int -_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr) -{ - inode_t *link_inode = NULL; - int ret = -1; - - link_inode = inode_link (loc->inode, NULL, NULL, iattr); - if (link_inode == NULL) { - gf_log (this->name, GF_LOG_ERROR, "inode link failed " - "on the inode (%s)", uuid_utoa (iattr->ia_gfid)); - goto out; - } - inode_unref (loc->inode); - loc->inode = link_inode; - ret = 0; -out: - return ret; -} - -int afr_local_pathinfo (char *pathinfo, gf_boolean_t *local) { int ret = 0; @@ -787,6 +1149,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, afr_private_t *priv = NULL; dict_t *xattr = NULL; void *index_gfid = NULL; + void *base_indices_holder_vgfid = NULL; loc_t rootloc = {0}; struct iatt iattr = {0}; struct iatt parent = {0}; @@ -796,7 +1159,7 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, priv = this->private; if (crawl_data->crawl == FULL) { afr_build_root_loc (this, dirloc); - } else { + } else if (crawl_data->crawl == INDEX) { afr_build_root_loc (this, &rootloc); ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, GF_XATTROP_INDEX_GFID); @@ -830,6 +1193,47 @@ afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data, ret = _link_inode_update_loc (this, dirloc, &iattr); if (ret) goto out; + } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + afr_build_root_loc (this, &rootloc); + ret = syncop_getxattr (readdir_xl, &rootloc, &xattr, + GF_BASE_INDICES_HOLDER_GFID); + if (ret < 0) + goto out; + ret = dict_get_ptr (xattr, GF_BASE_INDICES_HOLDER_GFID, + &base_indices_holder_vgfid); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "index gfid empty " + "on %s", readdir_xl->name); + ret = -1; + goto out; + } + if (!base_indices_holder_vgfid) { + gf_log (this->name, GF_LOG_ERROR, "Base indices holder" + "virtual gfid is null on %s", readdir_xl->name); + ret = -1; + goto out; + } + uuid_copy (dirloc->gfid, base_indices_holder_vgfid); + dirloc->path = ""; + dirloc->inode = inode_new (priv->root_inode->table); + ret = syncop_lookup (readdir_xl, dirloc, NULL, &iattr, NULL, + &parent); + if (ret < 0) { + if (errno != ENOENT) { + gf_log (this->name, GF_LOG_ERROR, "lookup " + "failed for base_indices_holder dir" + " on %s - (%s)", readdir_xl->name, + strerror (errno)); + + } else { + gf_log (this->name, GF_LOG_ERROR, "base_indices" + "_holder is not yet created."); + } + goto out; + } + ret = _link_inode_update_loc (this, dirloc, &iattr); + if (ret) + goto out; } ret = 0; out: @@ -894,6 +1298,16 @@ afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, priv = this->private; if (crawl_data->crawl == FULL) { ret = afr_build_child_loc (this, child, parent, entry->d_name); + } else if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + ret = _build_index_loc (this, child, entry->d_name, parent); + if (ret) + goto out; + child->inode = inode_new (priv->root_inode->table); + if (!child->inode) { + ret = -1; + goto out; + } + child->path = NULL; } else { child->inode = inode_new (priv->root_inode->table); if (!child->inode) @@ -943,13 +1357,14 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, ret = crawl_data->process_entry (this, crawl_data, entry, &entry_loc, parentloc, &iattr); - if (ret) + if (crawl_data->crawl == INDEX_TO_BE_HEALED && ret) { + goto out; + } else if (ret) { continue; + } - ret = _link_inode_update_loc (this, &entry_loc, &iattr); - if (ret) - goto out; - if (crawl_data->crawl == INDEX) + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED)) continue; if (!IA_ISDIR (iattr.ia_type)) @@ -964,6 +1379,10 @@ _process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries, } ret = 0; out: + if ((crawl_data->crawl == INDEX_TO_BE_HEALED) && ret) { + gf_log (this->name, GF_LOG_ERROR,"Failed to get the hardlink " + "count"); + } loc_wipe (&entry_loc); return ret; } @@ -1011,6 +1430,9 @@ _crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data) ret = _process_entries (this, loc, &entries, &offset, crawl_data); + if ((ret < 0) && (crawl_data->crawl == INDEX_TO_BE_HEALED)) { + goto out; + } gf_dirent_free (&entries); free_entries = _gf_false; } @@ -1052,7 +1474,7 @@ afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos) ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp, GF_XATTR_NODE_UUID_KEY); - if (ret) { + if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - " "(%s)", priv->children[child]->name, strerror (errno)); goto out; @@ -1116,8 +1538,13 @@ afr_dir_crawl (void *data) goto out; ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc); - if (ret) + if (ret) { + if (crawl_data->crawl == INDEX_TO_BE_HEALED) { + gf_log (this->name, GF_LOG_ERROR, "Failed to open base_" + "indices_holder"); + } goto out; + } ret = _crawl_directory (fd, &dirloc, crawl_data); if (ret) @@ -1131,12 +1558,102 @@ afr_dir_crawl (void *data) out: if (fd) fd_unref (fd); - if (crawl_data->crawl == INDEX) + if ((crawl_data->crawl == INDEX) || + (crawl_data->crawl == INDEX_TO_BE_HEALED )) dirloc.path = NULL; loc_wipe (&dirloc); return ret; } +char * +get_crawl_type_in_string (afr_crawl_type_t crawl) +{ + char *index = "INDEX"; + char *full = "FULL"; + char *crawl_type = NULL; + + if (crawl == INDEX){ + crawl_type = index; + } else if (crawl == FULL) { + crawl_type = full; + } + + return crawl_type; +} + +static int +afr_allocate_crawl_event (xlator_t *this, int child, afr_crawl_type_t crawl) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = 0; + shd_crawl_event_t *crawl_event = NULL; + time_t get_time = 0; + + priv = this->private; + shd = &priv->shd; + + crawl_event = GF_CALLOC (sizeof (shd_crawl_event_t), 1, + gf_afr_mt_shd_crawl_event_t); + if (!crawl_event) { + ret = -1; + goto out; + } + + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + + crawl_event->start_time_str = gf_strdup (ctime(&get_time)); + + crawl_event->crawl_type = get_crawl_type_in_string (crawl); + if (!crawl_event->crawl_type) { + ret = -1; + goto out; + } + LOCK (&priv->lock); + { + shd->crawl_events[child] = crawl_event; + } + UNLOCK (&priv->lock); + ret = 0; +out: + return ret; + +} + +static int +afr_put_crawl_event_in_eh (xlator_t *this, int child) +{ + afr_private_t *priv = NULL; + afr_self_heald_t *shd = NULL; + int ret = 0; + time_t get_time = 0; + shd_crawl_event_t **crawl_event = NULL; + + priv = this->private; + shd = &priv->shd; + + get_time = time(NULL); + if (get_time == ((time_t)-1)) { + ret = -1; + goto out; + } + crawl_event = (shd_crawl_event_t**)shd->crawl_events; + LOCK (&priv->lock); + { + crawl_event[child]->end_time_str = gf_strdup (ctime(&get_time)); + ret = eh_save_history (shd->statistics[child], + crawl_event[child]); + crawl_event[child] = NULL; + } + UNLOCK (&priv->lock); +out: + return ret; +} + static int afr_dir_exclusive_crawl (void *data) { @@ -1172,7 +1689,15 @@ afr_dir_exclusive_crawl (void *data) } do { + ret = afr_allocate_crawl_event (this, child, crawl_data->crawl); + if (ret) + goto out; afr_dir_crawl (data); + + ret = afr_put_crawl_event_in_eh (this, child); + if (ret < 0) + goto out; + LOCK (&priv->lock); { if (shd->pending[child] != NONE) { @@ -1229,8 +1754,8 @@ afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl, ret = synctask_new (this->ctx->env, crawler, crawl_done, frame, crawl_data); if (ret) - gf_log (this->name, GF_LOG_ERROR, "Could not create the " - "task for %d ret %d", idx, ret); + gf_log (this->name, GF_LOG_ERROR, "afr crawl failed for child" + " %d with ret %d", idx, ret); out: return; } @@ -1260,4 +1785,3 @@ afr_set_root_gfid (dict_t *dict) return ret; } - diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h index 32a8aaca5..e0c083754 100644 --- a/xlators/cluster/afr/src/afr-self-heald.h +++ b/xlators/cluster/afr/src/afr-self-heald.h @@ -29,6 +29,19 @@ typedef struct afr_crawl_data_ { struct iatt *iattr); } afr_crawl_data_t; +typedef struct crawl_event_stats_ { + uint64_t healed_count; + uint64_t split_brain_count; + uint64_t heal_failed_count; + char *start_time_str; + char *end_time_str; + char *crawl_type; + gf_boolean_t crawl_inprogress; +} shd_crawl_event_t; + +void _destroy_crawl_event_data (void *data); +void _destroy_shd_event_data (void *data); + typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry, loc_t *child, loc_t *parent, struct iatt *iattr); diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c index 5e636d8dc..20306e469 100644 --- a/xlators/cluster/afr/src/afr-transaction.c +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -79,7 +79,7 @@ afr_save_lk_owner (call_frame_t *frame) local = frame->local; - local->saved_lk_owner = frame->root->lk_owner; + local->saved_lk_owner = frame->root->lk_owner; } @@ -90,10 +90,9 @@ afr_restore_lk_owner (call_frame_t *frame) local = frame->local; - frame->root->lk_owner = local->saved_lk_owner; + frame->root->lk_owner = local->saved_lk_owner; } - static void __mark_all_pending (int32_t *pending[], int child_count, afr_transaction_type type) @@ -146,34 +145,6 @@ out: return; } - -static void -__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index) -{ - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - - local = frame->local; - - if (!local->fd) - return; - - fd_ctx = afr_fd_ctx_get (local->fd, this); - - if (!fd_ctx) - goto out; - - LOCK (&local->fd->lock); - { - if (local->transaction.type == AFR_DATA_TRANSACTION) - fd_ctx->pre_op_done[child_index]--; - } - UNLOCK (&local->fd->lock); -out: - return; -} - - static void __mark_non_participant_children (int32_t *pending[], int child_count, unsigned char *participants, @@ -190,7 +161,7 @@ __mark_non_participant_children (int32_t *pending[], int child_count, } -static void +void __mark_all_success (int32_t *pending[], int child_count, afr_transaction_type type) { @@ -218,9 +189,12 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) { afr_local_t *local = NULL; afr_private_t *priv = NULL; + fd_t *fd = NULL; local = frame->local; priv = this->private; + fd = local->fd; + __mark_all_success (local->pending, priv->child_count, local->transaction.type); @@ -234,6 +208,17 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) frame->root->lk_owner = local->transaction.main_frame->root->lk_owner; + + /* The wake up needs to happen independent of + what type of fop arrives here. If it was + a write, then it has already inherited the + lock and changelog. If it was not a write, + then the presumption of the optimization (of + optimizing for successive write operations) + fails. + */ + if (fd) + afr_delayed_changelog_wake_up (this, fd); local->transaction.fop (frame, this); } @@ -382,6 +367,11 @@ afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, UNLOCK (&frame->lock); if (call_count == 0) { + if (local->transaction.resume_stub) { + call_resume (local->transaction.resume_stub); + local->transaction.resume_stub = NULL; + } + if (afr_lock_server_count (priv, local->transaction.type) == 0) { local->transaction.done (frame, this); } else { @@ -452,19 +442,37 @@ out: return; } +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +{ + afr_inodelk_t *inodelk = NULL; + int i = 0; + + for (i = 0; int_lock->inodelk[i].domain; i++) { + inodelk = &int_lock->inodelk[i]; + if (strcmp (dom, inodelk->domain) == 0) + return inodelk; + } + return NULL; +} + unsigned char* afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) { unsigned char *locked_nodes = NULL; + afr_inodelk_t *inodelk = NULL; switch (type) { case AFR_DATA_TRANSACTION: case AFR_METADATA_TRANSACTION: - locked_nodes = int_lock->inode_locked_nodes; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + locked_nodes = inodelk->locked_nodes; break; case AFR_ENTRY_TRANSACTION: case AFR_ENTRY_RENAME_TRANSACTION: - locked_nodes = int_lock->entry_locked_nodes; + /*Because same set of subvols participate in all lockee + * entities*/ + locked_nodes = int_lock->lockee[0].locked_nodes; break; } return locked_nodes; @@ -556,6 +564,82 @@ afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr, "failed to set pending entry"); } + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int index = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + index = afr_index_for_transaction_type (local->transaction.type); + + for (i = 0; i < priv->child_count; i++) { + if (local->pending[i][index] == 0) + return _gf_false; + } + + return _gf_true; +} + +static void +afr_dir_fop_handle_all_fop_failures (call_frame_t *frame) +{ + xlator_t *this = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + this = frame->this; + local = frame->local; + priv = this->private; + + if ((local->transaction.type != AFR_ENTRY_TRANSACTION) && + (local->transaction.type != AFR_ENTRY_RENAME_TRANSACTION)) + return; + + if (local->op_ret >= 0) + goto out; + + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +out: + return; +} + +static void +afr_data_handle_quota_errors (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + gf_boolean_t all_quota_failures = _gf_false; + + local = frame->local; + priv = this->private; + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + /* + * Idea is to not leave the file in FOOL-FOOL scenario in case on + * all the bricks data transaction failed with EDQUOT to avoid + * increasing un-necessary load of self-heals in the system. + */ + all_quota_failures = _gf_true; + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + (local->child_errno[i] != EDQUOT)) { + all_quota_failures = _gf_false; + break; + } + } + if (all_quota_failures) + __mark_all_success (local->pending, priv->child_count, + local->transaction.type); +} + int afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) { @@ -568,7 +652,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) afr_fd_ctx_t *fdctx = NULL; dict_t **xattr = NULL; int piggyback = 0; - int index = 0; int nothing_failed = 1; local = frame->local; @@ -578,6 +661,9 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) local->transaction.pre_op, local->transaction.type); + afr_data_handle_quota_errors (frame, this); + afr_dir_fop_handle_all_fop_failures (frame); + if (local->fd) afr_transaction_rm_stale_children (frame, this, local->fd->inode, @@ -604,15 +690,7 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) goto out; } - /* check if something has failed, to handle piggybacking */ - nothing_failed = 1; - index = afr_index_for_transaction_type (local->transaction.type); - for (i = 0; i < priv->child_count; i++) { - if (local->pending[i][index] == 0) { - nothing_failed = 0; - break; - } - } + nothing_failed = afr_txn_nothing_failed (frame, this); afr_compute_txn_changelog (local , priv); @@ -638,15 +716,14 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) break; } - LOCK (&local->fd->lock); - { - piggyback = 0; - if (fdctx->pre_op_piggyback[i]) { - fdctx->pre_op_piggyback[i]--; - piggyback = 1; - } - } - UNLOCK (&local->fd->lock); + /* local->transaction.postop_piggybacked[] was + precomputed in is_piggyback_postop() when called from + afr_changelog_post_op_safe() + */ + + piggyback = 0; + if (local->transaction.postop_piggybacked[i]) + piggyback = 1; afr_set_postop_dict (local, this, xattr[i], piggyback, i); @@ -655,7 +732,6 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) afr_changelog_post_op_cbk (frame, (void *)(long)i, this, 1, 0, xattr[i], NULL); } else { - __mark_pre_op_undone_on_fd (frame, this, i); STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, (void *) (long) i, @@ -901,7 +977,7 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) } UNLOCK (&local->fd->lock); - afr_set_delayed_post_op (frame, this); + afr_set_delayed_post_op (frame, this); if (piggyback) afr_changelog_pre_op_cbk (frame, (void *)(long)i, @@ -1174,12 +1250,14 @@ int afr_set_transaction_flock (afr_local_t *local) { afr_internal_lock_t *int_lock = NULL; + afr_inodelk_t *inodelk = NULL; int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); - int_lock->lk_flock.l_len = local->transaction.len; - int_lock->lk_flock.l_start = local->transaction.start; - int_lock->lk_flock.l_type = F_WRLCK; + inodelk->flock.l_len = local->transaction.len; + inodelk->flock.l_start = local->transaction.start; + inodelk->flock.l_type = F_WRLCK; return 0; } @@ -1194,6 +1272,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) int_lock = &local->internal_lock; int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; switch (local->transaction.type) { case AFR_DATA_TRANSACTION: @@ -1207,8 +1286,8 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) case AFR_ENTRY_RENAME_TRANSACTION: - int_lock->lock_cbk = afr_post_blocking_rename_cbk; - afr_blocking_lock (frame, this); + int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk; + afr_nonblocking_entrylk (frame, this); break; case AFR_ENTRY_TRANSACTION: @@ -1254,84 +1333,384 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) void afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - /* call this function from any of the related optimizations - which benefit from delaying post op are enabled, namely: + /* call this function from any of the related optimizations + which benefit from delaying post op are enabled, namely: - - changelog piggybacking - - eager locking - */ + - changelog piggybacking + - eager locking + */ - priv = this->private; - if (!priv) - return; + priv = this->private; + if (!priv) + return; - if (!priv->post_op_delay_secs) - return; + if (!priv->post_op_delay_secs) + return; - local = frame->local; - if (!local) - return; + local = frame->local; + if (!local->transaction.eager_lock_on) + return; - if (!local->fd) - return; + if (!local) + return; - if (local->op == GF_FOP_WRITE) - local->delayed_post_op = _gf_true; + if (!local->fd) + return; + + if (local->op == GF_FOP_WRITE) + local->delayed_post_op = _gf_true; } +gf_boolean_t +afr_are_multiple_fds_opened (inode_t *inode, xlator_t *this) +{ + afr_inode_ctx_t *ictx = NULL; + + if (!inode) { + /* If false is returned, it may keep on taking eager-lock + * which may lead to starvation, so return true to avoid that. + */ + gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid inode"); + return _gf_true; + } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. + * To avoid such scenario do not enable eager-lock for this transaction + * if open-fd-count is > 1 + */ + + ictx = afr_inode_ctx_get (inode, this); + if (!ictx) + return _gf_true; + + if (ictx->open_fd_count > 1) + return _gf_true; + + return _gf_false; +} + +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv) +{ + if (local->success_count != priv->child_count) + return _gf_true; + return _gf_false; +} gf_boolean_t is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - gf_boolean_t res = _gf_false; + afr_local_t *local = NULL; + gf_boolean_t res = _gf_false; + afr_private_t *priv = NULL; - local = frame->local; - if (!local) - goto out; + priv = this->private; + + local = frame->local; + if (!local) + goto out; - if (!local->delayed_post_op) - goto out; + if (!local->delayed_post_op) + goto out; - res = _gf_true; + //Mark pending changelog ASAP + if (afr_any_fops_failed (local, priv)) + goto out; + + if (local->fd && afr_are_multiple_fds_opened (local->fd->inode, this)) + goto out; + + res = _gf_true; out: - return res; + return res; } void -afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd); +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub); void afr_delayed_changelog_wake_up_cbk (void *data) { - fd_t *fd = NULL; + fd_t *fd = NULL; - fd = data; + fd = data; - afr_delayed_changelog_wake_up (THIS, fd); + afr_delayed_changelog_wake_up (THIS, fd); +} + + +/* + Check if the frame is destined to get optimized away + with changelog piggybacking +*/ +static gf_boolean_t +is_piggyback_post_op (call_frame_t *frame, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *local = NULL; + gf_boolean_t piggyback = _gf_true; + afr_private_t *priv = NULL; + int i = 0; + + priv = frame->this->private; + local = frame->local; + fdctx = afr_fd_ctx_get (fd, frame->this); + + LOCK(&fd->lock); + { + piggyback = _gf_true; + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + if (fdctx->pre_op_piggyback[i]) { + fdctx->pre_op_piggyback[i]--; + local->transaction.postop_piggybacked[i] = 1; + } else { + /* For at least _one_ subvolume we cannot + piggyback on the changelog, and have to + perform a hard POST-OP and therefore fsync + if necesssary + */ + piggyback = _gf_false; + GF_ASSERT (fdctx->pre_op_done[i]); + fdctx->pre_op_done[i]--; + } + } + } + UNLOCK(&fd->lock); + + if (!afr_txn_nothing_failed (frame, frame->this)) { + /* something failed in this transaction, + we will be performing a hard post-op + */ + return _gf_false; + } + + return piggyback; +} + + +/* SET operation */ +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + + fdctx = afr_fd_ctx_get (fd, this); + + LOCK(&fd->lock); + { + fdctx->witnessed_unstable_write = _gf_true; + } + UNLOCK(&fd->lock); + + return 0; +} + +/* TEST and CLEAR operation */ +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) +{ + afr_fd_ctx_t *fdctx = NULL; + gf_boolean_t witness = _gf_false; + + fdctx = afr_fd_ctx_get (fd, this); + if (!fdctx) + return _gf_true; + + LOCK(&fd->lock); + { + if (fdctx->witnessed_unstable_write) { + witness = _gf_true; + fdctx->witnessed_unstable_write = _gf_false; + } + } + UNLOCK (&fd->lock); + + return witness; +} + + +int +afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) +{ + afr_private_t *priv = NULL; + int child_index = (long) cookie; + int call_count = -1; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (afr_fop_failed (op_ret, op_errno)) { + /* Failure of fsync() is as good as failure of previous + write(). So treat it like one. + */ + gf_log (this->name, GF_LOG_WARNING, + "fsync(%s) failed on subvolume %s. Transaction was %s", + uuid_utoa (local->fd->inode->gfid), + priv->children[child_index]->name, + gf_fop_list[local->op]); + + afr_transaction_fop_failed (frame, this, child_index); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_changelog_post_op_now (frame, this); + + return 0; +} + + +int +afr_changelog_fsync (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + GF_UNUSED int ret = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_pre_op_done_children_count (local->transaction.pre_op, + priv->child_count); + + if (!call_count) { + /* will go straight to unlock */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + local->call_count = call_count; + + xdata = dict_new(); + if (xdata) + ret = dict_set_int32 (xdata, "batch-fsync", 1); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.pre_op[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk, + (void *) (long) i, priv->children[i], + priv->children[i]->fops->fsync, local->fd, + 1, xdata); + if (!--call_count) + break; + } + + if (xdata) + dict_unref (xdata); + + return 0; +} + + + int +afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + if (is_piggyback_post_op (frame, local->fd)) { + /* just detected that this post-op is about to + be optimized away as a new write() has + already piggybacked on this frame's changelog. + */ + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Calling afr_changelog_post_op_now() now will result in + issuing ->[f]xattrop(). + + Performing a hard POST-OP (->[f]xattrop() FOP) is a more + responsible operation that what it might appear on the surface. + + The changelog of a file (in the xattr of the file on the server) + stores information (pending count) about the state of the file + on the OTHER server. This changelog is blindly trusted, and must + therefore be updated in such a way it remains trustworthy. This + implies that decrementing the pending count (essentially "clearing + the dirty flag") must be done STRICTLY after we are sure that the + operation on the other server has reached stable storage. + + While the backend filesystem on that server will eventually flush + it to stable storage, we (being in userspace) have no mechanism + to get notified when the write became "stable". + + This means we need take matter into our own hands and issue an + fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES, + and get an acknowledgement for it. And we need to wait for the + fsync() acknowledgement before initiating the hard POST-OP. + + However if the FD itself was opened in O_SYNC or O_DSYNC then + we are already guaranteed that the writes were made stable as + part of the FOP itself. The same holds true for NFS stable + writes which happen on an anonymous FD with O_DSYNC or O_SYNC + flag set in the writev() @flags param. For all other write types, + mark a flag in the fdctx whenever an unstable write is witnessed. + */ + + if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { + afr_changelog_post_op_now (frame, this); + return 0; + } + + /* Check whether users want durability and perform fsync/post-op + * accordingly. + */ + if (priv->ensure_durability) { + /* Time to fsync() */ + afr_changelog_fsync (frame, this); + } else { + afr_changelog_post_op_now (frame, this); + } + + return 0; } void -afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd) +afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, + call_stub_t *stub) { afr_fd_ctx_t *fd_ctx = NULL; call_frame_t *prev_frame = NULL; - struct timeval delta = {0, }; + struct timespec delta = {0, }; afr_private_t *priv = NULL; + afr_local_t *local = NULL; priv = this->private; fd_ctx = afr_fd_ctx_get (fd, this); if (!fd_ctx) - return; + goto out; delta.tv_sec = priv->post_op_delay_secs; - delta.tv_usec = 0; + delta.tv_nsec = 0; pthread_mutex_lock (&fd_ctx->delay_lock); { @@ -1350,8 +1729,13 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd) unlock: pthread_mutex_unlock (&fd_ctx->delay_lock); +out: if (prev_frame) { - afr_changelog_post_op_now (prev_frame, this); + local = prev_frame->local; + local->transaction.resume_stub = stub; + afr_changelog_post_op_safe (prev_frame, this); + } else if (stub) { + call_resume (stub); } } @@ -1359,49 +1743,62 @@ unlock: void afr_changelog_post_op (call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (is_afr_delayed_changelog_post_op_needed (frame, this)) - afr_delayed_changelog_post_op (this, frame, local->fd); - else - afr_changelog_post_op_now (frame, this); + if (is_afr_delayed_changelog_post_op_needed (frame, this)) + afr_delayed_changelog_post_op (this, frame, local->fd, NULL); + else + afr_changelog_post_op_safe (frame, this); +} + + + +/* Wake up the sleeping/delayed post-op, and also register + a stub to have it resumed after this transaction + completely finishes. + + The @stub gets saved in @local and gets resumed in + afr_local_cleanup() + */ +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +{ + afr_delayed_changelog_post_op (this, NULL, fd, stub); } void afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) { - afr_delayed_changelog_post_op (this, NULL, fd); + afr_delayed_changelog_post_op (this, NULL, fd, NULL); } -int + int afr_transaction_resume (call_frame_t *frame, xlator_t *this) { afr_internal_lock_t *int_lock = NULL; afr_local_t *local = NULL; afr_private_t *priv = NULL; - fd_t *fd = NULL; local = frame->local; int_lock = &local->internal_lock; priv = this->private; - fd = local->fd; - if (fd) - /* The wake up needs to happen independent of - what type of fop arrives here. If it was - a write, then it has already inherited the - lock and changelog. If it was not a write, - then the presumption of the optimization (of - optimizing for successive write operations) - fails. - */ - afr_delayed_changelog_wake_up (this, fd); + if (local->transaction.eager_lock_on) { + /* We don't need to retain "local" in the + fd list anymore, writes to all subvols + are finished by now */ + LOCK (&local->fd->lock); + { + list_del_init (&local->transaction.eager_locked); + } + UNLOCK (&local->fd->lock); + } - afr_restore_lk_owner (frame); + afr_restore_lk_owner (frame); if (__fop_changelog_needed (frame, this)) { afr_changelog_post_op (frame, this); @@ -1423,7 +1820,8 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) */ void -afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index) +afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index) { afr_local_t * local = NULL; afr_private_t * priv = NULL; @@ -1432,22 +1830,92 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index priv = this->private; __mark_child_dead (local->pending, priv->child_count, - child_index, local->transaction.type); + child_index, local->transaction.type); } -gf_boolean_t -_does_transaction_conflict_with_delayed_post_op (call_frame_t *frame) + + + static gf_boolean_t +afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) { - afr_local_t *local = frame->local; - //check if it is going to compete with inode lock from the fd - if (local->fd) - return _gf_false; - if ((local->transaction.type == AFR_DATA_TRANSACTION) || - (local->transaction.type == AFR_METADATA_TRANSACTION)) - return _gf_true; - return _gf_false; + uint64_t start1 = local1->transaction.start; + uint64_t start2 = local2->transaction.start; + uint64_t end1 = 0; + uint64_t end2 = 0; + + if (local1->transaction.len) + end1 = start1 + local1->transaction.len - 1; + else + end1 = ULLONG_MAX; + + if (local2->transaction.len) + end2 = start2 + local2->transaction.len - 1; + else + end2 = ULLONG_MAX; + + return ((end1 >= start2) && (end2 >= start1)); } +void +afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_fd_ctx_t *fdctx = NULL; + afr_local_t *each = NULL; + + priv = this->private; + + if (!local->fd) + return; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return; + + if (!priv->eager_lock) + return; + + fdctx = afr_fd_ctx_get (local->fd, this); + if (!fdctx) + return; + + if (afr_are_multiple_fds_opened (local->fd->inode, this)) + return; + /* + * Once full file lock is acquired in eager-lock phase, overlapping + * writes do not compete for inode-locks, instead are transferred to the + * next writes. Because of this overlapping writes are not ordered. + * This can cause inconsistencies in replication. + * Example: + * Two overlapping writes w1, w2 are sent in parallel on same fd + * in two threads t1, t2. + * Both threads can execute afr_writev_wind in the following manner. + * t1 winds w1 on brick-0 + * t2 winds w2 on brick-0 + * t2 winds w2 on brick-1 + * t1 winds w1 on brick-1 + * + * This check makes sure the locks are not transferred for + * overlapping writes. + */ + LOCK (&local->fd->lock); + { + list_for_each_entry (each, &fdctx->eager_locked, + transaction.eager_locked) { + if (afr_locals_overlap (each, local)) { + local->transaction.eager_lock_on = _gf_false; + goto unlock; + } + } + + local->transaction.eager_lock_on = _gf_true; + list_add_tail (&local->transaction.eager_locked, + &fdctx->eager_locked); + } +unlock: + UNLOCK (&local->fd->lock); +} + + int afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) { @@ -1459,23 +1927,25 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) local = frame->local; priv = this->private; + local->transaction.resume = afr_transaction_resume; + local->transaction.type = type; + ret = afr_transaction_local_init (local, this); - if (ret < 0) { + if (ret < 0) goto out; - } - local->transaction.resume = afr_transaction_resume; - local->transaction.type = type; + afr_transaction_eager_lock_init (local, this); - if (local->fd && priv->eager_lock && - local->transaction.type == AFR_DATA_TRANSACTION) + if (local->fd && local->transaction.eager_lock_on) afr_set_lk_owner (frame, this, local->fd); else afr_set_lk_owner (frame, this, frame->root); - if (_does_transaction_conflict_with_delayed_post_op (frame) && - local->loc.inode) { + if (!local->transaction.eager_lock_on && local->loc.inode) { fd = fd_lookup (local->loc.inode, frame->root->pid); + if (fd == NULL) + fd = fd_lookup_anonymous (local->loc.inode); + if (fd) { afr_delayed_changelog_wake_up (this, fd); fd_unref (fd); diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h index e95bc5b14..fa626fd0d 100644 --- a/xlators/cluster/afr/src/afr-transaction.h +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -23,6 +23,9 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +afr_inodelk_t* +afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); + int32_t afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); @@ -37,4 +40,12 @@ afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); void afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); +void +__mark_all_success (int32_t *pending[], int child_count, + afr_transaction_type type); +gf_boolean_t +afr_any_fops_failed (afr_local_t *local, afr_private_t *priv); + +gf_boolean_t +afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this); #endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c index bee10fd01..c724eb2ae 100644 --- a/xlators/cluster/afr/src/afr.c +++ b/xlators/cluster/afr/src/afr.c @@ -191,6 +191,8 @@ reconfigure (xlator_t *this, dict_t *options) /* Reset this so we re-discover in case the topology changed. */ GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options, bool, out); + GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options, + bool, out); priv->did_discovery = _gf_false; ret = 0; @@ -335,6 +337,8 @@ init (xlator_t *this) GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out); GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out); + GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool, + out); priv->wait_count = 1; @@ -376,8 +380,6 @@ init (xlator_t *this) AFR_XATTR_PREFIX, trav->xlator->name); if (-1 == ret) { - gf_log (this->name, GF_LOG_ERROR, - "asprintf failed to set pending key"); ret = -ENOMEM; goto out; } @@ -386,6 +388,13 @@ init (xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, + this->name); + if (-1 == ret) { + ret = -ENOMEM; + goto out; + } + priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event), gf_afr_mt_int32_t); if (!priv->last_event) { @@ -430,15 +439,18 @@ init (xlator_t *this) if (!priv->shd.timer) goto out; - priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false); + priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false, + _destroy_shd_event_data); if (!priv->shd.healed) goto out; - priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false); + priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false, + _destroy_shd_event_data); if (!priv->shd.heal_failed) goto out; - priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false); + priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false, + _destroy_shd_event_data); if (!priv->shd.split_brain) goto out; @@ -448,7 +460,9 @@ init (xlator_t *this) priv->root_inode = inode_ref (this->itable->root); GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out); GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out); - + ret = afr_initialise_statistics (this); + if (ret) + goto out; ret = 0; out: return ret; @@ -483,6 +497,9 @@ struct xlator_fops fops = { .finodelk = afr_finodelk, .entrylk = afr_entrylk, .fentrylk = afr_fentrylk, + .fallocate = afr_fallocate, + .discard = afr_discard, + .zerofill = afr_zerofill, /* inode read */ .access = afr_access, @@ -765,5 +782,12 @@ struct volume_options options[] = { .description = "readdir(p) will not failover if this option is off", .default_value = "on", }, + { .key = {"ensure-durability"}, + .type = GF_OPTION_TYPE_BOOL, + .description = "Afr performs fsyncs for transactions if this " + "option is on to make sure the changelogs/data is " + "written to the disk", + .default_value = "on", + }, { .key = {NULL} }, }; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h index da9cc67c6..21064db58 100644 --- a/xlators/cluster/afr/src/afr.h +++ b/xlators/cluster/afr/src/afr.h @@ -28,6 +28,10 @@ #define AFR_XATTR_PREFIX "trusted.afr" #define AFR_PATHINFO_HEADER "REPLICATE:" #define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size" +#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal" + +#define AFR_LOCKEE_COUNT_MAX 3 +#define AFR_DOM_COUNT_MAX 3 struct _pump_private; @@ -58,10 +62,8 @@ typedef enum { AFR_INODE_SET_READ_CTX = 1, AFR_INODE_RM_STALE_CHILDREN, AFR_INODE_SET_OPENDIR_DONE, - AFR_INODE_SET_SPLIT_BRAIN, AFR_INODE_GET_READ_CTX, AFR_INODE_GET_OPENDIR_DONE, - AFR_INODE_GET_SPLIT_BRAIN, } afr_inode_op_t; typedef struct afr_inode_params_ { @@ -75,29 +77,41 @@ typedef struct afr_inode_params_ { } u; } afr_inode_params_t; +typedef enum afr_spb_state { + DONT_KNOW, + SPB, + NO_SPB +} afr_spb_state_t; + typedef struct afr_inode_ctx_ { uint64_t masks; int32_t *fresh_children;//increasing order of latency + afr_spb_state_t mdata_spb; + afr_spb_state_t data_spb; + uint32_t open_fd_count; } afr_inode_ctx_t; typedef enum { NONE, INDEX, + INDEX_TO_BE_HEALED, FULL, } afr_crawl_type_t; typedef struct afr_self_heald_ { - gf_boolean_t enabled; - gf_boolean_t iamshd; - afr_crawl_type_t *pending; - gf_boolean_t *inprogress; - afr_child_pos_t *pos; - gf_timer_t **timer; - eh_t *healed; - eh_t *heal_failed; - eh_t *split_brain; - char *node_uuid; - int timeout; + gf_boolean_t enabled; + gf_boolean_t iamshd; + afr_crawl_type_t *pending; + gf_boolean_t *inprogress; + afr_child_pos_t *pos; + gf_timer_t **timer; + eh_t *healed; + eh_t *heal_failed; + eh_t *split_brain; + eh_t **statistics; + void **crawl_events; + char *node_uuid; + int timeout; } afr_self_heald_t; typedef struct _afr_private { @@ -162,9 +176,38 @@ typedef struct _afr_private { gf_boolean_t did_discovery; gf_boolean_t readdir_failover; uint64_t sh_readdir_size; + gf_boolean_t ensure_durability; + char *sh_domain; } afr_private_t; +typedef enum { + AFR_SELF_HEAL_NOT_ATTEMPTED, + AFR_SELF_HEAL_STARTED, + AFR_SELF_HEAL_FAILED, + AFR_SELF_HEAL_SYNC_BEGIN, +} afr_self_heal_status; + typedef struct { + afr_self_heal_status gfid_or_missing_entry_self_heal; + afr_self_heal_status metadata_self_heal; + afr_self_heal_status data_self_heal; + afr_self_heal_status entry_self_heal; +} afr_sh_status_for_all_type; + +typedef enum { + AFR_SELF_HEAL_ENTRY, + AFR_SELF_HEAL_METADATA, + AFR_SELF_HEAL_DATA, + AFR_SELF_HEAL_GFID_OR_MISSING_ENTRY, + AFR_SELF_HEAL_INVALID = -1, +} afr_self_heal_type; + +typedef enum { + AFR_CHECK_ALL, + AFR_CHECK_SPECIFIC, +} afr_sh_fail_check_type; + +struct afr_self_heal_ { /* External interface: These are variables (some optional) that are set by whoever has triggered self-heal */ @@ -241,10 +284,10 @@ typedef struct { const char *linkname; gf_boolean_t entries_skipped; - int op_failed; gf_boolean_t actual_sh_started; gf_boolean_t sync_done; gf_boolean_t data_lock_held; + gf_boolean_t sh_dom_lock_held; gf_boolean_t eof_reached; fd_t *healing_fd; int file_has_holes; @@ -255,13 +298,17 @@ typedef struct { uint8_t *checksum; afr_post_remove_call_t post_remove_call; - loc_t parent_loc; + char *data_sh_info; + char *metadata_sh_info; + loc_t parent_loc; call_frame_t *orig_frame; call_frame_t *old_loop_frame; gf_boolean_t unwound; afr_sh_algo_private_t *private; + afr_sh_status_for_all_type afr_all_sh_status; + afr_self_heal_type sh_type_in_action; struct afr_sh_algorithm *algo; afr_lock_cbk_t data_lock_success_handler; @@ -272,11 +319,11 @@ typedef struct { int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this); int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this); void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this); - gf_boolean_t mdata_spb; - gf_boolean_t data_spb; call_frame_t *sh_frame; -} afr_self_heal_t; +}; + +typedef struct afr_self_heal_ afr_self_heal_t; typedef enum { AFR_DATA_TRANSACTION, /* truncate, write, ... */ @@ -338,11 +385,31 @@ afr_index_for_transaction_type (afr_transaction_type type) return -1; /* make gcc happy */ } +typedef struct { + loc_t loc; + char *basename; + unsigned char *locked_nodes; + int locked_count; + +} afr_entry_lockee_t; + +int +afr_entry_lockee_cmp (const void *l1, const void *l2); + +typedef struct { + char *domain; /* Domain on which inodelk is taken */ + struct gf_flock flock; + unsigned char *locked_nodes; + int32_t lock_count; +} afr_inodelk_t; typedef struct { loc_t *lk_loc; - struct gf_flock lk_flock; + int lockee_count; + afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + + afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; const char *lk_basename; const char *lower_basename; const char *higher_basename; @@ -351,23 +418,22 @@ typedef struct { unsigned char *locked_nodes; unsigned char *lower_locked_nodes; - unsigned char *inode_locked_nodes; - unsigned char *entry_locked_nodes; selfheal_lk_type_t selfheal_lk_type; transaction_lk_type_t transaction_lk_type; int32_t lock_count; - int32_t inodelk_lock_count; int32_t entrylk_lock_count; uint64_t lock_number; int32_t lk_call_count; int32_t lk_expected_count; + int32_t lk_attempted_count; int32_t lock_op_ret; int32_t lock_op_errno; afr_lock_cbk_t lock_cbk; + char *domain; /* Domain on which inode/entry lock/unlock in progress.*/ } afr_internal_lock_t; typedef struct _afr_locked_fd { @@ -387,9 +453,11 @@ typedef struct _afr_local { unsigned int call_count; unsigned int success_count; unsigned int enoent_count; + uint32_t open_fd_count; + gf_boolean_t update_open_fd_count; - unsigned int govinda_gOvinda; + unsigned int unhealable; unsigned int read_child_index; unsigned char read_child_returned; @@ -406,7 +474,6 @@ typedef struct _afr_local { loc_t newloc; fd_t *fd; - unsigned char *fd_open_on; glusterfs_fop_t fop; @@ -430,12 +497,23 @@ typedef struct _afr_local { int optimistic_change_log; gf_boolean_t delayed_post_op; - gf_boolean_t fop_paused; - int (*fop_call_continue) (call_frame_t *frame, xlator_t *this); - /* - This struct contains the arguments for the "continuation" - (scheme-like) of fops + /* Is the current writev() going to perform a stable write? + i.e, is fd->flags or @flags writev param have O_SYNC or + O_DSYNC? + */ + gf_boolean_t stable_write; + + /* This write appended to the file. Nnot necessarily O_APPEND, + just means the offset of write was at the end of file. + */ + gf_boolean_t append_write; + + int allow_sh_for_running_transaction; + + + /* This struct contains the arguments for the "continuation" + (scheme-like) of fops */ int op; @@ -531,7 +609,9 @@ typedef struct _afr_local { struct { struct iatt prebuf; struct iatt postbuf; + } inode_wfop; //common structure for all inode-write-fops + struct { int32_t op_ret; struct iovec *vector; @@ -542,34 +622,21 @@ typedef struct _afr_local { } writev; struct { - struct iatt prebuf; - struct iatt postbuf; - } fsync; - - struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } truncate; struct { off_t offset; - struct iatt prebuf; - struct iatt postbuf; } ftruncate; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } setattr; struct { struct iatt in_buf; int32_t valid; - struct iatt preop_buf; - struct iatt postop_buf; } fsetattr; struct { @@ -631,11 +698,32 @@ typedef struct _afr_local { dict_t *params; char *linkpath; } symlink; + + struct { + int32_t mode; + off_t offset; + size_t len; + } fallocate; + + struct { + off_t offset; + size_t len; + } discard; + + struct { + off_t offset; + size_t len; + struct iatt prebuf; + struct iatt postbuf; + } zerofill; + + } cont; struct { off_t start, len; + gf_boolean_t eager_lock_on; int *eager_lock; char *basename; @@ -646,6 +734,17 @@ typedef struct _afr_local { afr_transaction_type type; + /* pre-compute the post piggyback status before + entering POST-OP phase + */ + int *postop_piggybacked; + + /* stub to resume on destruction + of the transaction frame */ + call_stub_t *resume_stub; + + struct list_head eager_locked; + int32_t **txn_changelog;//changelog after pre+post ops unsigned char *pre_op; @@ -683,11 +782,6 @@ typedef enum { } afr_fd_open_status_t; typedef struct { - struct list_head call_list; - call_frame_t *frame; -} afr_fd_paused_call_t; - -typedef struct { unsigned int *pre_op_done; afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ unsigned int *pre_op_piggyback; @@ -706,13 +800,20 @@ typedef struct { struct list_head entries; /* needed for readdir failover */ unsigned char *locked_on; /* which subvolumes locks have been successful */ - struct list_head paused_calls; /* queued calls while fix_open happens */ /* used for delayed-post-op optimization */ pthread_mutex_t delay_lock; gf_timer_t *delay_timer; call_frame_t *delay_frame; int call_child; + + /* set if any write on this fd was a non stable write + (i.e, without O_SYNC or O_DSYNC) + */ + gf_boolean_t witnessed_unstable_write; + + /* list of frames currently in progress */ + struct list_head eager_locked; } afr_fd_ctx_t; @@ -748,6 +849,13 @@ int32_t afr_notify (xlator_t *this, int32_t event, void *data, void *data2); int +afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, + loc_t *loc, char *basename, int child_count); + +void +afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock); + +int afr_attempt_lock_recovery (xlator_t *this, int32_t child_index); int @@ -782,8 +890,8 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this); int afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); -void -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, +int +afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, unsigned int child_count); int pump_start (call_frame_t *frame, xlator_t *this); @@ -833,7 +941,8 @@ gf_boolean_t afr_is_split_brain (xlator_t *this, inode_t *inode); void -afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set); +afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb, + afr_spb_state_t data_spb); int afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, @@ -948,8 +1057,9 @@ afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count); void afr_reset_children (int32_t *children, int32_t child_count); -gf_boolean_t -afr_error_more_important (int32_t old_errno, int32_t new_errno); +int32_t +afr_most_important_error(int32_t old_errno, int32_t new_errno, + gf_boolean_t eio); int afr_errno_count (int32_t *children, int *child_errno, unsigned int child_count, int32_t op_errno); @@ -992,11 +1102,11 @@ afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode, int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret, int32_t op_errno, int32_t sh_failed)); -int -afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx, - int need_open_count, int *need_open); -int -afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop); +void +afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open); + +void +afr_open_fd_fix (fd_t *fd, xlator_t *this); int afr_set_elem_count_get (unsigned char *elems, int child_count); @@ -1030,6 +1140,9 @@ afr_is_errno_set (int *child_errno, int child); gf_boolean_t afr_is_errno_unset (int *child_errno, int child); +gf_boolean_t +afr_is_fd_fixable (fd_t *fd); + void afr_prepare_new_entry_pending_matrix (int32_t **pending, gf_boolean_t (*is_pending) (int *, int), @@ -1056,4 +1169,44 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count); } \ } while (0); + +#define AFR_SBRAIN_MSG "Failed on %s as split-brain is seen. Returning EIO." + +#define AFR_SBRAIN_CHECK_FD(fd, label) do { \ + if (fd->inode && afr_is_split_brain (this, fd->inode)) { \ + op_errno = EIO; \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG ,uuid_utoa (fd->inode->gfid)); \ + goto label; \ + } \ +} while (0) + +#define AFR_SBRAIN_CHECK_LOC(loc, label) do { \ + if (loc->inode && afr_is_split_brain (this, loc->inode)) { \ + op_errno = EIO; \ + loc_path (loc, NULL); \ + gf_log (this->name, GF_LOG_WARNING, \ + AFR_SBRAIN_MSG , loc->path); \ + goto label; \ + } \ +} while (0) + +int +afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); + +gf_boolean_t +afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); + +void +afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); + +int +afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); + +void +afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); + +afr_inode_ctx_t* +afr_inode_ctx_get (inode_t *inode, xlator_t *this); + #endif /* __AFR_H__ */ diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c index d3823383c..a7f72fb30 100644 --- a/xlators/cluster/afr/src/pump.c +++ b/xlators/cluster/afr/src/pump.c @@ -2443,6 +2443,7 @@ init (xlator_t *this) priv->metadata_change_log = 1; priv->entry_change_log = 1; priv->use_afr_in_pump = 1; + priv->sh_readdir_size = 65536; /* Locking options */ @@ -2500,6 +2501,12 @@ init (xlator_t *this) i++; } + ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name); + if (-1 == ret) { + op_errno = ENOMEM; + goto out; + } + priv->first_lookup = 1; priv->root_inode = NULL; @@ -2531,7 +2538,7 @@ init (xlator_t *this) goto out; } - pump_priv->env = syncenv_new (0); + pump_priv->env = this->ctx->env; if (!pump_priv->env) { gf_log (this->name, GF_LOG_ERROR, "Could not create new sync-environment"); @@ -2572,9 +2579,6 @@ fini (xlator_t *this) if (!pump_priv) goto afr_priv; - if (pump_priv->env) - syncenv_destroy (pump_priv->env); - GF_FREE (pump_priv->resume_path); LOCK_DESTROY (&pump_priv->resume_path_lock); LOCK_DESTROY (&pump_priv->pump_state_lock); diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am index b78af1ff3..174bea841 100644 --- a/xlators/cluster/dht/src/Makefile.am +++ b/xlators/cluster/dht/src/Makefile.am @@ -4,7 +4,7 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \ dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \ - dht-common.c dht-inode-write.c dht-inode-read.c \ + dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \ $(top_builddir)/xlators/lib/src/libxlator.c dht_la_SOURCES = $(dht_common_source) dht.c @@ -12,13 +12,13 @@ dht_la_SOURCES = $(dht_common_source) dht.c nufa_la_SOURCES = $(dht_common_source) nufa.c switch_la_SOURCES = $(dht_common_source) switch.c -dht_la_LDFLAGS = -module -avoidversion +dht_la_LDFLAGS = -module -avoid-version dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -nufa_la_LDFLAGS = -module -avoidversion +nufa_la_LDFLAGS = -module -avoid-version nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la -switch_la_LDFLAGS = -module -avoidversion +switch_la_LDFLAGS = -module -avoid-version switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la noinst_HEADERS = dht-common.h dht-mem-types.h \ diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 70e737c89..8f61339e6 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -22,6 +22,7 @@ #include "dht-common.h" #include "defaults.h" #include "byte-order.h" +#include "glusterfs-acl.h" #include <sys/time.h> #include <libgen.h> @@ -62,6 +63,11 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data) } *size = hton64 (ntoh64 (*size) + ntoh64 (*ptr)); + + } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) { + ret = gf_get_min_stime (THIS, dst, key, value); + if (ret < 0) + return ret; } else { /* compare user xattrs only */ if (!strncmp (key, "user.", strlen ("user."))) { @@ -148,9 +154,11 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) int op_errno = 0; int ret = -1; dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; local = discover_frame->local; layout = local->layout; + conf = this->private; LOCK(&discover_frame->lock); { @@ -193,11 +201,14 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame) "(overlaps/holes present: %s, " "ENOENT errors: %d)", local->loc.path, (ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0); - op_errno = EINVAL; - goto out; + if ((ret > 0) && (ret == conf->subvolume_cnt)) { + op_errno = ESTALE; + goto out; + } } - dht_layout_set (this, local->inode, layout); + if (local->inode) + dht_layout_set (this, local->inode, layout); } DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno, @@ -226,6 +237,7 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int is_dir = 0; int is_linkfile = 0; int attempt_unwind = 0; + dht_conf_t *conf = 0; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -235,6 +247,7 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; prev = cookie; + conf = this->private; layout = local->layout; @@ -269,7 +282,8 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto unlock; } - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (is_dir) { @@ -328,23 +342,20 @@ dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc) int i = 0; call_frame_t *discover_frame = NULL; - conf = this->private; local = frame->local; - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4); if (ret) gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set 'trusted.glusterfs.dht' key", - loc->path); + "%s: failed to set '%s' key", + loc->path, conf->xattr_name); - ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256); if (ret) gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set 'trusted.glusterfs.dht.linkto' key", - loc->path); + "%s: failed to set '%s' key", + loc->path, conf->link_xattr_name); call_cnt = conf->subvolume_cnt; local->call_cnt = call_cnt; @@ -430,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_ret, op_errno, xattr); if (op_ret == -1) { - local->op_errno = ENOENT; + local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "lookup of %s on %s returned error (%s)", local->loc.path, prev->this->name, @@ -585,7 +596,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, layout = local->layout; is_dir = check_is_dir (inode, stbuf, xattr); - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); if (is_linkfile) { gf_log (this->name, GF_LOG_INFO, @@ -597,7 +609,7 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, } if (is_dir) { - ret = dht_dir_has_layout (xattr); + ret = dht_dir_has_layout (xattr, conf->xattr_name); if (ret >= 0) { if (is_greater_time(local->stbuf.ia_ctime, local->stbuf.ia_ctime_nsec, @@ -701,7 +713,7 @@ cont: if (local->loc.parent) { dht_inode_ctx_time_update (local->loc.parent, this, - postparent, 1); + &local->postparent, 1); } DHT_STRIP_PHASE1_FLAGS (&local->stbuf); @@ -754,12 +766,15 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, local->stbuf.ia_prot.sticky = 1; } -unwind: if (local->loc.parent) { dht_inode_ctx_time_update (local->loc.parent, this, postparent, 1); } +unwind: + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, local->inode, &local->stbuf, local->xattr, @@ -883,7 +898,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this) hashed_subvol->name); ret = dht_linkfile_create (frame, - dht_lookup_linkfile_create_cbk, + dht_lookup_linkfile_create_cbk, this, cached_subvol, hashed_subvol, &local->loc); return ret; @@ -921,8 +936,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *subvol = NULL; loc_t *loc = NULL; xlator_t *link_subvol = NULL; - int ret = -1; - int32_t fd_count = 0; + int ret = -1; + int32_t fd_count = 0; + dht_conf_t *conf = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, out); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -932,6 +948,7 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local = frame->local; loc = &local->loc; + conf = this->private; prev = cookie; subvol = prev->this; @@ -953,7 +970,8 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, loc->path, prev->this->name); } - is_linkfile = check_is_linkfile (inode, buf, xattr); + is_linkfile = check_is_linkfile (inode, buf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, buf, xattr); if (is_linkfile) { @@ -1095,7 +1113,16 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) failed (%s)", local->loc.path, subvol->name, strerror (op_errno)); - goto err; + + /* If cached subvol returned ENOTCONN, do not do + lookup_everywhere. We need to make sure linkfile does not get + removed, which can take away the namespace, and subvol is + anyways down. */ + + if (op_errno != ENOTCONN) + goto err; + else + goto unwind; } if (check_is_dir (inode, stbuf, xattr)) { @@ -1105,7 +1132,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, goto err; } - if (check_is_linkfile (inode, stbuf, xattr)) { + if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { gf_log (this->name, GF_LOG_INFO, "lookup of %s on %s (following linkfile) reached link", local->loc.path, subvol->name); @@ -1133,12 +1160,12 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie, op_errno = EINVAL; } -unwind: if (local->loc.parent) { dht_inode_ctx_time_update (local->loc.parent, this, postparent, 1); } +unwind: DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr, postparent); @@ -1282,7 +1309,8 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; } - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); if (!is_linkfile) { /* non-directory and not a linkfile */ @@ -1322,7 +1350,7 @@ out: * from each of the subvolume. See dht_iatt_merge for reference. */ - if (local->loc.parent) { + if (!op_ret && local->loc.parent) { dht_inode_ctx_time_update (local->loc.parent, this, postparent, 1); } @@ -1387,7 +1415,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); conf = this->private; if (!conf) @@ -1462,7 +1489,7 @@ dht_lookup (call_frame_t *frame, xlator_t *this, * revalidates directly go to the cached-subvolume. */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (IA_ISDIR (local->inode->ia_type)) { local->call_cnt = call_cnt = conf->subvolume_cnt; @@ -1497,10 +1524,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this, do_fresh_lookup: /* TODO: remove the hard-coding */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); ret = dict_set_uint32 (local->xattr_req, - DHT_LINKFILE_KEY, 256); + conf->link_xattr_name, 256); /* need it for self-healing linkfiles which is 'in-migration' state */ @@ -1608,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, LOCK (&frame->lock); { - if (op_ret == -1) { + if ((op_ret == -1) && !((op_errno == ENOENT) || + (op_errno == ENOTCONN))) { local->op_errno = op_errno; gf_log (this->name, GF_LOG_DEBUG, "subvolume %s returned -1 (%s)", @@ -1621,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, unlock: UNLOCK (&frame->lock); - if (op_ret == -1) + if (local->op_ret == -1) goto err; cached_subvol = dht_subvol_get_cached (this, local->loc.inode); @@ -1645,41 +1673,6 @@ err: return 0; } -static int -dht_ufo_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) -{ - dht_local_t *local = NULL; - int this_call_cnt = 0; - call_frame_t *prev = NULL; - - local = frame->local; - prev = cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_ret = -1; - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_DEBUG, - "subvolume %s returned -1 (%s)", - prev->this->name, strerror (op_errno)); - goto unlock; - } - } -unlock: - UNLOCK (&frame->lock); - - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { - DHT_STACK_UNWIND (setxattr, frame, local->op_ret, - local->op_errno, NULL); - } - - return 0; -} - - int dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) @@ -1752,115 +1745,222 @@ dht_fill_pathinfo_xattr (xlator_t *this, dht_local_t *local, } int -dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this, + int op_errno) { - dht_local_t *local = NULL; - int ret = 0; - int flag = 0; - int this_call_cnt = 0; - char *value_got = NULL; - char layout_buf[8192] = {0,}; - char *xattr_buf = NULL; - dict_t *dict = NULL; - int32_t alloc_len = 0; - int32_t plen = 0; - call_frame_t *prev = NULL; + int ret = -1; + char *value = NULL; + int32_t plen = 0; - local = frame->local; - prev = cookie; + ret = dict_get_str (xattr, local->xsel, &value); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Subvolume %s returned -1 (%s)", this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + goto out; + } - if (op_ret >= 0) { - ret = dict_get_str (xattr, local->xsel, &value_got); - if (!ret) { - alloc_len = strlen (value_got); + local->alloc_len += strlen(value); - /** - * allocate the buffer:- we allocate 10 bytes extra in - * case we need to append ' Link: ' in the buffer for - * another STACK_WIND - */ + if (!local->xattr_val) { + local->alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10); + local->xattr_val = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!local->xattr_val) { + ret = -1; + goto out; + } + } + + if (local->xattr_val) { + plen = strlen (local->xattr_val); + if (plen) { + /* extra byte(s) for \0 to be safe */ + local->alloc_len += (plen + 2); + local->xattr_val = GF_REALLOC (local->xattr_val, + local->alloc_len); if (!local->xattr_val) { - alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10); - local->xattr_val = - GF_CALLOC (alloc_len, - sizeof (char), - gf_common_mt_char); + ret = -1; + goto out; } + } - if (local->xattr_val) { - plen = strlen (local->xattr_val); - if (plen) { - void *p; - /* extra byte(s) for \0 to be safe */ - alloc_len += (plen + 2); - p = GF_REALLOC (local->xattr_val, - alloc_len); - if (!p) - goto out; - local->xattr_val = p; - } + (void) strcat (local->xattr_val, value); + (void) strcat (local->xattr_val, " "); + local->op_ret = 0; + } - strcat (local->xattr_val, value_got); - } - local->op_ret = 0; - } + ret = 0; + + out: + return ret; +} + +int +dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this, + gf_boolean_t flag) +{ + int ret = -1; + char *xattr_buf = NULL; + char layout_buf[8192] = {0,}; + + if (flag) + fill_layout_info (local->layout, layout_buf); + + *dict = dict_new (); + if (!*dict) + goto out; + + local->xattr_val[strlen (local->xattr_val) - 1] = '\0'; + + /* we would need max this many bytes to create xattr string + * extra 40 bytes is just an estimated amount of additional + * space required as we include translator name and some + * spaces, brackets etc. when forming the pathinfo string. + * + * For node-uuid we just don't have all the pretty formatting, + * but since this is a generic routine for pathinfo & node-uuid + * we dont have conditional space allocation and try to be + * generic + */ + local->alloc_len += (2 * strlen (this->name)) + + strlen (layout_buf) + + 40; + xattr_buf = GF_CALLOC (local->alloc_len, sizeof (char), + gf_common_mt_char); + if (!xattr_buf) + goto out; + + if (XATTR_IS_PATHINFO (local->xsel)) { + (void) dht_fill_pathinfo_xattr (this, local, xattr_buf, + local->alloc_len, flag, + layout_buf); + } else if (XATTR_IS_NODE_UUID (local->xsel)) { + (void) snprintf (xattr_buf, local->alloc_len, "%s", + local->xattr_val); } else { - local->op_ret = -1; - local->op_errno = op_errno; - gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 " - "(%s)", prev->this->name, strerror (op_errno)); + gf_log (this->name, GF_LOG_WARNING, + "Unknown local->xsel (%s)", local->xsel); + goto out; } + ret = dict_set_dynstr (*dict, local->xsel, xattr_buf); + GF_FREE (local->xattr_val); + out: - this_call_cnt = dht_frame_return (frame); - if (is_last_call (this_call_cnt)) { + return ret; +} - if (local->op_ret == -1) { - goto unwind; - } +int +dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + int ret = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; + dict_t *dict = NULL; - if (local->layout->cnt > 1) { - /* Set it for directory */ - fill_layout_info (local->layout, layout_buf); - flag = 1; - } + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (frame->local, out); - dict = dict_new (); - - /* we would need max this many bytes to create xattr string */ - alloc_len += (2 * strlen (this->name)) - + strlen (layout_buf) - + 40; - xattr_buf = GF_CALLOC (alloc_len, - sizeof (char), gf_common_mt_char); - - if (XATTR_IS_PATHINFO (local->xsel)) { - (void) dht_fill_pathinfo_xattr (this, local, xattr_buf, - alloc_len, flag, - layout_buf); - } else if (XATTR_IS_NODE_UUID (local->xsel)) { - (void) snprintf (xattr_buf, alloc_len, "%s", - local->xattr_val); - } else - gf_log (this->name, GF_LOG_WARNING, - "Unknown local->xsel (%s)", local->xsel); + local = frame->local; - ret = dict_set_dynstr (dict, local->xsel, xattr_buf); + LOCK (&frame->lock); + { + this_call_cnt = --local->call_cnt; + if (op_ret < 0) { + if (op_errno != ENOTCONN) { + gf_log (this->name, GF_LOG_ERROR, + "getxattr err (%s) for dir", + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } - GF_FREE (local->xattr_val); + goto unlock; + } - DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, - xdata); + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); + } + unlock: + UNLOCK (&frame->lock); - if (dict) - dict_unref (dict); + if (!is_last_call (this_call_cnt)) + goto out; - return 0; + /* -- last call: do patch ups -- */ + + if (local->op_ret == -1) { + goto unwind; } -unwind: + ret = dht_vgetxattr_fill_and_set (local, &dict, this, _gf_true); + if (ret) + goto unwind; + + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; + + unwind: DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); + out: + return 0; +} + +int +dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +{ + dht_local_t *local = NULL; + int ret = 0; + dict_t *dict = NULL; + call_frame_t *prev = NULL; + gf_boolean_t flag = _gf_true; + + local = frame->local; + prev = cookie; + + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 " + "(%s)", prev->this->name, strerror (op_errno)); + goto unwind; + } + + ret = dht_vgetxattr_alloc_and_fill (local, xattr, this, + op_errno); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "alloc or fill failure"); + goto unwind; + } + + flag = (local->layout->cnt > 1) ? _gf_true : _gf_false; + + ret = dht_vgetxattr_fill_and_set (local, &dict, this, flag); + if (ret) + goto unwind; + + DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata); + goto cleanup; + + unwind: + DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, + NULL, NULL); + cleanup: + if (dict) + dict_unref (dict); + return 0; } @@ -1893,10 +1993,13 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { int this_call_cnt = 0; dht_local_t *local = NULL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, out); VALIDATE_OR_GOTO (frame->local, out); + VALIDATE_OR_GOTO (this->private, out); + conf = this->private; local = frame->local; this_call_cnt = dht_frame_return (frame); @@ -1904,8 +2007,8 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (!xattr || (op_ret == -1)) goto out; - if (dict_get (xattr, "trusted.glusterfs.dht")) { - dict_del (xattr, "trusted.glusterfs.dht"); + if (dict_get (xattr, conf->xattr_name)) { + dict_del (xattr, conf->xattr_name); } local->op_ret = 0; @@ -1938,9 +2041,72 @@ dht_getxattr_unwind (call_frame_t *frame, int +dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) +{ + int this_call_cnt = 0; + dht_local_t *local = NULL; + + + local = frame->local; + + if (op_ret != -1) { + if (local->xattr) + dict_unref (local->xattr); + local->xattr = dict_ref (xattr); + + if (local->xattr_req) + dict_unref (local->xattr_req); + local->xattr_req = dict_ref (xdata); + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno, + local->xattr, local->xattr_req); + } + + return 0; +} + + +int +dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key, dict_t *xdata) +{ + dht_local_t *local = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int cnt = 0; + xlator_t *subvol = NULL; + + + local = frame->local; + layout = local->layout; + + cnt = local->call_cnt = layout->cnt; + + local->op_ret = -1; + local->op_errno = ENODATA; + + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_getxattr_get_real_filename_cbk, + subvol, subvol->fops->getxattr, + loc, key, xdata); + } + + return 0; +} + + +int dht_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key, dict_t *xdata) +#define DHT_IS_DIR(layout) (layout->cnt > 1) { + xlator_t *subvol = NULL; xlator_t *hashed_subvol = NULL; xlator_t *cached_subvol = NULL; @@ -1956,7 +2122,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; @@ -1984,8 +2149,40 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } } - if (key && ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0) - || strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0)) { + if (key && + (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY, + strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) + && DHT_IS_DIR(layout)) { + dht_getxattr_get_real_filename (frame, this, loc, key, xdata); + return 0; + } + + /* for file use cached subvolume (obviously!): see if {} + * below + * for directory: + * wind to all subvolumes and exclude subvolumes which + * return ENOTCONN (in callback) + * + * NOTE: Don't trust inode here, as that may not be valid + * (until inode_link() happens) + */ + if (key && DHT_IS_DIR(layout) && + ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0) + || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) { + (void) strncpy (local->xsel, key, 256); + cnt = local->call_cnt = layout->cnt; + for (i = 0; i < cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_vgetxattr_dir_cbk, + subvol, subvol->fops->getxattr, + loc, key, NULL); + } + return 0; + } + + /* node-uuid or pathinfo for files */ + if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0) + || (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0))) { cached_subvol = local->cached_subvol; (void) strncpy (local->xsel, key, 256); @@ -1995,6 +2192,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, return 0; } + if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) { hashed_subvol = dht_subvol_get_hashed (this, loc); if (!hashed_subvol) { @@ -2027,13 +2225,13 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } if (key && (!strcmp (GF_XATTR_MARKER_KEY, key)) - && (-1 == frame->root->pid)) { - - if (loc->inode-> ia_type == IA_IFDIR) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { cnt = layout->cnt; } else { cnt = 1; } + sub_volumes = alloca ( cnt * sizeof (xlator_t *)); for (i = 0; i < cnt; i++) *(sub_volumes + i) = layout->list[i].xlator; @@ -2041,7 +2239,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, key, local, dht_getxattr_unwind, sub_volumes, cnt, - MARKER_UUID_TYPE, conf->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + conf->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -2051,8 +2250,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, if (key && *conf->vol_uuid) { if ((match_uuid_local (key, conf->vol_uuid) == 0) && - (-1 == frame->root->pid)) { - if (loc->inode-> ia_type == IA_IFDIR) { + (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { + if (DHT_IS_DIR(layout)) { cnt = layout->cnt; } else { cnt = 1; @@ -2065,6 +2264,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, local, dht_getxattr_unwind, sub_volumes, cnt, MARKER_XTIME_TYPE, + marker_xtime_default_gauge, conf->vol_uuid)) { op_errno = EINVAL; goto err; @@ -2074,7 +2274,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this, } } - if (loc->inode-> ia_type == IA_IFDIR) { + if (DHT_IS_DIR(layout)) { cnt = local->call_cnt = layout->cnt; } else { cnt = local->call_cnt = 1; @@ -2094,6 +2294,7 @@ err: return 0; } +#undef DHT_IS_DIR int dht_fgetxattr (call_frame_t *frame, xlator_t *this, @@ -2165,13 +2366,17 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this, xlator_t *subvol = NULL; dht_local_t *local = NULL; int op_errno = EINVAL; + dht_conf_t *conf = NULL; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); VALIDATE_OR_GOTO (fd->inode, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.dht*", xattr, + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, op_errno, err); local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR); @@ -2275,12 +2480,12 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.dht*", xattr, + conf = this->private; + + GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr, op_errno, err); - conf = this->private; local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR); if (!local) { op_errno = ENOMEM; @@ -2305,25 +2510,6 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, local->call_cnt = call_cnt = layout->cnt; - /* This key is sent by Unified File and Object storage - * to test xattr support in backend. - */ - tmp = dict_get (xattr, "user.ufo-test"); - if (tmp) { - if (IA_ISREG (loc->inode->ia_type)) { - op_errno = ENOTSUP; - goto err; - } - local->op_ret = 0; - for (i = 0; i < call_cnt; i++) { - STACK_WIND (frame, dht_ufo_xattr_cbk, - layout->list[i].xlator, - layout->list[i].xlator->fops->setxattr, - loc, xattr, flags, NULL); - } - return 0; - } - tmp = dict_get (xattr, "distribute.migrate-data"); if (tmp) { if (IA_ISDIR (loc->inode->ia_type)) { @@ -2493,18 +2679,20 @@ dht_removexattr (call_frame_t *frame, xlator_t *this, dht_local_t *local = NULL; dht_layout_t *layout = NULL; int call_cnt = 0; + dht_conf_t *conf = NULL; int i; VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; - GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.dht*", - key, op_errno, err); + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR); if (!local) { @@ -2556,13 +2744,16 @@ dht_fremovexattr (call_frame_t *frame, xlator_t *this, dht_local_t *local = NULL; dht_layout_t *layout = NULL; int call_cnt = 0; + dht_conf_t *conf = 0; int i; VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; - GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.dht*", - key, op_errno, err); + GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err); VALIDATE_OR_GOTO (frame, err); @@ -2733,7 +2924,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); VALIDATE_OR_GOTO (this->private, err); conf = this->private; @@ -2853,10 +3043,13 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, list_for_each_entry (orig_entry, (&orig_entries->list), list) { next_offset = orig_entry->d_off; - if ((check_is_dir (NULL, (&orig_entry->d_stat), NULL) && - (prev->this != dht_first_up_subvol (this))) || - check_is_linkfile (NULL, (&orig_entry->d_stat), - orig_entry->dict)) { + if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) && + (prev->this != local->first_up_subvol)) { + continue; + } + if (check_is_linkfile (NULL, (&orig_entry->d_stat), + orig_entry->dict, + conf->link_xattr_name)) { continue; } @@ -2933,13 +3126,16 @@ done: } if (conf->readdir_optimize == _gf_true) { - if (next_subvol != dht_first_up_subvol (this)) { + if (next_subvol != local->first_up_subvol) { ret = dict_set_int32 (local->xattr, GF_READDIR_SKIP_DIRS, 1); if (ret) gf_log (this->name, GF_LOG_ERROR, "dict set failed"); - } + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); + } } STACK_WIND (frame, dht_readdirp_cbk, @@ -3072,6 +3268,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (this->private, err); conf = this->private; @@ -3084,6 +3281,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, local->fd = fd_ref (fd); local->size = size; local->xattr_req = (dict)? dict_ref (dict) : NULL; + local->first_up_subvol = dht_first_up_subvol (this); dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); @@ -3096,20 +3294,22 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, if (local->xattr) { ret = dict_set_uint32 (local->xattr, - "trusted.glusterfs.dht.linkto", - 256); + conf->link_xattr_name, 256); if (ret) gf_log (this->name, GF_LOG_WARNING, - "failed to set 'glusterfs.dht.linkto'" - " key"); + "failed to set '%s' key", + conf->link_xattr_name); if (conf->readdir_optimize == _gf_true) { - if (xvol != dht_first_up_subvol (this)) { + if (xvol != local->first_up_subvol) { ret = dict_set_int32 (local->xattr, GF_READDIR_SKIP_DIRS, 1); if (ret) gf_log (this->name, GF_LOG_ERROR, "Dict set failed"); + } else { + dict_del (local->xattr, + GF_READDIR_SKIP_DIRS); } } } @@ -3245,7 +3445,7 @@ dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, inode_t *inode, struct iatt *stbuf, struct iatt *preparent, struct iatt *postparent, dict_t *xdata) { - call_frame_t *prev = NULL; + xlator_t *prev = NULL; int ret = -1; dht_local_t *local = NULL; @@ -3270,15 +3470,17 @@ dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, postparent, 1); } - ret = dht_layout_preset (this, prev->this, inode); + ret = dht_layout_preset (this, prev, inode); if (ret < 0) { gf_log (this->name, GF_LOG_DEBUG, "could not set pre-set layout for subvolume %s", - prev->this->name); + prev? prev->name: NULL); op_ret = -1; op_errno = EINVAL; goto out; } + if (local->linked == _gf_true) + dht_linkfile_attr_heal (frame, this); out: /* * FIXME: ia_size and st_blocks of preparent and postparent do not have @@ -3287,7 +3489,6 @@ out: * corresponding values from each of the subvolume. * See dht_iatt_merge for reference. */ - DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf, preparent, postparent, xdata); @@ -3309,12 +3510,17 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie, goto err; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + goto err; + } + cached_subvol = local->cached_subvol; - STACK_WIND (frame, dht_newfile_cbk, - cached_subvol, cached_subvol->fops->mknod, - &local->loc, local->mode, local->rdev, local->umask, - local->params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)cached_subvol, + cached_subvol, cached_subvol->fops->mknod, + &local->loc, local->mode, local->rdev, local->umask, + local->params); return 0; err: @@ -3357,11 +3563,13 @@ dht_mknod (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, umask, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, + subvol, subvol->fops->mknod, loc, mode, + rdev, umask, params); } else { - avail_subvol = dht_free_disk_available_subvol (this, subvol); + + avail_subvol = dht_free_disk_available_subvol (this, subvol, + local); if (avail_subvol != subvol) { /* Choose the minimum filled volume, and create the files there */ @@ -3373,14 +3581,15 @@ dht_mknod (call_frame_t *frame, xlator_t *this, local->umask = umask; dht_linkfile_create (frame, dht_mknod_linkfile_create_cbk, - avail_subvol, subvol, loc); + this, avail_subvol, subvol, loc); } else { gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, umask, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, + rdev, umask, params); } } @@ -3425,9 +3634,9 @@ dht_symlink (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->symlink, - linkname, loc, umask, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->symlink, linkname, loc, umask, + params); return 0; @@ -3541,7 +3750,10 @@ dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_inode_ctx_time_update (local->loc.parent, this, postparent, 1); } - + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, @@ -3628,7 +3840,7 @@ dht_link (call_frame_t *frame, xlator_t *this, if (hashed_subvol != cached_subvol) { uuid_copy (local->gfid, oldloc->inode->gfid); - dht_linkfile_create (frame, dht_link_linkfile_cbk, + dht_linkfile_create (frame, dht_link_linkfile_cbk, this, cached_subvol, hashed_subvol, newloc); } else { STACK_WIND (frame, dht_link_cbk, @@ -3685,7 +3897,10 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, op_errno = EINVAL; goto out; } - + if (local->linked == _gf_true) { + local->stbuf = *stbuf; + dht_linkfile_attr_heal (frame, this); + } out: DHT_STRIP_PHASE1_FLAGS (stbuf); DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent, @@ -3775,7 +3990,7 @@ dht_create (call_frame_t *frame, xlator_t *this, } /* Choose the minimum filled volume, and create the files there */ - avail_subvol = dht_free_disk_available_subvol (this, subvol); + avail_subvol = dht_free_disk_available_subvol (this, subvol, local); if (avail_subvol != subvol) { local->params = dict_ref (params); local->flags = flags; @@ -3786,9 +4001,8 @@ dht_create (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s (link at %s)", loc->path, avail_subvol->name, subvol->name); - dht_linkfile_create (frame, - dht_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, dht_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); goto done; } gf_log (this->name, GF_LOG_TRACE, @@ -3861,6 +4075,15 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ret = dht_layout_merge (this, layout, prev->this, -1, ENOSPC, NULL); } else { + if (op_ret == -1 && op_errno == EEXIST) + /* Very likely just a race between mkdir and + self-heal (from lookup of a concurrent mkdir + attempt). + Ignore error for now. layout setting will + anyways fail if this was a different (old) + pre-existing different directory. + */ + op_ret = 0; ret = dht_layout_merge (this, layout, prev->this, op_ret, op_errno, NULL); } @@ -4318,6 +4541,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_frame_t *main_frame = NULL; dht_local_t *main_local = NULL; int this_call_cnt = 0; + dht_conf_t *conf = this->private; local = frame->local; prev = cookie; @@ -4329,7 +4553,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret != 0) goto err; - if (check_is_linkfile (inode, stbuf, xattr) == 0) { + if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) { main_local->op_ret = -1; main_local->op_errno = ENOTEMPTY; @@ -4364,6 +4588,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, dht_local_t *lookup_local = NULL; dht_local_t *local = NULL; dict_t *xattrs = NULL; + dht_conf_t *conf = this->private; local = frame->local; @@ -4372,7 +4597,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, continue; if (strcmp (trav->d_name, "..") == 0) continue; - if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict)) { + if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict, + conf->link_xattr_name)) { ret++; continue; } @@ -4390,7 +4616,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this, return -1; } - ret = dict_set_uint32 (xattrs, DHT_LINKFILE_KEY, 256); + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key" " in dict"); @@ -4513,6 +4739,7 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, call_frame_t *prev = NULL; dict_t *dict = NULL; int ret = 0; + dht_conf_t *conf = this->private; local = frame->local; prev = cookie; @@ -4536,12 +4763,11 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto err; } - ret = dict_set_uint32 (dict, - "trusted.glusterfs.dht.linkto", 256); + ret = dict_set_uint32 (dict, conf->link_xattr_name, 256); if (ret) gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set 'trusted.glusterfs.dht.linkto' key", - local->loc.path); + "%s: failed to set '%s' key", + local->loc.path, conf->link_xattr_name); STACK_WIND (frame, dht_rmdir_readdirp_cbk, prev->this, prev->this->fops->readdirp, @@ -4640,7 +4866,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this, VALIDATE_OR_GOTO (this, err); VALIDATE_OR_GOTO (loc, err); VALIDATE_OR_GOTO (loc->inode, err); - VALIDATE_OR_GOTO (loc->path, err); local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK); if (!local) { @@ -4921,15 +5146,7 @@ unlock: or wait for anything else. Just propagate blindly */ if (have_heard_from_all) { propagate = 1; - if (conf->defrag) { - ret = pthread_create (&conf->defrag->th, NULL, - gf_defrag_start, this); - if (ret) { - conf->defrag = NULL; - GF_FREE (conf->defrag); - kill (getpid(), SIGTERM); - } - } + } @@ -4951,6 +5168,19 @@ unlock: /* continue to check other events for CHILD_UP */ } } + + /* rebalance is started with assert_no_child_down. So we do + * not need to handle CHILD_DOWN event here. + */ + if (conf->defrag) { + ret = gf_thread_create (&conf->defrag->th, NULL, + gf_defrag_start, this); + if (ret) { + conf->defrag = NULL; + GF_FREE (conf->defrag); + kill (getpid(), SIGTERM); + } + } } ret = 0; diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 83ec345d6..5ccd66799 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -13,6 +13,8 @@ #include "config.h" #endif +#include <regex.h> + #include "dht-mem-types.h" #include "libxlator.h" #include "syncop.h" @@ -53,7 +55,7 @@ struct dht_layout { uint32_t start; uint32_t stop; xlator_t *xlator; - } list[0]; + } list[]; }; typedef struct dht_layout dht_layout_t; @@ -160,6 +162,7 @@ struct dht_local { /* which xattr request? */ char xsel[256]; + int32_t alloc_len; char *newpath; @@ -176,7 +179,11 @@ struct dht_local { glusterfs_fop_t fop; + gf_boolean_t linked; + xlator_t *link_subvol; + struct dht_rebalance_ rebalance; + xlator_t *first_up_subvol; }; typedef struct dht_local dht_local_t; @@ -205,15 +212,27 @@ enum gf_defrag_status_t { GF_DEFRAG_STATUS_STOPPED, GF_DEFRAG_STATUS_COMPLETE, GF_DEFRAG_STATUS_FAILED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED, + GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED, + GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE, + GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED, }; typedef enum gf_defrag_status_t gf_defrag_status_t; +typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t; + +struct gf_defrag_pattern_list { + char path_pattern[256]; + uint64_t size; + gf_defrag_pattern_list_t *next; +}; struct gf_defrag_info_ { uint64_t total_files; uint64_t total_data; uint64_t num_files_lookedup; uint64_t total_failures; + uint64_t skipped; gf_lock_t lock; int cmd; pthread_t th; @@ -226,7 +245,7 @@ struct gf_defrag_info_ { uuid_t node_uuid; struct timeval start_time; gf_boolean_t stats; - + gf_defrag_pattern_list_t *defrag_pattern; }; typedef struct gf_defrag_info_ gf_defrag_info_t; @@ -271,6 +290,17 @@ struct dht_conf { /* Request to filter directory entries in readdir request */ gf_boolean_t readdir_optimize; + + /* Support regex-based name reinterpretation. */ + regex_t rsync_regex; + gf_boolean_t rsync_regex_valid; + regex_t extra_regex; + gf_boolean_t extra_regex_valid; + + /* Support variable xattr names. */ + char *xattr_name; + char *link_xattr_name; + char *wild_xattr_name; }; typedef struct dht_conf dht_conf_t; @@ -301,13 +331,12 @@ typedef enum { #define DHT_MIGRATION_IN_PROGRESS 1 #define DHT_MIGRATION_COMPLETED 2 -#define DHT_LINKFILE_KEY "trusted.glusterfs.dht.linkto" #define DHT_LINKFILE_MODE (S_ISVTX) -#define check_is_linkfile(i,s,x) ( \ +#define check_is_linkfile(i,s,x,n) ( \ ((st_mode_from_ia ((s)->ia_prot, (s)->ia_type) & ~S_IFMT) \ - == DHT_LINKFILE_MODE) && \ - dict_get (x, DHT_LINKFILE_KEY)) + == DHT_LINKFILE_MODE) && \ + dict_get (x, n)) #define IS_DHT_MIGRATION_PHASE2(buf) ( \ IA_ISREG ((buf)->ia_type) && \ @@ -414,12 +443,14 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc); xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode); xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev); +xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev); int dht_subvol_cnt (xlator_t *this, xlator_t *subvol); -int dht_hash_compute (int type, const char *name, uint32_t *hash_p); +int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p); int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, - xlator_t *tovol, xlator_t *fromvol, loc_t *loc); + xlator_t *this, xlator_t *tovol, + xlator_t *fromvol, loc_t *loc); int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc); int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc); int @@ -437,7 +468,8 @@ dht_layout_sort_volname (dht_layout_t *layout); int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc); gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol); -xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol); +xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *layout); int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx); int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode); @@ -659,7 +691,16 @@ int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, int32_t valid, dict_t *xdata); int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata); - +int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata); +int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); +int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, + off_t offset, size_t len, dict_t *xdata); + +int32_t dht_init (xlator_t *this); +void dht_fini (xlator_t *this); +int dht_reconfigure (xlator_t *this, dict_t *options); int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...); /* definitions for nufa/switch */ @@ -721,7 +762,26 @@ dht_dir_attr_heal (void *data); int dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data); int -dht_dir_has_layout (dict_t *xattr); +dht_dir_has_layout (dict_t *xattr, char *name); gf_boolean_t dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator); +xlator_t * +dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +xlator_t * +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout); +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this); + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix); +int32_t +dht_priv_dump (xlator_t *this); +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode); + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol); + #endif/* _DHT_H */ diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c index 52ea3a32a..fe3955ecb 100644 --- a/xlators/cluster/dht/src/dht-diskusage.c +++ b/xlators/cluster/dht/src/dht-diskusage.c @@ -248,50 +248,167 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol) return is_subvol_filled; } + +/*Get the best subvolume to create the file in*/ xlator_t * -dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol) +dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol, + dht_local_t *local) { - int i = 0; - double max = 0; - double max_inodes = 0; xlator_t *avail_subvol = NULL; dht_conf_t *conf = NULL; + dht_layout_t *layout = NULL; + loc_t *loc = NULL; conf = this->private; - - LOCK (&conf->subvolume_lock); + if (!local) + goto out; + loc = &local->loc; + if (!local->layout) { + layout = dht_layout_get (this, loc->parent); + + if (!layout) { + gf_log (this->name, GF_LOG_DEBUG, + "layout missing path=%s parent=%s", + loc->path, uuid_utoa (loc->parent->gfid)); + goto out; + } + } else { + layout = dht_layout_ref (this, local->layout); + } + + LOCK (&conf->subvolume_lock); { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (conf->disk_unit == 'p') { - if ((conf->du_stats[i].avail_percent > max) - && (conf->du_stats[i].avail_inodes > max_inodes)) { - max = conf->du_stats[i].avail_percent; - max_inodes = conf->du_stats[i].avail_inodes; - avail_subvol = conf->subvolumes[i]; - } - } else { - if ((conf->du_stats[i].avail_space > max) - && (conf->du_stats[i].avail_inodes > max_inodes)) { - max = conf->du_stats[i].avail_space; - max_inodes = conf->du_stats[i].avail_inodes; - avail_subvol = conf->subvolumes[i]; - } + avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, + layout); + if(!avail_subvol) + { + avail_subvol = dht_subvol_maxspace_nonzeroinode(this, + subvol, + layout); + } - } - } } UNLOCK (&conf->subvolume_lock); - +out: if (!avail_subvol) { - gf_log (this->name, GF_LOG_DEBUG, - "no subvolume has enough free space and inodes to create"); + gf_log (this->name, + GF_LOG_DEBUG, + "no subvolume has enough free space and/or inodes\ + to create"); + avail_subvol = subvol; } - if ((max < conf->min_free_disk) && (max_inodes < conf->min_free_inodes)) - avail_subvol = subvol; + if (layout) + dht_layout_unref (this, layout); + return avail_subvol; +} + +static inline +int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout) +{ + int ret = -1; + int i = 0; + + if (!this || !layout) + goto out; + + /* check if subvol has layout errors, before selecting it */ + for (i = 0; i < layout->cnt; i++) { + if (!strcmp (layout->list[i].xlator->name, this->name) && + (layout->list[i].err != 0)) { + ret = -1; + goto out; + } + } + ret = 0; +out: + return ret; +} + +/*Get subvolume which has both space and inodes more than the min criteria*/ +xlator_t * +dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) +{ + int i = 0; + double max = 0; + double max_inodes = 0; + int ignore_subvol = 0; + + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + for(i=0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if ((conf->disk_unit == 'p') && + (conf->du_stats[i].avail_percent > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_percent > max)) { + max = conf->du_stats[i].avail_percent; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } + } + + if ((conf->disk_unit != 'p') && + (conf->du_stats[i].avail_space > conf->min_free_disk) && + (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) { + if ((conf->du_stats[i].avail_inodes > max_inodes) || + (conf->du_stats[i].avail_space > max)) { + max = conf->du_stats[i].avail_space; + max_inodes = conf->du_stats[i].avail_inodes; + avail_subvol = conf->subvolumes[i]; + } + } + } + + return avail_subvol; +} - if (!avail_subvol) - avail_subvol = subvol; - return avail_subvol; +/* Get subvol which has atleast one inode and maximum space */ +xlator_t * +dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol, + dht_layout_t *layout) +{ + int i = 0; + double max = 0; + int ignore_subvol = 0; + + xlator_t *avail_subvol = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; + + for (i = 0; i < conf->subvolume_cnt; i++) { + /* check if subvol has layout errors, before selecting it */ + ignore_subvol = dht_subvol_has_err (conf->subvolumes[i], + layout); + if (ignore_subvol) + continue; + + if (conf->disk_unit == 'p') { + if ((conf->du_stats[i].avail_percent > max) + && (conf->du_stats[i].avail_inodes > 0 )) { + max = conf->du_stats[i].avail_percent; + avail_subvol = conf->subvolumes[i]; + } + } else { + if ((conf->du_stats[i].avail_space > max) + && (conf->du_stats[i].avail_inodes > 0)) { + max = conf->du_stats[i].avail_space; + avail_subvol = conf->subvolumes[i]; + } + } + } + + return avail_subvol; } diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c index 97eb1f05f..656cf23a0 100644 --- a/xlators/cluster/dht/src/dht-hashfn.c +++ b/xlators/cluster/dht/src/dht-hashfn.c @@ -44,30 +44,68 @@ dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) } -#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \ - rsync_frndly_name = (char *) name; \ - if (name[0] == '.') { \ - char *dot = 0; \ - int namelen = 0; \ - \ - dot = strrchr (name, '.'); \ - if (dot && dot > (name + 1) && *(dot + 1)) { \ - namelen = (dot - name); \ - rsync_frndly_name = alloca (namelen); \ - strncpy (rsync_frndly_name, name + 1, \ - namelen); \ - rsync_frndly_name[namelen - 1] = 0; \ - } \ - } \ - } while (0); +static inline +gf_boolean_t +dht_munge_name (const char *original, char *modified, size_t len, regex_t *re) +{ + regmatch_t matches[2]; + size_t new_len; + if (regexec(re,original,2,matches,0) != REG_NOMATCH) { + if (matches[1].rm_so != -1) { + new_len = matches[1].rm_eo - matches[1].rm_so; + /* Equal would fail due to the NUL at the end. */ + if (new_len < len) { + memcpy (modified,original+matches[1].rm_so, + new_len); + modified[new_len] = '\0'; + return _gf_true; + } + } + } + + /* This is guaranteed safe because of how the dest was allocated. */ + strcpy(modified,original); + return _gf_false; +} int -dht_hash_compute (int type, const char *name, uint32_t *hash_p) +dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p) { - char *rsync_friendly_name = NULL; + char *rsync_friendly_name = NULL; + dht_conf_t *priv = this->private; + size_t len = 0; + gf_boolean_t munged = _gf_false; + + /* + * It wouldn't be safe to use alloca in an inline function that doesn't + * actually get inlined, and it wouldn't be efficient to do a real + * allocation, so we use alloca here (if needed) and pass that to the + * inline. + */ - MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name); + if (priv->extra_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->extra_regex); + } + + if (!munged && priv->rsync_regex_valid) { + len = strlen(name) + 1; + rsync_friendly_name = alloca(len); + gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name); + munged = dht_munge_name (name, rsync_friendly_name, len, + &priv->rsync_regex); + if (munged) { + gf_log (this->name, GF_LOG_DEBUG, + "munged down to %s", rsync_friendly_name); + } + } + + if (!munged) { + rsync_friendly_name = (char *)name; + } return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); } diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c index 264172a90..311a48112 100644 --- a/xlators/cluster/dht/src/dht-helper.c +++ b/xlators/cluster/dht/src/dht-helper.c @@ -18,6 +18,28 @@ #include "xlator.h" #include "dht-common.h" +static inline int +dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol) +{ + uint64_t tmp_subvol = 0; + + tmp_subvol = (long)subvol; + return inode_ctx_set1 (inode, this, &tmp_subvol); +} + +int +dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol) +{ + int ret = -1; + uint64_t tmp_subvol = 0; + + ret = inode_ctx_get1 (inode, this, &tmp_subvol); + if (tmp_subvol && subvol) + *subvol = (xlator_t *)tmp_subvol; + + return ret; +} + int dht_frame_return (call_frame_t *frame) @@ -40,6 +62,43 @@ dht_frame_return (call_frame_t *frame) } +static uint64_t +dht_bits_for (uint64_t num) +{ + uint64_t bits = 0, ctrl = 1; + + while (ctrl < num) { + ctrl *= 2; + bits ++; + } + + return bits; +} + +/* + * A slightly "updated" version of the algorithm described in the commit log + * is used here. + * + * The only enhancement is that: + * + * - The number of bits used by the backend filesystem for HUGE d_off which + * is described as 63, and + * - The number of bits used by the d_off presented by the transformation + * upwards which is described as 64, are both made "configurable." + */ + + +#define BACKEND_D_OFF_BITS 63 +#define PRESENT_D_OFF_BITS 63 + +#define ONE 1ULL +#define MASK (~0ULL) +#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS)) +#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS)) + +#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1)) +#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1))) + int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) { @@ -47,6 +106,9 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) int cnt = 0; int max = 0; uint64_t y = 0; + uint64_t hi_mask = 0; + uint64_t off_mask = 0; + int max_bits = 0; if (x == ((uint64_t) -1)) { y = (uint64_t) -1; @@ -60,7 +122,23 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) max = conf->subvolume_cnt; cnt = dht_subvol_cnt (this, subvol); - y = ((x * max) + cnt); + if (max == 1) { + y = x; + goto out; + } + + max_bits = dht_bits_for (max); + + hi_mask = ~(PRESENT_MASK >> (max_bits + 1)); + + if (x & hi_mask) { + /* HUGE d_off */ + off_mask = MASK << max_bits; + y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt; + } else { + /* small d_off */ + y = ((x * max) + cnt); + } out: if (y_p) @@ -135,16 +213,38 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, int max = 0; uint64_t x = 0; xlator_t *subvol = 0; + int max_bits = 0; + uint64_t off_mask = 0; + uint64_t host_mask = 0; if (!this->private) - goto out; + return -1; conf = this->private; max = conf->subvolume_cnt; - cnt = y % max; - x = y / max; + if (max == 1) { + x = y; + cnt = 0; + goto out; + } + + if (y & TOP_BIT) { + /* HUGE d_off */ + max_bits = dht_bits_for (max); + off_mask = (MASK << max_bits); + host_mask = ~(off_mask); + + x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS; + + cnt = y & host_mask; + } else { + /* small d_off */ + cnt = y % max; + x = y / max; + } +out: subvol = conf->subvolumes[cnt]; if (subvol_p) @@ -153,7 +253,6 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, if (x_p) *x_p = x; -out: return 0; } @@ -263,20 +362,6 @@ out: return local; } - -char * -basestr (const char *str) -{ - char *basestr = NULL; - - basestr = strrchr (str, '/'); - if (basestr) - basestr ++; - - return basestr; -} - - xlator_t * dht_first_up_subvol (xlator_t *this) { @@ -428,7 +513,36 @@ out: return next; } +/* This func wraps around, if prev is actually the last subvol. + */ +xlator_t * +dht_subvol_next_available (xlator_t *this, xlator_t *prev) +{ + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *next = NULL; + conf = this->private; + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == prev) { + /* if prev is last in conf->subvolumes, then wrap + * around. + */ + if ((i + 1) < conf->subvolume_cnt) { + next = conf->subvolumes[i + 1]; + } else { + next = conf->subvolumes[0]; + } + break; + } + } + +out: + return next; +} int dht_subvol_cnt (xlator_t *this, xlator_t *subvol) { @@ -620,20 +734,36 @@ dht_migration_complete_check_task (void *data) call_frame_t *frame = NULL; loc_t tmp_loc = {0,}; char *path = NULL; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + uint64_t tmp_subvol = 0; + int open_failed = 0; this = THIS; frame = data; local = frame->local; + conf = this->private; src_node = local->cached_subvol; - /* getxattr on cached_subvol for 'linkto' value */ - if (!local->loc.inode) + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + + if (!local->loc.inode) { ret = syncop_fgetxattr (src_node, local->fd, &dict, - DHT_LINKFILE_KEY); - else + conf->link_xattr_name); + } else { + SYNCTASK_SETID (0, 0); ret = syncop_getxattr (src_node, &local->loc, &dict, - DHT_LINKFILE_KEY); + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } if (!ret) dst_node = dht_linkfile_subvol (this, NULL, NULL, dict); @@ -684,10 +814,7 @@ dht_migration_complete_check_task (void *data) /* update inode ctx (the layout) */ dht_layout_unref (this, local->layout); - if (!local->loc.inode) - ret = dht_layout_preset (this, dst_node, local->fd->inode); - else - ret = dht_layout_preset (this, dst_node, local->loc.inode); + ret = dht_layout_preset (this, dst_node, inode); if (ret != 0) { gf_log (this->name, GF_LOG_DEBUG, "%s: could not set preset layout for subvol %s", @@ -705,10 +832,7 @@ dht_migration_complete_check_task (void *data) goto out; } - if (!local->loc.inode) - ret = dht_layout_set (this, local->fd->inode, layout); - else - ret = dht_layout_set (this, local->loc.inode, layout); + ret = dht_layout_set (this, inode, layout); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set the new layout", @@ -719,42 +843,46 @@ dht_migration_complete_check_task (void *data) local->cached_subvol = dst_node; ret = 0; - if (!local->fd) + /* once we detect the migration complete, the inode-ctx2 is no more + required.. delete the ctx and also, it means, open() already + done on all the fd of inode */ + ret = inode_ctx_reset1 (inode, this, &tmp_subvol); + if (tmp_subvol) goto out; - /* once we detect the migration complete, the fd-ctx is no more - required.. delete the ctx, and do one extra 'fd_unref' for open fd */ - ret = fd_ctx_del (local->fd, this, NULL); - if (!ret) { - fd_unref (local->fd); - ret = 0; + if (list_empty (&inode->fd_list)) goto out; - } - /* if 'local->fd' (ie, fd based operation), send a 'open()' on - destination if not already done */ - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; - ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - GF_FREE (path); + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID(0, 0); + + /* perform 'open()' on all the fd's present on the inode */ + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; + ret = syncop_open (dst_node, &tmp_loc, + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + GF_FREE (path); + + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + if (open_failed) { + ret = -1; goto out; } - - /* need this unref for the fd on src_node */ - fd_unref (local->fd); ret = 0; out: @@ -798,20 +926,34 @@ dht_rebalance_inprogress_task (void *data) char *path = NULL; struct iatt stbuf = {0,}; loc_t tmp_loc = {0,}; + dht_conf_t *conf = NULL; + inode_t *inode = NULL; + fd_t *iter_fd = NULL; + int open_failed = 0; this = THIS; frame = data; local = frame->local; + conf = this->private; src_node = local->cached_subvol; - /* getxattr on cached_subvol for 'linkto' value */ - if (local->loc.inode) + if (!local->loc.inode && !local->fd) + goto out; + + inode = (!local->fd) ? local->loc.inode : local->fd->inode; + + /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr + * as root:root. If a fd is already open, access check wont be done*/ + if (local->loc.inode) { + SYNCTASK_SETID (0, 0); ret = syncop_getxattr (src_node, &local->loc, &dict, - DHT_LINKFILE_KEY); - else + conf->link_xattr_name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + } else { ret = syncop_fgetxattr (src_node, local->fd, &dict, - DHT_LINKFILE_KEY); + conf->link_xattr_name); + } if (ret) { gf_log (this->name, GF_LOG_ERROR, @@ -853,33 +995,46 @@ dht_rebalance_inprogress_task (void *data) ret = 0; - if (!local->fd) - goto out; + if (list_empty (&inode->fd_list)) + goto done; + + /* perform open as root:root. There is window between linkfile + * creation(root:root) and setattr with the correct uid/gid + */ + SYNCTASK_SETID (0, 0); + + tmp_loc.inode = inode; + inode_path (inode, NULL, &path); + if (path) + tmp_loc.path = path; + + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + if (fd_is_anonymous (iter_fd)) + continue; - if (local->loc.inode) { - ret = syncop_open (dst_node, &local->loc, - local->fd->flags, local->fd); - } else { - tmp_loc.inode = local->fd->inode; - inode_path (local->fd->inode, NULL, &path); - if (path) - tmp_loc.path = path; ret = syncop_open (dst_node, &tmp_loc, - local->fd->flags, local->fd); - GF_FREE (path); + iter_fd->flags, iter_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "failed to send open " + "the fd (%p, flags=0%o) on file %s @ %s", + iter_fd, iter_fd->flags, path, dst_node->name); + open_failed = 1; + } } + GF_FREE (path); - if (ret == -1) { - gf_log (this->name, GF_LOG_ERROR, - "%s: failed to send open() on target file at %s", - local->loc.path, dst_node->name); + SYNCTASK_SETID (frame->root->uid, frame->root->gid); + + if (open_failed) { + ret = -1; goto out; } - ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node); +done: + ret = dht_inode_ctx_set1 (this, inode, dst_node); if (ret) { gf_log (this->name, GF_LOG_ERROR, - "%s: failed to set fd-ctx target file at %s", + "%s: failed to set inode-ctx target file at %s", local->loc.path, dst_node->name); goto out; } @@ -931,6 +1086,9 @@ dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, dht_stat_time_t *time = 0; int ret = -1; + GF_VALIDATE_OR_GOTO (this->name, stat, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + ret = dht_inode_ctx_get (inode, this, &ctx); if (ret) { @@ -949,7 +1107,7 @@ dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat, stat->ia_atime, stat->ia_atime_nsec, inode, post); ret = dht_inode_ctx_set (inode, this, ctx); - +out: return 0; } diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c index f17cb73b9..ece84151a 100644 --- a/xlators/cluster/dht/src/dht-inode-read.c +++ b/xlators/cluster/dht/src/dht-inode-read.c @@ -130,10 +130,11 @@ int dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata) { - uint64_t tmp_subvol = 0; + xlator_t *subvol = 0; dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -154,21 +155,23 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (local->call_cnt != 1) goto out; + local->op_errno = op_errno; /* Check if the rebalance phase2 is true */ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (ret) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { /* Phase 2 of migration */ local->rebalance.target_op_fn = dht_attr2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. */ dht_attr2 (this, frame, 0); - } - if (!ret) return 0; + } } out: @@ -381,6 +384,8 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { dht_local_t *local = NULL; int ret = 0; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; if (!local) { @@ -396,19 +401,21 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if ((op_ret == -1) && (op_errno != ENOENT)) goto out; + local->op_errno = op_errno; if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) { /* File would be migrated to other node */ - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + ret = dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_readv2; ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; } else { /* value is already set in fd_ctx, that means no need to check for whether its complete or not. */ dht_readv2 (this, frame, 0); - } - if (!ret) return 0; + } } out: @@ -499,24 +506,34 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int ret = -1; dht_local_t *local = NULL; xlator_t *subvol = NULL; + call_frame_t *prev = NULL; local = frame->local; + prev = cookie; + if (!prev || !prev->this) + goto out; if (local->call_cnt != 1) goto out; if ((op_ret == -1) && (op_errno == ENOTCONN) && IA_ISDIR(local->loc.inode->ia_type)) { - subvol = dht_first_up_subvol (this); + subvol = dht_subvol_next_available (this, prev->this); if (!subvol) goto out; + /* check if we are done with visiting every node */ + if (subvol == local->cached_subvol) { + goto out; + } + STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access, &local->loc, local->rebalance.flags, NULL); return 0; } if ((op_ret == -1) && (op_errno == ENOENT)) { /* File would be migrated to other node */ + local->op_errno = op_errno; local->rebalance.target_op_fn = dht_access2; ret = dht_rebalance_complete_check (frame->this, frame); if (!ret) @@ -604,8 +621,9 @@ int dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, dict_t *xdata) { - dht_local_t *local = NULL; - int ret = -1; + dht_local_t *local = NULL; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; @@ -615,8 +633,8 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, goto out; /* If context is set, then send flush() it to the destination */ - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_flush2 (this, frame, 0); return 0; } @@ -632,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -701,12 +715,14 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + inode_t *inode = NULL; + xlator_t *subvol = 0; local = frame->local; prev = cookie; local->op_errno = op_errno; - if (op_ret == -1) { + if (op_ret == -1 && (op_errno != ENOENT)) { gf_log (this->name, GF_LOG_DEBUG, "subvolume %s returned -1 (%s)", prev->this->name, strerror (op_errno)); @@ -721,8 +737,9 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, goto out; } - ret = fd_ctx_get (local->fd, this, NULL); - if (ret) { + local->op_errno = op_errno; + dht_inode_ctx_get1 (this, inode, &subvol); + if (!subvol) { local->rebalance.target_op_fn = dht_fsync2; /* Check if the rebalance phase1 is true */ @@ -737,11 +754,12 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); } + if (!ret) + return 0; } else { dht_fsync2 (this, frame, 0); - } - if (!ret) return 0; + } out: DHT_STRIP_PHASE1_FLAGS (postbuf); @@ -757,15 +775,10 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; - + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c index d4a3ecc39..4b3f3a049 100644 --- a/xlators/cluster/dht/src/dht-inode-write.c +++ b/xlators/cluster/dht/src/dht-inode-write.c @@ -19,6 +19,9 @@ int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret); int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret); int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret); +int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret); +int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret); int dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -27,8 +30,9 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, { dht_local_t *local = NULL; int ret = -1; + xlator_t *subvol = NULL; - if (op_ret == -1) { + if (op_ret == -1 && (op_errno != ENOENT)) { goto out; } @@ -50,6 +54,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->rebalance.target_op_fn = dht_writev2; + local->op_errno = op_errno; /* Phase 2 of migration */ if (IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); @@ -62,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { dht_writev2 (this, frame, 0); return 0; } @@ -87,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; local = frame->local; - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -169,6 +170,8 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; call_frame_t *prev = NULL; int ret = -1; + xlator_t *subvol = NULL; + inode_t *inode = NULL; GF_VALIDATE_OR_GOTO ("dht", frame, err); GF_VALIDATE_OR_GOTO ("dht", this, out); @@ -198,6 +201,7 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->rebalance.target_op_fn = dht_truncate2; + local->op_errno = op_errno; /* Phase 2 of migration */ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { ret = dht_rebalance_complete_check (this, frame); @@ -209,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { dht_iatt_merge (this, &local->stbuf, postbuf, NULL); dht_iatt_merge (this, &local->prebuf, prebuf, NULL); - ret = fd_ctx_get (local->fd, this, NULL); - if (!ret) { + inode = (local->fd) ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); + if (subvol) { dht_truncate2 (this, frame, 0); return 0; } @@ -234,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = local->fd ? local->fd->inode : local->loc.inode; + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; @@ -346,6 +348,407 @@ err: return 0; } + +int +dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_fallocate2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_fallocate2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate, + local->fd, local->rebalance.flags, local->rebalance.offset, + local->rebalance.size, NULL); + + return 0; +} + +int +dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.flags = mode; + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_fallocate_cbk, + subvol, subvol->fops->fallocate, + fd, mode, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + xlator_t *subvol = NULL; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_discard2; + + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + if (subvol) { + dht_discard2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (discard, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + + dht_inode_ctx_get1 (this, local->fd->inode, &subvol); + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", frame, err); + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", frame->local, out); + GF_VALIDATE_OR_GOTO ("dht", cookie, out); + + local = frame->local; + prev = cookie; + + if ((op_ret == -1) && (op_errno != ENOENT)) { + local->op_errno = op_errno; + local->op_ret = -1; + gf_log (this->name, GF_LOG_DEBUG, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto out; + } + + if (local->call_cnt != 1) { + if (local->stbuf.ia_blocks) { + dht_iatt_merge (this, postbuf, &local->stbuf, NULL); + dht_iatt_merge (this, prebuf, &local->prebuf, NULL); + } + goto out; + } + local->rebalance.target_op_fn = dht_zerofill2; + /* Phase 2 of migration */ + if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) { + ret = dht_rebalance_complete_check (this, frame); + if (!ret) + return 0; + } + + /* Check if the rebalance phase1 is true */ + if (IS_DHT_MIGRATION_PHASE1 (postbuf)) { + dht_iatt_merge (this, &local->stbuf, postbuf, NULL); + dht_iatt_merge (this, &local->prebuf, prebuf, NULL); + ret = fd_ctx_get (local->fd, this, NULL); + if (!ret) { + dht_zerofill2 (this, frame, 0); + return 0; + } + ret = dht_rebalance_in_progress_check (this, frame); + if (!ret) + return 0; + } + +out: + DHT_STRIP_PHASE1_FLAGS (postbuf); + DHT_STRIP_PHASE1_FLAGS (prebuf); + DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno, + prebuf, postbuf, xdata); +err: + return 0; +} + +int +dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret) +{ + dht_local_t *local = NULL; + xlator_t *subvol = NULL; + uint64_t tmp_subvol = 0; + int ret = -1; + + local = frame->local; + + if (local->fd) + ret = fd_ctx_get (local->fd, this, &tmp_subvol); + if (!ret) + subvol = (xlator_t *)(long)tmp_subvol; + + if (!subvol) + subvol = local->cached_subvol; + + local->call_cnt = 2; /* This is the second attempt */ + + STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + local->fd, local->rebalance.offset, local->rebalance.size, + NULL); + + return 0; +} + +int +dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL); + if (!local) { + op_errno = ENOMEM; + goto err; + } + + local->rebalance.offset = offset; + local->rebalance.size = len; + + local->call_cnt = 1; + subvol = local->cached_subvol; + if (!subvol) { + gf_log (this->name, GF_LOG_DEBUG, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill, + fd, offset, len, xdata); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + + /* handle cases of migration here for 'setattr()' calls */ int dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -397,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret) { dht_local_t *local = NULL; xlator_t *subvol = NULL; - uint64_t tmp_subvol = 0; - int ret = -1; + inode_t *inode = NULL; local = frame->local; - if (local->fd) - ret = fd_ctx_get (local->fd, this, &tmp_subvol); - if (!ret) - subvol = (xlator_t *)(long)tmp_subvol; + inode = (local->fd) ? local->fd->inode : local->loc.inode; + + dht_inode_ctx_get1 (this, inode, &subvol); if (!subvol) subvol = local->cached_subvol; diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c index 8cae52653..38e9970a7 100644 --- a/xlators/cluster/dht/src/dht-layout.c +++ b/xlators/cluster/dht/src/dht-layout.c @@ -158,7 +158,7 @@ dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name) int ret = 0; - ret = dht_hash_compute (layout->type, name, &hash); + ret = dht_hash_compute (this, layout->type, name, &hash); if (ret != 0) { gf_log (this->name, GF_LOG_WARNING, "hash computation failed for type=%d name=%s", @@ -335,11 +335,12 @@ int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, int op_ret, int op_errno, dict_t *xattr) { - int i = 0; - int ret = -1; - int err = -1; - void *disk_layout_raw = NULL; - int disk_layout_len = 0; + int i = 0; + int ret = -1; + int err = -1; + void *disk_layout_raw = NULL; + int disk_layout_len = 0; + dht_conf_t *conf = this->private; if (op_ret != 0) { err = op_errno; @@ -360,12 +361,12 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, if (xattr) { /* during lookup and not mkdir */ - ret = dict_get_ptr_and_len (xattr, "trusted.glusterfs.dht", + ret = dict_get_ptr_and_len (xattr, conf->xattr_name, &disk_layout_raw, &disk_layout_len); } if (ret != 0) { - layout->list[i].err = -1; + layout->list[i].err = 0; gf_log (this->name, GF_LOG_TRACE, "missing disk layout on %s. err = %d", subvol->name, err); @@ -412,6 +413,22 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j) layout->list[j].err = err_swap; } +void +dht_layout_range_swap (dht_layout_t *layout, int i, int j) +{ + uint32_t start_swap = 0; + uint32_t stop_swap = 0; + + start_swap = layout->list[i].start; + stop_swap = layout->list[i].stop; + + layout->list[i].start = layout->list[j].start; + layout->list[i].stop = layout->list[j].stop; + + layout->list[j].start = start_swap; + layout->list[j].stop = stop_swap; +} + int64_t dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j) { @@ -437,12 +454,19 @@ dht_layout_entry_cmp (dht_layout_t *layout, int i, int j) { int64_t diff = 0; + /* swap zero'ed out layouts to front, if needed */ + if (!layout->list[j].start && !layout->list[j].stop) { + diff = (int64_t) layout->list[i].stop + - (int64_t) layout->list[j].stop; + goto out; + } if (layout->list[i].err || layout->list[j].err) diff = layout->list[i].err - layout->list[j].err; else diff = (int64_t) layout->list[i].start - (int64_t) layout->list[j].start; +out: return diff; } @@ -513,23 +537,30 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, prev_stop = last_stop; for (i = 0; i < layout->cnt; i++) { - if (layout->list[i].err) { - switch (layout->list[i].err) { - case -1: - case ENOENT: - missing++; - break; - case ENOTCONN: - down++; - break; - case ENOSPC: - no_space++; - break; - default: - misc++; + switch (layout->list[i].err) { + case -1: + case ENOENT: + missing++; + continue; + case ENOTCONN: + down++; + continue; + case ENOSPC: + no_space++; + continue; + case 0: + /* if err == 0 and start == stop, then it is a non misc++; + * participating subvolume(spread-cnt). Then, do not + * check for anomalies. If start != stop, then treat it + * as misc err */ + if (layout->list[i].start == layout->list[i].stop) { + continue; } + break; + default: + misc++; continue; - } + } is_virgin = 0; @@ -634,30 +665,29 @@ out: } int -dht_dir_has_layout (dict_t *xattr) +dht_dir_has_layout (dict_t *xattr, char *name) { void *disk_layout_raw = NULL; - return dict_get_ptr (xattr, "trusted.glusterfs.dht", - &disk_layout_raw); - + return dict_get_ptr (xattr, name, &disk_layout_raw); } int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, loc_t *loc, dict_t *xattr) { - int idx = 0; - int pos = -1; - int ret = 0; - int err = 0; - int dict_ret = 0; - int32_t disk_layout[4]; - void *disk_layout_raw = NULL; - int32_t count = -1; - uint32_t start_off = -1; - uint32_t stop_off = -1; + int idx = 0; + int pos = -1; + int ret = 0; + int err = 0; + int dict_ret = 0; + int32_t disk_layout[4]; + void *disk_layout_raw = NULL; + int32_t count = -1; + uint32_t start_off = -1; + uint32_t stop_off = -1; + dht_conf_t *conf = this->private; for (idx = 0; idx < layout->cnt; idx++) { @@ -687,7 +717,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, goto out; } - dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", + dict_ret = dict_get_ptr (xattr, conf->xattr_name, &disk_layout_raw); if (dict_ret < 0) { diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c index 803f344af..dbc9d0b3c 100644 --- a/xlators/cluster/dht/src/dht-linkfile.c +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -19,8 +19,37 @@ #include "compat.h" #include "dht-common.h" +int +dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct iatt *stbuf, dict_t *xattr, + struct iatt *postparent) +{ + char is_linkfile = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret) + goto out; + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); + if (!is_linkfile) + gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s", + prev->this->name, local->loc.path); +out: + local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, + inode, stbuf, postparent, postparent, + xattr); + return 0; +} +#define is_equal(a, b) (a == b) int dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, inode_t *inode, @@ -28,29 +57,68 @@ dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, struct iatt *postparent, dict_t *xdata) { dht_local_t *local = NULL; + xlator_t *subvol = NULL; + call_frame_t *prev = NULL; + dict_t *xattrs = NULL; + dht_conf_t *conf = NULL; + int ret = -1; local = frame->local; + if (!op_ret) + local->linked = _gf_true; + + FRAME_SU_UNDO (frame, dht_local_t); + + if (op_ret && (op_errno == EEXIST)) { + conf = this->private; + prev = cookie; + subvol = prev->this; + if (!subvol) + goto out; + xattrs = dict_new (); + if (!xattrs) + goto out; + ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set linkto key"); + goto out; + } + + STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol, + subvol->fops->lookup, &local->loc, xattrs); + if (xattrs) + dict_unref (xattrs); + return 0; + } +out: local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, inode, stbuf, preparent, postparent, xdata); + if (xattrs) + dict_unref (xattrs); return 0; } int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *this, xlator_t *tovol, xlator_t *fromvol, loc_t *loc) { dht_local_t *local = NULL; dict_t *dict = NULL; int need_unref = 0; int ret = 0; + dht_conf_t *conf = this->private; local = frame->local; local->linkfile.linkfile_cbk = linkfile_cbk; local->linkfile.srcvol = tovol; + local->linked = _gf_false; + dict = local->params; if (!dict) { dict = dict_new (); @@ -71,8 +139,7 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, gf_log ("dht-linkfile", GF_LOG_INFO, "%s: internal-fop set failed", loc->path); - ret = dict_set_str (dict, "trusted.glusterfs.dht.linkto", - tovol->name); + ret = dict_set_str (dict, conf->link_xattr_name, tovol->name); if (ret < 0) { gf_log (frame->this->name, GF_LOG_INFO, @@ -81,6 +148,10 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, goto out; } + local->link_subvol = fromvol; + /* Always create as root:root. dht_linkfile_attr_heal fixes the + * ownsership */ + FRAME_SU_DO (frame, dht_local_t); STACK_WIND (frame, dht_linkfile_create_cbk, fromvol, fromvol->fops->mknod, loc, S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict); @@ -173,7 +244,7 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, if (!xattr) goto out; - ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname); + ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname); if ((-1 == ret) || !volname) goto out; @@ -188,3 +259,70 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf, out: return subvol; } + +int +dht_linkfile_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *statpre, + struct iatt *statpost, dict_t *xdata) +{ + dht_local_t *local = NULL; + loc_t *loc = NULL; + + local = frame->local; + loc = &local->loc; + + if (op_ret) + gf_log (this->name, GF_LOG_ERROR, "setattr of uid/gid on %s" + " :<gfid:%s> failed (%s)", + (loc->path? loc->path: "NULL"), + uuid_utoa(local->gfid), strerror(op_errno)); + + DHT_STACK_DESTROY (frame); + + return 0; +} + +int +dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this) +{ + int ret = -1; + call_frame_t *copy = NULL; + dht_local_t *local = NULL; + dht_local_t *copy_local = NULL; + xlator_t *subvol = NULL; + struct iatt stbuf = {0,}; + + local = frame->local; + + GF_VALIDATE_OR_GOTO ("dht", local, out); + GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out); + + if (local->stbuf.ia_type == IA_INVAL) + return 0; + + uuid_copy (local->loc.gfid, local->stbuf.ia_gfid); + + copy = copy_frame (frame); + + if (!copy) + goto out; + + copy_local = dht_local_init (copy, &local->loc, NULL, 0); + + if (!copy_local) + goto out; + + stbuf = local->stbuf; + subvol = local->link_subvol; + + copy->local = copy_local; + + FRAME_SU_DO (copy, dht_local_t); + + STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol, + subvol->fops->setattr, ©_local->loc, + &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL); + ret = 0; +out: + return ret; +} diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index a2e96a1b2..bcb19f23e 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -16,6 +16,7 @@ #include "dht-common.h" #include "xlator.h" +#include <fnmatch.h> #define GF_DISK_SECTOR_SIZE 512 #define DHT_REBALANCE_PID 4242 /* Change it if required */ @@ -101,12 +102,16 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, data_t *data = NULL; struct iatt iatt = {0,}; int32_t op_errno = 0; + dht_conf_t *conf = NULL; GF_VALIDATE_OR_GOTO ("defrag", loc, out); GF_VALIDATE_OR_GOTO ("defrag", loc->name, out); GF_VALIDATE_OR_GOTO ("defrag", stbuf, out); GF_VALIDATE_OR_GOTO ("defrag", this, out); GF_VALIDATE_OR_GOTO ("defrag", xattrs, out); + GF_VALIDATE_OR_GOTO ("defrag", this->private, out); + + conf = this->private; if (uuid_is_null (loc->pargfid)) { gf_log ("", GF_LOG_ERROR, "loc->pargfid is NULL for " @@ -137,10 +142,10 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs, gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s " "with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid), cached_subvol->name, hashed_subvol->name); - data = dict_get (xattrs, DHT_LINKFILE_KEY); + data = dict_get (xattrs, conf->link_xattr_name); /* set linkto on cached -> hashed if not present, else link it */ if (!data) { - ret = dict_set_str (xattrs, DHT_LINKFILE_KEY, + ret = dict_set_str (xattrs, conf->link_xattr_name, hashed_subvol->name); if (ret) { gf_log (this->name, GF_LOG_ERROR, "Failed to set " @@ -238,14 +243,16 @@ out: static inline int __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf, - dict_t *dict, fd_t **dst_fd) + dict_t *dict, fd_t **dst_fd, dict_t *xattr) { - xlator_t *this = NULL; - int ret = -1; - fd_t *fd = NULL; - struct iatt new_stbuf = {0,}; + xlator_t *this = NULL; + int ret = -1; + fd_t *fd = NULL; + struct iatt new_stbuf = {0,}; + dht_conf_t *conf = NULL; this = THIS; + conf = this->private; ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16); if (ret) { @@ -254,7 +261,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc goto out; } - ret = dict_set_str (dict, DHT_LINKFILE_KEY, from->name); + ret = dict_set_str (dict, conf->link_xattr_name, from->name); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set gfid in dict for create", loc->path); @@ -292,7 +299,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc /* Create the destination with LINKFILE mode, and linkto xattr, if the linkfile already exists, it will just open the file */ ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, - dict); + dict, &new_stbuf); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "failed to create %s on %s (%s)", @@ -300,6 +307,18 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc goto out; } + ret = syncop_fsetxattr (to, fd, xattr, 0); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to set xattr on %s (%s)", + loc->path, to->name, strerror (errno)); + + ret = syncop_ftruncate (to, fd, stbuf->ia_size); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "ftruncate failed for %s on %s (%s)", + loc->path, to->name, strerror (errno)); + ret = syncop_fsetattr (to, fd, stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL, NULL); @@ -327,6 +346,9 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, int ret = -1; xlator_t *this = NULL; + uint64_t src_statfs_blocks = 1; + uint64_t dst_statfs_blocks = 1; + this = THIS; ret = syncop_statfs (from, loc, &src_statfs); @@ -350,22 +372,34 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc, if (flag != GF_DHT_MIGRATE_DATA) goto check_avail_space; - if (((dst_statfs.f_bavail * - dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) < - (((src_statfs.f_bavail * src_statfs.f_bsize) / - GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) { - gf_log (this->name, GF_LOG_WARNING, - "data movement attempted from node (%s) with" - " higher disk space to a node (%s) with " - "lesser disk space (%s)", from->name, - to->name, loc->path); - - /* this is not a 'failure', but we don't want to - consider this as 'success' too :-/ */ - ret = 1; - goto out; + /* Check: + During rebalance `migrate-data` - Destination subvol experiences + a `reduction` in 'blocks' of free space, at the same time source + subvol gains certain 'blocks' of free space. A valid check is + necessary here to avoid errorneous move to destination where + the space could be scantily available. + */ + if (stbuf) { + dst_statfs_blocks = ((dst_statfs.f_bavail * + dst_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + src_statfs_blocks = ((src_statfs.f_bavail * + src_statfs.f_bsize) / + GF_DISK_SECTOR_SIZE); + if ((dst_statfs_blocks - stbuf->ia_blocks) < + (src_statfs_blocks + stbuf->ia_blocks)) { + gf_log (this->name, GF_LOG_WARNING, + "data movement attempted from node (%s) with" + " higher disk space to a node (%s) with " + "lesser disk space (%s)", from->name, + to->name, loc->path); + + /* this is not a 'failure', but we don't want to + consider this as 'success' too :-/ */ + ret = 1; + goto out; + } } - check_avail_space: if (((dst_statfs.f_bavail * dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) { @@ -442,8 +476,10 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, dict_t *dict = NULL; xlator_t *this = NULL; struct iatt iatt = {0,}; + dht_conf_t *conf = NULL; this = THIS; + conf = this->private; fd = fd_create (loc->inode, DHT_REBALANCE_PID); if (!fd) { @@ -466,7 +502,7 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc, if (!dict) goto out; - ret = dict_set_str (dict, DHT_LINKFILE_KEY, to->name); + ret = dict_set_str (dict, conf->link_xattr_name, to->name); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to set xattr in dict for %s (linkto:%s)", @@ -519,12 +555,13 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, dict_t *dict = NULL; char *link = NULL; struct iatt stbuf = {0,}; + dht_conf_t *conf = this->private; dict = dict_new (); if (!dict) goto out; - ret = dict_set_int32 (dict, DHT_LINKFILE_KEY, 256); + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set 'linkto' key in dict", loc->path); @@ -540,12 +577,13 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, } /* we no more require this key */ - dict_del (dict, DHT_LINKFILE_KEY); + dict_del (dict, conf->link_xattr_name); /* file exists in target node, only if it is 'linkfile' its valid, otherwise, error out */ if (!ret) { - if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict)) { + if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict, + conf->link_xattr_name)) { gf_log (this->name, GF_LOG_WARNING, "%s: file exists in destination", loc->path); ret = -1; @@ -581,7 +619,7 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, goto out; } - ret = syncop_symlink (to, loc, link, dict); + ret = syncop_symlink (to, loc, link, dict, 0); if (ret) { gf_log (this->name, GF_LOG_WARNING, "%s: creating symlink failed (%s)", @@ -595,7 +633,7 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot, buf->ia_type), makedev (ia_major (buf->ia_rdev), - ia_minor (buf->ia_rdev)), dict); + ia_minor (buf->ia_rdev)), dict, 0); if (ret) { gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)", loc->path, strerror (errno)); @@ -603,6 +641,15 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc, } done: + ret = syncop_setattr (to, loc, buf, + (GF_SET_ATTR_UID | GF_SET_ATTR_GID | + GF_SET_ATTR_MODE), NULL, NULL); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform setattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + ret = syncop_unlink (from, loc); if (ret) gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)", @@ -640,6 +687,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, dict_t *xattr = NULL; dict_t *xattr_rsp = NULL; int file_has_holes = 0; + dht_conf_t *conf = this->private; gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s", loc->path, from->name, to->name); @@ -648,7 +696,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, if (!dict) goto out; - ret = dict_set_int32 (dict, DHT_LINKFILE_KEY, 256); + ret = dict_set_int32 (dict, conf->link_xattr_name, 256); if (ret) { gf_log (this->name, GF_LOG_ERROR, "%s: failed to set 'linkto' key in dict", loc->path); @@ -664,7 +712,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } /* we no more require this key */ - dict_del (dict, DHT_LINKFILE_KEY); + dict_del (dict, conf->link_xattr_name); /* preserve source mode, so set the same to the destination */ src_ia_prot = stbuf.ia_prot; @@ -681,9 +729,16 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + /* TODO: move all xattr related operations to fd based operations */ + ret = syncop_listxattr (from, loc, &xattr); + if (ret == -1) + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to get xattr from %s (%s)", + loc->path, from->name, strerror (errno)); + /* create the destination, with required modes/xattr */ ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf, - dict, &dst_fd); + dict, &dst_fd, xattr); if (ret) goto out; @@ -700,6 +755,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + ret = syncop_fstat (from, src_fd, &stbuf); if (ret) { gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)", @@ -729,19 +785,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } - /* TODO: move all xattr related operations to fd based operations */ - ret = syncop_listxattr (from, loc, &xattr); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to get xattr from %s (%s)", - loc->path, from->name, strerror (errno)); - - ret = syncop_setxattr (to, loc, xattr, 0); - if (ret == -1) - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to set xattr on %s (%s)", - loc->path, to->name, strerror (errno)); - /* TODO: Sync the locks */ ret = syncop_fsync (to, dst_fd, 0); @@ -805,6 +848,23 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, goto out; } + /* Free up the data blocks on the source node, as the whole + file is migrated */ + ret = syncop_ftruncate (from, src_fd, 0); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform truncate on %s (%s)", + loc->path, from->name, strerror (errno)); + } + + /* remove the 'linkto' xattr from the destination */ + ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name); + if (ret) { + gf_log (this->name, GF_LOG_WARNING, + "%s: failed to perform removexattr on %s (%s)", + loc->path, to->name, strerror (errno)); + } + /* Do a stat and check the gfid before unlink */ ret = syncop_stat (from, loc, &empty_iatt); if (ret) { @@ -825,23 +885,6 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } } - /* Free up the data blocks on the source node, as the whole - file is migrated */ - ret = syncop_ftruncate (from, src_fd, 0); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform truncate on %s (%s)", - loc->path, from->name, strerror (errno)); - } - - /* remove the 'linkto' xattr from the destination */ - ret = syncop_fremovexattr (to, dst_fd, DHT_LINKFILE_KEY); - if (ret) { - gf_log (this->name, GF_LOG_WARNING, - "%s: failed to perform removexattr on %s (%s)", - loc->path, to->name, strerror (errno)); - } - ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL); if (ret) { gf_log (this->name, GF_LOG_DEBUG, @@ -1031,6 +1074,31 @@ gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag) return 0; } +static gf_boolean_t +gf_defrag_pattern_match (gf_defrag_info_t *defrag, char *name, uint64_t size) +{ + gf_defrag_pattern_list_t *trav = NULL; + gf_boolean_t match = _gf_false; + gf_boolean_t ret = _gf_false; + + GF_VALIDATE_OR_GOTO ("dht", defrag, out); + + trav = defrag->defrag_pattern; + while (trav) { + if (!fnmatch (trav->path_pattern, name, FNM_NOESCAPE)) { + match = _gf_true; + break; + } + trav = trav->next; + } + + if ((match == _gf_true) && (size >= trav->size)) + ret = _gf_true; + + out: + return ret; +} + /* We do a depth first traversal of directories. But before we move into * subdirs, we complete the data migration of those directories whose layouts * have been fixed @@ -1058,6 +1126,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, struct timeval end = {0,}; double elapsed = {0,}; struct timeval start = {0,}; + int32_t err = 0; gf_log (this->name, GF_LOG_INFO, "migrate data called on %s", loc->path); @@ -1080,17 +1149,24 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, &entries)) != 0) { - if (ret < 0) - break; + + if (ret < 0) { + + gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s." + " Aborting migrate-data", + strerror(readdir_operrno)); + goto out; + } /* Need to keep track of ENOENT errno, that means, there is no need to send more readdirp() */ readdir_operrno = errno; - free_entries = _gf_true; - if (list_empty (&entries.list)) break; + + free_entries = _gf_true; + list_for_each_entry_safe (entry, tmp, &entries.list, list) { if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { ret = 1; @@ -1110,6 +1186,12 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, if (defrag->stats == _gf_true) { gettimeofday (&start, NULL); } + if (defrag->defrag_pattern && + (gf_defrag_pattern_match (defrag, entry->d_name, + entry->d_stat.ia_size) + == _gf_false)) { + continue; + } loc_wipe (&entry_loc); ret =dht_build_child_loc (this, &entry_loc, loc, entry->d_name); @@ -1202,9 +1284,21 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, ret = syncop_setxattr (this, &entry_loc, migrate_data, 0); if (ret) { - gf_log (this->name, GF_LOG_ERROR, "migrate-data" - " failed for %s", entry_loc.path); - defrag->total_failures +=1; + err = op_errno; + /* errno is overloaded. See + * rebalance_task_completion () */ + if (err != ENOSPC) { + gf_log (this->name, GF_LOG_DEBUG, + "migrate-data skipped for %s" + " due to space constraints", + entry_loc.path); + defrag->skipped +=1; + } else{ + gf_log (this->name, GF_LOG_ERROR, + "migrate-data failed for %s", + entry_loc.path); + defrag->total_failures +=1; + } } if (ret == -1) { @@ -1284,6 +1378,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, dict_t *dict = NULL; off_t offset = 0; struct iatt iatt = {0,}; + int readdirp_errno = 0; ret = syncop_lookup (this, loc, NULL, &iatt, NULL, NULL); if (ret) { @@ -1319,12 +1414,22 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL, &entries)) != 0) { - if ((ret < 0) || (ret && (errno == ENOENT))) - break; - free_entries = _gf_true; + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s" + ". Aborting fix-layout",strerror(errno)); + goto out; + } + + /* Need to keep track of ENOENT errno, that means, there is no + need to send more readdirp() */ + readdirp_errno = errno; if (list_empty (&entries.list)) break; + + free_entries = _gf_true; + list_for_each_entry_safe (entry, tmp, &entries.list, list) { if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { ret = 1; @@ -1351,7 +1456,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, if (uuid_is_null (entry->d_stat.ia_gfid)) { gf_log (this->name, GF_LOG_ERROR, "%s/%s" - "gfid not present", loc->path, + " gfid not present", loc->path, entry->d_name); continue; } @@ -1361,7 +1466,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid); if (uuid_is_null (loc->gfid)) { gf_log (this->name, GF_LOG_ERROR, "%s/%s" - "gfid not present", loc->path, + " gfid not present", loc->path, entry->d_name); continue; } @@ -1400,6 +1505,8 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc, gf_dirent_free (&entries); free_entries = _gf_false; INIT_LIST_HEAD (&entries.list); + if (readdirp_errno == ENOENT) + break; } ret = 0; @@ -1590,6 +1697,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) uint64_t size = 0; uint64_t lookup = 0; uint64_t failures = 0; + uint64_t skipped = 0; char *status = ""; double elapsed = 0; struct timeval end = {0,}; @@ -1606,6 +1714,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) size = defrag->total_data; lookup = defrag->num_files_lookedup; failures = defrag->total_failures; + skipped = defrag->skipped; gettimeofday (&end, NULL); @@ -1629,6 +1738,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) gf_log (THIS->name, GF_LOG_WARNING, "failed to set lookedup file count"); + ret = dict_set_int32 (dict, "status", defrag->defrag_status); if (ret) gf_log (THIS->name, GF_LOG_WARNING, @@ -1641,6 +1751,14 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict) } ret = dict_set_uint64 (dict, "failures", failures); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set failure count"); + + ret = dict_set_uint64 (dict, "skipped", skipped); + if (ret) + gf_log (THIS->name, GF_LOG_WARNING, + "failed to set skipped file count"); log: switch (defrag->defrag_status) { case GF_DEFRAG_STATUS_NOT_STARTED: @@ -1658,13 +1776,15 @@ log: case GF_DEFRAG_STATUS_FAILED: status = "failed"; break; + default: + break; } gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f " "secs", status, elapsed); gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %" - PRIu64", lookups: %"PRIu64", failures: %"PRIu64, files, size, - lookup, failures); + PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: " + "%"PRIu64, files, size, lookup, failures, skipped); out: diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c index f1a4364df..5d6f4f232 100644 --- a/xlators/cluster/dht/src/dht-rename.c +++ b/xlators/cluster/dht/src/dht-rename.c @@ -306,7 +306,38 @@ err: NULL, NULL); return 0; } +#define DHT_MARK_FOP_INTERNAL(xattr) do { \ + int tmp = -1; \ + if (!xattr) { \ + xattr = dict_new (); \ + if (!xattr) \ + break; \ + } \ + tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \ + if (tmp) { \ + gf_log (this->name, GF_LOG_ERROR, "Failed to set" \ + " internal dict key for %s", local->loc.path); \ + } \ + }while (0) +int +dht_rename_done (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + local = frame->local; + + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } + DHT_STRIP_PHASE1_FLAGS (&local->stbuf); + DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, + &local->stbuf, &local->preoldparent, + &local->postoldparent, &local->preparent, + &local->postparent, NULL); + + return 0; +} int dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, @@ -340,11 +371,7 @@ dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, WIPE (&local->postparent); if (is_last_call (this_call_cnt)) { - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent, NULL); + dht_rename_done (frame, this); } out: @@ -362,7 +389,7 @@ dht_rename_cleanup (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; - + dict_t *xattr = NULL; local = frame->local; this = frame->this; @@ -386,13 +413,15 @@ dht_rename_cleanup (call_frame_t *frame) if (!call_cnt) goto nolinks; + DHT_MARK_FOP_INTERNAL (xattr); + if (dst_hashed != src_hashed && dst_hashed != src_cached) { gf_log (this->name, GF_LOG_TRACE, "unlinking linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc, 0, NULL); + &local->loc, 0, xattr); } if (src_cached != dst_hashed) { @@ -401,9 +430,12 @@ dht_rename_cleanup (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc2, 0, NULL); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); + return 0; nolinks: @@ -441,6 +473,10 @@ dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->loc.path, prev->this->name, strerror (op_errno)); } + if (local->linked == _gf_true) { + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } DHT_STACK_DESTROY (frame); return 0; @@ -463,6 +499,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, xlator_t *rename_subvol = NULL; call_frame_t *link_frame = NULL; dht_local_t *link_local = NULL; + dict_t *xattr = NULL; local = frame->local; prev = cookie; @@ -472,6 +509,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; + if (local->linked == _gf_true) + FRAME_SU_UNDO (frame, dht_local_t); if (op_ret == -1) { gf_log (this->name, GF_LOG_WARNING, "%s: rename on %s failed (%s)", local->loc.path, @@ -501,15 +540,25 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, uuid_copy (link_local->gfid, local->loc.inode->gfid); dht_linkfile_create (link_frame, dht_rename_links_create_cbk, - src_cached, dst_hashed, &link_local->loc); + this, src_cached, dst_hashed, + &link_local->loc); } err: - dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); - dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this); - dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this); - dht_iatt_merge (this, &local->preparent, prenewparent, prev->this); - dht_iatt_merge (this, &local->postparent, postnewparent, prev->this); + /* Merge attrs only from src_cached. In case there of src_cached != + * dst_hashed, this ignores linkfile attrs. */ + if (prev->this == src_cached) { + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); + dht_iatt_merge (this, &local->preoldparent, preoldparent, + prev->this); + dht_iatt_merge (this, &local->postoldparent, postoldparent, + prev->this); + dht_iatt_merge (this, &local->preparent, prenewparent, + prev->this); + dht_iatt_merge (this, &local->postparent, postnewparent, + prev->this); + } + /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk * is called. since rename has already happened on rename_subvol, @@ -534,6 +583,8 @@ err: if (local->call_cnt == 0) goto unwind; + DHT_MARK_FOP_INTERNAL (xattr); + if (src_cached != dst_hashed && src_cached != dst_cached) { gf_log (this->name, GF_LOG_TRACE, "deleting old src datafile %s @ %s", @@ -541,7 +592,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_cached, src_cached->fops->unlink, - &local->loc, 0, NULL); + &local->loc, 0, xattr); } if (src_hashed != rename_subvol && src_hashed != src_cached) { @@ -551,7 +602,7 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, src_hashed, src_hashed->fops->unlink, - &local->loc, 0, NULL); + &local->loc, 0, xattr); } if (dst_cached @@ -563,8 +614,10 @@ err: STACK_WIND (frame, dht_rename_unlink_cbk, dst_cached, dst_cached->fops->unlink, - &local->loc2, 0, NULL); + &local->loc2, 0, xattr); } + if (xattr) + dict_unref (xattr); return 0; unwind: @@ -572,16 +625,16 @@ unwind: WIPE (&local->postoldparent); WIPE (&local->preparent); WIPE (&local->postparent); + if (xattr) + dict_unref (xattr); - DHT_STRIP_PHASE1_FLAGS (&local->stbuf); - DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, - &local->stbuf, &local->preoldparent, - &local->postoldparent, &local->preparent, - &local->postparent, NULL); + dht_rename_done (frame, this); return 0; cleanup: + if (xattr) + dict_unref (xattr); dht_rename_cleanup (frame); return 0; @@ -615,6 +668,8 @@ dht_do_rename (call_frame_t *frame) "renaming %s => %s (%s)", local->loc.path, local->loc2.path, rename_subvol->name); + if (local->linked == _gf_true) + FRAME_SU_DO (frame, dht_local_t); STACK_WIND (frame, dht_rename_cbk, rename_subvol, rename_subvol->fops->rename, &local->loc, &local->loc2, NULL); @@ -645,6 +700,9 @@ dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, local->op_ret = -1; if (op_errno != ENOENT) local->op_errno = op_errno; + } else if (local->src_cached == prev->this) { + /* merge of attr returned only from linkfile creation */ + dht_iatt_merge (this, &local->stbuf, stbuf, prev->this); } this_call_cnt = dht_frame_return (frame); @@ -710,6 +768,7 @@ dht_rename_create_links (call_frame_t *frame) xlator_t *dst_hashed = NULL; xlator_t *dst_cached = NULL; int call_cnt = 0; + dict_t *xattr = NULL; local = frame->local; @@ -720,6 +779,7 @@ dht_rename_create_links (call_frame_t *frame) dst_hashed = local->dst_hashed; dst_cached = local->dst_cached; + DHT_MARK_FOP_INTERNAL (xattr); if (src_cached == dst_cached) { if (dst_hashed == dst_cached) @@ -731,7 +791,7 @@ dht_rename_create_links (call_frame_t *frame) STACK_WIND (frame, dht_rename_unlink_links_cbk, dst_hashed, dst_hashed->fops->unlink, - &local->loc2, 0, NULL); + &local->loc2, 0, xattr); return 0; } @@ -748,7 +808,7 @@ dht_rename_create_links (call_frame_t *frame) "linkfile %s @ %s => %s", local->loc.path, dst_hashed->name, src_cached->name); memcpy (local->gfid, local->loc.inode->gfid, 16); - dht_linkfile_create (frame, dht_rename_links_cbk, + dht_linkfile_create (frame, dht_rename_links_cbk, this, src_cached, dst_hashed, &local->loc); } @@ -758,7 +818,7 @@ dht_rename_create_links (call_frame_t *frame) local->loc2.path, src_cached->name); STACK_WIND (frame, dht_rename_links_cbk, src_cached, src_cached->fops->link, - &local->loc, &local->loc2, NULL); + &local->loc, &local->loc2, xattr); } nolinks: @@ -766,6 +826,8 @@ nolinks: /* skip to next step */ dht_do_rename (frame); } + if (xattr) + dict_unref (xattr); return 0; } diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c index fbe4cab3e..3fe96b1c7 100644 --- a/xlators/cluster/dht/src/dht-selfheal.c +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -17,6 +17,7 @@ #include "glusterfs.h" #include "xlator.h" #include "dht-common.h" +#include "glusterfs-acl.h" #define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \ layout->list[i].start = srt; \ @@ -28,6 +29,13 @@ layout->list[i].xlator->name, path); \ } while (0) +#define DHT_RESET_LAYOUT_RANGE(layout) do { \ + int cnt = 0; \ + for (cnt = 0; cnt < layout->cnt; cnt++ ) { \ + layout->list[cnt].start = 0; \ + layout->list[cnt].stop = 0; \ + } \ + } while (0) static uint32_t dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) @@ -38,12 +46,20 @@ dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n) if (old->list[o].err > 0 || new->list[n].err > 0) return 0; + if (old->list[o].start == old->list[o].stop) { + return 0; + } + + if (new->list[n].start == new->list[n].stop) { + return 0; + } + if ((old->list[o].start > new->list[n].stop) || (old->list[o].stop < new->list[n].start)) return 0; - return max (old->list[o].start, new->list[n].start) - - min (old->list[o].stop, new->list[n].stop); + return min (old->list[o].stop, new->list[n].stop) - + max (old->list[o].start, new->list[n].start) + 1; } @@ -110,7 +126,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, xlator_t *this = NULL; int32_t *disk_layout = NULL; dht_local_t *local = NULL; - + dht_conf_t *conf = NULL; local = frame->local; if (req_subvol) @@ -123,6 +139,9 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, GF_VALIDATE_OR_GOTO (this->name, layout, err); GF_VALIDATE_OR_GOTO (this->name, local, err); GF_VALIDATE_OR_GOTO (this->name, subvol, err); + VALIDATE_OR_GOTO (this->private, err); + + conf = this->private; xattr = get_new_dict (); if (!xattr) { @@ -137,8 +156,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, goto err; } - ret = dict_set_bin (xattr, "trusted.glusterfs.dht", - disk_layout, 4 * 4); + ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4); if (ret == -1) { gf_log (this->name, GF_LOG_WARNING, "%s: (subvol %s) failed to set xattr dictionary", @@ -229,9 +247,12 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) int missing_xattr = 0; int i = 0; xlator_t *this = NULL; + dht_conf_t *conf = NULL; + dht_layout_t *dummy = NULL; local = frame->local; this = frame->this; + conf = this->private; for (i = 0; i < layout->cnt; i++) { if (layout->list[i].err != -1 || !layout->list[i].stop) { @@ -265,6 +286,18 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) if (--missing_xattr == 0) break; } + dummy = dht_layout_new (this, 1); + if (!dummy) + goto out; + for (i = 0; i < conf->subvolume_cnt; i++) { + if (_gf_false == + dht_is_subvol_in_layout (layout, conf->subvolumes[i])) { + dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0, + conf->subvolumes[i]); + } + } + dht_layout_unref (this, dummy); +out: return 0; } @@ -501,7 +534,7 @@ dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc, uint32_t hashval = 0; int ret = 0; - ret = dht_hash_compute (layout->type, loc->path, &hashval); + ret = dht_hash_compute (this, layout->type, loc->path, &hashval); if (ret == 0) { start = (hashval % layout->cnt); } @@ -532,9 +565,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) for (i = 0; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1 || err == 0) { - layout->list[i].err = -1; + if (err == -1 || err == 0 || err == ENOENT) { + /* Setting list[i].err = -1 is an indication for + dht_selfheal_layout_new_directory() to assign + a range. We set it to -1 based on any one of + the three criteria: + + - err == -1 already, which means directory + existed but layout was not set on it. + + - err == 0, which means directory exists and + has an old layout piece which will be + overwritten now. + + - err == ENOENT, which means directory does + not exist (possibly racing with mkdir or + finishing half done mkdir). The missing + directory will be attempted to be recreated. + + It is important to note that it is safe + to race with mkdir() as self-heal and + mkdir are idempotent operations. Both will + strive to set the directory and layouts to + the same final state. + */ count++; + if (!err) + layout->list[i].err = -1; } } @@ -550,11 +607,10 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout) } /* if layout->spread_cnt is set, check if it is <= available - * subvolumes (excluding bricks that are being decommissioned). Else - * return count */ + * subvolumes (down brick and decommissioned bricks are considered + * un-availbale). Else return count (available up bricks) */ count = ((layout->spread_cnt && - (layout->spread_cnt <= - (conf->subvolume_cnt - conf->decommission_subvols_cnt))) ? + (layout->spread_cnt <= count)) ? layout->spread_cnt : ((count) ? count : 1)); return count; @@ -565,18 +621,25 @@ void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, dht_layout_t *new_layout); void dht_layout_entry_swap (dht_layout_t *layout, int i, int j); +void dht_layout_range_swap (dht_layout_t *layout, int i, int j); + +/* + * It's a bit icky using local variables in a macro, but it makes the rest + * of the code a lot clearer. + */ +#define OV_ENTRY(x,y) table[x*new->cnt+y] void dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, dht_layout_t *new, dht_layout_t *old) { int i = 0; - int k = 0; + int j = 0; uint32_t curr_overlap = 0; uint32_t max_overlap = 0; int max_overlap_idx = -1; uint32_t overlap = 0; - + uint32_t *table = NULL; dht_layout_sort_volname (old); /* Now both old_layout->list[] and new_layout->list[] @@ -585,31 +648,61 @@ dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc, to the same subvolumes */ + /* Build a table of overlaps between new[i] and old[j]. */ + table = alloca(sizeof(overlap)*old->cnt*new->cnt); + if (!table) { + return; + } + memset(table,0,sizeof(overlap)*old->cnt*new->cnt); + for (i = 0; i < new->cnt; ++i) { + for (j = 0; j < old->cnt; ++j) { + OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i); + } + } + for (i = 0; i < new->cnt; i++) { - if (new->list[i].err > 0) + if (new->list[i].err > 0) { /* Subvol might be marked for decommission with EINVAL, or some other serious error marked with positive errno. */ continue; + } - curr_overlap = dht_overlap_calc (old, i, new, i); - max_overlap = curr_overlap; - max_overlap_idx = i; - - /* Subvols up to `i' have already found their best - matches. Search only from the rest. - */ - for (k = (i + 1); k < new->cnt; k++) { - overlap = dht_overlap_calc (old, i, new, k); - if (overlap > max_overlap) { - max_overlap_idx = k; - max_overlap = overlap; - } - } - - if (max_overlap > curr_overlap) - dht_layout_entry_swap (new, i, max_overlap_idx); + max_overlap = 0; + max_overlap_idx = i; + for (j = (i + 1); j < new->cnt; ++j) { + if (new->list[j].err > 0) { + /* Subvol might be marked for decommission + with EINVAL, or some other serious error + marked with positive errno. + */ + continue; + } + /* Calculate the overlap now. */ + curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j); + /* Calculate the overlap after the proposed swap. */ + overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i); + /* Are we better than status quo? */ + if (overlap > curr_overlap) { + overlap -= curr_overlap; + /* Are we better than the previous choice? */ + if (overlap > max_overlap) { + max_overlap = overlap; + max_overlap_idx = j; + } + } + } + + if (max_overlap_idx != i) { + dht_layout_range_swap (new, i, max_overlap_idx); + /* Need to swap the table values too. */ + for (j = 0; j < old->cnt; ++j) { + overlap = OV_ENTRY(i,j); + OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j); + OV_ENTRY(max_overlap_idx,j) = overlap; + } + } } } @@ -704,9 +797,11 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout); + /* clear out the range, as we are re-computing here */ + DHT_RESET_LAYOUT_RANGE (layout); for (i = start_subvol; i < layout->cnt; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { @@ -719,7 +814,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc, for (i = 0; i < start_subvol; i++) { err = layout->list[i].err; - if (err == -1) { + if (err == -1 || err == ENOENT) { DHT_SET_LAYOUT_RANGE(layout, i, start, chunk, cnt, loc->path); if (--cnt == 0) { diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c new file mode 100644 index 000000000..70aac7710 --- /dev/null +++ b/xlators/cluster/dht/src/dht-shared.c @@ -0,0 +1,758 @@ +/* + Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com> + This file is part of GlusterFS. + + This file is licensed to you under your choice of the GNU Lesser + General Public License, version 3 or any later version (LGPLv3 or + later), or the GNU General Public License, version 2 (GPLv2), in all + cases as published by the Free Software Foundation. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "statedump.h" +#include "dht-common.h" + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ +struct volume_options options[]; + +void +dht_layout_dump (dht_layout_t *layout, const char *prefix) +{ + + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + if (!layout) + goto out; + if (!prefix) + goto out; + + gf_proc_dump_build_key(key, prefix, "cnt"); + gf_proc_dump_write(key, "%d", layout->cnt); + gf_proc_dump_build_key(key, prefix, "preset"); + gf_proc_dump_write(key, "%d", layout->preset); + gf_proc_dump_build_key(key, prefix, "gen"); + gf_proc_dump_write(key, "%d", layout->gen); + if (layout->type != IA_INVAL) { + gf_proc_dump_build_key(key, prefix, "inode type"); + gf_proc_dump_write(key, "%d", layout->type); + } + + if (!IA_ISDIR (layout->type)) + goto out; + + for (i = 0; i < layout->cnt; i++) { + gf_proc_dump_build_key(key, prefix,"list[%d].err", i); + gf_proc_dump_write(key, "%d", layout->list[i].err); + gf_proc_dump_build_key(key, prefix,"list[%d].start", i); + gf_proc_dump_write(key, "%u", layout->list[i].start); + gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); + gf_proc_dump_write(key, "%u", layout->list[i].stop); + if (layout->list[i].xlator) { + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.type", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->type); + gf_proc_dump_build_key(key, prefix, + "list[%d].xlator.name", i); + gf_proc_dump_write(key, "%s", + layout->list[i].xlator->name); + } + } + +out: + return; +} + + +int32_t +dht_priv_dump (xlator_t *this) +{ + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + dht_conf_t *conf = NULL; + int ret = -1; + + if (!this) + goto out; + + conf = this->private; + if (!conf) + goto out; + + ret = TRY_LOCK(&conf->subvolume_lock); + if (ret != 0) { + return ret; + } + + gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); + gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", + this->name); + gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt); + for (i = 0; i < conf->subvolume_cnt; i++) { + sprintf (key, "subvolumes[%d]", i); + gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, + conf->subvolumes[i]->name); + if (conf->file_layouts && conf->file_layouts[i]){ + sprintf (key, "file_layouts[%d]", i); + dht_layout_dump(conf->file_layouts[i], key); + } + if (conf->dir_layouts && conf->dir_layouts[i]) { + sprintf (key, "dir_layouts[%d]", i); + dht_layout_dump(conf->dir_layouts[i], key); + } + if (conf->subvolume_status) { + + sprintf (key, "subvolume_status[%d]", i); + gf_proc_dump_write(key, "%d", + (int)conf->subvolume_status[i]); + } + + } + + gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); + gf_proc_dump_write("gen", "%d", conf->gen); + gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); + gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); + gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); + gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); + gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); + if (conf ->du_stats) { + gf_proc_dump_write("du_stats.avail_percent", "%lf", + conf->du_stats->avail_percent); + gf_proc_dump_write("du_stats.avail_space", "%lu", + conf->du_stats->avail_space); + gf_proc_dump_write("du_stats.avail_inodes", "%lf", + conf->du_stats->avail_inodes); + gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log); + } + + if (conf->last_stat_fetch.tv_sec) + gf_proc_dump_write("last_stat_fetch", "%s", + ctime(&conf->last_stat_fetch.tv_sec)); + + UNLOCK(&conf->subvolume_lock); + +out: + return ret; +} + +int32_t +dht_inodectx_dump (xlator_t *this, inode_t *inode) +{ + int ret = -1; + dht_layout_t *layout = NULL; + + if (!this) + goto out; + if (!inode) + goto out; + + ret = dht_inode_ctx_layout_get (inode, this, &layout); + + if ((ret != 0) || !layout) + return ret; + + gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); + dht_layout_dump(layout, "layout"); + +out: + return ret; +} + +void +dht_fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + conf = this->private; + this->private = NULL; + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf); + } +out: + return; +} + +int32_t +mem_acct_init (xlator_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + + ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" + "failed"); + return ret; + } +out: + return ret; +} + + +int +dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, + const char *bricks) +{ + int i = 0; + int ret = -1; + char *tmpstr = NULL; + char *dup_brick = NULL; + char *node = NULL; + + if (!conf || !bricks) + goto out; + + dup_brick = gf_strdup (bricks); + node = strtok_r (dup_brick, ",", &tmpstr); + while (node) { + for (i = 0; i < conf->subvolume_cnt; i++) { + if (!strcmp (conf->subvolumes[i]->name, node)) { + conf->decommissioned_bricks[i] = + conf->subvolumes[i]; + conf->decommission_subvols_cnt++; + gf_log (this->name, GF_LOG_INFO, + "decommissioning subvolume %s", + conf->subvolumes[i]->name); + break; + } + } + if (i == conf->subvolume_cnt) { + /* Wrong node given. */ + goto out; + } + node = strtok_r (NULL, ",", &tmpstr); + } + + ret = 0; + conf->decommission_in_progress = 1; +out: + GF_FREE (dup_brick); + + return ret; +} + + +int +dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf) +{ + int i = 0; + int ret = -1; + + if (!conf) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->decommissioned_bricks[i]) { + conf->decommissioned_bricks[i] = NULL; + conf->decommission_subvols_cnt--; + } + } + + ret = 0; +out: + + return ret; +} +void +dht_init_regex (xlator_t *this, dict_t *odict, char *name, + regex_t *re, gf_boolean_t *re_valid) +{ + char *temp_str; + + if (dict_get_str (odict, name, &temp_str) != 0) { + if (strcmp(name,"rsync-hash-regex")) { + return; + } + temp_str = "^\\.(.+)\\.[^.]+$"; + } + + if (*re_valid) { + regfree(re); + *re_valid = _gf_false; + } + + if (!strcmp(temp_str,"none")) { + return; + } + + if (regcomp(re,temp_str,REG_EXTENDED) == 0) { + gf_log (this->name, GF_LOG_INFO, + "using regex %s = %s", name, temp_str); + *re_valid = _gf_true; + } + else { + gf_log (this->name, GF_LOG_WARNING, + "compiling regex %s failed", temp_str); + } +} + +int +dht_reconfigure (xlator_t *this, dict_t *options) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + gf_boolean_t search_unhashed; + int ret = -1; + + GF_VALIDATE_OR_GOTO ("dht", this, out); + GF_VALIDATE_OR_GOTO ("dht", options, out); + + conf = this->private; + if (!conf) + return 0; + + if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean*/ + if (strcasecmp (temp_str, "auto")) { + if (!gf_string2boolean (temp_str, &search_unhashed)) { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured (%s)", + temp_str); + conf->search_unhashed = search_unhashed; + } else { + gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" + " lookup-unhashed should be boolean," + " not (%s), defaulting to (%d)", + temp_str, conf->search_unhashed); + //return -1; + ret = -1; + goto out; + } + } else { + gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" + " lookup-unhashed reconfigured auto "); + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + } + + GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, + percent_or_size, out); + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100.0) + conf->disk_unit = 'p'; + + GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, + percent, out); + + GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, + options, uint32, out); + + GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options, + bool, out); + if (conf->defrag) { + GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats, + options, bool, out); + } + + if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto out; + } else { + ret = dht_decommissioned_remove (this, conf); + if (ret == -1) + goto out; + } + + dht_init_regex (this, options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = 0; +out: + return ret; +} + +static int +gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data) +{ + int ret = -1; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *num = NULL; + char *pattern_str = NULL; + char *pattern = NULL; + gf_defrag_pattern_list_t *temp_list = NULL; + gf_defrag_pattern_list_t *pattern_list = NULL; + + if (!this || !defrag || !data) + goto out; + + /* Get the pattern for pattern list. "pattern:<optional-size>" + * eg: *avi, *pdf:10MB, *:1TB + */ + pattern_str = strtok_r (data, ",", &tmp_str); + while (pattern_str) { + dup_str = gf_strdup (pattern_str); + pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t), + 1); + if (!pattern_list) { + goto out; + } + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (!pattern) + goto out; + if (!num) { + if (gf_string2bytesize(pattern, &pattern_list->size) + == 0) { + pattern = "*"; + } + } else if (gf_string2bytesize (num, &pattern_list->size) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", num); + goto out; + } + memcpy (pattern_list->path_pattern, pattern, strlen (dup_str)); + + if (!defrag->defrag_pattern) + temp_list = NULL; + else + temp_list = defrag->defrag_pattern; + + pattern_list->next = temp_list; + + defrag->defrag_pattern = pattern_list; + pattern_list = NULL; + + GF_FREE (dup_str); + dup_str = NULL; + + pattern_str = strtok_r (NULL, ",", &tmp_str); + } + + ret = 0; +out: + if (ret) + GF_FREE (pattern_list); + GF_FREE (dup_str); + + return ret; +} + +int +dht_init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *temp_str = NULL; + int ret = -1; + int i = 0; + gf_defrag_info_t *defrag = NULL; + int cmd = 0; + char *node_uuid = NULL; + + + GF_VALIDATE_OR_GOTO ("dht", this, err); + + if (!this->children) { + gf_log (this->name, GF_LOG_CRITICAL, + "Distribute needs more than one subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile"); + } + + conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); + if (!conf) { + goto err; + } + + ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd); + + if (cmd) { + defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t), + gf_defrag_info_mt); + + GF_VALIDATE_OR_GOTO (this->name, defrag, err); + + LOCK_INIT (&defrag->lock); + + defrag->is_exiting = 0; + + conf->defrag = defrag; + + ret = dict_get_str (this->options, "node-uuid", &node_uuid); + if (ret) { + gf_log (this->name, GF_LOG_ERROR, "node-uuid not " + "specified"); + goto err; + } + + if (uuid_parse (node_uuid, defrag->node_uuid)) { + gf_log (this->name, GF_LOG_ERROR, "Cannot parse " + "glusterd node uuid"); + goto err; + } + + defrag->cmd = cmd; + + defrag->stats = _gf_false; + } + + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; + if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { + /* If option is not "auto", other options _should_ be boolean */ + if (strcasecmp (temp_str, "auto")) + gf_string2boolean (temp_str, &conf->search_unhashed); + else + conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; + } + + GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, + err); + + GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); + + GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, + err); + + GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, + err); + + conf->dir_spread_cnt = conf->subvolume_cnt; + GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt, + uint32, err); + + GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down, + bool, err); + + GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err); + + if (defrag) { + GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err); + if (dict_get_str (this->options, "rebalance-filter", &temp_str) + == 0) { + if (gf_defrag_pattern_list_fill (this, defrag, temp_str) + == -1) { + gf_log (this->name, GF_LOG_ERROR, "Cannot parse" + " rebalance-filter (%s)", temp_str); + goto err; + } + } + } + + /* option can be any one of percent or bytes */ + conf->disk_unit = 0; + if (conf->min_free_disk < 100) + conf->disk_unit = 'p'; + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { + ret = dht_parse_decommissioned_bricks (this, conf, temp_str); + if (ret == -1) + goto err; + } + + dht_init_regex (this, this->options, "rsync-hash-regex", + &conf->rsync_regex, &conf->rsync_regex_valid); + dht_init_regex (this, this->options, "extra-hash-regex", + &conf->extra_regex, &conf->extra_regex_valid); + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + LOCK_INIT (&conf->layout_lock); + + conf->gen = 1; + + this->local_pool = mem_pool_new (dht_local_t, 512); + if (!this->local_pool) { + gf_log (this->name, GF_LOG_ERROR, + "failed to create local_t's memory pool"); + goto err; + } + + GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err); + gf_asprintf (&conf->link_xattr_name, "%s.linkto", conf->xattr_name); + gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name); + if (!conf->link_xattr_name || !conf->wild_xattr_name) { + goto err; + } + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + GF_FREE (conf->file_layouts[i]); + } + GF_FREE (conf->file_layouts); + } + + GF_FREE (conf->subvolumes); + + GF_FREE (conf->subvolume_status); + + GF_FREE (conf->du_stats); + + GF_FREE (conf->defrag); + + GF_FREE (conf->xattr_name); + GF_FREE (conf->link_xattr_name); + GF_FREE (conf->wild_xattr_name); + + GF_FREE (conf); + } + + return -1; +} + + +struct volume_options options[] = { + { .key = {"lookup-unhashed"}, + .value = {"auto", "yes", "no", "enable", "disable", "1", "0", + "on", "off"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "on", + .description = "This option if set to ON, does a lookup through " + "all the sub-volumes, in case a lookup didn't return any result " + "from the hash subvolume. If set to OFF, it does not do a lookup " + "on the remaining subvolumes." + }, + { .key = {"min-free-disk"}, + .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, + .default_value = "10%", + .description = "Percentage/Size of disk space, after which the " + "process starts balancing out the cluster, and logs will appear " + "in log files", + }, + { .key = {"min-free-inodes"}, + .type = GF_OPTION_TYPE_PERCENT, + .default_value = "5%", + .description = "after system has only N% of inodes, warnings " + "starts to appear in log files", + }, + { .key = {"unhashed-sticky-bit"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + }, + { .key = {"use-readdirp"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", + .description = "This option if set to ON, forces the use of " + "readdirp, and hence also displays the stats of the files." + }, + { .key = {"assert-no-child-down"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON, in the event of " + "CHILD_DOWN, will call exit." + }, + { .key = {"directory-layout-spread"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .validate = GF_OPT_VALIDATE_MIN, + .description = "Specifies the directory layout spread." + }, + { .key = {"decommissioned-bricks"}, + .type = GF_OPTION_TYPE_ANY, + .description = "This option if set to ON, decommissions " + "the brick, so that no new data is allowed to be created " + "on that brick." + }, + { .key = {"rebalance-cmd"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"node-uuid"}, + .type = GF_OPTION_TYPE_STR, + }, + { .key = {"rebalance-stats"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON displays and logs the " + " time taken for migration of each file, during the rebalance " + "process. If set to OFF, the rebalance logs will only display the " + "time spent in each directory." + }, + { .key = {"readdir-optimize"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "off", + .description = "This option if set to ON enables the optimization " + "that allows DHT to requests non-first subvolumes to filter out " + "directory entries." + }, + { .key = {"rsync-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by rsync, to prevent relocation when the " + "file is renamed." + }, + { .key = {"extra-hash-regex"}, + .type = GF_OPTION_TYPE_STR, + /* Setting a default here doesn't work. See dht_init_regex. */ + .description = "Regular expression for stripping temporary-file " + "suffix and prefix used by an application, to prevent relocation when " + "the file is renamed." + }, + { .key = {"rebalance-filter"}, + .type = GF_OPTION_TYPE_STR, + }, + + { .key = {"xattr-name"}, + .type = GF_OPTION_TYPE_STR, + .default_value = "trusted.glusterfs.dht", + .description = "Base for extended attributes used by this " + "translator instance, to avoid conflicts with others above or " + "below it." + }, + + /* NUFA option */ + { .key = {"local-volume-name"}, + .type = GF_OPTION_TYPE_XLATOR + }, + + /* switch option */ + { .key = {"pattern.switch.case"}, + .type = GF_OPTION_TYPE_ANY + }, + + { .key = {NULL} }, +}; diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c index d70d713ac..fc0ca2f77 100644 --- a/xlators/cluster/dht/src/dht.c +++ b/xlators/cluster/dht/src/dht.c @@ -14,502 +14,15 @@ #include "config.h" #endif -/* TODO: add NS locking */ - #include "statedump.h" #include "dht-common.h" -/* TODO: - - use volumename in xattr instead of "dht" - - use NS locks - - handle all cases in self heal layout reconstruction - - complete linkfile selfheal -*/ -struct volume_options options[]; - -void -dht_layout_dump (dht_layout_t *layout, const char *prefix) -{ - - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - - if (!layout) - goto out; - if (!prefix) - goto out; - - gf_proc_dump_build_key(key, prefix, "cnt"); - gf_proc_dump_write(key, "%d", layout->cnt); - gf_proc_dump_build_key(key, prefix, "preset"); - gf_proc_dump_write(key, "%d", layout->preset); - gf_proc_dump_build_key(key, prefix, "gen"); - gf_proc_dump_write(key, "%d", layout->gen); - if (layout->type != IA_INVAL) { - gf_proc_dump_build_key(key, prefix, "inode type"); - gf_proc_dump_write(key, "%d", layout->type); - } - - if (!IA_ISDIR (layout->type)) - goto out; - - for (i = 0; i < layout->cnt; i++) { - gf_proc_dump_build_key(key, prefix,"list[%d].err", i); - gf_proc_dump_write(key, "%d", layout->list[i].err); - gf_proc_dump_build_key(key, prefix,"list[%d].start", i); - gf_proc_dump_write(key, "%u", layout->list[i].start); - gf_proc_dump_build_key(key, prefix,"list[%d].stop", i); - gf_proc_dump_write(key, "%u", layout->list[i].stop); - if (layout->list[i].xlator) { - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.type", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->type); - gf_proc_dump_build_key(key, prefix, - "list[%d].xlator.name", i); - gf_proc_dump_write(key, "%s", - layout->list[i].xlator->name); - } - } - -out: - return; -} - - -int32_t -dht_priv_dump (xlator_t *this) -{ - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - dht_conf_t *conf = NULL; - int ret = -1; - - if (!this) - goto out; - - conf = this->private; - if (!conf) - goto out; - - ret = TRY_LOCK(&conf->subvolume_lock); - if (ret != 0) { - return ret; - } - - gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name); - gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv", - this->name); - gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt); - for (i = 0; i < conf->subvolume_cnt; i++) { - sprintf (key, "subvolumes[%d]", i); - gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type, - conf->subvolumes[i]->name); - if (conf->file_layouts && conf->file_layouts[i]){ - sprintf (key, "file_layouts[%d]", i); - dht_layout_dump(conf->file_layouts[i], key); - } - if (conf->dir_layouts && conf->dir_layouts[i]) { - sprintf (key, "dir_layouts[%d]", i); - dht_layout_dump(conf->dir_layouts[i], key); - } - if (conf->subvolume_status) { - - sprintf (key, "subvolume_status[%d]", i); - gf_proc_dump_write(key, "%d", - (int)conf->subvolume_status[i]); - } - - } - - gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed); - gf_proc_dump_write("gen", "%d", conf->gen); - gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk); - gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes); - gf_proc_dump_write("disk_unit", "%c", conf->disk_unit); - gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval); - gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit); - if (conf ->du_stats) { - gf_proc_dump_write("du_stats.avail_percent", "%lf", - conf->du_stats->avail_percent); - gf_proc_dump_write("du_stats.avail_space", "%lu", - conf->du_stats->avail_space); - gf_proc_dump_write("du_stats.avail_inodes", "%lf", - conf->du_stats->avail_inodes); - gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log); - } - - if (conf->last_stat_fetch.tv_sec) - gf_proc_dump_write("last_stat_fetch", "%s", - ctime(&conf->last_stat_fetch.tv_sec)); - - UNLOCK(&conf->subvolume_lock); - -out: - return ret; -} - -int32_t -dht_inodectx_dump (xlator_t *this, inode_t *inode) -{ - int ret = -1; - dht_layout_t *layout = NULL; - - if (!this) - goto out; - if (!inode) - goto out; - - ret = dht_inode_ctx_layout_get (inode, this, &layout); - - if ((ret != 0) || !layout) - return ret; - - gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name); - dht_layout_dump(layout, "layout"); - -out: - return ret; -} - -int -notify (xlator_t *this, int event, void *data, ...) -{ - int ret = -1; - va_list ap; - dict_t *output = NULL; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - - if (!data) - goto out; - - va_start (ap, data); - output = va_arg (ap, dict_t*); - - ret = dht_notify (this, event, data, output); - -out: - return ret; -} - -void -fini (xlator_t *this) -{ - int i = 0; - dht_conf_t *conf = NULL; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - conf = this->private; - this->private = NULL; - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - GF_FREE (conf->subvolumes); - - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); - } -out: - return; -} - -int32_t -mem_acct_init (xlator_t *this) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - - ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1); - - if (ret != 0) { - gf_log (this->name, GF_LOG_ERROR, "Memory accounting init" - "failed"); - return ret; - } -out: - return ret; -} - - -int -dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf, - const char *bricks) -{ - int i = 0; - int ret = -1; - char *tmpstr = NULL; - char *dup_brick = NULL; - char *node = NULL; - - if (!conf || !bricks) - goto out; - - dup_brick = gf_strdup (bricks); - node = strtok_r (dup_brick, ",", &tmpstr); - while (node) { - for (i = 0; i < conf->subvolume_cnt; i++) { - if (!strcmp (conf->subvolumes[i]->name, node)) { - conf->decommissioned_bricks[i] = - conf->subvolumes[i]; - conf->decommission_subvols_cnt++; - gf_log (this->name, GF_LOG_INFO, - "decommissioning subvolume %s", - conf->subvolumes[i]->name); - break; - } - } - if (i == conf->subvolume_cnt) { - /* Wrong node given. */ - goto out; - } - node = strtok_r (NULL, ",", &tmpstr); - } - - ret = 0; - conf->decommission_in_progress = 1; -out: - GF_FREE (dup_brick); - - return ret; -} - -int -reconfigure (xlator_t *this, dict_t *options) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - gf_boolean_t search_unhashed; - int ret = -1; - - GF_VALIDATE_OR_GOTO ("dht", this, out); - GF_VALIDATE_OR_GOTO ("dht", options, out); - - conf = this->private; - if (!conf) - return 0; - - if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean*/ - if (strcasecmp (temp_str, "auto")) { - if (!gf_string2boolean (temp_str, &search_unhashed)) { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unhashed reconfigured (%s)", - temp_str); - conf->search_unhashed = search_unhashed; - } else { - gf_log(this->name, GF_LOG_ERROR, "Reconfigure:" - " lookup-unhashed should be boolean," - " not (%s), defaulting to (%d)", - temp_str, conf->search_unhashed); - //return -1; - ret = -1; - goto out; - } - } else { - gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:" - " lookup-unhashed reconfigured auto "); - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - } - - GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options, - percent_or_size, out); - /* option can be any one of percent or bytes */ - conf->disk_unit = 0; - if (conf->min_free_disk < 100.0) - conf->disk_unit = 'p'; - - GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options, - percent, out); - - GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt, - options, uint32, out); - - GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options, - bool, out); - if (conf->defrag) { - GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats, - options, bool, out); - } - - if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) { - ret = dht_parse_decommissioned_bricks (this, conf, temp_str); - if (ret == -1) - goto out; - } - - ret = 0; -out: - return ret; -} - - -int -init (xlator_t *this) -{ - dht_conf_t *conf = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - gf_defrag_info_t *defrag = NULL; - int cmd = 0; - char *node_uuid = NULL; - - - GF_VALIDATE_OR_GOTO ("dht", this, err); - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "Distribute needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } - - ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd); - - if (cmd) { - defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t), - gf_defrag_info_mt); - - GF_VALIDATE_OR_GOTO (this->name, defrag, err); - - LOCK_INIT (&defrag->lock); - - defrag->is_exiting = 0; - - conf->defrag = defrag; - - ret = dict_get_str (this->options, "node-uuid", &node_uuid); - if (ret) { - gf_log (this->name, GF_LOG_ERROR, "node-uuid not " - "specified"); - goto err; - } - - if (uuid_parse (node_uuid, defrag->node_uuid)) { - gf_log (this->name, GF_LOG_ERROR, "Cannot parse " - "glusterd node uuid"); - goto err; - } - - defrag->cmd = cmd; - - defrag->stats = _gf_false; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool, - err); - - GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err); - - GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size, - err); - - GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent, - err); - - conf->dir_spread_cnt = conf->subvolume_cnt; - GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt, - uint32, err); - - GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down, - bool, err); - - GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err); - - if (defrag) { - GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err); - } - - /* option can be any one of percent or bytes */ - conf->disk_unit = 0; - if (conf->min_free_disk < 100) - conf->disk_unit = 'p'; - - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) { - ret = dht_parse_decommissioned_bricks (this, conf, temp_str); - if (ret == -1) - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - this->local_pool = mem_pool_new (dht_local_t, 512); - if (!this->local_pool) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto err; - } - - this->private = conf; - - return 0; - -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - GF_FREE (conf->subvolumes); - - GF_FREE (conf->subvolume_status); - - GF_FREE (conf->du_stats); - - GF_FREE (conf->defrag); - - GF_FREE (conf); - } - - return -1; -} - +class_methods_t class_methods = { + .init = dht_init, + .fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; struct xlator_fops fops = { .lookup = dht_lookup, @@ -557,6 +70,9 @@ struct xlator_fops fops = { .fxattrop = dht_fxattrop, .setattr = dht_setattr, .fsetattr = dht_fsetattr, + .fallocate = dht_fallocate, + .discard = dht_discard, + .zerofill = dht_zerofill, }; struct xlator_dumpops dumpops = { @@ -570,59 +86,4 @@ struct xlator_cbks cbks = { // .releasedir = dht_releasedir, .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR, - .default_value = "on", - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - .default_value = "10%", - .description = "Percentage/Size of disk space that must be " - "kept free." - }, - { .key = {"min-free-inodes"}, - .type = GF_OPTION_TYPE_PERCENT, - .default_value = "5%", - .description = "Percentage inodes that must be " - "kept free." - }, - { .key = {"unhashed-sticky-bit"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - { .key = {"use-readdirp"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "on", - }, - { .key = {"assert-no-child-down"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - { .key = {"directory-layout-spread"}, - .type = GF_OPTION_TYPE_INT, - }, - { .key = {"decommissioned-bricks"}, - .type = GF_OPTION_TYPE_ANY, - }, - { .key = {"rebalance-cmd"}, - .type = GF_OPTION_TYPE_INT, - }, - { .key = {"node-uuid"}, - .type = GF_OPTION_TYPE_STR, - }, - { .key = {"rebalance-stats"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - { .key = {"readdir-optimize"}, - .type = GF_OPTION_TYPE_BOOL, - .default_value = "off", - }, - - { .key = {NULL} }, -}; +; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c index 701d7ae8d..e934acdf0 100644 --- a/xlators/cluster/dht/src/nufa.c +++ b/xlators/cluster/dht/src/nufa.c @@ -18,6 +18,8 @@ /* TODO: all 'TODO's in dht.c holds good */ +extern struct volume_options options[]; + int nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, int op_errno, @@ -52,7 +54,8 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1) goto out; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir && !is_linkfile) { @@ -201,7 +204,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, * revalidates directly go to the cached-subvolume. */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -222,7 +225,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } else { do_fresh_lookup: ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -231,7 +234,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this, } ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + conf->link_xattr_name, 256); if (ret < 0) { gf_log (this->name, GF_LOG_ERROR, "Failed to set dict value."); @@ -320,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (subvol != avail_subvol) { @@ -330,9 +334,8 @@ nufa_create (call_frame_t *frame, xlator_t *this, local->flags = flags; local->umask = umask; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - nufa_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, nufa_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); return 0; } @@ -362,17 +365,22 @@ nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, local->cached_subvol->fops->mknod, &local->loc, local->mode, local->rdev, local->umask, local->params); return 0; } - +err: WIPE (postparent); WIPE (preparent); @@ -420,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) { avail_subvol = dht_free_disk_available_subvol (this, - (xlator_t *)conf->private); + (xlator_t *)conf->private, + local); } if (avail_subvol != subvol) { @@ -432,7 +441,7 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, local->rdev = rdev; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, + dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, this, avail_subvol, subvol, loc); return 0; } @@ -440,9 +449,9 @@ nufa_mknod (call_frame_t *frame, xlator_t *this, gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, umask, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); return 0; @@ -455,42 +464,6 @@ err: } -int -notify (xlator_t *this, int event, void *data, ...) -{ - int ret = -1; - - ret = dht_notify (this, event, data); - - return ret; -} - -void -fini (xlator_t *this) -{ - int i = 0; - dht_conf_t *conf = NULL; - - conf = this->private; - - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - GF_FREE (conf->subvolumes); - - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); - } - - return; -} - gf_boolean_t same_first_part (char *str1, char term1, char *str2, char term2) { @@ -511,194 +484,152 @@ same_first_part (char *str1, char term1, char *str2, char term2) } } -int -init (xlator_t *this) -{ - dht_conf_t *conf = NULL; - xlator_list_t *trav = NULL; - data_t *data = NULL; - char *local_volname = NULL; - char *temp_str = NULL; - int ret = -1; - int i = 0; - char my_hostname[256]; - double temp_free_disk = 0; - uint64_t size = 0; - xlator_t *local_subvol = NULL; - char *brick_host = NULL; - xlator_t *kid = NULL; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "NUFA needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } +typedef struct nufa_args { + xlator_t *this; + char *volname; + gf_boolean_t addr_match; +} nufa_args_t; - conf = GF_CALLOC (1, sizeof (*conf), - gf_dht_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; +static void +nufa_find_local_brick (xlator_t *xl, void *data) +{ + nufa_args_t *args = data; + xlator_t *this = args->this; + char *local_volname = args->volname; + gf_boolean_t addr_match = args->addr_match; + char *brick_host = NULL; + dht_conf_t *conf = this->private; + int ret = -1; + + /*This means a local subvol was already found. We pick the first brick + * that is local*/ + if (conf->private) + return; + + if (strcmp (xl->name, local_volname) == 0) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s", + local_volname); + return; + } + + if (!addr_match) + return; + + ret = dict_get_str (xl->options, "remote-host", &brick_host); + if ((ret == 0) && + (gf_is_same_address (local_volname, brick_host) || + gf_is_local_addr (brick_host))) { + conf->private = xl; + gf_log (this->name, GF_LOG_INFO, "Using the first local " + "subvol %s", xl->name); + return; } - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } +} - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); +static void +nufa_to_dht (xlator_t *this) +{ + GF_ASSERT (this); + GF_ASSERT (this->fops); - conf->gen = 1; + this->fops->lookup = dht_lookup; + this->fops->create = dht_create; + this->fops->mknod = dht_mknod; +} - local_volname = "localhost"; - ret = gethostname (my_hostname, 256); - if (ret < 0) { - gf_log (this->name, GF_LOG_WARNING, - "could not find hostname (%s)", - strerror (errno)); +int +nufa_find_local_subvol (xlator_t *this, + void (*fn) (xlator_t *each, void* data), void *data) +{ + int ret = -1; + dht_conf_t *conf = this->private; + xlator_list_t *trav = NULL; + xlator_t *parent = NULL; + xlator_t *candidate = NULL; + + xlator_foreach_depth_first (this, fn, data); + if (!conf->private) { + gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local " + "brick"); + return -1; } - if (ret == 0) - local_volname = my_hostname; - - data = dict_get (this->options, "local-volume-name"); - if (data) { - local_volname = data->data; - } + candidate = conf->private; + trav = candidate->parents; + while (trav) { - for (trav = this->children; trav; trav = trav->next) { - if (strcmp (trav->xlator->name, local_volname) == 0) + parent = trav->xlator; + if (strcmp (parent->type, "cluster/nufa") == 0) { + gf_log (this->name, GF_LOG_INFO, "Found local subvol, " + "%s", candidate->name); + ret = 0; + conf->private = candidate; break; - if (local_subvol) { - continue; - } - kid = trav->xlator; - for (;;) { - if (dict_get_str(trav->xlator->options,"remote-host", - &brick_host) == 0) { - /* Found it. */ - break; - } - if (!kid->children) { - /* Nowhere further to look. */ - gf_log (this->name, GF_LOG_ERROR, - "could not get remote-host"); - goto err; - } - if (kid->children->next) { - /* Multiple choices, can't/shouldn't decide. */ - gf_log (this->name, GF_LOG_ERROR, - "NUFA found fan-out (type %s) volume", - kid->type); - goto err; - } - /* One-to-one xlators are OK, try the next one. */ - kid = kid->children->xlator; } - if (same_first_part(my_hostname,'.',brick_host,'.')) { - local_subvol = trav->xlator; - } - } - if (trav) { - gf_log (this->name, GF_LOG_INFO, - "Using specified subvol %s", local_volname); - conf->private = trav->xlator; - } - else if (local_subvol) { - gf_log (this->name, GF_LOG_INFO, - "Using first local subvol %s", local_subvol->name); - conf->private = local_subvol; - } - else { - gf_log (this->name, GF_LOG_ERROR, - "Could not find specified or local subvol"); - goto err; - + candidate = parent; + trav = parent->parents; } - /* The volume specified exists */ - - conf->min_free_disk = 10; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, - &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, &size); - conf->min_free_disk = size; - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, &size); - conf->min_free_disk = size; - conf->disk_unit = 'b'; - } - } + return ret; +} - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_dht_mt_dht_du_t); - if (!conf->du_stats) { - goto err; - } +int +nufa_init (xlator_t *this) +{ + data_t *data = NULL; + char *local_volname = NULL; + int ret = -1; + char my_hostname[256]; + gf_boolean_t addr_match = _gf_false; + nufa_args_t args = {0, }; - this->local_pool = mem_pool_new (dht_local_t, 128); - if (!this->local_pool) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto err; + ret = dht_init(this); + if (ret) { + return ret; } - this->private = conf; - - return 0; - -err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - GF_FREE (conf->subvolumes); + if ((data = dict_get (this->options, "local-volume-name"))) { + local_volname = data->data; - GF_FREE (conf->subvolume_status); + } else { + addr_match = _gf_true; + local_volname = "localhost"; + ret = gethostname (my_hostname, 256); + if (ret == 0) + local_volname = my_hostname; - GF_FREE (conf->du_stats); + else + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", + strerror (errno)); - GF_FREE (conf); } - return -1; + args.this = this; + args.volname = local_volname; + args.addr_match = addr_match; + ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args); + if (ret) { + gf_log (this->name, GF_LOG_INFO, + "Unable to find local subvolume, switching " + "to dht mode"); + nufa_to_dht (this); + } + return 0; } +class_methods_t class_methods = { + .init = nufa_init, + .fini = dht_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; + + struct xlator_fops fops = { .lookup = nufa_lookup, .create = nufa_create, @@ -743,19 +674,3 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"local-volume-name"}, - .type = GF_OPTION_TYPE_XLATOR - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, -}; diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c index 2b4f97387..d3ea90ba8 100644 --- a/xlators/cluster/dht/src/switch.c +++ b/xlators/cluster/dht/src/switch.c @@ -22,6 +22,8 @@ #include <fnmatch.h> #include <string.h> +extern struct volume_options options[]; + struct switch_sched_array { xlator_t *xl; int32_t eligible; @@ -135,7 +137,8 @@ switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, if (op_ret == -1) goto out; - is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr, + conf->link_xattr_name); is_dir = check_is_dir (inode, stbuf, xattr); if (!is_dir && !is_linkfile) { @@ -289,11 +292,11 @@ switch_lookup (call_frame_t *frame, xlator_t *this, * attribute, revalidates directly go to the cached-subvolume. */ ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); + "failed to set dict value for %s", + conf->xattr_name); for (i = 0; i < layout->cnt; i++) { subvol = layout->list[i].xlator; @@ -308,18 +311,18 @@ switch_lookup (call_frame_t *frame, xlator_t *this, } else { do_fresh_lookup: ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht", 4 * 4); + conf->xattr_name, 4 * 4); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht"); + "failed to set dict value for %s", + conf->xattr_name); ret = dict_set_uint32 (local->xattr_req, - "trusted.glusterfs.dht.linkto", 256); + conf->link_xattr_name, 256); if (ret < 0) gf_log (this->name, GF_LOG_WARNING, - "failed to set dict value for " - "trusted.glusterfs.dht.linkto"); + "failed to set dict value for %s", + conf->link_xattr_name); if (!hashed_subvol) { gf_log (this->name, GF_LOG_DEBUG, @@ -434,7 +437,8 @@ switch_create (call_frame_t *frame, xlator_t *this, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (subvol != avail_subvol) { @@ -443,9 +447,8 @@ switch_create (call_frame_t *frame, xlator_t *this, local->flags = flags; local->umask = umask; local->cached_subvol = avail_subvol; - dht_linkfile_create (frame, - switch_create_linkfile_create_cbk, - avail_subvol, subvol, loc); + dht_linkfile_create (frame, switch_create_linkfile_create_cbk, + this, avail_subvol, subvol, loc); return 0; } @@ -475,17 +478,22 @@ switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, dht_local_t *local = NULL; local = frame->local; + if (!local || !local->cached_subvol) { + op_errno = EINVAL; + op_ret = -1; + goto err; + } if (op_ret >= 0) { - STACK_WIND (frame, dht_newfile_cbk, - local->cached_subvol, + STACK_WIND_COOKIE (frame, dht_newfile_cbk, + (void *)local->cached_subvol, local->cached_subvol, local->cached_subvol->fops->mknod, &local->loc, local->mode, local->rdev, local->umask, local->params); return 0; } - +err: DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent, postparent, xdata); return 0; @@ -529,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol); if (dht_is_subvol_filled (this, avail_subvol)) { avail_subvol = - dht_free_disk_available_subvol (this, avail_subvol); + dht_free_disk_available_subvol (this, avail_subvol, + local); } if (avail_subvol != subvol) { @@ -542,16 +551,16 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, local->cached_subvol = avail_subvol; dht_linkfile_create (frame, switch_mknod_linkfile_cbk, - avail_subvol, subvol, loc); + this, avail_subvol, subvol, loc); return 0; } gf_log (this->name, GF_LOG_TRACE, "creating %s on %s", loc->path, subvol->name); - STACK_WIND (frame, dht_newfile_cbk, - subvol, subvol->fops->mknod, - loc, mode, rdev, umask, params); + STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol, + subvol->fops->mknod, loc, mode, rdev, umask, + params); return 0; @@ -564,20 +573,9 @@ err: } -int -notify (xlator_t *this, int event, void *data, ...) -{ - int ret = -1; - - ret = dht_notify (this, event, data); - - return ret; -} - void -fini (xlator_t *this) +switch_fini (xlator_t *this) { - int i = 0; dht_conf_t *conf = NULL; struct switch_struct *trav = NULL; struct switch_struct *prev = NULL; @@ -593,22 +591,9 @@ fini (xlator_t *this) trav = trav->next; GF_FREE (prev); } - - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - GF_FREE (conf->subvolumes); - - GF_FREE (conf->subvolume_status); - - GF_FREE (conf); } - return; + dht_fini(this); } int @@ -834,68 +819,18 @@ err: } -int -init (xlator_t *this) +int32_t +switch_init (xlator_t *this) { dht_conf_t *conf = NULL; data_t *data = NULL; - char *temp_str = NULL; int ret = -1; - int i = 0; - double temp_free_disk = 0; - uint64_t size = 0; - - if (!this->children) { - gf_log (this->name, GF_LOG_CRITICAL, - "SWITCH needs more than one subvolume"); - return -1; - } - - if (!this->parents) { - gf_log (this->name, GF_LOG_WARNING, - "dangling volume. check volfile"); - } - - conf = GF_CALLOC (1, sizeof (*conf), gf_switch_mt_dht_conf_t); - if (!conf) { - goto err; - } - - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON; - if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) { - /* If option is not "auto", other options _should_ be boolean */ - if (strcasecmp (temp_str, "auto")) - gf_string2boolean (temp_str, &conf->search_unhashed); - else - conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO; - } - - conf->unhashed_sticky_bit = 0; - if (dict_get_str (this->options, "unhashed-sticky-bit", - &temp_str) == 0) { - gf_string2boolean (temp_str, &conf->unhashed_sticky_bit); - } - conf->min_free_disk = 10.0; - conf->disk_unit = 'p'; - - if (dict_get_str (this->options, "min-free-disk", - &temp_str) == 0) { - if (gf_string2percent (temp_str, &temp_free_disk) == 0) { - if (temp_free_disk > 100) { - gf_string2bytesize (temp_str, &size); - conf->min_free_disk = size; - conf->disk_unit = 'b'; - } else { - conf->min_free_disk = temp_free_disk; - conf->disk_unit = 'p'; - } - } else { - gf_string2bytesize (temp_str, &size); - conf->min_free_disk = size; - conf->disk_unit = 'b'; - } + ret = dht_init(this); + if (ret) { + return ret; } + conf = this->private; data = dict_get (this->options, "pattern.switch.case"); if (data) { @@ -906,60 +841,23 @@ init (xlator_t *this) } } - ret = dht_init_subvolumes (this, conf); - if (ret == -1) { - goto err; - } - - ret = dht_layouts_init (this, conf); - if (ret == -1) { - goto err; - } - - LOCK_INIT (&conf->subvolume_lock); - LOCK_INIT (&conf->layout_lock); - - conf->gen = 1; - - conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t), - gf_switch_mt_dht_du_t); - if (!conf->du_stats) { - goto err; - } - - this->local_pool = mem_pool_new (dht_local_t, 128); - if (!this->local_pool) { - gf_log (this->name, GF_LOG_ERROR, - "failed to create local_t's memory pool"); - goto err; - } - this->private = conf; - return 0; err: - if (conf) { - if (conf->file_layouts) { - for (i = 0; i < conf->subvolume_cnt; i++) { - GF_FREE (conf->file_layouts[i]); - } - GF_FREE (conf->file_layouts); - } - - GF_FREE (conf->subvolumes); - - GF_FREE (conf->subvolume_status); - - GF_FREE (conf->du_stats); - - GF_FREE (conf); - } - + dht_fini(this); return -1; } +class_methods_t class_methods = { + .init = switch_init, + .fini = switch_fini, + .reconfigure = dht_reconfigure, + .notify = dht_notify +}; + + struct xlator_fops fops = { .lookup = switch_lookup, .create = switch_create, @@ -1004,19 +902,3 @@ struct xlator_fops fops = { struct xlator_cbks cbks = { .forget = dht_forget }; - - -struct volume_options options[] = { - { .key = {"lookup-unhashed"}, - .value = {"auto", "yes", "no", "enable", "disable", "1", "0", - "on", "off"}, - .type = GF_OPTION_TYPE_STR - }, - { .key = {"pattern.switch.case"}, - .type = GF_OPTION_TYPE_ANY - }, - { .key = {"min-free-disk"}, - .type = GF_OPTION_TYPE_PERCENT_OR_SIZET, - }, - { .key = {NULL} }, -}; diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am index c5da56ff1..5c1364b7f 100644 --- a/xlators/cluster/ha/src/Makefile.am +++ b/xlators/cluster/ha/src/Makefile.am @@ -1,7 +1,7 @@ xlator_LTLIBRARIES = ha.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster -ha_la_LDFLAGS = -module -avoidversion +ha_la_LDFLAGS = -module -avoid-version ha_la_SOURCES = ha-helpers.c ha.c ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am index a51c13573..a278b05e2 100644 --- a/xlators/cluster/map/src/Makefile.am +++ b/xlators/cluster/map/src/Makefile.am @@ -1,7 +1,7 @@ xlator_LTLIBRARIES = map.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster -map_la_LDFLAGS = -module -avoidversion +map_la_LDFLAGS = -module -avoid-version map_la_SOURCES = map.c map-helper.c map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am index 42846df83..2d151422a 100644 --- a/xlators/cluster/stripe/src/Makefile.am +++ b/xlators/cluster/stripe/src/Makefile.am @@ -2,7 +2,7 @@ xlator_LTLIBRARIES = stripe.la xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster -stripe_la_LDFLAGS = -module -avoidversion +stripe_la_LDFLAGS = -module -avoid-version stripe_la_SOURCES = stripe.c stripe-helpers.c \ $(top_builddir)/xlators/lib/src/libxlator.c diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h index e05ba0c29..e9ac9cf46 100644 --- a/xlators/cluster/stripe/src/stripe-mem-types.h +++ b/xlators/cluster/stripe/src/stripe-mem-types.h @@ -20,6 +20,7 @@ enum gf_stripe_mem_types_ { gf_stripe_mt_stripe_fd_ctx_t, gf_stripe_mt_char, gf_stripe_mt_int8_t, + gf_stripe_mt_int32_t, gf_stripe_mt_xlator_t, gf_stripe_mt_stripe_private_t, gf_stripe_mt_stripe_options, diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c index 43e3c3ed2..69b510e23 100644 --- a/xlators/cluster/stripe/src/stripe.c +++ b/xlators/cluster/stripe/src/stripe.c @@ -3774,6 +3774,524 @@ err: int32_t +stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (fallocate, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + /* send fallocate request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single fallocate per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->fallocate, fd, mode, + dest_offset, fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +int32_t +stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (discard, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + /* send discard request to the associated child node */ + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, fctx->stripe_count); + + /* + * TODO: Create a separate handler for coalesce mode that sends a + * single discard per-child (since the ranges are linear). + */ + STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->discard, fd, dest_offset, + fill_size, xdata); + + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t +stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + int32_t callcnt = 0; + stripe_local_t *local = NULL; + stripe_local_t *mlocal = NULL; + call_frame_t *prev = NULL; + call_frame_t *mframe = NULL; + + if (!this || !frame || !frame->local || !cookie) { + gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref"); + goto out; + } + + prev = cookie; + local = frame->local; + mframe = local->orig_frame; + mlocal = mframe->local; + + LOCK(&frame->lock); + { + callcnt = ++mlocal->call_count; + + if (op_ret == 0) { + mlocal->post_buf = *postbuf; + mlocal->pre_buf = *prebuf; + + mlocal->prebuf_blocks += prebuf->ia_blocks; + mlocal->postbuf_blocks += postbuf->ia_blocks; + + correct_file_size(prebuf, mlocal->fctx, prev); + correct_file_size(postbuf, mlocal->fctx, prev); + + if (mlocal->prebuf_size < prebuf->ia_size) + mlocal->prebuf_size = prebuf->ia_size; + if (mlocal->postbuf_size < postbuf->ia_size) + mlocal->postbuf_size = postbuf->ia_size; + } + + /* return the first failure */ + if (mlocal->op_ret == 0) { + mlocal->op_ret = op_ret; + mlocal->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == mlocal->wind_count) && mlocal->unwind) { + mlocal->pre_buf.ia_size = mlocal->prebuf_size; + mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks; + mlocal->post_buf.ia_size = mlocal->postbuf_size; + mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks; + + STRIPE_STACK_UNWIND (zerofill, mframe, mlocal->op_ret, + mlocal->op_errno, &mlocal->pre_buf, + &mlocal->post_buf, NULL); + } +out: + STRIPE_STACK_DESTROY(frame); + return 0; +} + +int32_t +stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) +{ + stripe_local_t *local = NULL; + stripe_fd_ctx_t *fctx = NULL; + int32_t op_errno = 1; + int32_t idx = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + off_t fill_size = 0; + uint64_t stripe_size = 0; + uint64_t tmp_fctx = 0; + off_t dest_offset = 0; + call_frame_t *fframe = NULL; + stripe_local_t *flocal = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + inode_ctx_get (fd->inode, this, &tmp_fctx); + if (!tmp_fctx) { + op_errno = EINVAL; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + fctx = (stripe_fd_ctx_t *)(long)tmp_fctx; + stripe_size = fctx->stripe_size; + + STRIPE_VALIDATE_FCTX (fctx, err); + + remaining_size = len; + + local = mem_get0 (this->local_pool); + if (!local) { + op_errno = ENOMEM; + goto err; + } + frame->local = local; + local->stripe_size = stripe_size; + local->fctx = fctx; + + if (!stripe_size) { + gf_log (this->name, GF_LOG_DEBUG, + "Wrong stripe size for the file"); + op_errno = EINVAL; + goto err; + } + + while (1) { + fframe = copy_frame(frame); + flocal = mem_get0(this->local_pool); + if (!flocal) { + op_errno = ENOMEM; + goto err; + } + flocal->orig_frame = frame; + fframe->local = flocal; + + idx = (((offset + offset_offset) / + local->stripe_size) % fctx->stripe_count); + + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + dest_offset = offset + offset_offset; + if (fctx->stripe_coalesce) + dest_offset = coalesced_offset(dest_offset, + local->stripe_size, + fctx->stripe_count); + + STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx], + fctx->xl_array[idx]->fops->zerofill, fd, + dest_offset, fill_size, xdata); + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +err: + if (fframe) + STRIPE_STACK_DESTROY(fframe); + + STRIPE_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int32_t stripe_release (xlator_t *this, fd_t *fd) { return 0; @@ -3809,6 +4327,7 @@ notify (xlator_t *this, int32_t event, void *data, ...) stripe_private_t *priv = NULL; int down_client = 0; int i = 0; + gf_boolean_t heard_from_all_children = _gf_false; if (!this) return 0; @@ -3820,30 +4339,34 @@ notify (xlator_t *this, int32_t event, void *data, ...) switch (event) { case GF_EVENT_CHILD_UP: - case GF_EVENT_CHILD_CONNECTING: { /* get an index number to set */ for (i = 0; i < priv->child_count; i++) { if (data == priv->xl_array[i]) break; } - priv->state[i] = 1; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; + + if (priv->child_count == i) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_UP bad subvolume %s", + data? ((xlator_t *)data)->name: NULL); + break; } LOCK (&priv->lock); { - priv->nodes_down = down_client; if (data == FIRST_CHILD (this)) priv->first_child_down = 0; - if (!priv->nodes_down) - default_notify (this, event, data); + priv->last_event[i] = event; } UNLOCK (&priv->lock); } break; + case GF_EVENT_CHILD_CONNECTING: + { + // 'CONNECTING' doesn't ensure its CHILD_UP, so do nothing + goto out; + } case GF_EVENT_CHILD_DOWN: { /* get an index number to set */ @@ -3851,20 +4374,19 @@ notify (xlator_t *this, int32_t event, void *data, ...) if (data == priv->xl_array[i]) break; } - priv->state[i] = 0; - for (i = 0; i < priv->child_count; i++) { - if (!priv->state[i]) - down_client++; + + if (priv->child_count == i) { + gf_log (this->name, GF_LOG_ERROR, + "got GF_EVENT_CHILD_DOWN bad subvolume %s", + data? ((xlator_t *)data)->name: NULL); + break; } LOCK (&priv->lock); { - priv->nodes_down = down_client; - if (data == FIRST_CHILD (this)) priv->first_child_down = 1; - if (priv->nodes_down) - default_notify (this, event, data); + priv->last_event[i] = event; } UNLOCK (&priv->lock); } @@ -3874,10 +4396,30 @@ notify (xlator_t *this, int32_t event, void *data, ...) { /* */ default_notify (this, event, data); + goto out; } break; } + // Consider child as down if it's last_event is not CHILD_UP + for (i = 0, down_client = 0; i < priv->child_count; i++) + if (priv->last_event[i] != GF_EVENT_CHILD_UP) + down_client++; + + LOCK (&priv->lock); + { + priv->nodes_down = down_client; + } + UNLOCK (&priv->lock); + + heard_from_all_children = _gf_true; + for (i = 0; i < priv->child_count; i++) + if (!priv->last_event[i]) + heard_from_all_children = _gf_false; + + if (heard_from_all_children) + default_notify (this, event, data); +out: return 0; } @@ -3923,6 +4465,37 @@ stripe_setxattr_cbk (call_frame_t *frame, void *cookie, return 0; } +#ifdef HAVE_BD_XLATOR +int +stripe_is_bd (dict_t *this, char *key, data_t *value, void *data) +{ + gf_boolean_t *is_bd = data; + + if (data == NULL) + return 0; + + if (XATTR_IS_BD (key)) + *is_bd = _gf_true; + + return 0; +} + +inline gf_boolean_t +stripe_setxattr_is_bd (dict_t *dict) +{ + gf_boolean_t is_bd = _gf_false; + + if (dict == NULL) + goto out; + + dict_foreach (dict, stripe_is_bd, &is_bd); +out: + return is_bd; +} +#else +#define stripe_setxattr_is_bd(dict) _gf_false +#endif + int stripe_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, int flags, dict_t *xdata) @@ -3932,6 +4505,7 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this, stripe_private_t *priv = NULL; stripe_local_t *local = NULL; int i = 0; + gf_boolean_t is_bd = _gf_false; VALIDATE_OR_GOTO (frame, err); VALIDATE_OR_GOTO (this, err); @@ -3954,11 +4528,15 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this, local->wind_count = priv->child_count; local->op_ret = local->op_errno = 0; + is_bd = stripe_setxattr_is_bd (dict); + /** * Set xattrs for directories on all subvolumes. Additionally - * this power is only given to a special client. + * this power is only given to a special client. Bd xlator + * also needs xattrs for regular files (ie LVs) */ - if ((frame->root->pid == -1) && IA_ISDIR (loc->inode->ia_type)) { + if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) && + IA_ISDIR (loc->inode->ia_type)) || is_bd) { for (i = 0; i < priv->child_count; i++, trav = trav->next) { STACK_WIND (frame, stripe_setxattr_cbk, trav->xlator, trav->xlator->fops->setxattr, @@ -3989,21 +4567,21 @@ stripe_fsetxattr_cbk (call_frame_t *frame, void *cookie, int -stripe_is_lockinfo (dict_t *this, - char *key, - data_t *value, - void *data) +stripe_is_special_key (dict_t *this, + char *key, + data_t *value, + void *data) { - gf_boolean_t *is_lockinfo = NULL; + gf_boolean_t *is_special = NULL; if (data == NULL) { goto out; } - is_lockinfo = data; + is_special = data; - if (XATTR_IS_LOCKINFO (key)) - *is_lockinfo = _gf_true; + if (XATTR_IS_LOCKINFO (key) || XATTR_IS_BD (key)) + *is_special = _gf_true; out: return 0; @@ -4080,7 +4658,7 @@ stripe_fsetxattr_is_special (dict_t *dict) goto out; } - dict_foreach (dict, stripe_is_lockinfo, &is_spl); + dict_foreach (dict, stripe_is_special_key, &is_spl); out: return is_spl; @@ -4598,9 +5176,9 @@ init (xlator_t *this) if (!priv->xl_array) goto out; - priv->state = GF_CALLOC (count, sizeof (int8_t), - gf_stripe_mt_int8_t); - if (!priv->state) + priv->last_event = GF_CALLOC (count, sizeof (int), + gf_stripe_mt_int32_t); + if (!priv->last_event) goto out; priv->child_count = count; @@ -4701,6 +5279,7 @@ fini (xlator_t *this) trav = trav->next; GF_FREE (prev); } + GF_FREE (priv->last_event); LOCK_DESTROY (&priv->lock); GF_FREE (priv); } @@ -4841,10 +5420,10 @@ stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie, xattr->pos = cky; xattr->xattr_value = gf_memdup (xattr_val, - xattr->xattr_value, xattr->xattr_len); - local->xattr_total_len += xattr->xattr_len + 1; + if (xattr->xattr_value != NULL) + local->xattr_total_len += xattr->xattr_len + 1; } } out: @@ -4929,7 +5508,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (name && (strcmp (GF_XATTR_MARKER_KEY, name) == 0) - && (-1 == frame->root->pid)) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { local->marker.call_count = priv->child_count; sub_volumes = alloca ( priv->child_count * @@ -4944,7 +5523,8 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (cluster_getmarkerattr (frame, this, loc, name, local, stripe_getxattr_unwind, sub_volumes, priv->child_count, - MARKER_UUID_TYPE, priv->vol_uuid)) { + MARKER_UUID_TYPE, marker_uuid_default_gauge, + priv->vol_uuid)) { op_errno = EINVAL; goto err; } @@ -5000,7 +5580,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, if (name &&(*priv->vol_uuid)) { if ((match_uuid_local (name, priv->vol_uuid) == 0) - && (-1 == frame->root->pid)) { + && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) { if (!IA_FILE_OR_DIR (loc->inode->ia_type)) local->marker.call_count = 1; @@ -5023,6 +5603,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this, sub_volumes, local->marker.call_count, MARKER_XTIME_TYPE, + marker_xtime_default_gauge, priv->vol_uuid)) { op_errno = EINVAL; goto err; @@ -5196,6 +5777,9 @@ struct xlator_fops fops = { .removexattr = stripe_removexattr, .fremovexattr = stripe_fremovexattr, .readdirp = stripe_readdirp, + .fallocate = stripe_fallocate, + .discard = stripe_discard, + .zerofill = stripe_zerofill, }; struct xlator_cbks cbks = { @@ -5221,10 +5805,10 @@ struct volume_options options[] = { }, { .key = {"coalesce"}, .type = GF_OPTION_TYPE_BOOL, - .default_value = "false", - .description = "Enable coalesce mode to flatten striped files as " - "stored on the server (i.e., eliminate holes caused " - "by the traditional format)." + .default_value = "true", + .description = "Enable/Disable coalesce mode to flatten striped " + "files as stored on the server (i.e., eliminate holes " + "caused by the traditional format)." }, { .key = {NULL} }, }; diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h index 53b683c73..5673d18f3 100644 --- a/xlators/cluster/stripe/src/stripe.h +++ b/xlators/cluster/stripe/src/stripe.h @@ -98,8 +98,8 @@ struct stripe_private { gf_lock_t lock; uint8_t nodes_down; int8_t first_child_down; + int *last_event; int8_t child_count; - int8_t *state; /* Current state of child node */ gf_boolean_t xattr_supported; /* default yes */ gf_boolean_t coalesce; char vol_uuid[UUID_SIZE + 1]; |
